From e4cd63722d58a7a63870e141b0887878d58988bd Mon Sep 17 00:00:00 2001 From: helloaidank Date: Wed, 6 Nov 2024 18:36:14 +0000 Subject: [PATCH 1/6] adding file for calculating comparison of hn zones as well as plotting script for now --- .../hn_zones/comparison_of_hn_zones.py | 350 ++++++++++++++++++ .../analysis/hn_zones/heat_network_zones.py | 19 + .../hn_zones/plot_comparison_of_hn_zones.py | 225 +++++++++++ 3 files changed, 594 insertions(+) create mode 100644 asf_heat_pump_suitability/analysis/hn_zones/comparison_of_hn_zones.py create mode 100644 asf_heat_pump_suitability/analysis/hn_zones/heat_network_zones.py create mode 100644 asf_heat_pump_suitability/analysis/hn_zones/plot_comparison_of_hn_zones.py diff --git a/asf_heat_pump_suitability/analysis/hn_zones/comparison_of_hn_zones.py b/asf_heat_pump_suitability/analysis/hn_zones/comparison_of_hn_zones.py new file mode 100644 index 0000000..bd3d07b --- /dev/null +++ b/asf_heat_pump_suitability/analysis/hn_zones/comparison_of_hn_zones.py @@ -0,0 +1,350 @@ +""" +This script processes and compares DESNZ heat network (HN) and heat pump suitability data for Liverpool, and outputs some files for plotting. + +Steps: +1. Load DESNZ HN GeoData and perform a spatial join with LSOA polygons. +2. Process Nesta heat pump suitability scores for Liverpool. +3. Identify LSOAs in DESNZ HN list not in HP suitability data. +4. Calculate and log average scores and Mean Absolute Error (MAE). +5. Save processed data to files. + +Outputs: +- 'liverpool_with_desnz_hn_lsoa.gpkg': GeoDataFrame with LSOA codes added to HN zones. +- 'liverpool_hp_suitability_lsoas.json': List of unique LSOA codes in Liverpool. +- 'liverpool_hp_suitability_scores_with_desnz.parquet': Processed suitability scores with DESNZ pilot scores and MAE. +- 'script_output.log': Log file with metrics such as avg HN scores and MAE. +""" + +import geopandas as gpd +import pyogrio +import polars as pl +from typing import Tuple, List, Optional +import logging +import json +import os +from asf_heat_pump_suitability import PROJECT_DIR + + +# Define file paths +LIVERPOOL_GPKG_PATH = "s3://asf-heat-pump-suitability/heat_network_desnz_data/heat-network-zone-map-Liverpool.gpkg" +LSOA_SHP_PATH = "s3://asf-heat-pump-suitability/source_data/Lower_layer_Super_Output_Areas_2021_EW_BFE_V9_-9107090204806789093/LSOA_2021_EW_BFE_V9.shp" +NESTA_HP_SUITABILITY_PARQUET_PATH = "s3://nesta-open-data/asf_heat_pump_suitability/2023Q4/20240925_2023_Q4_EPC_heat_pump_suitability_per_lsoa.parquet" +LA_TO_ANALYSE = "Liverpool" + + +def setup_logging_and_file_path( + output_dir: str, log_filename: str = "script_output.log", level: int = logging.INFO +) -> None: + """ + Set up logging configuration to log messages to both a file and the console. + + Args: + output_dir (str): Path to the directory where the log file should be saved. + log_filename (str): Name of the log file. Defaults to "script_output.log". + level (int): Logging level, e.g., logging.INFO or logging.DEBUG. Defaults to logging.INFO. + """ + # Ensure the output directory exists + os.makedirs(output_dir, exist_ok=True) + + # Define the full path to the log file + log_file_path = os.path.join(output_dir, log_filename) + + # Clear existing logging handlers if any + for handler in logging.root.handlers[:]: + logging.root.removeHandler(handler) + + # Set up logging configuration + logging.basicConfig( + level=level, + format="%(levelname)s: %(message)s", + handlers=[ + logging.FileHandler(log_file_path, mode="w"), # Overwrite log file each run + logging.StreamHandler(), + ], + ) + logging.info(f"Logging setup complete. Logs are saved to {log_file_path}.") + + +def load_hn_geodata( + desnz_hn_gpkg_path: str, lsoa_shp_path: str +) -> Tuple[gpd.GeoDataFrame, List[str]]: + """ + Load the HN GeoPackage file and perform a spatial join with LSOA polygons to add a column of LSOA codes. + + Args: + desnz_hn_gpkg_path (str): Path to the GeoPackage file with the HN zones. + lsoa_shp_path (str): Path to the LSOA shapefile. + + Returns: + Tuple[gpd.GeoDataFrame, List[str]]: + - GeoDataFrame with LSOA codes added via spatial join. + - List of unique LSOA codes present in the GeoDataFrame. + """ + # Load DESNZ HN polygons GeoPackage + desnz_hn_gdf = pyogrio.read_dataframe( + desnz_hn_gpkg_path, layer="heat-network-zone-map-Liverpool" + ) + + # Load the LSOA polygons file with 'LSOA21CD' codes + lsoa_gdf = gpd.read_file(lsoa_shp_path) + + # Remove 'index_right' column if it exists + if "index_right" in desnz_hn_gdf.columns: + desnz_hn_gdf = desnz_hn_gdf.drop(columns=["index_right"]) + + # Ensure both GeoDataFrames use the same CRS + if desnz_hn_gdf.crs != desnz_hn_gdf.crs: + lsoa_gdf = lsoa_gdf.to_crs(desnz_hn_gdf.crs) + + # Perform spatial join to match Liverpool polygons with LSOA codes + joined_gdf = gpd.sjoin( + desnz_hn_gdf, + lsoa_gdf[["geometry", "LSOA21CD"]], + how="left", + predicate="intersects", + ) + + # Extract unique LSOA codes from the joined GeoDataFrame + desnz_hn_unique_lsoas = joined_gdf["LSOA21CD"].dropna().unique().tolist() + + return joined_gdf, desnz_hn_unique_lsoas + + +def process_nesta_hp_suitability_scores( + nesta_hp_suitability_scores: str, local_authority: str +) -> Tuple[pl.DataFrame, List[str]]: + """ + Load and process Nesta heat pump suitability data for a specific local authority. + + Args: + nesta_hp_suitability_scores (str): Path to the Nesta heat pump suitability Parquet file with HN scores. + local_authority (str): Local authority name to filter the data. + + Returns: + Tuple[pl.DataFrame, List[str]]: + - DataFrame containing filtered LSOA codes and 'HN_N_avg_score_weighted' column. + - List of unique LSOA codes within the local authority. + """ + # Load the data from the Parquet file + hp_scores_df = pl.read_parquet(nesta_hp_suitability_scores) + + # Rename 'lsoa' to 'LSOA21CD' if needed to match the GeoDataFrame + if "lsoa" in hp_scores_df.columns: + hp_scores_df = hp_scores_df.rename({"lsoa": "LSOA21CD"}) + + # Filter by local authority and select relevant columns + la_filtered_scores = hp_scores_df.filter( + pl.col("lsoa_name").str.starts_with(local_authority) + ).select(["LSOA21CD", "HN_N_avg_score_weighted"]) + + # Extract unique LSOA codes + la_unique_lsoas = la_filtered_scores["LSOA21CD"].unique().to_list() + + return la_filtered_scores, la_unique_lsoas + + +def add_desnz_pilot_score( + la_hp_suitability_scores: pl.DataFrame, list_of_desnz_hn_lsoas: List[str] +) -> Tuple[pl.DataFrame, float, float]: + """ + Add a 'DESNZ_pilot_score' column to the Nesta heat pump suitability data and compute average scores. + + Args: + la_hp_suitability_scores (pl.DataFrame): Data containing 'LSOA21CD' and 'HN_N_avg_score_weighted' columns. + list_of_desnz_hn_lsoas (List[str]): List of LSOA codes within the DESNZ HN pilot zones. + + Returns: + Tuple[pl.DataFrame, float, float]: + - Updated DataFrame with 'DESNZ_pilot_score' column added (1 if in pilot zone, otherwise 0). + - Average 'HN_N_avg_score_weighted' for rows where 'DESNZ_pilot_score' is 1. + - Average 'HN_N_avg_score_weighted' for rows where 'DESNZ_pilot_score' is 0. + """ + # Add 'DESNZ_pilot_score' column: 1 if LSOA is in DESNZ pilot zone, else 0 + la_hp_suitability_scores = la_hp_suitability_scores.with_columns( + pl.col("LSOA21CD") + .is_in(list_of_desnz_hn_lsoas) + .cast(pl.Int8) + .alias("DESNZ_pilot_score") + ) + # Calculate average scores where 'DESNZ_pilot_score' == 1 and == 0 + avg_hn_score_pilot_1 = _calculate_hn_pilot_average_score( + la_hp_suitability_scores, pilot_score=1 + ) + avg_hn_score_pilot_0 = _calculate_hn_pilot_average_score( + la_hp_suitability_scores, pilot_score=0 + ) + return la_hp_suitability_scores, avg_hn_score_pilot_1, avg_hn_score_pilot_0 + + +def _calculate_hn_pilot_average_score( + la_hp_suitability_scores: pl.DataFrame, pilot_score: int +) -> float: + """ + Calculate the average 'HN_N_avg_score_weighted' for a specified 'DESNZ_pilot_score' value. + + Args: + la_hp_suitability_scores (pl.DataFrame): DataFrame containing 'DESNZ_pilot_score' and 'HN_N_avg_score_weighted' columns for a LA. + pilot_score (int): The value of 'DESNZ_pilot_score' to filter by (must be 1 or 0). + + Returns: + float: The average 'HN_N_avg_score_weighted' for rows with the specified 'DESNZ_pilot_score'. + Returns 0.0 if there are no rows with the specified score. + + Raises: + ValueError: If pilot_score is not 1 or 0. + """ + # Validate that pilot_score is either 1 or 0 + if pilot_score not in {0, 1}: + raise ValueError("pilot_score must be either 1 or 0") + + # Filter and calculate the average score + filtered_la_hp_suitability_scores = la_hp_suitability_scores.filter( + pl.col("DESNZ_pilot_score") == pilot_score + ) + avg_hn_score_pilot = ( + filtered_la_hp_suitability_scores.select( + pl.col("HN_N_avg_score_weighted").mean() + ).item() + if filtered_la_hp_suitability_scores.height > 0 + else 0.0 + ) + return avg_hn_score_pilot + + +def calculate_mae_for_pilot_score( + hp_suitability_scores_with_desnz: pl.DataFrame, score_value: int +) -> float: + """ + Calculate and log the Mean Absolute Error (MAE) for entries with a specified 'DESNZ_pilot_score'. + + Args: + hp_suitability_scores_with_desnz (pl.DataFrame): DataFrame containing 'DESNZ_pilot_score' and 'absolute_error' columns. + score_value (int): The value of 'DESNZ_pilot_score' (must be 1 or 0) for which to calculate the MAE. + + Returns: + float: The MAE for the specified 'DESNZ_pilot_score'. + + Raises: + ValueError: If score_value is not 1 or 0. + """ + # Validate that score_value is either 1 or 0 + if score_value not in {0, 1}: + raise ValueError("score_value must be either 1 or 0") + + mae = hp_suitability_scores_with_desnz.filter( + pl.col("DESNZ_pilot_score") == score_value + )["absolute_error"].mean() + + logging.info( + f"Mean Absolute Error (MAE) for DESNZ_pilot_score = {score_value}: {mae}" + ) + return mae + + +def calculate_mae_for_all( + hp_suitability_scores: pl.DataFrame, desnz_col: str, nesta_hn_score_col: str +) -> Tuple[pl.DataFrame, float]: + """ + Calculate the Mean Absolute Error (MAE) between the DESNZ pilot score and Nesta's HN suitability score, add the absolute error column to the DataFrame, + and log the result. + + Args: + hp_suitability_scores_with_desnz (pl.DataFrame): DataFrame containing the DESNZ pilot (actual) and Nesta HN suitability score (predicted) columns. + desnz_col (str): Name of the column with DESNZ pilot (actual) values. + nesta_hn_score_col (str): Name of the column with Nesta HN suitability (predicted) values. + + Returns: + Tuple[pl.DataFrame, float]: + - Updated DataFrame with 'absolute_error' column added. + - The Mean Absolute Error (MAE) between the actual and predicted columns. + """ + # Calculate absolute error and add it as a new column + hp_suitability_scores_with_error = hp_suitability_scores.with_columns( + (pl.col(desnz_col) - pl.col(nesta_hn_score_col)).abs().alias("absolute_error") + ) + + # Calculate the mean of the absolute error + mae = hp_suitability_scores_with_error["absolute_error"].mean() + logging.info( + f"Mean Absolute Error (MAE) for {desnz_col} vs {nesta_hn_score_col}: {mae}" + ) + + return hp_suitability_scores_with_error, mae + + +def main(): + # Define the output directory and set up logging + output_dir = os.path.join( + PROJECT_DIR, "asf_heat_pump_suitability/analysis/hn_zones/output_data/" + ) + setup_logging_and_file_path(output_dir) + + # Load the data and perform spatial join + liverpool_with_desnz_hn_lsoa, list_of_liverpool_desnz_hn_lsoas = load_hn_geodata( + LIVERPOOL_GPKG_PATH, LSOA_SHP_PATH + ) + logging.info("Loaded Liverpool with DESNZ HN LSOA data.") + liverpool_with_desnz_hn_lsoa.to_file( + os.path.join(output_dir, "liverpool_with_desnz_hn_lsoa.gpkg"), driver="GPKG" + ) + + # Process Nesta heat pump suitability scores + liverpool_hp_suitability_scores, liverpool_hp_suitability_lsoas = ( + process_nesta_hp_suitability_scores( + NESTA_HP_SUITABILITY_PARQUET_PATH, LA_TO_ANALYSE + ) + ) + logging.info("Processed Nesta HP suitability scores for Liverpool.") + + with open( + os.path.join(output_dir, "liverpool_hp_suitability_lsoas.json"), "w" + ) as file: + json.dump(liverpool_hp_suitability_lsoas, file) + + # Check LSOAs not in HP suitability scores + not_in_hp_suitability = set(list_of_liverpool_desnz_hn_lsoas) - set( + liverpool_hp_suitability_lsoas + ) + logging.info( + f"number of LSOAs in DESNZ list not in HP suitability data: {len(not_in_hp_suitability)}" + ) + + # Calculate and log average score + avg_hn_score = liverpool_hp_suitability_scores["HN_N_avg_score_weighted"].mean() + logging.info(f"Average HN_N_avg_score_weighted: {avg_hn_score}") + + # Add DESNZ pilot score and calculate averages + ( + liverpool_hp_suitability_scores_with_desnz, + avg_hn_score_pilot_1, + avg_hn_score_pilot_0, + ) = add_desnz_pilot_score( + liverpool_hp_suitability_scores, list_of_liverpool_desnz_hn_lsoas + ) + logging.info( + f"Avg HN_N_avg_score_weighted for DESNZ_pilot_score = 1: {avg_hn_score_pilot_1}" + ) + logging.info( + f"Avg HN_N_avg_score_weighted for DESNZ_pilot_score = 0: {avg_hn_score_pilot_0}" + ) + + # Calculate and log Mean Absolute Error (MAE) + liverpool_hp_suitability_scores_with_desnz, mae_all = calculate_mae_for_all( + liverpool_hp_suitability_scores_with_desnz, + "DESNZ_pilot_score", + "HN_N_avg_score_weighted", + ) + mae_pilot_1 = calculate_mae_for_pilot_score( + liverpool_hp_suitability_scores_with_desnz, score_value=1 + ) + mae_pilot_0 = calculate_mae_for_pilot_score( + liverpool_hp_suitability_scores_with_desnz, score_value=0 + ) + + liverpool_hp_suitability_scores_with_desnz.write_parquet( + os.path.join(output_dir, "liverpool_hp_suitability_scores_with_desnz.parquet") + ) + + +if __name__ == "__main__": + main() diff --git a/asf_heat_pump_suitability/analysis/hn_zones/heat_network_zones.py b/asf_heat_pump_suitability/analysis/hn_zones/heat_network_zones.py new file mode 100644 index 0000000..306ad35 --- /dev/null +++ b/asf_heat_pump_suitability/analysis/hn_zones/heat_network_zones.py @@ -0,0 +1,19 @@ +import pyogrio +import matplotlib.pyplot as plt + +# List layers in the GeoPackage +layers = pyogrio.list_layers("heat-network-zone-map-Liverpool.gpkg") +print("Layers in GeoPackage:", layers) + + +# Read a specific layer into a GeoDataFrame +gdf = pyogrio.read_dataframe( + "heat-network-zone-map-Liverpool.gpkg", layer="heat-network-zone-map-Liverpool" +) +print(gdf.head()) +print(gdf.crs) +print(len(gdf)) +fig, ax = plt.subplots(figsize=(12, 6)) +gdf.plot(ax=ax) +plt.savefig("liverpool_heat_network_zones.png") +plt.show() diff --git a/asf_heat_pump_suitability/analysis/hn_zones/plot_comparison_of_hn_zones.py b/asf_heat_pump_suitability/analysis/hn_zones/plot_comparison_of_hn_zones.py new file mode 100644 index 0000000..4079262 --- /dev/null +++ b/asf_heat_pump_suitability/analysis/hn_zones/plot_comparison_of_hn_zones.py @@ -0,0 +1,225 @@ +""" +This script processes and visualizes heat network (HN) and heat pump suitability data for Liverpool. + +Steps: +1. Load and preprocess data: + - Read Parquet and GeoPackage files. + - Extract unique LSOA geometries. + - Load LSOA geometries and convert CRS if needed. + - Filter Liverpool LSOAs. + +2. Plot Liverpool LSOA geometries and save as PNG. + +3. Merge suitability scores with geometries and convert to GeoDataFrame. + +4. Plot and save overlay of DESNZ Pilot Score = 1 on Liverpool heat network zones. + +5. Define and use a function to plot absolute error maps by DESNZ Pilot Score, saving the plots as PNG files. +""" + +import geopandas as gpd +import pandas as pd +import matplotlib.pyplot as plt +import pyogrio +import json +import os +import logging +from asf_heat_pump_suitability import PROJECT_DIR + +# Set up logging configuration +logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") + +# Define constants for file paths +INPUT_DIR = os.path.join( + PROJECT_DIR, "asf_heat_pump_suitability/analysis/hn_zones/output_data/" +) +LIVERPOOL_HP_SUITABILITY_PARQUET = os.path.join( + INPUT_DIR, "liverpool_hp_suitability_scores_with_desnz.parquet" +) +LSOA_SHP_PATH = "s3://asf-heat-pump-suitability/source_data/Lower_layer_Super_Output_Areas_2021_EW_BFE_V9_-9107090204806789093/LSOA_2021_EW_BFE_V9.shp" +LIVERPOOL_GPKG_PATH = "s3://asf-heat-pump-suitability/heat_network_desnz_data/heat-network-zone-map-Liverpool.gpkg" +LSOA_JSON_PATH = os.path.join(INPUT_DIR, "liverpool_hp_suitability_lsoas.json") +OUTPUT_DIR = os.path.join( + PROJECT_DIR, "asf_heat_pump_suitability/analysis/hn_zones/output_plots/" +) + +# Ensure the output directory exists +os.makedirs(OUTPUT_DIR, exist_ok=True) + + +def load_data(): + """ + Load the necessary data files. + + Returns: + Tuple containing: + - pd.DataFrame: DataFrame with Liverpool heat pump suitability scores. + - List[str]: List of LSOA codes for Liverpool. + """ + try: + liverpool_hp_suitability_scores_pd = pd.read_parquet( + LIVERPOOL_HP_SUITABILITY_PARQUET + ) + with open(LSOA_JSON_PATH, "r") as file: + liverpool_hp_suitability_lsoas = json.load(file) + return liverpool_hp_suitability_scores_pd, liverpool_hp_suitability_lsoas + except Exception as e: + logging.error(f"Error loading data: {e}") + raise + + +def preprocess_data( + liverpool_hp_suitability_lsoas: list, lsoa_shp_path: str = LSOA_SHP_PATH +) -> gpd.GeoDataFrame: + """ + Preprocess the data by extracting unique LSOA geometries and filtering Liverpool LSOAs. + + Args: + liverpool_hp_suitability_lsoas (list): List of LSOA codes for Liverpool. + lsoa_shp_path (str): Path to the LSOA shapefile. + + Returns: + gpd.GeoDataFrame: GeoDataFrame with filtered Liverpool LSOA geometries. + """ + lsoa_geometries_gdf = gpd.read_file(lsoa_shp_path) + if lsoa_geometries_gdf.crs != "EPSG:27700": # Assuming British National Grid + logging.info("Converting to EPSG:27700") + lsoa_geometries_gdf = lsoa_geometries_gdf.to_crs("EPSG:27700") + liverpool_lsoa_geometries_gdf = lsoa_geometries_gdf[ + lsoa_geometries_gdf["LSOA21CD"].isin(liverpool_hp_suitability_lsoas) + ] + return liverpool_lsoa_geometries_gdf + + +def plot_lsoa_geometries(liverpool_lsoa_geometries_gdf, output_dir: str = OUTPUT_DIR): + """ + Plot Liverpool LSOA geometries and save as PNG. + + Args: + liverpool_lsoa_geometries_gdf (gpd.GeoDataFrame): GeoDataFrame with Liverpool LSOA geometries. + output_dir (str): Output directory to save the plot image. + """ + fig, ax = plt.subplots(figsize=(12, 6)) + liverpool_lsoa_geometries_gdf.plot(ax=ax) + plt.title("Liverpool LSOA Geometries") + plt.axis("off") + plt.savefig(os.path.join(output_dir, "liverpool_data_lsoas.png")) + plt.show() + + +def merge_data( + liverpool_hp_suitability_scores_pd: pd.DataFrame, + liverpool_lsoa_geometries_gdf: gpd.GeoDataFrame, +) -> gpd.GeoDataFrame: + """ + Merge suitability scores with geometries and convert to GeoDataFrame. + + Args: + liverpool_hp_suitability_scores_pd (pd.DataFrame): DataFrame with Liverpool heat pump suitability scores. + liverpool_lsoa_geometries_gdf (gpd.GeoDataFrame): GeoDataFrame with Liverpool LSOA geometries. + + Returns: + gpd.GeoDataFrame: GeoDataFrame with merged suitability scores and geometries. + """ + liverpool_hp_suitability_with_geometry = liverpool_hp_suitability_scores_pd.merge( + liverpool_lsoa_geometries_gdf, on="LSOA21CD", how="left" + ) + liverpool_hp_suitability_gdf = gpd.GeoDataFrame( + liverpool_hp_suitability_with_geometry, geometry="geometry" + ) + return liverpool_hp_suitability_gdf + + +def plot_overlay( + liverpool_hp_suitability_gdf: gpd.GeoDataFrame, + liverpool_heat_network_zones_filepath: str = LIVERPOOL_GPKG_PATH, + output_dir: str = OUTPUT_DIR, +): + """ + Plot and save overlay of DESNZ Pilot Score = 1 on Liverpool heat network zones. + + Args: + liverpool_hp_suitability_gdf (gpd.GeoDataFrame): GeoDataFrame with Liverpool heat pump suitability scores and geometries. + liverpool_heat_network_zones_filepath (str): Path to the Liverpool heat network zones GeoPackage. + output_dir (str): Output directory to save the plot image. + """ + gdf = pyogrio.read_dataframe( + liverpool_heat_network_zones_filepath, layer="heat-network-zone-map-Liverpool" + ) + fig, ax = plt.subplots(figsize=(12, 6)) + gdf.plot(ax=ax, color="blue", alpha=0.5) + liverpool_hp_suitability_gdf[ + liverpool_hp_suitability_gdf["DESNZ_pilot_score"] == 1 + ].plot(ax=ax, color="pink", alpha=0.5) + plt.title("Overlay of DESNZ Pilot Score = 1 on Liverpool Heat Network Zones") + plt.axis("off") + plt.savefig(os.path.join(output_dir, "overlay_desnz_pilot.png")) + plt.show() + + +def plot_absolute_error_map( + hp_suitability_gdf: gpd.GeoDataFrame, + score: int, + title: str, + filename: str, + output_dir: str = OUTPUT_DIR, +): + """ + Plot an absolute error map for areas in Liverpool based on the DESNZ pilot score, + overlaying regions with a different score as boundaries. + + Args: + df (gpd.GeoDataFrame): GeoDataFrame containing Liverpool's LSOA geometries and + heat pump suitability scores, including absolute error values. + score (int): DESNZ pilot score value for which to highlight areas. + Only regions matching this score will be color-filled. + title (str): Title for the plot. + filename (str): Path to save the plot image as a PNG file. + output_dir (str): Output directory to save the plot image. + + This function: + - Plots regions where DESNZ pilot score matches the specified `score`, + coloring them by 'absolute_error' values. + - Overlays boundaries of areas where DESNZ pilot score does not match the specified `score`. + - Displays a legend and saves the plot as a PNG file. + """ + fig, ax = plt.subplots(figsize=(12, 6)) + hp_suitability_gdf[hp_suitability_gdf["DESNZ_pilot_score"] == score].plot( + column="absolute_error", ax=ax, legend=True, cmap="inferno" + ) + hp_suitability_gdf[hp_suitability_gdf["DESNZ_pilot_score"] != score].plot( + ax=ax, edgecolor="black", facecolor="none", linewidth=0.5 + ) + plt.title(title) + plt.axis("off") + plt.savefig(os.path.join(output_dir, filename)) + plt.show() + + +def main(): + """ + Main function to load, preprocess, and visualise data. + """ + liverpool_hp_suitability_scores_pd, liverpool_hp_suitability_lsoas = load_data() + liverpool_lsoa_geometries_gdf = preprocess_data(liverpool_hp_suitability_lsoas) + plot_lsoa_geometries(liverpool_lsoa_geometries_gdf) + liverpool_hp_suitability_gdf = merge_data( + liverpool_hp_suitability_scores_pd, liverpool_lsoa_geometries_gdf + ) + plot_overlay(liverpool_hp_suitability_gdf) + plot_absolute_error_map( + liverpool_hp_suitability_gdf, + score=1, + title="Absolute Error Map for DESNZ Pilot Score = 1 for Liverpool", + filename="liverpool_data_with_pilot_score_1_v2.png", + ) + plot_absolute_error_map( + liverpool_hp_suitability_gdf, + score=0, + title="Absolute Error Map for DESNZ Pilot Score = 0 for Liverpool", + filename="liverpool_data_with_pilot_score_0_v2.png", + ) + + +if __name__ == "__main__": + main() From 3bef3c24fd3bf83d07a789efdc0ffa5273768e86 Mon Sep 17 00:00:00 2001 From: helloaidank Date: Thu, 7 Nov 2024 09:28:59 +0000 Subject: [PATCH 2/6] delete heat network zone --- .../analysis/hn_zones/heat_network_zones.py | 19 ------------------- 1 file changed, 19 deletions(-) delete mode 100644 asf_heat_pump_suitability/analysis/hn_zones/heat_network_zones.py diff --git a/asf_heat_pump_suitability/analysis/hn_zones/heat_network_zones.py b/asf_heat_pump_suitability/analysis/hn_zones/heat_network_zones.py deleted file mode 100644 index 306ad35..0000000 --- a/asf_heat_pump_suitability/analysis/hn_zones/heat_network_zones.py +++ /dev/null @@ -1,19 +0,0 @@ -import pyogrio -import matplotlib.pyplot as plt - -# List layers in the GeoPackage -layers = pyogrio.list_layers("heat-network-zone-map-Liverpool.gpkg") -print("Layers in GeoPackage:", layers) - - -# Read a specific layer into a GeoDataFrame -gdf = pyogrio.read_dataframe( - "heat-network-zone-map-Liverpool.gpkg", layer="heat-network-zone-map-Liverpool" -) -print(gdf.head()) -print(gdf.crs) -print(len(gdf)) -fig, ax = plt.subplots(figsize=(12, 6)) -gdf.plot(ax=ax) -plt.savefig("liverpool_heat_network_zones.png") -plt.show() From 17d69a382a7ad6fcf5a2490be2f575a68a4003a7 Mon Sep 17 00:00:00 2001 From: helloaidank Date: Mon, 2 Dec 2024 14:52:30 +0000 Subject: [PATCH 3/6] changes to methodology to have the desnz hn pilot score as fraction rather than binary --- outputs/hn_zones/comparison_of_hn_zones.py | 449 +++++++++++++++++++++ 1 file changed, 449 insertions(+) create mode 100644 outputs/hn_zones/comparison_of_hn_zones.py diff --git a/outputs/hn_zones/comparison_of_hn_zones.py b/outputs/hn_zones/comparison_of_hn_zones.py new file mode 100644 index 0000000..6e42c27 --- /dev/null +++ b/outputs/hn_zones/comparison_of_hn_zones.py @@ -0,0 +1,449 @@ +""" +This script analyses DESNZ heat network (HN) zones and Nesta's heat pump suitability data for Liverpool, performing spatial joins, statistical analysis, and exporting processed results. + +**Key Steps:** +1. **Spatial Analysis**: + - Loads DESNZ HN zones and LSOA polygons, ensuring consistent CRS. + - Performs spatial joins to calculate the fraction of each LSOA covered by HN zones. + +2. **Nesta Data Processing**: + - Filters Nesta heat pump suitability scores for Liverpool's LSOAs. + - Identifies LSOAs which are outside selected LA. + +3. **Statistical Metrics**: + - Calculates average Nesta suitability scores for LSOAs covered and not covered by HN zones. + - Computes Mean Absolute Error (MAE) between DESNZ and Nesta scores. + +4. **Data Export**: + - Outputs GeoDataFrame with coverage data, LSOA lists, and processed suitability scores to GeoPackage, JSON, Parquet, and CSV formats. + - Logs detailed metrics and analysis steps. + +**Outputs**: +- GeoPackage: 'liverpool_with_desnz_hn_lsoa.gpkg' +- JSON: 'liverpool_hp_suitability_lsoas.json' +- Parquet: 'liverpool_hp_suitability_scores_with_desnz' +- Log: 'script_output.log' + +**How to Run the Script**: +To run the script, use the following command: +python comparison_of_hn_zones.py [--optional_threshold OPTIONAL_THRESHOLD] +""" + +import geopandas as gpd +import pyogrio +import polars as pl +from typing import Tuple, List, Callable, Optional +import logging +import json +import os +from shapely.geometry import Polygon +from asf_heat_pump_suitability import PROJECT_DIR +import argparse + + +# Define file paths +# LIVERPOOL_GPKG_PATH = "s3://asf-heat-pump-suitability/heat_network_desnz_data/heat-network-zone-map-Liverpool.gpkg" +# LSOA_SHP_PATH = "s3://asf-heat-pump-suitability/source_data/Lower_layer_Super_Output_Areas_2021_EW_BFE_V9_-9107090204806789093/LSOA_2021_EW_BFE_V9.shp" +# NESTA_HP_SUITABILITY_PARQUET_PATH = "s3://nesta-open-data/asf_heat_pump_suitability/2023Q4/20240925_2023_Q4_EPC_heat_pump_suitability_per_lsoa.parquet" +LA_TO_ANALYSE = "Liverpool" +LIVERPOOL_GPKG_PATH = "heat-network-zone-map-Liverpool.gpkg" +LSOA_SHP_PATH = "LSOA_2021_EW_BFE_V9.shp" +NESTA_HP_SUITABILITY_PARQUET_PATH = ( + "20240925_2023_Q4_EPC_heat_pump_suitability_per_lsoa.parquet" +) + + +def setup_logging_and_file_path( + output_dir: str, log_filename: str = "script_output.log", level: int = logging.INFO +) -> None: + """ + Set up logging configuration to log messages to both a file and the console. + + Args: + output_dir (str): Path to the directory where the log file should be saved. + log_filename (str): Name of the log file. Defaults to "script_output.log". + level (int): Logging level, e.g., logging.INFO or logging.DEBUG. Defaults to logging.INFO. + """ + # Ensure the output directory exists + os.makedirs(output_dir, exist_ok=True) + + # Define the full path to the log file + log_file_path = os.path.join(output_dir, log_filename) + + # Clear existing logging handlers if any + logging.getLogger().handlers.clear() + + # Set up logging configuration + logging.basicConfig( + level=level, + format="%(levelname)s: %(message)s", + handlers=[ + logging.FileHandler(log_file_path, mode="w"), # Overwrite log file each run + logging.StreamHandler(), + ], + ) + logging.info(f"Logging setup complete. Logs are saved to {log_file_path}.") + + +def load_hn_geodata( + desnz_hn_gpkg_path: str, lsoa_shp_path: str +) -> Tuple[gpd.GeoDataFrame, List[str]]: + """ + Load the HN GeoPackage file and perform a spatial join with LSOA polygons to add a column of LSOA codes. + + Args: + desnz_hn_gpkg_path (str): Path to the GeoPackage file with the HN zones. + lsoa_shp_path (str): Path to the LSOA shapefile. + + Returns: + Tuple[gpd.GeoDataFrame, List[str]]: + - GeoDataFrame with LSOA codes added via spatial join. + - List of unique LSOA codes present in the GeoDataFrame. + """ + # Load DESNZ HN polygons GeoPackage + desnz_hn_gdf = pyogrio.read_dataframe( + desnz_hn_gpkg_path, layer="heat-network-zone-map-Liverpool" + ) + + # Load the LSOA polygons file with 'LSOA21CD' codes + lsoa_gdf = gpd.read_file(lsoa_shp_path) + + # Remove 'index_right' column if it exists + if "index_right" in desnz_hn_gdf.columns: + desnz_hn_gdf = desnz_hn_gdf.drop(columns=["index_right"]) + + # Ensure both GeoDataFrames use the same CRS + if desnz_hn_gdf.crs != lsoa_gdf.crs: + lsoa_gdf = lsoa_gdf.to_crs(desnz_hn_gdf.crs) + + # Perform spatial join to match Liverpool polygons with LSOA codes + joined_gdf = desnz_hn_gdf.sjoin( + lsoa_gdf[["geometry", "LSOA21CD"]], + how="left", + predicate="intersects", + ) + + # Calculate the intersection area and the fraction of LSOA area covered by HN zones + joined_gdf["intersection_area"] = joined_gdf.apply( + lambda row: row.geometry.intersection( + lsoa_gdf.loc[row.index_right, "geometry"] + ).area, + axis=1, + ) + lsoa_gdf["total_area"] = lsoa_gdf.geometry.area + + # Merge the LSOA geometries into the joined GeoDataFrame + joined_gdf = joined_gdf.merge(lsoa_gdf[["LSOA21CD", "total_area"]], on="LSOA21CD") + + # Extract unique LSOA codes from the joined GeoDataFrame + joined_gdf["fraction_covered"] = ( + joined_gdf["intersection_area"] / joined_gdf["total_area"] + ) + desnz_hn_unique_lsoas = joined_gdf["LSOA21CD"].dropna().unique().tolist() + return joined_gdf, desnz_hn_unique_lsoas + + +def filter_la_nesta_hp_scores( + nesta_hp_suitability_scores: str, local_authority: str +) -> Tuple[pl.DataFrame, List[str]]: + """ + Load and process Nesta heat pump suitability data for a specific local authority. + + Args: + nesta_hp_suitability_scores (str): Path to the Nesta heat pump suitability Parquet file with HN scores. + local_authority (str): Local authority name to filter the data. + + Returns: + Tuple[pl.DataFrame, List[str]]: + - DataFrame containing filtered LSOA codes and 'HN_N_avg_score_weighted' column. + - List of unique LSOA codes within the local authority. + """ + # Load the data from the Parquet file + hp_scores_df = pl.read_parquet(nesta_hp_suitability_scores) + + # Rename 'lsoa' to 'LSOA21CD' if needed to match the GeoDataFrame + if "lsoa" in hp_scores_df.columns: + hp_scores_df = hp_scores_df.rename({"lsoa": "LSOA21CD"}) + + # Filter by local authority and select relevant columns + la_filtered_scores = hp_scores_df.filter( + pl.col("lsoa_name").str.starts_with(local_authority) + ).select(["LSOA21CD", "HN_N_avg_score_weighted"]) + + # Extract unique LSOA codes for LA + la_unique_lsoas = la_filtered_scores["LSOA21CD"].unique().to_list() + + return la_filtered_scores, la_unique_lsoas + + +def add_DESNZ_pilot_fraction( + la_hp_suitability_scores: pl.DataFrame, + joined_gdf: gpd.GeoDataFrame, + optional_threshold: Optional[float] = 0, +) -> Tuple[pl.DataFrame, float, float]: + """ + Add a 'DESNZ_pilot_fraction' column to the Nesta heat pump suitability data using the fraction of area covered by HN zones and compute the average Nesta HN score per LA. + + Args: + la_hp_suitability_scores (pl.DataFrame): Data containing 'LSOA21CD' and 'HN_N_avg_score_weighted' columns for a single LA. + joined_gdf (gpd.GeoDataFrame): GeoDataFrame containing 'LSOA21CD' and 'fraction_covered' columns. + + Returns: + Tuple[pl.DataFrame, float, float]: + - Updated DataFrame with 'DESNZ_pilot_fraction' column added (fraction of area covered by HN zones). + - Average 'HN_N_avg_score_weighted' for rows where 'DESNZ_pilot_fraction' is non-zero. + - Average 'HN_N_avg_score_weighted' for rows where 'DESNZ_pilot_fraction' is zero. + """ + # Convert joined_gdf to a Polars DataFrame + joined_df = pl.DataFrame( + { + "LSOA21CD": joined_gdf["LSOA21CD"], + "fraction_covered": joined_gdf["fraction_covered"], + } + ) + + # Merge the fraction_covered into la_hp_suitability_scores + la_hp_suitability_scores = la_hp_suitability_scores.join( + joined_df, on="LSOA21CD", how="left" + ) + + # Fill NaN values in fraction_covered with 0 + la_hp_suitability_scores = la_hp_suitability_scores.with_columns( + pl.col("fraction_covered").fill_null(0).alias("DESNZ_pilot_fraction") + ).drop("fraction_covered") + + # Calculating the average Nesta HN score for non-zero and zero DESNZ pilot scores + avg_hn_score_pilot_nonzero = _calculate_hn_pilot_average_score( + la_hp_suitability_scores, + pilot_score_condition=lambda col: col > optional_threshold, + ) + avg_hn_score_pilot_zero = _calculate_hn_pilot_average_score( + la_hp_suitability_scores, pilot_score_condition=lambda col: col == 0 + ) + return la_hp_suitability_scores, avg_hn_score_pilot_nonzero, avg_hn_score_pilot_zero + + +def calculate_average_scores_for_thresholds( + la_hp_suitability_scores: pl.DataFrame, thresholds: List[float] +) -> pl.DataFrame: + """ + Calculate the average 'HN_N_avg_score_weighted' for various thresholds of 'DESNZ_pilot_fraction'. + + Args: + la_hp_suitability_scores (pl.DataFrame): DataFrame containing 'DESNZ_pilot_fraction' and 'HN_N_avg_score_weighted' columns. + thresholds (List[float]): List of thresholds to evaluate. + + Returns: + pl.DataFrame: DataFrame with columns 'DESNZ_pilot_fraction_threshold' and 'HN_N_avg_score_weighted'. + """ + results = [] + for threshold in thresholds: + avg_score = _calculate_hn_pilot_average_score( + la_hp_suitability_scores, pilot_score_condition=lambda col: col > threshold + ) + results.append( + { + "DESNZ_pilot_fraction_threshold": threshold, + "HN_N_avg_score_weighted": avg_score, + } + ) + results_df = pl.DataFrame(results) + return results_df + + +def _calculate_hn_pilot_average_score( + la_hp_suitability_scores: pl.DataFrame, + pilot_score_condition: Callable[[pl.Expr], pl.Expr], +) -> float: + """ + Calculate the average Nesta Heat Network score for LSOAs in (`DESNZ_pilot_fraction > 0`) or not in (`DESNZ_pilot_fraction == 0`) DESNZ heat network pilot areas. + + Args: + la_hp_suitability_scores (pl.DataFrame): DataFrame containing 'DESNZ_pilot_fraction' and 'HN_N_avg_score_weighted' columns for a LA. + pilot_score_condition (Callable[[pl.Expr], pl.Expr]): The value of 'DESNZ_pilot_fraction' to filter by (must be >0 or 0). + + Returns: + float: Average 'HN_N_avg_score_weighted' for the filtered rows. + """ + # Apply the condition function to filter the DataFrame + filtered_la_hp_suitability_scores = la_hp_suitability_scores.filter( + pilot_score_condition(pl.col("DESNZ_pilot_fraction")) + ) + + # Calculate the average score + avg_score = filtered_la_hp_suitability_scores["HN_N_avg_score_weighted"].mean() + return avg_score + + +def calculate_mae_for_pilot_score( + hp_suitability_scores_with_desnz: pl.DataFrame, + condition_score_value: Callable[[pl.Expr], pl.Expr], +) -> float: + """ + Calculate the Mean Absolute Error (MAE) for entries satisfying a specified condition on 'DESNZ_pilot_fraction'. + + Args: + hp_suitability_scores_with_desnz (pl.DataFrame): DataFrame containing 'DESNZ_pilot_fraction' and 'absolute_error' columns. + condition_score_value (Callable[[pl.Expr], pl.Expr]): A function defining the condition to filter 'DESNZ_pilot_fraction'. + + Returns: + float: The MAE for the specified condition. + """ + # Apply the DEZNZ pilot score condition to filter the DataFrame + filtered_df = hp_suitability_scores_with_desnz.filter( + condition_score_value(pl.col("DESNZ_pilot_fraction")) + ) + mae = filtered_df["absolute_error"].mean() + return mae + + +def calculate_mae_for_all( + hp_suitability_scores: pl.DataFrame, desnz_col: str, nesta_hn_score_col: str +) -> Tuple[pl.DataFrame, float]: + """ + Calculate the Mean Absolute Error (MAE) between the DESNZ pilot score and Nesta's HN suitability score, add the absolute error column to the DataFrame, + and log the result. + + Args: + hp_suitability_scores (pl.DataFrame): DataFrame containing the DESNZ pilot (actual) and Nesta HN suitability score (predicted) columns. + desnz_col (str): Name of the column with DESNZ pilot (actual) values. + nesta_hn_score_col (str): Name of the column with Nesta HN suitability (predicted) values. + + Returns: + Tuple[pl.DataFrame, float]: + - Updated DataFrame with 'absolute_error' column added. + - The Mean Absolute Error (MAE) between the actual and predicted columns. + """ + # Calculate absolute error and add it as a new column + hp_suitability_scores_with_error = hp_suitability_scores.with_columns( + (pl.col(desnz_col) - pl.col(nesta_hn_score_col)).abs().alias("absolute_error") + ) + + # Calculate the mean of the absolute error + mae = hp_suitability_scores_with_error["absolute_error"].mean() + logging.info( + f"Mean Absolute Error (MAE) for {desnz_col} vs {nesta_hn_score_col}: {mae}" + ) + + return hp_suitability_scores_with_error, mae + + +if __name__ == "__main__": + # Argument parser for optional threshold + parser = argparse.ArgumentParser( + description="Process heat network zones and calculate scores." + ) + parser.add_argument( + "--optional_threshold", + type=float, + default=0.0, + help="Optional threshold for DESNZ pilot fraction for calculating average Nesta HN score.", + ) + args = parser.parse_args() + # Extract the optional_threshold value from args + optional_threshold = args.optional_threshold + # Define the output directory and set up logging + output_dir = os.path.join( + PROJECT_DIR, "asf_heat_pump_suitability/outputs/hn_zones/output_data/" + ) + setup_logging_and_file_path(output_dir=output_dir) + + # Load the data and perform spatial join + liverpool_with_desnz_hn_lsoa, list_of_liverpool_desnz_hn_lsoas = load_hn_geodata( + desnz_hn_gpkg_path=LIVERPOOL_GPKG_PATH, lsoa_shp_path=LSOA_SHP_PATH + ) + + logging.info("Loaded Liverpool with DESNZ HN LSOA data.") + liverpool_with_desnz_hn_lsoa.to_file( + filename=os.path.join(output_dir, "liverpool_with_desnz_hn_lsoa.gpkg"), + driver="GPKG", + ) + + # Process Nesta heat pump suitability scores + liverpool_hp_suitability_scores, liverpool_hp_suitability_lsoas = ( + filter_la_nesta_hp_scores( + nesta_hp_suitability_scores=NESTA_HP_SUITABILITY_PARQUET_PATH, + local_authority=LA_TO_ANALYSE, + ) + ) + logging.info("Processed Nesta HP suitability scores for Liverpool.") + + with open( + os.path.join(output_dir, "liverpool_hp_suitability_lsoas.json"), "w" + ) as file: + json.dump(liverpool_hp_suitability_lsoas, file) + + # Check LSOAs not in HP suitability scores + not_in_hp_suitability = set(list_of_liverpool_desnz_hn_lsoas) - set( + liverpool_hp_suitability_lsoas + ) + logging.info( + f"number of LSOAs in DESNZ list not in HP suitability data: {len(not_in_hp_suitability)}" + ) + + # Calculate and log average score + avg_hn_score = liverpool_hp_suitability_scores["HN_N_avg_score_weighted"].mean() + logging.info(f"Average HN_N_avg_score_weighted: {avg_hn_score}") + # Define the list of thresholds + thresholds = [ + round(i * 0.05, 2) for i in range(0, 20) + ] # Generates [0.0, 0.05, ..., 0.95] + + # Add DESNZ pilot score and calculate averages + ( + liverpool_hp_suitability_scores_with_desnz, + avg_hn_score_pilot_nonzero, + avg_hn_score_pilot_zero, + ) = add_DESNZ_pilot_fraction( + la_hp_suitability_scores=liverpool_hp_suitability_scores, + joined_gdf=liverpool_with_desnz_hn_lsoa, + optional_threshold=optional_threshold, + ) + logging.info( + f"Avg HN_N_avg_score_weighted for DESNZ_pilot_fraction when > {optional_threshold}: {avg_hn_score_pilot_nonzero}" + ) + logging.info( + f"Avg HN_N_avg_score_weighted for DESNZ_pilot_fraction = 0: {avg_hn_score_pilot_zero}" + ) + + # Calculate average scores for different thresholds + average_scores_df = calculate_average_scores_for_thresholds( + la_hp_suitability_scores=liverpool_hp_suitability_scores_with_desnz, + thresholds=thresholds, + ) + + # Save the results to CSV + average_scores_df.write_csv( + os.path.join(output_dir, "average_scores_by_threshold.csv") + ) + + # Calculate and log Mean Absolute Error (MAE) + liverpool_hp_suitability_scores_with_desnz, mae_all = calculate_mae_for_all( + hp_suitability_scores=liverpool_hp_suitability_scores_with_desnz, + desnz_col="DESNZ_pilot_fraction", + nesta_hn_score_col="HN_N_avg_score_weighted", + ) + mae_pilot_non_zero = calculate_mae_for_pilot_score( + hp_suitability_scores_with_desnz=liverpool_hp_suitability_scores_with_desnz, + condition_score_value=lambda col: col > 0, + ) + logging.info( + f"Mean Absolute Error (MAE) for DESNZ_pilot_fraction > 0: {mae_pilot_non_zero}" + ) + mae_pilot_zero = calculate_mae_for_pilot_score( + hp_suitability_scores_with_desnz=liverpool_hp_suitability_scores_with_desnz, + condition_score_value=lambda col: col == 0, + ) + logging.info( + f"Mean Absolute Error (MAE) for DESNZ_pilot_fraction = 0: {mae_pilot_zero}" + ) + + liverpool_hp_suitability_scores_with_desnz.write_parquet( + os.path.join(output_dir, "liverpool_hp_suitability_scores_with_desnz.parquet") + ) + # Write to CSV file + liverpool_hp_suitability_scores_with_desnz.write_csv( + os.path.join(output_dir, "liverpool_hp_suitability_scores_with_desnz.csv") + ) + logging.info("Saved processed data to this output directory:") + logging.info(output_dir) From 4100bfd16b663c195cb5eb684fe85a9b957c8bd1 Mon Sep 17 00:00:00 2001 From: helloaidank Date: Wed, 4 Dec 2024 19:53:27 +0000 Subject: [PATCH 4/6] small changes to methods script alongside adding plot of average hn score vs desnz fraction --- outputs/hn_zones/comparison_of_hn_zones.py | 10 +- .../hn_zones/plot_comparison_of_hn_zones.py | 387 ++++++++++++++++++ 2 files changed, 391 insertions(+), 6 deletions(-) create mode 100644 outputs/hn_zones/plot_comparison_of_hn_zones.py diff --git a/outputs/hn_zones/comparison_of_hn_zones.py b/outputs/hn_zones/comparison_of_hn_zones.py index 6e42c27..6d5ec53 100644 --- a/outputs/hn_zones/comparison_of_hn_zones.py +++ b/outputs/hn_zones/comparison_of_hn_zones.py @@ -343,9 +343,7 @@ def calculate_mae_for_all( # Extract the optional_threshold value from args optional_threshold = args.optional_threshold # Define the output directory and set up logging - output_dir = os.path.join( - PROJECT_DIR, "asf_heat_pump_suitability/outputs/hn_zones/output_data/" - ) + output_dir = os.path.join(PROJECT_DIR, "outputs/hn_zones/output_data/") setup_logging_and_file_path(output_dir=output_dir) # Load the data and perform spatial join @@ -412,9 +410,9 @@ def calculate_mae_for_all( thresholds=thresholds, ) - # Save the results to CSV - average_scores_df.write_csv( - os.path.join(output_dir, "average_scores_by_threshold.csv") + # Save the results to Parquet + average_scores_df.write_parquet( + os.path.join(output_dir, "average_scores_by_threshold.parquet") ) # Calculate and log Mean Absolute Error (MAE) diff --git a/outputs/hn_zones/plot_comparison_of_hn_zones.py b/outputs/hn_zones/plot_comparison_of_hn_zones.py new file mode 100644 index 0000000..5048e1e --- /dev/null +++ b/outputs/hn_zones/plot_comparison_of_hn_zones.py @@ -0,0 +1,387 @@ +""" +This script processes and visualises heat network (HN) and heat pump suitability data for Liverpool. + +Steps: +1. Load and preprocess data: + - Read Parquet, CSV, and GeoPackage files. + - Extract unique LSOA geometries. + - Load LSOA geometries and convert CRS if needed. + - Filter Liverpool LSOAs. + +2. Plot Liverpool LSOA geometries and save as PNG. + +3. Merge suitability scores with geometries and convert to GeoDataFrame. + +4. Plot and save overlay of DESNZ Pilot Score > 0 on Liverpool heat network zones. + +5. Define and use a function to plot absolute error maps by DESNZ Pilot Score, saving the plots as PNG files. + +6. Load and plot average Nesta HN scores against DESNZ Pilot Fraction along with plotting the best fit line and calculating the R^{2}. +""" + +import geopandas as gpd +import pandas as pd +import polars as pl +import matplotlib.pyplot as plt +import pyogrio +import json +import os +import logging +from asf_heat_pump_suitability import PROJECT_DIR +import numpy as np +from sklearn.metrics import r2_score + +# Set up logging configuration +logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") + +# Define constants for file paths (LEAVE UNCHANGED) +INPUT_DIR = os.path.join(PROJECT_DIR, "outputs/hn_zones/output_data/") + +LIVERPOOL_HP_SUITABILITY_PARQUET = os.path.join( + INPUT_DIR, "liverpool_hp_suitability_scores_with_desnz.parquet" +) + +AVERAGE_HN_SCORES_COVERAGE_PARQUET = os.path.join( + INPUT_DIR, "average_scores_by_threshold.parquet" +) +print(AVERAGE_HN_SCORES_COVERAGE_PARQUET) + +LSOA_SHP_PATH = "s3://asf-heat-pump-suitability/source_data/Lower_layer_Super_Output_Areas_2021_EW_BFE_V9_-9107090204806789093/LSOA_2021_EW_BFE_V9.shp" +LIVERPOOL_GPKG_PATH = "s3://asf-heat-pump-suitability/heat_network_desnz_data/heat-network-zone-map-Liverpool.gpkg" +LSOA_JSON_PATH = os.path.join(INPUT_DIR, "liverpool_hp_suitability_lsoas.json") +OUTPUT_DIR = os.path.join(PROJECT_DIR, "outputs/hn_zones/output_plots/") + +# Ensure the output directory exists +os.makedirs(OUTPUT_DIR, exist_ok=True) + +# Define a variable for target CRS to avoid hardcoding +TARGET_CRS = "EPSG:27700" + + +def load_data(): + """ + Load the necessary data files and return them. + + Returns: + Tuple: + - pd.DataFrame: DataFrame with Liverpool heat pump suitability scores. + - List[str]: List of LSOA codes for Liverpool. + - pl.DataFrame: DataFrame with average HN scores coverage. + Raises: + IOError: If any of the data files cannot be loaded. + """ + try: + logging.info("Loading data files...") + liverpool_hp_suitability_scores_pd = pd.read_parquet( + LIVERPOOL_HP_SUITABILITY_PARQUET + ) + with open(LSOA_JSON_PATH, "r") as file: + liverpool_hp_suitability_lsoas = json.load(file) + average_hn_scores_coverage_df = pl.read_parquet( + AVERAGE_HN_SCORES_COVERAGE_PARQUET + ) + + # Basic validation: Check expected columns in the main DataFrame + expected_cols = ["LSOA21CD", "DESNZ_pilot_fraction", "absolute_error"] + missing_cols = [ + col + for col in expected_cols + if col not in liverpool_hp_suitability_scores_pd.columns + ] + if missing_cols: + raise ValueError( + f"Missing expected columns in main DataFrame: {missing_cols}" + ) + + return ( + liverpool_hp_suitability_scores_pd, + liverpool_hp_suitability_lsoas, + average_hn_scores_coverage_df, + ) + except Exception as e: + logging.error(f"Error loading data: {e}") + raise IOError(f"Failed to load input data due to: {e}") + + +def preprocess_data( + liverpool_hp_suitability_lsoas: list, + lsoa_shp_path: str = LSOA_SHP_PATH, + target_crs: str = TARGET_CRS, +) -> gpd.GeoDataFrame: + """ + Load and preprocess LSOA geospatial data by extracting unique LSOA geometries and filtering Liverpool LSOAs. + + Args: + liverpool_hp_suitability_lsoas (list): List of LSOA codes for Liverpool. + lsoa_shp_path (str): Path to the LSOA shapefile. + target_crs (str): The target CRS to ensure consistent geospatial analysis. + + Returns: + gpd.GeoDataFrame: GeoDataFrame with filtered Liverpool LSOA geometries in the given CRS. + """ + logging.info("Preprocessing LSOA geometries...") + lsoa_geometries_gdf = gpd.read_file(lsoa_shp_path) + if lsoa_geometries_gdf.crs != target_crs: + logging.info(f"Converting to {target_crs}") + lsoa_geometries_gdf = lsoa_geometries_gdf.to_crs(target_crs) + liverpool_lsoa_geometries_gdf = lsoa_geometries_gdf[ + lsoa_geometries_gdf["LSOA21CD"].isin(liverpool_hp_suitability_lsoas) + ] + + if liverpool_lsoa_geometries_gdf.empty: + raise ValueError("No LSOAs found for Liverpool in the provided LSOA shapefile.") + + return liverpool_lsoa_geometries_gdf + + +def plot_lsoa_geometries(liverpool_lsoa_geometries_gdf, output_dir: str = OUTPUT_DIR): + """ + Plot Liverpool LSOA geometries and save as PNG. + + Args: + liverpool_lsoa_geometries_gdf (gpd.GeoDataFrame): GeoDataFrame with Liverpool LSOA geometries. + output_dir (str): Output directory to save the plot image. + """ + logging.info("Plotting LSOA geometries...") + fig, ax = plt.subplots(figsize=(12, 6)) + liverpool_lsoa_geometries_gdf.plot(ax=ax) + plt.title("Liverpool LSOA Geometries") + plt.axis("off") + plt.savefig(os.path.join(output_dir, "liverpool_data_lsoas.png")) + plt.savefig(os.path.join(output_dir, "liverpool_data_lsoas.pdf")) + plt.show() + + +def merge_data( + liverpool_hp_suitability_scores_pd: pd.DataFrame, + liverpool_lsoa_geometries_gdf: gpd.GeoDataFrame, +) -> gpd.GeoDataFrame: + """ + Merge suitability scores with geometries and convert to GeoDataFrame. + + Args: + liverpool_hp_suitability_scores_pd (pd.DataFrame): DataFrame with Liverpool heat pump suitability scores. + liverpool_lsoa_geometries_gdf (gpd.GeoDataFrame): GeoDataFrame with Liverpool LSOA geometries. + + Returns: + gpd.GeoDataFrame: GeoDataFrame with merged suitability scores and geometries. + + Raises: + ValueError: If the merged result is empty or missing geometry. + """ + logging.info("Merging suitability data with geometries...") + merged = liverpool_hp_suitability_scores_pd.merge( + liverpool_lsoa_geometries_gdf, on="LSOA21CD", how="left" + ) + + # Validate merged data + if merged.empty: + raise ValueError("Merging resulted in an empty DataFrame. Check LSOA codes.") + if merged["geometry"].isna().any(): + raise ValueError("Some records are missing geometry after merging.") + + liverpool_hp_suitability_gdf = gpd.GeoDataFrame( + merged, geometry="geometry", crs=TARGET_CRS + ) + return liverpool_hp_suitability_gdf + + +def plot_overlay( + liverpool_hp_suitability_gdf: gpd.GeoDataFrame, + liverpool_hnz_filepath: str = LIVERPOOL_GPKG_PATH, + output_dir: str = OUTPUT_DIR, +): + """ + Overlay of DESNZ pilot heat network zones on LSOAs in Liverpool. + + Args: + liverpool_hp_suitability_gdf (gpd.GeoDataFrame): GeoDataFrame with Liverpool heat pump suitability scores and geometries. + liverpool_hnz_filepath (str): Path to the Liverpool heat network zones GeoPackage. + output_dir (str): Output directory to save the plot image. + """ + logging.info("Plotting overlay of DESNZ pilot heat network zones...") + gdf = pyogrio.read_dataframe( + liverpool_hnz_filepath, layer="heat-network-zone-map-Liverpool" + ) + fig, ax = plt.subplots(figsize=(12, 6)) + gdf.plot(ax=ax, color="blue", alpha=0.5, label="DESNZ pilot heat network zones") + liverpool_hp_suitability_gdf[ + liverpool_hp_suitability_gdf["DESNZ_pilot_fraction"] > 0.0 + ].plot(ax=ax, color="pink", alpha=0.5, label="LSOAs with DESNZ pilot heat network") + plt.title("Overlay of DESNZ pilot heat network zones on Liverpool LSOAs") + plt.axis("off") + plt.savefig(os.path.join(output_dir, "overlay_desnz_pilot.png")) + plt.savefig(os.path.join(output_dir, "overlay_desnz_pilot.pdf")) + plt.show() + + +def plot_absolute_error_map( + hp_suitability_gdf: gpd.GeoDataFrame, + score: float, + title: str, + filename: str, + output_dir: str = OUTPUT_DIR, +): + """ + Plot an absolute error map for areas in Liverpool based on the DESNZ pilot score. + + Args: + hp_suitability_gdf (gpd.GeoDataFrame): GeoDataFrame containing Liverpool's LSOA geometries and + heat pump suitability scores, including 'absolute_error' and + 'DESNZ_pilot_fraction' columns. + score (float): DESNZ pilot score threshold. + If > 0.0, plot regions where DESNZ_pilot_fraction >= score. + If 0, plot regions where DESNZ_pilot_fraction == 0. + title (str): Title for the plot. + filename (str): Filename (PNG) to save the plot. + output_dir (str): Output directory to save the plot image. + + Raises: + ValueError: If score < 0. + """ + logging.info(f"Plotting absolute error map for score {score}...") + if score < 0: + raise ValueError("Score must be a non-negative value.") + + fig, ax = plt.subplots(figsize=(12, 6)) + + if score > 0.0: + subset = hp_suitability_gdf[hp_suitability_gdf["DESNZ_pilot_fraction"] >= score] + other = hp_suitability_gdf[hp_suitability_gdf["DESNZ_pilot_fraction"] < score] + else: # score == 0 + subset = hp_suitability_gdf[hp_suitability_gdf["DESNZ_pilot_fraction"] == 0] + other = hp_suitability_gdf[hp_suitability_gdf["DESNZ_pilot_fraction"] > 0] + + if subset.empty: + logging.warning(f"No regions found for the given score threshold: {score}") + + subset.plot(column="absolute_error", ax=ax, legend=True, cmap="inferno") + other.plot(ax=ax, edgecolor="black", facecolor="none", linewidth=0.5) + + # Update legend labeling for clarity + cbar = ax.get_figure().get_axes()[1] + cbar.set_ylabel( + "Absolute Error (|Avg. HN Score - DESNZ HN Score|)", rotation=90, labelpad=15 + ) + + plt.title(title) + plt.axis("off") + plt.savefig(os.path.join(output_dir, f"{filename}.png")) + plt.savefig(os.path.join(output_dir, f"{filename}.pdf")) + plt.show() + + +def plot_hn_avg_score_vs_fraction_threshold( + average_hn_scores_coverage_df: pl.DataFrame, + fraction_thresholds_col: str, + hn_avg_scores_col: str, + filename: str, + output_dir: str = OUTPUT_DIR, +): + """ + Plot the average Nesta HN score against DESNZ Pilot Fraction Threshold, fit a regression line, + and display R^2. + + Args: + average_hn_scores_coverage_df (pl.DataFrame): DataFrame with average Nesta HN scores and DESNZ Pilot Fraction. + fraction_thresholds_col (str): Column name for DESNZ Pilot Fraction Threshold. + hn_avg_scores_col (str): Column name for HN_N Avg Score Weighted. + """ + logging.info("Plotting average HN score vs DESNZ Pilot Fraction Threshold...") + fraction_thresholds = average_hn_scores_coverage_df[ + fraction_thresholds_col + ].to_list() + hn_avg_scores = average_hn_scores_coverage_df[hn_avg_scores_col].to_list() + + x = np.array(fraction_thresholds) + y = np.array(hn_avg_scores) + + # Fit a linear model + coefficients = np.polyfit(x, y, 1) + model = np.poly1d(coefficients) + predicted_y = model(x) + r_squared = r2_score(y, predicted_y) + + y_min = min(hn_avg_scores) - 0.05 + y_max = max(hn_avg_scores) + 0.05 + + fig, ax = plt.subplots(figsize=(10, 6)) + ax.plot( + fraction_thresholds, + hn_avg_scores, + marker="o", + linestyle="-", + linewidth=2, + label="Data", + ) + ax.plot( + fraction_thresholds, + predicted_y, + color="red", + linestyle="--", + linewidth=2, + label="Regression line", + ) + + ax.set_ylim(y_min, y_max) + ax.spines["top"].set_visible(False) + ax.spines["right"].set_visible(False) + + # Break mark on the y-axis + d = 0.015 + kwargs = dict(transform=ax.transAxes, color="k", clip_on=False) + ax.plot((-d, +d), (-d, +d), **kwargs) + ax.plot((-d, +d), (1 - d, 1 + d), **kwargs) + + ax.set_title( + "Average Nesta HN Score vs Fraction of DESNZ HN zone in LSOA", fontsize=14 + ) + ax.set_xlabel("Fraction of DESNZ HN zone in LSOA", fontsize=12) + ax.set_ylabel("Average Nesta HN Score", fontsize=12) + ax.grid(True) + ax.text( + 0.05, + 0.95, + f"$R^2 = {r_squared:.2f}$", + transform=ax.transAxes, + fontsize=12, + verticalalignment="top", + bbox=dict(facecolor="white", alpha=0.8, edgecolor="none"), + ) + ax.legend() + plt.tight_layout() + plt.savefig(os.path.join(output_dir, f"{filename}.png")) + plt.savefig(os.path.join(output_dir, f"{filename}.pdf")) + plt.show() + + +if __name__ == "__main__": + ( + liverpool_hp_suitability_scores_pd, + liverpool_hp_suitability_lsoas, + average_hn_scores_coverage_df, + ) = load_data() + liverpool_lsoa_geometries_gdf = preprocess_data(liverpool_hp_suitability_lsoas) + plot_lsoa_geometries(liverpool_lsoa_geometries_gdf) + liverpool_hp_suitability_gdf = merge_data( + liverpool_hp_suitability_scores_pd, liverpool_lsoa_geometries_gdf + ) + plot_overlay(liverpool_hp_suitability_gdf) + plot_absolute_error_map( + liverpool_hp_suitability_gdf, + score=0.000001, + title="Absolute Error Map for when a DESNZ HN zone is present for Liverpool", + filename="liverpool_data_with_pilot_score_1_v2", + ) + plot_absolute_error_map( + liverpool_hp_suitability_gdf, + score=0, + title="Absolute Error Map for when a DESNZ HN zone is absent for Liverpool", + filename="liverpool_data_with_pilot_score_0_v2", + ) + plot_hn_avg_score_vs_fraction_threshold( + average_hn_scores_coverage_df, + fraction_thresholds_col="DESNZ_pilot_fraction_threshold", + hn_avg_scores_col="HN_N_avg_score_weighted", + filename="HN_N_avg_score_vs_fraction_cover_by_DESNZ_HN", + ) From 128474ba7fe9b5dbcd4b751c79ce6e0257682e92 Mon Sep 17 00:00:00 2001 From: helloaidank Date: Mon, 23 Dec 2024 16:33:08 +0000 Subject: [PATCH 5/6] suggested changes --- outputs/hn_zones/comparison_of_hn_zones.py | 233 +++++++++++++++------ 1 file changed, 174 insertions(+), 59 deletions(-) diff --git a/outputs/hn_zones/comparison_of_hn_zones.py b/outputs/hn_zones/comparison_of_hn_zones.py index 6d5ec53..69d3092 100644 --- a/outputs/hn_zones/comparison_of_hn_zones.py +++ b/outputs/hn_zones/comparison_of_hn_zones.py @@ -26,31 +26,41 @@ **How to Run the Script**: To run the script, use the following command: -python comparison_of_hn_zones.py [--optional_threshold OPTIONAL_THRESHOLD] +python comparison_of_hn_zones.py [--optional_threshold OPTIONAL_THRESHOLD] [--read_in_s3 READ_IN_S3] [--save_to_s3 SAVE_TO_S3] + +Example: + # Run the script with default settings (local files, no threshold) + python comparison_of_hn_zones.py + + # Run the script with a specific threshold + python comparison_of_hn_zones.py --optional_threshold 0.1 + + # Run the script with files from S3 + python comparison_of_hn_zones.py --read_in_s3 True + + # Run the script and save outputs to S3 + python comparison_of_hn_zones.py --save_to_s3 True + + # Run the script with a specific threshold, read from S3, and save to S3 + python comparison_of_hn_zones.py --optional_threshold 0.1 --read_in_s3 True --save_to_s3 True """ import geopandas as gpd import pyogrio import polars as pl -from typing import Tuple, List, Callable, Optional +from typing import Tuple, List, Optional import logging import json import os -from shapely.geometry import Polygon from asf_heat_pump_suitability import PROJECT_DIR import argparse +import boto3 # Define file paths -# LIVERPOOL_GPKG_PATH = "s3://asf-heat-pump-suitability/heat_network_desnz_data/heat-network-zone-map-Liverpool.gpkg" -# LSOA_SHP_PATH = "s3://asf-heat-pump-suitability/source_data/Lower_layer_Super_Output_Areas_2021_EW_BFE_V9_-9107090204806789093/LSOA_2021_EW_BFE_V9.shp" -# NESTA_HP_SUITABILITY_PARQUET_PATH = "s3://nesta-open-data/asf_heat_pump_suitability/2023Q4/20240925_2023_Q4_EPC_heat_pump_suitability_per_lsoa.parquet" LA_TO_ANALYSE = "Liverpool" -LIVERPOOL_GPKG_PATH = "heat-network-zone-map-Liverpool.gpkg" -LSOA_SHP_PATH = "LSOA_2021_EW_BFE_V9.shp" -NESTA_HP_SUITABILITY_PARQUET_PATH = ( - "20240925_2023_Q4_EPC_heat_pump_suitability_per_lsoa.parquet" -) +S3_BUCKET = "asf-heat-pump-suitability" +S3_KEY_DIR = "evaluation/desnz_hn_zone_scores/" def setup_logging_and_file_path( @@ -85,14 +95,54 @@ def setup_logging_and_file_path( logging.info(f"Logging setup complete. Logs are saved to {log_file_path}.") -def load_hn_geodata( +def setup_paths(read_in_s3: bool): + """ + Set up the paths based on whether to read from S3 or locally. + + Args: + read_in_s3 (bool): If True, set up paths to read from S3. Otherwise, set up local paths. + + Returns: + Tuple[str, str, str]: Paths for LIVERPOOL_GPKG_PATH, LSOA_SHP_PATH, and NESTA_HP_SUITABILITY_PARQUET_PATH. + """ + if read_in_s3: + LIVERPOOL_GPKG_PATH = "s3://asf-heat-pump-suitability/heat_network_desnz_data/heat-network-zone-map-Liverpool.gpkg" + LSOA_SHP_PATH = "s3://asf-heat-pump-suitability/source_data/Lower_layer_Super_Output_Areas_2021_EW_BFE_V9_-9107090204806789093/LSOA_2021_EW_BFE_V9.shp" + NESTA_HP_SUITABILITY_PARQUET_PATH = "s3://nesta-open-data/asf_heat_pump_suitability/2023Q4/20240925_2023_Q4_EPC_heat_pump_suitability_per_lsoa.parquet" + else: + LIVERPOOL_GPKG_PATH = "heat-network-zone-map-Liverpool.gpkg" + LSOA_SHP_PATH = "LSOA_2021_EW_BFE_V9.shp" + NESTA_HP_SUITABILITY_PARQUET_PATH = ( + "20240925_2023_Q4_EPC_heat_pump_suitability_per_lsoa.parquet" + ) + return LIVERPOOL_GPKG_PATH, LSOA_SHP_PATH, NESTA_HP_SUITABILITY_PARQUET_PATH + + +def optionally_upload_file_to_s3( + local_file_path: str, s3_bucket: str, s3_key: str, save_to_s3: bool +) -> None: + """ + Upload a local file to an S3 bucket. + + Args: + local_file_path (str): Path to the local file. + s3_bucket (str): Name of the S3 bucket. + s3_key (str): S3 key (path) where the file should be uploaded. + """ + if save_to_s3: + s3_client = boto3.client("s3") + s3_client.upload_file(local_file_path, s3_bucket, s3_key) + logging.info(f"File uploaded to s3://{s3_bucket}/{s3_key}") + + +def load_transform_hn_geodata( desnz_hn_gpkg_path: str, lsoa_shp_path: str ) -> Tuple[gpd.GeoDataFrame, List[str]]: """ - Load the HN GeoPackage file and perform a spatial join with LSOA polygons to add a column of LSOA codes. + Load the Heat Networks GeoPackage file, perform a spatial join with LSOA polygons to add a column of LSOA codes and calculates the intersection area and the fraction of LSOA area covered by HN zones. Args: - desnz_hn_gpkg_path (str): Path to the GeoPackage file with the HN zones. + desnz_hn_gpkg_path (str): Path to the GeoPackage file with the Heat Network zones. lsoa_shp_path (str): Path to the LSOA shapefile. Returns: @@ -122,22 +172,12 @@ def load_hn_geodata( how="left", predicate="intersects", ) - - # Calculate the intersection area and the fraction of LSOA area covered by HN zones - joined_gdf["intersection_area"] = joined_gdf.apply( - lambda row: row.geometry.intersection( - lsoa_gdf.loc[row.index_right, "geometry"] - ).area, - axis=1, - ) lsoa_gdf["total_area"] = lsoa_gdf.geometry.area - - # Merge the LSOA geometries into the joined GeoDataFrame - joined_gdf = joined_gdf.merge(lsoa_gdf[["LSOA21CD", "total_area"]], on="LSOA21CD") - - # Extract unique LSOA codes from the joined GeoDataFrame + # Get all intersections between DESNZ heat network zones and LSOAs + joined_gdf = gpd.overlay(desnz_hn_gdf, lsoa_gdf, how="intersection") + # Calculate area of intersections joined_gdf["fraction_covered"] = ( - joined_gdf["intersection_area"] / joined_gdf["total_area"] + joined_gdf["geometry"].area / joined_gdf["total_area"] ) desnz_hn_unique_lsoas = joined_gdf["LSOA21CD"].dropna().unique().tolist() return joined_gdf, desnz_hn_unique_lsoas @@ -187,10 +227,11 @@ def add_DESNZ_pilot_fraction( Args: la_hp_suitability_scores (pl.DataFrame): Data containing 'LSOA21CD' and 'HN_N_avg_score_weighted' columns for a single LA. joined_gdf (gpd.GeoDataFrame): GeoDataFrame containing 'LSOA21CD' and 'fraction_covered' columns. + optional_threshold (Optional[float]): Optional threshold for fraction of HN zone area contained within LA. Defaults to 0. Returns: Tuple[pl.DataFrame, float, float]: - - Updated DataFrame with 'DESNZ_pilot_fraction' column added (fraction of area covered by HN zones). + - Updated DataFrame with 'DESNZ_pilot_fraction' column added (fraction of local authority area covered by HN zones). - Average 'HN_N_avg_score_weighted' for rows where 'DESNZ_pilot_fraction' is non-zero. - Average 'HN_N_avg_score_weighted' for rows where 'DESNZ_pilot_fraction' is zero. """ @@ -215,10 +256,11 @@ def add_DESNZ_pilot_fraction( # Calculating the average Nesta HN score for non-zero and zero DESNZ pilot scores avg_hn_score_pilot_nonzero = _calculate_hn_pilot_average_score( la_hp_suitability_scores, - pilot_score_condition=lambda col: col > optional_threshold, + hn_zones=True, + optional_threshold=optional_threshold, ) avg_hn_score_pilot_zero = _calculate_hn_pilot_average_score( - la_hp_suitability_scores, pilot_score_condition=lambda col: col == 0 + la_hp_suitability_scores, hn_zones=False ) return la_hp_suitability_scores, avg_hn_score_pilot_nonzero, avg_hn_score_pilot_zero @@ -239,7 +281,7 @@ def calculate_average_scores_for_thresholds( results = [] for threshold in thresholds: avg_score = _calculate_hn_pilot_average_score( - la_hp_suitability_scores, pilot_score_condition=lambda col: col > threshold + la_hp_suitability_scores, hn_zones=True, optional_threshold=threshold ) results.append( { @@ -253,22 +295,30 @@ def calculate_average_scores_for_thresholds( def _calculate_hn_pilot_average_score( la_hp_suitability_scores: pl.DataFrame, - pilot_score_condition: Callable[[pl.Expr], pl.Expr], + hn_zones: bool, + optional_threshold: Optional[float] = 0, ) -> float: """ - Calculate the average Nesta Heat Network score for LSOAs in (`DESNZ_pilot_fraction > 0`) or not in (`DESNZ_pilot_fraction == 0`) DESNZ heat network pilot areas. + Calculate the average Nesta Heat Network score for LSOAs in (`DESNZ_pilot_fraction > optional_threshold`) or not in (`DESNZ_pilot_fraction == 0`) DESNZ heat network pilot areas. Args: la_hp_suitability_scores (pl.DataFrame): DataFrame containing 'DESNZ_pilot_fraction' and 'HN_N_avg_score_weighted' columns for a LA. - pilot_score_condition (Callable[[pl.Expr], pl.Expr]): The value of 'DESNZ_pilot_fraction' to filter by (must be >0 or 0). + hn_zones (bool): If True, calculate average Nesta heat network score for LSOAs in DESNZ heat network zones. Set to False to calculate the average score for LSOAs not in heat network zones. + optional_threshold (Optional[float]): The threshold value for 'DESNZ_pilot_fraction'. Defaults to 0. Range: 0-1. Returns: float: Average 'HN_N_avg_score_weighted' for the filtered rows. """ - # Apply the condition function to filter the DataFrame - filtered_la_hp_suitability_scores = la_hp_suitability_scores.filter( - pilot_score_condition(pl.col("DESNZ_pilot_fraction")) - ) + if hn_zones: + # Filter for LSOAs in DESNZ heat network zones + filtered_la_hp_suitability_scores = la_hp_suitability_scores.filter( + pl.col("DESNZ_pilot_fraction") > optional_threshold + ) + else: + # Filter for LSOAs not in DESNZ heat network zones + filtered_la_hp_suitability_scores = la_hp_suitability_scores.filter( + pl.col("DESNZ_pilot_fraction") == 0 + ) # Calculate the average score avg_score = filtered_la_hp_suitability_scores["HN_N_avg_score_weighted"].mean() @@ -277,22 +327,32 @@ def _calculate_hn_pilot_average_score( def calculate_mae_for_pilot_score( hp_suitability_scores_with_desnz: pl.DataFrame, - condition_score_value: Callable[[pl.Expr], pl.Expr], + hn_zones: bool, + optional_threshold: Optional[float] = 0, ) -> float: """ - Calculate the Mean Absolute Error (MAE) for entries satisfying a specified condition on 'DESNZ_pilot_fraction'. + Calculate the Mean Absolute Error (MAE) for entries in or not in DESNZ heat network zones. Args: hp_suitability_scores_with_desnz (pl.DataFrame): DataFrame containing 'DESNZ_pilot_fraction' and 'absolute_error' columns. - condition_score_value (Callable[[pl.Expr], pl.Expr]): A function defining the condition to filter 'DESNZ_pilot_fraction'. + hn_zones (bool): If True, calculate MAE for entries in DESNZ heat network zones. Set to False to calculate MAE for entries not in heat network zones. + optional_threshold (Optional[float]): The threshold value for 'DESNZ_pilot_fraction'. Defaults to 0. Range: 0-1. Returns: float: The MAE for the specified condition. """ - # Apply the DEZNZ pilot score condition to filter the DataFrame - filtered_df = hp_suitability_scores_with_desnz.filter( - condition_score_value(pl.col("DESNZ_pilot_fraction")) - ) + if hn_zones: + # Filter for entries in DESNZ heat network zones + filtered_df = hp_suitability_scores_with_desnz.filter( + pl.col("DESNZ_pilot_fraction") > optional_threshold + ) + else: + # Filter for entries not in DESNZ heat network zones + filtered_df = hp_suitability_scores_with_desnz.filter( + pl.col("DESNZ_pilot_fraction") == 0 + ) + + # Calculate the MAE mae = filtered_df["absolute_error"].mean() return mae @@ -337,25 +397,55 @@ def calculate_mae_for_all( "--optional_threshold", type=float, default=0.0, - help="Optional threshold for DESNZ pilot fraction for calculating average Nesta HN score.", + help="Optional threshold for DESNZ pilot fraction for calculating average Nesta HN score. Range: 0-1.", + ) + parser.add_argument( + "--read_in_s3", + type=bool, + default=False, + help="Read in the input files from S3.", + ) + parser.add_argument( + "--save_to_s3", + type=bool, + default=False, + help="Save the output files to S3.", ) args = parser.parse_args() # Extract the optional_threshold value from args optional_threshold = args.optional_threshold + save_to_s3 = args.save_to_s3 + read_in_s3 = args.read_in_s3 + LIVERPOOL_GPKG_PATH, LSOA_SHP_PATH, NESTA_HP_SUITABILITY_PARQUET_PATH = setup_paths( + read_in_s3=read_in_s3 + ) # Define the output directory and set up logging output_dir = os.path.join(PROJECT_DIR, "outputs/hn_zones/output_data/") + setup_logging_and_file_path(output_dir=output_dir) # Load the data and perform spatial join - liverpool_with_desnz_hn_lsoa, list_of_liverpool_desnz_hn_lsoas = load_hn_geodata( - desnz_hn_gpkg_path=LIVERPOOL_GPKG_PATH, lsoa_shp_path=LSOA_SHP_PATH + liverpool_with_desnz_hn_lsoa, list_of_liverpool_desnz_hn_lsoas = ( + load_transform_hn_geodata( + desnz_hn_gpkg_path=LIVERPOOL_GPKG_PATH, lsoa_shp_path=LSOA_SHP_PATH + ) ) logging.info("Loaded Liverpool with DESNZ HN LSOA data.") + + liv_desnz_hn_filename = "liverpool_with_desnz_hn_lsoa.gpkg" + liv_desnz_hn_local_file_path = os.path.join(output_dir, liv_desnz_hn_filename) liverpool_with_desnz_hn_lsoa.to_file( - filename=os.path.join(output_dir, "liverpool_with_desnz_hn_lsoa.gpkg"), + filename=liv_desnz_hn_local_file_path, driver="GPKG", ) + liv_desnz_hn_s3_key = f"{S3_KEY_DIR}{liv_desnz_hn_filename}" + optionally_upload_file_to_s3( + local_file_path=liv_desnz_hn_local_file_path, + s3_bucket=S3_BUCKET, + s3_key=liv_desnz_hn_s3_key, + save_to_s3=save_to_s3, + ) # Process Nesta heat pump suitability scores liverpool_hp_suitability_scores, liverpool_hp_suitability_lsoas = ( @@ -365,12 +455,21 @@ def calculate_mae_for_all( ) ) logging.info("Processed Nesta HP suitability scores for Liverpool.") + # Define JSON file name and paths + lsoas_json_filename = "liverpool_hp_suitability_lsoas.json" + lsoas_json_local_file_path = os.path.join(output_dir, lsoas_json_filename) + lsoas_json_s3_key = f"{S3_KEY_DIR}{lsoas_json_filename}" - with open( - os.path.join(output_dir, "liverpool_hp_suitability_lsoas.json"), "w" - ) as file: + with open(os.path.join(output_dir, lsoas_json_filename), "w") as file: json.dump(liverpool_hp_suitability_lsoas, file) + optionally_upload_file_to_s3( + local_file_path=lsoas_json_local_file_path, + s3_bucket=S3_BUCKET, + s3_key=lsoas_json_s3_key, + save_to_s3=save_to_s3, + ) + # Check LSOAs not in HP suitability scores not_in_hp_suitability = set(list_of_liverpool_desnz_hn_lsoas) - set( liverpool_hp_suitability_lsoas @@ -411,8 +510,14 @@ def calculate_mae_for_all( ) # Save the results to Parquet - average_scores_df.write_parquet( - os.path.join(output_dir, "average_scores_by_threshold.parquet") + avg_score_parquet_filename = "average_scores_by_threshold.parquet" + avg_score_parquet_filepath = os.path.join(output_dir, avg_score_parquet_filename) + average_scores_df.write_parquet(avg_score_parquet_filepath) + optionally_upload_file_to_s3( + local_file_path=avg_score_parquet_filepath, + s3_bucket=S3_BUCKET, + s3_key=f"{S3_KEY_DIR}{avg_score_parquet_filename}", + save_to_s3=save_to_s3, ) # Calculate and log Mean Absolute Error (MAE) @@ -423,21 +528,31 @@ def calculate_mae_for_all( ) mae_pilot_non_zero = calculate_mae_for_pilot_score( hp_suitability_scores_with_desnz=liverpool_hp_suitability_scores_with_desnz, - condition_score_value=lambda col: col > 0, + hn_zones=True, ) logging.info( f"Mean Absolute Error (MAE) for DESNZ_pilot_fraction > 0: {mae_pilot_non_zero}" ) mae_pilot_zero = calculate_mae_for_pilot_score( hp_suitability_scores_with_desnz=liverpool_hp_suitability_scores_with_desnz, - condition_score_value=lambda col: col == 0, + hn_zones=False, ) logging.info( f"Mean Absolute Error (MAE) for DESNZ_pilot_fraction = 0: {mae_pilot_zero}" ) - + # Define Parquet file name and paths for MAE results + mae_parquet_filename = "liverpool_hp_suitability_scores_with_desnz.parquet" + mae_parquet_local_file_path = os.path.join(output_dir, mae_parquet_filename) + mae_parquet_s3_key = f"{S3_KEY_DIR}{mae_parquet_filename}" liverpool_hp_suitability_scores_with_desnz.write_parquet( - os.path.join(output_dir, "liverpool_hp_suitability_scores_with_desnz.parquet") + mae_parquet_local_file_path + ) + + optionally_upload_file_to_s3( + local_file_path=mae_parquet_local_file_path, + s3_bucket=S3_BUCKET, + s3_key=mae_parquet_s3_key, + save_to_s3=save_to_s3, ) # Write to CSV file liverpool_hp_suitability_scores_with_desnz.write_csv( From 94e813046624d055a518ea5587bfd7a903c32269 Mon Sep 17 00:00:00 2001 From: helloaidank Date: Mon, 23 Dec 2024 16:46:06 +0000 Subject: [PATCH 6/6] changes to the plotting script --- .../hn_zones/plot_comparison_of_hn_zones.py | 135 +++++++++++++----- 1 file changed, 100 insertions(+), 35 deletions(-) diff --git a/outputs/hn_zones/plot_comparison_of_hn_zones.py b/outputs/hn_zones/plot_comparison_of_hn_zones.py index 5048e1e..4b3f022 100644 --- a/outputs/hn_zones/plot_comparison_of_hn_zones.py +++ b/outputs/hn_zones/plot_comparison_of_hn_zones.py @@ -1,5 +1,5 @@ """ -This script processes and visualises heat network (HN) and heat pump suitability data for Liverpool. +This script processes and visualises DESNZ heat network (HN) and Nesta heat pump suitability data for Liverpool. Steps: 1. Load and preprocess data: @@ -16,7 +16,20 @@ 5. Define and use a function to plot absolute error maps by DESNZ Pilot Score, saving the plots as PNG files. -6. Load and plot average Nesta HN scores against DESNZ Pilot Fraction along with plotting the best fit line and calculating the R^{2}. +6. Load and plot average Nesta HN scores against DESNZ Pilot Fraction threshold along with plotting the best fit line and calculating the R^{2}. + +Usage: + python plot_comparison_of_hn_zones.py [--read_from_s3] + +Arguments: + --read_from_s3 (bool): If True, read the input files from S3 bucket. Defaults to False. + +Example: + # Run the script with local files + python plot_comparison_of_hn_zones.py + + # Run the script with files from S3 + python plot_comparison_of_hn_zones.py --read_from_s3 True """ import geopandas as gpd @@ -30,56 +43,92 @@ from asf_heat_pump_suitability import PROJECT_DIR import numpy as np from sklearn.metrics import r2_score +import argparse +import boto3 # Set up logging configuration logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") - -# Define constants for file paths (LEAVE UNCHANGED) +# Define local directories and file paths INPUT_DIR = os.path.join(PROJECT_DIR, "outputs/hn_zones/output_data/") - -LIVERPOOL_HP_SUITABILITY_PARQUET = os.path.join( +LOCAL_LIVERPOOL_HP_SUITABILITY_PARQUET = os.path.join( INPUT_DIR, "liverpool_hp_suitability_scores_with_desnz.parquet" ) - -AVERAGE_HN_SCORES_COVERAGE_PARQUET = os.path.join( +LOCAL_AVERAGE_HN_SCORES_COVERAGE_PARQUET = os.path.join( INPUT_DIR, "average_scores_by_threshold.parquet" ) -print(AVERAGE_HN_SCORES_COVERAGE_PARQUET) +LOCAL_LSOA_JSON_PATH = os.path.join(INPUT_DIR, "liverpool_hp_suitability_lsoas.json") LSOA_SHP_PATH = "s3://asf-heat-pump-suitability/source_data/Lower_layer_Super_Output_Areas_2021_EW_BFE_V9_-9107090204806789093/LSOA_2021_EW_BFE_V9.shp" LIVERPOOL_GPKG_PATH = "s3://asf-heat-pump-suitability/heat_network_desnz_data/heat-network-zone-map-Liverpool.gpkg" -LSOA_JSON_PATH = os.path.join(INPUT_DIR, "liverpool_hp_suitability_lsoas.json") + +# Define S3 variables and paths +S3_BUCKET = "asf-heat-pump-suitability" +S3_KEY_DIR = "evaluation/desnz_hn_zone_scores/" +S3_KEY_LIVERPOOL_HP_SUITABILITY_PARQUET = ( + f"{S3_KEY_DIR}liverpool_hp_suitability_scores_with_desnz.parquet" +) +S3_KEY_AVERAGE_HN_SCORES_COVERAGE_PARQUET = ( + f"{S3_KEY_DIR}average_scores_by_threshold.parquet" +) +S3_KEY_LSOA_JSON_PATH = f"{S3_KEY_DIR}liverpool_hp_suitability_lsoas.json" +# Define output directory OUTPUT_DIR = os.path.join(PROJECT_DIR, "outputs/hn_zones/output_plots/") # Ensure the output directory exists os.makedirs(OUTPUT_DIR, exist_ok=True) + # Define a variable for target CRS to avoid hardcoding TARGET_CRS = "EPSG:27700" -def load_data(): +def load_data(read_from_s3: bool = False) -> tuple: """ Load the necessary data files and return them. + Args: + read_from_s3 (bool): If True, read the input files from S3. If False, read the input files locally. + Returns: Tuple: - - pd.DataFrame: DataFrame with Liverpool heat pump suitability scores. + - pl.DataFrame: DataFrame with Liverpool heat pump suitability scores. - List[str]: List of LSOA codes for Liverpool. - - pl.DataFrame: DataFrame with average HN scores coverage. + - pl.DataFrame: DataFrame with the average heat network scores for different area thresholds (threshold is the fraction of an LSOA which is covered by a DESNZ HN zone). Raises: IOError: If any of the data files cannot be loaded. """ try: logging.info("Loading data files...") - liverpool_hp_suitability_scores_pd = pd.read_parquet( - LIVERPOOL_HP_SUITABILITY_PARQUET - ) - with open(LSOA_JSON_PATH, "r") as file: - liverpool_hp_suitability_lsoas = json.load(file) - average_hn_scores_coverage_df = pl.read_parquet( - AVERAGE_HN_SCORES_COVERAGE_PARQUET - ) + if read_from_s3: + # Read from S3 + s3_client = boto3.client("s3") + + # Read Liverpool heat pump suitability scores + liv_hps_obj = s3_client.get_object( + Bucket=S3_BUCKET, Key=S3_KEY_LIVERPOOL_HP_SUITABILITY_PARQUET + ) + liverpool_hp_suitability_scores_pd = pd.read_parquet(liv_hps_obj["Body"]) + + # Read LSOA JSON + liv_hps_lsoas_obj = s3_client.get_object( + Bucket=S3_BUCKET, Key=S3_KEY_LSOA_JSON_PATH + ) + liverpool_lsoas = json.loads(liv_hps_lsoas_obj["Body"].read()) + + # Read average HN scores coverage + avg_hn_scores_obj = s3_client.get_object( + Bucket=S3_BUCKET, Key=S3_KEY_AVERAGE_HN_SCORES_COVERAGE_PARQUET + ) + average_hn_scores_coverage_df = pl.read_parquet(avg_hn_scores_obj["Body"]) + else: + liverpool_hp_suitability_scores_pd = pd.read_parquet( + LOCAL_LIVERPOOL_HP_SUITABILITY_PARQUET + ) + with open(LOCAL_LSOA_JSON_PATH, "r") as file: + liverpool_lsoas = json.load(file) + average_hn_scores_coverage_df = pl.read_parquet( + LOCAL_AVERAGE_HN_SCORES_COVERAGE_PARQUET + ) # Basic validation: Check expected columns in the main DataFrame expected_cols = ["LSOA21CD", "DESNZ_pilot_fraction", "absolute_error"] @@ -90,12 +139,12 @@ def load_data(): ] if missing_cols: raise ValueError( - f"Missing expected columns in main DataFrame: {missing_cols}" + f"Missing expected columns in hp suitability df: {missing_cols}" ) return ( liverpool_hp_suitability_scores_pd, - liverpool_hp_suitability_lsoas, + liverpool_lsoas, average_hn_scores_coverage_df, ) except Exception as e: @@ -104,7 +153,7 @@ def load_data(): def preprocess_data( - liverpool_hp_suitability_lsoas: list, + liverpool_lsoas: list, lsoa_shp_path: str = LSOA_SHP_PATH, target_crs: str = TARGET_CRS, ) -> gpd.GeoDataFrame: @@ -112,7 +161,7 @@ def preprocess_data( Load and preprocess LSOA geospatial data by extracting unique LSOA geometries and filtering Liverpool LSOAs. Args: - liverpool_hp_suitability_lsoas (list): List of LSOA codes for Liverpool. + liverpool_lsoas (list): List of LSOA codes for Liverpool. lsoa_shp_path (str): Path to the LSOA shapefile. target_crs (str): The target CRS to ensure consistent geospatial analysis. @@ -125,7 +174,7 @@ def preprocess_data( logging.info(f"Converting to {target_crs}") lsoa_geometries_gdf = lsoa_geometries_gdf.to_crs(target_crs) liverpool_lsoa_geometries_gdf = lsoa_geometries_gdf[ - lsoa_geometries_gdf["LSOA21CD"].isin(liverpool_hp_suitability_lsoas) + lsoa_geometries_gdf["LSOA21CD"].isin(liverpool_lsoas) ] if liverpool_lsoa_geometries_gdf.empty: @@ -134,9 +183,11 @@ def preprocess_data( return liverpool_lsoa_geometries_gdf -def plot_lsoa_geometries(liverpool_lsoa_geometries_gdf, output_dir: str = OUTPUT_DIR): +def plot_lsoa_geometries( + liverpool_lsoa_geometries_gdf: gpd.GeoDataFrame, output_dir: str = OUTPUT_DIR +): """ - Plot Liverpool LSOA geometries and save as PNG. + Plot Liverpool LSOA geometries and save as PNG and PDF. Args: liverpool_lsoa_geometries_gdf (gpd.GeoDataFrame): GeoDataFrame with Liverpool LSOA geometries. @@ -157,7 +208,7 @@ def merge_data( liverpool_lsoa_geometries_gdf: gpd.GeoDataFrame, ) -> gpd.GeoDataFrame: """ - Merge suitability scores with geometries and convert to GeoDataFrame. + Merge LSOA suitability scores with geometries and convert to GeoDataFrame. Args: liverpool_hp_suitability_scores_pd (pd.DataFrame): DataFrame with Liverpool heat pump suitability scores. @@ -192,7 +243,7 @@ def plot_overlay( output_dir: str = OUTPUT_DIR, ): """ - Overlay of DESNZ pilot heat network zones on LSOAs in Liverpool. + Overlay of DESNZ pilot heat network zones on LSOAs in Liverpool. Saves the plot as PNG and PDF. Args: liverpool_hp_suitability_gdf (gpd.GeoDataFrame): GeoDataFrame with Liverpool heat pump suitability scores and geometries. @@ -223,7 +274,7 @@ def plot_absolute_error_map( output_dir: str = OUTPUT_DIR, ): """ - Plot an absolute error map for areas in Liverpool based on the DESNZ pilot score. + Plot an absolute error map for areas in Liverpool based on the DESNZ pilot score. Saves the plot as PNG and PDF. Args: hp_suitability_gdf (gpd.GeoDataFrame): GeoDataFrame containing Liverpool's LSOA geometries and @@ -232,6 +283,7 @@ def plot_absolute_error_map( score (float): DESNZ pilot score threshold. If > 0.0, plot regions where DESNZ_pilot_fraction >= score. If 0, plot regions where DESNZ_pilot_fraction == 0. + Range from 0 to 1. title (str): Title for the plot. filename (str): Filename (PNG) to save the plot. output_dir (str): Output directory to save the plot image. @@ -239,7 +291,7 @@ def plot_absolute_error_map( Raises: ValueError: If score < 0. """ - logging.info(f"Plotting absolute error map for score {score}...") + logging.info(f"Plotting absolute error map for score threshold {score}...") if score < 0: raise ValueError("Score must be a non-negative value.") @@ -285,7 +337,7 @@ def plot_hn_avg_score_vs_fraction_threshold( Args: average_hn_scores_coverage_df (pl.DataFrame): DataFrame with average Nesta HN scores and DESNZ Pilot Fraction. fraction_thresholds_col (str): Column name for DESNZ Pilot Fraction Threshold. - hn_avg_scores_col (str): Column name for HN_N Avg Score Weighted. + hn_avg_scores_col (str): Column name for Nesta Heat Network Avg Score Weighted. """ logging.info("Plotting average HN score vs DESNZ Pilot Fraction Threshold...") fraction_thresholds = average_hn_scores_coverage_df[ @@ -356,12 +408,25 @@ def plot_hn_avg_score_vs_fraction_threshold( if __name__ == "__main__": + # Argument parser for reading inputs from s3 + parser = argparse.ArgumentParser( + description="Process heat network zones and calculate scores." + ) + parser.add_argument( + "--read_from_s3", + type=bool, + default=False, + help="Read the input files from S3 bucket.", + ) + args = parser.parse_args() + read_from_s3 = args.read_from_s3 + ( liverpool_hp_suitability_scores_pd, - liverpool_hp_suitability_lsoas, + liverpool_lsoas, average_hn_scores_coverage_df, ) = load_data() - liverpool_lsoa_geometries_gdf = preprocess_data(liverpool_hp_suitability_lsoas) + liverpool_lsoa_geometries_gdf = preprocess_data(liverpool_lsoas) plot_lsoa_geometries(liverpool_lsoa_geometries_gdf) liverpool_hp_suitability_gdf = merge_data( liverpool_hp_suitability_scores_pd, liverpool_lsoa_geometries_gdf