From e4cd63722d58a7a63870e141b0887878d58988bd Mon Sep 17 00:00:00 2001
From: helloaidank <helloaidank@gmail.com>
Date: Wed, 6 Nov 2024 18:36:14 +0000
Subject: [PATCH 1/6] adding file for calculating comparison of hn zones as
 well as plotting script for now

---
 .../hn_zones/comparison_of_hn_zones.py        | 350 ++++++++++++++++++
 .../analysis/hn_zones/heat_network_zones.py   |  19 +
 .../hn_zones/plot_comparison_of_hn_zones.py   | 225 +++++++++++
 3 files changed, 594 insertions(+)
 create mode 100644 asf_heat_pump_suitability/analysis/hn_zones/comparison_of_hn_zones.py
 create mode 100644 asf_heat_pump_suitability/analysis/hn_zones/heat_network_zones.py
 create mode 100644 asf_heat_pump_suitability/analysis/hn_zones/plot_comparison_of_hn_zones.py

diff --git a/asf_heat_pump_suitability/analysis/hn_zones/comparison_of_hn_zones.py b/asf_heat_pump_suitability/analysis/hn_zones/comparison_of_hn_zones.py
new file mode 100644
index 0000000..bd3d07b
--- /dev/null
+++ b/asf_heat_pump_suitability/analysis/hn_zones/comparison_of_hn_zones.py
@@ -0,0 +1,350 @@
+"""
+This script processes and compares DESNZ heat network (HN) and heat pump suitability data for Liverpool, and outputs some files for plotting.
+
+Steps:
+1. Load DESNZ HN GeoData and perform a spatial join with LSOA polygons.
+2. Process Nesta heat pump suitability scores for Liverpool.
+3. Identify LSOAs in DESNZ HN list not in HP suitability data.
+4. Calculate and log average scores and Mean Absolute Error (MAE).
+5. Save processed data to files.
+
+Outputs:
+- 'liverpool_with_desnz_hn_lsoa.gpkg': GeoDataFrame with LSOA codes added to HN zones.
+- 'liverpool_hp_suitability_lsoas.json': List of unique LSOA codes in Liverpool.
+- 'liverpool_hp_suitability_scores_with_desnz.parquet': Processed suitability scores with DESNZ pilot scores and MAE.
+- 'script_output.log': Log file with metrics such as avg HN scores and MAE.
+"""
+
+import geopandas as gpd
+import pyogrio
+import polars as pl
+from typing import Tuple, List, Optional
+import logging
+import json
+import os
+from asf_heat_pump_suitability import PROJECT_DIR
+
+
+# Define file paths
+LIVERPOOL_GPKG_PATH = "s3://asf-heat-pump-suitability/heat_network_desnz_data/heat-network-zone-map-Liverpool.gpkg"
+LSOA_SHP_PATH = "s3://asf-heat-pump-suitability/source_data/Lower_layer_Super_Output_Areas_2021_EW_BFE_V9_-9107090204806789093/LSOA_2021_EW_BFE_V9.shp"
+NESTA_HP_SUITABILITY_PARQUET_PATH = "s3://nesta-open-data/asf_heat_pump_suitability/2023Q4/20240925_2023_Q4_EPC_heat_pump_suitability_per_lsoa.parquet"
+LA_TO_ANALYSE = "Liverpool"
+
+
+def setup_logging_and_file_path(
+    output_dir: str, log_filename: str = "script_output.log", level: int = logging.INFO
+) -> None:
+    """
+    Set up logging configuration to log messages to both a file and the console.
+
+    Args:
+        output_dir (str): Path to the directory where the log file should be saved.
+        log_filename (str): Name of the log file. Defaults to "script_output.log".
+        level (int): Logging level, e.g., logging.INFO or logging.DEBUG. Defaults to logging.INFO.
+    """
+    # Ensure the output directory exists
+    os.makedirs(output_dir, exist_ok=True)
+
+    # Define the full path to the log file
+    log_file_path = os.path.join(output_dir, log_filename)
+
+    # Clear existing logging handlers if any
+    for handler in logging.root.handlers[:]:
+        logging.root.removeHandler(handler)
+
+    # Set up logging configuration
+    logging.basicConfig(
+        level=level,
+        format="%(levelname)s: %(message)s",
+        handlers=[
+            logging.FileHandler(log_file_path, mode="w"),  # Overwrite log file each run
+            logging.StreamHandler(),
+        ],
+    )
+    logging.info(f"Logging setup complete. Logs are saved to {log_file_path}.")
+
+
+def load_hn_geodata(
+    desnz_hn_gpkg_path: str, lsoa_shp_path: str
+) -> Tuple[gpd.GeoDataFrame, List[str]]:
+    """
+    Load the HN GeoPackage file and perform a spatial join with LSOA polygons to add a column of LSOA codes.
+
+    Args:
+        desnz_hn_gpkg_path (str): Path to the GeoPackage file with the HN zones.
+        lsoa_shp_path (str): Path to the LSOA shapefile.
+
+    Returns:
+        Tuple[gpd.GeoDataFrame, List[str]]:
+            - GeoDataFrame with LSOA codes added via spatial join.
+            - List of unique LSOA codes present in the GeoDataFrame.
+    """
+    # Load DESNZ HN polygons GeoPackage
+    desnz_hn_gdf = pyogrio.read_dataframe(
+        desnz_hn_gpkg_path, layer="heat-network-zone-map-Liverpool"
+    )
+
+    # Load the LSOA polygons file with 'LSOA21CD' codes
+    lsoa_gdf = gpd.read_file(lsoa_shp_path)
+
+    # Remove 'index_right' column if it exists
+    if "index_right" in desnz_hn_gdf.columns:
+        desnz_hn_gdf = desnz_hn_gdf.drop(columns=["index_right"])
+
+    # Ensure both GeoDataFrames use the same CRS
+    if desnz_hn_gdf.crs != desnz_hn_gdf.crs:
+        lsoa_gdf = lsoa_gdf.to_crs(desnz_hn_gdf.crs)
+
+    # Perform spatial join to match Liverpool polygons with LSOA codes
+    joined_gdf = gpd.sjoin(
+        desnz_hn_gdf,
+        lsoa_gdf[["geometry", "LSOA21CD"]],
+        how="left",
+        predicate="intersects",
+    )
+
+    # Extract unique LSOA codes from the joined GeoDataFrame
+    desnz_hn_unique_lsoas = joined_gdf["LSOA21CD"].dropna().unique().tolist()
+
+    return joined_gdf, desnz_hn_unique_lsoas
+
+
+def process_nesta_hp_suitability_scores(
+    nesta_hp_suitability_scores: str, local_authority: str
+) -> Tuple[pl.DataFrame, List[str]]:
+    """
+    Load and process Nesta heat pump suitability data for a specific local authority.
+
+    Args:
+        nesta_hp_suitability_scores (str): Path to the Nesta heat pump suitability Parquet file with HN scores.
+        local_authority (str): Local authority name to filter the data.
+
+    Returns:
+        Tuple[pl.DataFrame, List[str]]:
+            - DataFrame containing filtered LSOA codes and 'HN_N_avg_score_weighted' column.
+            - List of unique LSOA codes within the local authority.
+    """
+    # Load the data from the Parquet file
+    hp_scores_df = pl.read_parquet(nesta_hp_suitability_scores)
+
+    # Rename 'lsoa' to 'LSOA21CD' if needed to match the GeoDataFrame
+    if "lsoa" in hp_scores_df.columns:
+        hp_scores_df = hp_scores_df.rename({"lsoa": "LSOA21CD"})
+
+    # Filter by local authority and select relevant columns
+    la_filtered_scores = hp_scores_df.filter(
+        pl.col("lsoa_name").str.starts_with(local_authority)
+    ).select(["LSOA21CD", "HN_N_avg_score_weighted"])
+
+    # Extract unique LSOA codes
+    la_unique_lsoas = la_filtered_scores["LSOA21CD"].unique().to_list()
+
+    return la_filtered_scores, la_unique_lsoas
+
+
+def add_desnz_pilot_score(
+    la_hp_suitability_scores: pl.DataFrame, list_of_desnz_hn_lsoas: List[str]
+) -> Tuple[pl.DataFrame, float, float]:
+    """
+    Add a 'DESNZ_pilot_score' column to the Nesta heat pump suitability data and compute average scores.
+
+    Args:
+        la_hp_suitability_scores (pl.DataFrame): Data containing 'LSOA21CD' and 'HN_N_avg_score_weighted' columns.
+        list_of_desnz_hn_lsoas (List[str]): List of LSOA codes within the DESNZ HN pilot zones.
+
+    Returns:
+        Tuple[pl.DataFrame, float, float]:
+            - Updated DataFrame with 'DESNZ_pilot_score' column added (1 if in pilot zone, otherwise 0).
+            - Average 'HN_N_avg_score_weighted' for rows where 'DESNZ_pilot_score' is 1.
+            - Average 'HN_N_avg_score_weighted' for rows where 'DESNZ_pilot_score' is 0.
+    """
+    # Add 'DESNZ_pilot_score' column: 1 if LSOA is in DESNZ pilot zone, else 0
+    la_hp_suitability_scores = la_hp_suitability_scores.with_columns(
+        pl.col("LSOA21CD")
+        .is_in(list_of_desnz_hn_lsoas)
+        .cast(pl.Int8)
+        .alias("DESNZ_pilot_score")
+    )
+    # Calculate average scores where 'DESNZ_pilot_score' == 1 and == 0
+    avg_hn_score_pilot_1 = _calculate_hn_pilot_average_score(
+        la_hp_suitability_scores, pilot_score=1
+    )
+    avg_hn_score_pilot_0 = _calculate_hn_pilot_average_score(
+        la_hp_suitability_scores, pilot_score=0
+    )
+    return la_hp_suitability_scores, avg_hn_score_pilot_1, avg_hn_score_pilot_0
+
+
+def _calculate_hn_pilot_average_score(
+    la_hp_suitability_scores: pl.DataFrame, pilot_score: int
+) -> float:
+    """
+    Calculate the average 'HN_N_avg_score_weighted' for a specified 'DESNZ_pilot_score' value.
+
+    Args:
+        la_hp_suitability_scores (pl.DataFrame): DataFrame containing 'DESNZ_pilot_score' and 'HN_N_avg_score_weighted' columns for a LA.
+        pilot_score (int): The value of 'DESNZ_pilot_score' to filter by (must be 1 or 0).
+
+    Returns:
+        float: The average 'HN_N_avg_score_weighted' for rows with the specified 'DESNZ_pilot_score'.
+               Returns 0.0 if there are no rows with the specified score.
+
+    Raises:
+        ValueError: If pilot_score is not 1 or 0.
+    """
+    # Validate that pilot_score is either 1 or 0
+    if pilot_score not in {0, 1}:
+        raise ValueError("pilot_score must be either 1 or 0")
+
+    # Filter and calculate the average score
+    filtered_la_hp_suitability_scores = la_hp_suitability_scores.filter(
+        pl.col("DESNZ_pilot_score") == pilot_score
+    )
+    avg_hn_score_pilot = (
+        filtered_la_hp_suitability_scores.select(
+            pl.col("HN_N_avg_score_weighted").mean()
+        ).item()
+        if filtered_la_hp_suitability_scores.height > 0
+        else 0.0
+    )
+    return avg_hn_score_pilot
+
+
+def calculate_mae_for_pilot_score(
+    hp_suitability_scores_with_desnz: pl.DataFrame, score_value: int
+) -> float:
+    """
+    Calculate and log the Mean Absolute Error (MAE) for entries with a specified 'DESNZ_pilot_score'.
+
+    Args:
+        hp_suitability_scores_with_desnz (pl.DataFrame): DataFrame containing 'DESNZ_pilot_score' and 'absolute_error' columns.
+        score_value (int): The value of 'DESNZ_pilot_score' (must be 1 or 0) for which to calculate the MAE.
+
+    Returns:
+        float: The MAE for the specified 'DESNZ_pilot_score'.
+
+    Raises:
+        ValueError: If score_value is not 1 or 0.
+    """
+    # Validate that score_value is either 1 or 0
+    if score_value not in {0, 1}:
+        raise ValueError("score_value must be either 1 or 0")
+
+    mae = hp_suitability_scores_with_desnz.filter(
+        pl.col("DESNZ_pilot_score") == score_value
+    )["absolute_error"].mean()
+
+    logging.info(
+        f"Mean Absolute Error (MAE) for DESNZ_pilot_score = {score_value}: {mae}"
+    )
+    return mae
+
+
+def calculate_mae_for_all(
+    hp_suitability_scores: pl.DataFrame, desnz_col: str, nesta_hn_score_col: str
+) -> Tuple[pl.DataFrame, float]:
+    """
+    Calculate the Mean Absolute Error (MAE) between the DESNZ pilot score and Nesta's HN suitability score, add the absolute error column to the DataFrame,
+    and log the result.
+
+    Args:
+        hp_suitability_scores_with_desnz (pl.DataFrame): DataFrame containing the DESNZ pilot (actual) and Nesta HN suitability score (predicted) columns.
+        desnz_col (str): Name of the column with DESNZ pilot (actual) values.
+        nesta_hn_score_col (str): Name of the column with Nesta HN suitability (predicted) values.
+
+    Returns:
+        Tuple[pl.DataFrame, float]:
+            - Updated DataFrame with 'absolute_error' column added.
+            - The Mean Absolute Error (MAE) between the actual and predicted columns.
+    """
+    # Calculate absolute error and add it as a new column
+    hp_suitability_scores_with_error = hp_suitability_scores.with_columns(
+        (pl.col(desnz_col) - pl.col(nesta_hn_score_col)).abs().alias("absolute_error")
+    )
+
+    # Calculate the mean of the absolute error
+    mae = hp_suitability_scores_with_error["absolute_error"].mean()
+    logging.info(
+        f"Mean Absolute Error (MAE) for {desnz_col} vs {nesta_hn_score_col}: {mae}"
+    )
+
+    return hp_suitability_scores_with_error, mae
+
+
+def main():
+    # Define the output directory and set up logging
+    output_dir = os.path.join(
+        PROJECT_DIR, "asf_heat_pump_suitability/analysis/hn_zones/output_data/"
+    )
+    setup_logging_and_file_path(output_dir)
+
+    # Load the data and perform spatial join
+    liverpool_with_desnz_hn_lsoa, list_of_liverpool_desnz_hn_lsoas = load_hn_geodata(
+        LIVERPOOL_GPKG_PATH, LSOA_SHP_PATH
+    )
+    logging.info("Loaded Liverpool with DESNZ HN LSOA data.")
+    liverpool_with_desnz_hn_lsoa.to_file(
+        os.path.join(output_dir, "liverpool_with_desnz_hn_lsoa.gpkg"), driver="GPKG"
+    )
+
+    # Process Nesta heat pump suitability scores
+    liverpool_hp_suitability_scores, liverpool_hp_suitability_lsoas = (
+        process_nesta_hp_suitability_scores(
+            NESTA_HP_SUITABILITY_PARQUET_PATH, LA_TO_ANALYSE
+        )
+    )
+    logging.info("Processed Nesta HP suitability scores for Liverpool.")
+
+    with open(
+        os.path.join(output_dir, "liverpool_hp_suitability_lsoas.json"), "w"
+    ) as file:
+        json.dump(liverpool_hp_suitability_lsoas, file)
+
+    # Check LSOAs not in HP suitability scores
+    not_in_hp_suitability = set(list_of_liverpool_desnz_hn_lsoas) - set(
+        liverpool_hp_suitability_lsoas
+    )
+    logging.info(
+        f"number of LSOAs in DESNZ list not in HP suitability data: {len(not_in_hp_suitability)}"
+    )
+
+    # Calculate and log average score
+    avg_hn_score = liverpool_hp_suitability_scores["HN_N_avg_score_weighted"].mean()
+    logging.info(f"Average HN_N_avg_score_weighted: {avg_hn_score}")
+
+    # Add DESNZ pilot score and calculate averages
+    (
+        liverpool_hp_suitability_scores_with_desnz,
+        avg_hn_score_pilot_1,
+        avg_hn_score_pilot_0,
+    ) = add_desnz_pilot_score(
+        liverpool_hp_suitability_scores, list_of_liverpool_desnz_hn_lsoas
+    )
+    logging.info(
+        f"Avg HN_N_avg_score_weighted for DESNZ_pilot_score = 1: {avg_hn_score_pilot_1}"
+    )
+    logging.info(
+        f"Avg HN_N_avg_score_weighted for DESNZ_pilot_score = 0: {avg_hn_score_pilot_0}"
+    )
+
+    # Calculate and log Mean Absolute Error (MAE)
+    liverpool_hp_suitability_scores_with_desnz, mae_all = calculate_mae_for_all(
+        liverpool_hp_suitability_scores_with_desnz,
+        "DESNZ_pilot_score",
+        "HN_N_avg_score_weighted",
+    )
+    mae_pilot_1 = calculate_mae_for_pilot_score(
+        liverpool_hp_suitability_scores_with_desnz, score_value=1
+    )
+    mae_pilot_0 = calculate_mae_for_pilot_score(
+        liverpool_hp_suitability_scores_with_desnz, score_value=0
+    )
+
+    liverpool_hp_suitability_scores_with_desnz.write_parquet(
+        os.path.join(output_dir, "liverpool_hp_suitability_scores_with_desnz.parquet")
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/asf_heat_pump_suitability/analysis/hn_zones/heat_network_zones.py b/asf_heat_pump_suitability/analysis/hn_zones/heat_network_zones.py
new file mode 100644
index 0000000..306ad35
--- /dev/null
+++ b/asf_heat_pump_suitability/analysis/hn_zones/heat_network_zones.py
@@ -0,0 +1,19 @@
+import pyogrio
+import matplotlib.pyplot as plt
+
+# List layers in the GeoPackage
+layers = pyogrio.list_layers("heat-network-zone-map-Liverpool.gpkg")
+print("Layers in GeoPackage:", layers)
+
+
+# Read a specific layer into a GeoDataFrame
+gdf = pyogrio.read_dataframe(
+    "heat-network-zone-map-Liverpool.gpkg", layer="heat-network-zone-map-Liverpool"
+)
+print(gdf.head())
+print(gdf.crs)
+print(len(gdf))
+fig, ax = plt.subplots(figsize=(12, 6))
+gdf.plot(ax=ax)
+plt.savefig("liverpool_heat_network_zones.png")
+plt.show()
diff --git a/asf_heat_pump_suitability/analysis/hn_zones/plot_comparison_of_hn_zones.py b/asf_heat_pump_suitability/analysis/hn_zones/plot_comparison_of_hn_zones.py
new file mode 100644
index 0000000..4079262
--- /dev/null
+++ b/asf_heat_pump_suitability/analysis/hn_zones/plot_comparison_of_hn_zones.py
@@ -0,0 +1,225 @@
+"""
+This script processes and visualizes heat network (HN) and heat pump suitability data for Liverpool.
+
+Steps:
+1. Load and preprocess data:
+   - Read Parquet and GeoPackage files.
+   - Extract unique LSOA geometries.
+   - Load LSOA geometries and convert CRS if needed.
+   - Filter Liverpool LSOAs.
+
+2. Plot Liverpool LSOA geometries and save as PNG.
+
+3. Merge suitability scores with geometries and convert to GeoDataFrame.
+
+4. Plot and save overlay of DESNZ Pilot Score = 1 on Liverpool heat network zones.
+
+5. Define and use a function to plot absolute error maps by DESNZ Pilot Score, saving the plots as PNG files.
+"""
+
+import geopandas as gpd
+import pandas as pd
+import matplotlib.pyplot as plt
+import pyogrio
+import json
+import os
+import logging
+from asf_heat_pump_suitability import PROJECT_DIR
+
+# Set up logging configuration
+logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
+
+# Define constants for file paths
+INPUT_DIR = os.path.join(
+    PROJECT_DIR, "asf_heat_pump_suitability/analysis/hn_zones/output_data/"
+)
+LIVERPOOL_HP_SUITABILITY_PARQUET = os.path.join(
+    INPUT_DIR, "liverpool_hp_suitability_scores_with_desnz.parquet"
+)
+LSOA_SHP_PATH = "s3://asf-heat-pump-suitability/source_data/Lower_layer_Super_Output_Areas_2021_EW_BFE_V9_-9107090204806789093/LSOA_2021_EW_BFE_V9.shp"
+LIVERPOOL_GPKG_PATH = "s3://asf-heat-pump-suitability/heat_network_desnz_data/heat-network-zone-map-Liverpool.gpkg"
+LSOA_JSON_PATH = os.path.join(INPUT_DIR, "liverpool_hp_suitability_lsoas.json")
+OUTPUT_DIR = os.path.join(
+    PROJECT_DIR, "asf_heat_pump_suitability/analysis/hn_zones/output_plots/"
+)
+
+# Ensure the output directory exists
+os.makedirs(OUTPUT_DIR, exist_ok=True)
+
+
+def load_data():
+    """
+    Load the necessary data files.
+
+    Returns:
+        Tuple containing:
+            - pd.DataFrame: DataFrame with Liverpool heat pump suitability scores.
+            - List[str]: List of LSOA codes for Liverpool.
+    """
+    try:
+        liverpool_hp_suitability_scores_pd = pd.read_parquet(
+            LIVERPOOL_HP_SUITABILITY_PARQUET
+        )
+        with open(LSOA_JSON_PATH, "r") as file:
+            liverpool_hp_suitability_lsoas = json.load(file)
+        return liverpool_hp_suitability_scores_pd, liverpool_hp_suitability_lsoas
+    except Exception as e:
+        logging.error(f"Error loading data: {e}")
+        raise
+
+
+def preprocess_data(
+    liverpool_hp_suitability_lsoas: list, lsoa_shp_path: str = LSOA_SHP_PATH
+) -> gpd.GeoDataFrame:
+    """
+    Preprocess the data by extracting unique LSOA geometries and filtering Liverpool LSOAs.
+
+    Args:
+        liverpool_hp_suitability_lsoas (list): List of LSOA codes for Liverpool.
+        lsoa_shp_path (str): Path to the LSOA shapefile.
+
+    Returns:
+        gpd.GeoDataFrame: GeoDataFrame with filtered Liverpool LSOA geometries.
+    """
+    lsoa_geometries_gdf = gpd.read_file(lsoa_shp_path)
+    if lsoa_geometries_gdf.crs != "EPSG:27700":  # Assuming British National Grid
+        logging.info("Converting to EPSG:27700")
+        lsoa_geometries_gdf = lsoa_geometries_gdf.to_crs("EPSG:27700")
+    liverpool_lsoa_geometries_gdf = lsoa_geometries_gdf[
+        lsoa_geometries_gdf["LSOA21CD"].isin(liverpool_hp_suitability_lsoas)
+    ]
+    return liverpool_lsoa_geometries_gdf
+
+
+def plot_lsoa_geometries(liverpool_lsoa_geometries_gdf, output_dir: str = OUTPUT_DIR):
+    """
+    Plot Liverpool LSOA geometries and save as PNG.
+
+    Args:
+        liverpool_lsoa_geometries_gdf (gpd.GeoDataFrame): GeoDataFrame with Liverpool LSOA geometries.
+        output_dir (str): Output directory to save the plot image.
+    """
+    fig, ax = plt.subplots(figsize=(12, 6))
+    liverpool_lsoa_geometries_gdf.plot(ax=ax)
+    plt.title("Liverpool LSOA Geometries")
+    plt.axis("off")
+    plt.savefig(os.path.join(output_dir, "liverpool_data_lsoas.png"))
+    plt.show()
+
+
+def merge_data(
+    liverpool_hp_suitability_scores_pd: pd.DataFrame,
+    liverpool_lsoa_geometries_gdf: gpd.GeoDataFrame,
+) -> gpd.GeoDataFrame:
+    """
+    Merge suitability scores with geometries and convert to GeoDataFrame.
+
+    Args:
+        liverpool_hp_suitability_scores_pd (pd.DataFrame): DataFrame with Liverpool heat pump suitability scores.
+        liverpool_lsoa_geometries_gdf (gpd.GeoDataFrame): GeoDataFrame with Liverpool LSOA geometries.
+
+    Returns:
+        gpd.GeoDataFrame: GeoDataFrame with merged suitability scores and geometries.
+    """
+    liverpool_hp_suitability_with_geometry = liverpool_hp_suitability_scores_pd.merge(
+        liverpool_lsoa_geometries_gdf, on="LSOA21CD", how="left"
+    )
+    liverpool_hp_suitability_gdf = gpd.GeoDataFrame(
+        liverpool_hp_suitability_with_geometry, geometry="geometry"
+    )
+    return liverpool_hp_suitability_gdf
+
+
+def plot_overlay(
+    liverpool_hp_suitability_gdf: gpd.GeoDataFrame,
+    liverpool_heat_network_zones_filepath: str = LIVERPOOL_GPKG_PATH,
+    output_dir: str = OUTPUT_DIR,
+):
+    """
+    Plot and save overlay of DESNZ Pilot Score = 1 on Liverpool heat network zones.
+
+    Args:
+        liverpool_hp_suitability_gdf (gpd.GeoDataFrame): GeoDataFrame with Liverpool heat pump suitability scores and geometries.
+        liverpool_heat_network_zones_filepath (str): Path to the Liverpool heat network zones GeoPackage.
+        output_dir (str): Output directory to save the plot image.
+    """
+    gdf = pyogrio.read_dataframe(
+        liverpool_heat_network_zones_filepath, layer="heat-network-zone-map-Liverpool"
+    )
+    fig, ax = plt.subplots(figsize=(12, 6))
+    gdf.plot(ax=ax, color="blue", alpha=0.5)
+    liverpool_hp_suitability_gdf[
+        liverpool_hp_suitability_gdf["DESNZ_pilot_score"] == 1
+    ].plot(ax=ax, color="pink", alpha=0.5)
+    plt.title("Overlay of DESNZ Pilot Score = 1 on Liverpool Heat Network Zones")
+    plt.axis("off")
+    plt.savefig(os.path.join(output_dir, "overlay_desnz_pilot.png"))
+    plt.show()
+
+
+def plot_absolute_error_map(
+    hp_suitability_gdf: gpd.GeoDataFrame,
+    score: int,
+    title: str,
+    filename: str,
+    output_dir: str = OUTPUT_DIR,
+):
+    """
+    Plot an absolute error map for areas in Liverpool based on the DESNZ pilot score,
+    overlaying regions with a different score as boundaries.
+
+    Args:
+        df (gpd.GeoDataFrame): GeoDataFrame containing Liverpool's LSOA geometries and
+                               heat pump suitability scores, including absolute error values.
+        score (int): DESNZ pilot score value for which to highlight areas.
+                     Only regions matching this score will be color-filled.
+        title (str): Title for the plot.
+        filename (str): Path to save the plot image as a PNG file.
+        output_dir (str): Output directory to save the plot image.
+
+    This function:
+        - Plots regions where DESNZ pilot score matches the specified `score`,
+          coloring them by 'absolute_error' values.
+        - Overlays boundaries of areas where DESNZ pilot score does not match the specified `score`.
+        - Displays a legend and saves the plot as a PNG file.
+    """
+    fig, ax = plt.subplots(figsize=(12, 6))
+    hp_suitability_gdf[hp_suitability_gdf["DESNZ_pilot_score"] == score].plot(
+        column="absolute_error", ax=ax, legend=True, cmap="inferno"
+    )
+    hp_suitability_gdf[hp_suitability_gdf["DESNZ_pilot_score"] != score].plot(
+        ax=ax, edgecolor="black", facecolor="none", linewidth=0.5
+    )
+    plt.title(title)
+    plt.axis("off")
+    plt.savefig(os.path.join(output_dir, filename))
+    plt.show()
+
+
+def main():
+    """
+    Main function to load, preprocess, and visualise data.
+    """
+    liverpool_hp_suitability_scores_pd, liverpool_hp_suitability_lsoas = load_data()
+    liverpool_lsoa_geometries_gdf = preprocess_data(liverpool_hp_suitability_lsoas)
+    plot_lsoa_geometries(liverpool_lsoa_geometries_gdf)
+    liverpool_hp_suitability_gdf = merge_data(
+        liverpool_hp_suitability_scores_pd, liverpool_lsoa_geometries_gdf
+    )
+    plot_overlay(liverpool_hp_suitability_gdf)
+    plot_absolute_error_map(
+        liverpool_hp_suitability_gdf,
+        score=1,
+        title="Absolute Error Map for DESNZ Pilot Score = 1 for Liverpool",
+        filename="liverpool_data_with_pilot_score_1_v2.png",
+    )
+    plot_absolute_error_map(
+        liverpool_hp_suitability_gdf,
+        score=0,
+        title="Absolute Error Map for DESNZ Pilot Score = 0 for Liverpool",
+        filename="liverpool_data_with_pilot_score_0_v2.png",
+    )
+
+
+if __name__ == "__main__":
+    main()

From 3bef3c24fd3bf83d07a789efdc0ffa5273768e86 Mon Sep 17 00:00:00 2001
From: helloaidank <helloaidank@gmail.com>
Date: Thu, 7 Nov 2024 09:28:59 +0000
Subject: [PATCH 2/6] delete heat network zone

---
 .../analysis/hn_zones/heat_network_zones.py   | 19 -------------------
 1 file changed, 19 deletions(-)
 delete mode 100644 asf_heat_pump_suitability/analysis/hn_zones/heat_network_zones.py

diff --git a/asf_heat_pump_suitability/analysis/hn_zones/heat_network_zones.py b/asf_heat_pump_suitability/analysis/hn_zones/heat_network_zones.py
deleted file mode 100644
index 306ad35..0000000
--- a/asf_heat_pump_suitability/analysis/hn_zones/heat_network_zones.py
+++ /dev/null
@@ -1,19 +0,0 @@
-import pyogrio
-import matplotlib.pyplot as plt
-
-# List layers in the GeoPackage
-layers = pyogrio.list_layers("heat-network-zone-map-Liverpool.gpkg")
-print("Layers in GeoPackage:", layers)
-
-
-# Read a specific layer into a GeoDataFrame
-gdf = pyogrio.read_dataframe(
-    "heat-network-zone-map-Liverpool.gpkg", layer="heat-network-zone-map-Liverpool"
-)
-print(gdf.head())
-print(gdf.crs)
-print(len(gdf))
-fig, ax = plt.subplots(figsize=(12, 6))
-gdf.plot(ax=ax)
-plt.savefig("liverpool_heat_network_zones.png")
-plt.show()

From 17d69a382a7ad6fcf5a2490be2f575a68a4003a7 Mon Sep 17 00:00:00 2001
From: helloaidank <helloaidank@gmail.com>
Date: Mon, 2 Dec 2024 14:52:30 +0000
Subject: [PATCH 3/6] changes to methodology to have the desnz hn pilot score
 as fraction rather than binary

---
 outputs/hn_zones/comparison_of_hn_zones.py | 449 +++++++++++++++++++++
 1 file changed, 449 insertions(+)
 create mode 100644 outputs/hn_zones/comparison_of_hn_zones.py

diff --git a/outputs/hn_zones/comparison_of_hn_zones.py b/outputs/hn_zones/comparison_of_hn_zones.py
new file mode 100644
index 0000000..6e42c27
--- /dev/null
+++ b/outputs/hn_zones/comparison_of_hn_zones.py
@@ -0,0 +1,449 @@
+"""
+This script analyses DESNZ heat network (HN) zones and Nesta's heat pump suitability data for Liverpool, performing spatial joins, statistical analysis, and exporting processed results.
+
+**Key Steps:**
+1. **Spatial Analysis**:
+   - Loads DESNZ HN zones and LSOA polygons, ensuring consistent CRS.
+   - Performs spatial joins to calculate the fraction of each LSOA covered by HN zones.
+
+2. **Nesta Data Processing**:
+   - Filters Nesta heat pump suitability scores for Liverpool's LSOAs.
+   - Identifies LSOAs which are outside selected LA.
+
+3. **Statistical Metrics**:
+   - Calculates average Nesta suitability scores for LSOAs covered and not covered by HN zones.
+   - Computes Mean Absolute Error (MAE) between DESNZ and Nesta scores.
+
+4. **Data Export**:
+   - Outputs GeoDataFrame with coverage data, LSOA lists, and processed suitability scores to GeoPackage, JSON, Parquet, and CSV formats.
+   - Logs detailed metrics and analysis steps.
+
+**Outputs**:
+- GeoPackage: 'liverpool_with_desnz_hn_lsoa.gpkg'
+- JSON: 'liverpool_hp_suitability_lsoas.json'
+- Parquet: 'liverpool_hp_suitability_scores_with_desnz'
+- Log: 'script_output.log'
+
+**How to Run the Script**:
+To run the script, use the following command:
+python comparison_of_hn_zones.py [--optional_threshold OPTIONAL_THRESHOLD]
+"""
+
+import geopandas as gpd
+import pyogrio
+import polars as pl
+from typing import Tuple, List, Callable, Optional
+import logging
+import json
+import os
+from shapely.geometry import Polygon
+from asf_heat_pump_suitability import PROJECT_DIR
+import argparse
+
+
+# Define file paths
+# LIVERPOOL_GPKG_PATH = "s3://asf-heat-pump-suitability/heat_network_desnz_data/heat-network-zone-map-Liverpool.gpkg"
+# LSOA_SHP_PATH = "s3://asf-heat-pump-suitability/source_data/Lower_layer_Super_Output_Areas_2021_EW_BFE_V9_-9107090204806789093/LSOA_2021_EW_BFE_V9.shp"
+# NESTA_HP_SUITABILITY_PARQUET_PATH = "s3://nesta-open-data/asf_heat_pump_suitability/2023Q4/20240925_2023_Q4_EPC_heat_pump_suitability_per_lsoa.parquet"
+LA_TO_ANALYSE = "Liverpool"
+LIVERPOOL_GPKG_PATH = "heat-network-zone-map-Liverpool.gpkg"
+LSOA_SHP_PATH = "LSOA_2021_EW_BFE_V9.shp"
+NESTA_HP_SUITABILITY_PARQUET_PATH = (
+    "20240925_2023_Q4_EPC_heat_pump_suitability_per_lsoa.parquet"
+)
+
+
+def setup_logging_and_file_path(
+    output_dir: str, log_filename: str = "script_output.log", level: int = logging.INFO
+) -> None:
+    """
+    Set up logging configuration to log messages to both a file and the console.
+
+    Args:
+        output_dir (str): Path to the directory where the log file should be saved.
+        log_filename (str): Name of the log file. Defaults to "script_output.log".
+        level (int): Logging level, e.g., logging.INFO or logging.DEBUG. Defaults to logging.INFO.
+    """
+    # Ensure the output directory exists
+    os.makedirs(output_dir, exist_ok=True)
+
+    # Define the full path to the log file
+    log_file_path = os.path.join(output_dir, log_filename)
+
+    # Clear existing logging handlers if any
+    logging.getLogger().handlers.clear()
+
+    # Set up logging configuration
+    logging.basicConfig(
+        level=level,
+        format="%(levelname)s: %(message)s",
+        handlers=[
+            logging.FileHandler(log_file_path, mode="w"),  # Overwrite log file each run
+            logging.StreamHandler(),
+        ],
+    )
+    logging.info(f"Logging setup complete. Logs are saved to {log_file_path}.")
+
+
+def load_hn_geodata(
+    desnz_hn_gpkg_path: str, lsoa_shp_path: str
+) -> Tuple[gpd.GeoDataFrame, List[str]]:
+    """
+    Load the HN GeoPackage file and perform a spatial join with LSOA polygons to add a column of LSOA codes.
+
+    Args:
+        desnz_hn_gpkg_path (str): Path to the GeoPackage file with the HN zones.
+        lsoa_shp_path (str): Path to the LSOA shapefile.
+
+    Returns:
+        Tuple[gpd.GeoDataFrame, List[str]]:
+            - GeoDataFrame with LSOA codes added via spatial join.
+            - List of unique LSOA codes present in the GeoDataFrame.
+    """
+    # Load DESNZ HN polygons GeoPackage
+    desnz_hn_gdf = pyogrio.read_dataframe(
+        desnz_hn_gpkg_path, layer="heat-network-zone-map-Liverpool"
+    )
+
+    # Load the LSOA polygons file with 'LSOA21CD' codes
+    lsoa_gdf = gpd.read_file(lsoa_shp_path)
+
+    # Remove 'index_right' column if it exists
+    if "index_right" in desnz_hn_gdf.columns:
+        desnz_hn_gdf = desnz_hn_gdf.drop(columns=["index_right"])
+
+    # Ensure both GeoDataFrames use the same CRS
+    if desnz_hn_gdf.crs != lsoa_gdf.crs:
+        lsoa_gdf = lsoa_gdf.to_crs(desnz_hn_gdf.crs)
+
+    # Perform spatial join to match Liverpool polygons with LSOA codes
+    joined_gdf = desnz_hn_gdf.sjoin(
+        lsoa_gdf[["geometry", "LSOA21CD"]],
+        how="left",
+        predicate="intersects",
+    )
+
+    # Calculate the intersection area and the fraction of LSOA area covered by HN zones
+    joined_gdf["intersection_area"] = joined_gdf.apply(
+        lambda row: row.geometry.intersection(
+            lsoa_gdf.loc[row.index_right, "geometry"]
+        ).area,
+        axis=1,
+    )
+    lsoa_gdf["total_area"] = lsoa_gdf.geometry.area
+
+    # Merge the LSOA geometries into the joined GeoDataFrame
+    joined_gdf = joined_gdf.merge(lsoa_gdf[["LSOA21CD", "total_area"]], on="LSOA21CD")
+
+    # Extract unique LSOA codes from the joined GeoDataFrame
+    joined_gdf["fraction_covered"] = (
+        joined_gdf["intersection_area"] / joined_gdf["total_area"]
+    )
+    desnz_hn_unique_lsoas = joined_gdf["LSOA21CD"].dropna().unique().tolist()
+    return joined_gdf, desnz_hn_unique_lsoas
+
+
+def filter_la_nesta_hp_scores(
+    nesta_hp_suitability_scores: str, local_authority: str
+) -> Tuple[pl.DataFrame, List[str]]:
+    """
+    Load and process Nesta heat pump suitability data for a specific local authority.
+
+    Args:
+        nesta_hp_suitability_scores (str): Path to the Nesta heat pump suitability Parquet file with HN scores.
+        local_authority (str): Local authority name to filter the data.
+
+    Returns:
+        Tuple[pl.DataFrame, List[str]]:
+            - DataFrame containing filtered LSOA codes and 'HN_N_avg_score_weighted' column.
+            - List of unique LSOA codes within the local authority.
+    """
+    # Load the data from the Parquet file
+    hp_scores_df = pl.read_parquet(nesta_hp_suitability_scores)
+
+    # Rename 'lsoa' to 'LSOA21CD' if needed to match the GeoDataFrame
+    if "lsoa" in hp_scores_df.columns:
+        hp_scores_df = hp_scores_df.rename({"lsoa": "LSOA21CD"})
+
+    # Filter by local authority and select relevant columns
+    la_filtered_scores = hp_scores_df.filter(
+        pl.col("lsoa_name").str.starts_with(local_authority)
+    ).select(["LSOA21CD", "HN_N_avg_score_weighted"])
+
+    # Extract unique LSOA codes for LA
+    la_unique_lsoas = la_filtered_scores["LSOA21CD"].unique().to_list()
+
+    return la_filtered_scores, la_unique_lsoas
+
+
+def add_DESNZ_pilot_fraction(
+    la_hp_suitability_scores: pl.DataFrame,
+    joined_gdf: gpd.GeoDataFrame,
+    optional_threshold: Optional[float] = 0,
+) -> Tuple[pl.DataFrame, float, float]:
+    """
+    Add a 'DESNZ_pilot_fraction' column to the Nesta heat pump suitability data using the fraction of area covered by HN zones and compute the average Nesta HN score per LA.
+
+    Args:
+        la_hp_suitability_scores (pl.DataFrame): Data containing 'LSOA21CD' and 'HN_N_avg_score_weighted' columns for a single LA.
+        joined_gdf (gpd.GeoDataFrame): GeoDataFrame containing 'LSOA21CD' and 'fraction_covered' columns.
+
+    Returns:
+        Tuple[pl.DataFrame, float, float]:
+            - Updated DataFrame with 'DESNZ_pilot_fraction' column added (fraction of area covered by HN zones).
+            - Average 'HN_N_avg_score_weighted' for rows where 'DESNZ_pilot_fraction' is non-zero.
+            - Average 'HN_N_avg_score_weighted' for rows where 'DESNZ_pilot_fraction' is zero.
+    """
+    # Convert joined_gdf to a Polars DataFrame
+    joined_df = pl.DataFrame(
+        {
+            "LSOA21CD": joined_gdf["LSOA21CD"],
+            "fraction_covered": joined_gdf["fraction_covered"],
+        }
+    )
+
+    # Merge the fraction_covered into la_hp_suitability_scores
+    la_hp_suitability_scores = la_hp_suitability_scores.join(
+        joined_df, on="LSOA21CD", how="left"
+    )
+
+    # Fill NaN values in fraction_covered with 0
+    la_hp_suitability_scores = la_hp_suitability_scores.with_columns(
+        pl.col("fraction_covered").fill_null(0).alias("DESNZ_pilot_fraction")
+    ).drop("fraction_covered")
+
+    # Calculating the average Nesta HN score for non-zero and zero DESNZ pilot scores
+    avg_hn_score_pilot_nonzero = _calculate_hn_pilot_average_score(
+        la_hp_suitability_scores,
+        pilot_score_condition=lambda col: col > optional_threshold,
+    )
+    avg_hn_score_pilot_zero = _calculate_hn_pilot_average_score(
+        la_hp_suitability_scores, pilot_score_condition=lambda col: col == 0
+    )
+    return la_hp_suitability_scores, avg_hn_score_pilot_nonzero, avg_hn_score_pilot_zero
+
+
+def calculate_average_scores_for_thresholds(
+    la_hp_suitability_scores: pl.DataFrame, thresholds: List[float]
+) -> pl.DataFrame:
+    """
+    Calculate the average 'HN_N_avg_score_weighted' for various thresholds of 'DESNZ_pilot_fraction'.
+
+    Args:
+        la_hp_suitability_scores (pl.DataFrame): DataFrame containing 'DESNZ_pilot_fraction' and 'HN_N_avg_score_weighted' columns.
+        thresholds (List[float]): List of thresholds to evaluate.
+
+    Returns:
+        pl.DataFrame: DataFrame with columns 'DESNZ_pilot_fraction_threshold' and 'HN_N_avg_score_weighted'.
+    """
+    results = []
+    for threshold in thresholds:
+        avg_score = _calculate_hn_pilot_average_score(
+            la_hp_suitability_scores, pilot_score_condition=lambda col: col > threshold
+        )
+        results.append(
+            {
+                "DESNZ_pilot_fraction_threshold": threshold,
+                "HN_N_avg_score_weighted": avg_score,
+            }
+        )
+    results_df = pl.DataFrame(results)
+    return results_df
+
+
+def _calculate_hn_pilot_average_score(
+    la_hp_suitability_scores: pl.DataFrame,
+    pilot_score_condition: Callable[[pl.Expr], pl.Expr],
+) -> float:
+    """
+    Calculate the average Nesta Heat Network score for LSOAs in (`DESNZ_pilot_fraction > 0`) or not in (`DESNZ_pilot_fraction == 0`) DESNZ heat network pilot areas.
+
+    Args:
+        la_hp_suitability_scores (pl.DataFrame): DataFrame containing 'DESNZ_pilot_fraction' and 'HN_N_avg_score_weighted' columns for a LA.
+        pilot_score_condition (Callable[[pl.Expr], pl.Expr]): The value of 'DESNZ_pilot_fraction' to filter by (must be >0 or 0).
+
+    Returns:
+        float: Average 'HN_N_avg_score_weighted' for the filtered rows.
+    """
+    # Apply the condition function to filter the DataFrame
+    filtered_la_hp_suitability_scores = la_hp_suitability_scores.filter(
+        pilot_score_condition(pl.col("DESNZ_pilot_fraction"))
+    )
+
+    # Calculate the average score
+    avg_score = filtered_la_hp_suitability_scores["HN_N_avg_score_weighted"].mean()
+    return avg_score
+
+
+def calculate_mae_for_pilot_score(
+    hp_suitability_scores_with_desnz: pl.DataFrame,
+    condition_score_value: Callable[[pl.Expr], pl.Expr],
+) -> float:
+    """
+    Calculate the Mean Absolute Error (MAE) for entries satisfying a specified condition on 'DESNZ_pilot_fraction'.
+
+    Args:
+        hp_suitability_scores_with_desnz (pl.DataFrame): DataFrame containing 'DESNZ_pilot_fraction' and 'absolute_error' columns.
+        condition_score_value (Callable[[pl.Expr], pl.Expr]): A function defining the condition to filter 'DESNZ_pilot_fraction'.
+
+    Returns:
+        float: The MAE for the specified condition.
+    """
+    # Apply the DEZNZ pilot score condition to filter the DataFrame
+    filtered_df = hp_suitability_scores_with_desnz.filter(
+        condition_score_value(pl.col("DESNZ_pilot_fraction"))
+    )
+    mae = filtered_df["absolute_error"].mean()
+    return mae
+
+
+def calculate_mae_for_all(
+    hp_suitability_scores: pl.DataFrame, desnz_col: str, nesta_hn_score_col: str
+) -> Tuple[pl.DataFrame, float]:
+    """
+    Calculate the Mean Absolute Error (MAE) between the DESNZ pilot score and Nesta's HN suitability score, add the absolute error column to the DataFrame,
+    and log the result.
+
+    Args:
+        hp_suitability_scores (pl.DataFrame): DataFrame containing the DESNZ pilot (actual) and Nesta HN suitability score (predicted) columns.
+        desnz_col (str): Name of the column with DESNZ pilot (actual) values.
+        nesta_hn_score_col (str): Name of the column with Nesta HN suitability (predicted) values.
+
+    Returns:
+        Tuple[pl.DataFrame, float]:
+            - Updated DataFrame with 'absolute_error' column added.
+            - The Mean Absolute Error (MAE) between the actual and predicted columns.
+    """
+    # Calculate absolute error and add it as a new column
+    hp_suitability_scores_with_error = hp_suitability_scores.with_columns(
+        (pl.col(desnz_col) - pl.col(nesta_hn_score_col)).abs().alias("absolute_error")
+    )
+
+    # Calculate the mean of the absolute error
+    mae = hp_suitability_scores_with_error["absolute_error"].mean()
+    logging.info(
+        f"Mean Absolute Error (MAE) for {desnz_col} vs {nesta_hn_score_col}: {mae}"
+    )
+
+    return hp_suitability_scores_with_error, mae
+
+
+if __name__ == "__main__":
+    # Argument parser for optional threshold
+    parser = argparse.ArgumentParser(
+        description="Process heat network zones and calculate scores."
+    )
+    parser.add_argument(
+        "--optional_threshold",
+        type=float,
+        default=0.0,
+        help="Optional threshold for DESNZ pilot fraction for calculating average Nesta HN score.",
+    )
+    args = parser.parse_args()
+    # Extract the optional_threshold value from args
+    optional_threshold = args.optional_threshold
+    # Define the output directory and set up logging
+    output_dir = os.path.join(
+        PROJECT_DIR, "asf_heat_pump_suitability/outputs/hn_zones/output_data/"
+    )
+    setup_logging_and_file_path(output_dir=output_dir)
+
+    # Load the data and perform spatial join
+    liverpool_with_desnz_hn_lsoa, list_of_liverpool_desnz_hn_lsoas = load_hn_geodata(
+        desnz_hn_gpkg_path=LIVERPOOL_GPKG_PATH, lsoa_shp_path=LSOA_SHP_PATH
+    )
+
+    logging.info("Loaded Liverpool with DESNZ HN LSOA data.")
+    liverpool_with_desnz_hn_lsoa.to_file(
+        filename=os.path.join(output_dir, "liverpool_with_desnz_hn_lsoa.gpkg"),
+        driver="GPKG",
+    )
+
+    # Process Nesta heat pump suitability scores
+    liverpool_hp_suitability_scores, liverpool_hp_suitability_lsoas = (
+        filter_la_nesta_hp_scores(
+            nesta_hp_suitability_scores=NESTA_HP_SUITABILITY_PARQUET_PATH,
+            local_authority=LA_TO_ANALYSE,
+        )
+    )
+    logging.info("Processed Nesta HP suitability scores for Liverpool.")
+
+    with open(
+        os.path.join(output_dir, "liverpool_hp_suitability_lsoas.json"), "w"
+    ) as file:
+        json.dump(liverpool_hp_suitability_lsoas, file)
+
+    # Check LSOAs not in HP suitability scores
+    not_in_hp_suitability = set(list_of_liverpool_desnz_hn_lsoas) - set(
+        liverpool_hp_suitability_lsoas
+    )
+    logging.info(
+        f"number of LSOAs in DESNZ list not in HP suitability data: {len(not_in_hp_suitability)}"
+    )
+
+    # Calculate and log average score
+    avg_hn_score = liverpool_hp_suitability_scores["HN_N_avg_score_weighted"].mean()
+    logging.info(f"Average HN_N_avg_score_weighted: {avg_hn_score}")
+    # Define the list of thresholds
+    thresholds = [
+        round(i * 0.05, 2) for i in range(0, 20)
+    ]  # Generates [0.0, 0.05, ..., 0.95]
+
+    # Add DESNZ pilot score and calculate averages
+    (
+        liverpool_hp_suitability_scores_with_desnz,
+        avg_hn_score_pilot_nonzero,
+        avg_hn_score_pilot_zero,
+    ) = add_DESNZ_pilot_fraction(
+        la_hp_suitability_scores=liverpool_hp_suitability_scores,
+        joined_gdf=liverpool_with_desnz_hn_lsoa,
+        optional_threshold=optional_threshold,
+    )
+    logging.info(
+        f"Avg HN_N_avg_score_weighted for DESNZ_pilot_fraction when > {optional_threshold}: {avg_hn_score_pilot_nonzero}"
+    )
+    logging.info(
+        f"Avg HN_N_avg_score_weighted for DESNZ_pilot_fraction = 0: {avg_hn_score_pilot_zero}"
+    )
+
+    # Calculate average scores for different thresholds
+    average_scores_df = calculate_average_scores_for_thresholds(
+        la_hp_suitability_scores=liverpool_hp_suitability_scores_with_desnz,
+        thresholds=thresholds,
+    )
+
+    # Save the results to CSV
+    average_scores_df.write_csv(
+        os.path.join(output_dir, "average_scores_by_threshold.csv")
+    )
+
+    # Calculate and log Mean Absolute Error (MAE)
+    liverpool_hp_suitability_scores_with_desnz, mae_all = calculate_mae_for_all(
+        hp_suitability_scores=liverpool_hp_suitability_scores_with_desnz,
+        desnz_col="DESNZ_pilot_fraction",
+        nesta_hn_score_col="HN_N_avg_score_weighted",
+    )
+    mae_pilot_non_zero = calculate_mae_for_pilot_score(
+        hp_suitability_scores_with_desnz=liverpool_hp_suitability_scores_with_desnz,
+        condition_score_value=lambda col: col > 0,
+    )
+    logging.info(
+        f"Mean Absolute Error (MAE) for DESNZ_pilot_fraction > 0: {mae_pilot_non_zero}"
+    )
+    mae_pilot_zero = calculate_mae_for_pilot_score(
+        hp_suitability_scores_with_desnz=liverpool_hp_suitability_scores_with_desnz,
+        condition_score_value=lambda col: col == 0,
+    )
+    logging.info(
+        f"Mean Absolute Error (MAE) for DESNZ_pilot_fraction = 0: {mae_pilot_zero}"
+    )
+
+    liverpool_hp_suitability_scores_with_desnz.write_parquet(
+        os.path.join(output_dir, "liverpool_hp_suitability_scores_with_desnz.parquet")
+    )
+    # Write to CSV file
+    liverpool_hp_suitability_scores_with_desnz.write_csv(
+        os.path.join(output_dir, "liverpool_hp_suitability_scores_with_desnz.csv")
+    )
+    logging.info("Saved processed data to this output directory:")
+    logging.info(output_dir)

From 4100bfd16b663c195cb5eb684fe85a9b957c8bd1 Mon Sep 17 00:00:00 2001
From: helloaidank <helloaidank@gmail.com>
Date: Wed, 4 Dec 2024 19:53:27 +0000
Subject: [PATCH 4/6] small changes to methods script alongside adding plot of
 average hn score vs desnz fraction

---
 outputs/hn_zones/comparison_of_hn_zones.py    |  10 +-
 .../hn_zones/plot_comparison_of_hn_zones.py   | 387 ++++++++++++++++++
 2 files changed, 391 insertions(+), 6 deletions(-)
 create mode 100644 outputs/hn_zones/plot_comparison_of_hn_zones.py

diff --git a/outputs/hn_zones/comparison_of_hn_zones.py b/outputs/hn_zones/comparison_of_hn_zones.py
index 6e42c27..6d5ec53 100644
--- a/outputs/hn_zones/comparison_of_hn_zones.py
+++ b/outputs/hn_zones/comparison_of_hn_zones.py
@@ -343,9 +343,7 @@ def calculate_mae_for_all(
     # Extract the optional_threshold value from args
     optional_threshold = args.optional_threshold
     # Define the output directory and set up logging
-    output_dir = os.path.join(
-        PROJECT_DIR, "asf_heat_pump_suitability/outputs/hn_zones/output_data/"
-    )
+    output_dir = os.path.join(PROJECT_DIR, "outputs/hn_zones/output_data/")
     setup_logging_and_file_path(output_dir=output_dir)
 
     # Load the data and perform spatial join
@@ -412,9 +410,9 @@ def calculate_mae_for_all(
         thresholds=thresholds,
     )
 
-    # Save the results to CSV
-    average_scores_df.write_csv(
-        os.path.join(output_dir, "average_scores_by_threshold.csv")
+    # Save the results to Parquet
+    average_scores_df.write_parquet(
+        os.path.join(output_dir, "average_scores_by_threshold.parquet")
     )
 
     # Calculate and log Mean Absolute Error (MAE)
diff --git a/outputs/hn_zones/plot_comparison_of_hn_zones.py b/outputs/hn_zones/plot_comparison_of_hn_zones.py
new file mode 100644
index 0000000..5048e1e
--- /dev/null
+++ b/outputs/hn_zones/plot_comparison_of_hn_zones.py
@@ -0,0 +1,387 @@
+"""
+This script processes and visualises heat network (HN) and heat pump suitability data for Liverpool.
+
+Steps:
+1. Load and preprocess data:
+   - Read Parquet, CSV, and GeoPackage files.
+   - Extract unique LSOA geometries.
+   - Load LSOA geometries and convert CRS if needed.
+   - Filter Liverpool LSOAs.
+
+2. Plot Liverpool LSOA geometries and save as PNG.
+
+3. Merge suitability scores with geometries and convert to GeoDataFrame.
+
+4. Plot and save overlay of DESNZ Pilot Score > 0 on Liverpool heat network zones.
+
+5. Define and use a function to plot absolute error maps by DESNZ Pilot Score, saving the plots as PNG files.
+
+6. Load and plot average Nesta HN scores against DESNZ Pilot Fraction along with plotting the best fit line and calculating the R^{2}.
+"""
+
+import geopandas as gpd
+import pandas as pd
+import polars as pl
+import matplotlib.pyplot as plt
+import pyogrio
+import json
+import os
+import logging
+from asf_heat_pump_suitability import PROJECT_DIR
+import numpy as np
+from sklearn.metrics import r2_score
+
+# Set up logging configuration
+logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
+
+# Define constants for file paths (LEAVE UNCHANGED)
+INPUT_DIR = os.path.join(PROJECT_DIR, "outputs/hn_zones/output_data/")
+
+LIVERPOOL_HP_SUITABILITY_PARQUET = os.path.join(
+    INPUT_DIR, "liverpool_hp_suitability_scores_with_desnz.parquet"
+)
+
+AVERAGE_HN_SCORES_COVERAGE_PARQUET = os.path.join(
+    INPUT_DIR, "average_scores_by_threshold.parquet"
+)
+print(AVERAGE_HN_SCORES_COVERAGE_PARQUET)
+
+LSOA_SHP_PATH = "s3://asf-heat-pump-suitability/source_data/Lower_layer_Super_Output_Areas_2021_EW_BFE_V9_-9107090204806789093/LSOA_2021_EW_BFE_V9.shp"
+LIVERPOOL_GPKG_PATH = "s3://asf-heat-pump-suitability/heat_network_desnz_data/heat-network-zone-map-Liverpool.gpkg"
+LSOA_JSON_PATH = os.path.join(INPUT_DIR, "liverpool_hp_suitability_lsoas.json")
+OUTPUT_DIR = os.path.join(PROJECT_DIR, "outputs/hn_zones/output_plots/")
+
+# Ensure the output directory exists
+os.makedirs(OUTPUT_DIR, exist_ok=True)
+
+# Define a variable for target CRS to avoid hardcoding
+TARGET_CRS = "EPSG:27700"
+
+
+def load_data():
+    """
+    Load the necessary data files and return them.
+
+    Returns:
+        Tuple:
+            - pd.DataFrame: DataFrame with Liverpool heat pump suitability scores.
+            - List[str]: List of LSOA codes for Liverpool.
+            - pl.DataFrame: DataFrame with average HN scores coverage.
+    Raises:
+        IOError: If any of the data files cannot be loaded.
+    """
+    try:
+        logging.info("Loading data files...")
+        liverpool_hp_suitability_scores_pd = pd.read_parquet(
+            LIVERPOOL_HP_SUITABILITY_PARQUET
+        )
+        with open(LSOA_JSON_PATH, "r") as file:
+            liverpool_hp_suitability_lsoas = json.load(file)
+        average_hn_scores_coverage_df = pl.read_parquet(
+            AVERAGE_HN_SCORES_COVERAGE_PARQUET
+        )
+
+        # Basic validation: Check expected columns in the main DataFrame
+        expected_cols = ["LSOA21CD", "DESNZ_pilot_fraction", "absolute_error"]
+        missing_cols = [
+            col
+            for col in expected_cols
+            if col not in liverpool_hp_suitability_scores_pd.columns
+        ]
+        if missing_cols:
+            raise ValueError(
+                f"Missing expected columns in main DataFrame: {missing_cols}"
+            )
+
+        return (
+            liverpool_hp_suitability_scores_pd,
+            liverpool_hp_suitability_lsoas,
+            average_hn_scores_coverage_df,
+        )
+    except Exception as e:
+        logging.error(f"Error loading data: {e}")
+        raise IOError(f"Failed to load input data due to: {e}")
+
+
+def preprocess_data(
+    liverpool_hp_suitability_lsoas: list,
+    lsoa_shp_path: str = LSOA_SHP_PATH,
+    target_crs: str = TARGET_CRS,
+) -> gpd.GeoDataFrame:
+    """
+    Load and preprocess LSOA geospatial data by extracting unique LSOA geometries and filtering Liverpool LSOAs.
+
+    Args:
+        liverpool_hp_suitability_lsoas (list): List of LSOA codes for Liverpool.
+        lsoa_shp_path (str): Path to the LSOA shapefile.
+        target_crs (str): The target CRS to ensure consistent geospatial analysis.
+
+    Returns:
+        gpd.GeoDataFrame: GeoDataFrame with filtered Liverpool LSOA geometries in the given CRS.
+    """
+    logging.info("Preprocessing LSOA geometries...")
+    lsoa_geometries_gdf = gpd.read_file(lsoa_shp_path)
+    if lsoa_geometries_gdf.crs != target_crs:
+        logging.info(f"Converting to {target_crs}")
+        lsoa_geometries_gdf = lsoa_geometries_gdf.to_crs(target_crs)
+    liverpool_lsoa_geometries_gdf = lsoa_geometries_gdf[
+        lsoa_geometries_gdf["LSOA21CD"].isin(liverpool_hp_suitability_lsoas)
+    ]
+
+    if liverpool_lsoa_geometries_gdf.empty:
+        raise ValueError("No LSOAs found for Liverpool in the provided LSOA shapefile.")
+
+    return liverpool_lsoa_geometries_gdf
+
+
+def plot_lsoa_geometries(liverpool_lsoa_geometries_gdf, output_dir: str = OUTPUT_DIR):
+    """
+    Plot Liverpool LSOA geometries and save as PNG.
+
+    Args:
+        liverpool_lsoa_geometries_gdf (gpd.GeoDataFrame): GeoDataFrame with Liverpool LSOA geometries.
+        output_dir (str): Output directory to save the plot image.
+    """
+    logging.info("Plotting LSOA geometries...")
+    fig, ax = plt.subplots(figsize=(12, 6))
+    liverpool_lsoa_geometries_gdf.plot(ax=ax)
+    plt.title("Liverpool LSOA Geometries")
+    plt.axis("off")
+    plt.savefig(os.path.join(output_dir, "liverpool_data_lsoas.png"))
+    plt.savefig(os.path.join(output_dir, "liverpool_data_lsoas.pdf"))
+    plt.show()
+
+
+def merge_data(
+    liverpool_hp_suitability_scores_pd: pd.DataFrame,
+    liverpool_lsoa_geometries_gdf: gpd.GeoDataFrame,
+) -> gpd.GeoDataFrame:
+    """
+    Merge suitability scores with geometries and convert to GeoDataFrame.
+
+    Args:
+        liverpool_hp_suitability_scores_pd (pd.DataFrame): DataFrame with Liverpool heat pump suitability scores.
+        liverpool_lsoa_geometries_gdf (gpd.GeoDataFrame): GeoDataFrame with Liverpool LSOA geometries.
+
+    Returns:
+        gpd.GeoDataFrame: GeoDataFrame with merged suitability scores and geometries.
+
+    Raises:
+        ValueError: If the merged result is empty or missing geometry.
+    """
+    logging.info("Merging suitability data with geometries...")
+    merged = liverpool_hp_suitability_scores_pd.merge(
+        liverpool_lsoa_geometries_gdf, on="LSOA21CD", how="left"
+    )
+
+    # Validate merged data
+    if merged.empty:
+        raise ValueError("Merging resulted in an empty DataFrame. Check LSOA codes.")
+    if merged["geometry"].isna().any():
+        raise ValueError("Some records are missing geometry after merging.")
+
+    liverpool_hp_suitability_gdf = gpd.GeoDataFrame(
+        merged, geometry="geometry", crs=TARGET_CRS
+    )
+    return liverpool_hp_suitability_gdf
+
+
+def plot_overlay(
+    liverpool_hp_suitability_gdf: gpd.GeoDataFrame,
+    liverpool_hnz_filepath: str = LIVERPOOL_GPKG_PATH,
+    output_dir: str = OUTPUT_DIR,
+):
+    """
+    Overlay of DESNZ pilot heat network zones on LSOAs in Liverpool.
+
+    Args:
+        liverpool_hp_suitability_gdf (gpd.GeoDataFrame): GeoDataFrame with Liverpool heat pump suitability scores and geometries.
+        liverpool_hnz_filepath (str): Path to the Liverpool heat network zones GeoPackage.
+        output_dir (str): Output directory to save the plot image.
+    """
+    logging.info("Plotting overlay of DESNZ pilot heat network zones...")
+    gdf = pyogrio.read_dataframe(
+        liverpool_hnz_filepath, layer="heat-network-zone-map-Liverpool"
+    )
+    fig, ax = plt.subplots(figsize=(12, 6))
+    gdf.plot(ax=ax, color="blue", alpha=0.5, label="DESNZ pilot heat network zones")
+    liverpool_hp_suitability_gdf[
+        liverpool_hp_suitability_gdf["DESNZ_pilot_fraction"] > 0.0
+    ].plot(ax=ax, color="pink", alpha=0.5, label="LSOAs with DESNZ pilot heat network")
+    plt.title("Overlay of DESNZ pilot heat network zones on Liverpool LSOAs")
+    plt.axis("off")
+    plt.savefig(os.path.join(output_dir, "overlay_desnz_pilot.png"))
+    plt.savefig(os.path.join(output_dir, "overlay_desnz_pilot.pdf"))
+    plt.show()
+
+
+def plot_absolute_error_map(
+    hp_suitability_gdf: gpd.GeoDataFrame,
+    score: float,
+    title: str,
+    filename: str,
+    output_dir: str = OUTPUT_DIR,
+):
+    """
+    Plot an absolute error map for areas in Liverpool based on the DESNZ pilot score.
+
+    Args:
+        hp_suitability_gdf (gpd.GeoDataFrame): GeoDataFrame containing Liverpool's LSOA geometries and
+                                               heat pump suitability scores, including 'absolute_error' and
+                                               'DESNZ_pilot_fraction' columns.
+        score (float): DESNZ pilot score threshold.
+                       If > 0.0, plot regions where DESNZ_pilot_fraction >= score.
+                       If 0, plot regions where DESNZ_pilot_fraction == 0.
+        title (str): Title for the plot.
+        filename (str): Filename (PNG) to save the plot.
+        output_dir (str): Output directory to save the plot image.
+
+    Raises:
+        ValueError: If score < 0.
+    """
+    logging.info(f"Plotting absolute error map for score {score}...")
+    if score < 0:
+        raise ValueError("Score must be a non-negative value.")
+
+    fig, ax = plt.subplots(figsize=(12, 6))
+
+    if score > 0.0:
+        subset = hp_suitability_gdf[hp_suitability_gdf["DESNZ_pilot_fraction"] >= score]
+        other = hp_suitability_gdf[hp_suitability_gdf["DESNZ_pilot_fraction"] < score]
+    else:  # score == 0
+        subset = hp_suitability_gdf[hp_suitability_gdf["DESNZ_pilot_fraction"] == 0]
+        other = hp_suitability_gdf[hp_suitability_gdf["DESNZ_pilot_fraction"] > 0]
+
+    if subset.empty:
+        logging.warning(f"No regions found for the given score threshold: {score}")
+
+    subset.plot(column="absolute_error", ax=ax, legend=True, cmap="inferno")
+    other.plot(ax=ax, edgecolor="black", facecolor="none", linewidth=0.5)
+
+    # Update legend labeling for clarity
+    cbar = ax.get_figure().get_axes()[1]
+    cbar.set_ylabel(
+        "Absolute Error (|Avg. HN Score - DESNZ HN Score|)", rotation=90, labelpad=15
+    )
+
+    plt.title(title)
+    plt.axis("off")
+    plt.savefig(os.path.join(output_dir, f"{filename}.png"))
+    plt.savefig(os.path.join(output_dir, f"{filename}.pdf"))
+    plt.show()
+
+
+def plot_hn_avg_score_vs_fraction_threshold(
+    average_hn_scores_coverage_df: pl.DataFrame,
+    fraction_thresholds_col: str,
+    hn_avg_scores_col: str,
+    filename: str,
+    output_dir: str = OUTPUT_DIR,
+):
+    """
+    Plot the average Nesta HN score against DESNZ Pilot Fraction Threshold, fit a regression line,
+    and display R^2.
+
+    Args:
+        average_hn_scores_coverage_df (pl.DataFrame): DataFrame with average Nesta HN scores and DESNZ Pilot Fraction.
+        fraction_thresholds_col (str): Column name for DESNZ Pilot Fraction Threshold.
+        hn_avg_scores_col (str): Column name for HN_N Avg Score Weighted.
+    """
+    logging.info("Plotting average HN score vs DESNZ Pilot Fraction Threshold...")
+    fraction_thresholds = average_hn_scores_coverage_df[
+        fraction_thresholds_col
+    ].to_list()
+    hn_avg_scores = average_hn_scores_coverage_df[hn_avg_scores_col].to_list()
+
+    x = np.array(fraction_thresholds)
+    y = np.array(hn_avg_scores)
+
+    # Fit a linear model
+    coefficients = np.polyfit(x, y, 1)
+    model = np.poly1d(coefficients)
+    predicted_y = model(x)
+    r_squared = r2_score(y, predicted_y)
+
+    y_min = min(hn_avg_scores) - 0.05
+    y_max = max(hn_avg_scores) + 0.05
+
+    fig, ax = plt.subplots(figsize=(10, 6))
+    ax.plot(
+        fraction_thresholds,
+        hn_avg_scores,
+        marker="o",
+        linestyle="-",
+        linewidth=2,
+        label="Data",
+    )
+    ax.plot(
+        fraction_thresholds,
+        predicted_y,
+        color="red",
+        linestyle="--",
+        linewidth=2,
+        label="Regression line",
+    )
+
+    ax.set_ylim(y_min, y_max)
+    ax.spines["top"].set_visible(False)
+    ax.spines["right"].set_visible(False)
+
+    # Break mark on the y-axis
+    d = 0.015
+    kwargs = dict(transform=ax.transAxes, color="k", clip_on=False)
+    ax.plot((-d, +d), (-d, +d), **kwargs)
+    ax.plot((-d, +d), (1 - d, 1 + d), **kwargs)
+
+    ax.set_title(
+        "Average Nesta HN Score vs Fraction of DESNZ HN zone in LSOA", fontsize=14
+    )
+    ax.set_xlabel("Fraction of DESNZ HN zone in LSOA", fontsize=12)
+    ax.set_ylabel("Average Nesta HN Score", fontsize=12)
+    ax.grid(True)
+    ax.text(
+        0.05,
+        0.95,
+        f"$R^2 = {r_squared:.2f}$",
+        transform=ax.transAxes,
+        fontsize=12,
+        verticalalignment="top",
+        bbox=dict(facecolor="white", alpha=0.8, edgecolor="none"),
+    )
+    ax.legend()
+    plt.tight_layout()
+    plt.savefig(os.path.join(output_dir, f"{filename}.png"))
+    plt.savefig(os.path.join(output_dir, f"{filename}.pdf"))
+    plt.show()
+
+
+if __name__ == "__main__":
+    (
+        liverpool_hp_suitability_scores_pd,
+        liverpool_hp_suitability_lsoas,
+        average_hn_scores_coverage_df,
+    ) = load_data()
+    liverpool_lsoa_geometries_gdf = preprocess_data(liverpool_hp_suitability_lsoas)
+    plot_lsoa_geometries(liverpool_lsoa_geometries_gdf)
+    liverpool_hp_suitability_gdf = merge_data(
+        liverpool_hp_suitability_scores_pd, liverpool_lsoa_geometries_gdf
+    )
+    plot_overlay(liverpool_hp_suitability_gdf)
+    plot_absolute_error_map(
+        liverpool_hp_suitability_gdf,
+        score=0.000001,
+        title="Absolute Error Map for when a DESNZ HN zone is present for Liverpool",
+        filename="liverpool_data_with_pilot_score_1_v2",
+    )
+    plot_absolute_error_map(
+        liverpool_hp_suitability_gdf,
+        score=0,
+        title="Absolute Error Map for when a DESNZ HN zone is absent for Liverpool",
+        filename="liverpool_data_with_pilot_score_0_v2",
+    )
+    plot_hn_avg_score_vs_fraction_threshold(
+        average_hn_scores_coverage_df,
+        fraction_thresholds_col="DESNZ_pilot_fraction_threshold",
+        hn_avg_scores_col="HN_N_avg_score_weighted",
+        filename="HN_N_avg_score_vs_fraction_cover_by_DESNZ_HN",
+    )

From 128474ba7fe9b5dbcd4b751c79ce6e0257682e92 Mon Sep 17 00:00:00 2001
From: helloaidank <helloaidank@gmail.com>
Date: Mon, 23 Dec 2024 16:33:08 +0000
Subject: [PATCH 5/6] suggested changes

---
 outputs/hn_zones/comparison_of_hn_zones.py | 233 +++++++++++++++------
 1 file changed, 174 insertions(+), 59 deletions(-)

diff --git a/outputs/hn_zones/comparison_of_hn_zones.py b/outputs/hn_zones/comparison_of_hn_zones.py
index 6d5ec53..69d3092 100644
--- a/outputs/hn_zones/comparison_of_hn_zones.py
+++ b/outputs/hn_zones/comparison_of_hn_zones.py
@@ -26,31 +26,41 @@
 
 **How to Run the Script**:
 To run the script, use the following command:
-python comparison_of_hn_zones.py [--optional_threshold OPTIONAL_THRESHOLD]
+python comparison_of_hn_zones.py [--optional_threshold OPTIONAL_THRESHOLD] [--read_in_s3 READ_IN_S3] [--save_to_s3 SAVE_TO_S3]
+
+Example:
+    # Run the script with default settings (local files, no threshold)
+    python comparison_of_hn_zones.py
+
+    # Run the script with a specific threshold
+    python comparison_of_hn_zones.py --optional_threshold 0.1
+
+    # Run the script with files from S3
+    python comparison_of_hn_zones.py --read_in_s3 True
+
+    # Run the script and save outputs to S3
+    python comparison_of_hn_zones.py --save_to_s3 True
+
+    # Run the script with a specific threshold, read from S3, and save to S3
+    python comparison_of_hn_zones.py --optional_threshold 0.1 --read_in_s3 True --save_to_s3 True
 """
 
 import geopandas as gpd
 import pyogrio
 import polars as pl
-from typing import Tuple, List, Callable, Optional
+from typing import Tuple, List, Optional
 import logging
 import json
 import os
-from shapely.geometry import Polygon
 from asf_heat_pump_suitability import PROJECT_DIR
 import argparse
+import boto3
 
 
 # Define file paths
-# LIVERPOOL_GPKG_PATH = "s3://asf-heat-pump-suitability/heat_network_desnz_data/heat-network-zone-map-Liverpool.gpkg"
-# LSOA_SHP_PATH = "s3://asf-heat-pump-suitability/source_data/Lower_layer_Super_Output_Areas_2021_EW_BFE_V9_-9107090204806789093/LSOA_2021_EW_BFE_V9.shp"
-# NESTA_HP_SUITABILITY_PARQUET_PATH = "s3://nesta-open-data/asf_heat_pump_suitability/2023Q4/20240925_2023_Q4_EPC_heat_pump_suitability_per_lsoa.parquet"
 LA_TO_ANALYSE = "Liverpool"
-LIVERPOOL_GPKG_PATH = "heat-network-zone-map-Liverpool.gpkg"
-LSOA_SHP_PATH = "LSOA_2021_EW_BFE_V9.shp"
-NESTA_HP_SUITABILITY_PARQUET_PATH = (
-    "20240925_2023_Q4_EPC_heat_pump_suitability_per_lsoa.parquet"
-)
+S3_BUCKET = "asf-heat-pump-suitability"
+S3_KEY_DIR = "evaluation/desnz_hn_zone_scores/"
 
 
 def setup_logging_and_file_path(
@@ -85,14 +95,54 @@ def setup_logging_and_file_path(
     logging.info(f"Logging setup complete. Logs are saved to {log_file_path}.")
 
 
-def load_hn_geodata(
+def setup_paths(read_in_s3: bool):
+    """
+    Set up the paths based on whether to read from S3 or locally.
+
+    Args:
+        read_in_s3 (bool): If True, set up paths to read from S3. Otherwise, set up local paths.
+
+    Returns:
+        Tuple[str, str, str]: Paths for LIVERPOOL_GPKG_PATH, LSOA_SHP_PATH, and NESTA_HP_SUITABILITY_PARQUET_PATH.
+    """
+    if read_in_s3:
+        LIVERPOOL_GPKG_PATH = "s3://asf-heat-pump-suitability/heat_network_desnz_data/heat-network-zone-map-Liverpool.gpkg"
+        LSOA_SHP_PATH = "s3://asf-heat-pump-suitability/source_data/Lower_layer_Super_Output_Areas_2021_EW_BFE_V9_-9107090204806789093/LSOA_2021_EW_BFE_V9.shp"
+        NESTA_HP_SUITABILITY_PARQUET_PATH = "s3://nesta-open-data/asf_heat_pump_suitability/2023Q4/20240925_2023_Q4_EPC_heat_pump_suitability_per_lsoa.parquet"
+    else:
+        LIVERPOOL_GPKG_PATH = "heat-network-zone-map-Liverpool.gpkg"
+        LSOA_SHP_PATH = "LSOA_2021_EW_BFE_V9.shp"
+        NESTA_HP_SUITABILITY_PARQUET_PATH = (
+            "20240925_2023_Q4_EPC_heat_pump_suitability_per_lsoa.parquet"
+        )
+    return LIVERPOOL_GPKG_PATH, LSOA_SHP_PATH, NESTA_HP_SUITABILITY_PARQUET_PATH
+
+
+def optionally_upload_file_to_s3(
+    local_file_path: str, s3_bucket: str, s3_key: str, save_to_s3: bool
+) -> None:
+    """
+    Upload a local file to an S3 bucket.
+
+    Args:
+        local_file_path (str): Path to the local file.
+        s3_bucket (str): Name of the S3 bucket.
+        s3_key (str): S3 key (path) where the file should be uploaded.
+    """
+    if save_to_s3:
+        s3_client = boto3.client("s3")
+        s3_client.upload_file(local_file_path, s3_bucket, s3_key)
+        logging.info(f"File uploaded to s3://{s3_bucket}/{s3_key}")
+
+
+def load_transform_hn_geodata(
     desnz_hn_gpkg_path: str, lsoa_shp_path: str
 ) -> Tuple[gpd.GeoDataFrame, List[str]]:
     """
-    Load the HN GeoPackage file and perform a spatial join with LSOA polygons to add a column of LSOA codes.
+    Load the Heat Networks GeoPackage file, perform a spatial join with LSOA polygons to add a column of LSOA codes and calculates the intersection area and the fraction of LSOA area covered by HN zones.
 
     Args:
-        desnz_hn_gpkg_path (str): Path to the GeoPackage file with the HN zones.
+        desnz_hn_gpkg_path (str): Path to the GeoPackage file with the Heat Network zones.
         lsoa_shp_path (str): Path to the LSOA shapefile.
 
     Returns:
@@ -122,22 +172,12 @@ def load_hn_geodata(
         how="left",
         predicate="intersects",
     )
-
-    # Calculate the intersection area and the fraction of LSOA area covered by HN zones
-    joined_gdf["intersection_area"] = joined_gdf.apply(
-        lambda row: row.geometry.intersection(
-            lsoa_gdf.loc[row.index_right, "geometry"]
-        ).area,
-        axis=1,
-    )
     lsoa_gdf["total_area"] = lsoa_gdf.geometry.area
-
-    # Merge the LSOA geometries into the joined GeoDataFrame
-    joined_gdf = joined_gdf.merge(lsoa_gdf[["LSOA21CD", "total_area"]], on="LSOA21CD")
-
-    # Extract unique LSOA codes from the joined GeoDataFrame
+    # Get all intersections between DESNZ heat network zones and LSOAs
+    joined_gdf = gpd.overlay(desnz_hn_gdf, lsoa_gdf, how="intersection")
+    # Calculate area of intersections
     joined_gdf["fraction_covered"] = (
-        joined_gdf["intersection_area"] / joined_gdf["total_area"]
+        joined_gdf["geometry"].area / joined_gdf["total_area"]
     )
     desnz_hn_unique_lsoas = joined_gdf["LSOA21CD"].dropna().unique().tolist()
     return joined_gdf, desnz_hn_unique_lsoas
@@ -187,10 +227,11 @@ def add_DESNZ_pilot_fraction(
     Args:
         la_hp_suitability_scores (pl.DataFrame): Data containing 'LSOA21CD' and 'HN_N_avg_score_weighted' columns for a single LA.
         joined_gdf (gpd.GeoDataFrame): GeoDataFrame containing 'LSOA21CD' and 'fraction_covered' columns.
+        optional_threshold (Optional[float]): Optional threshold for fraction of HN zone area contained within LA. Defaults to 0.
 
     Returns:
         Tuple[pl.DataFrame, float, float]:
-            - Updated DataFrame with 'DESNZ_pilot_fraction' column added (fraction of area covered by HN zones).
+            - Updated DataFrame with 'DESNZ_pilot_fraction' column added (fraction of local authority area covered by HN zones).
             - Average 'HN_N_avg_score_weighted' for rows where 'DESNZ_pilot_fraction' is non-zero.
             - Average 'HN_N_avg_score_weighted' for rows where 'DESNZ_pilot_fraction' is zero.
     """
@@ -215,10 +256,11 @@ def add_DESNZ_pilot_fraction(
     # Calculating the average Nesta HN score for non-zero and zero DESNZ pilot scores
     avg_hn_score_pilot_nonzero = _calculate_hn_pilot_average_score(
         la_hp_suitability_scores,
-        pilot_score_condition=lambda col: col > optional_threshold,
+        hn_zones=True,
+        optional_threshold=optional_threshold,
     )
     avg_hn_score_pilot_zero = _calculate_hn_pilot_average_score(
-        la_hp_suitability_scores, pilot_score_condition=lambda col: col == 0
+        la_hp_suitability_scores, hn_zones=False
     )
     return la_hp_suitability_scores, avg_hn_score_pilot_nonzero, avg_hn_score_pilot_zero
 
@@ -239,7 +281,7 @@ def calculate_average_scores_for_thresholds(
     results = []
     for threshold in thresholds:
         avg_score = _calculate_hn_pilot_average_score(
-            la_hp_suitability_scores, pilot_score_condition=lambda col: col > threshold
+            la_hp_suitability_scores, hn_zones=True, optional_threshold=threshold
         )
         results.append(
             {
@@ -253,22 +295,30 @@ def calculate_average_scores_for_thresholds(
 
 def _calculate_hn_pilot_average_score(
     la_hp_suitability_scores: pl.DataFrame,
-    pilot_score_condition: Callable[[pl.Expr], pl.Expr],
+    hn_zones: bool,
+    optional_threshold: Optional[float] = 0,
 ) -> float:
     """
-    Calculate the average Nesta Heat Network score for LSOAs in (`DESNZ_pilot_fraction > 0`) or not in (`DESNZ_pilot_fraction == 0`) DESNZ heat network pilot areas.
+    Calculate the average Nesta Heat Network score for LSOAs in (`DESNZ_pilot_fraction > optional_threshold`) or not in (`DESNZ_pilot_fraction == 0`) DESNZ heat network pilot areas.
 
     Args:
         la_hp_suitability_scores (pl.DataFrame): DataFrame containing 'DESNZ_pilot_fraction' and 'HN_N_avg_score_weighted' columns for a LA.
-        pilot_score_condition (Callable[[pl.Expr], pl.Expr]): The value of 'DESNZ_pilot_fraction' to filter by (must be >0 or 0).
+        hn_zones (bool): If True, calculate average Nesta heat network score for LSOAs in DESNZ heat network zones. Set to False to calculate the average score for LSOAs not in heat network zones.
+        optional_threshold (Optional[float]): The threshold value for 'DESNZ_pilot_fraction'. Defaults to 0. Range: 0-1.
 
     Returns:
         float: Average 'HN_N_avg_score_weighted' for the filtered rows.
     """
-    # Apply the condition function to filter the DataFrame
-    filtered_la_hp_suitability_scores = la_hp_suitability_scores.filter(
-        pilot_score_condition(pl.col("DESNZ_pilot_fraction"))
-    )
+    if hn_zones:
+        # Filter for LSOAs in DESNZ heat network zones
+        filtered_la_hp_suitability_scores = la_hp_suitability_scores.filter(
+            pl.col("DESNZ_pilot_fraction") > optional_threshold
+        )
+    else:
+        # Filter for LSOAs not in DESNZ heat network zones
+        filtered_la_hp_suitability_scores = la_hp_suitability_scores.filter(
+            pl.col("DESNZ_pilot_fraction") == 0
+        )
 
     # Calculate the average score
     avg_score = filtered_la_hp_suitability_scores["HN_N_avg_score_weighted"].mean()
@@ -277,22 +327,32 @@ def _calculate_hn_pilot_average_score(
 
 def calculate_mae_for_pilot_score(
     hp_suitability_scores_with_desnz: pl.DataFrame,
-    condition_score_value: Callable[[pl.Expr], pl.Expr],
+    hn_zones: bool,
+    optional_threshold: Optional[float] = 0,
 ) -> float:
     """
-    Calculate the Mean Absolute Error (MAE) for entries satisfying a specified condition on 'DESNZ_pilot_fraction'.
+    Calculate the Mean Absolute Error (MAE) for entries in or not in DESNZ heat network zones.
 
     Args:
         hp_suitability_scores_with_desnz (pl.DataFrame): DataFrame containing 'DESNZ_pilot_fraction' and 'absolute_error' columns.
-        condition_score_value (Callable[[pl.Expr], pl.Expr]): A function defining the condition to filter 'DESNZ_pilot_fraction'.
+        hn_zones (bool): If True, calculate MAE for entries in DESNZ heat network zones. Set to False to calculate MAE for entries not in heat network zones.
+        optional_threshold (Optional[float]): The threshold value for 'DESNZ_pilot_fraction'. Defaults to 0. Range: 0-1.
 
     Returns:
         float: The MAE for the specified condition.
     """
-    # Apply the DEZNZ pilot score condition to filter the DataFrame
-    filtered_df = hp_suitability_scores_with_desnz.filter(
-        condition_score_value(pl.col("DESNZ_pilot_fraction"))
-    )
+    if hn_zones:
+        # Filter for entries in DESNZ heat network zones
+        filtered_df = hp_suitability_scores_with_desnz.filter(
+            pl.col("DESNZ_pilot_fraction") > optional_threshold
+        )
+    else:
+        # Filter for entries not in DESNZ heat network zones
+        filtered_df = hp_suitability_scores_with_desnz.filter(
+            pl.col("DESNZ_pilot_fraction") == 0
+        )
+
+    # Calculate the MAE
     mae = filtered_df["absolute_error"].mean()
     return mae
 
@@ -337,25 +397,55 @@ def calculate_mae_for_all(
         "--optional_threshold",
         type=float,
         default=0.0,
-        help="Optional threshold for DESNZ pilot fraction for calculating average Nesta HN score.",
+        help="Optional threshold for DESNZ pilot fraction for calculating average Nesta HN score. Range: 0-1.",
+    )
+    parser.add_argument(
+        "--read_in_s3",
+        type=bool,
+        default=False,
+        help="Read in the input files from S3.",
+    )
+    parser.add_argument(
+        "--save_to_s3",
+        type=bool,
+        default=False,
+        help="Save the output files to S3.",
     )
     args = parser.parse_args()
     # Extract the optional_threshold value from args
     optional_threshold = args.optional_threshold
+    save_to_s3 = args.save_to_s3
+    read_in_s3 = args.read_in_s3
+    LIVERPOOL_GPKG_PATH, LSOA_SHP_PATH, NESTA_HP_SUITABILITY_PARQUET_PATH = setup_paths(
+        read_in_s3=read_in_s3
+    )
     # Define the output directory and set up logging
     output_dir = os.path.join(PROJECT_DIR, "outputs/hn_zones/output_data/")
+
     setup_logging_and_file_path(output_dir=output_dir)
 
     # Load the data and perform spatial join
-    liverpool_with_desnz_hn_lsoa, list_of_liverpool_desnz_hn_lsoas = load_hn_geodata(
-        desnz_hn_gpkg_path=LIVERPOOL_GPKG_PATH, lsoa_shp_path=LSOA_SHP_PATH
+    liverpool_with_desnz_hn_lsoa, list_of_liverpool_desnz_hn_lsoas = (
+        load_transform_hn_geodata(
+            desnz_hn_gpkg_path=LIVERPOOL_GPKG_PATH, lsoa_shp_path=LSOA_SHP_PATH
+        )
     )
 
     logging.info("Loaded Liverpool with DESNZ HN LSOA data.")
+
+    liv_desnz_hn_filename = "liverpool_with_desnz_hn_lsoa.gpkg"
+    liv_desnz_hn_local_file_path = os.path.join(output_dir, liv_desnz_hn_filename)
     liverpool_with_desnz_hn_lsoa.to_file(
-        filename=os.path.join(output_dir, "liverpool_with_desnz_hn_lsoa.gpkg"),
+        filename=liv_desnz_hn_local_file_path,
         driver="GPKG",
     )
+    liv_desnz_hn_s3_key = f"{S3_KEY_DIR}{liv_desnz_hn_filename}"
+    optionally_upload_file_to_s3(
+        local_file_path=liv_desnz_hn_local_file_path,
+        s3_bucket=S3_BUCKET,
+        s3_key=liv_desnz_hn_s3_key,
+        save_to_s3=save_to_s3,
+    )
 
     # Process Nesta heat pump suitability scores
     liverpool_hp_suitability_scores, liverpool_hp_suitability_lsoas = (
@@ -365,12 +455,21 @@ def calculate_mae_for_all(
         )
     )
     logging.info("Processed Nesta HP suitability scores for Liverpool.")
+    # Define JSON file name and paths
+    lsoas_json_filename = "liverpool_hp_suitability_lsoas.json"
+    lsoas_json_local_file_path = os.path.join(output_dir, lsoas_json_filename)
+    lsoas_json_s3_key = f"{S3_KEY_DIR}{lsoas_json_filename}"
 
-    with open(
-        os.path.join(output_dir, "liverpool_hp_suitability_lsoas.json"), "w"
-    ) as file:
+    with open(os.path.join(output_dir, lsoas_json_filename), "w") as file:
         json.dump(liverpool_hp_suitability_lsoas, file)
 
+    optionally_upload_file_to_s3(
+        local_file_path=lsoas_json_local_file_path,
+        s3_bucket=S3_BUCKET,
+        s3_key=lsoas_json_s3_key,
+        save_to_s3=save_to_s3,
+    )
+
     # Check LSOAs not in HP suitability scores
     not_in_hp_suitability = set(list_of_liverpool_desnz_hn_lsoas) - set(
         liverpool_hp_suitability_lsoas
@@ -411,8 +510,14 @@ def calculate_mae_for_all(
     )
 
     # Save the results to Parquet
-    average_scores_df.write_parquet(
-        os.path.join(output_dir, "average_scores_by_threshold.parquet")
+    avg_score_parquet_filename = "average_scores_by_threshold.parquet"
+    avg_score_parquet_filepath = os.path.join(output_dir, avg_score_parquet_filename)
+    average_scores_df.write_parquet(avg_score_parquet_filepath)
+    optionally_upload_file_to_s3(
+        local_file_path=avg_score_parquet_filepath,
+        s3_bucket=S3_BUCKET,
+        s3_key=f"{S3_KEY_DIR}{avg_score_parquet_filename}",
+        save_to_s3=save_to_s3,
     )
 
     # Calculate and log Mean Absolute Error (MAE)
@@ -423,21 +528,31 @@ def calculate_mae_for_all(
     )
     mae_pilot_non_zero = calculate_mae_for_pilot_score(
         hp_suitability_scores_with_desnz=liverpool_hp_suitability_scores_with_desnz,
-        condition_score_value=lambda col: col > 0,
+        hn_zones=True,
     )
     logging.info(
         f"Mean Absolute Error (MAE) for DESNZ_pilot_fraction > 0: {mae_pilot_non_zero}"
     )
     mae_pilot_zero = calculate_mae_for_pilot_score(
         hp_suitability_scores_with_desnz=liverpool_hp_suitability_scores_with_desnz,
-        condition_score_value=lambda col: col == 0,
+        hn_zones=False,
     )
     logging.info(
         f"Mean Absolute Error (MAE) for DESNZ_pilot_fraction = 0: {mae_pilot_zero}"
     )
-
+    # Define Parquet file name and paths for MAE results
+    mae_parquet_filename = "liverpool_hp_suitability_scores_with_desnz.parquet"
+    mae_parquet_local_file_path = os.path.join(output_dir, mae_parquet_filename)
+    mae_parquet_s3_key = f"{S3_KEY_DIR}{mae_parquet_filename}"
     liverpool_hp_suitability_scores_with_desnz.write_parquet(
-        os.path.join(output_dir, "liverpool_hp_suitability_scores_with_desnz.parquet")
+        mae_parquet_local_file_path
+    )
+
+    optionally_upload_file_to_s3(
+        local_file_path=mae_parquet_local_file_path,
+        s3_bucket=S3_BUCKET,
+        s3_key=mae_parquet_s3_key,
+        save_to_s3=save_to_s3,
     )
     # Write to CSV file
     liverpool_hp_suitability_scores_with_desnz.write_csv(

From 94e813046624d055a518ea5587bfd7a903c32269 Mon Sep 17 00:00:00 2001
From: helloaidank <helloaidank@gmail.com>
Date: Mon, 23 Dec 2024 16:46:06 +0000
Subject: [PATCH 6/6] changes to the plotting script

---
 .../hn_zones/plot_comparison_of_hn_zones.py   | 135 +++++++++++++-----
 1 file changed, 100 insertions(+), 35 deletions(-)

diff --git a/outputs/hn_zones/plot_comparison_of_hn_zones.py b/outputs/hn_zones/plot_comparison_of_hn_zones.py
index 5048e1e..4b3f022 100644
--- a/outputs/hn_zones/plot_comparison_of_hn_zones.py
+++ b/outputs/hn_zones/plot_comparison_of_hn_zones.py
@@ -1,5 +1,5 @@
 """
-This script processes and visualises heat network (HN) and heat pump suitability data for Liverpool.
+This script processes and visualises DESNZ heat network (HN) and Nesta heat pump suitability data for Liverpool.
 
 Steps:
 1. Load and preprocess data:
@@ -16,7 +16,20 @@
 
 5. Define and use a function to plot absolute error maps by DESNZ Pilot Score, saving the plots as PNG files.
 
-6. Load and plot average Nesta HN scores against DESNZ Pilot Fraction along with plotting the best fit line and calculating the R^{2}.
+6. Load and plot average Nesta HN scores against DESNZ Pilot Fraction threshold along with plotting the best fit line and calculating the R^{2}.
+
+Usage:
+    python plot_comparison_of_hn_zones.py [--read_from_s3]
+
+Arguments:
+    --read_from_s3 (bool): If True, read the input files from S3 bucket. Defaults to False.
+
+Example:
+    # Run the script with local files
+    python plot_comparison_of_hn_zones.py
+
+    # Run the script with files from S3
+    python plot_comparison_of_hn_zones.py --read_from_s3 True
 """
 
 import geopandas as gpd
@@ -30,56 +43,92 @@
 from asf_heat_pump_suitability import PROJECT_DIR
 import numpy as np
 from sklearn.metrics import r2_score
+import argparse
+import boto3
 
 # Set up logging configuration
 logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
-
-# Define constants for file paths (LEAVE UNCHANGED)
+# Define local directories and file paths
 INPUT_DIR = os.path.join(PROJECT_DIR, "outputs/hn_zones/output_data/")
-
-LIVERPOOL_HP_SUITABILITY_PARQUET = os.path.join(
+LOCAL_LIVERPOOL_HP_SUITABILITY_PARQUET = os.path.join(
     INPUT_DIR, "liverpool_hp_suitability_scores_with_desnz.parquet"
 )
-
-AVERAGE_HN_SCORES_COVERAGE_PARQUET = os.path.join(
+LOCAL_AVERAGE_HN_SCORES_COVERAGE_PARQUET = os.path.join(
     INPUT_DIR, "average_scores_by_threshold.parquet"
 )
-print(AVERAGE_HN_SCORES_COVERAGE_PARQUET)
+LOCAL_LSOA_JSON_PATH = os.path.join(INPUT_DIR, "liverpool_hp_suitability_lsoas.json")
 
 LSOA_SHP_PATH = "s3://asf-heat-pump-suitability/source_data/Lower_layer_Super_Output_Areas_2021_EW_BFE_V9_-9107090204806789093/LSOA_2021_EW_BFE_V9.shp"
 LIVERPOOL_GPKG_PATH = "s3://asf-heat-pump-suitability/heat_network_desnz_data/heat-network-zone-map-Liverpool.gpkg"
-LSOA_JSON_PATH = os.path.join(INPUT_DIR, "liverpool_hp_suitability_lsoas.json")
+
+# Define S3 variables and paths
+S3_BUCKET = "asf-heat-pump-suitability"
+S3_KEY_DIR = "evaluation/desnz_hn_zone_scores/"
+S3_KEY_LIVERPOOL_HP_SUITABILITY_PARQUET = (
+    f"{S3_KEY_DIR}liverpool_hp_suitability_scores_with_desnz.parquet"
+)
+S3_KEY_AVERAGE_HN_SCORES_COVERAGE_PARQUET = (
+    f"{S3_KEY_DIR}average_scores_by_threshold.parquet"
+)
+S3_KEY_LSOA_JSON_PATH = f"{S3_KEY_DIR}liverpool_hp_suitability_lsoas.json"
+# Define output directory
 OUTPUT_DIR = os.path.join(PROJECT_DIR, "outputs/hn_zones/output_plots/")
 
 # Ensure the output directory exists
 os.makedirs(OUTPUT_DIR, exist_ok=True)
 
+
 # Define a variable for target CRS to avoid hardcoding
 TARGET_CRS = "EPSG:27700"
 
 
-def load_data():
+def load_data(read_from_s3: bool = False) -> tuple:
     """
     Load the necessary data files and return them.
 
+    Args:
+        read_from_s3 (bool): If True, read the input files from S3. If False, read the input files locally.
+
     Returns:
         Tuple:
-            - pd.DataFrame: DataFrame with Liverpool heat pump suitability scores.
+            - pl.DataFrame: DataFrame with Liverpool heat pump suitability scores.
             - List[str]: List of LSOA codes for Liverpool.
-            - pl.DataFrame: DataFrame with average HN scores coverage.
+            - pl.DataFrame: DataFrame with the average heat network scores for different area thresholds (threshold is the fraction of an LSOA which is covered by a DESNZ HN zone).
     Raises:
         IOError: If any of the data files cannot be loaded.
     """
     try:
         logging.info("Loading data files...")
-        liverpool_hp_suitability_scores_pd = pd.read_parquet(
-            LIVERPOOL_HP_SUITABILITY_PARQUET
-        )
-        with open(LSOA_JSON_PATH, "r") as file:
-            liverpool_hp_suitability_lsoas = json.load(file)
-        average_hn_scores_coverage_df = pl.read_parquet(
-            AVERAGE_HN_SCORES_COVERAGE_PARQUET
-        )
+        if read_from_s3:
+            # Read from S3
+            s3_client = boto3.client("s3")
+
+            # Read Liverpool heat pump suitability scores
+            liv_hps_obj = s3_client.get_object(
+                Bucket=S3_BUCKET, Key=S3_KEY_LIVERPOOL_HP_SUITABILITY_PARQUET
+            )
+            liverpool_hp_suitability_scores_pd = pd.read_parquet(liv_hps_obj["Body"])
+
+            # Read LSOA JSON
+            liv_hps_lsoas_obj = s3_client.get_object(
+                Bucket=S3_BUCKET, Key=S3_KEY_LSOA_JSON_PATH
+            )
+            liverpool_lsoas = json.loads(liv_hps_lsoas_obj["Body"].read())
+
+            # Read average HN scores coverage
+            avg_hn_scores_obj = s3_client.get_object(
+                Bucket=S3_BUCKET, Key=S3_KEY_AVERAGE_HN_SCORES_COVERAGE_PARQUET
+            )
+            average_hn_scores_coverage_df = pl.read_parquet(avg_hn_scores_obj["Body"])
+        else:
+            liverpool_hp_suitability_scores_pd = pd.read_parquet(
+                LOCAL_LIVERPOOL_HP_SUITABILITY_PARQUET
+            )
+            with open(LOCAL_LSOA_JSON_PATH, "r") as file:
+                liverpool_lsoas = json.load(file)
+            average_hn_scores_coverage_df = pl.read_parquet(
+                LOCAL_AVERAGE_HN_SCORES_COVERAGE_PARQUET
+            )
 
         # Basic validation: Check expected columns in the main DataFrame
         expected_cols = ["LSOA21CD", "DESNZ_pilot_fraction", "absolute_error"]
@@ -90,12 +139,12 @@ def load_data():
         ]
         if missing_cols:
             raise ValueError(
-                f"Missing expected columns in main DataFrame: {missing_cols}"
+                f"Missing expected columns in hp suitability df: {missing_cols}"
             )
 
         return (
             liverpool_hp_suitability_scores_pd,
-            liverpool_hp_suitability_lsoas,
+            liverpool_lsoas,
             average_hn_scores_coverage_df,
         )
     except Exception as e:
@@ -104,7 +153,7 @@ def load_data():
 
 
 def preprocess_data(
-    liverpool_hp_suitability_lsoas: list,
+    liverpool_lsoas: list,
     lsoa_shp_path: str = LSOA_SHP_PATH,
     target_crs: str = TARGET_CRS,
 ) -> gpd.GeoDataFrame:
@@ -112,7 +161,7 @@ def preprocess_data(
     Load and preprocess LSOA geospatial data by extracting unique LSOA geometries and filtering Liverpool LSOAs.
 
     Args:
-        liverpool_hp_suitability_lsoas (list): List of LSOA codes for Liverpool.
+        liverpool_lsoas (list): List of LSOA codes for Liverpool.
         lsoa_shp_path (str): Path to the LSOA shapefile.
         target_crs (str): The target CRS to ensure consistent geospatial analysis.
 
@@ -125,7 +174,7 @@ def preprocess_data(
         logging.info(f"Converting to {target_crs}")
         lsoa_geometries_gdf = lsoa_geometries_gdf.to_crs(target_crs)
     liverpool_lsoa_geometries_gdf = lsoa_geometries_gdf[
-        lsoa_geometries_gdf["LSOA21CD"].isin(liverpool_hp_suitability_lsoas)
+        lsoa_geometries_gdf["LSOA21CD"].isin(liverpool_lsoas)
     ]
 
     if liverpool_lsoa_geometries_gdf.empty:
@@ -134,9 +183,11 @@ def preprocess_data(
     return liverpool_lsoa_geometries_gdf
 
 
-def plot_lsoa_geometries(liverpool_lsoa_geometries_gdf, output_dir: str = OUTPUT_DIR):
+def plot_lsoa_geometries(
+    liverpool_lsoa_geometries_gdf: gpd.GeoDataFrame, output_dir: str = OUTPUT_DIR
+):
     """
-    Plot Liverpool LSOA geometries and save as PNG.
+    Plot Liverpool LSOA geometries and save as PNG and PDF.
 
     Args:
         liverpool_lsoa_geometries_gdf (gpd.GeoDataFrame): GeoDataFrame with Liverpool LSOA geometries.
@@ -157,7 +208,7 @@ def merge_data(
     liverpool_lsoa_geometries_gdf: gpd.GeoDataFrame,
 ) -> gpd.GeoDataFrame:
     """
-    Merge suitability scores with geometries and convert to GeoDataFrame.
+    Merge LSOA suitability scores with geometries and convert to GeoDataFrame.
 
     Args:
         liverpool_hp_suitability_scores_pd (pd.DataFrame): DataFrame with Liverpool heat pump suitability scores.
@@ -192,7 +243,7 @@ def plot_overlay(
     output_dir: str = OUTPUT_DIR,
 ):
     """
-    Overlay of DESNZ pilot heat network zones on LSOAs in Liverpool.
+    Overlay of DESNZ pilot heat network zones on LSOAs in Liverpool. Saves the plot as PNG and PDF.
 
     Args:
         liverpool_hp_suitability_gdf (gpd.GeoDataFrame): GeoDataFrame with Liverpool heat pump suitability scores and geometries.
@@ -223,7 +274,7 @@ def plot_absolute_error_map(
     output_dir: str = OUTPUT_DIR,
 ):
     """
-    Plot an absolute error map for areas in Liverpool based on the DESNZ pilot score.
+    Plot an absolute error map for areas in Liverpool based on the DESNZ pilot score. Saves the plot as PNG and PDF.
 
     Args:
         hp_suitability_gdf (gpd.GeoDataFrame): GeoDataFrame containing Liverpool's LSOA geometries and
@@ -232,6 +283,7 @@ def plot_absolute_error_map(
         score (float): DESNZ pilot score threshold.
                        If > 0.0, plot regions where DESNZ_pilot_fraction >= score.
                        If 0, plot regions where DESNZ_pilot_fraction == 0.
+                       Range from 0 to 1.
         title (str): Title for the plot.
         filename (str): Filename (PNG) to save the plot.
         output_dir (str): Output directory to save the plot image.
@@ -239,7 +291,7 @@ def plot_absolute_error_map(
     Raises:
         ValueError: If score < 0.
     """
-    logging.info(f"Plotting absolute error map for score {score}...")
+    logging.info(f"Plotting absolute error map for score threshold {score}...")
     if score < 0:
         raise ValueError("Score must be a non-negative value.")
 
@@ -285,7 +337,7 @@ def plot_hn_avg_score_vs_fraction_threshold(
     Args:
         average_hn_scores_coverage_df (pl.DataFrame): DataFrame with average Nesta HN scores and DESNZ Pilot Fraction.
         fraction_thresholds_col (str): Column name for DESNZ Pilot Fraction Threshold.
-        hn_avg_scores_col (str): Column name for HN_N Avg Score Weighted.
+        hn_avg_scores_col (str): Column name for Nesta Heat Network Avg Score Weighted.
     """
     logging.info("Plotting average HN score vs DESNZ Pilot Fraction Threshold...")
     fraction_thresholds = average_hn_scores_coverage_df[
@@ -356,12 +408,25 @@ def plot_hn_avg_score_vs_fraction_threshold(
 
 
 if __name__ == "__main__":
+    # Argument parser for reading inputs from s3
+    parser = argparse.ArgumentParser(
+        description="Process heat network zones and calculate scores."
+    )
+    parser.add_argument(
+        "--read_from_s3",
+        type=bool,
+        default=False,
+        help="Read the input files from S3 bucket.",
+    )
+    args = parser.parse_args()
+    read_from_s3 = args.read_from_s3
+
     (
         liverpool_hp_suitability_scores_pd,
-        liverpool_hp_suitability_lsoas,
+        liverpool_lsoas,
         average_hn_scores_coverage_df,
     ) = load_data()
-    liverpool_lsoa_geometries_gdf = preprocess_data(liverpool_hp_suitability_lsoas)
+    liverpool_lsoa_geometries_gdf = preprocess_data(liverpool_lsoas)
     plot_lsoa_geometries(liverpool_lsoa_geometries_gdf)
     liverpool_hp_suitability_gdf = merge_data(
         liverpool_hp_suitability_scores_pd, liverpool_lsoa_geometries_gdf