diff --git a/analysis/bootstrapping.py b/analysis/bootstrapping.py
index 0544e2d1..f042052c 100644
--- a/analysis/bootstrapping.py
+++ b/analysis/bootstrapping.py
@@ -2,10 +2,12 @@
 import copy
 import analysis.statistics
 
+
 def get_seeds(number_of_seeds):
     return np.arange(1, number_of_seeds + 1) * 1000
 
-def configure(context, stage, sample_size, parameters = {}, alias = None, ephemeral = True):
+
+def configure(context, stage, sample_size, parameters={}, alias=None, ephemeral=True):
     if alias is None:
         alias = stage
 
@@ -15,11 +17,18 @@ def configure(context, stage, sample_size, parameters = {}, alias = None, epheme
         sample_parameters = copy.copy(parameters)
         sample_parameters["random_seed"] = int(random_seed)
 
-        context.stage(stage, sample_parameters, alias = "bootstrap_%s_%d" % (alias, index), ephemeral = ephemeral)
+        context.stage(
+            stage,
+            sample_parameters,
+            alias="bootstrap_%s_%d" % (alias, index),
+            ephemeral=ephemeral,
+        )
+
 
 def get_stage(context, alias, index):
     return context.stage("bootstrap_%s_%d" % (alias, index))
 
+
 def get_stages(context, alias, sample_size):
     for index in range(sample_size):
         yield get_stage(context, alias, index)
diff --git a/analysis/chains.py b/analysis/chains.py
index 886bd765..66a809d6 100644
--- a/analysis/chains.py
+++ b/analysis/chains.py
@@ -9,22 +9,30 @@
     ("chain", "sex"),
     ("chain_length_class", "age_class"),
     ("chain_length_class", "sex"),
-    ("chain",), ("chain_length_class",),
+    ("chain",),
+    ("chain_length_class",),
     ("age_range", "sex", "chain"),
-    ("age_range", "sex", "chain_length_class")
+    ("age_range", "sex", "chain_length_class"),
 ]
 
 PURPOSE_MAPPING = {
-    "home": "h", "work": "w", "education": "e",
-    "shop": "s", "leisure": "l", "other": "o"
+    "home": "h",
+    "work": "w",
+    "education": "e",
+    "shop": "s",
+    "leisure": "l",
+    "other": "o",
 }
 
+
 def aggregate_chains(df_chains):
     current_person_id = None
     current_chain = None
 
     records = []
-    for person_id, purpose in zip(df_chains["person_id"].values, df_chains["purpose"].values):
+    for person_id, purpose in zip(
+        df_chains["person_id"].values, df_chains["purpose"].values
+    ):
         if not person_id == current_person_id:
             if not current_person_id is None:
                 records.append((current_person_id, current_chain))
@@ -36,11 +44,11 @@ def aggregate_chains(df_chains):
 
     records.append((current_person_id, current_chain))
 
-    df_chains = pd.DataFrame.from_records(records, columns = ["person_id", "chain"])
+    df_chains = pd.DataFrame.from_records(records, columns=["person_id", "chain"])
 
-    #df_chains["chain"] = df_chains["chain"].apply(lambda x: re.sub(r"w+", "w", x))
-    #df_chains["chain"] = df_chains["chain"].apply(lambda x: re.sub(r"e+", "e", x))
-    #df_chains["chain"] = df_chains["chain"].apply(lambda x: re.sub(r"h+", "h", x))
+    # df_chains["chain"] = df_chains["chain"].apply(lambda x: re.sub(r"w+", "w", x))
+    # df_chains["chain"] = df_chains["chain"].apply(lambda x: re.sub(r"e+", "e", x))
+    # df_chains["chain"] = df_chains["chain"].apply(lambda x: re.sub(r"h+", "h", x))
 
     df_chains["chain_length"] = df_chains["chain"].str.len()
 
diff --git a/analysis/debug/sc.py b/analysis/debug/sc.py
index 2f73f125..8bbeeaab 100644
--- a/analysis/debug/sc.py
+++ b/analysis/debug/sc.py
@@ -1,11 +1,13 @@
 import numpy as np
 import pandas as pd
 
+
 def configure(context):
-    context.stage("data.census.filtered", alias = "census")
-    context.stage("data.hts.selected", alias = "hts")
+    context.stage("data.census.filtered", alias="census")
+    context.stage("data.hts.selected", alias="hts")
     context.config("output_path")
 
+
 def execute(context):
     df_census = 
context.stage("census") df_hts = context.stage("hts")[1] @@ -19,14 +21,16 @@ def execute(context): f_census = df_census["socioprofessional_class"] == value f_hts = df_hts["socioprofessional_class"] == value - df_output.append({ - "value": value, - "census_count": np.count_nonzero(f_census), - "hts_count": np.count_nonzero(f_hts), - "census_weight": df_census[f_census]["weight"].sum(), - "hts_weight": df_hts[f_hts]["person_weight"].sum() - }) + df_output.append( + { + "value": value, + "census_count": np.count_nonzero(f_census), + "hts_count": np.count_nonzero(f_hts), + "census_weight": df_census[f_census]["weight"].sum(), + "hts_weight": df_hts[f_hts]["person_weight"].sum(), + } + ) pd.DataFrame.from_records(df_output).to_csv( - "{}/debug_sc.csv".format(context.config("output_path")), - sep = ";", index = False) + "{}/debug_sc.csv".format(context.config("output_path")), sep=";", index=False + ) diff --git a/analysis/grid/comparison_flow_volume.py b/analysis/grid/comparison_flow_volume.py index b2506ea1..4201d3d6 100644 --- a/analysis/grid/comparison_flow_volume.py +++ b/analysis/grid/comparison_flow_volume.py @@ -1,116 +1,268 @@ import pandas as pd import geopandas as gpd -import plotly.express as px +import plotly.express as px SAMPLING_RATE = 0.05 + def configure(context): - if not context.config("analysis_from_file",False) : + if not context.config("analysis_from_file", False): context.stage("synthesis.population.trips") context.stage("synthesis.population.spatial.locations") context.stage("synthesis.population.enriched") context.stage("data.spatial.departments") - context.config("comparison_file_prefix",None) + context.config("comparison_file_prefix", None) context.config("output_prefix", "ile_de_france_") context.config("output_formats", ["csv", "gpkg"]) context.config("output_path") context.config("data_path") -def stat_grid(df_trips,df_locations,df_persons,df_grid): - + +def stat_grid(df_trips, df_locations, df_persons, df_grid): + # Write spatial trips - df_spatial = pd.merge(df_trips, df_locations[[ - "person_id", "activity_index", "geometry" - ]].rename(columns = { - "activity_index": "following_activity_index", - }), how = "left", on = ["person_id", "following_activity_index"]) - df_spatial = pd.merge(df_spatial,df_persons,how = "left", on = ["person_id",]) - df_spatial = gpd.GeoDataFrame(df_spatial, crs = "EPSG:2154").to_crs("4326") - - df_stats = gpd.sjoin(df_grid,df_spatial,how="left") - return df_stats[['id_carr_1km', 'geometry','person_id', 'following_purpose', 'household_id', 'age']] + df_spatial = pd.merge( + df_trips, + df_locations[["person_id", "activity_index", "geometry"]].rename( + columns={ + "activity_index": "following_activity_index", + } + ), + how="left", + on=["person_id", "following_activity_index"], + ) + df_spatial = pd.merge( + df_spatial, + df_persons, + how="left", + on=[ + "person_id", + ], + ) + df_spatial = gpd.GeoDataFrame(df_spatial, crs="EPSG:2154").to_crs("4326") + + df_stats = gpd.sjoin(df_grid, df_spatial, how="left") + return df_stats[ + [ + "id_carr_1km", + "geometry", + "person_id", + "following_purpose", + "household_id", + "age", + ] + ] + + def execute(context): - + figures = { - "Yrs:0-10":{"min_age": 0, "max_age": 10,}, - "Yrs:11-14":{"min_age": 11, "max_age": 14,}, - "Yrs:15-18":{"min_age": 15, "max_age": 17,}, - "Yrs:18-25":{"min_age": 18, "max_age": 25,}, - "Yrs:25-50":{"min_age": 26, "max_age": 50,}, - "Yrs:50-65":{"min_age": 51, "max_age": 65,}, - "Yrs:65-75":{"min_age": 66, "max_age": 75,}, - "Yrs:75+":{"min_age": 76, "max_age": 
110,},} - comparison_file = context.config("output_prefix") if context.config("comparison_file_prefix") is None else context.config("comparison_file_prefix") - + "Yrs:0-10": { + "min_age": 0, + "max_age": 10, + }, + "Yrs:11-14": { + "min_age": 11, + "max_age": 14, + }, + "Yrs:15-18": { + "min_age": 15, + "max_age": 17, + }, + "Yrs:18-25": { + "min_age": 18, + "max_age": 25, + }, + "Yrs:25-50": { + "min_age": 26, + "max_age": 50, + }, + "Yrs:50-65": { + "min_age": 51, + "max_age": 65, + }, + "Yrs:65-75": { + "min_age": 66, + "max_age": 75, + }, + "Yrs:75+": { + "min_age": 76, + "max_age": 110, + }, + } + comparison_file = ( + context.config("output_prefix") + if context.config("comparison_file_prefix") is None + else context.config("comparison_file_prefix") + ) + if not context.config("analysis_from_file"): print("Récupération simu données ...") # from simulation cache df_trips = context.stage("synthesis.population.trips") - df_persons = context.stage("synthesis.population.enriched")[["person_id", "household_id","age"]] - df_locations = context.stage("synthesis.population.spatial.locations")[[ - "person_id", "activity_index", "geometry" - ]] + df_persons = context.stage("synthesis.population.enriched")[ + ["person_id", "household_id", "age"] + ] + df_locations = context.stage("synthesis.population.spatial.locations")[ + ["person_id", "activity_index", "geometry"] + ] df_trips["preceding_activity_index"] = df_trips["trip_index"] df_trips["following_activity_index"] = df_trips["trip_index"] + 1 - else : + else: # from file trips, activites and person print("Récupération données ...") - df_trips = pd.read_csv(f'{context.config("output_path")}/{context.config("output_prefix")}trips.csv',sep=';')[["person_id","trip_index" ,"following_activity_index","following_purpose"]] - df_locations = gpd.read_parquet(f'{context.config("output_path")}/{context.config("output_prefix")}activities.geoparquet') if "geoparquet" in context.config("output_formats") else gpd.read_file(f'{context.config("output_path")}/{context.config("output_prefix")}activities.gpkg') - df_persons = pd.read_csv(f'{context.config("output_path")}/{context.config("output_prefix")}persons.csv',sep=';')[["person_id", "household_id","age"]] + df_trips = pd.read_csv( + f'{context.config("output_path")}/{context.config("output_prefix")}trips.csv', + sep=";", + )[["person_id", "trip_index", "following_activity_index", "following_purpose"]] + df_locations = ( + gpd.read_parquet( + f'{context.config("output_path")}/{context.config("output_prefix")}activities.geoparquet' + ) + if "geoparquet" in context.config("output_formats") + else gpd.read_file( + f'{context.config("output_path")}/{context.config("output_prefix")}activities.gpkg' + ) + ) + df_persons = pd.read_csv( + f'{context.config("output_path")}/{context.config("output_prefix")}persons.csv', + sep=";", + )[["person_id", "household_id", "age"]] print("Récupération comp données ...") - df_trips_comp = pd.read_csv(f'{context.config("output_path")}/{comparison_file}trips.csv',sep=';')[["person_id","trip_index" ,"following_activity_index","following_purpose"]] - df_locations_comp = gpd.read_parquet(f'{context.config("output_path")}/{comparison_file}activities.geoparquet') if "geoparquet" in context.config("output_formats") else gpd.read_file(f'{context.config("output_path")}/{comparison_file}activities.gpkg') - df_persons_comp = pd.read_csv(f'{context.config("output_path")}/{comparison_file}persons.csv',sep=';')[["person_id", "household_id","age"]] - + df_trips_comp = pd.read_csv( + 
f'{context.config("output_path")}/{comparison_file}trips.csv', sep=";" + )[["person_id", "trip_index", "following_activity_index", "following_purpose"]] + df_locations_comp = ( + gpd.read_parquet( + f'{context.config("output_path")}/{comparison_file}activities.geoparquet' + ) + if "geoparquet" in context.config("output_formats") + else gpd.read_file( + f'{context.config("output_path")}/{comparison_file}activities.gpkg' + ) + ) + df_persons_comp = pd.read_csv( + f'{context.config("output_path")}/{comparison_file}persons.csv', sep=";" + )[["person_id", "household_id", "age"]] + list_purpose = list(df_trips["following_purpose"].unique()) # grid 1km of location data df_departments = context.stage("data.spatial.departments") poly_dep = df_departments.unary_union df_grids = gpd.read_file( - f'{context.config("data_path")}/grid/grille200m_metropole.gpkg', - mask=poly_dep, - ) + f'{context.config("data_path")}/grid/grille200m_metropole.gpkg', + mask=poly_dep, + ) df_grids = df_grids.to_crs("4326") - df_grid = df_grids[["id_carr_1km","geometry"]].dissolve(by="id_carr_1km").reset_index() + df_grid = ( + df_grids[["id_carr_1km", "geometry"]].dissolve(by="id_carr_1km").reset_index() + ) - df_stats = stat_grid(df_trips,df_locations,df_persons,df_grid) - df_grids = stat_grid(df_trips_comp,df_locations_comp,df_persons_comp,df_grid) - point = df_grid.unary_union.centroid # a changé avec ploy_dep + df_stats = stat_grid(df_trips, df_locations, df_persons, df_grid) + df_grids = stat_grid(df_trips_comp, df_locations_comp, df_persons_comp, df_grid) + point = df_grid.unary_union.centroid # a changé avec ploy_dep print("Printing grids...") for prefix, figure in figures.items(): - df_select_age = df_stats[df_stats["age"].between(figure["min_age"],figure["max_age"])] - df_select_age = df_select_age.dissolve(by=["id_carr_1km","following_purpose"],aggfunc="count").reset_index() + df_select_age = df_stats[ + df_stats["age"].between(figure["min_age"], figure["max_age"]) + ] + df_select_age = df_select_age.dissolve( + by=["id_carr_1km", "following_purpose"], aggfunc="count" + ).reset_index() df_select_age = df_select_age[~(df_select_age["geometry"].isna())] - df_select_age["following_purpose"] = df_select_age["following_purpose"].astype('str') + df_select_age["following_purpose"] = df_select_age["following_purpose"].astype( + "str" + ) - df_grids_age = df_grids[df_grids["age"].between(figure["min_age"],figure["max_age"])] - df_grids_age = df_grids_age.dissolve(by=["id_carr_1km","following_purpose"],aggfunc="count").reset_index() + df_grids_age = df_grids[ + df_grids["age"].between(figure["min_age"], figure["max_age"]) + ] + df_grids_age = df_grids_age.dissolve( + by=["id_carr_1km", "following_purpose"], aggfunc="count" + ).reset_index() df_grids_age = df_grids_age[~(df_grids_age["geometry"].isna())] - df_grids_age["following_purpose"] = df_grids_age["following_purpose"].astype('str') - - for purpose in list_purpose : - df_select = df_select_age[df_select_age["following_purpose"]==purpose].rename(columns={"person_id":"count"}) - df_grids_select = df_grids_age[df_grids_age["following_purpose"]==purpose].rename(columns={"person_id":"count"}) - if context.config("output_prefix") == comparison_file : - df_select = gpd.sjoin(df_select,df_grid,how='right',predicate="contains").fillna(0) - df_select = df_select[df_select["count"] != 0] - fig = px.choropleth_mapbox(df_select,geojson=df_select.geometry,locations=df_select.index,color="count", opacity= 0.7,color_continuous_scale='reds', - mapbox_style = 
'open-street-map',center=dict(lat= point.y,lon=point.x),title=f"Localisation flow distribution for {prefix} group with {purpose} purpose") - fig.write_html(f'{context.config("output_path")}/{context.config("output_prefix")}{prefix}_{purpose}.html') - else : - df_grids_select = gpd.sjoin(df_grids_select,df_grid,how='right',predicate="contains").fillna(0) - df_select = gpd.sjoin(df_select,df_grids_select.drop(columns=[ 'index_left']),how='right',predicate="contains").rename(columns={"count_left":"volume_studied_simu","count_right":"volume_compared_simu"}).fillna(0) - df_select["volume_difference"] = df_select["volume_studied_simu"] - df_select["volume_compared_simu"] - df_select = df_select[(df_select["volume_studied_simu"] != 0 )| (df_select["volume_compared_simu"] != 0)] - df_select["pourcentage_vol"] = df_select["volume_difference"] / df_select["volume_compared_simu"] - px.choropleth_mapbox(df_select,geojson=df_select.geometry,locations=df_select.index,color="volume_difference", opacity= 0.7,color_continuous_scale="picnic", color_continuous_midpoint= 0,hover_name="id_carr_1km_right", hover_data=["volume_studied_simu", "volume_compared_simu","pourcentage_vol"], - mapbox_style = 'open-street-map',center=dict(lat= point.y,lon=point.x),title=f"Comparison flow distribution with previous simulation for {prefix} group with {purpose} purpose").write_html(f'{context.config("output_path")}/{context.config("output_prefix")}{prefix}_{purpose}.html') - - \ No newline at end of file + df_grids_age["following_purpose"] = df_grids_age["following_purpose"].astype( + "str" + ) + + for purpose in list_purpose: + df_select = df_select_age[ + df_select_age["following_purpose"] == purpose + ].rename(columns={"person_id": "count"}) + df_grids_select = df_grids_age[ + df_grids_age["following_purpose"] == purpose + ].rename(columns={"person_id": "count"}) + if context.config("output_prefix") == comparison_file: + df_select = gpd.sjoin( + df_select, df_grid, how="right", predicate="contains" + ).fillna(0) + df_select = df_select[df_select["count"] != 0] + fig = px.choropleth_mapbox( + df_select, + geojson=df_select.geometry, + locations=df_select.index, + color="count", + opacity=0.7, + color_continuous_scale="reds", + mapbox_style="open-street-map", + center=dict(lat=point.y, lon=point.x), + title=f"Localisation flow distribution for {prefix} group with {purpose} purpose", + ) + fig.write_html( + f'{context.config("output_path")}/{context.config("output_prefix")}{prefix}_{purpose}.html' + ) + else: + df_grids_select = gpd.sjoin( + df_grids_select, df_grid, how="right", predicate="contains" + ).fillna(0) + df_select = ( + gpd.sjoin( + df_select, + df_grids_select.drop(columns=["index_left"]), + how="right", + predicate="contains", + ) + .rename( + columns={ + "count_left": "volume_studied_simu", + "count_right": "volume_compared_simu", + } + ) + .fillna(0) + ) + df_select["volume_difference"] = ( + df_select["volume_studied_simu"] - df_select["volume_compared_simu"] + ) + df_select = df_select[ + (df_select["volume_studied_simu"] != 0) + | (df_select["volume_compared_simu"] != 0) + ] + df_select["pourcentage_vol"] = ( + df_select["volume_difference"] / df_select["volume_compared_simu"] + ) + px.choropleth_mapbox( + df_select, + geojson=df_select.geometry, + locations=df_select.index, + color="volume_difference", + opacity=0.7, + color_continuous_scale="picnic", + color_continuous_midpoint=0, + hover_name="id_carr_1km_right", + hover_data=[ + "volume_studied_simu", + "volume_compared_simu", + "pourcentage_vol", + 
], + mapbox_style="open-street-map", + center=dict(lat=point.y, lon=point.x), + title=f"Comparison flow distribution with previous simulation for {prefix} group with {purpose} purpose", + ).write_html( + f'{context.config("output_path")}/{context.config("output_prefix")}{prefix}_{purpose}.html' + ) diff --git a/analysis/marginals.py b/analysis/marginals.py index 98baf359..7e78720f 100644 --- a/analysis/marginals.py +++ b/analysis/marginals.py @@ -20,25 +20,44 @@ CENSUS_PERSON_MARGINALS = GENERAL_PERSON_MARGINALS + [("socioprofessional_class",)] CENSUS_HOUSEHOLD_MARGINALS = GENERAL_HOUSEHOLD_MARGINALS -HTS_PERSON_MARGINALS = GENERAL_PERSON_MARGINALS + [("has_license",), ("has_pt_subscription",)] +HTS_PERSON_MARGINALS = GENERAL_PERSON_MARGINALS + [ + ("has_license",), + ("has_pt_subscription",), +] HTS_HOUSEHOLD_MARGINALS = GENERAL_HOUSEHOLD_MARGINALS + [("number_of_bikes_class",)] SOCIOPROFESIONAL_CLASS_LABELS = [ - "???", "Agriculture", "Independent", "Science", "Intermediate", "Employee", "Worker", "Retired", "Other" + "???", + "Agriculture", + "Independent", + "Science", + "Intermediate", + "Employee", + "Worker", + "Retired", + "Other", ] + def prepare_classes(df): if "age" in df: - df["age_class"] = np.digitize(df["age"], AGE_CLASS_BOUNDS, right = True) + df["age_class"] = np.digitize(df["age"], AGE_CLASS_BOUNDS, right=True) if "household_size" in df: - df["household_size_class"] = np.digitize(df["household_size"], HOUSEHOLD_SIZE_BOUNDS, right = True) + df["household_size_class"] = np.digitize( + df["household_size"], HOUSEHOLD_SIZE_BOUNDS, right=True + ) if "number_of_vehicles" in df: - df["number_of_vehicles_class"] = np.digitize(df["number_of_vehicles"], NUMBER_OF_VEHICLES_BOUNDS, right = True) + df["number_of_vehicles_class"] = np.digitize( + df["number_of_vehicles"], NUMBER_OF_VEHICLES_BOUNDS, right=True + ) if "number_of_bikes" in df: - df["number_of_bikes_class"] = np.digitize(df["number_of_bikes"], NUMBER_OF_BIKES_BOUNDS, right = True) + df["number_of_bikes_class"] = np.digitize( + df["number_of_bikes"], NUMBER_OF_BIKES_BOUNDS, right=True + ) + def cross(*marginals): result = [] @@ -56,6 +75,7 @@ def cross(*marginals): return list(set(result)) + def combine(*marginals): result = [] @@ -64,21 +84,22 @@ def combine(*marginals): return list(set(result)) + ALL_PERSON_MARGINALS = combine(CENSUS_PERSON_MARGINALS, HTS_PERSON_MARGINALS) ALL_HOUSEHOLD_MARGINALS = combine(CENSUS_HOUSEHOLD_MARGINALS, HTS_HOUSEHOLD_MARGINALS) SPATIAL_MARGINALS = [("departement_id",), ("commune_id",)] ANALYSIS_PERSON_MARGINALS = combine( - ALL_PERSON_MARGINALS, ALL_HOUSEHOLD_MARGINALS, + ALL_PERSON_MARGINALS, + ALL_HOUSEHOLD_MARGINALS, cross(ALL_PERSON_MARGINALS, ALL_PERSON_MARGINALS), cross(ALL_HOUSEHOLD_MARGINALS, ALL_HOUSEHOLD_MARGINALS), - cross(ALL_PERSON_MARGINALS, ALL_HOUSEHOLD_MARGINALS) + cross(ALL_PERSON_MARGINALS, ALL_HOUSEHOLD_MARGINALS), ) ANALYSIS_HOUSEHOLD_MARGINALS = combine( - ALL_HOUSEHOLD_MARGINALS, - cross(ALL_HOUSEHOLD_MARGINALS, ALL_HOUSEHOLD_MARGINALS) + ALL_HOUSEHOLD_MARGINALS, cross(ALL_HOUSEHOLD_MARGINALS, ALL_HOUSEHOLD_MARGINALS) ) SPATIAL_PERSON_MARGINALS = combine( diff --git a/analysis/methods/income/compare_methods.py b/analysis/methods/income/compare_methods.py index d8573521..f4f545cd 100644 --- a/analysis/methods/income/compare_methods.py +++ b/analysis/methods/income/compare_methods.py @@ -29,10 +29,17 @@ def execute(context): df_population = add_household_type_attribute(df_population) # get most populated commune - commune_id = df_population.groupby(["commune_id"], 
observed=True)["commune_id"].count().drop("undefined").idxmax() + commune_id = ( + df_population.groupby(["commune_id"], observed=True)["commune_id"] + .count() + .drop("undefined") + .idxmax() + ) # get income distributions by attributes - income_df = context.stage("data.income.municipality").query(f"commune_id == '{commune_id}'") + income_df = context.stage("data.income.municipality").query( + f"commune_id == '{commune_id}'" + ) income_df = income_df.rename( columns={ "value": "modality", @@ -48,22 +55,22 @@ def execute(context): } ) - households_with_attributes = df_population[[ - "household_id", "commune_id", "size", "family_comp" - ]].drop_duplicates("household_id") + households_with_attributes = df_population[ + ["household_id", "commune_id", "size", "family_comp"] + ].drop_duplicates("household_id") # get enriched population with different methods uniform_pop_df = context.stage("uniform") uniform_pop_df = uniform_pop_df.merge(households_with_attributes, on="household_id") uniform_pop_df["household_income"] = ( - uniform_pop_df["household_income"] * 12 / uniform_pop_df["consumption_units"] + uniform_pop_df["household_income"] * 12 / uniform_pop_df["consumption_units"] ) uniform_pop_df = uniform_pop_df.query(f"commune_id == '{commune_id}'") bhepop2_pop_df = context.stage("bhepop2") bhepop2_pop_df = bhepop2_pop_df.merge(households_with_attributes, on="household_id") bhepop2_pop_df["household_income"] = ( - bhepop2_pop_df["household_income"] * 12 / bhepop2_pop_df["consumption_units"] + bhepop2_pop_df["household_income"] * 12 / bhepop2_pop_df["consumption_units"] ) bhepop2_pop_df = bhepop2_pop_df.query(f"commune_id == '{commune_id}'") @@ -76,28 +83,29 @@ def execute(context): ["size", "family_comp"], 0, relative_maximum=MAXIMUM_INCOME_FACTOR, - delta_min=1000 + delta_min=1000, ) # check output folder existence - compare_output_path = os.path.join(context.config("output_path"), COMPARE_INCOME_FOLDER) + compare_output_path = os.path.join( + context.config("output_path"), COMPARE_INCOME_FOLDER + ) if not os.path.exists(compare_output_path): os.mkdir(compare_output_path) # create an analysis instance analysis = marginal_distributions_source.compare_with_populations( - { - "Uniform": uniform_pop_df, - "Bhepop2": bhepop2_pop_df - }, + {"Uniform": uniform_pop_df, "Bhepop2": bhepop2_pop_df}, feature_name="household_income", - output_folder=compare_output_path + output_folder=compare_output_path, + ) + analysis.plot_title_format = ( + analysis.plot_title_format + f" \n(commune={commune_id})" ) - analysis.plot_title_format = analysis.plot_title_format + f" \n(commune={commune_id})" analysis.generate_analysis_plots() analysis.generate_analysis_error_table() - print(f"Generated compared analysis of income assignation methods in {compare_output_path}") - - + print( + f"Generated compared analysis of income assignation methods in {compare_output_path}" + ) diff --git a/analysis/reference/census/sociodemographics.py b/analysis/reference/census/sociodemographics.py index 47c6204d..203c8d4f 100644 --- a/analysis/reference/census/sociodemographics.py +++ b/analysis/reference/census/sociodemographics.py @@ -1,34 +1,39 @@ import analysis.statistics as stats import analysis.marginals as marginals + def configure(context): context.stage("data.census.filtered") + def execute(context): person_marginals = marginals.combine( marginals.TOTAL_MARGINAL, - marginals.CENSUS_PERSON_MARGINALS, marginals.CENSUS_HOUSEHOLD_MARGINALS, - - marginals.cross(marginals.CENSUS_PERSON_MARGINALS, marginals.CENSUS_PERSON_MARGINALS), 
- marginals.cross(marginals.CENSUS_HOUSEHOLD_MARGINALS, marginals.CENSUS_HOUSEHOLD_MARGINALS), - - marginals.cross(marginals.CENSUS_PERSON_MARGINALS, marginals.CENSUS_HOUSEHOLD_MARGINALS), - + marginals.cross( + marginals.CENSUS_PERSON_MARGINALS, marginals.CENSUS_PERSON_MARGINALS + ), + marginals.cross( + marginals.CENSUS_HOUSEHOLD_MARGINALS, marginals.CENSUS_HOUSEHOLD_MARGINALS + ), + marginals.cross( + marginals.CENSUS_PERSON_MARGINALS, marginals.CENSUS_HOUSEHOLD_MARGINALS + ), marginals.SPATIAL_MARGINALS, - marginals.cross(marginals.SPATIAL_MARGINALS, marginals.CENSUS_PERSON_MARGINALS) + marginals.cross(marginals.SPATIAL_MARGINALS, marginals.CENSUS_PERSON_MARGINALS), ) household_marginals = marginals.combine( marginals.TOTAL_MARGINAL, - marginals.CENSUS_HOUSEHOLD_MARGINALS, - - marginals.cross(marginals.CENSUS_HOUSEHOLD_MARGINALS, marginals.CENSUS_HOUSEHOLD_MARGINALS), - + marginals.cross( + marginals.CENSUS_HOUSEHOLD_MARGINALS, marginals.CENSUS_HOUSEHOLD_MARGINALS + ), marginals.SPATIAL_MARGINALS, - marginals.cross(marginals.SPATIAL_MARGINALS, marginals.CENSUS_HOUSEHOLD_MARGINALS) + marginals.cross( + marginals.SPATIAL_MARGINALS, marginals.CENSUS_HOUSEHOLD_MARGINALS + ), ) df_persons = context.stage("data.census.filtered") @@ -37,6 +42,6 @@ def execute(context): df_households = df_persons.drop_duplicates("household_id").copy() return dict( - person = stats.marginalize(df_persons, person_marginals), - household = stats.marginalize(df_households, household_marginals) + person=stats.marginalize(df_persons, person_marginals), + household=stats.marginalize(df_households, household_marginals), ) diff --git a/analysis/reference/hts/activities.py b/analysis/reference/hts/activities.py index f1268709..b098e95a 100644 --- a/analysis/reference/hts/activities.py +++ b/analysis/reference/hts/activities.py @@ -1,14 +1,21 @@ import pandas as pd import numpy as np + def configure(context): - context.stage("data.hts.selected", alias = "hts") + context.stage("data.hts.selected", alias="hts") + PURPOSE_MAPPING = { - "home": "h", "work": "w", "education": "e", - "shop": "s", "leisure": "l", "other": "o" + "home": "h", + "work": "w", + "education": "e", + "shop": "s", + "leisure": "l", + "other": "o", } + def execute(context): df_households, df_persons, df_activities = context.stage("hts") @@ -36,13 +43,37 @@ def execute(context): df_last["is_first"] = False df_last["is_last"] = True - df_activities = pd.concat([ - df_activities[["person_id", "activity_id", "purpose", "start_time", "end_time", "is_first", "is_last"]], - df_last[["person_id", "activity_id", "purpose", "start_time", "end_time", "is_first", "is_last"]] - ]).sort_values(by = ["person_id", "activity_id"]) + df_activities = pd.concat( + [ + df_activities[ + [ + "person_id", + "activity_id", + "purpose", + "start_time", + "end_time", + "is_first", + "is_last", + ] + ], + df_last[ + [ + "person_id", + "activity_id", + "purpose", + "start_time", + "end_time", + "is_first", + "is_last", + ] + ], + ] + ).sort_values(by=["person_id", "activity_id"]) # Add activities for people without trips - df_missing = df_persons[~df_persons["person_id"].isin(df_activities["person_id"])][["person_id"]] + df_missing = df_persons[~df_persons["person_id"].isin(df_activities["person_id"])][ + ["person_id"] + ] df_missing["activity_id"] = 0 df_missing["purpose"] = "home" diff --git a/analysis/reference/hts/chains.py b/analysis/reference/hts/chains.py index 9f7cd0dc..b10730d5 100644 --- a/analysis/reference/hts/chains.py +++ b/analysis/reference/hts/chains.py @@ -5,29 
+5,47 @@ import analysis.statistics as stats import analysis.marginals as marginals -from analysis.chains import aggregate_chains, CHAIN_MARGINALS, CHAIN_LENGTH_LIMIT, CHAIN_TOP_K +from analysis.chains import ( + aggregate_chains, + CHAIN_MARGINALS, + CHAIN_LENGTH_LIMIT, + CHAIN_TOP_K, +) + def configure(context): context.stage("analysis.reference.hts.activities") - context.stage("data.hts.selected", alias = "hts") + context.stage("data.hts.selected", alias="hts") + def execute(context): - df_chains = context.stage("analysis.reference.hts.activities")[[ - "person_id", "activity_id", "purpose" - ]].sort_values(by = ["person_id", "activity_id"]) + df_chains = context.stage("analysis.reference.hts.activities")[ + ["person_id", "activity_id", "purpose"] + ].sort_values(by=["person_id", "activity_id"]) df_chains = aggregate_chains(df_chains) df_population = context.stage("hts")[1] marginals.prepare_classes(df_population) - df_chains = pd.merge(df_population[["person_id", "age_class", "sex", "person_weight", "age"]], df_chains, on = "person_id") - df_chains["chain_length_class"] = np.minimum(df_chains["chain_length"], CHAIN_LENGTH_LIMIT) - - top_k_chains = df_chains.groupby("chain")["person_weight"].sum().reset_index().sort_values( - by = "person_weight", ascending = False - ).head(CHAIN_TOP_K)["chain"].values + df_chains = pd.merge( + df_population[["person_id", "age_class", "sex", "person_weight", "age"]], + df_chains, + on="person_id", + ) + df_chains["chain_length_class"] = np.minimum( + df_chains["chain_length"], CHAIN_LENGTH_LIMIT + ) + + top_k_chains = ( + df_chains.groupby("chain")["person_weight"] + .sum() + .reset_index() + .sort_values(by="person_weight", ascending=False) + .head(CHAIN_TOP_K)["chain"] + .values + ) df_chains = df_chains[df_chains["chain"].isin(top_k_chains)] df_chains["age_range"] = (df_chains["age"] >= 18) & (df_chains["age"] <= 40) - return stats.marginalize(df_chains, CHAIN_MARGINALS, weight_column = "person_weight") + return stats.marginalize(df_chains, CHAIN_MARGINALS, weight_column="person_weight") diff --git a/analysis/reference/hts/commute_distance.py b/analysis/reference/hts/commute_distance.py index 70cd8931..72897de6 100644 --- a/analysis/reference/hts/commute_distance.py +++ b/analysis/reference/hts/commute_distance.py @@ -5,33 +5,49 @@ import analysis.statistics as stats import analysis.marginals as marginals + def configure(context): - context.stage("data.hts.selected", alias = "hts") + context.stage("data.hts.selected", alias="hts") + def execute(context): - df_weight = context.stage("hts")[1][["person_id", "person_weight"]].rename(columns = { "person_weight": "weight" }) - df_trips = pd.merge(context.stage("hts")[2], df_weight, on = "person_id") + df_weight = context.stage("hts")[1][["person_id", "person_weight"]].rename( + columns={"person_weight": "weight"} + ) + df_trips = pd.merge(context.stage("hts")[2], df_weight, on="person_id") # Prepare data frames df_work = df_trips[ - ((df_trips["preceding_purpose"] == "home") & (df_trips["following_purpose"] == "work")) | - ((df_trips["preceding_purpose"] == "work") & (df_trips["following_purpose"] == "home")) - ].drop_duplicates("person_id", keep = "first")[["euclidean_distance", "weight"]] + ( + (df_trips["preceding_purpose"] == "home") + & (df_trips["following_purpose"] == "work") + ) + | ( + (df_trips["preceding_purpose"] == "work") + & (df_trips["following_purpose"] == "home") + ) + ].drop_duplicates("person_id", keep="first")[["euclidean_distance", "weight"]] df_education = df_trips[ - 
((df_trips["preceding_purpose"] == "home") & (df_trips["following_purpose"] == "education")) | - ((df_trips["preceding_purpose"] == "education") & (df_trips["following_purpose"] == "home")) - ].drop_duplicates("person_id", keep = "first")[["euclidean_distance", "weight"]] + ( + (df_trips["preceding_purpose"] == "home") + & (df_trips["following_purpose"] == "education") + ) + | ( + (df_trips["preceding_purpose"] == "education") + & (df_trips["following_purpose"] == "home") + ) + ].drop_duplicates("person_id", keep="first")[["euclidean_distance", "weight"]] # Prepare distributions - df_work = df_work.sort_values(by = "euclidean_distance") + df_work = df_work.sort_values(by="euclidean_distance") df_work["cdf"] = np.cumsum(df_work["weight"]) df_work["cdf"] /= df_work["cdf"].max() df_work = df_work[["euclidean_distance", "cdf"]] - df_education = df_education.sort_values(by = "euclidean_distance") + df_education = df_education.sort_values(by="euclidean_distance") df_education["cdf"] = np.cumsum(df_education["weight"]) df_education["cdf"] /= df_education["cdf"].max() df_education = df_education[["euclidean_distance", "cdf"]] - return dict(work = df_work, education = df_education) + return dict(work=df_work, education=df_education) diff --git a/analysis/reference/hts/commute_flow.py b/analysis/reference/hts/commute_flow.py index 5a922409..6cf2722f 100644 --- a/analysis/reference/hts/commute_flow.py +++ b/analysis/reference/hts/commute_flow.py @@ -5,64 +5,115 @@ import analysis.statistics as stats import analysis.marginals as marginals + def configure(context): - context.stage("data.hts.selected", alias = "hts") + context.stage("data.hts.selected", alias="hts") -def execute(context): - df_weight = context.stage("hts")[1][["person_id", "person_weight"]].rename(columns = { "person_weight": "weight" }) - df_trips = context.stage("hts")[2][[ - "person_id", "origin_departement_id", "destination_departement_id", - "preceding_purpose", "following_purpose" - ]] +def execute(context): + df_weight = context.stage("hts")[1][["person_id", "person_weight"]].rename( + columns={"person_weight": "weight"} + ) + + df_trips = context.stage("hts")[2][ + [ + "person_id", + "origin_departement_id", + "destination_departement_id", + "preceding_purpose", + "following_purpose", + ] + ] # Prepare homes - df_homes = df_trips[df_trips["preceding_purpose"] == "home"][["person_id", "origin_departement_id"]].rename( - columns = { "origin_departement_id": "home" } - ).drop_duplicates("person_id") + df_homes = ( + df_trips[df_trips["preceding_purpose"] == "home"][ + ["person_id", "origin_departement_id"] + ] + .rename(columns={"origin_departement_id": "home"}) + .drop_duplicates("person_id") + ) # Calculate work - df_work = df_trips[df_trips["following_purpose"] == "work"][["person_id", "destination_departement_id"]].rename( - columns = { "destination_departement_id": "work" } - ).drop_duplicates("person_id") + df_work = ( + df_trips[df_trips["following_purpose"] == "work"][ + ["person_id", "destination_departement_id"] + ] + .rename(columns={"destination_departement_id": "work"}) + .drop_duplicates("person_id") + ) - df_work = pd.merge(df_homes, df_work, on = "person_id") - df_work = pd.merge(df_work, df_weight, on = "person_id", how = "left") + df_work = pd.merge(df_homes, df_work, on="person_id") + df_work = pd.merge(df_work, df_weight, on="person_id", how="left") df_work = df_work.groupby(["home", "work"])["weight"].sum() df_work = df_work.reset_index() # Calculate education - df_education = 
df_trips[df_trips["following_purpose"] == "education"][["person_id", "destination_departement_id"]].rename( - columns = { "destination_departement_id": "education" } - ).drop_duplicates("person_id") + df_education = ( + df_trips[df_trips["following_purpose"] == "education"][ + ["person_id", "destination_departement_id"] + ] + .rename(columns={"destination_departement_id": "education"}) + .drop_duplicates("person_id") + ) - df_education = pd.merge(df_homes, df_education, on = "person_id") - df_education = pd.merge(df_education, df_weight, on = "person_id", how = "left") + df_education = pd.merge(df_homes, df_education, on="person_id") + df_education = pd.merge(df_education, df_weight, on="person_id", how="left") df_education = df_education.groupby(["home", "education"])["weight"].sum() df_education = df_education.reset_index() # Calculate corrections for employed non-movers - df_existing = context.stage("hts")[1][["employed", "departement_id", "person_weight"]].rename(columns = { "person_weight": "weight", "departement_id": "home" }) + df_existing = context.stage("hts")[1][ + ["employed", "departement_id", "person_weight"] + ].rename(columns={"person_weight": "weight", "departement_id": "home"}) df_existing = df_existing[df_existing["employed"]] - df_existing = df_existing.groupby("home")["weight"].sum().reset_index().rename(columns = { "weight": "existing" }) - - df_origin = df_work.groupby("home")["weight"].sum().reset_index().rename(columns = { "weight": "active" }) - - df_work_correction = pd.merge(df_existing, df_origin, on = "home") - df_work_correction["factor"] = df_work_correction["active"] / df_work_correction["existing"] + df_existing = ( + df_existing.groupby("home")["weight"] + .sum() + .reset_index() + .rename(columns={"weight": "existing"}) + ) + + df_origin = ( + df_work.groupby("home")["weight"] + .sum() + .reset_index() + .rename(columns={"weight": "active"}) + ) + + df_work_correction = pd.merge(df_existing, df_origin, on="home") + df_work_correction["factor"] = ( + df_work_correction["active"] / df_work_correction["existing"] + ) df_work_correction = df_work_correction[["home", "factor"]] # Calculate corrections for studying non-movers - df_existing = context.stage("hts")[1][["studies", "departement_id", "person_weight"]].rename(columns = { "person_weight": "weight", "departement_id": "home" }) + df_existing = context.stage("hts")[1][ + ["studies", "departement_id", "person_weight"] + ].rename(columns={"person_weight": "weight", "departement_id": "home"}) df_existing = df_existing[df_existing["studies"]] - df_existing = df_existing.groupby("home")["weight"].sum().reset_index().rename(columns = { "weight": "existing" }) - - df_origin = df_education.groupby("home")["weight"].sum().reset_index().rename(columns = { "weight": "active" }) - - df_education_correction = pd.merge(df_existing, df_origin, on = "home") - df_education_correction["factor"] = df_education_correction["active"] / df_education_correction["existing"] + df_existing = ( + df_existing.groupby("home")["weight"] + .sum() + .reset_index() + .rename(columns={"weight": "existing"}) + ) + + df_origin = ( + df_education.groupby("home")["weight"] + .sum() + .reset_index() + .rename(columns={"weight": "active"}) + ) + + df_education_correction = pd.merge(df_existing, df_origin, on="home") + df_education_correction["factor"] = ( + df_education_correction["active"] / df_education_correction["existing"] + ) df_education_correction = df_education_correction[["home", "factor"]] - return dict(work = df_work, education = 
df_education), dict(work = df_work_correction, education = df_education_correction) + return dict(work=df_work, education=df_education), dict( + work=df_work_correction, education=df_education_correction + ) diff --git a/analysis/reference/hts/mode_distances.py b/analysis/reference/hts/mode_distances.py index 6b556bd5..f347a72c 100644 --- a/analysis/reference/hts/mode_distances.py +++ b/analysis/reference/hts/mode_distances.py @@ -1,9 +1,11 @@ import pandas as pd import numpy as np + def configure(context): context.stage("data.hts.selected") + def execute(context): df_households, df_persons, df_trips = context.stage("data.hts.selected") df = pd.merge(df_trips, df_persons[["person_id", "person_weight"]]) @@ -12,11 +14,13 @@ def execute(context): df["travel_time"] = df["arrival_time"] - df["departure_time"] primary_activities = ["home", "work", "education"] - #primary_activities = [] - df = df[~( - df["preceding_purpose"].isin(primary_activities) & - df["following_purpose"].isin(primary_activities) - )] + # primary_activities = [] + df = df[ + ~( + df["preceding_purpose"].isin(primary_activities) + & df["following_purpose"].isin(primary_activities) + ) + ] data = dict() @@ -32,6 +36,6 @@ def execute(context): cdf = np.cumsum(weights[sorter]) cdf /= cdf[-1] - data[mode] = dict(values = values, cdf = cdf) + data[mode] = dict(values=values, cdf=cdf) return data diff --git a/analysis/reference/hts/sociodemographics.py b/analysis/reference/hts/sociodemographics.py index d6acb58f..ad64a5d9 100644 --- a/analysis/reference/hts/sociodemographics.py +++ b/analysis/reference/hts/sociodemographics.py @@ -2,8 +2,10 @@ import analysis.marginals as marginals import pandas as pd + def configure(context): - context.stage("data.hts.selected", alias = "hts") + context.stage("data.hts.selected", alias="hts") + def execute(context): df_households, df_persons, _ = context.stage("hts") @@ -13,7 +15,7 @@ def execute(context): household_columns -= person_columns household_columns.add("household_id") - df = pd.merge(df_persons, df_households[household_columns], on = "household_id") + df = pd.merge(df_persons, df_households[household_columns], on="household_id") assert len(df_persons) == len(df) df_persons = df @@ -21,36 +23,36 @@ def execute(context): person_marginals = marginals.combine( marginals.TOTAL_MARGINAL, - marginals.HTS_PERSON_MARGINALS, marginals.HTS_HOUSEHOLD_MARGINALS, - marginals.cross(marginals.HTS_PERSON_MARGINALS, marginals.HTS_PERSON_MARGINALS), - marginals.cross(marginals.HTS_HOUSEHOLD_MARGINALS, marginals.HTS_HOUSEHOLD_MARGINALS), - - marginals.cross(marginals.HTS_PERSON_MARGINALS, marginals.HTS_HOUSEHOLD_MARGINALS), - + marginals.cross( + marginals.HTS_HOUSEHOLD_MARGINALS, marginals.HTS_HOUSEHOLD_MARGINALS + ), + marginals.cross( + marginals.HTS_PERSON_MARGINALS, marginals.HTS_HOUSEHOLD_MARGINALS + ), spatial_marginals, - marginals.cross(spatial_marginals, marginals.HTS_PERSON_MARGINALS) + marginals.cross(spatial_marginals, marginals.HTS_PERSON_MARGINALS), ) household_marginals = marginals.combine( marginals.TOTAL_MARGINAL, - marginals.HTS_HOUSEHOLD_MARGINALS, - marginals.cross(marginals.HTS_HOUSEHOLD_MARGINALS, marginals.HTS_HOUSEHOLD_MARGINALS), - + marginals.cross( + marginals.HTS_HOUSEHOLD_MARGINALS, marginals.HTS_HOUSEHOLD_MARGINALS + ), spatial_marginals, - marginals.cross(spatial_marginals, marginals.HTS_HOUSEHOLD_MARGINALS) + marginals.cross(spatial_marginals, marginals.HTS_HOUSEHOLD_MARGINALS), ) marginals.prepare_classes(df_persons) df_households = 
df_persons.drop_duplicates("household_id").copy() - df_persons = df_persons.rename(columns = { "person_weight": "weight" }) - df_households = df_households.rename(columns = { "household_weight": "weight" }) + df_persons = df_persons.rename(columns={"person_weight": "weight"}) + df_households = df_households.rename(columns={"household_weight": "weight"}) return dict( - person = stats.marginalize(df_persons, person_marginals), - household = stats.marginalize(df_households, household_marginals) + person=stats.marginalize(df_persons, person_marginals), + household=stats.marginalize(df_households, household_marginals), ) diff --git a/analysis/reference/income.py b/analysis/reference/income.py index 5b4e068d..c75e184a 100644 --- a/analysis/reference/income.py +++ b/analysis/reference/income.py @@ -4,11 +4,13 @@ import data.hts.egt.cleaned import data.hts.entd.cleaned + def configure(context): context.stage("data.hts.entd.cleaned") context.stage("data.hts.egt.cleaned") context.stage("data.income.region") + def calculate_cdf(df): weights = df["household_weight"].values incomes = df["income"].values @@ -16,33 +18,53 @@ def calculate_cdf(df): sorter = np.argsort(incomes) cdf = np.cumsum(weights[sorter]) / np.sum(weights) - return dict(income = incomes[sorter], cdf = cdf) + return dict(income=incomes[sorter], cdf=cdf) + def execute(context): # Calculate ENTD income distribution - df_entd = context.stage("data.hts.entd.cleaned")[0][["household_weight", "income_class", "consumption_units"]].copy() + df_entd = context.stage("data.hts.entd.cleaned")[0][ + ["household_weight", "income_class", "consumption_units"] + ].copy() entd_upper_bounds = data.hts.entd.cleaned.INCOME_CLASS_BOUNDS entd_lower_bounds = [0] + entd_upper_bounds[:-1] - df_entd["income"] = 12 * 0.5 * df_entd["income_class"].apply(lambda k: entd_lower_bounds[k] + entd_upper_bounds[k] if k >= 0 else np.nan) + df_entd["income"] = ( + 12 + * 0.5 + * df_entd["income_class"].apply( + lambda k: entd_lower_bounds[k] + entd_upper_bounds[k] if k >= 0 else np.nan + ) + ) df_entd = pd.DataFrame(calculate_cdf(df_entd)) df_entd["source"] = "entd" # Calculate EGT income distribution - df_egt = context.stage("data.hts.egt.cleaned")[0][["household_weight", "income_class", "consumption_units"]].copy() + df_egt = context.stage("data.hts.egt.cleaned")[0][ + ["household_weight", "income_class", "consumption_units"] + ].copy() egt_upper_bounds = data.hts.egt.cleaned.INCOME_CLASS_BOUNDS egt_lower_bounds = [0] + egt_upper_bounds[:-1] - df_egt["income"] = 12 * 0.5 * df_egt["income_class"].apply(lambda k: egt_lower_bounds[k] + egt_upper_bounds[k] if k >= 0 else np.nan) + df_egt["income"] = ( + 12 + * 0.5 + * df_egt["income_class"].apply( + lambda k: egt_lower_bounds[k] + egt_upper_bounds[k] if k >= 0 else np.nan + ) + ) df_egt["income"] /= df_egt["consumption_units"] df_egt = pd.DataFrame(calculate_cdf(df_egt)) df_egt["source"] = "egt" # Calcultae FiLo income distribution df_filo = context.stage("data.income.region") - df_filo = pd.DataFrame(dict( - income = np.array([0.0] + df_filo.tolist()), cdf = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]) - )) + df_filo = pd.DataFrame( + dict( + income=np.array([0.0] + df_filo.tolist()), + cdf=np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]), + ) + ) df_filo["source"] = "filo" return pd.concat([df_entd, df_egt, df_filo]) diff --git a/analysis/reference/od/commute_distance.py b/analysis/reference/od/commute_distance.py index 14a04eae..9bd3b13f 100644 --- a/analysis/reference/od/commute_distance.py +++ 
b/analysis/reference/od/commute_distance.py @@ -1,19 +1,21 @@ import pandas as pd import numpy as np + def configure(context): context.stage("data.od.cleaned") context.stage("data.spatial.centroid_distances") + def execute(context): df_distances = context.stage("data.spatial.centroid_distances") result = {} for df_data, name in zip(context.stage("data.od.cleaned"), ("work", "education")): - df_data = pd.merge(df_data, df_distances, on = ["origin_id", "destination_id"]) + df_data = pd.merge(df_data, df_distances, on=["origin_id", "destination_id"]) df_data = df_data[["centroid_distance", "weight"]] - df_data = df_data.sort_values(by = "centroid_distance") + df_data = df_data.sort_values(by="centroid_distance") df_data["cdf"] = np.cumsum(df_data["weight"]) df_data["cdf"] /= df_data["cdf"].max() df_data = df_data[["centroid_distance", "cdf"]] diff --git a/analysis/reference/od/commute_flow.py b/analysis/reference/od/commute_flow.py index 0a693a0f..1a8b1981 100644 --- a/analysis/reference/od/commute_flow.py +++ b/analysis/reference/od/commute_flow.py @@ -1,36 +1,56 @@ import pandas as pd + def configure(context): context.stage("data.od.cleaned") context.stage("data.spatial.municipalities") + def execute(context): - df_codes = context.stage("data.spatial.municipalities")[[ - "commune_id", "departement_id" - ]] + df_codes = context.stage("data.spatial.municipalities")[ + ["commune_id", "departement_id"] + ] result = {} for df_data, name in zip(context.stage("data.od.cleaned"), ("work", "education")): df_data["origin_id"] = df_data["origin_id"].cat.remove_unused_categories() - df_data["destination_id"] = df_data["destination_id"].cat.remove_unused_categories() - - df_data = pd.merge(df_data, df_codes.rename(columns = { - "commune_id": "origin_id", - "departement_id": "origin_departement_id" - }), how = "left", on = "origin_id") - - df_data = pd.merge(df_data, df_codes.rename(columns = { - "commune_id": "destination_id", - "departement_id": "destination_departement_id" - }), how = "left", on = "destination_id") - - df_data = df_data[[ - "origin_departement_id", "destination_departement_id", "weight" - ]].rename(columns = { - "origin_departement_id": "home", - "destination_departement_id": name - }) + df_data["destination_id"] = df_data[ + "destination_id" + ].cat.remove_unused_categories() + + df_data = pd.merge( + df_data, + df_codes.rename( + columns={ + "commune_id": "origin_id", + "departement_id": "origin_departement_id", + } + ), + how="left", + on="origin_id", + ) + + df_data = pd.merge( + df_data, + df_codes.rename( + columns={ + "commune_id": "destination_id", + "departement_id": "destination_departement_id", + } + ), + how="left", + on="destination_id", + ) + + df_data = df_data[ + ["origin_departement_id", "destination_departement_id", "weight"] + ].rename( + columns={ + "origin_departement_id": "home", + "destination_departement_id": name, + } + ) df_data["home"] = df_data["home"].cat.remove_unused_categories() df_data[name] = df_data[name].cat.remove_unused_categories() diff --git a/analysis/statistics.py b/analysis/statistics.py index 498b9bff..1f577e42 100644 --- a/analysis/statistics.py +++ b/analysis/statistics.py @@ -4,11 +4,13 @@ import numpy as np import pandas as pd -@numba.jit(nopython = True, parallel = True) + +@numba.jit(nopython=True, parallel=True) def _combine_filter(filters): return np.logical_and.reduce(filters) -def marginalize(df, marginals, weight_column = "weight", count_column = "weight"): + +def marginalize(df, marginals, weight_column="weight", 
count_column="weight"): """ This function takes a data frame and a list of marginals in the form @@ -58,16 +60,26 @@ def marginalize(df, marginals, weight_column = "weight", count_column = "weight" results = {} for columns in marginals: - if len(columns) == 0: # Total is requested + if len(columns) == 0: # Total is requested total = len(df) if weight_column is None else df[weight_column].sum() - results[columns] = pd.DataFrame.from_records([["value", total]], columns = ["total", count_column]) + results[columns] = pd.DataFrame.from_records( + [["value", total]], columns=["total", count_column] + ) else: marginal_records = [] - value_index_lists = [np.arange(len(unique_values[column])) for column in columns] + value_index_lists = [ + np.arange(len(unique_values[column])) for column in columns + ] for value_indices in itertools.product(*value_index_lists): - marginal_values = [unique_values[column][value_index] for column, value_index in zip(columns, value_indices)] - marginal_filters = [filters[column][value_index] for column, value_index in zip(columns, value_indices)] + marginal_values = [ + unique_values[column][value_index] + for column, value_index in zip(columns, value_indices) + ] + marginal_filters = [ + filters[column][value_index] + for column, value_index in zip(columns, value_indices) + ] f = np.logical_and.reduce(marginal_filters) if weight_column is None: @@ -77,18 +89,19 @@ def marginalize(df, marginals, weight_column = "weight", count_column = "weight" marginal_records.append(marginal_values + [marginal_count]) - marginal_records = pd.DataFrame.from_records(marginal_records, columns = list(columns) + [count_column]) + marginal_records = pd.DataFrame.from_records( + marginal_records, columns=list(columns) + [count_column] + ) results[columns] = marginal_records return results + def apply_per_marginal(marginals, f): - return { - marginal: f(df) - for marginal, df in marginals.items() - } + return {marginal: f(df) for marginal, df in marginals.items()} + -def collect_sample(dfs, column = "realization"): +def collect_sample(dfs, column="realization"): """ This function combines multiple structurally equal data frames into one by adding an additional column denoting the number of the realization. 
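# Editor's illustrative sketch (not part of the diff): a minimal, hypothetical use of
# collect_sample as described by the docstring above. The two toy data frames and
# their values are assumptions made up for this example.
import pandas as pd

from analysis.statistics import collect_sample

# Two structurally identical marginal tables, e.g. produced by two bootstrap runs.
df_run_0 = pd.DataFrame({"age_class": [0, 1], "weight": [10.0, 20.0]})
df_run_1 = pd.DataFrame({"age_class": [0, 1], "weight": [12.0, 18.0]})

# The frames are stacked into one and each row is tagged with its realization index,
# so helpers such as analyze_sample can later group over the "realization" column.
df_sample = collect_sample([df_run_0, df_run_1], column="realization")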
@@ -108,7 +121,8 @@ def collect_sample(dfs, column = "realization"): return pd.concat(new_dfs) -def combine_marginals(realizations, column = "realization"): + +def combine_marginals(realizations, column="realization"): """ This function combines multiple realizations of the "marginalize" output into a new data structure that is equivalent to the one of "marginalize", but with @@ -117,7 +131,9 @@ def combine_marginals(realizations, column = "realization"): assert len(realizations) > 0 marginals = realizations[0].keys() - marginal_columns = { marginal: list(realizations[0][marginal].columns) for marginal in marginals } + marginal_columns = { + marginal: list(realizations[0][marginal].columns) for marginal in marginals + } # Check that all realizations have the same structure as the first for realization in realizations: @@ -130,21 +146,33 @@ def combine_marginals(realizations, column = "realization"): sample = {} for marginal in marginals: - sample[marginal] = collect_sample([realization[marginal] for realization in realizations], column) + sample[marginal] = collect_sample( + [realization[marginal] for realization in realizations], column + ) return sample -def bootstrap(df, bootstrap_size, random, realization_column = "realization", bootstrap_sample_size = None): + +def bootstrap( + df, + bootstrap_size, + random, + realization_column="realization", + bootstrap_sample_size=None, +): unique_realizations = np.unique(df[realization_column]) realizations = df[realization_column].values - indices = [list(np.where(realizations == realization)[0]) for realization in unique_realizations] + indices = [ + list(np.where(realizations == realization)[0]) + for realization in unique_realizations + ] lengths = [len(i) for i in indices] if bootstrap_sample_size is None: bootstrap_sample_size = len(indices) - counts = random.randint(len(indices), size = (bootstrap_size, bootstrap_sample_size)) + counts = random.randint(len(indices), size=(bootstrap_size, bootstrap_sample_size)) for selection in counts: selection_indices = [] @@ -159,17 +187,23 @@ def bootstrap(df, bootstrap_size, random, realization_column = "realization", bo yield df_sample -def apply_bootstrap(df, bootstrap_size, random, f, realization_column = "realization"): + +def apply_bootstrap(df, bootstrap_size, random, f, realization_column="realization"): df_bootstrap = [] - for bootstrap_realization, df_sample in enumerate(bootstrap(df, bootstrap_size, random, realization_column)): + for bootstrap_realization, df_sample in enumerate( + bootstrap(df, bootstrap_size, random, realization_column) + ): df_sample = f(df_sample) df_sample[realization_column] = bootstrap_realization df_bootstrap.append(df_sample) return pd.concat(df_bootstrap) -def analyze_sample(df, realization_column = "realization", columns = ["weight"], statistics = None): + +def analyze_sample( + df, realization_column="realization", columns=["weight"], statistics=None +): assert realization_column in df if columns is None or len(columns) == 0: @@ -183,128 +217,171 @@ def analyze_sample(df, realization_column = "realization", columns = ["weight"], assert column in df.columns group_columns = list(df.columns) - for column in columns: group_columns.remove(column) + for column in columns: + group_columns.remove(column) group_columns.remove(realization_column) if statistics is None: statistics = { column: [ - ("mean", "mean"), ("median", "median"), ("min", "min"), ("max", "max"), - ("q10", lambda x: x.quantile(0.1)), ("q90", lambda x: x.quantile(0.9)), - ("q5", lambda x: 
x.quantile(0.05)), ("q95", lambda x: x.quantile(0.95)) + ("mean", "mean"), + ("median", "median"), + ("min", "min"), + ("max", "max"), + ("q10", lambda x: x.quantile(0.1)), + ("q90", lambda x: x.quantile(0.9)), + ("q5", lambda x: x.quantile(0.05)), + ("q95", lambda x: x.quantile(0.95)), ] for column in columns } - df = df[group_columns + columns].groupby(group_columns).aggregate(statistics).reset_index() + df = ( + df[group_columns + columns] + .groupby(group_columns) + .aggregate(statistics) + .reset_index() + ) return df -def analyze_sample_and_flatten(df, realization_column = "realization", columns = ["weight"], statistics = None): + +def analyze_sample_and_flatten( + df, realization_column="realization", columns=["weight"], statistics=None +): df = analyze_sample(df, realization_column, columns, statistics) df.columns = [c[1] if c[0] == "weight" else c[0] for c in df.columns] return df -def sample_subsets(df, subset_size, random, realization_column = "realization"): + +def sample_subsets(df, subset_size, random, realization_column="realization"): realizations = len(np.unique(df[realization_column])) return bootstrap(df, realizations, random, realization_column, subset_size) -def average_subsets(df, subset_size, random, realization_column = "realization", weight_column = "weight"): + +def average_subsets( + df, subset_size, random, realization_column="realization", weight_column="weight" +): df_output = [] - for realization, df_subset in enumerate(sample_subsets(df, subset_size, random, realization_column)): - df_subset = analyze_sample(df_subset, realization_column, weight_column, [("weight", "mean")]) + for realization, df_subset in enumerate( + sample_subsets(df, subset_size, random, realization_column) + ): + df_subset = analyze_sample( + df_subset, realization_column, weight_column, [("weight", "mean")] + ) df_subset[realization_column] = realization df_output.append(df_subset) return pd.concat(df_output) + if __name__ == "__main__": + def create_sample(random_seed): random = np.random.RandomState(random_seed) index = np.arange(100) - ages = random.randint(10, size = 100) * 10 - gender = random.randint(2, size = 100) + ages = random.randint(10, size=100) * 10 + gender = random.randint(2, size=100) - df = pd.DataFrame.from_records(zip(index, ages, gender), columns = ["person", "age", "gender"]) - df["gender"] = df["gender"].map({ 0: "male", 1: "female" }).astype("category") + df = pd.DataFrame.from_records( + zip(index, ages, gender), columns=["person", "age", "gender"] + ) + df["gender"] = df["gender"].map({0: "male", 1: "female"}).astype("category") df["weight"] = 1.0 return df - df = pd.DataFrame.from_records([ - { "age": 20, "weight": 10.0, "abc": 10.0, "realization": 0 }, - { "age": 50, "weight": 50.0, "abc": 50.0, "realization": 0 }, - { "age": 20, "weight": 20.0, "abc": 20.0, "realization": 1 }, - { "age": 50, "weight": 60.0, "abc": 60.0, "realization": 1 }, - ]) + df = pd.DataFrame.from_records( + [ + {"age": 20, "weight": 10.0, "abc": 10.0, "realization": 0}, + {"age": 50, "weight": 50.0, "abc": 50.0, "realization": 0}, + {"age": 20, "weight": 20.0, "abc": 20.0, "realization": 1}, + {"age": 50, "weight": 60.0, "abc": 60.0, "realization": 1}, + ] + ) random = np.random.RandomState(0) statistics = { "weight": [("mean", "mean")], - "abc": [("q95", lambda x: x.quantile(0.95))] + "abc": [("q95", lambda x: x.quantile(0.95))], } - df = apply_bootstrap(df, 100, random, lambda df: analyze_sample(df, statistics = statistics, columns = ["weight", "abc"])) - - df = 
df.groupby("age").aggregate([ - ("mean", "mean"), - ("q10", lambda x: x.quantile(0.1)), - ("q90", lambda x: x.quantile(0.9)) - ]).reset_index() + df = apply_bootstrap( + df, + 100, + random, + lambda df: analyze_sample(df, statistics=statistics, columns=["weight", "abc"]), + ) + + df = ( + df.groupby("age") + .aggregate( + [ + ("mean", "mean"), + ("q10", lambda x: x.quantile(0.1)), + ("q90", lambda x: x.quantile(0.9)), + ] + ) + .reset_index() + ) print(df) - - - exit() random = np.random.RandomState(0) - #for df_subset in sample_subsets(df, 3, random): + # for df_subset in sample_subsets(df, 3, random): # print(df_subset) print(average_subsets(df, 3, random)) - print(apply_bootstrap(average_subsets(df, 3, random), 100, random, lambda df: analyze_sample(df))) + print( + apply_bootstrap( + average_subsets(df, 3, random), 100, random, lambda df: analyze_sample(df) + ) + ) exit() - #print(analyze(df)) + # print(analyze(df)) - #for df_sample in bootstrap(df, 100, random): + # for df_sample in bootstrap(df, 100, random): # df_sample = analyze(df_sample) # print(df_sample) - statistics = [ - ("precision", lambda x: np.mean(x < 55.0)) - ] - - df = apply_bootstrap(df, 100, random, lambda df: analyze_sample(df, statistics = statistics)) - df = df.groupby(["age"]).aggregate([ - ("mean", "mean"), - ("q10", lambda x: x.quantile(0.1)), - ("q90", lambda x: x.quantile(0.9)) - ]).reset_index() - - + statistics = [("precision", lambda x: np.mean(x < 55.0))] + + df = apply_bootstrap( + df, 100, random, lambda df: analyze_sample(df, statistics=statistics) + ) + df = ( + df.groupby(["age"]) + .aggregate( + [ + ("mean", "mean"), + ("q10", lambda x: x.quantile(0.1)), + ("q90", lambda x: x.quantile(0.9)), + ] + ) + .reset_index() + ) print(df) exit() print() - exit() sample = [create_sample(R) for R in range(2)] random = np.random.RandomState(5) - #marginals = [marginalize(df, [("age",), ("gender",), ("age", "gender"), tuple()]) for df in sample] + # marginals = [marginalize(df, [("age",), ("gender",), ("age", "gender"), tuple()]) for df in sample] marginals = [marginalize(df, [("gender",)]) for df in sample] marginals = collect_marginalized_sample(marginals) - metrics = bootstrap_sampled_marginals(marginals, 100, subset_size = 2, random = random) + metrics = bootstrap_sampled_marginals(marginals, 100, subset_size=2, random=random) print(metrics[("gender",)]) diff --git a/analysis/synthesis/commute_distance.py b/analysis/synthesis/commute_distance.py index b8a83a8e..ec9f8946 100644 --- a/analysis/synthesis/commute_distance.py +++ b/analysis/synthesis/commute_distance.py @@ -6,30 +6,52 @@ import analysis.statistics as stats import analysis.marginals as marginals + def configure(context): acquisition_sample_size = context.config("acquisition_sample_size") - bs.configure(context, "synthesis.population.spatial.home.locations", acquisition_sample_size) - bs.configure(context, "synthesis.population.spatial.primary.locations", acquisition_sample_size) + bs.configure( + context, "synthesis.population.spatial.home.locations", acquisition_sample_size + ) + bs.configure( + context, + "synthesis.population.spatial.primary.locations", + acquisition_sample_size, + ) bs.configure(context, "synthesis.population.sampled", acquisition_sample_size) + def execute(context): acquisition_sample_size = context.config("acquisition_sample_size") feeder = zip( - bs.get_stages(context, "synthesis.population.spatial.home.locations", acquisition_sample_size), - bs.get_stages(context, "synthesis.population.spatial.primary.locations", 
acquisition_sample_size), + bs.get_stages( + context, + "synthesis.population.spatial.home.locations", + acquisition_sample_size, + ), + bs.get_stages( + context, + "synthesis.population.spatial.primary.locations", + acquisition_sample_size, + ), bs.get_stages(context, "synthesis.population.sampled", acquisition_sample_size), ) probabilities = np.linspace(0.0, 1.0, 20) - quantiles = { "work": [], "education": [] } + quantiles = {"work": [], "education": []} - with context.progress(label = "Processing commute data ...", total = acquisition_sample_size) as progress: + with context.progress( + label="Processing commute data ...", total=acquisition_sample_size + ) as progress: for df_home, df_spatial, df_persons in feeder: # Prepare home - df_home = pd.merge(df_home, df_persons[["person_id", "household_id"]], on = "household_id") - df_home = df_home[["person_id", "geometry"]].set_index("person_id").sort_index() + df_home = pd.merge( + df_home, df_persons[["person_id", "household_id"]], on="household_id" + ) + df_home = ( + df_home[["person_id", "geometry"]].set_index("person_id").sort_index() + ) assert len(df_home) == len(df_persons) for index, name in enumerate(("work", "education")): @@ -40,12 +62,11 @@ def execute(context): df_compare = df_home.loc[df_destination.index] assert len(df_destination) == len(df_compare) - distances = df_destination["geometry"].distance(df_compare["geometry"]) * 1e-3 + distances = ( + df_destination["geometry"].distance(df_compare["geometry"]) * 1e-3 + ) - quantiles[name].append([ - distances.quantile(p) - for p in probabilities - ]) + quantiles[name].append([distances.quantile(p) for p in probabilities]) progress.update() @@ -54,11 +75,11 @@ def execute(context): for name in ("work", "education"): data = np.array(quantiles[name]) - mean = np.mean(data, axis = 0) - min = np.min(data, axis = 0) - max = np.max(data, axis = 0) + mean = np.mean(data, axis=0) + min = np.min(data, axis=0) + max = np.max(data, axis=0) - df = pd.DataFrame(dict(mean = mean, min = min, max = max, cdf = probabilities)) + df = pd.DataFrame(dict(mean=mean, min=min, max=max, cdf=probabilities)) result[name] = df return result diff --git a/analysis/synthesis/commute_flow.py b/analysis/synthesis/commute_flow.py index 82119898..c96cd61b 100644 --- a/analysis/synthesis/commute_flow.py +++ b/analysis/synthesis/commute_flow.py @@ -5,56 +5,94 @@ import analysis.statistics as stats import analysis.marginals as marginals + def configure(context): acquisition_sample_size = context.config("acquisition_sample_size") - bs.configure(context, "synthesis.population.spatial.home.zones", acquisition_sample_size) - bs.configure(context, "synthesis.population.spatial.primary.locations", acquisition_sample_size) + bs.configure( + context, "synthesis.population.spatial.home.zones", acquisition_sample_size + ) + bs.configure( + context, + "synthesis.population.spatial.primary.locations", + acquisition_sample_size, + ) bs.configure(context, "synthesis.population.sampled", acquisition_sample_size) context.stage("data.spatial.municipalities") + def execute(context): - df_codes = context.stage("data.spatial.municipalities")[[ - "commune_id", "departement_id" - ]] + df_codes = context.stage("data.spatial.municipalities")[ + ["commune_id", "departement_id"] + ] acquisition_sample_size = context.config("acquisition_sample_size") feeder = zip( - bs.get_stages(context, "synthesis.population.spatial.home.zones", acquisition_sample_size), - bs.get_stages(context, "synthesis.population.spatial.primary.locations", 
acquisition_sample_size), + bs.get_stages( + context, "synthesis.population.spatial.home.zones", acquisition_sample_size + ), + bs.get_stages( + context, + "synthesis.population.spatial.primary.locations", + acquisition_sample_size, + ), bs.get_stages(context, "synthesis.population.sampled", acquisition_sample_size), ) work_flows = [] education_flows = [] - with context.progress(label = "Processing commute data ...", total = acquisition_sample_size) as progress: + with context.progress( + label="Processing commute data ...", total=acquisition_sample_size + ) as progress: for realization, (df_home, df_spatial, df_persons) in enumerate(feeder): # Prepare home - df_home = pd.merge(df_persons[["person_id", "household_id"]], df_home, on = "household_id") - df_home = df_home[["person_id", "departement_id"]].rename(columns = { "departement_id": "home" }) + df_home = pd.merge( + df_persons[["person_id", "household_id"]], df_home, on="household_id" + ) + df_home = df_home[["person_id", "departement_id"]].rename( + columns={"departement_id": "home"} + ) # Prepare work df_work = df_spatial[0] - df_work = pd.merge(df_work, df_codes, how = "left", on = "commune_id") - df_work["departement_id"] = df_work["departement_id"].cat.remove_unused_categories() - df_work = df_work[["person_id", "departement_id"]].rename(columns = { "departement_id": "work" }) + df_work = pd.merge(df_work, df_codes, how="left", on="commune_id") + df_work["departement_id"] = df_work[ + "departement_id" + ].cat.remove_unused_categories() + df_work = df_work[["person_id", "departement_id"]].rename( + columns={"departement_id": "work"} + ) # Calculate work - df_work = pd.merge(df_home, df_work, on = "person_id").groupby(["home", "work"]).size().reset_index(name = "weight") + df_work = ( + pd.merge(df_home, df_work, on="person_id") + .groupby(["home", "work"]) + .size() + .reset_index(name="weight") + ) df_work["realization"] = realization work_flows.append(df_work) # Prepare work df_education = df_spatial[1] - df_education = pd.merge(df_education, df_codes, how = "left", on = "commune_id") - df_education["departement_id"] = df_education["departement_id"].cat.remove_unused_categories() - df_education = df_education[["person_id", "departement_id"]].rename(columns = { "departement_id": "education" }) + df_education = pd.merge(df_education, df_codes, how="left", on="commune_id") + df_education["departement_id"] = df_education[ + "departement_id" + ].cat.remove_unused_categories() + df_education = df_education[["person_id", "departement_id"]].rename( + columns={"departement_id": "education"} + ) # Calculate education - df_education = pd.merge(df_home, df_education, on = "person_id").groupby(["home", "education"]).size().reset_index(name = "weight") + df_education = ( + pd.merge(df_home, df_education, on="person_id") + .groupby(["home", "education"]) + .size() + .reset_index(name="weight") + ) df_education["realization"] = realization education_flows.append(df_education) @@ -66,4 +104,4 @@ def execute(context): df_work = stats.analyze_sample_and_flatten(df_work) df_education = stats.analyze_sample_and_flatten(df_education) - return dict(work = df_work, education = df_education) + return dict(work=df_work, education=df_education) diff --git a/analysis/synthesis/income.py b/analysis/synthesis/income.py index f37131e4..1a49af17 100644 --- a/analysis/synthesis/income.py +++ b/analysis/synthesis/income.py @@ -6,9 +6,13 @@ import analysis.statistics as stats import analysis.marginals as marginals + def configure(context): acquisition_sample_size 
= context.config("acquisition_sample_size") - bs.configure(context, "synthesis.population.income.selected", acquisition_sample_size) + bs.configure( + context, "synthesis.population.income.selected", acquisition_sample_size + ) + def execute(context): acquisition_sample_size = context.config("acquisition_sample_size") @@ -16,16 +20,20 @@ def execute(context): probabilities = np.linspace(0.0, 1.0, 20) quantiles = [] - with context.progress(label = "Processing commute data ...", total = acquisition_sample_size) as progress: - for df_income in bs.get_stages(context, "synthesis.population.income.selected", acquisition_sample_size): + with context.progress( + label="Processing commute data ...", total=acquisition_sample_size + ) as progress: + for df_income in bs.get_stages( + context, "synthesis.population.income.selected", acquisition_sample_size + ): income = 12 * df_income["household_income"] / df_income["consumption_units"] quantiles.append([income.quantile(p) for p in probabilities]) progress.update() quantiles = np.array(quantiles) - mean = np.mean(quantiles, axis = 0) - min = np.min(quantiles, axis = 0) - max = np.max(quantiles, axis = 0) + mean = np.mean(quantiles, axis=0) + min = np.min(quantiles, axis=0) + max = np.max(quantiles, axis=0) - return pd.DataFrame(dict(mean = mean, min = min, max = max, cdf = probabilities)) + return pd.DataFrame(dict(mean=mean, min=min, max=max, cdf=probabilities)) diff --git a/analysis/synthesis/matching.py b/analysis/synthesis/matching.py index 1c66c14b..f3b33884 100644 --- a/analysis/synthesis/matching.py +++ b/analysis/synthesis/matching.py @@ -2,15 +2,21 @@ import analysis.statistics as stats import analysis.marginals as marginals + def configure(context): acquisition_sample_size = context.config("acquisition_sample_size") random_seeds = (np.arange(acquisition_sample_size) * 1000 + 1000).astype(int) for index, random_seed in enumerate(random_seeds): - context.stage("synthesis.population.matched", { - "random_seed": int(random_seed), - "sampling_rate": context.config("sampling_rate") - }, alias = "seed_%d" % index) + context.stage( + "synthesis.population.matched", + { + "random_seed": int(random_seed), + "sampling_rate": context.config("sampling_rate"), + }, + alias="seed_%d" % index, + ) + def execute(context): acquisition_sample_size = context.config("acquisition_sample_size") @@ -26,6 +32,6 @@ def execute(context): aggregated[key].append(value) - aggregated = { k: np.array(v) for k, v in aggregated.items() } + aggregated = {k: np.array(v) for k, v in aggregated.items()} return aggregated diff --git a/analysis/synthesis/mode_distances.py b/analysis/synthesis/mode_distances.py index eea6b36d..e5f6a686 100644 --- a/analysis/synthesis/mode_distances.py +++ b/analysis/synthesis/mode_distances.py @@ -5,50 +5,80 @@ import analysis.statistics as stats import analysis.marginals as marginals + def configure(context): acquisition_sample_size = context.config("acquisition_sample_size") - bs.configure(context, "synthesis.population.spatial.locations", acquisition_sample_size) + bs.configure( + context, "synthesis.population.spatial.locations", acquisition_sample_size + ) bs.configure(context, "synthesis.population.trips", acquisition_sample_size) + def execute(context): acquisition_sample_size = context.config("acquisition_sample_size") probabilities = np.linspace(0.0, 1.0, 20) modes = ["car", "car_passenger", "pt", "bike", "walk"] - quantiles = { mode : [] for mode in modes } + quantiles = {mode: [] for mode in modes} generator = zip( - 
bs.get_stages(context, "synthesis.population.spatial.locations", acquisition_sample_size), - bs.get_stages(context, "synthesis.population.trips", acquisition_sample_size) + bs.get_stages( + context, "synthesis.population.spatial.locations", acquisition_sample_size + ), + bs.get_stages(context, "synthesis.population.trips", acquisition_sample_size), ) - with context.progress(label = "Processing distance data ...", total = acquisition_sample_size) as progress: + with context.progress( + label="Processing distance data ...", total=acquisition_sample_size + ) as progress: for df_locations, df_trips in generator: # Load locations and calculate euclidean distances - df_locations = df_locations[["person_id", "activity_index", "geometry"]].rename(columns = { "activity_index": "trip_index" }) - df_locations["euclidean_distance"] = df_locations["geometry"].distance(df_locations["geometry"].shift(-1)) + df_locations = df_locations[ + ["person_id", "activity_index", "geometry"] + ].rename(columns={"activity_index": "trip_index"}) + df_locations["euclidean_distance"] = df_locations["geometry"].distance( + df_locations["geometry"].shift(-1) + ) # Merge mode into distances df_trips = pd.merge( - df_trips[["person_id", "trip_index", "mode", "preceding_purpose", "following_purpose", "departure_time", "arrival_time"]], - df_locations, on = ["person_id", "trip_index"], how = "inner" + df_trips[ + [ + "person_id", + "trip_index", + "mode", + "preceding_purpose", + "following_purpose", + "departure_time", + "arrival_time", + ] + ], + df_locations, + on=["person_id", "trip_index"], + how="inner", + ) + df_trips["travel_time"] = ( + df_trips["arrival_time"] - df_trips["departure_time"] ) - df_trips["travel_time"] = df_trips["arrival_time"] - df_trips["departure_time"] # Filter trips primary_activities = ["home", "work", "education"] - #primary_activities = [] - df_trips = df_trips[~( - df_trips["preceding_purpose"].isin(primary_activities) & - df_trips["following_purpose"].isin(primary_activities) - )] + # primary_activities = [] + df_trips = df_trips[ + ~( + df_trips["preceding_purpose"].isin(primary_activities) + & df_trips["following_purpose"].isin(primary_activities) + ) + ] # Calculate quantiles for mode in modes: df_mode = df_trips[df_trips["mode"] == mode] - quantiles[mode].append([df_mode["euclidean_distance"].quantile(p) for p in probabilities]) + quantiles[mode].append( + [df_mode["euclidean_distance"].quantile(p) for p in probabilities] + ) progress.update() @@ -58,14 +88,16 @@ def execute(context): df_data = [] for mode in modes: - mean = np.mean(quantiles[mode], axis = 0) - #min = np.percentile(quantiles[mode], 5, axis = 0) - #max = np.percentile(quantiles[mode], 95, axis = 0) + mean = np.mean(quantiles[mode], axis=0) + # min = np.percentile(quantiles[mode], 5, axis = 0) + # max = np.percentile(quantiles[mode], 95, axis = 0) - min = np.min(quantiles[mode], axis = 0) - max = np.max(quantiles[mode], axis = 0) + min = np.min(quantiles[mode], axis=0) + max = np.max(quantiles[mode], axis=0) - df_data.append(pd.DataFrame(dict(mean = mean, min = min, max = max, cdf = probabilities))) + df_data.append( + pd.DataFrame(dict(mean=mean, min=min, max=max, cdf=probabilities)) + ) df_data[-1]["mode"] = mode return pd.concat(df_data) diff --git a/analysis/synthesis/sociodemographics/chains.py b/analysis/synthesis/sociodemographics/chains.py index 3c90e9bc..c90c95c8 100644 --- a/analysis/synthesis/sociodemographics/chains.py +++ b/analysis/synthesis/sociodemographics/chains.py @@ -5,7 +5,13 @@ import 
analysis.statistics as stats import analysis.marginals as marginals -from analysis.chains import aggregate_chains, CHAIN_MARGINALS, CHAIN_LENGTH_LIMIT, CHAIN_TOP_K +from analysis.chains import ( + aggregate_chains, + CHAIN_MARGINALS, + CHAIN_LENGTH_LIMIT, + CHAIN_TOP_K, +) + def configure(context): acquisition_sample_size = context.config("acquisition_sample_size") @@ -13,26 +19,41 @@ def configure(context): bs.configure(context, "synthesis.population.sampled", acquisition_sample_size) bs.configure(context, "synthesis.population.activities", acquisition_sample_size) + def execute_parallel(context, data): acquisition_sample_size = context.config("acquisition_sample_size") df_population, df_chains = data - df_chains = df_chains[["person_id", "activity_index", "purpose"]].sort_values(by = ["person_id", "activity_index"]) + df_chains = df_chains[["person_id", "activity_index", "purpose"]].sort_values( + by=["person_id", "activity_index"] + ) df_chains = aggregate_chains(df_chains) marginals.prepare_classes(df_population) - df_chains = pd.merge(df_population[["person_id", "age_class", "sex", "age"]], df_chains, on = "person_id") - df_chains["chain_length_class"] = np.minimum(df_chains["chain_length"], CHAIN_LENGTH_LIMIT) + df_chains = pd.merge( + df_population[["person_id", "age_class", "sex", "age"]], + df_chains, + on="person_id", + ) + df_chains["chain_length_class"] = np.minimum( + df_chains["chain_length"], CHAIN_LENGTH_LIMIT + ) - top_k_chains = df_chains.groupby("chain").size().reset_index(name = "weight").sort_values( - by = "weight", ascending = False - ).head(CHAIN_TOP_K)["chain"].values + top_k_chains = ( + df_chains.groupby("chain") + .size() + .reset_index(name="weight") + .sort_values(by="weight", ascending=False) + .head(CHAIN_TOP_K)["chain"] + .values + ) df_chains = df_chains[df_chains["chain"].isin(top_k_chains)] df_chains["age_range"] = (df_chains["age"] >= 18) & (df_chains["age"] <= 40) context.progress.update() - return stats.marginalize(df_chains, CHAIN_MARGINALS, weight_column = None) + return stats.marginalize(df_chains, CHAIN_MARGINALS, weight_column=None) + def execute(context): acquisition_sample_size = context.config("acquisition_sample_size") @@ -41,10 +62,14 @@ def execute(context): feeder = zip( bs.get_stages(context, "synthesis.population.sampled", acquisition_sample_size), - bs.get_stages(context, "synthesis.population.activities", acquisition_sample_size) + bs.get_stages( + context, "synthesis.population.activities", acquisition_sample_size + ), ) - with context.progress(label = "Marginalizing chain data ...", total = acquisition_sample_size): + with context.progress( + label="Marginalizing chain data ...", total=acquisition_sample_size + ): with context.parallel() as parallel: data = list(parallel.imap_unordered(execute_parallel, feeder)) diff --git a/analysis/synthesis/sociodemographics/general.py b/analysis/synthesis/sociodemographics/general.py index c396231f..854e4360 100644 --- a/analysis/synthesis/sociodemographics/general.py +++ b/analysis/synthesis/sociodemographics/general.py @@ -2,26 +2,44 @@ import analysis.statistics as stats import analysis.marginals as marginals + def configure(context): acquisition_sample_size = context.config("acquisition_sample_size") bs.configure(context, "synthesis.population.enriched", acquisition_sample_size) + def execute(context): acquisition_sample_size = context.config("acquisition_sample_size") person_marginals = [] household_marginals = [] - for df in bs.get_stages(context, "synthesis.population.enriched", 
acquisition_sample_size): + for df in bs.get_stages( + context, "synthesis.population.enriched", acquisition_sample_size + ): marginals.prepare_classes(df) - person_marginals.append(stats.marginalize(df, marginals.ANALYSIS_PERSON_MARGINALS, weight_column = None)) - household_marginals.append(stats.marginalize(df.drop_duplicates("household_id"), marginals.ANALYSIS_HOUSEHOLD_MARGINALS, weight_column = None)) + person_marginals.append( + stats.marginalize( + df, marginals.ANALYSIS_PERSON_MARGINALS, weight_column=None + ) + ) + household_marginals.append( + stats.marginalize( + df.drop_duplicates("household_id"), + marginals.ANALYSIS_HOUSEHOLD_MARGINALS, + weight_column=None, + ) + ) person_marginals = stats.combine_marginals(person_marginals) household_marginals = stats.combine_marginals(household_marginals) - person_marginals = stats.apply_per_marginal(person_marginals, stats.analyze_sample_and_flatten) - household_marginals = stats.apply_per_marginal(household_marginals, stats.analyze_sample_and_flatten) + person_marginals = stats.apply_per_marginal( + person_marginals, stats.analyze_sample_and_flatten + ) + household_marginals = stats.apply_per_marginal( + household_marginals, stats.analyze_sample_and_flatten + ) - return dict(person = person_marginals, household = household_marginals) + return dict(person=person_marginals, household=household_marginals) diff --git a/analysis/synthesis/sociodemographics/spatial.py b/analysis/synthesis/sociodemographics/spatial.py index baba7e07..3204eea8 100644 --- a/analysis/synthesis/sociodemographics/spatial.py +++ b/analysis/synthesis/sociodemographics/spatial.py @@ -4,11 +4,15 @@ import pandas as pd + def configure(context): acquisition_sample_size = context.config("acquisition_sample_size") bs.configure(context, "synthesis.population.enriched", acquisition_sample_size) - bs.configure(context, "synthesis.population.spatial.home.zones", acquisition_sample_size) + bs.configure( + context, "synthesis.population.spatial.home.zones", acquisition_sample_size + ) + def execute(context): acquisition_sample_size = context.config("acquisition_sample_size") @@ -17,21 +21,39 @@ def execute(context): household_marginals = [] feeder = zip( - bs.get_stages(context, "synthesis.population.enriched", acquisition_sample_size), - bs.get_stages(context, "synthesis.population.spatial.home.zones", acquisition_sample_size) + bs.get_stages( + context, "synthesis.population.enriched", acquisition_sample_size + ), + bs.get_stages( + context, "synthesis.population.spatial.home.zones", acquisition_sample_size + ), ) for df, df_home in feeder: df = pd.merge(df, df_home[["household_id", "departement_id", "commune_id"]]) marginals.prepare_classes(df) - person_marginals.append(stats.marginalize(df, marginals.SPATIAL_PERSON_MARGINALS, weight_column = None)) - household_marginals.append(stats.marginalize(df.drop_duplicates("household_id"), marginals.SPATIAL_HOUSEHOLD_MARGINALS, weight_column = None)) + person_marginals.append( + stats.marginalize( + df, marginals.SPATIAL_PERSON_MARGINALS, weight_column=None + ) + ) + household_marginals.append( + stats.marginalize( + df.drop_duplicates("household_id"), + marginals.SPATIAL_HOUSEHOLD_MARGINALS, + weight_column=None, + ) + ) person_marginals = stats.combine_marginals(person_marginals) household_marginals = stats.combine_marginals(household_marginals) - person_marginals = stats.apply_per_marginal(person_marginals, stats.analyze_sample_and_flatten) - household_marginals = stats.apply_per_marginal(household_marginals, 
stats.analyze_sample_and_flatten) + person_marginals = stats.apply_per_marginal( + person_marginals, stats.analyze_sample_and_flatten + ) + household_marginals = stats.apply_per_marginal( + household_marginals, stats.analyze_sample_and_flatten + ) - return dict(person = person_marginals, household = household_marginals) + return dict(person=person_marginals, household=household_marginals) diff --git a/analysis/synthesis/statistics/marginal.py b/analysis/synthesis/statistics/marginal.py index 7e140d5c..8afcec37 100644 --- a/analysis/synthesis/statistics/marginal.py +++ b/analysis/synthesis/statistics/marginal.py @@ -5,17 +5,25 @@ import analysis.statistics as stats MARGINALS = [ - ("age_class",), ("sex",), ("employed",), ("studies",), - ("socioprofessional_class",), ("age_class", "employed") + ("age_class",), + ("sex",), + ("employed",), + ("studies",), + ("socioprofessional_class",), + ("age_class", "employed"), ] + def configure(context): context.config("random_seed") - context.stage("synthesis.population.sampled", dict( - random_seed = context.config("random_seed") - ), alias = "sample") + context.stage( + "synthesis.population.sampled", + dict(random_seed=context.config("random_seed")), + alias="sample", + ) + def execute(context): df = context.stage("sample") marginals.prepare_classes(df) - return stats.marginalize(df, MARGINALS, weight_column = None) + return stats.marginalize(df, MARGINALS, weight_column=None) diff --git a/analysis/synthesis/statistics/monte_carlo.py b/analysis/synthesis/statistics/monte_carlo.py index 23b9892d..84d0bd8c 100644 --- a/analysis/synthesis/statistics/monte_carlo.py +++ b/analysis/synthesis/statistics/monte_carlo.py @@ -11,23 +11,33 @@ from analysis.synthesis.statistics.marginal import MARGINALS + def configure(context): context.stage("analysis.reference.census.sociodemographics") for sampling_rate in SAMPLING_RATES: - bt.configure(context, "analysis.synthesis.statistics.marginal", ACQUISITION_SAMPLE_SIZE, dict( - sampling_rate = sampling_rate - ), alias = "sample_%f" % sampling_rate) + bt.configure( + context, + "analysis.synthesis.statistics.marginal", + ACQUISITION_SAMPLE_SIZE, + dict(sampling_rate=sampling_rate), + alias="sample_%f" % sampling_rate, + ) + STATISTICS = [ - ("mean", "mean"), ("q5", lambda x: x.quantile(0.05)), ("q95", lambda x: x.quantile(0.95)) + ("mean", "mean"), + ("q5", lambda x: x.quantile(0.05)), + ("q95", lambda x: x.quantile(0.95)), ] STATISTICS = { - "weight": STATISTICS, "error": STATISTICS, - "error_probability": [("mean", "mean")] + "weight": STATISTICS, + "error": STATISTICS, + "error_probability": [("mean", "mean")], } + def process(context, k): reference = context.data("reference") partial_marginals = context.data("partial_marginals") @@ -40,12 +50,23 @@ def process(context, k): df_marginal = k_marginals[marginal] df_reference = reference[marginal] - df_marginal = pd.merge(df_marginal, df_reference.rename(columns = { "weight": "reference" }), on = marginal) + df_marginal = pd.merge( + df_marginal, + df_reference.rename(columns={"weight": "reference"}), + on=marginal, + ) df_marginal["weight"] /= sampling_rate df_marginal["error"] = df_marginal["weight"] / df_marginal["reference"] - 1 - df_marginal["error_probability"] = np.abs(df_marginal["error"]) <= ERROR_THRESHOLD + df_marginal["error_probability"] = ( + np.abs(df_marginal["error"]) <= ERROR_THRESHOLD + ) - df = df_marginal[list(marginal) + ["weight", "error", "error_probability"]].groupby(list(marginal)).aggregate(STATISTICS).reset_index() + df = ( + 
df_marginal[list(marginal) + ["weight", "error", "error_probability"]] + .groupby(list(marginal)) + .aggregate(STATISTICS) + .reset_index() + ) df["samples"] = k df["sampling_rate"] = sampling_rate @@ -55,19 +76,36 @@ def process(context, k): return output + def execute(context): reference = context.stage("analysis.reference.census.sociodemographics")["person"] - output = { marginal: [] for marginal in MARGINALS } + output = {marginal: [] for marginal in MARGINALS} total = len(SAMPLING_RATES) * len(MARGINALS) * ACQUISITION_SAMPLE_SIZE - with context.progress(label = "Running Monte Carlo analysis ...", total = total) as progress: + with context.progress( + label="Running Monte Carlo analysis ...", total=total + ) as progress: for sampling_rate in SAMPLING_RATES: - partial_marginals = list(bt.get_stages(context, "sample_%f" % sampling_rate, sample_size = ACQUISITION_SAMPLE_SIZE)) - - with context.parallel(data = dict(partial_marginals = partial_marginals, reference = reference, sampling_rate = sampling_rate)) as parallel: - - for partial_output in parallel.imap_unordered(process, np.arange(1, ACQUISITION_SAMPLE_SIZE + 1)): + partial_marginals = list( + bt.get_stages( + context, + "sample_%f" % sampling_rate, + sample_size=ACQUISITION_SAMPLE_SIZE, + ) + ) + + with context.parallel( + data=dict( + partial_marginals=partial_marginals, + reference=reference, + sampling_rate=sampling_rate, + ) + ) as parallel: + + for partial_output in parallel.imap_unordered( + process, np.arange(1, ACQUISITION_SAMPLE_SIZE + 1) + ): for marginal in MARGINALS: output[marginal].append(partial_output[marginal]) diff --git a/data/ban/raw.py b/data/ban/raw.py index 764c6c8f..7f97064f 100644 --- a/data/ban/raw.py +++ b/data/ban/raw.py @@ -7,17 +7,16 @@ This stage loads the raw data from the new French address registry (BAN). 
""" + def configure(context): context.stage("data.spatial.codes") context.config("data_path") context.config("ban_path", "ban_idf") -BAN_DTYPES = { - "code_insee": str, - "x": float, - "y": float -} + +BAN_DTYPES = {"code_insee": str, "x": float, "y": float} + def execute(context): # Find relevant departments @@ -27,12 +26,19 @@ def execute(context): # Load BAN df_ban = [] - for source_path in find_ban("{}/{}".format(context.config("data_path"), context.config("ban_path"))): + for source_path in find_ban( + "{}/{}".format(context.config("data_path"), context.config("ban_path")) + ): print("Reading {} ...".format(source_path)) - df_partial = pd.read_csv(source_path, - compression = "gzip", sep = ";", usecols = BAN_DTYPES.keys(), dtype = BAN_DTYPES) - + df_partial = pd.read_csv( + source_path, + compression="gzip", + sep=";", + usecols=BAN_DTYPES.keys(), + dtype=BAN_DTYPES, + ) + # Filter by departments df_partial["department_id"] = df_partial["code_insee"].str[:2] df_partial = df_partial[["department_id", "x", "y"]] @@ -40,25 +46,30 @@ def execute(context): if len(df_partial) > 0: df_ban.append(df_partial) - + df_ban = pd.concat(df_ban) df_ban = gpd.GeoDataFrame( - df_ban, geometry = gpd.points_from_xy(df_ban.x, df_ban.y), crs = "EPSG:2154") - + df_ban, geometry=gpd.points_from_xy(df_ban.x, df_ban.y), crs="EPSG:2154" + ) + # Check that we cover all requested departments at least once for department_id in requested_departments: assert np.count_nonzero(df_ban["department_id"] == department_id) > 0 return df_ban[["geometry"]] + def find_ban(path): candidates = sorted(list(glob.glob("{}/*.csv.gz".format(path)))) if len(candidates) == 0: raise RuntimeError("BAN data is not available in {}".format(path)) - + return candidates + def validate(context): - paths = find_ban("{}/{}".format(context.config("data_path"), context.config("ban_path"))) + paths = find_ban( + "{}/{}".format(context.config("data_path"), context.config("ban_path")) + ) return sum([os.path.getsize(path) for path in paths]) diff --git a/data/bdtopo/output.py b/data/bdtopo/output.py index 214fd465..c1bb95c8 100644 --- a/data/bdtopo/output.py +++ b/data/bdtopo/output.py @@ -1,14 +1,17 @@ import geopandas as gpd + def configure(context): context.config("output_path") context.config("output_prefix", "ile_de_france_") context.stage("data.bdtopo.raw") + def execute(context): df_buildings = context.stage("data.bdtopo.raw") - df_buildings.to_file("%s/%sbdtopo.gpkg" % ( - context.config("output_path"), context.config("output_prefix") - )) + df_buildings.to_file( + "%s/%sbdtopo.gpkg" + % (context.config("output_path"), context.config("output_prefix")) + ) diff --git a/data/bdtopo/raw.py b/data/bdtopo/raw.py index 354545ec..e794f895 100644 --- a/data/bdtopo/raw.py +++ b/data/bdtopo/raw.py @@ -11,13 +11,15 @@ """ This stage loads the raw data from the French building registry (BD-TOPO). 
""" - + + def configure(context): context.config("data_path") context.config("bdtopo_path", "bdtopo_idf") context.stage("data.spatial.departments") + def get_department_string(department_id): department_id = str(department_id) @@ -28,11 +30,14 @@ def get_department_string(department_id): else: raise RuntimeError("Department identifier should have at least two characters") + def execute(context): df_departments = context.stage("data.spatial.departments") print("Expecting data for {} departments".format(len(df_departments))) - - source_paths = find_bdtopo("{}/{}".format(context.config("data_path"), context.config("bdtopo_path"))) + + source_paths = find_bdtopo( + "{}/{}".format(context.config("data_path"), context.config("bdtopo_path")) + ) df_bdtopo = [] known_ids = set() @@ -43,8 +48,10 @@ def execute(context): with py7zr.SevenZipFile(source_path) as archive: # Find the path inside the archive - internal_path = [path for path in archive.getnames() if path.endswith(".gpkg")] - + internal_path = [ + path for path in archive.getnames() if path.endswith(".gpkg") + ] + if len(internal_path) != 1: print(" Skipping: No unambiguous geometry source found!") @@ -54,20 +61,26 @@ def execute(context): geometry_path = "{}/{}".format(context.path(), internal_path[0]) if geometry_path is not None: - with context.progress(label = " Reading ...") as progress: - data = { "cleabs": [], "nombre_de_logements": [], "geometry": [] } - with fiona.open(geometry_path, layer = "batiment") as package: + with context.progress(label=" Reading ...") as progress: + data = {"cleabs": [], "nombre_de_logements": [], "geometry": []} + with fiona.open(geometry_path, layer="batiment") as package: for item in package: data["cleabs"].append(item["properties"]["cleabs"]) - data["nombre_de_logements"].append(item["properties"]["nombre_de_logements"]) + data["nombre_de_logements"].append( + item["properties"]["nombre_de_logements"] + ) data["geometry"].append(geo.shape(item["geometry"])) progress.update() df_buildings = pd.DataFrame(data) - df_buildings = gpd.GeoDataFrame(df_buildings, crs = "EPSG:2154") - - df_buildings["building_id"] = df_buildings["cleabs"].apply(lambda x: int(x[8:])) - df_buildings["housing"] = df_buildings["nombre_de_logements"].fillna(0).astype(int) + df_buildings = gpd.GeoDataFrame(df_buildings, crs="EPSG:2154") + + df_buildings["building_id"] = df_buildings["cleabs"].apply( + lambda x: int(x[8:]) + ) + df_buildings["housing"] = ( + df_buildings["nombre_de_logements"].fillna(0).astype(int) + ) df_buildings["centroid"] = df_buildings["geometry"].centroid df_buildings = df_buildings.set_geometry("centroid") @@ -77,22 +90,36 @@ def execute(context): initial_count = len(df_buildings) df_buildings = df_buildings[df_buildings["housing"] > 0] final_count = len(df_buildings) - print(" {}/{} filtered by dwellings".format(initial_count - final_count, initial_count)) + print( + " {}/{} filtered by dwellings".format( + initial_count - final_count, initial_count + ) + ) initial_count = len(df_buildings) df_buildings = df_buildings[~df_buildings["building_id"].isin(known_ids)] final_count = len(df_buildings) - print(" {}/{} filtered duplicates".format(initial_count - final_count, initial_count)) + print( + " {}/{} filtered duplicates".format( + initial_count - final_count, initial_count + ) + ) initial_count = len(df_buildings) - df_buildings = gpd.sjoin(df_buildings, df_departments, predicate = "within") + df_buildings = gpd.sjoin(df_buildings, df_departments, predicate="within") final_count = len(df_buildings) - print(" 
{}/{} filtered spatially".format(initial_count - final_count, initial_count)) + print( + " {}/{} filtered spatially".format( + initial_count - final_count, initial_count + ) + ) df_buildings["department_id"] = df_buildings["departement_id"] df_buildings = df_buildings.set_geometry("geometry") - df_bdtopo.append(df_buildings[["building_id", "housing", "department_id", "geometry"]]) + df_bdtopo.append( + df_buildings[["building_id", "housing", "department_id", "geometry"]] + ) known_ids |= set(df_buildings["building_id"].unique()) os.remove(geometry_path) @@ -104,14 +131,18 @@ def execute(context): return df_bdtopo[["building_id", "housing", "geometry"]] + def find_bdtopo(path): candidates = sorted(list(glob.glob("{}/*.7z".format(path)))) if len(candidates) == 0: raise RuntimeError("BD TOPO data is not available in {}".format(path)) - + return candidates + def validate(context): - paths = find_bdtopo("{}/{}".format(context.config("data_path"), context.config("bdtopo_path"))) + paths = find_bdtopo( + "{}/{}".format(context.config("data_path"), context.config("bdtopo_path")) + ) return sum([os.path.getsize(path) for path in paths]) diff --git a/data/bpe/cleaned.py b/data/bpe/cleaned.py index 30e1cad3..9797729e 100644 --- a/data/bpe/cleaned.py +++ b/data/bpe/cleaned.py @@ -10,6 +10,7 @@ - Simplify activity types for all enterprises """ + def configure(context): context.stage("data.bpe.raw") @@ -18,32 +19,38 @@ def configure(context): context.config("bpe_random_seed", 0) + ACTIVITY_TYPE_MAP = [ - ("A", "other"), # Police, post office, etc ... - ("A504", "leisure"), # Restaurant - ("B", "shop"), # Shopping - ("C", "education"), # Education - ("D", "other"), # Health - ("E", "other"), # Transport - ("F", "leisure"), # Sports & Culture - ("G", "other"), # Tourism, hotels, etc. (Hôtel = G102) + ("A", "other"), # Police, post office, etc ... + ("A504", "leisure"), # Restaurant + ("B", "shop"), # Shopping + ("C", "education"), # Education + ("D", "other"), # Health + ("E", "other"), # Transport + ("F", "leisure"), # Sports & Culture + ("G", "other"), # Tourism, hotels, etc. 
(Hôtel = G102)
 ]
 
+
 def find_outside(context, commune_id):
     df_municipalities = context.data("df_municipalities")
     df = context.data("df")
     df = df[df["commune_id"] == commune_id]
 
-    zone = df_municipalities[df_municipalities["commune_id"] == commune_id]["geometry"].values[0]
+    zone = df_municipalities[df_municipalities["commune_id"] == commune_id][
+        "geometry"
+    ].values[0]
 
     indices = [
-        index for index, x, y in df[["x", "y"]].itertuples()
+        index
+        for index, x, y in df[["x", "y"]].itertuples()
         if not zone.contains(geo.Point(x, y))
     ]
 
     context.progress.update()
     return indices
 
+
 def execute(context):
     df = context.stage("data.bpe.raw")
 
@@ -57,9 +64,9 @@ def execute(context):
 
     df["activity_type"] = df["activity_type"].astype("category")
 
-    #Add
-    df = df.rename(columns={"TYPEQU":"education_type"})
-    df["weight"] = 500
+    # Add
+    df = df.rename(columns={"TYPEQU": "education_type"})
+    df["weight"] = 500
     # Clean coordinates
     df["x"] = df["LAMBERT_X"].astype(str).str.replace(",", ".").astype(float)
     df["y"] = df["LAMBERT_Y"].astype(str).str.replace(",", ".").astype(float)
@@ -77,20 +84,29 @@ def execute(context):
 
     df["commune_id"] = df["DEPCOM"].astype("category")
 
-    print("Found %d/%d (%.2f%%) observations without IRIS" % (
-        (df["iris_id"] == "undefined").sum(), len(df), 100 * (df["iris_id"] == "undefined").mean()
-    ))
+    print(
+        "Found %d/%d (%.2f%%) observations without IRIS"
+        % (
+            (df["iris_id"] == "undefined").sum(),
+            len(df),
+            100 * (df["iris_id"] == "undefined").mean(),
+        )
+    )
 
     # Check whether all communes in BPE are within our set of requested data
     df_municipalities = context.stage("data.spatial.municipalities")
 
-    excess_communes = set(df["commune_id"].unique()) - set(df_municipalities["commune_id"].unique())
+    excess_communes = set(df["commune_id"].unique()) - set(
+        df_municipalities["commune_id"].unique()
+    )
 
     if len(excess_communes) > 0:
         raise RuntimeError("Found additional communes: %s" % excess_communes)
 
     # We notice that we have some additional IRIS. Make sure they will be placed randomly in their commune later. 
df_iris = context.stage("data.spatial.iris") - excess_iris = set(df[df["iris_id"] != "undefined"]["iris_id"].unique()) - set(df_iris["iris_id"].unique()) + excess_iris = set(df[df["iris_id"] != "undefined"]["iris_id"].unique()) - set( + df_iris["iris_id"].unique() + ) df.loc[df["iris_id"].isin(excess_iris), "iris_id"] = "undefined" print("Excess IRIS without valid code:", excess_iris) @@ -100,19 +116,42 @@ def execute(context): f_undefined = df["iris_id"] == "undefined" f_missing = df["x"].isna() - print("Found %d/%d (%.2f%%) observations without coordinate" % ( - ((f_missing & ~f_undefined).sum(), len(df), 100 * (f_missing & ~f_undefined).mean() - ))) + print( + "Found %d/%d (%.2f%%) observations without coordinate" + % ( + ( + (f_missing & ~f_undefined).sum(), + len(df), + 100 * (f_missing & ~f_undefined).mean(), + ) + ) + ) if np.count_nonzero(f_missing & ~f_undefined) > 0: # Impute missing coordinates for known IRIS - df.update(spatial_utils.sample_from_zones( - context, df_iris, df[f_missing & ~f_undefined], "iris_id", random, label = "Imputing IRIS coordinates ...")) + df.update( + spatial_utils.sample_from_zones( + context, + df_iris, + df[f_missing & ~f_undefined], + "iris_id", + random, + label="Imputing IRIS coordinates ...", + ) + ) if np.count_nonzero(f_missing & f_undefined) > 0: # Impute missing coordinates for unknown IRIS - df.update(spatial_utils.sample_from_zones( - context, df_municipalities, df[f_missing & f_undefined], "commune_id", random, label = "Imputing municipality coordinates ...")) + df.update( + spatial_utils.sample_from_zones( + context, + df_municipalities, + df[f_missing & f_undefined], + "commune_id", + random, + label="Imputing municipality coordinates ...", + ) + ) # Consolidate df["imputed"] = f_missing @@ -122,8 +161,12 @@ def execute(context): # the respective municipality. Find them and move them back in. 
outside_indices = [] - with context.progress(label = "Finding outside observations ...", total = len(df["commune_id"].unique())): - with context.parallel(dict(df = df, df_municipalities = df_municipalities)) as parallel: + with context.progress( + label="Finding outside observations ...", total=len(df["commune_id"].unique()) + ): + with context.parallel( + dict(df=df, df_municipalities=df_municipalities) + ) as parallel: for partial in parallel.imap(find_outside, df["commune_id"].unique()): outside_indices += partial @@ -131,14 +174,33 @@ def execute(context): df.loc[outside_indices, "x"] = np.nan df.loc[outside_indices, "y"] = np.nan - df.update(spatial_utils.sample_from_zones( - context, df_municipalities, df.loc[outside_indices], "commune_id", random, label = "Fixing outside locations ...")) + df.update( + spatial_utils.sample_from_zones( + context, + df_municipalities, + df.loc[outside_indices], + "commune_id", + random, + label="Fixing outside locations ...", + ) + ) df.loc[outside_indices, "imputed"] = True # Package up data set - df = df[["enterprise_id", "activity_type","education_type", "commune_id", "imputed", "x", "y","weight"]] + df = df[ + [ + "enterprise_id", + "activity_type", + "education_type", + "commune_id", + "imputed", + "x", + "y", + "weight", + ] + ] - df = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.x, df.y),crs="EPSG:2154") + df = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.x, df.y), crs="EPSG:2154") return df diff --git a/data/bpe/raw.py b/data/bpe/raw.py index 98135631..95429e10 100644 --- a/data/bpe/raw.py +++ b/data/bpe/raw.py @@ -6,27 +6,38 @@ This stage loads the raw data from the French service registry. """ + def configure(context): context.config("data_path") context.config("bpe_path", "bpe_2021/bpe21_ensemble_xy_csv.zip") context.config("bpe_csv", "bpe21_ensemble_xy.csv") context.stage("data.spatial.codes") + def execute(context): df_records = [] df_codes = context.stage("data.spatial.codes") requested_departements = df_codes["departement_id"].unique() - with context.progress(label = "Reading BPE ...") as progress: - with zipfile.ZipFile("{}/{}".format(context.config("data_path"), context.config("bpe_path"))) as archive: + with context.progress(label="Reading BPE ...") as progress: + with zipfile.ZipFile( + "{}/{}".format(context.config("data_path"), context.config("bpe_path")) + ) as archive: with archive.open(context.config("bpe_csv")) as f: - csv = pd.read_csv(f, usecols = [ - "DCIRIS", "LAMBERT_X", "LAMBERT_Y", - "TYPEQU", "DEPCOM", "DEP" - ], sep = ";", - dtype = dict(DEPCOM = str, DEP = str, DCIRIS = str), - chunksize = 10240 + csv = pd.read_csv( + f, + usecols=[ + "DCIRIS", + "LAMBERT_X", + "LAMBERT_Y", + "TYPEQU", + "DEPCOM", + "DEP", + ], + sep=";", + dtype=dict(DEPCOM=str, DEP=str, DCIRIS=str), + chunksize=10240, ) for df_chunk in csv: @@ -39,8 +50,13 @@ def execute(context): return pd.concat(df_records) + def validate(context): - if not os.path.exists("%s/%s" % (context.config("data_path"), context.config("bpe_path"))): + if not os.path.exists( + "%s/%s" % (context.config("data_path"), context.config("bpe_path")) + ): raise RuntimeError("BPE data is not available") - return os.path.getsize("%s/%s" % (context.config("data_path"), context.config("bpe_path"))) + return os.path.getsize( + "%s/%s" % (context.config("data_path"), context.config("bpe_path")) + ) diff --git a/data/census/cleaned.py b/data/census/cleaned.py index 789d0adb..1d4d4f95 100644 --- a/data/census/cleaned.py +++ b/data/census/cleaned.py @@ -9,6 +9,7 @@ - Clean 
up spatial information and sociodemographic attributes """ + def configure(context): context.stage("data.census.raw") context.stage("data.spatial.codes") @@ -16,27 +17,32 @@ def configure(context): if context.config("use_urban_type", False): context.stage("data.spatial.urban_type") + def execute(context): df = context.stage("data.census.raw") # Construct household IDs for persons with NUMMI != Z df_household_ids = df[["CANTVILLE", "NUMMI"]] df_household_ids = df_household_ids[df_household_ids["NUMMI"] != "Z"] - df_household_ids["temporary"] = df_household_ids["CANTVILLE"] + df_household_ids["NUMMI"] + df_household_ids["temporary"] = ( + df_household_ids["CANTVILLE"] + df_household_ids["NUMMI"] + ) df_household_ids = df_household_ids.drop_duplicates("temporary") df_household_ids["household_id"] = np.arange(len(df_household_ids)) - df = pd.merge(df, df_household_ids, on = ["CANTVILLE", "NUMMI"], how = "left") + df = pd.merge(df, df_household_ids, on=["CANTVILLE", "NUMMI"], how="left") # Fill up undefined household ids (those where NUMMI == Z) f = np.isnan(df["household_id"]) - df.loc[f, "household_id"] = np.arange(np.count_nonzero(f)) + df["household_id"].max() + 1 + df.loc[f, "household_id"] = ( + np.arange(np.count_nonzero(f)) + df["household_id"].max() + 1 + ) df["household_id"] = df["household_id"].astype(int) # Put person IDs df["person_id"] = np.arange(len(df)) # Sorting - df = df.sort_values(by = ["household_id", "person_id"]) + df = df.sort_values(by=["household_id", "person_id"]) # Spatial information df["departement_id"] = df["DEPT"].astype("category") @@ -52,7 +58,9 @@ def execute(context): df["iris_id"] = df["iris_id"].astype("category") # Age - df["age"] = df["AGED"].apply(lambda x: "0" if x == "000" else x.lstrip("0")).astype(int) + df["age"] = ( + df["AGED"].apply(lambda x: "0" if x == "000" else x.lstrip("0")).astype(int) + ) # Clean COUPLE df["couple"] = df["COUPLE"] == "1" @@ -81,42 +89,63 @@ def execute(context): df["studies"] = df["ETUD"] == "1" # Number of vehicles - df["number_of_vehicles"] = df["VOIT"].apply( - lambda x: str(x).replace("Z", "0").replace("X", "0") - ).astype(int) - - df["number_of_vehicles"] += df["DEROU"].apply( - lambda x: str(x).replace("U", "0").replace("Z", "0").replace("X", "0") - ).astype(int) + df["number_of_vehicles"] = ( + df["VOIT"] + .apply(lambda x: str(x).replace("Z", "0").replace("X", "0")) + .astype(int) + ) + + df["number_of_vehicles"] += ( + df["DEROU"] + .apply(lambda x: str(x).replace("U", "0").replace("Z", "0").replace("X", "0")) + .astype(int) + ) # Household size - df_size = df[["household_id"]].groupby("household_id").size().reset_index(name = "household_size") + df_size = ( + df[["household_id"]] + .groupby("household_id") + .size() + .reset_index(name="household_size") + ) df = pd.merge(df, df_size) # Socioprofessional category df["socioprofessional_class"] = df["CS1"].astype(int) # Consumption units - df = pd.merge(df, hts.calculate_consumption_units(df), on = "household_id") - - df = df[[ - "person_id", "household_id", "weight", - "iris_id", "commune_id", "departement_id", - "age", "sex", "couple", - "commute_mode", "employed", - "studies", "number_of_vehicles", "household_size", - "consumption_units", "socioprofessional_class" - ]] + df = pd.merge(df, hts.calculate_consumption_units(df), on="household_id") + + df = df[ + [ + "person_id", + "household_id", + "weight", + "iris_id", + "commune_id", + "departement_id", + "age", + "sex", + "couple", + "commute_mode", + "employed", + "studies", + "number_of_vehicles", + 
"household_size", + "consumption_units", + "socioprofessional_class", + ] + ] if context.config("use_urban_type"): - df_urban_type = context.stage("data.spatial.urban_type")[[ - "commune_id", "urban_type" - ]] - + df_urban_type = context.stage("data.spatial.urban_type")[ + ["commune_id", "urban_type"] + ] + # Impute urban type - df = pd.merge(df, df_urban_type, on = "commune_id", how = "left") + df = pd.merge(df, df_urban_type, on="commune_id", how="left") df.loc[df["commune_id"] == "undefined", "urban_type"] = "none" df["commune_id"] = df["commune_id"].astype("category") - assert ~np.any(df["urban_type"].isna()) + assert ~np.any(df["urban_type"].isna()) return df diff --git a/data/census/filtered.py b/data/census/filtered.py index ecd3bdcd..ac1f5771 100644 --- a/data/census/filtered.py +++ b/data/census/filtered.py @@ -7,10 +7,12 @@ Île-de-France. """ + def configure(context): context.stage("data.census.cleaned") context.stage("data.spatial.codes") + def execute(context): df = context.stage("data.census.cleaned") @@ -20,7 +22,9 @@ def execute(context): requested_departements = df_codes["departement_id"].unique() df = df[df["departement_id"].isin(requested_departements)] - excess_communes = set(df["commune_id"].unique()) - set(df_codes["commune_id"].unique()) + excess_communes = set(df["commune_id"].unique()) - set( + df_codes["commune_id"].unique() + ) if not excess_communes == {"undefined"}: raise RuntimeError("Found additional communes: %s" % excess_communes) diff --git a/data/census/projection.py b/data/census/projection.py index dc9a8f9f..8a26b816 100644 --- a/data/census/projection.py +++ b/data/census/projection.py @@ -5,28 +5,32 @@ This stage loads and cleans projection data about the French population. """ + def configure(context): context.config("data_path") context.config("projection_path", "projection_2021") context.config("projection_scenario", "00_central") context.config("projection_year", None) + def execute(context): source_path = "{}/{}/{}.xlsx".format( - context.config("data_path"), - context.config("projection_path"), - context.config("projection_scenario")) - + context.config("data_path"), + context.config("projection_path"), + context.config("projection_scenario"), + ) + projection_year = int(context.config("projection_year")) - df_all = pd.read_excel( - source_path, sheet_name = "population", skiprows = 1).iloc[:107] - - df_male = pd.read_excel( - source_path, sheet_name = "populationH", skiprows = 1).iloc[:107] - - df_female = pd.read_excel( - source_path, sheet_name = "populationF", skiprows = 1).iloc[:107] + df_all = pd.read_excel(source_path, sheet_name="population", skiprows=1).iloc[:107] + + df_male = pd.read_excel(source_path, sheet_name="populationH", skiprows=1).iloc[ + :107 + ] + + df_female = pd.read_excel(source_path, sheet_name="populationF", skiprows=1).iloc[ + :107 + ] df_male["sex"] = "male" df_female["sex"] = "female" @@ -35,10 +39,9 @@ def execute(context): assert df_male["Âge au 1er janvier"].iloc[-1] == "Total des hommes" assert df_female["Âge au 1er janvier"].iloc[-1] == "Total des femmes" - df_sex = pd.concat([ - df_male.iloc[-1:], - df_female.iloc[-1:] - ]).drop(columns = ["Âge au 1er janvier"])[["sex", projection_year]] + df_sex = pd.concat([df_male.iloc[-1:], df_female.iloc[-1:]]).drop( + columns=["Âge au 1er janvier"] + )[["sex", projection_year]] df_sex.columns = ["sex", "projection"] df_age = df_all[["Âge au 1er janvier", projection_year]].iloc[:-1] @@ -48,28 +51,28 @@ def execute(context): df_female = df_female[["Âge au 1er janvier", 
"sex", projection_year]].iloc[:-1] df_male.columns = ["age", "sex", "projection"] - df_female.columns = ["age","sex", "projection"] + df_female.columns = ["age", "sex", "projection"] df_cross = pd.concat([df_male, df_female]) df_cross["sex"] = df_cross["sex"].astype("category") - df_total = df_all.iloc[-1:].drop(columns = ["Âge au 1er janvier"])[[projection_year]] + df_total = df_all.iloc[-1:].drop(columns=["Âge au 1er janvier"])[[projection_year]] df_total.columns = ["projection"] - return { - "total": df_total, "sex": df_sex, "age": df_age, "cross": df_cross - } + return {"total": df_total, "sex": df_sex, "age": df_age, "cross": df_cross} + def validate(context): if context.config("projection_year") is not None: source_path = "{}/{}/{}.xlsx".format( - context.config("data_path"), - context.config("projection_path"), - context.config("projection_scenario")) + context.config("data_path"), + context.config("projection_path"), + context.config("projection_scenario"), + ) if not os.path.exists(source_path): raise RuntimeError("Projection data is not available") return os.path.getsize(source_path) - + return 0 diff --git a/data/census/raw.py b/data/census/raw.py index 73eebd4a..93099f36 100644 --- a/data/census/raw.py +++ b/data/census/raw.py @@ -6,6 +6,7 @@ This stage loads the raw data from the French population census. """ + def configure(context): context.stage("data.spatial.codes") @@ -15,24 +16,26 @@ def configure(context): context.config("projection_year", None) + COLUMNS_DTYPES = { - "CANTVILLE":"str", - "NUMMI":"str", - "AGED":"str", - "COUPLE":"str", - "CS1":"str", - "DEPT":"str", - "ETUD":"str", - "IPONDI":"str", - "IRIS":"str", - "REGION":"str", - "SEXE":"str", - "TACT":"str", - "TRANS":"str", - "VOIT":"str", - "DEROU":"str" + "CANTVILLE": "str", + "NUMMI": "str", + "AGED": "str", + "COUPLE": "str", + "CS1": "str", + "DEPT": "str", + "ETUD": "str", + "IPONDI": "str", + "IRIS": "str", + "REGION": "str", + "SEXE": "str", + "TACT": "str", + "TRANS": "str", + "VOIT": "str", + "DEROU": "str", } + def execute(context): df_records = [] df_codes = context.stage("data.spatial.codes") @@ -42,20 +45,26 @@ def execute(context): # only pre-filter if we don't need to reweight the census later prefilter_departments = context.config("projection_year") is None - with context.progress(label = "Reading census ...") as progress: + with context.progress(label="Reading census ...") as progress: with zipfile.ZipFile( - "{}/{}".format(context.config("data_path"), context.config("census_path"))) as archive: + "{}/{}".format(context.config("data_path"), context.config("census_path")) + ) as archive: with archive.open(context.config("census_csv")) as f: - csv = pd.read_csv(f, - usecols = COLUMNS_DTYPES.keys(), sep = ";", - dtype = COLUMNS_DTYPES, - chunksize = 10240) - + csv = pd.read_csv( + f, + usecols=COLUMNS_DTYPES.keys(), + sep=";", + dtype=COLUMNS_DTYPES, + chunksize=10240, + ) + for df_chunk in csv: progress.update(len(df_chunk)) - + if prefilter_departments: - df_chunk = df_chunk[df_chunk["DEPT"].isin(requested_departements)] + df_chunk = df_chunk[ + df_chunk["DEPT"].isin(requested_departements) + ] if len(df_chunk) > 0: df_records.append(df_chunk) @@ -64,7 +73,11 @@ def execute(context): def validate(context): - if not os.path.exists("{}/{}".format(context.config("data_path"), context.config("census_path"))): + if not os.path.exists( + "{}/{}".format(context.config("data_path"), context.config("census_path")) + ): raise RuntimeError("RP 2019 data is not available") - return 
os.path.getsize("{}/{}".format(context.config("data_path"), context.config("census_path")))
+    return os.path.getsize(
+        "{}/{}".format(context.config("data_path"), context.config("census_path"))
+    )
diff --git a/data/external/education.py b/data/external/education.py
index 78950ce1..7c384a65 100644
--- a/data/external/education.py
+++ b/data/external/education.py
@@ -3,6 +3,7 @@
 import pandas as pd
 import geopandas as gpd
 
+
 def configure(context):
     context.stage("data.bpe.cleaned")
     context.stage("data.spatial.municipalities")
@@ -10,24 +11,35 @@ def configure(context):
     context.config("data_path")
     context.config("education_file", "education/education_addresses.geojson")
 
+
 def execute(context):
-    df_locations = context.stage("data.bpe.cleaned")[[
-        "activity_type", "education_type", "commune_id","weight", "geometry"
-    ]]
+    df_locations = context.stage("data.bpe.cleaned")[
+        ["activity_type", "education_type", "commune_id", "weight", "geometry"]
+    ]
 
     df_locations = df_locations[df_locations["activity_type"] == "education"]
-    df_locations = df_locations[["activity_type","education_type", "commune_id", "geometry"]].copy()
-    df_locations["fake"] = False
-
-    df_zones = context.stage("data.spatial.municipalities")
-    required_communes = set(df_zones["commune_id"].unique())
+    df_locations = df_locations[
+        ["activity_type", "education_type", "commune_id", "geometry"]
+    ].copy()
+    df_locations["fake"] = False
 
+    df_zones = context.stage("data.spatial.municipalities")
+    required_communes = set(df_zones["commune_id"].unique())
 
-    df_education = gpd.read_file("{}/{}".format(context.config("data_path"), context.config("education_file")))[["education_type", "commune_id","weight", "geometry"]]
+    df_education = gpd.read_file(
+        "{}/{}".format(context.config("data_path"), context.config("education_file"))
+    )[["education_type", "commune_id", "weight", "geometry"]]
 
     df_education["fake"] = False
     df_education = df_education.to_crs("2154")
     df_education["activity_type"] = "education"
 
     list_type = set(df_education["education_type"].unique())
 
-    df_locations = pd.concat([df_locations[~(df_locations["education_type"].str.startswith(tuple(list_type)))],df_education[df_education["commune_id"].isin(required_communes)]])
+    df_locations = pd.concat(
+        [
+            df_locations[
+                ~(df_locations["education_type"].str.startswith(tuple(list_type)))
+            ],
+            df_education[df_education["commune_id"].isin(required_communes)],
+        ]
+    )
 
     return df_locations
diff --git a/data/gtfs/cleaned.py b/data/gtfs/cleaned.py
index 81d0475e..f883fe08 100644
--- a/data/gtfs/cleaned.py
+++ b/data/gtfs/cleaned.py
@@ -6,15 +6,19 @@
 selected regions and departments) and merges them together.
 """
 
+
 def configure(context):
     context.config("data_path")
     context.config("gtfs_path", "gtfs_idf")
 
     context.stage("data.spatial.municipalities")
 
+
 def execute(context):
-    input_files = get_input_files("{}/{}".format(context.config("data_path"), context.config("gtfs_path")))
-
+    input_files = get_input_files(
+        "{}/{}".format(context.config("data_path"), context.config("gtfs_path"))
+    )
+
     # Prepare bounding area
     df_area = context.stage("data.spatial.municipalities")
 
@@ -25,7 +29,9 @@ def execute(context):
         feed = gtfs.cut_feed(feed, df_area)
 
         # This was fixed in pt2matsim, so we can remove it once a new release (> 20.7) is available. 
- feed = gtfs.despace_stop_ids(feed) # Necessary as MATSim does not like stops/links with spaces + feed = gtfs.despace_stop_ids( + feed + ) # Necessary as MATSim does not like stops/links with spaces feeds.append(feed) @@ -34,10 +40,27 @@ def execute(context): # Fix for pt2matsim (will be fixed after PR #173) # Order of week days must be fixed - days = ["monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday"] + days = [ + "monday", + "tuesday", + "wednesday", + "thursday", + "friday", + "saturday", + "sunday", + ] columns = list(merged_feed["calendar"].columns) - for day in days: columns.remove(day) - columns += ["monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday"] + for day in days: + columns.remove(day) + columns += [ + "monday", + "tuesday", + "wednesday", + "thursday", + "friday", + "saturday", + "sunday", + ] merged_feed["calendar"] = merged_feed["calendar"][columns] # Write feed (not as a ZIP, but as files, for pt2matsim) @@ -45,6 +68,7 @@ def execute(context): return "gtfs" + def get_input_files(base_path): gtfs_paths = [ str(child) @@ -54,11 +78,14 @@ def get_input_files(base_path): if len(gtfs_paths) == 0: raise RuntimeError("Did not find any GTFS data (.zip) in {}".format(base_path)) - + return gtfs_paths + def validate(context): - input_files = get_input_files("{}/{}".format(context.config("data_path"), context.config("gtfs_path"))) + input_files = get_input_files( + "{}/{}".format(context.config("data_path"), context.config("gtfs_path")) + ) total_size = 0 for path in input_files: diff --git a/data/gtfs/output.py b/data/gtfs/output.py index 68c98ca9..f9fbf66f 100644 --- a/data/gtfs/output.py +++ b/data/gtfs/output.py @@ -4,18 +4,22 @@ Writes out the consolidated GTFS feed """ + def configure(context): context.config("output_path") context.config("output_prefix") context.stage("data.gtfs.cleaned") + def execute(context): source_path = "%s/output" % context.path("data.gtfs.cleaned") output_path = "%s/%sgtfs.zip" % ( - context.config("output_path"), context.config("output_prefix")) + context.config("output_path"), + context.config("output_prefix"), + ) - f = zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) + f = zipfile.ZipFile(output_path, "w", zipfile.ZIP_DEFLATED) print(source_path) for path in sorted(list(glob.glob("%s/*.txt" % source_path))): diff --git a/data/gtfs/utils.py b/data/gtfs/utils.py index 10585a7c..a95ba1e5 100644 --- a/data/gtfs/utils.py +++ b/data/gtfs/utils.py @@ -5,16 +5,24 @@ import os import numpy as np -REQUIRED_SLOTS = [ - "agency", "stops", "routes", "trips", "stop_times" -] +REQUIRED_SLOTS = ["agency", "stops", "routes", "trips", "stop_times"] OPTIONAL_SLOTS = [ - "calendar", "calendar_dates", "fare_attributes", "fare_rules", - "shapes", "frequencies", "transfers", "pathways", "levels", - "feed_info", "translations", "attributions" + "calendar", + "calendar_dates", + "fare_attributes", + "fare_rules", + "shapes", + "frequencies", + "transfers", + "pathways", + "levels", + "feed_info", + "translations", + "attributions", ] + def read_feed(path): feed = {} @@ -38,8 +46,13 @@ def read_feed(path): if not "%s%s.txt" % (prefix, slot) in available_slots: raise RuntimeError("Missing GTFS information: %s" % slot) - if not "%scalendar.txt" % prefix in available_slots and not "%scalendar_dates.txt" % prefix in available_slots: - raise RuntimeError("At least calendar.txt or calendar_dates.txt must be specified.") + if ( + not "%scalendar.txt" % prefix in available_slots + and not "%scalendar_dates.txt" % prefix in 
available_slots + ): + raise RuntimeError( + "At least calendar.txt or calendar_dates.txt must be specified." + ) print("Loading GTFS data from %s ..." % path) @@ -48,22 +61,27 @@ def read_feed(path): print(" Loading %s.txt ..." % slot) with zip.open("%s%s.txt" % (prefix, slot)) as f: - feed[slot] = pd.read_csv(f, skipinitialspace = True) + feed[slot] = pd.read_csv(f, skipinitialspace=True) else: print(" Not loading %s.txt" % slot) # Some cleanup for slot in ("calendar", "calendar_dates", "trips"): - if slot in feed and "service_id" in feed[slot] and pd.api.types.is_string_dtype(feed[slot]["service_id"]): + if ( + slot in feed + and "service_id" in feed[slot] + and pd.api.types.is_string_dtype(feed[slot]["service_id"]) + ): initial_count = len(feed[slot]) feed[slot] = feed[slot][feed[slot]["service_id"].str.len() > 0] final_count = len(feed[slot]) if final_count != initial_count: - print("WARNING Removed %d/%d entries from %s with empty service_id" % ( - initial_count - final_count, initial_count, slot - )) + print( + "WARNING Removed %d/%d entries from %s with empty service_id" + % (initial_count - final_count, initial_count, slot) + ) if "stops" in feed: df_stops = feed["stops"] @@ -83,7 +101,9 @@ def read_feed(path): print("WARNING NaN numbers for min_transfer_time in transfers") df_transfers = df_transfers[~f] - df_transfers["min_transfer_time"] = df_transfers["min_transfer_time"].astype(int) + df_transfers["min_transfer_time"] = df_transfers["min_transfer_time"].astype( + int + ) feed["transfers"] = df_transfers if "agency" in feed: @@ -99,17 +119,19 @@ def read_feed(path): df_routes.loc[df_routes["agency_id"].isna(), "agency_id"] = agency_id - if "shapes" in feed: del feed["shapes"] + if "shapes" in feed: + del feed["shapes"] feed["trips"]["shape_id"] = np.nan # Fixes for Nantes PDL for item in feed.keys(): - feed[item] = feed[item].drop(columns = [ - c for c in feed[item].columns if c.startswith("ext_") - ]) + feed[item] = feed[item].drop( + columns=[c for c in feed[item].columns if c.startswith("ext_")] + ) return feed + def write_feed(feed, path): print("Writing GTFS data to %s ..." % path) @@ -121,7 +143,7 @@ def write_feed(feed, path): # We cannot write directly to the file handle as it # is binary, but pandas only writes in text mode. - zip.writestr("%s.txt" % slot, feed[slot].to_csv(index = None)) + zip.writestr("%s.txt" % slot, feed[slot].to_csv(index=None)) else: if not os.path.exists(path): @@ -134,9 +156,10 @@ def write_feed(feed, path): if slot in feed: with open("%s/%s.txt" % (path, slot), "w+", encoding="utf-8") as f: print(" Writing %s.txt ..." 
% slot) - feed[slot].to_csv(f, index = None, lineterminator='\n') + feed[slot].to_csv(f, index=None, lineterminator="\n") + -def cut_feed(feed, df_area, crs = None): +def cut_feed(feed, df_area, crs=None): feed = copy_feed(feed) df_stops = feed["stops"] @@ -148,11 +171,10 @@ def cut_feed(feed, df_area, crs = None): df_stations = df_stops[df_stops["location_type"] == 1].copy() df_stations["geometry"] = [ - geo.Point(*xy) - for xy in zip(df_stations["stop_lon"], df_stations["stop_lat"]) + geo.Point(*xy) for xy in zip(df_stations["stop_lon"], df_stations["stop_lat"]) ] - df_stations = gpd.GeoDataFrame(df_stations, crs = "EPSG:4326") + df_stations = gpd.GeoDataFrame(df_stations, crs="EPSG:4326") if not crs is None: print("Converting stops to custom CRS", crs) @@ -164,20 +186,22 @@ def cut_feed(feed, df_area, crs = None): print("Filtering stations ...") initial_count = len(df_stations) - df_stations = gpd.sjoin(df_stations, df_area, predicate = "within") + df_stations = gpd.sjoin(df_stations, df_area, predicate="within") final_count = len(df_stations) - print("Found %d/%d stations inside the specified area" % (final_count, initial_count)) + print( + "Found %d/%d stations inside the specified area" % (final_count, initial_count) + ) inside_stations = df_stations["stop_id"] # 1) Remove stations that are not inside stations and not have a parent stop df_stops = feed["stops"] df_stops = df_stops[ - df_stops["parent_station"].isin(inside_stations) | - ( - df_stops["parent_station"].isna() & - df_stops["stop_id"].isin(inside_stations) + df_stops["parent_station"].isin(inside_stations) + | ( + df_stops["parent_station"].isna() + & df_stops["stop_id"].isin(inside_stations) ) ] @@ -186,15 +210,17 @@ def cut_feed(feed, df_area, crs = None): # 2) Remove stop times df_times = feed["stop_times"] - df_times = df_times[df_times["stop_id"].astype(str).isin(remaining_stops.astype(str))] + df_times = df_times[ + df_times["stop_id"].astype(str).isin(remaining_stops.astype(str)) + ] feed["stop_times"] = df_times.copy() # 3) Remove transfers if "transfers" in feed: df_transfers = feed["transfers"] df_transfers = df_transfers[ - df_transfers["from_stop_id"].isin(remaining_stops) & - df_transfers["to_stop_id"].isin(remaining_stops) + df_transfers["from_stop_id"].isin(remaining_stops) + & df_transfers["to_stop_id"].isin(remaining_stops) ] feed["transfers"] = df_transfers.copy() @@ -202,8 +228,8 @@ def cut_feed(feed, df_area, crs = None): if "pathways" in feed: df_pathways = feed["pathways"] df_pathways = df_pathways[ - df_pathways["from_stop_id"].isin(remaining_stops) & - df_pathways["to_stop_id"].isin(remaining_stops) + df_pathways["from_stop_id"].isin(remaining_stops) + & df_pathways["to_stop_id"].isin(remaining_stops) ] feed["pathways"] = df_pathways.copy() @@ -212,9 +238,7 @@ def cut_feed(feed, df_area, crs = None): remaining_trips = trip_counts[trip_counts > 1].index.values df_trips = feed["trips"] - df_trips = df_trips[ - df_trips["trip_id"].isin(remaining_trips) - ] + df_trips = df_trips[df_trips["trip_id"].isin(remaining_trips)] feed["trips"] = df_trips.copy() feed["stop_times"] = feed["stop_times"][ @@ -224,44 +248,73 @@ def cut_feed(feed, df_area, crs = None): # 6) Remove frequencies if "frequencies" in feed: df_frequencies = feed["frequencies"] - df_frequencies = df_frequencies[ - df_frequencies["trip_id"].isin(remaining_trips) - ] + df_frequencies = df_frequencies[df_frequencies["trip_id"].isin(remaining_trips)] feed["frequencies"] = df_frequencies.copy() return feed + SLOT_COLLISIONS = [ - { "slot": 
"agency", "identifier": "agency_id", "references": [ - ("routes", "agency_id"), ("fare_attributes", "agency_id")] }, - { "slot": "stops", "identifier": "stop_id", "references": [ - ("stops", "parent_station"), ("stop_times", "stop_id"), - ("transfers", "from_stop_id"), ("transfers", "to_stop_id"), - ("pathways", "from_stop_id"), ("pathways", "to_stop_id")] }, - { "slot": "routes", "identifier": "route_id", "references": [ - ("trips", "route_id"), ("fare_rules", "route_id"), - ("attributions", "route_id")] }, - { "slot": "trips", "identifier": "trip_id", "references": [ - ("stop_times", "trip_id"), ("frequencies", "trip_id"), - ("attributions", "trip_id")] }, - { "slot": "calendar", "identifier": "service_id", "references": [ - ("calendar_dates", "service_id"), ("trips", "service_id")] }, - { "slot": "calendar_dates", "identifier": "service_id", "references": [ - ("trips", "service_id"), ("calendar", "service_id")] }, - { "slot": "fare_attributes", "identifier": "fare_id", "references": [ - ("fare_rules", "fare_id")] }, - { "slot": "shapes", "identifier": "shape_id", "references": [ - ("trips", "shape_id")] }, - { "slot": "pathways", "identifier": "pathway_id", "references": [] }, - { "slot": "levels", "identifier": "level_id", "references": [ - ("stops", "level_id")] }, - { "slot": "attributions", "identifier": "attribution_id" }, + { + "slot": "agency", + "identifier": "agency_id", + "references": [("routes", "agency_id"), ("fare_attributes", "agency_id")], + }, + { + "slot": "stops", + "identifier": "stop_id", + "references": [ + ("stops", "parent_station"), + ("stop_times", "stop_id"), + ("transfers", "from_stop_id"), + ("transfers", "to_stop_id"), + ("pathways", "from_stop_id"), + ("pathways", "to_stop_id"), + ], + }, + { + "slot": "routes", + "identifier": "route_id", + "references": [ + ("trips", "route_id"), + ("fare_rules", "route_id"), + ("attributions", "route_id"), + ], + }, + { + "slot": "trips", + "identifier": "trip_id", + "references": [ + ("stop_times", "trip_id"), + ("frequencies", "trip_id"), + ("attributions", "trip_id"), + ], + }, + { + "slot": "calendar", + "identifier": "service_id", + "references": [("calendar_dates", "service_id"), ("trips", "service_id")], + }, + { + "slot": "calendar_dates", + "identifier": "service_id", + "references": [("trips", "service_id"), ("calendar", "service_id")], + }, + { + "slot": "fare_attributes", + "identifier": "fare_id", + "references": [("fare_rules", "fare_id")], + }, + {"slot": "shapes", "identifier": "shape_id", "references": [("trips", "shape_id")]}, + {"slot": "pathways", "identifier": "pathway_id", "references": []}, + {"slot": "levels", "identifier": "level_id", "references": [("stops", "level_id")]}, + {"slot": "attributions", "identifier": "attribution_id"}, ] + def copy_feed(feed): - return { - slot: feed[slot].copy() for slot in feed - } + return {slot: feed[slot].copy() for slot in feed} + def merge_feeds(feeds): result = {} @@ -271,7 +324,8 @@ def merge_feeds(feeds): return result -def merge_two_feeds(first, second, suffix = "_merged"): + +def merge_two_feeds(first, second, suffix="_merged"): feed = {} print("Merging GTFS data ...") @@ -284,35 +338,52 @@ def merge_two_feeds(first, second, suffix = "_merged"): df_first = first[collision["slot"]] df_second = second[collision["slot"]] - df_first[collision["identifier"]] = df_first[collision["identifier"]].astype(str) - df_second[collision["identifier"]] = df_second[collision["identifier"]].astype(str) - - df_concat = pd.concat([df_first, df_second], sort = 
True).drop_duplicates() - duplicate_ids = list(df_concat[df_concat[collision["identifier"]].duplicated()][ - collision["identifier"]].astype(str).unique()) + df_first[collision["identifier"]] = df_first[ + collision["identifier"] + ].astype(str) + df_second[collision["identifier"]] = df_second[ + collision["identifier"] + ].astype(str) + + df_concat = pd.concat([df_first, df_second], sort=True).drop_duplicates() + duplicate_ids = list( + df_concat[df_concat[collision["identifier"]].duplicated()][ + collision["identifier"] + ] + .astype(str) + .unique() + ) if len(duplicate_ids) > 0: - print(" Found %d duplicate identifiers in %s" % ( - len(duplicate_ids), collision["slot"])) + print( + " Found %d duplicate identifiers in %s" + % (len(duplicate_ids), collision["slot"]) + ) replacement_ids = [str(id) + suffix for id in duplicate_ids] - df_second[collision["identifier"]] = df_second[collision["identifier"]].replace( - duplicate_ids, replacement_ids - ) + df_second[collision["identifier"]] = df_second[ + collision["identifier"] + ].replace(duplicate_ids, replacement_ids) for ref_slot, ref_identifier in collision["references"]: if ref_slot in first and ref_slot in second: - first[ref_slot][ref_identifier] = first[ref_slot][ref_identifier].astype(str) - second[ref_slot][ref_identifier] = second[ref_slot][ref_identifier].astype(str) + first[ref_slot][ref_identifier] = first[ref_slot][ + ref_identifier + ].astype(str) + second[ref_slot][ref_identifier] = second[ref_slot][ + ref_identifier + ].astype(str) - second[ref_slot][ref_identifier] = second[ref_slot][ref_identifier].replace( - duplicate_ids, replacement_ids - ) + second[ref_slot][ref_identifier] = second[ref_slot][ + ref_identifier + ].replace(duplicate_ids, replacement_ids) for slot in REQUIRED_SLOTS + OPTIONAL_SLOTS: if slot in first and slot in second: - feed[slot] = pd.concat([first[slot], second[slot]], sort = True).drop_duplicates() + feed[slot] = pd.concat( + [first[slot], second[slot]], sort=True + ).drop_duplicates() elif slot in first: feed[slot] = first[slot].copy() elif slot in second: @@ -320,7 +391,8 @@ def merge_two_feeds(first, second, suffix = "_merged"): return feed -def despace_stop_ids(feed, replacement = ":::"): + +def despace_stop_ids(feed, replacement=":::"): feed = copy_feed(feed) references = None @@ -332,14 +404,20 @@ def despace_stop_ids(feed, replacement = ":::"): df_stops = feed["stops"] df_stops["stop_id"] = df_stops["stop_id"].astype(str) - search_ids = list(df_stops[df_stops["stop_id"].str.contains(" ")]["stop_id"].unique()) + search_ids = list( + df_stops[df_stops["stop_id"].str.contains(" ")]["stop_id"].unique() + ) replacement_ids = [item.replace(" ", replacement) for item in search_ids] df_stops["stop_id"] = df_stops["stop_id"].replace(search_ids, replacement_ids) for reference_slot, reference_field in references: if reference_slot in feed: - feed[reference_slot][reference_field] = feed[reference_slot][reference_field].astype(str).replace(search_ids, replacement_ids) + feed[reference_slot][reference_field] = ( + feed[reference_slot][reference_field] + .astype(str) + .replace(search_ids, replacement_ids) + ) print("De-spaced %d/%d stops" % (len(search_ids), len(df_stops))) diff --git a/data/hts/commute_distance.py b/data/hts/commute_distance.py index 2a83893d..249201ef 100644 --- a/data/hts/commute_distance.py +++ b/data/hts/commute_distance.py @@ -1,25 +1,37 @@ import pandas as pd import numpy as np + def configure(context): context.config("random_seed") context.stage("data.hts.selected") + def 
get_commuting_distance(df_persons, df_trips, activity_type, random): if "euclidean_distance" in df_trips: distance_slot = "euclidean_distance" distance_factor = 1.0 else: distance_slot = "routed_distance" - distance_factor = 1.0 # / 1.3 + distance_factor = 1.0 # / 1.3 # Add commuting distances - df_commute_distance = df_trips[ - ((df_trips["preceding_purpose"] == "home") & (df_trips["following_purpose"] == activity_type)) | - ((df_trips["preceding_purpose"] == activity_type) & (df_trips["following_purpose"] == "home")) - ].drop_duplicates("person_id", keep = "first")[["person_id", distance_slot]].rename(columns = { distance_slot: "commute_distance" }) + df_commute_distance = ( + df_trips[ + ( + (df_trips["preceding_purpose"] == "home") + & (df_trips["following_purpose"] == activity_type) + ) + | ( + (df_trips["preceding_purpose"] == activity_type) + & (df_trips["following_purpose"] == "home") + ) + ] + .drop_duplicates("person_id", keep="first")[["person_id", distance_slot]] + .rename(columns={distance_slot: "commute_distance"}) + ) - df_persons = pd.merge(df_persons, df_commute_distance, on = "person_id", how = "left") + df_persons = pd.merge(df_persons, df_commute_distance, on="person_id", how="left") # For the ones without commuting distance, sample from the distribution f_missing = df_persons["commute_distance"].isna() @@ -39,7 +51,7 @@ def get_commuting_distance(df_persons, df_trips, activity_type, random): indices = [ np.searchsorted(cdf, r) - for r in random.random_sample(size = np.count_nonzero(f_missing)) + for r in random.random_sample(size=np.count_nonzero(f_missing)) ] df_persons.loc[f_missing, "commute_distance"] = values[indices] @@ -50,17 +62,19 @@ def get_commuting_distance(df_persons, df_trips, activity_type, random): # Attach euclidean factor df_persons["commute_distance"] *= distance_factor - print("Missing %s commute distances: %.2f%%" % ( - activity_type, 100 * np.count_nonzero(f_missing) / len(f_missing) - )) + print( + "Missing %s commute distances: %.2f%%" + % (activity_type, 100 * np.count_nonzero(f_missing) / len(f_missing)) + ) return df_persons + def execute(context): df_households, df_persons, df_trips = context.stage("data.hts.selected") random = np.random.RandomState(context.config("random_seed")) return dict( - work = get_commuting_distance(df_persons, df_trips, "work", random), - education = get_commuting_distance(df_persons, df_trips, "education", random) + work=get_commuting_distance(df_persons, df_trips, "work", random), + education=get_commuting_distance(df_persons, df_trips, "education", random), ) diff --git a/data/hts/comparison.py b/data/hts/comparison.py index 3b59979a..891d4c41 100644 --- a/data/hts/comparison.py +++ b/data/hts/comparison.py @@ -7,28 +7,34 @@ Comparison of various attributes between EGT, ENTD and census. 
""" + def configure(context): context.stage("data.hts.egt.filtered") context.stage("data.hts.entd.filtered") context.stage("data.census.filtered") + def combine(htss): households, persons, trips = [], [], [] for name, (df_hts_households, df_hts_persons, df_hts_trips) in htss.items(): - df_hts_households = pd.DataFrame(df_hts_households, copy = True) - df_hts_persons = pd.DataFrame(df_hts_persons, copy = True) - df_hts_trips = pd.DataFrame(df_hts_trips, copy = True) + df_hts_households = pd.DataFrame(df_hts_households, copy=True) + df_hts_persons = pd.DataFrame(df_hts_persons, copy=True) + df_hts_trips = pd.DataFrame(df_hts_trips, copy=True) df_hts_households["hts"] = name df_hts_persons["hts"] = name df_hts_trips["hts"] = name if "routed_distance" in df_hts_trips: - df_hts_trips = df_hts_trips.rename(columns = { "routed_distance": "hts_distance" }) + df_hts_trips = df_hts_trips.rename( + columns={"routed_distance": "hts_distance"} + ) df_hts_trips["distance_type"] = "routed" elif "euclidean_distance" in df_hts_trips: - df_hts_trips = df_hts_trips.rename(columns = { "euclidean_distance": "hts_distance" }) + df_hts_trips = df_hts_trips.rename( + columns={"euclidean_distance": "hts_distance"} + ) df_hts_trips["distance_type"] = "euclidean" else: raise RuntimeError("No distance slot available") @@ -39,11 +45,12 @@ def combine(htss): return pd.concat(households), pd.concat(persons), pd.concat(trips) + def execute(context): egt = context.stage("data.hts.egt.filtered") entd = context.stage("data.hts.entd.filtered") - htss = dict(egt = egt, entd = entd) + htss = dict(egt=egt, entd=entd) names = sorted(list(htss.keys())) # Make data set of all HTS @@ -61,41 +68,80 @@ def execute(context): "number_of_households": np.count_nonzero(f_hts_households), "number_of_persons": np.count_nonzero(f_hts_persons), "number_of_trips": np.count_nonzero(f_hts_trips), - "weighted_number_of_households": df_households[f_hts_households]["household_weight"].sum(), - "weighted_number_of_persons": df_persons[f_hts_persons]["person_weight"].sum(), + "weighted_number_of_households": df_households[f_hts_households][ + "household_weight" + ].sum(), + "weighted_number_of_persons": df_persons[f_hts_persons][ + "person_weight" + ].sum(), "weighted_number_of_trips": df_trips[f_hts_trips]["trip_weight"].sum(), - "weighted_number_of_trips_per_mobile_person": (df_persons[f_hts_persons & f_any_trips]["number_of_trips"] * df_persons[f_hts_persons & f_any_trips]["trip_weight"]).sum() / df_persons[f_hts_persons & f_any_trips]["trip_weight"].sum(), - "share_of_students": (df_persons[f_hts_persons]["studies"] * df_persons[f_hts_persons]["person_weight"]).sum() / df_persons[f_hts_persons]["person_weight"].sum(), - "share_of_employed": (df_persons[f_hts_persons]["employed"] * df_persons[f_hts_persons]["person_weight"]).sum() / df_persons[f_hts_persons]["person_weight"].sum(), - "number_of_activity_chains": len(df_trips[f_hts_trips]["person_id"].unique()), - "number_of_activity_chains": len(df_trips[f_hts_trips]["person_id"].unique()), + "weighted_number_of_trips_per_mobile_person": ( + df_persons[f_hts_persons & f_any_trips]["number_of_trips"] + * df_persons[f_hts_persons & f_any_trips]["trip_weight"] + ).sum() + / df_persons[f_hts_persons & f_any_trips]["trip_weight"].sum(), + "share_of_students": ( + df_persons[f_hts_persons]["studies"] + * df_persons[f_hts_persons]["person_weight"] + ).sum() + / df_persons[f_hts_persons]["person_weight"].sum(), + "share_of_employed": ( + df_persons[f_hts_persons]["employed"] + * 
df_persons[f_hts_persons]["person_weight"] + ).sum() + / df_persons[f_hts_persons]["person_weight"].sum(), + "number_of_activity_chains": len( + df_trips[f_hts_trips]["person_id"].unique() + ), + "number_of_activity_chains": len( + df_trips[f_hts_trips]["person_id"].unique() + ), } # Trip distance distribution - df_trips["distance_class"] = np.digitize(df_trips["hts_distance"], np.arange(1, 10) * 1000) - df_distance = df_trips.groupby(["hts", "distance_class"])["trip_weight"].sum().reset_index(name = "trip_weight") + df_trips["distance_class"] = np.digitize( + df_trips["hts_distance"], np.arange(1, 10) * 1000 + ) + df_distance = ( + df_trips.groupby(["hts", "distance_class"])["trip_weight"] + .sum() + .reset_index(name="trip_weight") + ) # Age distribution AGE_BOUNDS = [14, 29, 44, 59, 74, 1000] - df_persons["age_class"] = np.digitize(df_persons["age"], AGE_BOUNDS, right = True) - df_age = df_persons.groupby(["hts", "age_class"])["person_weight"].sum().reset_index(name = "person_weight") - - df_census = pd.DataFrame(context.stage("data.census.filtered")[["age", "studies", "weight", "employed"]], copy = True) + df_persons["age_class"] = np.digitize(df_persons["age"], AGE_BOUNDS, right=True) + df_age = ( + df_persons.groupby(["hts", "age_class"])["person_weight"] + .sum() + .reset_index(name="person_weight") + ) + + df_census = pd.DataFrame( + context.stage("data.census.filtered")[["age", "studies", "weight", "employed"]], + copy=True, + ) df_census["hts"] = "census" - df_census["age_class"] = np.digitize(df_census["age"], AGE_BOUNDS, right = True) - df_age_census = df_census.groupby(["hts", "age_class"])["weight"].sum().reset_index(name = "person_weight") + df_census["age_class"] = np.digitize(df_census["age"], AGE_BOUNDS, right=True) + df_age_census = ( + df_census.groupby(["hts", "age_class"])["weight"] + .sum() + .reset_index(name="person_weight") + ) df_age = pd.concat([df_age, df_age_census]) # Add student and employment share for census info["census"] = { - "share_of_students": (df_census["studies"] * df_census["weight"]).sum() / df_census["weight"].sum(), - "share_of_employed": (df_census["employed"] * df_census["weight"]).sum() / df_census["weight"].sum() + "share_of_students": (df_census["studies"] * df_census["weight"]).sum() + / df_census["weight"].sum(), + "share_of_employed": (df_census["employed"] * df_census["weight"]).sum() + / df_census["weight"].sum(), } return { "info": info, "distance_distribution": df_distance, - "age_distribution": df_age + "age_distribution": df_age, } diff --git a/data/hts/edgt_44/cleaned.py b/data/hts/edgt_44/cleaned.py index 1fa9b526..f9a95888 100644 --- a/data/hts/edgt_44/cleaned.py +++ b/data/hts/edgt_44/cleaned.py @@ -6,26 +6,54 @@ This stage cleans the Loire Atlantique EDGT. 
""" + def configure(context): context.stage("data.hts.edgt_44.raw") + PURPOSE_MAP = { "home": [1, 2], "work": [11, 12, 13, 81], "education": [21, 22, 23, 24, 25, 26, 27, 28, 29], "shop": [30, 31, 32, 33, 34, 35, 82], "leisure": [51, 52, 53, 54], - "other": [41, 42, 43, 44, 45, 61, 62, 63, 64, 71, 72, 73, 74, 91] + "other": [41, 42, 43, 44, 45, 61, 62, 63, 64, 71, 72, 73, 74, 91], } MODES_MAP = { "car": [13, 15, 21, 81], "car_passenger": [14, 16, 22, 82], - "pt": [30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 51, 52, 53, 61, 71, 72, 73, 91, 92, 94, 95], + "pt": [ + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 51, + 52, + 53, + 61, + 71, + 72, + 73, + 91, + 92, + 94, + 95, + ], "bike": [11, 17, 12, 18, 93, 19], - "walk": [1, 2] # Actually, 2 is not really explained, but we assume it is walk + "walk": [1, 2], # Actually, 2 is not really explained, but we assume it is walk } + def execute(context): df_households, df_persons, df_trips = context.stage("data.hts.edgt_44.raw") @@ -33,9 +61,13 @@ def execute(context): df_households["departement_id"] = "44" # Transform original IDs to integer (they are hierarchichal) - df_households["edgt_household_id"] = (df_households["ECH"] + df_households["MTIR"]).astype(int) + df_households["edgt_household_id"] = ( + df_households["ECH"] + df_households["MTIR"] + ).astype(int) df_persons["edgt_person_id"] = df_persons["PER"].astype(int) - df_persons["edgt_household_id"] = (df_persons["ECH"] + df_persons["PTIR"]).astype(int) + df_persons["edgt_household_id"] = (df_persons["ECH"] + df_persons["PTIR"]).astype( + int + ) df_trips["edgt_person_id"] = df_trips["PER"].astype(int) df_trips["edgt_household_id"] = (df_trips["ECH"] + df_trips["DTIR"]).astype(int) df_trips["edgt_trip_id"] = df_trips["NDEP"].astype(int) @@ -44,15 +76,19 @@ def execute(context): df_households["household_id"] = np.arange(len(df_households)) df_persons = pd.merge( - df_persons, df_households[["edgt_household_id", "household_id", "departement_id"]], - on = ["edgt_household_id"] - ).sort_values(by = ["household_id", "edgt_person_id"]) + df_persons, + df_households[["edgt_household_id", "household_id", "departement_id"]], + on=["edgt_household_id"], + ).sort_values(by=["household_id", "edgt_person_id"]) df_persons["person_id"] = np.arange(len(df_persons)) df_trips = pd.merge( - df_trips, df_persons[["edgt_person_id", "edgt_household_id", "person_id", "household_id"]], - on = ["edgt_person_id", "edgt_household_id"] - ).sort_values(by = ["household_id", "person_id", "edgt_trip_id"]) + df_trips, + df_persons[ + ["edgt_person_id", "edgt_household_id", "person_id", "household_id"] + ], + on=["edgt_person_id", "edgt_household_id"], + ).sort_values(by=["household_id", "person_id", "edgt_trip_id"]) df_trips["trip_id"] = np.arange(len(df_trips)) # Trip flags @@ -71,8 +107,10 @@ def execute(context): df_persons["sex"] = df_persons["sex"].astype("category") # Household size - df_size = df_persons.groupby("household_id").size().reset_index(name = "household_size") - df_households = pd.merge(df_households, df_size, on = "household_id") + df_size = ( + df_persons.groupby("household_id").size().reset_index(name="household_size") + ) + df_households = pd.merge(df_households, df_size, on="household_id") # Clean departement df_trips["origin_departement_id"] = "44" @@ -80,8 +118,12 @@ def execute(context): df_households["departement_id"] = df_households["departement_id"].astype("category") df_persons["departement_id"] = df_persons["departement_id"].astype("category") - 
df_trips["origin_departement_id"] = df_trips["origin_departement_id"].astype("category") - df_trips["destination_departement_id"] = df_trips["destination_departement_id"].astype("category") + df_trips["origin_departement_id"] = df_trips["origin_departement_id"].astype( + "category" + ) + df_trips["destination_departement_id"] = df_trips[ + "destination_departement_id" + ].astype("category") # Clean employment df_persons["employed"] = df_persons["P7"].isin(["1", "2"]) @@ -91,7 +133,9 @@ def execute(context): # Number of vehicles df_households["number_of_vehicles"] = df_households["M6"] + df_households["M5"] - df_households["number_of_vehicles"] = df_households["number_of_vehicles"].astype(int) + df_households["number_of_vehicles"] = df_households["number_of_vehicles"].astype( + int + ) df_households["number_of_bikes"] = df_households["M7"].astype(int) # License @@ -100,7 +144,7 @@ def execute(context): # Has subscription (not availabile in EDGT 44) df_persons["has_pt_subscription"] = False - # Survey respondents + # Survey respondents # PENQ 1 : fully awnsered the travel questionary section, having a chain or non-movers # PENQ 2 : nonrespondent of travel questionary section df_persons.loc[df_persons["PENQ"] == 1, "travel_respondent"] = True @@ -138,13 +182,13 @@ def execute(context): df_trips["routed_distance"] = df_trips["DIST"] # Trip times - df_trips["departure_time"] = 3600.0 * df_trips["D4A"] # hour - df_trips["departure_time"] += 60.0 * df_trips["D4B"] # minute + df_trips["departure_time"] = 3600.0 * df_trips["D4A"] # hour + df_trips["departure_time"] += 60.0 * df_trips["D4B"] # minute - df_trips["arrival_time"] = 3600.0 * df_trips["D8A"] # hour - df_trips["arrival_time"] += 60.0 * df_trips["D8B"] # minute + df_trips["arrival_time"] = 3600.0 * df_trips["D8A"] # hour + df_trips["arrival_time"] += 60.0 * df_trips["D8B"] # minute - df_trips = df_trips.sort_values(by = ["household_id", "person_id", "trip_id"]) + df_trips = df_trips.sort_values(by=["household_id", "person_id", "trip_id"]) df_trips = hts.fix_trip_times(df_trips) # Durations @@ -153,16 +197,25 @@ def execute(context): # Add weight to trips df_trips = pd.merge( - df_trips, df_persons[["person_id", "COEQ"]], on = "person_id", how = "left" - ).rename(columns = { "COEQ": "trip_weight" }) + df_trips, df_persons[["person_id", "COEQ"]], on="person_id", how="left" + ).rename(columns={"COEQ": "trip_weight"}) df_persons["trip_weight"] = df_persons["COEQ"] # Chain length - df_count = df_trips[["person_id"]].groupby("person_id").size().reset_index(name = "number_of_trips") + df_count = ( + df_trips[["person_id"]] + .groupby("person_id") + .size() + .reset_index(name="number_of_trips") + ) # People with at least one trip (number_of_trips > 0) - df_persons = pd.merge(df_persons, df_count, on = "person_id", how = "left") + df_persons = pd.merge(df_persons, df_count, on="person_id", how="left") # People that awnsered the travel questionary section but stayed at home (number_of_trips = 0) - df_persons.loc[(df_persons["travel_respondent"] == True) & (df_persons["number_of_trips"].isna()), "number_of_trips"] = 0 + df_persons.loc[ + (df_persons["travel_respondent"] == True) + & (df_persons["number_of_trips"].isna()), + "number_of_trips", + ] = 0 # Nonrespondent of travel questionary section (number_of_trips = -1) df_persons["number_of_trips"] = df_persons["number_of_trips"].fillna(-1).astype(int) @@ -173,11 +226,15 @@ def execute(context): # Calculate consumption units hts.check_household_size(df_households, df_persons) - df_households = 
pd.merge(df_households, hts.calculate_consumption_units(df_persons), on = "household_id") + df_households = pd.merge( + df_households, hts.calculate_consumption_units(df_persons), on="household_id" + ) # Socioprofessional class df_persons["socioprofessional_class"] = df_persons["P9"].fillna(8).astype(int) - df_persons.loc[df_persons["socioprofessional_class"] > 6, "socioprofessional_class"] = 8 + df_persons.loc[ + df_persons["socioprofessional_class"] > 6, "socioprofessional_class" + ] = 8 df_persons.loc[df_persons["P7"] == "7", "socioprofessional_class"] = 7 # Check departure and arrival times diff --git a/data/hts/edgt_44/filtered.py b/data/hts/edgt_44/filtered.py index df52ab89..cebf650a 100644 --- a/data/hts/edgt_44/filtered.py +++ b/data/hts/edgt_44/filtered.py @@ -5,17 +5,20 @@ This stage filters out observations which live or work outside of the area. """ + def configure(context): context.stage("data.hts.edgt_44.cleaned") context.stage("data.spatial.codes") - - context.config("filter_hts",True) + + context.config("filter_hts", True) + + def execute(context): - filter_edgt = context.config("filter_hts") + filter_edgt = context.config("filter_hts") df_codes = context.stage("data.spatial.codes") df_households, df_persons, df_trips = context.stage("data.hts.edgt_44.cleaned") - if filter_edgt : + if filter_edgt: # Filter for non-residents requested_departments = df_codes["departement_id"].unique() f = df_persons["departement_id"].astype(str).isin(requested_departments) @@ -24,15 +27,26 @@ def execute(context): # Filter for people going outside of the area remove_ids = set() - remove_ids |= set(df_trips[ - ~df_trips["origin_departement_id"].astype(str).isin(requested_departments) | ~df_trips["destination_departement_id"].astype(str).isin(requested_departments) - ]["person_id"].unique()) + remove_ids |= set( + df_trips[ + ~df_trips["origin_departement_id"] + .astype(str) + .isin(requested_departments) + | ~df_trips["destination_departement_id"] + .astype(str) + .isin(requested_departments) + ]["person_id"].unique() + ) df_persons = df_persons[~df_persons["person_id"].isin(remove_ids)] # Only keep trips and households that still have a person - df_trips = df_trips[df_trips["person_id"].isin(df_persons["person_id"].unique())] - df_households = df_households[df_households["household_id"].isin(df_persons["household_id"])] + df_trips = df_trips[ + df_trips["person_id"].isin(df_persons["person_id"].unique()) + ] + df_households = df_households[ + df_households["household_id"].isin(df_persons["household_id"]) + ] # Finish up df_households = df_households[hts.HOUSEHOLD_COLUMNS] diff --git a/data/hts/edgt_44/format.py b/data/hts/edgt_44/format.py index 0dd224e1..402a5e59 100644 --- a/data/hts/edgt_44/format.py +++ b/data/hts/edgt_44/format.py @@ -24,12 +24,22 @@ (50, 1, "M6", "NOMBRE DE DEUX OU TROIS ROUES MOTORISES A DISPOSITION"), (51, 2, "M7", "NOMBRE DE VELOS A DISPOSITION"), (53, 4, "MLA", "ANNEE INSTALLATION DANS LE LOGEMENT"), - (57, 2, "MLB1", "PREMIER CRITERE DE CHOIX DE LA RESIDENCE ACTUELLE (sans classement)"), - (59, 2, "MLB2", "DEUXIEME CRITERE DE CHOIX DE LA RESIDENCE ACTUELLE (sans classement)"), + ( + 57, + 2, + "MLB1", + "PREMIER CRITERE DE CHOIX DE LA RESIDENCE ACTUELLE (sans classement)", + ), + ( + 59, + 2, + "MLB2", + "DEUXIEME CRITERE DE CHOIX DE LA RESIDENCE ACTUELLE (sans classement)", + ), (61, 5, "MLC", "ANCIENNE COMMUNE DE RESIDENCE"), (66, 1, "MLD", "TYPE D'OCCUPATION DE L'ANCIEN LOGEMENT (le ménage était-il ?)"), (67, 8, "COEM", "COEFFICIENT DE REDRESSEMENT MENAGE"), - 
(75, 1, "MFIN", "FIN FICHIER MENAGE") + (75, 1, "MFIN", "FIN FICHIER MENAGE"), ] PERSON_FORMAT = [ @@ -50,15 +60,35 @@ (24, 1, "P9", "PCS"), (25, 1, "P12", "TRAVAIL OU ETUDES A DOMICILE"), (26, 6, "P13A", "LIEU DE TRAVAIL OU D'ETUDES (OCCUPATION PRINCIPALE)"), - (32, 1, "P15", "DISPOSITION D'UNE VOITURE EN GÉNÉRAL (DÉPLACEMENTS DOMICILE TRAVAIL OU ÉTUDES)"), - (33, 1, "P17", "PROBLÈMES DE STATIONNEMENT EN GÉNÉRAL (SUR LIEU DE TRAVAIL OU D'ÉTUDES)"), - (34, 1, "P17A", "DIFFICULTÉS DE STATIONNEMENT SUR OU À PROXIMITÉ DE VOTRE LIEU DE TRAVAIL OU DE VOTRE LIEU D'ÉTUDES"), + ( + 32, + 1, + "P15", + "DISPOSITION D'UNE VOITURE EN GÉNÉRAL (DÉPLACEMENTS DOMICILE TRAVAIL OU ÉTUDES)", + ), + ( + 33, + 1, + "P17", + "PROBLÈMES DE STATIONNEMENT EN GÉNÉRAL (SUR LIEU DE TRAVAIL OU D'ÉTUDES)", + ), + ( + 34, + 1, + "P17A", + "DIFFICULTÉS DE STATIONNEMENT SUR OU À PROXIMITÉ DE VOTRE LIEU DE TRAVAIL OU DE VOTRE LIEU D'ÉTUDES", + ), (35, 1, "P23A", "FRÉQUENCE D'UTILISATION EN SEMAINE : MARCHE A PIED"), (36, 1, "P20", "FRÉQUENCE D'UTILISATION EN SEMAINE : BICYCLETTE"), (37, 1, "P21", "FRÉQUENCE D'UTILISATION EN SEMAINE : 2 ROUES À MOTEUR CONDUCTEUR"), (38, 1, "P23", "FRÉQUENCE D'UTILISATION EN SEMAINE : VOITURE CONDUCTEUR"), (39, 1, "P24", "FRÉQUENCE D'UTILISATION EN SEMAINE : VOITURE PASSAGER"), - (40, 1, "P25", "FRÉQUENCE D'UTILISATION EN SEMAINE : RESEAUX DE TRANSPORT EN COMMUN (TRAM, BUS, CAR…)"), + ( + 40, + 1, + "P25", + "FRÉQUENCE D'UTILISATION EN SEMAINE : RESEAUX DE TRANSPORT EN COMMUN (TRAM, BUS, CAR…)", + ), (41, 1, "P19", "SITUATION DE LA PERSONNE LA VEILLE"), (42, 1, "P19A", "SITUATION DES ACTIFS LA VEILLE"), (43, 1, "PL27", "FRÉQUENCE D'UTILISATION EN SEMAINE : TRAIN"), @@ -67,7 +97,7 @@ (46, 6, "DP13", "Distance DOMICILE-TRAVAIL"), (52, 8, "COEP", "COEFFICIENT DE REDRESSEMENT TOUTES PERSONNES"), (60, 8, "COEQ", "COEFFICIENT DE REDRESSEMENT PERSONNES ENQUETEES"), - (68, 1, "PFIN", "FIN FICHIER PERSONNE") + (68, 1, "PFIN", "FIN FICHIER PERSONNE"), ] TRIP_FORMAT = [ @@ -95,5 +125,5 @@ (54, 8, "DOIB", "DISTANCE VOL OISEAU (en mètres)"), (62, 8, "DIST", "DiSTANCE PARCOURUE (en mètres)"), (70, 8, "DISP", "DiSTANCE PARCOURUE dans périmètre (en mètres)"), - (78, 1, "DFIN", "FIN FICHIER DEPLACEMENT") + (78, 1, "DFIN", "FIN FICHIER DEPLACEMENT"), ] diff --git a/data/hts/edgt_44/raw.py b/data/hts/edgt_44/raw.py index cb58bd27..5a55d8cb 100644 --- a/data/hts/edgt_44/raw.py +++ b/data/hts/edgt_44/raw.py @@ -10,35 +10,66 @@ Adapted from the first implementation by Valentin Le Besond (IFSTTAR Nantes) """ + def configure(context): context.config("data_path") + from .format import HOUSEHOLD_FORMAT, PERSON_FORMAT, TRIP_FORMAT HOUSEHOLD_COLUMNS = { - "MP2": str, "MTIR": str, "ECH": str, "COEM": float, - "M6": int, "M7": int, "M5": int + "MP2": str, + "MTIR": str, + "ECH": str, + "COEM": float, + "M6": int, + "M7": int, + "M5": int, } PERSON_COLUMNS = { - "ECH": str, "PTIR": str, "PER": int, "PP2": str, "PENQ": int, - "P3": int, "P2": int, "P4": int, - "P7": str, "P12": str, - "P9": str, "P5": str, - "COEP": float, "COEQ": float, "P1": int + "ECH": str, + "PTIR": str, + "PER": int, + "PP2": str, + "PENQ": int, + "P3": int, + "P2": int, + "P4": int, + "P7": str, + "P12": str, + "P9": str, + "P5": str, + "COEP": float, + "COEQ": float, + "P1": int, } TRIP_COLUMNS = { - "ECH": str, "DTIR": str, "PER": int, "NDEP": int, "DP2": str, - "D2A": int, "D5A": int, "D3": str, "D4A": int, "D4B": int, - "D7": str, "D8A": int, "D8B": int, - "D8C": int, "MODP": int, "DOIB": int, "DIST": int + "ECH": str, + "DTIR": str, + "PER": 
int, + "NDEP": int, + "DP2": str, + "D2A": int, + "D5A": int, + "D3": str, + "D4A": int, + "D4B": int, + "D7": str, + "D8A": int, + "D8B": int, + "D8C": int, + "MODP": int, + "DOIB": int, + "DIST": int, } + def execute(context): # Load households df_household_dictionary = pd.DataFrame.from_records( - HOUSEHOLD_FORMAT, columns = ["position", "size", "variable", "description"] + HOUSEHOLD_FORMAT, columns=["position", "size", "variable", "description"] ) column_widths = df_household_dictionary["size"].values @@ -46,13 +77,17 @@ def execute(context): df_households = pd.read_fwf( "%s/edgt_44_2015/02a_EDGT_44_MENAGE_FAF_TEL_2015-08-07_modifZF.txt" - % context.config("data_path"), widths = column_widths, header = None, - names = column_names, usecols = list(HOUSEHOLD_COLUMNS.keys()), dtype = HOUSEHOLD_COLUMNS + % context.config("data_path"), + widths=column_widths, + header=None, + names=column_names, + usecols=list(HOUSEHOLD_COLUMNS.keys()), + dtype=HOUSEHOLD_COLUMNS, ) # Load persons df_person_dictionary = pd.DataFrame.from_records( - PERSON_FORMAT, columns = ["position", "size", "variable", "description"] + PERSON_FORMAT, columns=["position", "size", "variable", "description"] ) column_widths = df_person_dictionary["size"].values @@ -60,13 +95,17 @@ def execute(context): df_persons = pd.read_fwf( "%s/edgt_44_2015/02b_EDGT_44_PERSO_FAF_TEL_ModifPCS_2016-04-14.txt" - % context.config("data_path"), widths = column_widths, header = None, - names = column_names, usecols = list(PERSON_COLUMNS.keys()), dtype = PERSON_COLUMNS + % context.config("data_path"), + widths=column_widths, + header=None, + names=column_names, + usecols=list(PERSON_COLUMNS.keys()), + dtype=PERSON_COLUMNS, ) # Load trips df_trip_dictionary = pd.DataFrame.from_records( - TRIP_FORMAT, columns = ["position", "size", "variable", "description"] + TRIP_FORMAT, columns=["position", "size", "variable", "description"] ) column_widths = df_trip_dictionary["size"].values @@ -74,21 +113,29 @@ def execute(context): df_trips = pd.read_fwf( "%s/edgt_44_2015/02c_EDGT_44_DEPLA_FAF_TEL_DIST_2015-11-10.txt" - % context.config("data_path"), widths = column_widths, header = None, - names = column_names, usecols = list(TRIP_COLUMNS.keys()), dtype = TRIP_COLUMNS + % context.config("data_path"), + widths=column_widths, + header=None, + names=column_names, + usecols=list(TRIP_COLUMNS.keys()), + dtype=TRIP_COLUMNS, ) return df_households, df_persons, df_trips + FILES = [ "02a_EDGT_44_MENAGE_FAF_TEL_2015-08-07_modifZF.txt", "02b_EDGT_44_PERSO_FAF_TEL_ModifPCS_2016-04-14.txt", "02c_EDGT_44_DEPLA_FAF_TEL_DIST_2015-11-10.txt", ] + def validate(context): for name in FILES: - if not os.path.exists("%s/edgt_44_2015/%s" % (context.config("data_path"), name)): + if not os.path.exists( + "%s/edgt_44_2015/%s" % (context.config("data_path"), name) + ): raise RuntimeError("File missing from EDGT: %s" % name) return [ diff --git a/data/hts/edgt_44/reweighted.py b/data/hts/edgt_44/reweighted.py index 1bbcbd4d..647ccdef 100644 --- a/data/hts/edgt_44/reweighted.py +++ b/data/hts/edgt_44/reweighted.py @@ -1,8 +1,10 @@ import numpy as np + def configure(context): context.stage("data.hts.edgt_44.filtered") + def execute(context): df_households, df_persons, df_trips = context.stage("data.hts.edgt_44.filtered") diff --git a/data/hts/edgt_lyon/cleaned_adisp.py b/data/hts/edgt_lyon/cleaned_adisp.py index eed34608..905df318 100644 --- a/data/hts/edgt_lyon/cleaned_adisp.py +++ b/data/hts/edgt_lyon/cleaned_adisp.py @@ -7,43 +7,76 @@ This stage cleans the Lyon EDGT. 
""" + def configure(context): context.stage("data.hts.edgt_lyon.raw_adisp") + PURPOSE_MAP = { "home": [1, 2], "work": [11, 12, 13, 14, 81], "education": [21, 22, 23, 24, 25, 26, 27, 28, 29, 96, 97], "shop": [30, 31, 32, 33, 34, 35, 82, 98], "leisure": [51, 52, 53, 54], - "other": [41, 42, 43, 61, 62, 63, 64, 71, 72, 73, 74, 91] + "other": [41, 42, 43, 61, 62, 63, 64, 71, 72, 73, 74, 91], } MODES_MAP = { - "car": [10, 13, 15, 21, 81], # 10 is (driving) an ambulance + "car": [10, 13, 15, 21, 81], # 10 is (driving) an ambulance "car_passenger": [14, 16, 22, 82], - "pt": [31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 51, 52, 53, 61, 71, 91, 92, 94, 95], + "pt": [ + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 51, + 52, + 53, + 61, + 71, + 91, + 92, + 94, + 95, + ], "bike": [11, 17, 12, 18, 93], - "walk": [1, 2] # Actually, 2 is not really explained, but we assume it is walk + "walk": [1, 2], # Actually, 2 is not really explained, but we assume it is walk } + def execute(context): - df_households, df_persons, df_trips, df_spatial = context.stage("data.hts.edgt_lyon.raw_adisp") + df_households, df_persons, df_trips, df_spatial = context.stage( + "data.hts.edgt_lyon.raw_adisp" + ) # Merge departement into households df_spatial = df_spatial[["ZF__2015", "DepCom"]].copy() - df_spatial["ZFM"] = df_spatial["ZF__2015"].astype(str).str.pad(width=8, side='left', fillchar='0') + df_spatial["ZFM"] = ( + df_spatial["ZF__2015"].astype(str).str.pad(width=8, side="left", fillchar="0") + ) df_spatial["departement_id"] = df_spatial["DepCom"].str[:2] df_spatial = df_spatial[["ZFM", "departement_id"]] # Attention, some households get lost here! - df_households = pd.merge(df_households, df_spatial, on = "ZFM", how = "left") + df_households = pd.merge(df_households, df_spatial, on="ZFM", how="left") df_households["departement_id"] = df_households["departement_id"].fillna("unknown") # Transform original IDs to integer (they are hierarchichal) - df_households["edgt_household_id"] = (df_households["ZFM"] + df_households["ECH"]).astype(int) + df_households["edgt_household_id"] = ( + df_households["ZFM"] + df_households["ECH"] + ).astype(int) df_persons["edgt_person_id"] = df_persons["PER"].astype(int) - df_persons["edgt_household_id"] = (df_persons["ZFP"] + df_persons["ECH"]).astype(int) + df_persons["edgt_household_id"] = (df_persons["ZFP"] + df_persons["ECH"]).astype( + int + ) df_trips["edgt_person_id"] = df_trips["PER"].astype(int) df_trips["edgt_household_id"] = (df_trips["ZFD"] + df_trips["ECH"]).astype(int) df_trips["edgt_trip_id"] = df_trips["NDEP"].astype(int) @@ -52,15 +85,19 @@ def execute(context): df_households["household_id"] = np.arange(len(df_households)) df_persons = pd.merge( - df_persons, df_households[["edgt_household_id", "household_id", "departement_id"]], - on = ["edgt_household_id"] - ).sort_values(by = ["household_id", "edgt_person_id"]) + df_persons, + df_households[["edgt_household_id", "household_id", "departement_id"]], + on=["edgt_household_id"], + ).sort_values(by=["household_id", "edgt_person_id"]) df_persons["person_id"] = np.arange(len(df_persons)) df_trips = pd.merge( - df_trips, df_persons[["edgt_person_id", "edgt_household_id", "person_id", "household_id"]], - on = ["edgt_person_id", "edgt_household_id"] - ).sort_values(by = ["household_id", "person_id", "edgt_trip_id"]) + df_trips, + df_persons[ + ["edgt_person_id", "edgt_household_id", "person_id", "household_id"] + ], + on=["edgt_person_id", "edgt_household_id"], + ).sort_values(by=["household_id", 
"person_id", "edgt_trip_id"]) df_trips["trip_id"] = np.arange(len(df_trips)) # Trip flags @@ -79,25 +116,45 @@ def execute(context): df_persons["sex"] = df_persons["sex"].astype("category") # Household size - df_size = df_persons.groupby("household_id").size().reset_index(name = "household_size") - df_households = pd.merge(df_households, df_size, on = "household_id") + df_size = ( + df_persons.groupby("household_id").size().reset_index(name="household_size") + ) + df_households = pd.merge(df_households, df_size, on="household_id") # Clean departement - df_trips = pd.merge(df_trips, df_spatial.rename(columns = { - "ZFM": "D3", "departement_id": "origin_departement_id" - }), on = "D3", how = "left") + df_trips = pd.merge( + df_trips, + df_spatial.rename( + columns={"ZFM": "D3", "departement_id": "origin_departement_id"} + ), + on="D3", + how="left", + ) - df_trips = pd.merge(df_trips, df_spatial.rename(columns = { - "ZFM": "D7", "departement_id": "destination_departement_id" - }), on = "D7", how = "left") + df_trips = pd.merge( + df_trips, + df_spatial.rename( + columns={"ZFM": "D7", "departement_id": "destination_departement_id"} + ), + on="D7", + how="left", + ) - df_trips["origin_departement_id"] = df_trips["origin_departement_id"].fillna("unknown") - df_trips["destination_departement_id"] = df_trips["destination_departement_id"].fillna("unknown") + df_trips["origin_departement_id"] = df_trips["origin_departement_id"].fillna( + "unknown" + ) + df_trips["destination_departement_id"] = df_trips[ + "destination_departement_id" + ].fillna("unknown") df_households["departement_id"] = df_households["departement_id"].astype("category") df_persons["departement_id"] = df_persons["departement_id"].astype("category") - df_trips["origin_departement_id"] = df_trips["origin_departement_id"].astype("category") - df_trips["destination_departement_id"] = df_trips["destination_departement_id"].astype("category") + df_trips["origin_departement_id"] = df_trips["origin_departement_id"].astype( + "category" + ) + df_trips["destination_departement_id"] = df_trips[ + "destination_departement_id" + ].astype("category") # Clean employment df_persons["employed"] = df_persons["P9"].isin(["1", "2"]) @@ -107,16 +164,20 @@ def execute(context): # Number of vehicles df_households["number_of_vehicles"] = df_households["M6"] + df_households["M14"] - df_households["number_of_vehicles"] = df_households["number_of_vehicles"].astype(int) + df_households["number_of_vehicles"] = df_households["number_of_vehicles"].astype( + int + ) df_households["number_of_bikes"] = df_households["M21"].astype(int) # License df_persons["has_license"] = df_persons["P7"] == "1" # Has subscription - df_persons["has_pt_subscription"] = df_persons["P12"].isin(["1", "2", "3", "5", "6"]) + df_persons["has_pt_subscription"] = df_persons["P12"].isin( + ["1", "2", "3", "5", "6"] + ) - # Survey respondents + # Survey respondents # PENQ 1 : fully awnsered the travel questionary section, having a chain or non-movers # PENQ 2 : nonrespondent of travel questionary section df_persons["PENQ"] = df_persons["PENQ"].fillna("2").astype("int") @@ -151,13 +212,13 @@ def execute(context): df_trips["routed_distance"] = df_trips["D12"] # Trip times - df_trips["departure_time"] = 3600.0 * (df_trips["D4"] // 100) # hour - df_trips["departure_time"] += 60.0 * (df_trips["D4"] % 100) # minute + df_trips["departure_time"] = 3600.0 * (df_trips["D4"] // 100) # hour + df_trips["departure_time"] += 60.0 * (df_trips["D4"] % 100) # minute - df_trips["arrival_time"] = 3600.0 * 
(df_trips["D8"] // 100) # hour - df_trips["arrival_time"] += 60.0 * (df_trips["D8"] % 100) # minute + df_trips["arrival_time"] = 3600.0 * (df_trips["D8"] // 100) # hour + df_trips["arrival_time"] += 60.0 * (df_trips["D8"] % 100) # minute - df_trips = df_trips.sort_values(by = ["household_id", "person_id", "trip_id"]) + df_trips = df_trips.sort_values(by=["household_id", "person_id", "trip_id"]) df_trips = hts.fix_trip_times(df_trips) # Durations @@ -166,16 +227,25 @@ def execute(context): # Add weight to trips df_trips = pd.merge( - df_trips, df_persons[["person_id", "COE1"]], on = "person_id", how = "left" - ).rename(columns = { "COE1": "trip_weight" }) + df_trips, df_persons[["person_id", "COE1"]], on="person_id", how="left" + ).rename(columns={"COE1": "trip_weight"}) df_persons["trip_weight"] = df_persons["COE1"] # Chain length - df_count = df_trips[["person_id"]].groupby("person_id").size().reset_index(name = "number_of_trips") + df_count = ( + df_trips[["person_id"]] + .groupby("person_id") + .size() + .reset_index(name="number_of_trips") + ) # People with at least one trip (number_of_trips > 0) - df_persons = pd.merge(df_persons, df_count, on = "person_id", how = "left") + df_persons = pd.merge(df_persons, df_count, on="person_id", how="left") # People that answered the travel questionary section but stayed at home (number_of_trips = 0) - df_persons.loc[(df_persons["travel_respondent"] == True) & (df_persons["number_of_trips"].isna()), "number_of_trips"] = 0 + df_persons.loc[ + (df_persons["travel_respondent"] == True) + & (df_persons["number_of_trips"].isna()), + "number_of_trips", + ] = 0 # Nonrespondent of travel questionary section (number_of_trips = -1) df_persons["number_of_trips"] = df_persons["number_of_trips"].fillna(-1).astype(int) @@ -186,7 +256,9 @@ def execute(context): # Calculate consumption units hts.check_household_size(df_households, df_persons) - df_households = pd.merge(df_households, hts.calculate_consumption_units(df_persons), on = "household_id") + df_households = pd.merge( + df_households, hts.calculate_consumption_units(df_persons), on="household_id" + ) # Socioprofessional class df_persons["socioprofessional_class"] = df_persons["PCSC"].fillna(8).astype(int) diff --git a/data/hts/edgt_lyon/cleaned_cerema.py b/data/hts/edgt_lyon/cleaned_cerema.py index d452820b..850a53ac 100644 --- a/data/hts/edgt_lyon/cleaned_cerema.py +++ b/data/hts/edgt_lyon/cleaned_cerema.py @@ -6,28 +6,55 @@ This stage cleans the Lyon EDGT. 
""" + def configure(context): context.stage("data.hts.edgt_lyon.raw_cerema") + PURPOSE_MAP = { "home": [1, 2], "work": [11, 12, 13, 81], "education": [21, 22, 23, 24, 25, 26, 27, 28, 29], "shop": [30, 31, 32, 33, 34, 35, 82], "leisure": [51, 52, 53, 54], - "other": [41, 42, 43, 61, 62, 63, 64, 71, 72, 73, 74, 91] + "other": [41, 42, 43, 61, 62, 63, 64, 71, 72, 73, 74, 91], } MODES_MAP = { "car": [13, 15, 21, 81], "car_passenger": [14, 16, 22, 82], - "pt": [31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 51, 52, 53, 61, 71, 91, 92, 94, 95], + "pt": [ + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 51, + 52, + 53, + 61, + 71, + 91, + 92, + 94, + 95, + ], "bike": [11, 17, 12, 18, 93], - "walk": [1, 2] # Actually, 2 is not really explained, but we assume it is walk + "walk": [1, 2], # Actually, 2 is not really explained, but we assume it is walk } + def execute(context): - df_households, df_persons, df_trips, df_spatial = context.stage("data.hts.edgt_lyon.raw_cerema") + df_households, df_persons, df_trips, df_spatial = context.stage( + "data.hts.edgt_lyon.raw_cerema" + ) # Merge departement into households df_spatial = df_spatial[["ZF__2015", "DepCom"]].copy() @@ -36,13 +63,17 @@ def execute(context): df_spatial = df_spatial[["MP2", "departement_id"]] # Attention, some households get lost here! - df_households = pd.merge(df_households, df_spatial, on = "MP2", how = "left") + df_households = pd.merge(df_households, df_spatial, on="MP2", how="left") df_households["departement_id"] = df_households["departement_id"].fillna("unknown") # Transform original IDs to integer (they are hierarchichal) - df_households["edgt_household_id"] = (df_households["ECH"] + df_households["MP2"]).astype(int) + df_households["edgt_household_id"] = ( + df_households["ECH"] + df_households["MP2"] + ).astype(int) df_persons["edgt_person_id"] = df_persons["PER"].astype(int) - df_persons["edgt_household_id"] = (df_persons["ECH"] + df_persons["PP2"]).astype(int) + df_persons["edgt_household_id"] = (df_persons["ECH"] + df_persons["PP2"]).astype( + int + ) df_trips["edgt_person_id"] = df_trips["PER"].astype(int) df_trips["edgt_household_id"] = (df_trips["ECH"] + df_trips["DP2"]).astype(int) df_trips["edgt_trip_id"] = df_trips["NDEP"].astype(int) @@ -51,15 +82,19 @@ def execute(context): df_households["household_id"] = np.arange(len(df_households)) df_persons = pd.merge( - df_persons, df_households[["edgt_household_id", "household_id", "departement_id"]], - on = ["edgt_household_id"] - ).sort_values(by = ["household_id", "edgt_person_id"]) + df_persons, + df_households[["edgt_household_id", "household_id", "departement_id"]], + on=["edgt_household_id"], + ).sort_values(by=["household_id", "edgt_person_id"]) df_persons["person_id"] = np.arange(len(df_persons)) df_trips = pd.merge( - df_trips, df_persons[["edgt_person_id", "edgt_household_id", "person_id", "household_id"]], - on = ["edgt_person_id", "edgt_household_id"] - ).sort_values(by = ["household_id", "person_id", "edgt_trip_id"]) + df_trips, + df_persons[ + ["edgt_person_id", "edgt_household_id", "person_id", "household_id"] + ], + on=["edgt_person_id", "edgt_household_id"], + ).sort_values(by=["household_id", "person_id", "edgt_trip_id"]) df_trips["trip_id"] = np.arange(len(df_trips)) # Trip flags @@ -78,25 +113,45 @@ def execute(context): df_persons["sex"] = df_persons["sex"].astype("category") # Household size - df_size = df_persons.groupby("household_id").size().reset_index(name = "household_size") - df_households = pd.merge(df_households, 
df_size, on = "household_id") + df_size = ( + df_persons.groupby("household_id").size().reset_index(name="household_size") + ) + df_households = pd.merge(df_households, df_size, on="household_id") # Clean departement - df_trips = pd.merge(df_trips, df_spatial.rename(columns = { - "MP2": "D3", "departement_id": "origin_departement_id" - }), on = "D3", how = "left") + df_trips = pd.merge( + df_trips, + df_spatial.rename( + columns={"MP2": "D3", "departement_id": "origin_departement_id"} + ), + on="D3", + how="left", + ) - df_trips = pd.merge(df_trips, df_spatial.rename(columns = { - "MP2": "D7", "departement_id": "destination_departement_id" - }), on = "D7", how = "left") + df_trips = pd.merge( + df_trips, + df_spatial.rename( + columns={"MP2": "D7", "departement_id": "destination_departement_id"} + ), + on="D7", + how="left", + ) - df_trips["origin_departement_id"] = df_trips["origin_departement_id"].fillna("unknown") - df_trips["destination_departement_id"] = df_trips["destination_departement_id"].fillna("unknown") + df_trips["origin_departement_id"] = df_trips["origin_departement_id"].fillna( + "unknown" + ) + df_trips["destination_departement_id"] = df_trips[ + "destination_departement_id" + ].fillna("unknown") df_households["departement_id"] = df_households["departement_id"].astype("category") df_persons["departement_id"] = df_persons["departement_id"].astype("category") - df_trips["origin_departement_id"] = df_trips["origin_departement_id"].astype("category") - df_trips["destination_departement_id"] = df_trips["destination_departement_id"].astype("category") + df_trips["origin_departement_id"] = df_trips["origin_departement_id"].astype( + "category" + ) + df_trips["destination_departement_id"] = df_trips[ + "destination_departement_id" + ].astype("category") # Clean employment df_persons["employed"] = df_persons["P7"].isin(["1", "2"]) @@ -106,7 +161,9 @@ def execute(context): # Number of vehicles df_households["number_of_vehicles"] = df_households["M6"] + df_households["M5"] - df_households["number_of_vehicles"] = df_households["number_of_vehicles"].astype(int) + df_households["number_of_vehicles"] = df_households["number_of_vehicles"].astype( + int + ) df_households["number_of_bikes"] = df_households["M7"].astype(int) # License @@ -115,7 +172,7 @@ def execute(context): # Has subscription df_persons["has_pt_subscription"] = df_persons["P10"].isin(["1", "2", "3"]) - # Survey respondents + # Survey respondents # PENQ 1 : fully awnsered the travel questionary section, having a chain or non-movers # PENQ 2 : nonrespondent of travel questionary section df_persons["PENQ"] = df_persons["PENQ"].fillna("2").astype(int) @@ -149,13 +206,13 @@ def execute(context): df_trips["routed_distance"] = df_trips["DIST"] # Trip times - df_trips["departure_time"] = 3600.0 * (df_trips["D4"] // 100) # hour - df_trips["departure_time"] += 60.0 * (df_trips["D4"] % 100) # minute + df_trips["departure_time"] = 3600.0 * (df_trips["D4"] // 100) # hour + df_trips["departure_time"] += 60.0 * (df_trips["D4"] % 100) # minute - df_trips["arrival_time"] = 3600.0 * (df_trips["D8"] // 100) # hour - df_trips["arrival_time"] += 60.0 * (df_trips["D8"] % 100) # minute + df_trips["arrival_time"] = 3600.0 * (df_trips["D8"] // 100) # hour + df_trips["arrival_time"] += 60.0 * (df_trips["D8"] % 100) # minute - df_trips = df_trips.sort_values(by = ["household_id", "person_id", "trip_id"]) + df_trips = df_trips.sort_values(by=["household_id", "person_id", "trip_id"]) df_trips = hts.fix_trip_times(df_trips) # Durations @@ -164,18 
+221,26 @@ def execute(context): # Add weight to trips df_trips = pd.merge( - df_trips, df_persons[["person_id", "COEQ"]], on = "person_id", how = "left" - ).rename(columns = { "COEQ": "trip_weight" }) + df_trips, df_persons[["person_id", "COEQ"]], on="person_id", how="left" + ).rename(columns={"COEQ": "trip_weight"}) df_persons["trip_weight"] = df_persons["COEQ"] # Chain length - df_count = df_trips[["person_id"]].groupby("person_id").size().reset_index(name = "number_of_trips") + df_count = ( + df_trips[["person_id"]] + .groupby("person_id") + .size() + .reset_index(name="number_of_trips") + ) # People with at least one trip (number_of_trips > 0) - df_persons = pd.merge(df_persons, df_count, on = "person_id", how = "left") - + df_persons = pd.merge(df_persons, df_count, on="person_id", how="left") + # People who answered the travel questionnaire section but stayed at home (number_of_trips = 0) - df_persons.loc[df_persons["travel_respondent"] & df_persons["number_of_trips"].isna(), "number_of_trips"] = 0 + df_persons.loc[ + df_persons["travel_respondent"] & df_persons["number_of_trips"].isna(), + "number_of_trips", + ] = 0 # Non-respondents of the travel questionnaire section (number_of_trips = -1) df_persons["number_of_trips"] = df_persons["number_of_trips"].fillna(-1).astype(int) @@ -187,11 +252,15 @@ def execute(context): # Calculate consumption units hts.check_household_size(df_households, df_persons) - df_households = pd.merge(df_households, hts.calculate_consumption_units(df_persons), on = "household_id") + df_households = pd.merge( + df_households, hts.calculate_consumption_units(df_persons), on="household_id" + ) # Socioprofessional class df_persons["socioprofessional_class"] = df_persons["P9"].fillna(8).astype(int) - df_persons.loc[df_persons["socioprofessional_class"] > 6, "socioprofessional_class"] = 8 + df_persons.loc[ + df_persons["socioprofessional_class"] > 6, "socioprofessional_class" + ] = 8 df_persons.loc[df_persons["P7"] == "7", "socioprofessional_class"] = 7 # Check departure and arrival times diff --git a/data/hts/edgt_lyon/filtered.py b/data/hts/edgt_lyon/filtered.py index cf957685..bac0a1a3 100644 --- a/data/hts/edgt_lyon/filtered.py +++ b/data/hts/edgt_lyon/filtered.py @@ -5,27 +5,39 @@ This stage filters out observations which live or work outside of the area.
""" + def configure(context): edgt_lyon_source = context.config("edgt_lyon_source", "unchosen") if edgt_lyon_source == "unchosen": - raise RuntimeError("Using 'hts: edgt_lyon' without specifying 'edgt_lyon_source' (either 'cerema' or 'adisp')") + raise RuntimeError( + "Using 'hts: edgt_lyon' without specifying 'edgt_lyon_source' (either 'cerema' or 'adisp')" + ) elif edgt_lyon_source == "adisp": - context.stage("data.hts.edgt_lyon.cleaned_adisp", alias="data.hts.edgt_lyon.cleaned") + context.stage( + "data.hts.edgt_lyon.cleaned_adisp", alias="data.hts.edgt_lyon.cleaned" + ) elif edgt_lyon_source == "cerema": - context.stage("data.hts.edgt_lyon.cleaned_cerema", alias="data.hts.edgt_lyon.cleaned") + context.stage( + "data.hts.edgt_lyon.cleaned_cerema", alias="data.hts.edgt_lyon.cleaned" + ) else: - raise RuntimeError("Unknown Lyon EDGT source (only 'cerema' and 'adisp' are supported): %s" % edgt_lyon_source) - + raise RuntimeError( + "Unknown Lyon EDGT source (only 'cerema' and 'adisp' are supported): %s" + % edgt_lyon_source + ) + context.stage("data.spatial.codes") - - context.config("filter_hts",True) + + context.config("filter_hts", True) + + def execute(context): filter_edgt = context.config("filter_hts") df_codes = context.stage("data.spatial.codes") df_households, df_persons, df_trips = context.stage("data.hts.edgt_lyon.cleaned") - if filter_edgt : + if filter_edgt: # Filter for non-residents requested_departments = df_codes["departement_id"].unique() f = df_persons["departement_id"].astype(str).isin(requested_departments) @@ -34,15 +46,26 @@ def execute(context): # Filter for people going outside of the area remove_ids = set() - remove_ids |= set(df_trips[ - ~df_trips["origin_departement_id"].astype(str).isin(requested_departments) | ~df_trips["destination_departement_id"].astype(str).isin(requested_departments) - ]["person_id"].unique()) + remove_ids |= set( + df_trips[ + ~df_trips["origin_departement_id"] + .astype(str) + .isin(requested_departments) + | ~df_trips["destination_departement_id"] + .astype(str) + .isin(requested_departments) + ]["person_id"].unique() + ) df_persons = df_persons[~df_persons["person_id"].isin(remove_ids)] # Only keep trips and households that still have a person - df_trips = df_trips[df_trips["person_id"].isin(df_persons["person_id"].unique())] - df_households = df_households[df_households["household_id"].isin(df_persons["household_id"])] + df_trips = df_trips[ + df_trips["person_id"].isin(df_persons["person_id"].unique()) + ] + df_households = df_households[ + df_households["household_id"].isin(df_persons["household_id"]) + ] # Finish up df_households = df_households[hts.HOUSEHOLD_COLUMNS] diff --git a/data/hts/edgt_lyon/raw_adisp.py b/data/hts/edgt_lyon/raw_adisp.py index 5ec5f111..31f9d877 100644 --- a/data/hts/edgt_lyon/raw_adisp.py +++ b/data/hts/edgt_lyon/raw_adisp.py @@ -10,77 +10,121 @@ Adapted from the first implementation by Valentin Le Besond (IFSTTAR Nantes) """ + def configure(context): context.config("data_path") + HOUSEHOLD_COLUMNS = { - "ECH": str, "ZFM": str, # id - "M6": int, "M21": int, "M14": int, # number_of_cars, number_of_bikes, number_of_motorbikes - "COE0": float # weights + "ECH": str, + "ZFM": str, # id + "M6": int, + "M21": int, + "M14": int, # number_of_cars, number_of_bikes, number_of_motorbikes + "COE0": float, # weights } PERSON_COLUMNS = { - "ECH": str, "PER": int, "ZFP": str, # id - "PENQ": str, # respondents of travel questionary section - "P2": int, "P4": int, # sex, age - "P9": str, # employed, studies - "P7": str, 
"P12": str, # has_license, has_pt_subscription - "PCSC": str, # socioprofessional_class - "COEP": float, "COE1": float # weights + "ECH": str, + "PER": int, + "ZFP": str, # id + "PENQ": str, # respondents of travel questionary section + "P2": int, + "P4": int, # sex, age + "P9": str, # employed, studies + "P7": str, + "P12": str, # has_license, has_pt_subscription + "PCSC": str, # socioprofessional_class + "COEP": float, + "COE1": float, # weights } TRIP_COLUMNS = { - "ECH": str, "PER": int, "NDEP": int, "ZFD": str, # id - "D2A": int, "D5A": int, # preceding_purpose, following_purpose - "D3": str, "D7": str, # origin_zone, destination_zone - "D4": int, "D8": int, # time_departure, time_arrival - "MODP": int, "D11": int, "D12": int # mode, euclidean_distance, routed_distance + "ECH": str, + "PER": int, + "NDEP": int, + "ZFD": str, # id + "D2A": int, + "D5A": int, # preceding_purpose, following_purpose + "D3": str, + "D7": str, # origin_zone, destination_zone + "D4": int, + "D8": int, # time_departure, time_arrival + "MODP": int, + "D11": int, + "D12": int, # mode, euclidean_distance, routed_distance } + def execute(context): # Load households - df_households = pd.concat([ - pd.read_csv( - "%s/edgt_lyon_2015/lyon_2015_std_faf_men.csv" - % context.config("data_path"), sep=";", usecols = list(HOUSEHOLD_COLUMNS.keys()), dtype = HOUSEHOLD_COLUMNS - ), - pd.read_csv( - "%s/edgt_lyon_2015/lyon_2015_std_tel_men.csv" - % context.config("data_path"), sep=";", usecols = list(HOUSEHOLD_COLUMNS.keys()), dtype = HOUSEHOLD_COLUMNS - ) - ]) + df_households = pd.concat( + [ + pd.read_csv( + "%s/edgt_lyon_2015/lyon_2015_std_faf_men.csv" + % context.config("data_path"), + sep=";", + usecols=list(HOUSEHOLD_COLUMNS.keys()), + dtype=HOUSEHOLD_COLUMNS, + ), + pd.read_csv( + "%s/edgt_lyon_2015/lyon_2015_std_tel_men.csv" + % context.config("data_path"), + sep=";", + usecols=list(HOUSEHOLD_COLUMNS.keys()), + dtype=HOUSEHOLD_COLUMNS, + ), + ] + ) # Load persons - df_persons = pd.concat([ - pd.read_csv( - "%s/edgt_lyon_2015/lyon_2015_std_faf_pers.csv" - % context.config("data_path"), sep=";", usecols = list(PERSON_COLUMNS.keys()), dtype = PERSON_COLUMNS - ), - pd.read_csv( - "%s/edgt_lyon_2015/lyon_2015_std_tel_pers.csv" - % context.config("data_path"), sep=";", usecols = list(PERSON_COLUMNS.keys()), dtype = PERSON_COLUMNS - ) - ]) + df_persons = pd.concat( + [ + pd.read_csv( + "%s/edgt_lyon_2015/lyon_2015_std_faf_pers.csv" + % context.config("data_path"), + sep=";", + usecols=list(PERSON_COLUMNS.keys()), + dtype=PERSON_COLUMNS, + ), + pd.read_csv( + "%s/edgt_lyon_2015/lyon_2015_std_tel_pers.csv" + % context.config("data_path"), + sep=";", + usecols=list(PERSON_COLUMNS.keys()), + dtype=PERSON_COLUMNS, + ), + ] + ) # Load trips - df_trips = pd.concat([ - pd.read_csv( - "%s/edgt_lyon_2015/lyon_2015_std_faf_depl.csv" - % context.config("data_path"), sep=";", usecols = list(TRIP_COLUMNS.keys()), dtype = TRIP_COLUMNS - ), - pd.read_csv( - "%s/edgt_lyon_2015/lyon_2015_std_tel_depl.csv" - % context.config("data_path"), sep=";", usecols = list(TRIP_COLUMNS.keys()), dtype = TRIP_COLUMNS - ) - ]) + df_trips = pd.concat( + [ + pd.read_csv( + "%s/edgt_lyon_2015/lyon_2015_std_faf_depl.csv" + % context.config("data_path"), + sep=";", + usecols=list(TRIP_COLUMNS.keys()), + dtype=TRIP_COLUMNS, + ), + pd.read_csv( + "%s/edgt_lyon_2015/lyon_2015_std_tel_depl.csv" + % context.config("data_path"), + sep=";", + usecols=list(TRIP_COLUMNS.keys()), + dtype=TRIP_COLUMNS, + ), + ] + ) # Load spatial data df_spatial = gpd.read_file( - 
"%s/edgt_lyon_2015/EDGT_AML2015_ZF_GT.TAB" - % context.config("data_path")) + "%s/edgt_lyon_2015/EDGT_AML2015_ZF_GT.TAB" % context.config("data_path") + ) return df_households, df_persons, df_trips, df_spatial + FILES = [ "lyon_2015_std_faf_men.csv", "lyon_2015_std_tel_men.csv", @@ -92,12 +136,15 @@ def execute(context): "EDGT_AML2015_ZF_GT.ID", "EDGT_AML2015_ZF_GT.IND", "EDGT_AML2015_ZF_GT.MAP", - "EDGT_AML2015_ZF_GT.TAB" + "EDGT_AML2015_ZF_GT.TAB", ] + def validate(context): for name in FILES: - if not os.path.exists("%s/edgt_lyon_2015/%s" % (context.config("data_path"), name)): + if not os.path.exists( + "%s/edgt_lyon_2015/%s" % (context.config("data_path"), name) + ): raise RuntimeError("File missing from EDGT: %s" % name) return [ diff --git a/data/hts/edgt_lyon/raw_cerema.py b/data/hts/edgt_lyon/raw_cerema.py index c48b28a3..76e3835d 100644 --- a/data/hts/edgt_lyon/raw_cerema.py +++ b/data/hts/edgt_lyon/raw_cerema.py @@ -10,82 +10,134 @@ Adapted from the first implementation by Valentin Le Besond (IFSTTAR Nantes) """ + def configure(context): context.config("data_path") + HOUSEHOLD_COLUMNS = { - "MP2": str, "ECH": str, "COEM": float, - "M6": int, "M7": int, "M5": int + "MP2": str, + "ECH": str, + "COEM": float, + "M6": int, + "M7": int, + "M5": int, } PERSON_COLUMNS = { - "ECH": str, "PER": int, "PP2": str, "PENQ": str, - "P3": int, "P2": int, "P4": int, - "P7": str, "P12": str, - "P10": str, "P9": str, "P5": str, - "COEP": float, "COEQ": float, "P1": int + "ECH": str, + "PER": int, + "PP2": str, + "PENQ": str, + "P3": int, + "P2": int, + "P4": int, + "P7": str, + "P12": str, + "P10": str, + "P9": str, + "P5": str, + "COEP": float, + "COEQ": float, + "P1": int, } TRIP_COLUMNS = { - "ECH": str, "PER": int, "NDEP": int, "DP2": str, - "D2A": int, "D5A": int, "D3": str, "D4": int, - "D7": str, "D8": int, - "D8C": int, "MODP": int, "DOIB": int, "DIST": int + "ECH": str, + "PER": int, + "NDEP": int, + "DP2": str, + "D2A": int, + "D5A": int, + "D3": str, + "D4": int, + "D7": str, + "D8": int, + "D8C": int, + "MODP": int, + "DOIB": int, + "DIST": int, } + def execute(context): # Load households df_household_dictionary = pd.read_excel( "%s/edgt_lyon_2015/EDGT-AML-2015_Total_Dessin&Dictionnaire.xls" - % context.config("data_path"), skiprows = 1, nrows = 21, - usecols = [1,2], names = ["size", "variable"]) + % context.config("data_path"), + skiprows=1, + nrows=21, + usecols=[1, 2], + names=["size", "variable"], + ) column_widths = df_household_dictionary["size"].values column_names = df_household_dictionary["variable"].values df_households = pd.read_fwf( "%s/edgt_lyon_2015/EDGT_AML_MENAGE_FAF_TEL_2015-08-03.txt" - % context.config("data_path"), widths = column_widths, header = None, - names = column_names, usecols = list(HOUSEHOLD_COLUMNS.keys()), dtype = HOUSEHOLD_COLUMNS + % context.config("data_path"), + widths=column_widths, + header=None, + names=column_names, + usecols=list(HOUSEHOLD_COLUMNS.keys()), + dtype=HOUSEHOLD_COLUMNS, ) # Load persons df_person_dictionary = pd.read_excel( "%s/edgt_lyon_2015/EDGT-AML-2015_Total_Dessin&Dictionnaire.xls" - % context.config("data_path"), skiprows = 25, nrows = 34, - usecols = [1,2], names = ["size", "variable"]) + % context.config("data_path"), + skiprows=25, + nrows=34, + usecols=[1, 2], + names=["size", "variable"], + ) column_widths = df_person_dictionary["size"].values column_names = df_person_dictionary["variable"].values df_persons = pd.read_fwf( "%s/edgt_lyon_2015/EDGT_AML_PERSO_DIST_DT_2015-10-27.txt" - % context.config("data_path"), widths = 
column_widths, header = None, - names = column_names, usecols = list(PERSON_COLUMNS.keys()), dtype = PERSON_COLUMNS + % context.config("data_path"), + widths=column_widths, + header=None, + names=column_names, + usecols=list(PERSON_COLUMNS.keys()), + dtype=PERSON_COLUMNS, ) # Load trips df_trip_dictionary = pd.read_excel( "%s/edgt_lyon_2015/EDGT-AML-2015_Total_Dessin&Dictionnaire.xls" - % context.config("data_path"), skiprows = 62, nrows = 24, - usecols = [1,2], names = ["size", "variable"]) + % context.config("data_path"), + skiprows=62, + nrows=24, + usecols=[1, 2], + names=["size", "variable"], + ) column_widths = df_trip_dictionary["size"].values column_names = df_trip_dictionary["variable"].values df_trips = pd.read_fwf( "%s/edgt_lyon_2015/EDGT_AML_DEPLA_DIST_2015-10-27.txt" - % context.config("data_path"), widths = column_widths, header = None, - names = column_names, usecols = list(TRIP_COLUMNS.keys()), dtype = TRIP_COLUMNS + % context.config("data_path"), + widths=column_widths, + header=None, + names=column_names, + usecols=list(TRIP_COLUMNS.keys()), + dtype=TRIP_COLUMNS, ) # Load spatial data df_spatial = gpd.read_file( - "%s/edgt_lyon_2015/EDGT_AML2015_ZF_GT.TAB" - % context.config("data_path")) + "%s/edgt_lyon_2015/EDGT_AML2015_ZF_GT.TAB" % context.config("data_path") + ) return df_households, df_persons, df_trips, df_spatial + FILES = [ "EDGT_AML_MENAGE_FAF_TEL_2015-08-03.txt", "EDGT_AML_PERSO_DIST_DT_2015-10-27.txt", @@ -95,12 +147,15 @@ def execute(context): "EDGT_AML2015_ZF_GT.ID", "EDGT_AML2015_ZF_GT.IND", "EDGT_AML2015_ZF_GT.MAP", - "EDGT_AML2015_ZF_GT.TAB" + "EDGT_AML2015_ZF_GT.TAB", ] + def validate(context): for name in FILES: - if not os.path.exists("%s/edgt_lyon_2015/%s" % (context.config("data_path"), name)): + if not os.path.exists( + "%s/edgt_lyon_2015/%s" % (context.config("data_path"), name) + ): raise RuntimeError("File missing from EDGT: %s" % name) return [ diff --git a/data/hts/edgt_lyon/reweighted.py b/data/hts/edgt_lyon/reweighted.py index f858d79b..368c5558 100644 --- a/data/hts/edgt_lyon/reweighted.py +++ b/data/hts/edgt_lyon/reweighted.py @@ -1,8 +1,10 @@ import numpy as np + def configure(context): context.stage("data.hts.edgt_lyon.filtered") + def execute(context): df_households, df_persons, df_trips = context.stage("data.hts.edgt_lyon.filtered") diff --git a/data/hts/egt/cleaned.py b/data/hts/egt/cleaned.py index 490320c9..7f54f317 100644 --- a/data/hts/egt/cleaned.py +++ b/data/hts/egt/cleaned.py @@ -7,43 +7,46 @@ This stage cleans the regional HTS. 
""" + def configure(context): context.stage("data.hts.egt.raw") if context.config("use_urban_type", False): context.stage("data.spatial.urban_type") + INCOME_CLASS_BOUNDS = [800, 1200, 1600, 2000, 2400, 3000, 3500, 4500, 5500, 1e6] PURPOSE_MAP = { - 1 : "home", - 2 : "work", - 3 : "work", - 4 : "education", - 5 : "shop", - 6 : "other", - 7 : "other", - 8 : "leisure" + 1: "home", + 2: "work", + 3: "work", + 4: "education", + 5: "shop", + 6: "other", + 7: "other", + 8: "leisure", # 9 : "other" # default } MODES_MAP = { - 1 : "pt", - 2 : "car", - 3 : "car_passenger", - 4 : "car", - 5 : "bike", - #6 : "pt", # default (other) - 7 : "walk" + 1: "pt", + 2: "car", + 3: "car_passenger", + 4: "car", + 5: "bike", + # 6 : "pt", # default (other) + 7: "walk", } + def execute(context): df_households, df_persons, df_trips = context.stage("data.hts.egt.raw") # Make copies - df_households = pd.DataFrame(df_households, copy = True) - df_persons = pd.DataFrame(df_persons, copy = True) - df_trips = pd.DataFrame(df_trips, copy = True) + df_households = pd.DataFrame(df_households, copy=True) + df_persons = pd.DataFrame(df_persons, copy=True) + df_trips = pd.DataFrame(df_trips, copy=True) # Transform original IDs to integer (they are hierarchichal) df_households["egt_household_id"] = df_households["NQUEST"].astype(int) @@ -57,14 +60,16 @@ def execute(context): df_households["household_id"] = np.arange(len(df_households)) df_persons = pd.merge( - df_persons, df_households[["egt_household_id", "household_id"]], - on = "egt_household_id" + df_persons, + df_households[["egt_household_id", "household_id"]], + on="egt_household_id", ) df_persons["person_id"] = np.arange(len(df_persons)) df_trips = pd.merge( - df_trips, df_persons[["egt_person_id", "egt_household_id", "person_id", "household_id"]], - on = ["egt_person_id", "egt_household_id"] + df_trips, + df_persons[["egt_person_id", "egt_household_id", "person_id", "household_id"]], + on=["egt_person_id", "egt_household_id"], ) df_trips["trip_id"] = np.arange(len(df_trips)) @@ -88,9 +93,13 @@ def execute(context): # Clean departement df_persons["departement_id"] = df_persons["RESDEP"].astype(str).astype("category") - df_households["departement_id"] = df_households["RESDEP"].astype(str).astype("category") + df_households["departement_id"] = ( + df_households["RESDEP"].astype(str).astype("category") + ) df_trips["origin_departement_id"] = df_trips["ORDEP"].astype(str).astype("category") - df_trips["destination_departement_id"] = df_trips["DESTDEP"].astype(str).astype("category") + df_trips["destination_departement_id"] = ( + df_trips["DESTDEP"].astype(str).astype("category") + ) # Clean employment df_persons["employed"] = df_persons["OCCP"].isin([1.0, 2.0]) @@ -99,38 +108,50 @@ def execute(context): df_persons["studies"] = df_persons["OCCP"].isin([3.0, 4.0, 5.0]) # Number of vehicles - df_households["number_of_vehicles"] = df_households["NB_2RM"] + df_households["NB_VD"] - df_households["number_of_vehicles"] = df_households["number_of_vehicles"].astype(int) + df_households["number_of_vehicles"] = ( + df_households["NB_2RM"] + df_households["NB_VD"] + ) + df_households["number_of_vehicles"] = df_households["number_of_vehicles"].astype( + int + ) df_households["number_of_bikes"] = df_households["NB_VELO"].astype(int) # License - df_persons["has_license"] = (df_persons["PERMVP"] == 1) | (df_persons["PERM2RM"] == 1) + df_persons["has_license"] = (df_persons["PERMVP"] == 1) | ( + df_persons["PERM2RM"] == 1 + ) # Has subscription df_persons["has_pt_subscription"] = 
df_persons["ABONTC"] > 1 # Household income df_households["income_class"] = df_households["REVENU"] - 1 - df_households.loc[df_households["income_class"].isin([10.0, 11.0, np.nan]), "income_class"] = -1 + df_households.loc[ + df_households["income_class"].isin([10.0, 11.0, np.nan]), "income_class" + ] = -1 df_households["income_class"] = df_households["income_class"].astype(int) # Impute urban type if context.config("use_urban_type"): - df_urban_type = context.stage("data.spatial.urban_type")[[ - "commune_id", "urban_type" - ]] + df_urban_type = context.stage("data.spatial.urban_type")[ + ["commune_id", "urban_type"] + ] # Household municipality df_households["commune_id"] = df_households["RESCOMM"].astype(str) - df_persons = pd.merge(df_persons, df_households[["household_id", "commune_id"]], how = "left") + df_persons = pd.merge( + df_persons, df_households[["household_id", "commune_id"]], how="left" + ) assert np.all(~df_persons["commune_id"].isna()) - + # Impute urban type - df_persons = pd.merge(df_persons, df_urban_type, on = "commune_id", how = "left") - df_persons["urban_type"] = df_persons["urban_type"].fillna("none").astype("category") + df_persons = pd.merge(df_persons, df_urban_type, on="commune_id", how="left") + df_persons["urban_type"] = ( + df_persons["urban_type"].fillna("none").astype("category") + ) - df_households.drop(columns = ["commune_id"]) - df_persons.drop(columns = ["commune_id"]) + df_households.drop(columns=["commune_id"]) + df_persons.drop(columns=["commune_id"]) # Trip purpose df_trips["following_purpose"] = "other" @@ -165,8 +186,8 @@ def execute(context): # Add weight to trips df_trips = pd.merge( - df_trips, df_persons[["person_id", "person_weight"]], on = "person_id", how = "left" - ).rename(columns = { "person_weight": "trip_weight" }) + df_trips, df_persons[["person_id", "person_weight"]], on="person_id", how="left" + ).rename(columns={"person_weight": "trip_weight"}) df_persons["trip_weight"] = df_persons["person_weight"] # Chain length @@ -179,7 +200,9 @@ def execute(context): # Calculate consumption units hts.check_household_size(df_households, df_persons) - df_households = pd.merge(df_households, hts.calculate_consumption_units(df_persons), on = "household_id") + df_households = pd.merge( + df_households, hts.calculate_consumption_units(df_persons), on="household_id" + ) # Socioprofessional class df_persons["socioprofessional_class"] = df_persons["CS8"].fillna(8).astype(int) @@ -194,19 +217,29 @@ def execute(context): nan_count = np.count_nonzero(f) total_count = len(df_persons) - print("Dropping %d/%d persons because of NaN values in departure and arrival times" % (nan_count, total_count)) + print( + "Dropping %d/%d persons because of NaN values in departure and arrival times" + % (nan_count, total_count) + ) df_persons = df_persons[~f] df_trips = df_trips[df_trips["person_id"].isin(df_persons["person_id"].unique())] - df_households = df_households[df_households["household_id"].isin(df_persons["household_id"])] + df_households = df_households[ + df_households["household_id"].isin(df_persons["household_id"]) + ] # Fix activity types (because of inconsistent EGT data and removing in the timing fixing step) hts.fix_activity_types(df_trips) return df_households, df_persons, df_trips + def calculate_income_class(df): assert "household_income" in df assert "consumption_units" in df - return np.digitize(df["household_income"] / df["consumption_units"], INCOME_CLASS_BOUNDS, right = True) + return np.digitize( + df["household_income"] / 
df["consumption_units"], + INCOME_CLASS_BOUNDS, + right=True, + ) diff --git a/data/hts/egt/filtered.py b/data/hts/egt/filtered.py index 29f06604..54701c26 100644 --- a/data/hts/egt/filtered.py +++ b/data/hts/egt/filtered.py @@ -6,49 +6,70 @@ Île-de-France. """ + def configure(context): context.stage("data.hts.egt.cleaned") context.stage("data.spatial.codes") - context.config("filter_hts",True) + context.config("filter_hts", True) + + def execute(context): - filter_egt = context.config("filter_hts") + filter_egt = context.config("filter_hts") df_codes = context.stage("data.spatial.codes") df_households, df_persons, df_trips = context.stage("data.hts.egt.cleaned") - if filter_egt : + if filter_egt: # Filter for non-residents requested_departments = df_codes["departement_id"].unique() - f = df_persons["departement_id"].astype(str).isin(requested_departments) # pandas bug! + f = ( + df_persons["departement_id"].astype(str).isin(requested_departments) + ) # pandas bug! df_persons = df_persons[f] # Filter for people going outside of the area (because they have NaN distances) remove_ids = set() - remove_ids |= set(df_trips[ - ~df_trips["origin_departement_id"].astype(str).isin(requested_departments) | ~df_trips["destination_departement_id"].astype(str).isin(requested_departments) - ]["person_id"].unique()) + remove_ids |= set( + df_trips[ + ~df_trips["origin_departement_id"] + .astype(str) + .isin(requested_departments) + | ~df_trips["destination_departement_id"] + .astype(str) + .isin(requested_departments) + ]["person_id"].unique() + ) - remove_ids |= set(df_persons[ - ~df_persons["departement_id"].isin(requested_departments) - ]) + remove_ids |= set( + df_persons[~df_persons["departement_id"].isin(requested_departments)] + ) df_persons = df_persons[~df_persons["person_id"].isin(remove_ids)] # Only keep trips and households that still have a person - df_trips = df_trips[df_trips["person_id"].isin(df_persons["person_id"].unique())] - df_households = df_households[df_households["household_id"].isin(df_persons["household_id"])] + df_trips = df_trips[ + df_trips["person_id"].isin(df_persons["person_id"].unique()) + ] + df_households = df_households[ + df_households["household_id"].isin(df_persons["household_id"]) + ] # Finish up household_columns = hts.HOUSEHOLD_COLUMNS + ["income_class"] + ["egt_household_id"] df_households = df_households[household_columns] - + person_columns = hts.PERSON_COLUMNS + ["egt_household_id", "egt_person_id"] - if "urban_type" in df_persons: person_columns.append("urban_type") + if "urban_type" in df_persons: + person_columns.append("urban_type") df_persons = df_persons[person_columns] - - trip_columns = hts.TRIP_COLUMNS + ["euclidean_distance"] + ["egt_household_id", "egt_person_id", "egt_trip_id"] + + trip_columns = ( + hts.TRIP_COLUMNS + + ["euclidean_distance"] + + ["egt_household_id", "egt_person_id", "egt_trip_id"] + ) df_trips = df_trips[trip_columns] hts.check(df_households, df_persons, df_trips) diff --git a/data/hts/egt/raw.py b/data/hts/egt/raw.py index 53b88a21..2ce75ed8 100644 --- a/data/hts/egt/raw.py +++ b/data/hts/egt/raw.py @@ -7,49 +7,97 @@ """ MENAGES_COLUMNS = [ - "RESDEP", "NQUEST", "POIDSM", "NB_VELO", "NB_VD", "REVENU", "RESCOMM", - "NB_2RM", "MNP" + "RESDEP", + "NQUEST", + "POIDSM", + "NB_VELO", + "NB_VD", + "REVENU", + "RESCOMM", + "NB_2RM", + "MNP", ] PERSONNES_COLUMNS = [ - "RESDEP", "NP", "POIDSP", "NQUEST", "SEXE", "AGE", "PERMVP", - "ABONTC", "OCCP", "PERM2RM", "NBDEPL", "CS8" + "RESDEP", + "NP", + "POIDSP", + "NQUEST", + "SEXE", + "AGE", 
+ "PERMVP", + "ABONTC", + "OCCP", + "PERM2RM", + "NBDEPL", + "CS8", ] DEPLACEMENTS_COLUMNS = [ - "NQUEST", "NP", "ND", - "ORDEP", "DESTDEP", "ORH", "DESTH", "ORM", "DESTM", "ORCOMM", "DESTCOMM", - "DPORTEE", "MODP_H7", "DESTMOT_H9", "ORMOT_H9" + "NQUEST", + "NP", + "ND", + "ORDEP", + "DESTDEP", + "ORH", + "DESTH", + "ORM", + "DESTM", + "ORCOMM", + "DESTCOMM", + "DPORTEE", + "MODP_H7", + "DESTMOT_H9", + "ORMOT_H9", ] + def configure(context): context.config("data_path") + def execute(context): df_menages = pd.read_csv( "%s/egt_2010/Menages_semaine.csv" % context.config("data_path"), - sep = ",", encoding = "latin1", usecols = MENAGES_COLUMNS + sep=",", + encoding="latin1", + usecols=MENAGES_COLUMNS, ) df_personnes = pd.read_csv( "%s/egt_2010/Personnes_semaine.csv" % context.config("data_path"), - sep = ",", encoding = "latin1", usecols = PERSONNES_COLUMNS + sep=",", + encoding="latin1", + usecols=PERSONNES_COLUMNS, ) df_deplacements = pd.read_csv( "%s/egt_2010/Deplacements_semaine.csv" % context.config("data_path"), - sep = ",", encoding = "latin1", usecols = DEPLACEMENTS_COLUMNS + sep=",", + encoding="latin1", + usecols=DEPLACEMENTS_COLUMNS, ) return df_menages, df_personnes, df_deplacements + def validate(context): - for name in ("Menages_semaine.csv", "Personnes_semaine.csv", "Deplacements_semaine.csv"): + for name in ( + "Menages_semaine.csv", + "Personnes_semaine.csv", + "Deplacements_semaine.csv", + ): if not os.path.exists("%s/egt_2010/%s" % (context.config("data_path"), name)): raise RuntimeError("File missing from EGT: %s" % name) return [ - os.path.getsize("%s/egt_2010/Menages_semaine.csv" % context.config("data_path")), - os.path.getsize("%s/egt_2010/Personnes_semaine.csv" % context.config("data_path")), - os.path.getsize("%s/egt_2010/Deplacements_semaine.csv" % context.config("data_path")) + os.path.getsize( + "%s/egt_2010/Menages_semaine.csv" % context.config("data_path") + ), + os.path.getsize( + "%s/egt_2010/Personnes_semaine.csv" % context.config("data_path") + ), + os.path.getsize( + "%s/egt_2010/Deplacements_semaine.csv" % context.config("data_path") + ), ] diff --git a/data/hts/entd/cleaned.py b/data/hts/entd/cleaned.py index 51bfd966..51618eee 100644 --- a/data/hts/entd/cleaned.py +++ b/data/hts/entd/cleaned.py @@ -7,10 +7,27 @@ This stage cleans the national HTS. 
""" + def configure(context): context.stage("data.hts.entd.raw") -INCOME_CLASS_BOUNDS = [400, 600, 800, 1000, 1200, 1500, 1800, 2000, 2500, 3000, 4000, 6000, 10000, 1e6] + +INCOME_CLASS_BOUNDS = [ + 400, + 600, + 800, + 1000, + 1200, + 1500, + 1800, + 2000, + 2500, + 3000, + 4000, + 6000, + 10000, + 1e6, +] PURPOSE_MAP = [ ("1", "home"), @@ -22,38 +39,47 @@ def configure(context): ("6", "other"), ("7", "leisure"), ("8", "leisure"), - ("9", "work") + ("9", "work"), ] MODES_MAP = [ ("1", "walk"), - ("2", "car"), # - ("2.20", "bike"), # bike - ("2.23", "car_passenger"), # motorcycle passenger - ("2.25", "car_passenger"), # same + ("2", "car"), # + ("2.20", "bike"), # bike + ("2.23", "car_passenger"), # motorcycle passenger + ("2.25", "car_passenger"), # same ("3", "car"), ("3.32", "car_passenger"), - ("4", "pt"), # taxi + ("4", "pt"), # taxi ("5", "pt"), ("6", "pt"), - ("7", "pt"), # Plane - ("8", "pt"), # Boat -# ("9", "pt") # Other + ("7", "pt"), # Plane + ("8", "pt"), # Boat + # ("9", "pt") # Other ] + def convert_time(x): - return np.dot(np.array(x.split(":"), dtype = float), [3600.0, 60.0, 1.0]) + return np.dot(np.array(x.split(":"), dtype=float), [3600.0, 60.0, 1.0]) + def execute(context): - df_individu, df_tcm_individu, df_menage, df_tcm_menage, df_deploc = context.stage("data.hts.entd.raw") + df_individu, df_tcm_individu, df_menage, df_tcm_menage, df_deploc = context.stage( + "data.hts.entd.raw" + ) # Make copies - df_persons = pd.DataFrame(df_tcm_individu, copy = True) - df_households = pd.DataFrame(df_tcm_menage, copy = True) - df_trips = pd.DataFrame(df_deploc, copy = True) + df_persons = pd.DataFrame(df_tcm_individu, copy=True) + df_households = pd.DataFrame(df_tcm_menage, copy=True) + df_trips = pd.DataFrame(df_deploc, copy=True) # Get weights for persons that actually have trips - df_persons = pd.merge(df_persons, df_trips[["IDENT_IND", "PONDKI"]].drop_duplicates("IDENT_IND"), on = "IDENT_IND", how = "left") + df_persons = pd.merge( + df_persons, + df_trips[["IDENT_IND", "PONDKI"]].drop_duplicates("IDENT_IND"), + on="IDENT_IND", + how="left", + ) df_persons["is_kish"] = ~df_persons["PONDKI"].isna() df_persons["trip_weight"] = df_persons["PONDKI"].fillna(0.0) @@ -64,13 +90,21 @@ def execute(context): print("Filtering out %d non-reference day trips" % np.count_nonzero(~f)) # Merge in additional information from ENTD - df_households = pd.merge(df_households, df_menage[[ - "idENT_MEN", "V1_JNBVEH", "V1_JNBMOTO", "V1_JNBCYCLO", "V1_JNBVELOADT" - ]], on = "idENT_MEN", how = "left") + df_households = pd.merge( + df_households, + df_menage[ + ["idENT_MEN", "V1_JNBVEH", "V1_JNBMOTO", "V1_JNBCYCLO", "V1_JNBVELOADT"] + ], + on="idENT_MEN", + how="left", + ) - df_persons = pd.merge(df_persons, df_individu[[ - "IDENT_IND", "V1_GPERMIS", "V1_GPERMIS2R", "V1_ICARTABON" - ]], on = "IDENT_IND", how = "left") + df_persons = pd.merge( + df_persons, + df_individu[["IDENT_IND", "V1_GPERMIS", "V1_GPERMIS2R", "V1_ICARTABON"]], + on="IDENT_IND", + how="left", + ) # Transform original IDs to integer (they are hierarchichal) df_persons["entd_person_id"] = df_persons["IDENT_IND"].astype(int) @@ -82,14 +116,16 @@ def execute(context): df_households["household_id"] = np.arange(len(df_households)) df_persons = pd.merge( - df_persons, df_households[["entd_household_id", "household_id"]], - on = "entd_household_id" + df_persons, + df_households[["entd_household_id", "household_id"]], + on="entd_household_id", ) df_persons["person_id"] = np.arange(len(df_persons)) df_trips = pd.merge( - df_trips, 
df_persons[["entd_person_id", "person_id", "household_id"]], - on = ["entd_person_id"] + df_trips, + df_persons[["entd_person_id", "person_id", "household_id"]], + on=["entd_person_id"], ) df_trips["trip_id"] = np.arange(len(df_trips)) @@ -109,19 +145,24 @@ def execute(context): df_households["household_size"] = df_households["NPERS"] # Clean departement - df_households["departement_id"] = df_households["DEP"].fillna("undefined").astype("category") - df_persons["departement_id"] = df_persons["DEP"].fillna("undefined").astype("category") + df_households["departement_id"] = ( + df_households["DEP"].fillna("undefined").astype("category") + ) + df_persons["departement_id"] = ( + df_persons["DEP"].fillna("undefined").astype("category") + ) - df_trips["origin_departement_id"] = df_trips["V2_MORIDEP"].fillna("undefined").astype("category") - df_trips["destination_departement_id"] = df_trips["V2_MDESDEP"].fillna("undefined").astype("category") + df_trips["origin_departement_id"] = ( + df_trips["V2_MORIDEP"].fillna("undefined").astype("category") + ) + df_trips["destination_departement_id"] = ( + df_trips["V2_MDESDEP"].fillna("undefined").astype("category") + ) # Clean urban type - df_households["urban_type"] = df_households["numcom_UU2010"].replace({ - "B": "suburb", - "C": "central_city", - "I": "isolated_city", - "R": "none" - }) + df_households["urban_type"] = df_households["numcom_UU2010"].replace( + {"B": "suburb", "C": "central_city", "I": "isolated_city", "R": "none"} + ) assert np.all(~df_households["urban_type"].isna()) df_households["urban_type"] = df_households["urban_type"].astype("category") @@ -139,32 +180,67 @@ def execute(context): df_households["number_of_vehicles"] += df_households["V1_JNBVEH"].fillna(0) df_households["number_of_vehicles"] += df_households["V1_JNBMOTO"].fillna(0) df_households["number_of_vehicles"] += df_households["V1_JNBCYCLO"].fillna(0) - df_households["number_of_vehicles"] = df_households["number_of_vehicles"].astype(int) + df_households["number_of_vehicles"] = df_households["number_of_vehicles"].astype( + int + ) - df_households["number_of_bikes"] = df_households["V1_JNBVELOADT"].fillna(0).astype(int) + df_households["number_of_bikes"] = ( + df_households["V1_JNBVELOADT"].fillna(0).astype(int) + ) # License - df_persons["has_license"] = (df_persons["V1_GPERMIS"] == 1) | (df_persons["V1_GPERMIS2R"] == 1) + df_persons["has_license"] = (df_persons["V1_GPERMIS"] == 1) | ( + df_persons["V1_GPERMIS2R"] == 1 + ) # Has subscription df_persons["has_pt_subscription"] = df_persons["V1_ICARTABON"] == 1 # Household income df_households["income_class"] = -1 - df_households.loc[df_households["TrancheRevenuMensuel"].str.startswith("Moins de 400"), "income_class"] = 0 - df_households.loc[df_households["TrancheRevenuMensuel"].str.startswith("De 400"), "income_class"] = 1 - df_households.loc[df_households["TrancheRevenuMensuel"].str.startswith("De 600"), "income_class"] = 2 - df_households.loc[df_households["TrancheRevenuMensuel"].str.startswith("De 800"), "income_class"] = 3 - df_households.loc[df_households["TrancheRevenuMensuel"].str.startswith("De 1 000"), "income_class"] = 4 - df_households.loc[df_households["TrancheRevenuMensuel"].str.startswith("De 1 200"), "income_class"] = 5 - df_households.loc[df_households["TrancheRevenuMensuel"].str.startswith("De 1 500"), "income_class"] = 6 - df_households.loc[df_households["TrancheRevenuMensuel"].str.startswith("De 1 800"), "income_class"] = 7 - df_households.loc[df_households["TrancheRevenuMensuel"].str.startswith("De 2 000"), 
"income_class"] = 8 - df_households.loc[df_households["TrancheRevenuMensuel"].str.startswith("De 2 500"), "income_class"] = 9 - df_households.loc[df_households["TrancheRevenuMensuel"].str.startswith("De 3 000"), "income_class"] = 10 - df_households.loc[df_households["TrancheRevenuMensuel"].str.startswith("De 4 000"), "income_class"] = 11 - df_households.loc[df_households["TrancheRevenuMensuel"].str.startswith("De 6 000"), "income_class"] = 12 - df_households.loc[df_households["TrancheRevenuMensuel"].str.startswith("10 000"), "income_class"] = 13 + df_households.loc[ + df_households["TrancheRevenuMensuel"].str.startswith("Moins de 400"), + "income_class", + ] = 0 + df_households.loc[ + df_households["TrancheRevenuMensuel"].str.startswith("De 400"), "income_class" + ] = 1 + df_households.loc[ + df_households["TrancheRevenuMensuel"].str.startswith("De 600"), "income_class" + ] = 2 + df_households.loc[ + df_households["TrancheRevenuMensuel"].str.startswith("De 800"), "income_class" + ] = 3 + df_households.loc[ + df_households["TrancheRevenuMensuel"].str.startswith("De 1 000"), "income_class" + ] = 4 + df_households.loc[ + df_households["TrancheRevenuMensuel"].str.startswith("De 1 200"), "income_class" + ] = 5 + df_households.loc[ + df_households["TrancheRevenuMensuel"].str.startswith("De 1 500"), "income_class" + ] = 6 + df_households.loc[ + df_households["TrancheRevenuMensuel"].str.startswith("De 1 800"), "income_class" + ] = 7 + df_households.loc[ + df_households["TrancheRevenuMensuel"].str.startswith("De 2 000"), "income_class" + ] = 8 + df_households.loc[ + df_households["TrancheRevenuMensuel"].str.startswith("De 2 500"), "income_class" + ] = 9 + df_households.loc[ + df_households["TrancheRevenuMensuel"].str.startswith("De 3 000"), "income_class" + ] = 10 + df_households.loc[ + df_households["TrancheRevenuMensuel"].str.startswith("De 4 000"), "income_class" + ] = 11 + df_households.loc[ + df_households["TrancheRevenuMensuel"].str.startswith("De 6 000"), "income_class" + ] = 12 + df_households.loc[ + df_households["TrancheRevenuMensuel"].str.startswith("10 000"), "income_class" + ] = 13 df_households["income_class"] = df_households["income_class"].astype(int) # Trip purpose @@ -173,11 +249,13 @@ def execute(context): for prefix, activity_type in PURPOSE_MAP: df_trips.loc[ - df_trips["V2_MMOTIFDES"].astype(str).str.startswith(prefix), "following_purpose" + df_trips["V2_MMOTIFDES"].astype(str).str.startswith(prefix), + "following_purpose", ] = activity_type df_trips.loc[ - df_trips["V2_MMOTIFORI"].astype(str).str.startswith(prefix), "preceding_purpose" + df_trips["V2_MMOTIFORI"].astype(str).str.startswith(prefix), + "preceding_purpose", ] = activity_type df_trips["following_purpose"] = df_trips["following_purpose"].astype("category") @@ -187,15 +265,17 @@ def execute(context): df_trips["mode"] = "pt" for prefix, mode in MODES_MAP: - df_trips.loc[ - df_trips["V2_MTP"].astype(str).str.startswith(prefix), "mode" - ] = mode + df_trips.loc[df_trips["V2_MTP"].astype(str).str.startswith(prefix), "mode"] = ( + mode + ) df_trips["mode"] = df_trips["mode"].astype("category") # Further trip attributes df_trips["routed_distance"] = df_trips["V2_MDISTTOT"] * 1000.0 - df_trips["routed_distance"] = df_trips["routed_distance"].fillna(0.0) # This should be just one within Île-de-France + df_trips["routed_distance"] = df_trips["routed_distance"].fillna( + 0.0 + ) # This should be just one within Île-de-France # Only leave weekday trips f = df_trips["V2_TYPJOUR"] == 1 @@ -205,10 +285,14 @@ def execute(context): 
# Only leave one day per person initial_count = len(df_trips) - df_first_day = df_trips[["person_id", "IDENT_JOUR"]].sort_values( - by = ["person_id", "IDENT_JOUR"] - ).drop_duplicates("person_id") - df_trips = pd.merge(df_trips, df_first_day, how = "inner", on = ["person_id", "IDENT_JOUR"]) + df_first_day = ( + df_trips[["person_id", "IDENT_JOUR"]] + .sort_values(by=["person_id", "IDENT_JOUR"]) + .drop_duplicates("person_id") + ) + df_trips = pd.merge( + df_trips, df_first_day, how="inner", on=["person_id", "IDENT_JOUR"] + ) final_count = len(df_trips) print("Removed %d trips for non-primary days" % (initial_count - final_count)) @@ -217,7 +301,9 @@ def execute(context): df_trips = hts.compute_first_last(df_trips) # Trip times - df_trips["departure_time"] = df_trips["V2_MORIHDEP"].apply(convert_time).astype(float) + df_trips["departure_time"] = ( + df_trips["V2_MORIHDEP"].apply(convert_time).astype(float) + ) df_trips["arrival_time"] = df_trips["V2_MDESHARR"].apply(convert_time).astype(float) df_trips = hts.fix_trip_times(df_trips) @@ -230,11 +316,17 @@ def execute(context): # Chain length df_persons = pd.merge( - df_persons, df_trips[["person_id", "NDEP"]].drop_duplicates("person_id").rename(columns = { "NDEP": "number_of_trips" }), - on = "person_id", how = "left" + df_persons, + df_trips[["person_id", "NDEP"]] + .drop_duplicates("person_id") + .rename(columns={"NDEP": "number_of_trips"}), + on="person_id", + how="left", ) df_persons["number_of_trips"] = df_persons["number_of_trips"].fillna(-1).astype(int) - df_persons.loc[(df_persons["number_of_trips"] == -1) & df_persons["is_kish"], "number_of_trips"] = 0 + df_persons.loc[ + (df_persons["number_of_trips"] == -1) & df_persons["is_kish"], "number_of_trips" + ] = 0 # Passenger attribute df_persons["is_passenger"] = df_persons["person_id"].isin( @@ -243,18 +335,23 @@ def execute(context): # Calculate consumption units hts.check_household_size(df_households, df_persons) - df_households = pd.merge(df_households, hts.calculate_consumption_units(df_persons), on = "household_id") + df_households = pd.merge( + df_households, hts.calculate_consumption_units(df_persons), on="household_id" + ) # Socioprofessional class - df_persons["socioprofessional_class"] = df_persons["CS24"].fillna(80).astype(int) // 10 + df_persons["socioprofessional_class"] = ( + df_persons["CS24"].fillna(80).astype(int) // 10 + ) # Fix activity types (because of 1 inconsistent ENTD data) hts.fix_activity_types(df_trips) return df_households, df_persons, df_trips + def calculate_income_class(df): assert "household_income" in df assert "consumption_units" in df - return np.digitize(df["household_income"], INCOME_CLASS_BOUNDS, right = True) + return np.digitize(df["household_income"], INCOME_CLASS_BOUNDS, right=True) diff --git a/data/hts/entd/filtered.py b/data/hts/entd/filtered.py index e9bb2ca1..71d36485 100644 --- a/data/hts/entd/filtered.py +++ b/data/hts/entd/filtered.py @@ -6,17 +6,20 @@ Île-de-France. 
""" + def configure(context): context.stage("data.hts.entd.cleaned") context.stage("data.spatial.codes") - context.config("filter_hts",True) + context.config("filter_hts", True) + + def execute(context): - filter_entd = context.config("filter_hts") + filter_entd = context.config("filter_hts") df_codes = context.stage("data.spatial.codes") df_households, df_persons, df_trips = context.stage("data.hts.entd.cleaned") - if filter_entd : + if filter_entd: # Filter for non-residents requested_departments = df_codes["departement_id"].unique() f = df_persons["departement_id"].astype(str).isin(requested_departments) @@ -25,18 +28,31 @@ def execute(context): # Filter for people going outside of the area (because they have NaN distances) remove_ids = set() - remove_ids |= set(df_trips[ - ~df_trips["origin_departement_id"].astype(str).isin(requested_departments) | ~df_trips["destination_departement_id"].astype(str).isin(requested_departments) - ]["person_id"].unique()) + remove_ids |= set( + df_trips[ + ~df_trips["origin_departement_id"] + .astype(str) + .isin(requested_departments) + | ~df_trips["destination_departement_id"] + .astype(str) + .isin(requested_departments) + ]["person_id"].unique() + ) df_persons = df_persons[~df_persons["person_id"].isin(remove_ids)] # Only keep trips and households that still have a person - df_trips = df_trips[df_trips["person_id"].isin(df_persons["person_id"].unique())] - df_households = df_households[df_households["household_id"].isin(df_persons["household_id"])] + df_trips = df_trips[ + df_trips["person_id"].isin(df_persons["person_id"].unique()) + ] + df_households = df_households[ + df_households["household_id"].isin(df_persons["household_id"]) + ] # Finish up - df_households = df_households[hts.HOUSEHOLD_COLUMNS + ["urban_type", "income_class"]] + df_households = df_households[ + hts.HOUSEHOLD_COLUMNS + ["urban_type", "income_class"] + ] df_persons = df_persons[hts.PERSON_COLUMNS] df_trips = df_trips[hts.TRIP_COLUMNS + ["routed_distance"]] diff --git a/data/hts/entd/raw.py b/data/hts/entd/raw.py index f4bdd91a..16b1ab85 100644 --- a/data/hts/entd/raw.py +++ b/data/hts/entd/raw.py @@ -7,80 +7,132 @@ """ Q_MENAGE_COLUMNS = [ - "DEP", "idENT_MEN", "PONDV1", "RG", + "DEP", + "idENT_MEN", + "PONDV1", + "RG", "V1_JNBVELOADT", - "V1_JNBVEH", "V1_JNBMOTO", "V1_JNBCYCLO" + "V1_JNBVEH", + "V1_JNBMOTO", + "V1_JNBCYCLO", ] Q_TCM_MENAGE_COLUMNS = [ - "NPERS", "PONDV1", "TrancheRevenuMensuel", - "DEP", "idENT_MEN", "RG", "numcom_UU2010" + "NPERS", + "PONDV1", + "TrancheRevenuMensuel", + "DEP", + "idENT_MEN", + "RG", + "numcom_UU2010", ] Q_INDIVIDU_COLUMNS = [ - "IDENT_IND", "idENT_MEN", - "RG", "V1_GPERMIS", "V1_ICARTABON", - "V1_GPERMIS2R" + "IDENT_IND", + "idENT_MEN", + "RG", + "V1_GPERMIS", + "V1_ICARTABON", + "V1_GPERMIS2R", ] Q_TCM_INDIVIDU_COLUMNS = [ - "AGE", "ETUDES", "IDENT_IND", "IDENT_MEN", - "PONDV1", "CS24", "SEXE", "DEP", "SITUA", + "AGE", + "ETUDES", + "IDENT_IND", + "IDENT_MEN", + "PONDV1", + "CS24", + "SEXE", + "DEP", + "SITUA", ] K_DEPLOC_COLUMNS = [ - "IDENT_IND", "V2_MMOTIFDES", "V2_MMOTIFORI", - "V2_TYPJOUR", "V2_MORIHDEP", "V2_MDESHARR", "V2_MDISTTOT", - "IDENT_JOUR", "V2_MTP", - "V2_MDESDEP", "V2_MORIDEP", "NDEP", "V2_MOBILREF", - "PONDKI" + "IDENT_IND", + "V2_MMOTIFDES", + "V2_MMOTIFORI", + "V2_TYPJOUR", + "V2_MORIHDEP", + "V2_MDESHARR", + "V2_MDISTTOT", + "IDENT_JOUR", + "V2_MTP", + "V2_MDESDEP", + "V2_MORIDEP", + "NDEP", + "V2_MOBILREF", + "PONDKI", ] + def configure(context): context.config("data_path") + def execute(context): df_individu = 
pd.read_csv( "%s/entd_2008/Q_individu.csv" % context.config("data_path"), - sep = ";", encoding = "latin1", usecols = Q_INDIVIDU_COLUMNS, - dtype = { "DEP": str } + sep=";", + encoding="latin1", + usecols=Q_INDIVIDU_COLUMNS, + dtype={"DEP": str}, ) df_tcm_individu = pd.read_csv( "%s/entd_2008/Q_tcm_individu.csv" % context.config("data_path"), - sep = ";", encoding = "latin1", usecols = Q_TCM_INDIVIDU_COLUMNS, - dtype = { "DEP": str } + sep=";", + encoding="latin1", + usecols=Q_TCM_INDIVIDU_COLUMNS, + dtype={"DEP": str}, ) df_menage = pd.read_csv( "%s/entd_2008/Q_menage.csv" % context.config("data_path"), - sep = ";", encoding = "latin1", usecols = Q_MENAGE_COLUMNS, - dtype = { "DEP": str } + sep=";", + encoding="latin1", + usecols=Q_MENAGE_COLUMNS, + dtype={"DEP": str}, ) df_tcm_menage = pd.read_csv( "%s/entd_2008/Q_tcm_menage_0.csv" % context.config("data_path"), - sep = ";", encoding = "latin1", usecols = Q_TCM_MENAGE_COLUMNS, - dtype = { "DEP": str } + sep=";", + encoding="latin1", + usecols=Q_TCM_MENAGE_COLUMNS, + dtype={"DEP": str}, ) df_deploc = pd.read_csv( "%s/entd_2008/K_deploc.csv" % context.config("data_path"), - sep = ";", encoding = "latin1", usecols = K_DEPLOC_COLUMNS, - dtype = { "DEP": str, "V2_MTP": str } + sep=";", + encoding="latin1", + usecols=K_DEPLOC_COLUMNS, + dtype={"DEP": str, "V2_MTP": str}, ) return df_individu, df_tcm_individu, df_menage, df_tcm_menage, df_deploc + def validate(context): - for name in ("Q_individu.csv", "Q_tcm_individu.csv", "Q_menage.csv", "Q_tcm_menage_0.csv", "K_deploc.csv"): + for name in ( + "Q_individu.csv", + "Q_tcm_individu.csv", + "Q_menage.csv", + "Q_tcm_menage_0.csv", + "K_deploc.csv", + ): if not os.path.exists("%s/entd_2008/%s" % (context.config("data_path"), name)): raise RuntimeError("File missing from ENTD: %s" % name) return [ os.path.getsize("%s/entd_2008/Q_individu.csv" % context.config("data_path")), - os.path.getsize("%s/entd_2008/Q_tcm_individu.csv" % context.config("data_path")), + os.path.getsize( + "%s/entd_2008/Q_tcm_individu.csv" % context.config("data_path") + ), os.path.getsize("%s/entd_2008/Q_menage.csv" % context.config("data_path")), - os.path.getsize("%s/entd_2008/Q_tcm_menage_0.csv" % context.config("data_path")), - os.path.getsize("%s/entd_2008/K_deploc.csv" % context.config("data_path")) + os.path.getsize( + "%s/entd_2008/Q_tcm_menage_0.csv" % context.config("data_path") + ), + os.path.getsize("%s/entd_2008/K_deploc.csv" % context.config("data_path")), ] diff --git a/data/hts/entd/reweighted.py b/data/hts/entd/reweighted.py index 517a3ca9..2367e68d 100644 --- a/data/hts/entd/reweighted.py +++ b/data/hts/entd/reweighted.py @@ -1,8 +1,10 @@ import numpy as np + def configure(context): context.stage("data.hts.entd.filtered") + def execute(context): df_households, df_persons, df_trips = context.stage("data.hts.entd.filtered") diff --git a/data/hts/hts.py b/data/hts/hts.py index 86bc0365..59ce73e7 100644 --- a/data/hts/hts.py +++ b/data/hts/hts.py @@ -1,6 +1,7 @@ import pandas as pd import numpy as np + def swap_departure_arrival_times(df, f): assert "arrival_time" in df assert "departure_time" in df @@ -11,6 +12,7 @@ def swap_departure_arrival_times(df, f): df.loc[f, "departure_time"] = arrival_times df.loc[f, "arrival_time"] = departure_times + def fix_trip_times(df_trips): """ - Negative duration: @@ -22,7 +24,16 @@ def fix_trip_times(df_trips): - Intresecting trips """ - columns = ["trip_id", "person_id", "departure_time", "arrival_time", "preceding_purpose", "following_purpose", "is_first_trip", 
"is_last_trip"] + columns = [ + "trip_id", + "person_id", + "departure_time", + "arrival_time", + "preceding_purpose", + "following_purpose", + "is_first_trip", + "is_last_trip", + ] df_main = df_trips df_next = df_main.shift(-1) df_previous = df_main.shift(1) @@ -33,9 +44,16 @@ def fix_trip_times(df_trips): # 1.1) Departure and arrival time may have been swapped, and chain is consistent f_swap = np.copy(f_negative) - f_swap &= (df_main["arrival_time"] > df_previous["arrival_time"]) | df_main["is_first_trip"] - f_swap &= (df_main["departure_time"] < df_next["departure_time"]) | df_main["is_last_trip"] - print(" of which %d can swap departure and arrival time without conflicts with previous or following trip" % np.count_nonzero(f_swap)) + f_swap &= (df_main["arrival_time"] > df_previous["arrival_time"]) | df_main[ + "is_first_trip" + ] + f_swap &= (df_main["departure_time"] < df_next["departure_time"]) | df_main[ + "is_last_trip" + ] + print( + " of which %d can swap departure and arrival time without conflicts with previous or following trip" + % np.count_nonzero(f_swap) + ) swap_departure_arrival_times(df_main, f_swap) f_negative[f_swap] = False @@ -44,13 +62,19 @@ def fix_trip_times(df_trips): # However, the offset duration is unlikely to be a trip over midnight offset = df_main["departure_time"] - df_main["arrival_time"] f_swap = (offset > 0) & (offset < 10 * 3600) - print(" of which %d are unlikely to cover midnight, so we swap arrival and departure time although there are conflicts" % np.count_nonzero(f_swap)) + print( + " of which %d are unlikely to cover midnight, so we swap arrival and departure time although there are conflicts" + % np.count_nonzero(f_swap) + ) swap_departure_arrival_times(df_main, f_swap) f_negative[f_swap] = False # 1.3) Covering midnight -> Shift arrival time - print(" of which %d seem to cover midnight, so we shift arrival time by 24h" % np.count_nonzero(f_negative)) + print( + " of which %d seem to cover midnight, so we shift arrival time by 24h" + % np.count_nonzero(f_negative) + ) df_main.loc[f_negative, "arrival_time"] += 24 * 3600.0 # 2) Current trip is after following trip @@ -83,10 +107,16 @@ def fix_trip_times(df_trips): # Intersecting trips f = ~df_main["is_last_trip"] f &= df_main["arrival_time"] > df_next["departure_time"] - print("Found %d occurences where current trip ends after next trip starts" % np.count_nonzero(f)) + print( + "Found %d occurences where current trip ends after next trip starts" + % np.count_nonzero(f) + ) f &= df_main["departure_time"] <= df_next["departure_time"] - print(" of which we're able to shorten %d to make it consistent" % np.count_nonzero(f)) + print( + " of which we're able to shorten %d to make it consistent" + % np.count_nonzero(f) + ) df_main.loc[f, "arrival_time"] = df_next["departure_time"] # Included trips (moving the first one to the start of the following trip and setting duration to zero) @@ -95,10 +125,14 @@ def fix_trip_times(df_trips): f &= df_main["arrival_time"] <= df_next["arrival_time"] df_main.loc[f, "departure_time"] = df_next["departure_time"] df_main.loc[f, "arrival_time"] = df_next["departure_time"] - print("Found %d occurences where current trip is included in next trip" % np.count_nonzero(f)) + print( + "Found %d occurences where current trip is included in next trip" + % np.count_nonzero(f) + ) return df_main + def check_trip_times(df_trips): print("Validating trip times...") any_errors = False @@ -168,31 +202,43 @@ def check_trip_times(df_trips): print(" => All trip times are consistent!") 
return True + def fix_activity_types(df_trips): - f = (df_trips["preceding_purpose"] != df_trips["following_purpose"].shift(1)) & ~df_trips["is_first_trip"] - df_trips.loc[f, "preceding_purpose"] = df_trips.shift(1)["following_purpose"][f].values + f = ( + df_trips["preceding_purpose"] != df_trips["following_purpose"].shift(1) + ) & ~df_trips["is_first_trip"] + df_trips.loc[f, "preceding_purpose"] = df_trips.shift(1)["following_purpose"][ + f + ].values print("Fixing %d inconsistent activity types" % np.count_nonzero(f)) check_activity_types(df_trips) + def check_activity_types(df_trips): - f = (df_trips["following_purpose"] != df_trips["preceding_purpose"].shift(-1)) & ~df_trips["is_last_trip"] - f |= (df_trips["following_purpose"].shift(1) != df_trips["preceding_purpose"]) & ~df_trips["is_first_trip"] + f = ( + df_trips["following_purpose"] != df_trips["preceding_purpose"].shift(-1) + ) & ~df_trips["is_last_trip"] + f |= ( + df_trips["following_purpose"].shift(1) != df_trips["preceding_purpose"] + ) & ~df_trips["is_first_trip"] error_count = np.count_nonzero(f) print("Trips with inconsistent activity types: %d" % error_count) return error_count == 0 + def compute_first_last(df_trips): assert "person_id" in df_trips - df_trips = df_trips.sort_values(by = ["person_id", "trip_id"]) + df_trips = df_trips.sort_values(by=["person_id", "trip_id"]) df_trips["is_first_trip"] = df_trips["person_id"].ne(df_trips["person_id"].shift(1)) df_trips["is_last_trip"] = df_trips["person_id"].ne(df_trips["person_id"].shift(-1)) return df_trips + def compute_activity_duration(df_trips): assert "departure_time" in df_trips assert "arrival_time" in df_trips @@ -201,13 +247,17 @@ def compute_activity_duration(df_trips): df_trips["activity_duration"] = df_next["departure_time"] - df_trips["arrival_time"] df_trips.loc[df_trips["is_last_trip"], "activity_duration"] = np.nan + def check_household_size(df_households, df_persons): - df_size = df_persons.groupby("household_id").size().reset_index(name = "count") - df_size = pd.merge(df_households[["household_id", "household_size"]], df_size, on = "household_id") + df_size = df_persons.groupby("household_id").size().reset_index(name="count") + df_size = pd.merge( + df_households[["household_id", "household_size"]], df_size, on="household_id" + ) assert len(df_size) == len(df_households) assert (df_size["household_size"] == df_size["count"]).all() + def calculate_consumption_units(df_persons): df_units = df_persons[["household_id", "age"]].copy() df_units["under_14"] = df_units["age"] < 14 @@ -220,28 +270,52 @@ def calculate_consumption_units(df_persons): return df_units[["household_id", "consumption_units"]] + HOUSEHOLD_COLUMNS = [ - "household_id", "household_weight", "household_size", - "number_of_vehicles", "number_of_bikes", "departement_id", - "consumption_units", # "income_class" + "household_id", + "household_weight", + "household_size", + "number_of_vehicles", + "number_of_bikes", + "departement_id", + "consumption_units", # "income_class" ] PERSON_COLUMNS = [ - "person_id", "household_id", "person_weight", - "age", "sex", "employed", "studies", - "has_license", "has_pt_subscription", - "number_of_trips", "departement_id", "trip_weight", - "is_passenger", "socioprofessional_class" + "person_id", + "household_id", + "person_weight", + "age", + "sex", + "employed", + "studies", + "has_license", + "has_pt_subscription", + "number_of_trips", + "departement_id", + "trip_weight", + "is_passenger", + "socioprofessional_class", ] TRIP_COLUMNS = [ - "person_id", 
"trip_id", "trip_weight", - "departure_time", "arrival_time", - "trip_duration", "activity_duration", - "following_purpose", "preceding_purpose", "is_last_trip", "is_first_trip", - "mode", "origin_departement_id", "destination_departement_id" + "person_id", + "trip_id", + "trip_weight", + "departure_time", + "arrival_time", + "trip_duration", + "activity_duration", + "following_purpose", + "preceding_purpose", + "is_last_trip", + "is_first_trip", + "mode", + "origin_departement_id", + "destination_departement_id", ] + def check(df_households, df_persons, df_trips): assert check_trip_times(df_trips) assert check_activity_types(df_trips) diff --git a/data/hts/output.py b/data/hts/output.py index cee14cad..1ee0eca3 100644 --- a/data/hts/output.py +++ b/data/hts/output.py @@ -9,23 +9,34 @@ pipeline. """ + def configure(context): context.stage("data.hts.selected") context.config("output_path") context.config("output_prefix", "ile_de_france_") + def execute(context): df_households, df_persons, df_trips = context.stage("data.hts.selected") - df_households.to_csv("%s/%shts_households.csv" % ( - context.config("output_path"), context.config("output_prefix") - ), sep = ";", index = False) + df_households.to_csv( + "%s/%shts_households.csv" + % (context.config("output_path"), context.config("output_prefix")), + sep=";", + index=False, + ) - df_persons.to_csv("%s/%shts_persons.csv" % ( - context.config("output_path"), context.config("output_prefix") - ), sep = ";", index = False) + df_persons.to_csv( + "%s/%shts_persons.csv" + % (context.config("output_path"), context.config("output_prefix")), + sep=";", + index=False, + ) - df_trips.to_csv("%s/%shts_trips.csv" % ( - context.config("output_path"), context.config("output_prefix") - ), sep = ";", index = False) + df_trips.to_csv( + "%s/%shts_trips.csv" + % (context.config("output_path"), context.config("output_prefix")), + sep=";", + index=False, + ) diff --git a/data/hts/selected.py b/data/hts/selected.py index d5c5bd43..1832fbc7 100644 --- a/data/hts/selected.py +++ b/data/hts/selected.py @@ -1,19 +1,21 @@ import pandas as pd import numpy as np + def configure(context): hts = context.config("hts") if hts == "egt": - context.stage("data.hts.egt.filtered", alias = "hts") + context.stage("data.hts.egt.filtered", alias="hts") elif hts == "entd": - context.stage("data.hts.entd.reweighted", alias = "hts") + context.stage("data.hts.entd.reweighted", alias="hts") elif hts == "edgt_lyon": - context.stage("data.hts.edgt_lyon.reweighted", alias = "hts") + context.stage("data.hts.edgt_lyon.reweighted", alias="hts") elif hts == "edgt_44": - context.stage("data.hts.edgt_44.reweighted", alias = "hts") + context.stage("data.hts.edgt_44.reweighted", alias="hts") else: raise RuntimeError("Unknown HTS: %s" % hts) + def execute(context): return context.stage("hts") diff --git a/data/income/municipality.py b/data/income/municipality.py index 7bf65015..76fbb2f2 100644 --- a/data/income/municipality.py +++ b/data/income/municipality.py @@ -19,21 +19,44 @@ EQASIM_INCOME_ATTRIBUTES = ["size", "family_comp"] # final columns of the income DataFrame -INCOME_DF_COLUMNS = ["commune_id", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "attribute", "value", "is_imputed", "is_missing", "reference_median"] +INCOME_DF_COLUMNS = [ + "commune_id", + "q1", + "q2", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "attribute", + "value", + "is_imputed", + "is_missing", + "reference_median", +] def configure(context): context.config("data_path") 
context.stage("data.spatial.municipalities") - context.config("income_com_path", "filosofi_2019/indic-struct-distrib-revenu-2019-COMMUNES.zip") + context.config( + "income_com_path", "filosofi_2019/indic-struct-distrib-revenu-2019-COMMUNES.zip" + ) context.config("income_com_xlsx", "FILO2019_DISP_COM.xlsx") context.config("income_year", 19) -def _income_distributions_from_filosofi_ensemble_sheet(filsofi_sheets, year, df_municipalities): +def _income_distributions_from_filosofi_ensemble_sheet( + filsofi_sheets, year, df_municipalities +): requested_communes = set(df_municipalities["commune_id"].unique()) - df = filsofi_sheets["ENSEMBLE"][["CODGEO"] + [("D%d" % q) + year if q != 5 else "Q2" + year for q in range(1, 10)]] + df = filsofi_sheets["ENSEMBLE"][ + ["CODGEO"] + + [("D%d" % q) + year if q != 5 else "Q2" + year for q in range(1, 10)] + ] df.columns = ["commune_id", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9"] df.loc[:, "reference_median"] = df["q5"].values @@ -42,13 +65,21 @@ def _income_distributions_from_filosofi_ensemble_sheet(filsofi_sheets, year, df_ # Find communes without data df["commune_id"] = df["commune_id"].astype("category") - missing_communes = set(df_municipalities["commune_id"].unique()) - set(df["commune_id"].unique()) - print("Found %d/%d municipalities that are missing" % (len(missing_communes), len(requested_communes))) + missing_communes = set(df_municipalities["commune_id"].unique()) - set( + df["commune_id"].unique() + ) + print( + "Found %d/%d municipalities that are missing" + % (len(missing_communes), len(requested_communes)) + ) # Find communes without full distribution df["is_imputed"] = df["q2"].isna() df["is_missing"] = False - print("Found %d/%d municipalities which do not have full distribution" % (sum(df["is_imputed"]), len(requested_communes))) + print( + "Found %d/%d municipalities which do not have full distribution" + % (sum(df["is_imputed"]), len(requested_communes)) + ) # First, find suitable distribution for incomplete cases by finding the one with the most similar median incomplete_medians = df[df["is_imputed"]]["q5"].values @@ -56,29 +87,44 @@ def _income_distributions_from_filosofi_ensemble_sheet(filsofi_sheets, year, df_ df_complete = df[~df["is_imputed"]] complete_medians = df_complete["q5"].values - indices = np.argmin(np.abs(complete_medians[:, np.newaxis] - incomplete_medians[np.newaxis, :]), axis = 0) + indices = np.argmin( + np.abs(complete_medians[:, np.newaxis] - incomplete_medians[np.newaxis, :]), + axis=0, + ) for k in range(1, 10): - df.loc[df["is_imputed"], "q%d" % k] = df_complete.iloc[indices]["q%d" % k].values + df.loc[df["is_imputed"], "q%d" % k] = df_complete.iloc[indices][ + "q%d" % k + ].values # Second, add missing municipalities by neirest neighbor # ... build tree of existing communes - df_existing = df_municipalities[df_municipalities["commune_id"].astype(str).isin(df["commune_id"])] # pandas Bug - coordinates = np.vstack([df_existing["geometry"].centroid.x, df_existing["geometry"].centroid.y]).T + df_existing = df_municipalities[ + df_municipalities["commune_id"].astype(str).isin(df["commune_id"]) + ] # pandas Bug + coordinates = np.vstack( + [df_existing["geometry"].centroid.x, df_existing["geometry"].centroid.y] + ).T kd_tree = KDTree(coordinates) # ... 
query tree for missing communes - df_missing = df_municipalities[df_municipalities["commune_id"].astype(str).isin(missing_communes)] # pandas Bug + df_missing = df_municipalities[ + df_municipalities["commune_id"].astype(str).isin(missing_communes) + ] # pandas Bug if len(df_missing) > 0: - coordinates = np.vstack([df_missing["geometry"].centroid.x, df_missing["geometry"].centroid.y]).T + coordinates = np.vstack( + [df_missing["geometry"].centroid.x, df_missing["geometry"].centroid.y] + ).T indices = kd_tree.query(coordinates)[1].flatten() # ... build data frame of imputed communes - df_reconstructed = pd.concat([ - df[df["commune_id"] == df_existing.iloc[index]["commune_id"]] - for index in indices - ]) + df_reconstructed = pd.concat( + [ + df[df["commune_id"] == df_existing.iloc[index]["commune_id"]] + for index in indices + ] + ) df_reconstructed["commune_id"] = df_missing["commune_id"].values df_reconstructed["is_imputed"] = True df_reconstructed["is_missing"] = True @@ -97,11 +143,15 @@ def _income_distributions_from_filosofi_ensemble_sheet(filsofi_sheets, year, df_ return df[INCOME_DF_COLUMNS] -def _income_distributions_from_filosofi_attribute_sheets(filsofi_sheets, year, df_municipalities, attributes): +def _income_distributions_from_filosofi_attribute_sheets( + filsofi_sheets, year, df_municipalities, attributes +): requested_communes = set(df_municipalities["commune_id"].unique()) # read attributes - df_with_attributes = read_filosofi_attributes(filsofi_sheets, year, attributes, requested_communes) + df_with_attributes = read_filosofi_attributes( + filsofi_sheets, year, attributes, requested_communes + ) df_with_attributes.rename( columns={ @@ -139,8 +189,8 @@ def _read_filosofi_excel(context): sheet_list = sheet_list + [x["sheet"] for x in attr["modalities"]] # open and read income data file - with zipfile.ZipFile("{}/{}".format( - context.config("data_path"), context.config("income_com_path")) + with zipfile.ZipFile( + "{}/{}".format(context.config("data_path"), context.config("income_com_path")) ) as archive: with archive.open(context.config("income_com_xlsx")) as f: df = pd.read_excel(f, sheet_name=sheet_list, skiprows=5) @@ -159,17 +209,25 @@ def execute(context): filosofi_excel, attributes = _read_filosofi_excel(context) # Read ENSEMBLE sheet: global distributions, by commune - ensemble_distributions = _income_distributions_from_filosofi_ensemble_sheet(filosofi_excel, year, df_municipalities) + ensemble_distributions = _income_distributions_from_filosofi_ensemble_sheet( + filosofi_excel, year, df_municipalities + ) # Read attribute sheets: distributions on individuals with specific attribute values # (ex: sheet TYPMENR_2 corresponds to households with `family_comp`=`Single_wom`) - attribute_distributions = _income_distributions_from_filosofi_attribute_sheets(filosofi_excel, year, df_municipalities, attributes) + attribute_distributions = _income_distributions_from_filosofi_attribute_sheets( + filosofi_excel, year, df_municipalities, attributes + ) return pd.concat([ensemble_distributions, attribute_distributions]) def validate(context): - if not os.path.exists("%s/%s" % (context.config("data_path"), context.config("income_com_path"))): + if not os.path.exists( + "%s/%s" % (context.config("data_path"), context.config("income_com_path")) + ): raise RuntimeError("Municipality Filosofi data is not available") - return os.path.getsize("%s/%s" % (context.config("data_path"), context.config("income_com_path"))) + return os.path.getsize( + "%s/%s" % (context.config("data_path"), 
context.config("income_com_path")) + ) diff --git a/data/income/region.py b/data/income/region.py index 29643d0c..bb062ae3 100644 --- a/data/income/region.py +++ b/data/income/region.py @@ -6,19 +6,22 @@ Loads the regional aggregated income distribution. """ + def configure(context): context.config("data_path") - context.config("income_reg_path", "filosofi_2019/indic-struct-distrib-revenu-2019-SUPRA.zip") + context.config( + "income_reg_path", "filosofi_2019/indic-struct-distrib-revenu-2019-SUPRA.zip" + ) context.config("income_reg_xlsx", "FILO2019_DISP_REG.xlsx") context.config("income_year", 19) + def execute(context): - with zipfile.ZipFile("{}/{}".format( - context.config("data_path"), context.config("income_reg_path"))) as archive: + with zipfile.ZipFile( + "{}/{}".format(context.config("data_path"), context.config("income_reg_path")) + ) as archive: with archive.open(context.config("income_reg_xlsx")) as f: - df = pd.read_excel(f, - sheet_name = "ENSEMBLE", skiprows = 5 - ) + df = pd.read_excel(f, sheet_name="ENSEMBLE", skiprows=5) values = df[df["CODGEO"] == 11][ [ @@ -29,8 +32,13 @@ def execute(context): return values + def validate(context): - if not os.path.exists("%s/%s" % (context.config("data_path"), context.config("income_reg_path"))): + if not os.path.exists( + "%s/%s" % (context.config("data_path"), context.config("income_reg_path")) + ): raise RuntimeError("Regional Filosofi data is not available") - return os.path.getsize("%s/%s" % (context.config("data_path"), context.config("income_reg_path"))) + return os.path.getsize( + "%s/%s" % (context.config("data_path"), context.config("income_reg_path")) + ) diff --git a/data/od/cleaned.py b/data/od/cleaned.py index e13348f1..2690cab9 100644 --- a/data/od/cleaned.py +++ b/data/od/cleaned.py @@ -6,25 +6,34 @@ and education. 
""" + def configure(context): context.stage("data.od.raw") context.stage("data.spatial.codes") -RENAME = { "COMMUNE" : "origin_id", "DCLT" : "destination_id", "IPONDI" : "weight", "DCETUF" : "destination_id" } + +RENAME = { + "COMMUNE": "origin_id", + "DCLT": "destination_id", + "IPONDI": "weight", + "DCETUF": "destination_id", +} + def execute(context): - - + # Load data df_work, df_education = context.stage("data.od.raw") # Renaming - df_work = df_work.rename(RENAME, axis = 1) - df_education = df_education.rename(RENAME, axis = 1) + df_work = df_work.rename(RENAME, axis=1) + df_education = df_education.rename(RENAME, axis=1) # Fix arrondissements df_work.loc[~df_work["ARM"].str.contains("Z"), "origin_id"] = df_work["ARM"] - df_education.loc[~df_education["ARM"].str.contains("Z"), "origin_id"] = df_education["ARM"] + df_education.loc[~df_education["ARM"].str.contains("Z"), "origin_id"] = ( + df_education["ARM"] + ) # Verify spatial data for work df_codes = context.stage("data.spatial.codes") @@ -32,7 +41,9 @@ def execute(context): df_work["origin_id"] = df_work["origin_id"].astype("category") df_work["destination_id"] = df_work["destination_id"].astype("category") - excess_communes = (set(df_work["origin_id"].unique()) | set(df_work["destination_id"].unique())) - set(df_codes["commune_id"].unique()) + excess_communes = ( + set(df_work["origin_id"].unique()) | set(df_work["destination_id"].unique()) + ) - set(df_codes["commune_id"].unique()) if len(excess_communes) > 0: raise RuntimeError("Found additional communes: %s" % excess_communes) @@ -42,7 +53,10 @@ def execute(context): df_education["origin_id"] = df_education["origin_id"].astype("category") df_education["destination_id"] = df_education["destination_id"].astype("category") - excess_communes = (set(df_education["origin_id"].unique()) | set(df_education["destination_id"].unique())) - set(df_codes["commune_id"].unique()) + excess_communes = ( + set(df_education["origin_id"].unique()) + | set(df_education["destination_id"].unique()) + ) - set(df_codes["commune_id"].unique()) if len(excess_communes) > 0: raise RuntimeError("Found additional communes: %s" % excess_communes) @@ -55,7 +69,7 @@ def execute(context): df_work.loc[df_work["TRANS"] == 5, "commute_mode"] = "car" df_work.loc[df_work["TRANS"] == 6, "commute_mode"] = "pt" df_work["commute_mode"] = df_work["commute_mode"].astype("category") - + assert not np.any(df_work["commute_mode"].isna()) # Clean age range for education @@ -65,15 +79,23 @@ def execute(context): df_education.loc[df_education["AGEREV10"] == 15, "age_range"] = "high_school" df_education.loc[df_education["AGEREV10"] >= 18, "age_range"] = "higher_education" df_education["age_range"] = df_education["age_range"].astype("category") - + assert not np.any(df_education["age_range"].isna()) # Aggregate the flows print("Aggregating work ...") - df_work = df_work.groupby(["origin_id", "destination_id", "commute_mode"])["weight"].sum().reset_index() + df_work = ( + df_work.groupby(["origin_id", "destination_id", "commute_mode"])["weight"] + .sum() + .reset_index() + ) print("Aggregating education ...") - df_education = df_education.groupby(["origin_id", "destination_id","age_range"])["weight"].sum().reset_index() + df_education = ( + df_education.groupby(["origin_id", "destination_id", "age_range"])["weight"] + .sum() + .reset_index() + ) df_work["weight"] = df_work["weight"].fillna(0.0) df_education["weight"] = df_education["weight"].fillna(0.0) diff --git a/data/od/raw.py b/data/od/raw.py index 41bc515b..fb70cce9 100644 
--- a/data/od/raw.py +++ b/data/od/raw.py @@ -6,6 +6,7 @@ Loads raw OD data from French census data. """ + def configure(context): context.stage("data.spatial.codes") context.config("data_path") @@ -14,27 +15,34 @@ def configure(context): context.config("od_pro_csv", "FD_MOBPRO_2019.csv") context.config("od_sco_csv", "FD_MOBSCO_2019.csv") + def execute(context): df_codes = context.stage("data.spatial.codes") requested_communes = df_codes["commune_id"].unique() # First, load work - with context.progress(label = "Reading work flows ...") as progress: + with context.progress(label="Reading work flows ...") as progress: df_records = [] COLUMNS_DTYPES = { - "COMMUNE":"str", - "ARM":"str", - "TRANS":"int", - "IPONDI":"float", - "DCLT":"str" + "COMMUNE": "str", + "ARM": "str", + "TRANS": "int", + "IPONDI": "float", + "DCLT": "str", } with zipfile.ZipFile( - "{}/{}".format(context.config("data_path"), context.config("od_pro_path"))) as archive: + "{}/{}".format(context.config("data_path"), context.config("od_pro_path")) + ) as archive: with archive.open(context.config("od_pro_csv")) as f: - csv = pd.read_csv(f, usecols = COLUMNS_DTYPES.keys(), - dtype = COLUMNS_DTYPES, sep = ";",chunksize = 10240) + csv = pd.read_csv( + f, + usecols=COLUMNS_DTYPES.keys(), + dtype=COLUMNS_DTYPES, + sep=";", + chunksize=10240, + ) for df_chunk in csv: progress.update(len(df_chunk)) @@ -50,22 +58,28 @@ def execute(context): work = pd.concat(df_records) # Second, load education - with context.progress(label = "Reading education flows ...") as progress: + with context.progress(label="Reading education flows ...") as progress: df_records = [] COLUMNS_DTYPES = { - "COMMUNE":"str", - "ARM":"str", - "IPONDI":"float", - "DCETUF":"str", - "AGEREV10":"int" + "COMMUNE": "str", + "ARM": "str", + "IPONDI": "float", + "DCETUF": "str", + "AGEREV10": "int", } with zipfile.ZipFile( - "{}/{}".format(context.config("data_path"), context.config("od_sco_path"))) as archive: + "{}/{}".format(context.config("data_path"), context.config("od_sco_path")) + ) as archive: with archive.open(context.config("od_sco_csv")) as f: - csv = pd.read_csv(f, usecols = COLUMNS_DTYPES.keys(), - dtype = COLUMNS_DTYPES, sep = ";",chunksize = 10240) + csv = pd.read_csv( + f, + usecols=COLUMNS_DTYPES.keys(), + dtype=COLUMNS_DTYPES, + sep=";", + chunksize=10240, + ) for df_chunk in csv: progress.update(len(df_chunk)) @@ -84,13 +98,21 @@ def execute(context): def validate(context): - if not os.path.exists("%s/%s" % (context.config("data_path"), context.config("od_pro_path"))): + if not os.path.exists( + "%s/%s" % (context.config("data_path"), context.config("od_pro_path")) + ): raise RuntimeError("RP MOBPRO data is not available") - if not os.path.exists("%s/%s" % (context.config("data_path"), context.config("od_sco_path"))): + if not os.path.exists( + "%s/%s" % (context.config("data_path"), context.config("od_sco_path")) + ): raise RuntimeError("RP MOBSCO data is not available") return [ - os.path.getsize("%s/%s" % (context.config("data_path"), context.config("od_pro_path"))), - os.path.getsize("%s/%s" % (context.config("data_path"), context.config("od_sco_path"))) + os.path.getsize( + "%s/%s" % (context.config("data_path"), context.config("od_pro_path")) + ), + os.path.getsize( + "%s/%s" % (context.config("data_path"), context.config("od_sco_path")) + ), ] diff --git a/data/od/weighted.py b/data/od/weighted.py index f50702f6..e9c5e86e 100644 --- a/data/od/weighted.py +++ b/data/od/weighted.py @@ -9,13 +9,15 @@ Potential TODO: Do this by mode of transport! 
""" + def configure(context): context.stage("data.od.cleaned") context.stage("data.spatial.codes") - context.config("education_location_source","bpe") + context.config("education_location_source", "bpe") + -def fix_origins(df, commune_ids, purpose,category): +def fix_origins(df, commune_ids, purpose, category): existing_ids = set(np.unique(df["origin_id"])) missing_ids = commune_ids - existing_ids categories = set(np.unique(df[category])) @@ -23,14 +25,27 @@ def fix_origins(df, commune_ids, purpose,category): rows = [] for origin_id in missing_ids: for destination_id in commune_ids: - for category_name in categories : - rows.append((origin_id, destination_id, category_name, 1.0 if origin_id == destination_id else 0.0)) + for category_name in categories: + rows.append( + ( + origin_id, + destination_id, + category_name, + 1.0 if origin_id == destination_id else 0.0, + ) + ) print("Fixing %d origins for %s" % (len(missing_ids), purpose)) - return pd.concat([df, pd.DataFrame.from_records( - rows, columns = ["origin_id", "destination_id", category, "weight"] - )]).sort_values(["origin_id", "destination_id"]) + return pd.concat( + [ + df, + pd.DataFrame.from_records( + rows, columns=["origin_id", "destination_id", category, "weight"] + ), + ] + ).sort_values(["origin_id", "destination_id"]) + def execute(context): df_codes = context.stage("data.spatial.codes") @@ -40,22 +55,44 @@ def execute(context): df_work, df_education = context.stage("data.od.cleaned") # Add missing origins - df_work = fix_origins(df_work, commune_ids, "work","commute_mode") - df_education = fix_origins(df_education, commune_ids, "education","age_range") + df_work = fix_origins(df_work, commune_ids, "work", "commute_mode") + df_education = fix_origins(df_education, commune_ids, "education", "age_range") # Aggregate work (we do not consider different modes at the moment) - df_work = df_work[["origin_id", "destination_id", "weight"]].groupby(["origin_id", "destination_id"]).sum().reset_index() - + df_work = ( + df_work[["origin_id", "destination_id", "weight"]] + .groupby(["origin_id", "destination_id"]) + .sum() + .reset_index() + ) + # Compute totals - df_total = df_work[["origin_id", "weight"]].groupby("origin_id").sum().reset_index().rename({ "weight" : "total" }, axis = 1) - df_work = pd.merge(df_work, df_total, on = "origin_id") + df_total = ( + df_work[["origin_id", "weight"]] + .groupby("origin_id") + .sum() + .reset_index() + .rename({"weight": "total"}, axis=1) + ) + df_work = pd.merge(df_work, df_total, on="origin_id") + + df_total = ( + df_education[["origin_id", "age_range", "weight"]] + .groupby(["origin_id", "age_range"]) + .sum() + .reset_index() + .rename({"weight": "total"}, axis=1) + ) + df_education = pd.merge(df_education, df_total, on=["origin_id", "age_range"]) - df_total = df_education[["origin_id","age_range", "weight"]].groupby(["origin_id","age_range"]).sum().reset_index().rename({ "weight" : "total" }, axis = 1) - df_education = pd.merge(df_education, df_total, on = ["origin_id","age_range"]) - - if context.config("education_location_source") == 'bpe': + if context.config("education_location_source") == "bpe": # Aggregate education (we do not consider different age range with bpe source) - df_education = df_education[["origin_id", "destination_id", "weight","total"]].groupby(["origin_id", "destination_id"]).sum().reset_index() + df_education = ( + df_education[["origin_id", "destination_id", "weight", "total"]] + .groupby(["origin_id", "destination_id"]) + .sum() + .reset_index() + ) # Compute 
weight df_work["weight"] /= df_work["total"] df_education["weight"] /= df_education["total"] @@ -63,5 +100,5 @@ def execute(context): del df_work["total"] del df_education["total"] df_education = df_education.fillna(0.0) - + return df_work, df_education diff --git a/data/osm/cleaned.py b/data/osm/cleaned.py index c15de109..f7fb95cf 100644 --- a/data/osm/cleaned.py +++ b/data/osm/cleaned.py @@ -18,6 +18,7 @@ Additionally, the stage cuts the OSM data to the requested region of the pipeline. """ + def configure(context): context.config("data_path") context.config("osm_path", "osm_idf") @@ -28,11 +29,12 @@ def configure(context): context.stage("data.osm.osmosis") context.stage("data.spatial.municipalities") -def write_poly(df, path, geometry_column = "geometry"): + +def write_poly(df, path, geometry_column="geometry"): df = df.to_crs("EPSG:4326") df["aggregate"] = 0 - area = df.dissolve(by = "aggregate")[geometry_column].values[0] + area = df.dissolve(by="aggregate")[geometry_column].values[0] if not hasattr(area, "exterior"): print("Selected area is not connected -> Using convex hull.") @@ -51,9 +53,12 @@ def write_poly(df, path, geometry_column = "geometry"): with open(path, "w+") as f: f.write("\n".join(data)) + def execute(context): - input_files = get_input_files("{}/{}".format(context.config("data_path"), context.config("osm_path"))) - + input_files = get_input_files( + "{}/{}".format(context.config("data_path"), context.config("osm_path")) + ) + # Prepare bounding area df_area = context.stage("data.spatial.municipalities") write_poly(df_area, "%s/boundary.poly" % context.path()) @@ -70,12 +75,22 @@ def execute(context): absolute_path = os.path.abspath(path) - data.osm.osmosis.run(context, [ - "--read-%s" % mode, absolute_path, - "--tag-filter", "accept-ways", "highway=%s" % highway_tags, "railway=%s" % railway_tags, - "--bounding-polygon", "file=%s/boundary.poly" % context.path(), "completeWays=yes", - "--write-pbf", "filtered_%d.osm.pbf" % index - ]) + data.osm.osmosis.run( + context, + [ + "--read-%s" % mode, + absolute_path, + "--tag-filter", + "accept-ways", + "highway=%s" % highway_tags, + "railway=%s" % railway_tags, + "--bounding-polygon", + "file=%s/boundary.poly" % context.path(), + "completeWays=yes", + "--write-pbf", + "filtered_%d.osm.pbf" % index, + ], + ) # Merge filtered files if there are multiple ones print("Merging and compressing OSM data...") @@ -98,17 +113,23 @@ def execute(context): return "output.osm.gz" + def get_input_files(base_path): osm_paths = sorted(list(glob.glob("{}/*.osm.pbf".format(base_path)))) osm_paths += sorted(list(glob.glob("{}/*.osm.xml".format(base_path)))) if len(osm_paths) == 0: - raise RuntimeError("Did not find any OSM data (.osm.pbf) in {}".format(base_path)) - + raise RuntimeError( + "Did not find any OSM data (.osm.pbf) in {}".format(base_path) + ) + return osm_paths + def validate(context): - input_files = get_input_files("{}/{}".format(context.config("data_path"), context.config("osm_path"))) + input_files = get_input_files( + "{}/{}".format(context.config("data_path"), context.config("osm_path")) + ) total_size = 0 for path in input_files: diff --git a/data/osm/osmosis.py b/data/osm/osmosis.py index 3913ddf4..39959d5e 100644 --- a/data/osm/osmosis.py +++ b/data/osm/osmosis.py @@ -1,15 +1,17 @@ import subprocess as sp import shutil, os + def configure(context): context.config("osmosis_binary", "osmosis") context.config("java_binary", "java") context.config("java_memory", "50G") -def run(context, arguments = [], cwd = None): + +def 
run(context, arguments=[], cwd=None): """ - This function calls osmosis. + This function calls osmosis. """ # Make sure there is a dependency context.stage("data.osm.osmosis") @@ -18,9 +20,7 @@ def run(context, arguments = [], cwd = None): cwd = context.path() # Prepare command line - command_line = [ - shutil.which(context.config("osmosis_binary")) - ] + arguments + command_line = [shutil.which(context.config("osmosis_binary"))] + arguments # Prepare environment environment = os.environ.copy() @@ -28,20 +28,23 @@ def run(context, arguments = [], cwd = None): environment["JAVACMD_OPTIONS"] = "-Xmx%s" % context.config("java_memory") # Run Osmosis - return_code = sp.check_call(command_line, cwd = cwd, env = environment) + return_code = sp.check_call(command_line, cwd=cwd, env=environment) if not return_code == 0: raise RuntimeError("Osmosis return code: %d" % return_code) + def validate(context): if shutil.which(context.config("osmosis_binary")) in ["", None]: - raise RuntimeError("Cannot find Osmosis binary at: %s" % context.config("osmosis_binary")) + raise RuntimeError( + "Cannot find Osmosis binary at: %s" % context.config("osmosis_binary") + ) - if not b"0.48." in sp.check_output([ - shutil.which(context.config("osmosis_binary")), - "-v" - ], stderr = sp.STDOUT): + if not b"0.48." in sp.check_output( + [shutil.which(context.config("osmosis_binary")), "-v"], stderr=sp.STDOUT + ): print("WARNING! Osmosis of at least version 0.48.x is recommended!") + def execute(context): pass diff --git a/data/sirene/cleaned.py b/data/sirene/cleaned.py index 9bef6da5..0c5ef575 100644 --- a/data/sirene/cleaned.py +++ b/data/sirene/cleaned.py @@ -4,26 +4,27 @@ """ Clean the SIRENE enterprise census. """ - + + def configure(context): - context.stage("data.sirene.raw_siren", ephemeral = True) - context.stage("data.sirene.raw_siret", ephemeral = True) + context.stage("data.sirene.raw_siren", ephemeral=True) + context.stage("data.sirene.raw_siret", ephemeral=True) context.stage("data.spatial.codes") context.config("exclude_no_employee", False) + def execute(context): df_sirene_establishments = context.stage("data.sirene.raw_siret") df_sirene_headquarters = context.stage("data.sirene.raw_siren") - # Filter out establishments without a corresponding headquarter - df_sirene = df_sirene_establishments[df_sirene_establishments["siren"].isin(df_sirene_headquarters["siren"])].copy() + df_sirene = df_sirene_establishments[ + df_sirene_establishments["siren"].isin(df_sirene_headquarters["siren"]) + ].copy() # Remove inactive enterprises - df_sirene = df_sirene[ - df_sirene["etatAdministratifEtablissement"] == "A" - ].copy() - + df_sirene = df_sirene[df_sirene["etatAdministratifEtablissement"] == "A"].copy() + if context.config("exclude_no_employee"): # exclude "NN", "00", and NaN df_sirene = df_sirene[ @@ -32,37 +33,93 @@ def execute(context): ].copy() # Define work place weights by person under salary .... 
- df_sirene["minimum_employees"] = 1 # Includes "NN", "00", and NaN - df_sirene["maximum_employees"] = 1 # Includes "NN", "00", and NaN - - df_sirene.loc[df_sirene["trancheEffectifsEtablissement"] == "01", "minimum_employees"] = 1 - df_sirene.loc[df_sirene["trancheEffectifsEtablissement"] == "01", "maximum_employees"] = 2 - df_sirene.loc[df_sirene["trancheEffectifsEtablissement"] == "02", "minimum_employees"] = 3 - df_sirene.loc[df_sirene["trancheEffectifsEtablissement"] == "02", "maximum_employees"] = 5 - df_sirene.loc[df_sirene["trancheEffectifsEtablissement"] == "03", "minimum_employees"] = 6 - df_sirene.loc[df_sirene["trancheEffectifsEtablissement"] == "03", "maximum_employees"] = 9 - df_sirene.loc[df_sirene["trancheEffectifsEtablissement"] == "11", "minimum_employees"] = 10 - df_sirene.loc[df_sirene["trancheEffectifsEtablissement"] == "11", "maximum_employees"] = 19 - df_sirene.loc[df_sirene["trancheEffectifsEtablissement"] == "12", "minimum_employees"] = 20 - df_sirene.loc[df_sirene["trancheEffectifsEtablissement"] == "12", "maximum_employees"] = 49 - df_sirene.loc[df_sirene["trancheEffectifsEtablissement"] == "21", "minimum_employees"] = 50 - df_sirene.loc[df_sirene["trancheEffectifsEtablissement"] == "21", "maximum_employees"] = 99 - df_sirene.loc[df_sirene["trancheEffectifsEtablissement"] == "22", "minimum_employees"] = 100 - df_sirene.loc[df_sirene["trancheEffectifsEtablissement"] == "22", "maximum_employees"] = 199 - df_sirene.loc[df_sirene["trancheEffectifsEtablissement"] == "31", "minimum_employees"] = 200 - df_sirene.loc[df_sirene["trancheEffectifsEtablissement"] == "31", "maximum_employees"] = 249 - df_sirene.loc[df_sirene["trancheEffectifsEtablissement"] == "32", "minimum_employees"] = 250 - df_sirene.loc[df_sirene["trancheEffectifsEtablissement"] == "32", "maximum_employees"] = 499 - df_sirene.loc[df_sirene["trancheEffectifsEtablissement"] == "41", "minimum_employees"] = 500 - df_sirene.loc[df_sirene["trancheEffectifsEtablissement"] == "41", "maximum_employees"] = 999 - df_sirene.loc[df_sirene["trancheEffectifsEtablissement"] == "42", "minimum_employees"] = 1000 - df_sirene.loc[df_sirene["trancheEffectifsEtablissement"] == "42", "maximum_employees"] = 1999 - df_sirene.loc[df_sirene["trancheEffectifsEtablissement"] == "51", "minimum_employees"] = 2000 - df_sirene.loc[df_sirene["trancheEffectifsEtablissement"] == "51", "maximum_employees"] = 4999 - df_sirene.loc[df_sirene["trancheEffectifsEtablissement"] == "52", "minimum_employees"] = 5000 - df_sirene.loc[df_sirene["trancheEffectifsEtablissement"] == "52", "maximum_employees"] = 9999 - df_sirene.loc[df_sirene["trancheEffectifsEtablissement"] == "53", "minimum_employees"] = 10000 - df_sirene.loc[df_sirene["trancheEffectifsEtablissement"] == "53", "maximum_employees"] = np.inf + df_sirene["minimum_employees"] = 1 # Includes "NN", "00", and NaN + df_sirene["maximum_employees"] = 1 # Includes "NN", "00", and NaN + + df_sirene.loc[ + df_sirene["trancheEffectifsEtablissement"] == "01", "minimum_employees" + ] = 1 + df_sirene.loc[ + df_sirene["trancheEffectifsEtablissement"] == "01", "maximum_employees" + ] = 2 + df_sirene.loc[ + df_sirene["trancheEffectifsEtablissement"] == "02", "minimum_employees" + ] = 3 + df_sirene.loc[ + df_sirene["trancheEffectifsEtablissement"] == "02", "maximum_employees" + ] = 5 + df_sirene.loc[ + df_sirene["trancheEffectifsEtablissement"] == "03", "minimum_employees" + ] = 6 + df_sirene.loc[ + df_sirene["trancheEffectifsEtablissement"] == "03", "maximum_employees" + ] = 9 + df_sirene.loc[ + 
df_sirene["trancheEffectifsEtablissement"] == "11", "minimum_employees" + ] = 10 + df_sirene.loc[ + df_sirene["trancheEffectifsEtablissement"] == "11", "maximum_employees" + ] = 19 + df_sirene.loc[ + df_sirene["trancheEffectifsEtablissement"] == "12", "minimum_employees" + ] = 20 + df_sirene.loc[ + df_sirene["trancheEffectifsEtablissement"] == "12", "maximum_employees" + ] = 49 + df_sirene.loc[ + df_sirene["trancheEffectifsEtablissement"] == "21", "minimum_employees" + ] = 50 + df_sirene.loc[ + df_sirene["trancheEffectifsEtablissement"] == "21", "maximum_employees" + ] = 99 + df_sirene.loc[ + df_sirene["trancheEffectifsEtablissement"] == "22", "minimum_employees" + ] = 100 + df_sirene.loc[ + df_sirene["trancheEffectifsEtablissement"] == "22", "maximum_employees" + ] = 199 + df_sirene.loc[ + df_sirene["trancheEffectifsEtablissement"] == "31", "minimum_employees" + ] = 200 + df_sirene.loc[ + df_sirene["trancheEffectifsEtablissement"] == "31", "maximum_employees" + ] = 249 + df_sirene.loc[ + df_sirene["trancheEffectifsEtablissement"] == "32", "minimum_employees" + ] = 250 + df_sirene.loc[ + df_sirene["trancheEffectifsEtablissement"] == "32", "maximum_employees" + ] = 499 + df_sirene.loc[ + df_sirene["trancheEffectifsEtablissement"] == "41", "minimum_employees" + ] = 500 + df_sirene.loc[ + df_sirene["trancheEffectifsEtablissement"] == "41", "maximum_employees" + ] = 999 + df_sirene.loc[ + df_sirene["trancheEffectifsEtablissement"] == "42", "minimum_employees" + ] = 1000 + df_sirene.loc[ + df_sirene["trancheEffectifsEtablissement"] == "42", "maximum_employees" + ] = 1999 + df_sirene.loc[ + df_sirene["trancheEffectifsEtablissement"] == "51", "minimum_employees" + ] = 2000 + df_sirene.loc[ + df_sirene["trancheEffectifsEtablissement"] == "51", "maximum_employees" + ] = 4999 + df_sirene.loc[ + df_sirene["trancheEffectifsEtablissement"] == "52", "minimum_employees" + ] = 5000 + df_sirene.loc[ + df_sirene["trancheEffectifsEtablissement"] == "52", "maximum_employees" + ] = 9999 + df_sirene.loc[ + df_sirene["trancheEffectifsEtablissement"] == "53", "minimum_employees" + ] = 10000 + df_sirene.loc[ + df_sirene["trancheEffectifsEtablissement"] == "53", "maximum_employees" + ] = np.inf # Add activity classification df_sirene["ape"] = df_sirene["activitePrincipaleEtablissement"] @@ -80,15 +137,24 @@ def execute(context): if len(excess_communes) > 5: raise RuntimeError("Found more than 5 excess municipalities in SIRENE data") - df_sirene = df_sirene[["siren", "commune_id", "minimum_employees", "maximum_employees", "ape", "siret"]] + df_sirene = df_sirene[ + [ + "siren", + "commune_id", + "minimum_employees", + "maximum_employees", + "ape", + "siret", + ] + ] # Add law status initial_count = len(df_sirene) - df_sirene = pd.merge(df_sirene, df_sirene_headquarters, on = "siren") + df_sirene = pd.merge(df_sirene, df_sirene_headquarters, on="siren") df_sirene["law_status"] = df_sirene["categorieJuridiqueUniteLegale"] - df_sirene = df_sirene.drop(columns = ["categorieJuridiqueUniteLegale", "siren"]) + df_sirene = df_sirene.drop(columns=["categorieJuridiqueUniteLegale", "siren"]) final_count = len(df_sirene) assert initial_count == final_count diff --git a/data/sirene/localized.py b/data/sirene/localized.py index 243b51c7..e2111c1d 100644 --- a/data/sirene/localized.py +++ b/data/sirene/localized.py @@ -6,6 +6,8 @@ Should we consider using location accuracy variable to optimize process? 
""" + + def configure(context): context.stage("data.sirene.cleaned") context.stage("data.sirene.raw_geoloc") @@ -15,19 +17,20 @@ def execute(context): df_sirene = context.stage("data.sirene.cleaned") df_siret_geoloc = context.stage("data.sirene.raw_geoloc") - # merging geographical SIREN file (containing only SIRET and location) with full SIREN file (all variables and processed) - df_siret_geoloc.set_index(("siret"),inplace=True,verify_integrity=True) - df_sirene.set_index(("siret"),inplace=True,verify_integrity=True) + df_siret_geoloc.set_index(("siret"), inplace=True, verify_integrity=True) + df_sirene.set_index(("siret"), inplace=True, verify_integrity=True) df_siret_geoloc.sort_index(inplace=True) df_sirene.sort_index(inplace=True) - df_sirene = df_sirene.join(df_siret_geoloc,how="left") - df_sirene.dropna(subset=['x', 'y'],inplace=True) - + df_sirene = df_sirene.join(df_siret_geoloc, how="left") + df_sirene.dropna(subset=["x", "y"], inplace=True) # convert to geopandas dataframe with Lambert 93, EPSG:2154 french official projection - df_sirene = gpd.GeoDataFrame(df_sirene, geometry=gpd.points_from_xy(df_sirene.x, df_sirene.y),crs="EPSG:2154") - + df_sirene = gpd.GeoDataFrame( + df_sirene, + geometry=gpd.points_from_xy(df_sirene.x, df_sirene.y), + crs="EPSG:2154", + ) return df_sirene diff --git a/data/sirene/output.py b/data/sirene/output.py index a64a9a27..87de6fe9 100644 --- a/data/sirene/output.py +++ b/data/sirene/output.py @@ -3,16 +3,20 @@ makes it easy to extract the data set from the pipeline. """ + def configure(context): context.stage("data.sirene.localized") context.config("output_path") context.config("output_prefix", "ile_de_france_") + def execute(context): df_sirene = context.stage("data.sirene.localized") df_sirene["commune_id"] = df_sirene["commune_id"].astype(str) - df_sirene.to_file("%s/%ssirene.gpkg" % ( - context.config("output_path"), context.config("output_prefix")), driver = "GPKG") - + df_sirene.to_file( + "%s/%ssirene.gpkg" + % (context.config("output_path"), context.config("output_prefix")), + driver="GPKG", + ) diff --git a/data/sirene/raw_geoloc.py b/data/sirene/raw_geoloc.py index 7887710c..5537f499 100644 --- a/data/sirene/raw_geoloc.py +++ b/data/sirene/raw_geoloc.py @@ -5,10 +5,14 @@ This stage loads the geolocalization data for the French enterprise registry. 
""" + def configure(context): context.config("data_path") - context.config("siret_geo_path", "sirene/GeolocalisationEtablissement_Sirene_pour_etudes_statistiques_utf8.zip") - + context.config( + "siret_geo_path", + "sirene/GeolocalisationEtablissement_Sirene_pour_etudes_statistiques_utf8.zip", + ) + context.stage("data.spatial.codes") @@ -16,37 +20,47 @@ def execute(context): # Filter by departement df_codes = context.stage("data.spatial.codes") requested_departements = set(df_codes["departement_id"].unique()) - + COLUMNS_DTYPES = { - "siret":"int64", - "x":"float", - "y":"float", - "plg_code_commune":"str", + "siret": "int64", + "x": "float", + "y": "float", + "plg_code_commune": "str", } - df_siret_geoloc = pd.DataFrame(columns=["siret","x","y"]) - - with context.progress(label = "Reading geolocalized SIRET ...") as progress: - csv = pd.read_csv("%s/%s" % (context.config("data_path"), context.config("siret_geo_path")), - usecols = COLUMNS_DTYPES.keys(), sep=";",dtype = COLUMNS_DTYPES,chunksize = 10240) - - for df_chunk in csv: + df_siret_geoloc = pd.DataFrame(columns=["siret", "x", "y"]) + + with context.progress(label="Reading geolocalized SIRET ...") as progress: + csv = pd.read_csv( + "%s/%s" % (context.config("data_path"), context.config("siret_geo_path")), + usecols=COLUMNS_DTYPES.keys(), + sep=";", + dtype=COLUMNS_DTYPES, + chunksize=10240, + ) + + for df_chunk in csv: progress.update(len(df_chunk)) - - f = df_chunk["siret"].isna() # Just to get a mask - + + f = df_chunk["siret"].isna() # Just to get a mask + for departement in requested_departements: f |= df_chunk["plg_code_commune"].str.startswith(departement) - df_siret_geoloc = pd.concat([df_siret_geoloc, df_chunk[f]],ignore_index=True) + df_siret_geoloc = pd.concat( + [df_siret_geoloc, df_chunk[f]], ignore_index=True + ) return df_siret_geoloc - def validate(context): - if not os.path.exists("%s/%s" % (context.config("data_path"), context.config("siret_geo_path"))): + if not os.path.exists( + "%s/%s" % (context.config("data_path"), context.config("siret_geo_path")) + ): raise RuntimeError("SIRENE: geolocaized SIRET data is not available") - return os.path.getsize("%s/%s" % (context.config("data_path"), context.config("siret_geo_path"))) + return os.path.getsize( + "%s/%s" % (context.config("data_path"), context.config("siret_geo_path")) + ) diff --git a/data/sirene/raw_siren.py b/data/sirene/raw_siren.py index 0a7d0ae5..a612f1ce 100644 --- a/data/sirene/raw_siren.py +++ b/data/sirene/raw_siren.py @@ -5,41 +5,48 @@ This stage loads the raw data from the French enterprise registry. 
""" + def configure(context): context.config("data_path") context.config("siren_path", "sirene/StockUniteLegale_utf8.zip") context.stage("data.sirene.raw_siret") + def execute(context): relevant_siren = context.stage("data.sirene.raw_siret")["siren"].unique() df_siren = [] - - COLUMNS_DTYPES = { - "siren":"int32", - "categorieJuridiqueUniteLegale":"str", + "siren": "int32", + "categorieJuridiqueUniteLegale": "str", } - - with context.progress(label = "Reading SIREN...") as progress: - csv = pd.read_csv("%s/%s" % (context.config("data_path"), context.config("siren_path")), - usecols = COLUMNS_DTYPES.keys(), dtype = COLUMNS_DTYPES,chunksize = 10240) + + with context.progress(label="Reading SIREN...") as progress: + csv = pd.read_csv( + "%s/%s" % (context.config("data_path"), context.config("siren_path")), + usecols=COLUMNS_DTYPES.keys(), + dtype=COLUMNS_DTYPES, + chunksize=10240, + ) for df_chunk in csv: progress.update(len(df_chunk)) - df_chunk = df_chunk[ - df_chunk["siren"].isin(relevant_siren) - ] + df_chunk = df_chunk[df_chunk["siren"].isin(relevant_siren)] if len(df_chunk) > 0: df_siren.append(df_chunk) return pd.concat(df_siren) + def validate(context): - if not os.path.exists("%s/%s" % (context.config("data_path"), context.config("siren_path"))): + if not os.path.exists( + "%s/%s" % (context.config("data_path"), context.config("siren_path")) + ): raise RuntimeError("SIRENE: SIREN data is not available") - return os.path.getsize("%s/%s" % (context.config("data_path"), context.config("siren_path"))) + return os.path.getsize( + "%s/%s" % (context.config("data_path"), context.config("siren_path")) + ) diff --git a/data/sirene/raw_siret.py b/data/sirene/raw_siret.py index 7b10713a..0bbadbcd 100644 --- a/data/sirene/raw_siret.py +++ b/data/sirene/raw_siret.py @@ -5,12 +5,14 @@ This stage loads the raw data from the French enterprise registry. 
""" + def configure(context): context.config("data_path") context.config("siret_path", "sirene/StockEtablissement_utf8.zip") context.stage("data.spatial.codes") + def execute(context): # Filter by departement df_codes = context.stage("data.spatial.codes") @@ -18,24 +20,27 @@ def execute(context): df_siret = [] - COLUMNS_DTYPES = { - "siren":"int32", - "siret":"int64", - "codeCommuneEtablissement":"str", - "activitePrincipaleEtablissement":"str", - "trancheEffectifsEtablissement":"str", - "etatAdministratifEtablissement":"str" + "siren": "int32", + "siret": "int64", + "codeCommuneEtablissement": "str", + "activitePrincipaleEtablissement": "str", + "trancheEffectifsEtablissement": "str", + "etatAdministratifEtablissement": "str", } - - with context.progress(label = "Reading SIRET...") as progress: - csv = pd.read_csv("%s/%s" % (context.config("data_path"), context.config("siret_path")), - usecols = COLUMNS_DTYPES.keys(), dtype = COLUMNS_DTYPES,chunksize = 10240) + + with context.progress(label="Reading SIRET...") as progress: + csv = pd.read_csv( + "%s/%s" % (context.config("data_path"), context.config("siret_path")), + usecols=COLUMNS_DTYPES.keys(), + dtype=COLUMNS_DTYPES, + chunksize=10240, + ) for df_chunk in csv: progress.update(len(df_chunk)) - f = df_chunk["codeCommuneEtablissement"].isna() # Just to get a mask + f = df_chunk["codeCommuneEtablissement"].isna() # Just to get a mask for departement in requested_departements: f |= df_chunk["codeCommuneEtablissement"].str.startswith(departement) @@ -46,11 +51,15 @@ def execute(context): if len(df_chunk) > 0: df_siret.append(df_chunk) - return pd.concat(df_siret) + def validate(context): - if not os.path.exists("%s/%s" % (context.config("data_path"), context.config("siret_path"))): + if not os.path.exists( + "%s/%s" % (context.config("data_path"), context.config("siret_path")) + ): raise RuntimeError("SIRENE: SIRET data is not available") - return os.path.getsize("%s/%s" % (context.config("data_path"), context.config("siret_path"))) + return os.path.getsize( + "%s/%s" % (context.config("data_path"), context.config("siret_path")) + ) diff --git a/data/spatial/centroid_distances.py b/data/spatial/centroid_distances.py index b84d6371..ff8507de 100644 --- a/data/spatial/centroid_distances.py +++ b/data/spatial/centroid_distances.py @@ -1,18 +1,32 @@ import pandas as pd + def configure(context): context.stage("data.spatial.municipalities") + def execute(context): df = context.stage("data.spatial.municipalities") records = [] - with context.progress(total = len(df)**2, label = "Calculating centroid distances ...") as progress: + with context.progress( + total=len(df) ** 2, label="Calculating centroid distances ..." 
+ ) as progress: for origin_id, origin_geometry in zip(df["commune_id"], df["geometry"]): - for destination_id, destination_geometry in zip(df["commune_id"], df["geometry"]): - records.append(( - origin_id, destination_id, origin_geometry.centroid.distance(destination_geometry.centroid) - )) + for destination_id, destination_geometry in zip( + df["commune_id"], df["geometry"] + ): + records.append( + ( + origin_id, + destination_id, + origin_geometry.centroid.distance( + destination_geometry.centroid + ), + ) + ) progress.update() - return pd.DataFrame.from_records(records, columns = ["origin_id", "destination_id", "centroid_distance"]) + return pd.DataFrame.from_records( + records, columns=["origin_id", "destination_id", "centroid_distance"] + ) diff --git a/data/spatial/code_changes.py b/data/spatial/code_changes.py index a65df499..4c80a724 100644 --- a/data/spatial/code_changes.py +++ b/data/spatial/code_changes.py @@ -10,43 +10,58 @@ YEAR = 2021 SOURCE = "codes_%d/reference_IRIS_geo%d.xlsx" % (YEAR, YEAR) + def configure(context): context.config("data_path") context.config("regions", [11]) context.config("departments", []) + def execute(context): # Load IRIS registry df_modifications = pd.read_excel( "%s/%s" % (context.config("data_path"), SOURCE), - skiprows = 5, sheet_name = "Modifications_IRIS" - )[["IRIS_INI", "IRIS_FIN", "COM_INI", "COM_FIN"]].rename(columns = { - "IRIS_INI": "initial_iris", "IRIS_FIN": "final_iris", - "COM_INI": "initial_commune", "COM_FIN": "final_commune" - }) + skiprows=5, + sheet_name="Modifications_IRIS", + )[["IRIS_INI", "IRIS_FIN", "COM_INI", "COM_FIN"]].rename( + columns={ + "IRIS_INI": "initial_iris", + "IRIS_FIN": "final_iris", + "COM_INI": "initial_commune", + "COM_FIN": "final_commune", + } + ) - df_modifications["initial_iris"] = df_modifications["initial_iris"].astype("category") + df_modifications["initial_iris"] = df_modifications["initial_iris"].astype( + "category" + ) df_modifications["final_iris"] = df_modifications["final_iris"].astype("category") - df_modifications["initial_commune"] = df_modifications["initial_commune"].astype("category") - df_modifications["final_commune"] = df_modifications["final_commune"].astype("category") + df_modifications["initial_commune"] = df_modifications["initial_commune"].astype( + "category" + ) + df_modifications["final_commune"] = df_modifications["final_commune"].astype( + "category" + ) return df_modifications + def validate(context): if not os.path.exists("%s/%s" % (context.config("data_path"), SOURCE)): raise RuntimeError("Spatial reference codes are not available") return os.path.getsize("%s/%s" % (context.config("data_path"), SOURCE)) + def update(df_changes, level, values): initial_slot = "initial_%s" % level final_slot = "final_%s" % level df_source = df_changes[df_changes[initial_slot].isin(values.unique())] - dictionary = { k: v for k, v in zip(df_source[initial_slot], df_source[final_slot]) } + dictionary = {k: v for k, v in zip(df_source[initial_slot], df_source[final_slot])} if len(dictionary) > 0: print("Updating %d deprecated zone identifiers ..." % len(dictionary)) - + return values.replace(dictionary) diff --git a/data/spatial/codes.py b/data/spatial/codes.py index 38200a14..c7049363 100644 --- a/data/spatial/codes.py +++ b/data/spatial/codes.py @@ -8,6 +8,7 @@ departement and région. 
""" + def configure(context): context.config("data_path") @@ -16,19 +17,23 @@ def configure(context): context.config("codes_path", "codes_2021/reference_IRIS_geo2021.zip") context.config("codes_xlsx", "reference_IRIS_geo2021.xlsx") + def execute(context): # Load IRIS registry with zipfile.ZipFile( - "{}/{}".format(context.config("data_path"), context.config("codes_path"))) as archive: + "{}/{}".format(context.config("data_path"), context.config("codes_path")) + ) as archive: with archive.open(context.config("codes_xlsx")) as f: - df_codes = pd.read_excel(f, - skiprows = 5, sheet_name = "Emboitements_IRIS" - )[["CODE_IRIS", "DEPCOM", "DEP", "REG"]].rename(columns = { - "CODE_IRIS": "iris_id", - "DEPCOM": "commune_id", - "DEP": "departement_id", - "REG": "region_id" - }) + df_codes = pd.read_excel(f, skiprows=5, sheet_name="Emboitements_IRIS")[ + ["CODE_IRIS", "DEPCOM", "DEP", "REG"] + ].rename( + columns={ + "CODE_IRIS": "iris_id", + "DEPCOM": "commune_id", + "DEP": "departement_id", + "REG": "region_id", + } + ) df_codes["iris_id"] = df_codes["iris_id"].astype("category") df_codes["commune_id"] = df_codes["commune_id"].astype("category") @@ -47,12 +52,19 @@ def execute(context): df_codes["iris_id"] = df_codes["iris_id"].cat.remove_unused_categories() df_codes["commune_id"] = df_codes["commune_id"].cat.remove_unused_categories() - df_codes["departement_id"] = df_codes["departement_id"].cat.remove_unused_categories() + df_codes["departement_id"] = df_codes[ + "departement_id" + ].cat.remove_unused_categories() return df_codes + def validate(context): - if not os.path.exists("%s/%s" % (context.config("data_path"), context.config("codes_path"))): + if not os.path.exists( + "%s/%s" % (context.config("data_path"), context.config("codes_path")) + ): raise RuntimeError("Spatial reference codes are not available") - return os.path.getsize("%s/%s" % (context.config("data_path"), context.config("codes_path"))) + return os.path.getsize( + "%s/%s" % (context.config("data_path"), context.config("codes_path")) + ) diff --git a/data/spatial/departments.py b/data/spatial/departments.py index c055e51e..15b669af 100644 --- a/data/spatial/departments.py +++ b/data/spatial/departments.py @@ -7,11 +7,17 @@ Provides the municipality zoning system. """ + def configure(context): context.stage("data.spatial.municipalities") + def execute(context): - df_departements = context.stage("data.spatial.municipalities").dissolve( - by = "departement_id").drop(columns = ["commune_id", "has_iris"]).reset_index() + df_departements = ( + context.stage("data.spatial.municipalities") + .dissolve(by="departement_id") + .drop(columns=["commune_id", "has_iris"]) + .reset_index() + ) return df_departements diff --git a/data/spatial/iris.py b/data/spatial/iris.py index 8f10457a..56603084 100644 --- a/data/spatial/iris.py +++ b/data/spatial/iris.py @@ -8,35 +8,35 @@ Loads the IRIS zoning system. 
""" + def configure(context): context.config("data_path") context.config("iris_path", "iris_2021") context.stage("data.spatial.codes") + def execute(context): df_codes = context.stage("data.spatial.codes") - source_path = find_iris("{}/{}".format(context.config("data_path"), context.config("iris_path"))) + source_path = find_iris( + "{}/{}".format(context.config("data_path"), context.config("iris_path")) + ) with py7zr.SevenZipFile(source_path) as archive: - contour_paths = [ - path for path in archive.getnames() - if "LAMB93" in path - ] + contour_paths = [path for path in archive.getnames() if "LAMB93" in path] archive.extract(context.path(), contour_paths) - + shp_path = [path for path in contour_paths if path.endswith(".shp")] if len(shp_path) != 1: - raise RuntimeError("Cannot find IRIS shapes inside the archive, please report this as an error!") + raise RuntimeError( + "Cannot find IRIS shapes inside the archive, please report this as an error!" + ) - df_iris = gpd.read_file("{}/{}".format(context.path(), shp_path[0]))[[ - "CODE_IRIS", "INSEE_COM", "geometry" - ]].rename(columns = { - "CODE_IRIS": "iris_id", - "INSEE_COM": "commune_id" - }) + df_iris = gpd.read_file("{}/{}".format(context.path(), shp_path[0]))[ + ["CODE_IRIS", "INSEE_COM", "geometry"] + ].rename(columns={"CODE_IRIS": "iris_id", "INSEE_COM": "commune_id"}) df_iris.crs = "EPSG:2154" @@ -44,28 +44,35 @@ def execute(context): df_iris["commune_id"] = df_iris["commune_id"].astype("category") # Merge with requested codes and verify integrity - df_iris = pd.merge(df_iris, df_codes, on = ["iris_id", "commune_id"]) + df_iris = pd.merge(df_iris, df_codes, on=["iris_id", "commune_id"]) requested_iris = set(df_codes["iris_id"].unique()) merged_iris = set(df_iris["iris_id"].unique()) if requested_iris != merged_iris: - raise RuntimeError("Some IRIS are missing: %s" % (requested_iris - merged_iris,)) + raise RuntimeError( + "Some IRIS are missing: %s" % (requested_iris - merged_iris,) + ) return df_iris + def find_iris(path): candidates = sorted(list(glob.glob("{}/*.7z".format(path)))) if len(candidates) == 0: raise RuntimeError("IRIS data is not available in {}".format(path)) - + if len(candidates) > 1: - raise RuntimeError("Multiple candidates for IRIS are available in {}".format(path)) - + raise RuntimeError( + "Multiple candidates for IRIS are available in {}".format(path) + ) + return candidates[0] def validate(context): - path = find_iris("{}/{}".format(context.config("data_path"), context.config("iris_path"))) + path = find_iris( + "{}/{}".format(context.config("data_path"), context.config("iris_path")) + ) return os.path.getsize(path) diff --git a/data/spatial/municipalities.py b/data/spatial/municipalities.py index b46eb696..71c553cf 100644 --- a/data/spatial/municipalities.py +++ b/data/spatial/municipalities.py @@ -7,14 +7,20 @@ Provides the municipality zoning system. 
""" + def configure(context): context.stage("data.spatial.iris") + def execute(context): df_iris = context.stage("data.spatial.iris") df_iris["has_iris"] = ~df_iris["iris_id"].astype(str).str.endswith("0000") - df_municipalities = context.stage("data.spatial.iris").dissolve( - by = "commune_id").drop(columns = ["iris_id"]).reset_index() + df_municipalities = ( + context.stage("data.spatial.iris") + .dissolve(by="commune_id") + .drop(columns=["iris_id"]) + .reset_index() + ) return df_municipalities diff --git a/data/spatial/population.py b/data/spatial/population.py index 04ab94bb..624df8ce 100644 --- a/data/spatial/population.py +++ b/data/spatial/population.py @@ -6,6 +6,7 @@ Loads aggregate population data. """ + def configure(context): context.config("data_path") context.stage("data.spatial.codes") @@ -13,19 +14,28 @@ def configure(context): context.config("population_xlsx", "base-ic-evol-struct-pop-2019.xlsx") context.config("population_year", 19) + def execute(context): year = str(context.config("population_year")) with zipfile.ZipFile( - "{}/{}".format(context.config("data_path"), context.config("population_path"))) as archive: + "{}/{}".format(context.config("data_path"), context.config("population_path")) + ) as archive: with archive.open(context.config("population_xlsx")) as f: df_population = pd.read_excel( f, - skiprows = 5, sheet_name = "IRIS", usecols = ["IRIS", "COM", "DEP", "REG", "P%s_POP" % year] - ).rename(columns = { - "IRIS": "iris_id", "COM": "commune_id", "DEP": "departement_id", "REG": "region_id", - "P%s_POP" % year: "population" - }) + skiprows=5, + sheet_name="IRIS", + usecols=["IRIS", "COM", "DEP", "REG", "P%s_POP" % year], + ).rename( + columns={ + "IRIS": "iris_id", + "COM": "commune_id", + "DEP": "departement_id", + "REG": "region_id", + "P%s_POP" % year: "population", + } + ) df_population["iris_id"] = df_population["iris_id"].astype("category") df_population["commune_id"] = df_population["commune_id"].astype("category") @@ -34,19 +44,31 @@ def execute(context): # Merge into code data and verify integrity df_codes = context.stage("data.spatial.codes") - df_population = pd.merge(df_population, df_codes, on = ["iris_id", "commune_id", "departement_id", "region_id"]) + df_population = pd.merge( + df_population, + df_codes, + on=["iris_id", "commune_id", "departement_id", "region_id"], + ) requested_iris = set(df_codes["iris_id"].unique()) merged_iris = set(df_population["iris_id"].unique()) if requested_iris != merged_iris: - raise RuntimeError("Some IRIS are missing: %s" % (requested_iris - merged_iris,)) + raise RuntimeError( + "Some IRIS are missing: %s" % (requested_iris - merged_iris,) + ) + + return df_population[ + ["region_id", "departement_id", "commune_id", "iris_id", "population"] + ] - return df_population[["region_id", "departement_id", "commune_id", "iris_id", "population"]] def validate(context): - if not os.path.exists("{}/{}".format(context.config("data_path"), context.config("population_path"))): + if not os.path.exists( + "{}/{}".format(context.config("data_path"), context.config("population_path")) + ): raise RuntimeError("Aggregated census data is not available") - return os.path.getsize("{}/{}".format(context.config("data_path"), context.config("population_path"))) - \ No newline at end of file + return os.path.getsize( + "{}/{}".format(context.config("data_path"), context.config("population_path")) + ) diff --git a/data/spatial/urban_type.py b/data/spatial/urban_type.py index 7e5c0c26..4f80a954 100644 --- a/data/spatial/urban_type.py +++ 
b/data/spatial/urban_type.py
@@ -5,56 +5,73 @@

 # START Monkey patching openpyxl to parse INSEE file
 from openpyxl.styles.colors import WHITE, RGB
+
 __old_rgb_set__ = RGB.__set__

+
 def __rgb_set_fixed__(self, instance, value):
     try:
         __old_rgb_set__(self, instance, value)
     except ValueError as e:
-        if e.args[0] == 'Colors must be aRGB hex values':
+        if e.args[0] == "Colors must be aRGB hex values":
             __old_rgb_set__(self, instance, WHITE)

+
 RGB.__set__ = __rgb_set_fixed__
 # END Monkey patching openpyxl

 # Loads the input data for the urban type (unité urbaine)

+
 def configure(context):
     context.stage("data.spatial.municipalities")

     context.config("data_path")
     context.config("urban_type_path", "urban_type/UU2020_au_01-01-2023.zip")

+
 def execute(context):
-    with zipfile.ZipFile("{}/{}".format(
-        context.config("data_path"), context.config("urban_type_path"))) as archive:
+    with zipfile.ZipFile(
+        "{}/{}".format(context.config("data_path"), context.config("urban_type_path"))
+    ) as archive:
         assert len(archive.filelist) == 1
         with archive.open(archive.filelist[0]) as f:
-            df = pd.read_excel(f, sheet_name = "Composition_communale", skiprows = 5)
-
+            df = pd.read_excel(f, sheet_name="Composition_communale", skiprows=5)
+
     df = df[["CODGEO", "STATUT_2017"]].copy()
-    df = df.set_axis(["commune_id", "urban_type"], axis = "columns")
+    df = df.set_axis(["commune_id", "urban_type"], axis="columns")

     # Cities that have districts are not detailed in the UU file, only the whole city is mentioned
     # However the municipalities file details the districts with their respective INSEE codes
-    cities_with_districts = {"75056": [str(75101 + i) for i in (range(20))], # Paris
-                            "69123": [str(69001 + i) for i in range(9)], # Lyon
-                            "13055": [str(13201 + i) for i in range(15)]} # Marseilles
+    cities_with_districts = {
+        "75056": [str(75101 + i) for i in (range(20))],  # Paris
+        "69123": [str(69001 + i) for i in range(9)],  # Lyon
+        "13055": [str(13201 + i) for i in range(15)],  # Marseilles
+    }

     # Replacing each line of the UU file corresponding to a city with districts by multiple lines, one for each district
     for city_code in cities_with_districts:
         base_type = df[df["commune_id"] == city_code].iloc[0]["urban_type"]
         replacement_codes = cities_with_districts[city_code]

-        df = pd.concat([df, pd.DataFrame({
-            "commune_id": replacement_codes,
-            "urban_type": [base_type] * len(replacement_codes)
-        })])
-
+        df = pd.concat(
+            [
+                df,
+                pd.DataFrame(
+                    {
+                        "commune_id": replacement_codes,
+                        "urban_type": [base_type] * len(replacement_codes),
+                    }
+                ),
+            ]
+        )
+
     df = df[~df["commune_id"].isin(cities_with_districts.keys())]

     # Clean unités urbaines
-    df["urban_type"] = df["urban_type"].replace({"B":"suburb","C":"central_city","I":"isolated_city","H":"none"})
+    df["urban_type"] = df["urban_type"].replace(
+        {"B": "suburb", "C": "central_city", "I": "isolated_city", "H": "none"}
+    )

     assert np.all(~df["urban_type"].isna())
     df["urban_type"] = df["urban_type"].astype("category")
@@ -66,8 +83,13 @@ def execute(context):

     return df

+
 def validate(context):
-    if not os.path.exists("%s/%s" % (context.config("data_path"), context.config("urban_type_path"))):
+    if not os.path.exists(
+        "%s/%s" % (context.config("data_path"), context.config("urban_type_path"))
+    ):
         raise RuntimeError("Urban type data is not available")

-    return os.path.getsize("%s/%s" % (context.config("data_path"), context.config("urban_type_path")))
+    return os.path.getsize(
+        "%s/%s" % (context.config("data_path"), context.config("urban_type_path"))
+    )
diff --git a/data/spatial/utils.py
b/data/spatial/utils.py
index 29b272d4..048116b4 100644
--- a/data/spatial/utils.py
+++ b/data/spatial/utils.py
@@ -3,20 +3,23 @@
 import geopandas as gpd
 import pandas as pd
 
-def to_gpd(context, df, x = "x", y = "y", crs = "EPSG:2154", column = "geometry"):
+
+def to_gpd(context, df, x="x", y="y", crs="EPSG:2154", column="geometry"):
     df[column] = [
-        geo.Point(*coord) for coord in context.progress(
-            zip(df[x], df[y]), total = len(df),
-            label = "Converting coordinates"
-        )]
-    df = gpd.GeoDataFrame(df, crs = "EPSG:2154", geometry = column)
+        geo.Point(*coord)
+        for coord in context.progress(
+            zip(df[x], df[y]), total=len(df), label="Converting coordinates"
+        )
+    ]
+    df = gpd.GeoDataFrame(df, crs="EPSG:2154", geometry=column)
 
     if not df.crs == "EPSG:2154":
         df = df.to_crs("EPSG:2154")
 
     return df
 
-def sample_from_shape(shape, count, random, sample_size = None):
+
+def sample_from_shape(shape, count, random, sample_size=None):
     points = []
 
     if sample_size is None:
@@ -24,15 +27,16 @@ def sample_from_shape(shape, count, random, sample_size = None):
     while len(points) < count:
         minx, miny, maxx, maxy = shape.bounds
 
-        candidates = random.random_sample(size = (sample_size, 2))
-        candidates[:,0] = minx + candidates[:,0] * (maxx - minx)
-        candidates[:,1] = miny + candidates[:,1] * (maxy - miny)
+        candidates = random.random_sample(size=(sample_size, 2))
+        candidates[:, 0] = minx + candidates[:, 0] * (maxx - minx)
+        candidates[:, 1] = miny + candidates[:, 1] * (maxy - miny)
 
         candidates = [geo.Point(*point) for point in candidates]
         candidates = [point for point in candidates if shape.contains(point)]
         points += candidates
 
     return np.array([(point.x, point.y) for point in points[:count]])
 
+
 def _sample_from_zones(context, args):
     attribute_value, random_seed = args
@@ -46,9 +50,12 @@ def _sample_from_zones(context, args):
     f = df[attribute] == attribute_value
 
     coordinates = sample_from_shape(zone, np.count_nonzero(f), random)
-    return pd.DataFrame(coordinates, columns = ["x", "y"], index = f[f].index)
+    return pd.DataFrame(coordinates, columns=["x", "y"], index=f[f].index)
+
 
-def sample_from_zones(context, df_zones, df, attribute, random, label = "Sampling coordinates ..."):
+def sample_from_zones(
+    context, df_zones, df, attribute, random, label="Sampling coordinates ..."
+):
     assert attribute in df
     assert attribute in df_zones
@@ -57,8 +64,14 @@ def sample_from_zones(context, df_zones, df, attribute, random, label = "Samplin
 
     df_result = []
 
-    with context.parallel(dict(df_zones = df_zones, df = df, attribute = attribute)) as parallel:
-        for df_partial in context.progress(parallel.imap(_sample_from_zones, zip(unique_values, random_seeds)), label = label, total = len(unique_values)):
+    with context.parallel(
+        dict(df_zones=df_zones, df=df, attribute=attribute)
+    ) as parallel:
+        for df_partial in context.progress(
+            parallel.imap(_sample_from_zones, zip(unique_values, random_seeds)),
+            label=label,
+            total=len(unique_values),
+        ):
             df_result.append(df_partial)
 
     return pd.concat(df_result)
diff --git a/data/tiles/raw.py
index b42a5d33..7af35c73 100644
--- a/data/tiles/raw.py
+++ b/data/tiles/raw.py
@@ -9,6 +9,7 @@
 This stage loads the raw data on French population income, poverty and living standards provided as tiled data.
 """
 
+
 def configure(context):
     context.stage("data.spatial.departments")
     context.config("data_path")
@@ -62,4 +63,4 @@ def validate(context):
 
     return os.path.getsize(
         "{}/{}".format(context.config("data_path"), context.config("tiles_path"))
-    )
\ No newline at end of file
+    )
diff --git a/data/vehicles/raw.py
index 95a9fc31..b726ab63 100644
--- a/data/vehicles/raw.py
+++ b/data/vehicles/raw.py
@@ -9,31 +9,49 @@
 https://www.statistiques.developpement-durable.gouv.fr/donnees-sur-le-parc-automobile-francais-au-1er-janvier-2021
 """
 
+
 def configure(context):
     context.config("data_path")
     context.config("vehicles_path", "vehicles")
     context.config("vehicles_year", 2021)
 
     context.stage("data.spatial.codes")
 
+
 def execute(context):
     df_codes = context.stage("data.spatial.codes")
 
     # the downloaded excel files' meta-data actually have a badly formatted ISO datetime
-    # https://foss.heptapod.net/openpyxl/openpyxl/-/issues/1659
-    with mock.patch.object(excel.ExcelReader, 'read_properties', lambda self: None):
+    # https://foss.heptapod.net/openpyxl/openpyxl/-/issues/1659
+    with mock.patch.object(excel.ExcelReader, "read_properties", lambda self: None):
         year = str(context.config("vehicles_year"))
-
-        with zipfile.ZipFile("{}/{}/{}".format(context.config("data_path"), context.config("vehicles_path"), "parc_vp_communes.zip")) as archive:
+
+        with zipfile.ZipFile(
+            "{}/{}/{}".format(
+                context.config("data_path"),
+                context.config("vehicles_path"),
+                "parc_vp_communes.zip",
+            )
+        ) as archive:
             with archive.open("Parc_VP_Communes_{}.xlsx".format(year)) as f:
                 df_municipalities = pd.read_excel(f)
 
-        with zipfile.ZipFile("{}/{}/{}".format(context.config("data_path"), context.config("vehicles_path"), "parc_vp_regions.zip")) as archive:
+        with zipfile.ZipFile(
+            "{}/{}/{}".format(
+                context.config("data_path"),
+                context.config("vehicles_path"),
+                "parc_vp_regions.zip",
+            )
+        ) as archive:
             with archive.open("Parc_VP_Regions_{}.xlsx".format(year)) as f:
                 df_regions = pd.read_excel(f)
-
+
     df_municipalities["region_id"] = df_municipalities["Code région"].astype("category")
-    df_municipalities["departement_id"] = df_municipalities["Code départment"].astype("category")
-    df_municipalities["commune_id"] = df_municipalities["Code commune"].astype("category")
+    df_municipalities["departement_id"] = df_municipalities["Code départment"].astype(
+        "category"
+    )
+    df_municipalities["commune_id"] = df_municipalities["Code commune"].astype(
+        "category"
+    )
 
     df_regions["region_id"] = df_regions["Code région"].astype("category")
 
@@ -41,14 +59,22 @@ def execute(context):
     requested_regions = set(df_codes["region_id"].astype(str).unique())
 
     if len(requested_departements) > 0:
-        df_municipalities = df_municipalities[df_municipalities["departement_id"].isin(requested_departements)]
+        df_municipalities = df_municipalities[
+            df_municipalities["departement_id"].isin(requested_departements)
+        ]
 
     if len(requested_regions) > 0:
         df_regions = df_regions[df_regions["region_id"].isin(requested_regions)]
 
-    df_municipalities["region_id"] = df_municipalities["region_id"].cat.remove_unused_categories()
-    df_municipalities["departement_id"] = df_municipalities["departement_id"].cat.remove_unused_categories()
-    df_municipalities["commune_id"] = df_municipalities["commune_id"].cat.remove_unused_categories()
+    df_municipalities["region_id"] = df_municipalities[
+        "region_id"
+    ].cat.remove_unused_categories()
+    df_municipalities["departement_id"] = df_municipalities[
+        "departement_id"
+    ].cat.remove_unused_categories()
+
df_municipalities["commune_id"] = df_municipalities[ + "commune_id" + ].cat.remove_unused_categories() df_regions["region_id"] = df_regions["region_id"].cat.remove_unused_categories() @@ -65,19 +91,46 @@ def execute(context): df_regions["fleet"] = df_regions[count_column_name] df_regions["age"] = df_regions[age_column_name] - df_vehicle_fleet_counts = df_municipalities.groupby(["region_id", "commune_id", "critair","technology"])["fleet"].sum().reset_index().dropna() - df_vehicle_age_counts = df_regions.groupby(["region_id", "critair", "technology", "age"])["fleet"].sum().reset_index().dropna() + df_vehicle_fleet_counts = ( + df_municipalities.groupby(["region_id", "commune_id", "critair", "technology"])[ + "fleet" + ] + .sum() + .reset_index() + .dropna() + ) + df_vehicle_age_counts = ( + df_regions.groupby(["region_id", "critair", "technology", "age"])["fleet"] + .sum() + .reset_index() + .dropna() + ) return df_vehicle_fleet_counts, df_vehicle_age_counts + def validate(context): - municipalities_path = "{}/{}/{}".format(context.config("data_path"), context.config("vehicles_path"), "parc_vp_communes.zip") - regions_path = "{}/{}/{}".format(context.config("data_path"), context.config("vehicles_path"), "parc_vp_regions.zip") + municipalities_path = "{}/{}/{}".format( + context.config("data_path"), + context.config("vehicles_path"), + "parc_vp_communes.zip", + ) + regions_path = "{}/{}/{}".format( + context.config("data_path"), + context.config("vehicles_path"), + "parc_vp_regions.zip", + ) if not os.path.exists(municipalities_path): - raise RuntimeError("Municipalities vehicle data is not available at {}".format(municipalities_path)) - + raise RuntimeError( + "Municipalities vehicle data is not available at {}".format( + municipalities_path + ) + ) + if not os.path.exists(regions_path): - raise RuntimeError("Regions vehicle data is not available at {}".format(regions_path)) + raise RuntimeError( + "Regions vehicle data is not available at {}".format(regions_path) + ) return os.path.getsize(municipalities_path) + os.path.getsize(regions_path) diff --git a/data/vehicles/types.py b/data/vehicles/types.py index b10b8c65..f8ef6828 100644 --- a/data/vehicles/types.py +++ b/data/vehicles/types.py @@ -4,18 +4,28 @@ This stage creates the various type of vehicles needed for the simulation with HBEFA emissions """ -HBEFA_TECH = ['petrol', 'diesel'] -HBEFA_EURO = ['1', '2', '3', '4', '5', '6ab', '6c', '6d'] +HBEFA_TECH = ["petrol", "diesel"] +HBEFA_EURO = ["1", "2", "3", "4", "5", "6ab", "6c", "6d"] + def configure(context): pass + def execute(context): vehicle_types = [ { - 'type_id': 'default_car', 'nb_seats': 4, 'length': 5.0, 'width': 1.0, 'pce': 1.0, 'mode': "car", - 'hbefa_cat': "PASSENGER_CAR", 'hbefa_tech': "average", 'hbefa_size': "average", 'hbefa_emission': "average", + "type_id": "default_car", + "nb_seats": 4, + "length": 5.0, + "width": 1.0, + "pce": 1.0, + "mode": "car", + "hbefa_cat": "PASSENGER_CAR", + "hbefa_tech": "average", + "hbefa_size": "average", + "hbefa_emission": "average", } ] @@ -25,7 +35,7 @@ def execute(context): id = "car_%s_%s" % (technology, euro) - if technology == "diesel" and euro in ['2', '3']: + if technology == "diesel" and euro in ["2", "3"]: euro += " (DPF)" size = ">=2L" if technology == "petrol" else "<1,4L" @@ -35,10 +45,17 @@ def execute(context): emission = "PC %s Euro-%s" % (tech, euro) - vehicle_types.append({ - 'type_id': id, 'length': 7.5, 'width': 1.0, - 'hbefa_cat': "PASSENGER_CAR", 'hbefa_tech': tech, 'hbefa_size': size, 'hbefa_emission': emission, 
- }) + vehicle_types.append( + { + "type_id": id, + "length": 7.5, + "width": 1.0, + "hbefa_cat": "PASSENGER_CAR", + "hbefa_tech": tech, + "hbefa_size": size, + "hbefa_emission": emission, + } + ) df_types = pd.DataFrame.from_records(vehicle_types) - return df_types \ No newline at end of file + return df_types diff --git a/docs/verify_data.py b/docs/verify_data.py index f657dbff..777a6482 100644 --- a/docs/verify_data.py +++ b/docs/verify_data.py @@ -12,8 +12,8 @@ "https://www.insee.fr/fr/statistiques/6544333", "https://www.insee.fr/fr/statistiques/fichier/6544333/RP2019_INDCVIZA_csv.zip", "https://www.insee.fr/fr/statistiques/fichier/6544333/RP2019_INDCVIZD_csv.zip", - "https://www.insee.fr/fr/statistiques/fichier/6544333/RP2019_INDCVIZE_csv.zip" - ] + "https://www.insee.fr/fr/statistiques/fichier/6544333/RP2019_INDCVIZE_csv.zip", + ], }, { "name": "OD Matrices 2019", @@ -21,30 +21,30 @@ "https://www.insee.fr/fr/statistiques/6456056", "https://www.insee.fr/fr/statistiques/6456052", "https://www.insee.fr/fr/statistiques/fichier/6456056/RP2019_mobpro_csv.zip", - "https://www.insee.fr/fr/statistiques/fichier/6456052/RP2019_mobsco_csv.zip" - ] + "https://www.insee.fr/fr/statistiques/fichier/6456052/RP2019_mobsco_csv.zip", + ], }, { "name": "Population totals 2019", "urls": [ "https://www.insee.fr/fr/statistiques/6543200", - "https://www.insee.fr/fr/statistiques/fichier/6543200/base-ic-evol-struct-pop-2019_csv.zip" - ] + "https://www.insee.fr/fr/statistiques/fichier/6543200/base-ic-evol-struct-pop-2019_csv.zip", + ], }, { "name": "Filosofi 2019", "urls": [ "https://www.insee.fr/fr/statistiques/6036907", "https://www.insee.fr/fr/statistiques/fichier/6036907/indic-struct-distrib-revenu-2019-COMMUNES_csv.zip", - "https://www.insee.fr/fr/statistiques/fichier/6036907/indic-struct-distrib-revenu-2019-SUPRA_csv.zip" - ] + "https://www.insee.fr/fr/statistiques/fichier/6036907/indic-struct-distrib-revenu-2019-SUPRA_csv.zip", + ], }, { "name": "BPE 2021", "urls": [ "https://www.insee.fr/fr/statistiques/3568638", - "https://www.insee.fr/fr/statistiques/fichier/3568638/bpe21_ensemble_xy_csv.zip" - ] + "https://www.insee.fr/fr/statistiques/fichier/3568638/bpe21_ensemble_xy_csv.zip", + ], }, { "name": "ENTD 2008", @@ -55,35 +55,32 @@ "https://www.statistiques.developpement-durable.gouv.fr/sites/default/files/2019-01/Q_menage.csv", "https://www.statistiques.developpement-durable.gouv.fr/sites/default/files/2019-01/Q_individu.csv", "https://www.statistiques.developpement-durable.gouv.fr/sites/default/files/2019-01/Q_ind_lieu_teg.csv", - "https://www.statistiques.developpement-durable.gouv.fr/sites/default/files/2019-01/K_deploc.csv" - ] + "https://www.statistiques.developpement-durable.gouv.fr/sites/default/files/2019-01/K_deploc.csv", + ], }, { "name": "IRIS 2021", "urls": [ "https://geoservices.ign.fr/contoursiris", - "https://wxs.ign.fr/1yhlj2ehpqf3q6dt6a2y7b64/telechargement/inspire/CONTOURS-IRIS-PACK_2021-01$CONTOURS-IRIS_2-1__SHP__FRA_2021-01-01/file/CONTOURS-IRIS_2-1__SHP__FRA_2021-01-01.7z" - ] + "https://wxs.ign.fr/1yhlj2ehpqf3q6dt6a2y7b64/telechargement/inspire/CONTOURS-IRIS-PACK_2021-01$CONTOURS-IRIS_2-1__SHP__FRA_2021-01-01/file/CONTOURS-IRIS_2-1__SHP__FRA_2021-01-01.7z", + ], }, { "name": "Zoning 2021", "urls": [ "https://www.insee.fr/fr/information/2017499", - "https://www.insee.fr/fr/statistiques/fichier/2017499/reference_IRIS_geo2021.zip" - ] + "https://www.insee.fr/fr/statistiques/fichier/2017499/reference_IRIS_geo2021.zip", + ], }, { "name": "SIRENE", "urls": [ 
"https://www.data.gouv.fr/fr/datasets/base-sirene-des-entreprises-et-de-leurs-etablissements-siren-siret/" - ] + ], }, - { "name": "SIRET géolocalisé", - "urls": [ - "https://adresse.data.gouv.fr/donnees-nationales" - ] + "urls": ["https://adresse.data.gouv.fr/donnees-nationales"], }, # { # "name": "BD-TOPO", @@ -101,8 +98,8 @@ "https://download.geofabrik.de/europe/france/ile-de-france-latest.osm.pbf", "https://download.geofabrik.de/europe/france/rhone-alpes-latest.osm.pbf", "https://download.geofabrik.de/europe/france/languedoc-roussillon-latest.osm.pbf", - "https://download.geofabrik.de/europe/france/midi-pyrenees-latest.osm.pbf" - ] + "https://download.geofabrik.de/europe/france/midi-pyrenees-latest.osm.pbf", + ], }, { "name": "GTFS", @@ -115,9 +112,9 @@ "https://download.data.grandlyon.com/files/rdata/tcl_sytral.tcltheorique/GTFS_TCL.ZIP", "https://eu.ftp.opendatasoft.com/sncf/gtfs/export-ter-gtfs-last.zip", "https://eu.ftp.opendatasoft.com/sncf/gtfs/export-intercites-gtfs-last.zip", - "https://ressources.data.sncf.com/explore/dataset/horaires-des-train-voyages-tgvinouiouigo/files/538b55483fac4c1dad455022a0257014/download/" - ] - } + "https://ressources.data.sncf.com/explore/dataset/horaires-des-train-voyages-tgvinouiouigo/files/538b55483fac4c1dad455022a0257014/download/", + ], + }, ] # Start testing process @@ -125,7 +122,7 @@ from urllib.request import urlopen any_errors = False -sleep_time = 10 # s +sleep_time = 10 # s for test in tests: print("Testing %s ..." % test["name"]) diff --git a/documentation/info/collect.py b/documentation/info/collect.py index aca5c7f1..f0df1d72 100644 --- a/documentation/info/collect.py +++ b/documentation/info/collect.py @@ -1,6 +1,7 @@ import numpy as np import json + def configure(context): context.stage("data.hts.comparison") context.stage("data.census.cleaned") @@ -12,6 +13,7 @@ def configure(context): context.stage("data.census.filtered") context.stage("data.sirene.localized") + def execute(context): info = {} @@ -26,12 +28,30 @@ def execute(context): info["census"] = { "number_of_households": len(df_census["household_id"].unique()), "number_of_persons": len(df_census), - "weighted_number_of_households": df_census[["household_id", "weight"]].drop_duplicates("household_id")["weight"].sum(), + "weighted_number_of_households": df_census[["household_id", "weight"]] + .drop_duplicates("household_id")["weight"] + .sum(), "weighted_number_of_persons": df_census["weight"].sum(), - "share_of_households_without_iris": np.sum(df_households[~(df_households["iris_id"] != "undefined") & (df_households["commune_id"] != "undefined")]["weight"]) / np.sum(df_households["weight"]), - "share_of_households_without_commune": np.sum(df_households[~(df_households["iris_id"] != "undefined") & ~(df_households["commune_id"] != "undefined")]["weight"]) / np.sum(df_households["weight"]), - "filtered_households_share": context.get_info("data.census.filtered", "filtered_households_share"), - "filtered_persons_share": context.get_info("data.census.filtered", "filtered_persons_share"), + "share_of_households_without_iris": np.sum( + df_households[ + ~(df_households["iris_id"] != "undefined") + & (df_households["commune_id"] != "undefined") + ]["weight"] + ) + / np.sum(df_households["weight"]), + "share_of_households_without_commune": np.sum( + df_households[ + ~(df_households["iris_id"] != "undefined") + & ~(df_households["commune_id"] != "undefined") + ]["weight"] + ) + / np.sum(df_households["weight"]), + "filtered_households_share": context.get_info( + "data.census.filtered", 
"filtered_households_share" + ), + "filtered_persons_share": context.get_info( + "data.census.filtered", "filtered_persons_share" + ), } # OD data @@ -39,7 +59,7 @@ def execute(context): info["od"] = { "number_of_work_commutes": len(df_od_work), - "number_of_education_commutes": len(df_od_education) + "number_of_education_commutes": len(df_od_education), } # BPE @@ -48,8 +68,12 @@ def execute(context): info["bpe"] = { "number_of_enterprises": len(df_bpe), "number_of_shop_enterprises": int(np.sum(df_bpe["activity_type"] == "shop")), - "number_of_leisure_enterprises": int(np.sum(df_bpe["activity_type"] == "leisure")), - "number_of_education_enterprises": int(np.sum(df_bpe["activity_type"] == "education")), + "number_of_leisure_enterprises": int( + np.sum(df_bpe["activity_type"] == "leisure") + ), + "number_of_education_enterprises": int( + np.sum(df_bpe["activity_type"] == "education") + ), "number_of_other_enterprises": int(np.sum(df_bpe["activity_type"] == "other")), } @@ -58,28 +82,37 @@ def execute(context): info["zones"] = { "number_of_municipalities": len(df_codes["commune_id"].unique()), - "number_of_iris": len(df_codes["iris_id"].unique()) + "number_of_iris": len(df_codes["iris_id"].unique()), } with open("%s/zones.json" % context.cache_path, "w+") as f: - json.dump(info, f, indent = True) + json.dump(info, f, indent=True) # Income df_income_municipality = context.stage("data.income.municipality") - df_income_municipality = df_income_municipality[(df_income_municipality["attribute"] == "all") & (df_income_municipality["value"] == "all")] + df_income_municipality = df_income_municipality[ + (df_income_municipality["attribute"] == "all") + & (df_income_municipality["value"] == "all") + ] df_income_region = context.stage("data.income.region") info["income"] = { "minimum_median": int(df_income_municipality["q5"].min()), "maximum_median": int(df_income_municipality["q5"].max()), "median_region": int(df_income_region[4]), - "number_of_incomplete_distributions": int(np.sum(~df_income_municipality["is_missing"] & df_income_municipality["is_imputed"])), - "number_of_missing_distributions": int(np.sum(df_income_municipality["is_missing"])) + "number_of_incomplete_distributions": int( + np.sum( + ~df_income_municipality["is_missing"] + & df_income_municipality["is_imputed"] + ) + ), + "number_of_missing_distributions": int( + np.sum(df_income_municipality["is_missing"]) + ), } - # Output with open("%s/info.json" % context.cache_path, "w+") as f: - json.dump(info, f, indent = True) + json.dump(info, f, indent=True) return info diff --git a/documentation/info/tex.py b/documentation/info/tex.py index 1e42fc00..b0730bae 100644 --- a/documentation/info/tex.py +++ b/documentation/info/tex.py @@ -1,29 +1,54 @@ import numpy as np + def configure(context): context.stage("documentation.info.collect") + def execute(context): info = context.stage("documentation.info.collect") variables = { - "infoBpeNumberOfEnterprises": "{:,d}".format(info["bpe"]["number_of_enterprises"]), - "infoBpeNumberOfEducationEnterprises": "{:,d}".format(info["bpe"]["number_of_education_enterprises"]), - "infoBpeNumberOfShopEnterprises": "{:,d}".format(info["bpe"]["number_of_shop_enterprises"]), - "infoBpeNumberOfLeisureEnterprises": "{:,d}".format(info["bpe"]["number_of_leisure_enterprises"]), - "infoBpeNumberOfOtherEnterprises": "{:,d}".format(info["bpe"]["number_of_other_enterprises"]), - - "infoZonesNumberOfMunicipalities": "{:,d}".format(info["zones"]["number_of_municipalities"]), + "infoBpeNumberOfEnterprises": 
"{:,d}".format( + info["bpe"]["number_of_enterprises"] + ), + "infoBpeNumberOfEducationEnterprises": "{:,d}".format( + info["bpe"]["number_of_education_enterprises"] + ), + "infoBpeNumberOfShopEnterprises": "{:,d}".format( + info["bpe"]["number_of_shop_enterprises"] + ), + "infoBpeNumberOfLeisureEnterprises": "{:,d}".format( + info["bpe"]["number_of_leisure_enterprises"] + ), + "infoBpeNumberOfOtherEnterprises": "{:,d}".format( + info["bpe"]["number_of_other_enterprises"] + ), + "infoZonesNumberOfMunicipalities": "{:,d}".format( + info["zones"]["number_of_municipalities"] + ), "infoZonesNumberOfIris": "{:,d}".format(info["zones"]["number_of_iris"]), - - "infoIncomeMinimumMedian": "{:,.0f}".format(1e3 * np.round(info["income"]["minimum_median"] * 1e-3)), - "infoIncomeMaximumMedian": "{:,.0f}".format(1e3 * np.round(info["income"]["maximum_median"] * 1e-3)), - "infoIncomeMedianRegion": "{:,.0f}".format(1e3 * np.round(info["income"]["median_region"] * 1e-3)), - "infoIncomeNumberOfIncompleteDistributions": "{:,d}".format(info["income"]["number_of_incomplete_distributions"]), - "infoIncomeNumberOfMissingDistributions": "{:,d}".format(info["income"]["number_of_missing_distributions"]), - - "infoCensusFilteredHouseholds": "{:.2f}\\%".format(1e2 * info["census"]["filtered_households_share"]), - "infoCensusFilteredPersons": "{:.2f}\\%".format(1e2 * info["census"]["filtered_persons_share"]) + "infoIncomeMinimumMedian": "{:,.0f}".format( + 1e3 * np.round(info["income"]["minimum_median"] * 1e-3) + ), + "infoIncomeMaximumMedian": "{:,.0f}".format( + 1e3 * np.round(info["income"]["maximum_median"] * 1e-3) + ), + "infoIncomeMedianRegion": "{:,.0f}".format( + 1e3 * np.round(info["income"]["median_region"] * 1e-3) + ), + "infoIncomeNumberOfIncompleteDistributions": "{:,d}".format( + info["income"]["number_of_incomplete_distributions"] + ), + "infoIncomeNumberOfMissingDistributions": "{:,d}".format( + info["income"]["number_of_missing_distributions"] + ), + "infoCensusFilteredHouseholds": "{:.2f}\\%".format( + 1e2 * info["census"]["filtered_households_share"] + ), + "infoCensusFilteredPersons": "{:.2f}\\%".format( + 1e2 * info["census"]["filtered_persons_share"] + ), } latex = [] diff --git a/documentation/meta_output.py b/documentation/meta_output.py index e21bfbf6..2937e29c 100644 --- a/documentation/meta_output.py +++ b/documentation/meta_output.py @@ -1,6 +1,7 @@ import os, datetime, json import subprocess as sp + def configure(context): context.stage("matsim.runtime.git") context.config("output_path") @@ -9,6 +10,7 @@ def configure(context): for option in ("sampling_rate", "hts", "random_seed"): context.config(option) + def get_version(): version_path = os.path.dirname(os.path.realpath(__file__)) version_path = os.path.realpath("{}/../VERSION".format(version_path)) @@ -16,28 +18,39 @@ def get_version(): with open(version_path) as f: return f.read().strip() + def get_commit(): root_path = os.path.dirname(os.path.realpath(__file__)) root_path = os.path.realpath("{}/..".format(root_path)) try: - return sp.check_output(["git", "rev-parse", "HEAD"], cwd = root_path).strip().decode("utf-8") + return ( + sp.check_output(["git", "rev-parse", "HEAD"], cwd=root_path) + .strip() + .decode("utf-8") + ) except sp.CalledProcessError: return "unknown" + def execute(context): # Write meta information information = dict( - sampling_rate = context.config("sampling_rate"), - hts = context.config("hts"), - random_seed = context.config("random_seed"), - created = datetime.datetime.now(datetime.timezone.utc).isoformat(), - 
version = get_version(), - commit = get_commit() + sampling_rate=context.config("sampling_rate"), + hts=context.config("hts"), + random_seed=context.config("random_seed"), + created=datetime.datetime.now(datetime.timezone.utc).isoformat(), + version=get_version(), + commit=get_commit(), ) - with open("%s/%smeta.json" % (context.config("output_path"), context.config("output_prefix")), "w+") as f: - json.dump(information, f, indent = 4) + with open( + "%s/%smeta.json" + % (context.config("output_path"), context.config("output_prefix")), + "w+", + ) as f: + json.dump(information, f, indent=4) + def validate(context): return get_version() diff --git a/documentation/paper.py b/documentation/paper.py index 1f0a783f..fb630c74 100644 --- a/documentation/paper.py +++ b/documentation/paper.py @@ -1,8 +1,9 @@ import shutil + def configure(context): context.stage("documentation.plots.data.hts_comparison") - #context.stage("documentation.plots.theory.sampling_error") + # context.stage("documentation.plots.theory.sampling_error") context.stage("documentation.plots.monte_carlo") context.stage("documentation.plots.income") @@ -20,38 +21,73 @@ def configure(context): context.config("paper_path") + def execute(context): paper_path = context.config("paper_path") # Copy plots and tables mapping = { - "hts_comparison_distance.pdf": ("documentation.plots.data.hts_comparison", "distance_distribution.pdf"), - "hts_comparison_age.pdf": ("documentation.plots.data.hts_comparison", "age_distribution.pdf"), - - #"theory_sampling_error.pdf": ("documentation.plots.theory.sampling_error", "sampling_error.pdf"), - - #"sampling_sample_count.pdf": ("documentation.plots.sampling.sample_count", "sample_count.pdf"), - #"sampling_error_probability.pdf": ("documentation.plots.sampling.error_probability", "error_probability.pdf"), + "hts_comparison_distance.pdf": ( + "documentation.plots.data.hts_comparison", + "distance_distribution.pdf", + ), + "hts_comparison_age.pdf": ( + "documentation.plots.data.hts_comparison", + "age_distribution.pdf", + ), + # "theory_sampling_error.pdf": ("documentation.plots.theory.sampling_error", "sampling_error.pdf"), + # "sampling_sample_count.pdf": ("documentation.plots.sampling.sample_count", "sample_count.pdf"), + # "sampling_error_probability.pdf": ("documentation.plots.sampling.error_probability", "error_probability.pdf"), "monte_carlo.pdf": ("documentation.plots.monte_carlo", "monte_carlo.pdf"), - "monte_carlo_table.tex": ("documentation.plots.monte_carlo", "monte_carlo_table.tex"), - - "income_distributions.pdf": ("documentation.plots.income", "income_distributions.pdf"), - - "socdem_comparison_persons.pdf": ("documentation.plots.sociodemographics.general", "person.pdf"), - "socdem_comparison_households.pdf": ("documentation.plots.sociodemographics.general", "household.pdf"), - "socdem_spatial_comparison.pdf": ("documentation.plots.sociodemographics.local", "comparison.pdf"), - "activity_chain_comparison.pdf": ("documentation.plots.sociodemographics.chains", "activity_chains.pdf"), - - "commute_flow_bars.pdf": ("documentation.plots.commute_flow", "commute_flows.pdf"), - "commute_flow_boxplot.pdf": ("documentation.plots.commute_flow", "commute_flow_boxplot.pdf"), - "commute_distance_cdf.pdf": ("documentation.plots.commute_distance", "commute_distance_cdf.pdf"), - - "secloc_distributions.pdf": ("documentation.plots.secondary_locations", "input_distributions.pdf"), - "secloc_output.pdf": ("documentation.plots.secondary_locations", "distance_distributions.pdf"), - + "monte_carlo_table.tex": ( + 
"documentation.plots.monte_carlo", + "monte_carlo_table.tex", + ), + "income_distributions.pdf": ( + "documentation.plots.income", + "income_distributions.pdf", + ), + "socdem_comparison_persons.pdf": ( + "documentation.plots.sociodemographics.general", + "person.pdf", + ), + "socdem_comparison_households.pdf": ( + "documentation.plots.sociodemographics.general", + "household.pdf", + ), + "socdem_spatial_comparison.pdf": ( + "documentation.plots.sociodemographics.local", + "comparison.pdf", + ), + "activity_chain_comparison.pdf": ( + "documentation.plots.sociodemographics.chains", + "activity_chains.pdf", + ), + "commute_flow_bars.pdf": ( + "documentation.plots.commute_flow", + "commute_flows.pdf", + ), + "commute_flow_boxplot.pdf": ( + "documentation.plots.commute_flow", + "commute_flow_boxplot.pdf", + ), + "commute_distance_cdf.pdf": ( + "documentation.plots.commute_distance", + "commute_distance_cdf.pdf", + ), + "secloc_distributions.pdf": ( + "documentation.plots.secondary_locations", + "input_distributions.pdf", + ), + "secloc_output.pdf": ( + "documentation.plots.secondary_locations", + "distance_distributions.pdf", + ), "income.geojson": ("documentation.shapes", "income.geojson"), "info.tex": ("documentation.info.tex", "info.tex"), } for target, (stage, path) in mapping.items(): - shutil.copy("%s/%s" % (context.path(stage), path), "%s/%s" % (paper_path, target)) + shutil.copy( + "%s/%s" % (context.path(stage), path), "%s/%s" % (paper_path, target) + ) diff --git a/documentation/plots/commute_distance.py b/documentation/plots/commute_distance.py index 7d24fb3b..a11086de 100644 --- a/documentation/plots/commute_distance.py +++ b/documentation/plots/commute_distance.py @@ -7,12 +7,18 @@ SAMPLING_RATE = 0.05 + def configure(context): - context.stage("analysis.reference.hts.commute_distance", alias = "hts") - context.stage("analysis.synthesis.commute_distance", dict(sampling_rate = SAMPLING_RATE), alias = "data") - context.stage("analysis.reference.od.commute_distance", alias = "census") + context.stage("analysis.reference.hts.commute_distance", alias="hts") + context.stage( + "analysis.synthesis.commute_distance", + dict(sampling_rate=SAMPLING_RATE), + alias="data", + ) + context.stage("analysis.reference.od.commute_distance", alias="census") context.config("hts") + def execute(context): plotting.setup() @@ -21,32 +27,57 @@ def execute(context): census_data = context.stage("census") hts_name = context.config("hts") - plt.figure(figsize = plotting.SHORT_FIGSIZE) + plt.figure(figsize=plotting.SHORT_FIGSIZE) parts = [ - { "slot": "work", "linestyle": "-", "title": "Work" }, - { "slot": "education", "linestyle": "--", "title": "Educ." 
} + {"slot": "work", "linestyle": "-", "title": "Work"}, + {"slot": "education", "linestyle": "--", "title": "Educ."}, ] for part in parts: slot = part["slot"] - #plt.plot(census_data[slot]["centroid_distance"] * 1e-3, census_data[slot]["cdf"], color = plotting.COLORS["census"], linestyle = part["linestyle"], linewidth = 1.0) - - plt.plot(data[slot]["mean"], data[slot]["cdf"], color = "k", linestyle = part["linestyle"], linewidth = 1.0) - plt.fill_betweenx(data[slot]["cdf"], data[slot]["min"], data[slot]["max"], color = "k", linewidth = 0.0, alpha = 0.25) - - plt.plot(hts_data[slot]["euclidean_distance"] * 1e-3, hts_data[slot]["cdf"], color = plotting.COLORS[hts_name], linestyle = part["linestyle"], linewidth = 1.0) - - plt.plot([np.nan], color = "k", linewidth = 1.0, linestyle = part["linestyle"], label = part["title"]) - - plt.plot([np.nan], color = "k", linewidth = 1.0, label = "Synthetic") - plt.plot([np.nan], color = plotting.COLORS[hts_name], linewidth = 1.0, label = "HTS") + # plt.plot(census_data[slot]["centroid_distance"] * 1e-3, census_data[slot]["cdf"], color = plotting.COLORS["census"], linestyle = part["linestyle"], linewidth = 1.0) + + plt.plot( + data[slot]["mean"], + data[slot]["cdf"], + color="k", + linestyle=part["linestyle"], + linewidth=1.0, + ) + plt.fill_betweenx( + data[slot]["cdf"], + data[slot]["min"], + data[slot]["max"], + color="k", + linewidth=0.0, + alpha=0.25, + ) + + plt.plot( + hts_data[slot]["euclidean_distance"] * 1e-3, + hts_data[slot]["cdf"], + color=plotting.COLORS[hts_name], + linestyle=part["linestyle"], + linewidth=1.0, + ) + + plt.plot( + [np.nan], + color="k", + linewidth=1.0, + linestyle=part["linestyle"], + label=part["title"], + ) + + plt.plot([np.nan], color="k", linewidth=1.0, label="Synthetic") + plt.plot([np.nan], color=plotting.COLORS[hts_name], linewidth=1.0, label="HTS") plt.xlim([0, 40]) plt.ylim([0, 1]) - plt.legend(loc = "best", ncol = 2) + plt.legend(loc="best", ncol=2) plt.grid() plt.gca().set_axisbelow(True) diff --git a/documentation/plots/commute_flow.py b/documentation/plots/commute_flow.py index 2203b3f5..e3fe47f5 100644 --- a/documentation/plots/commute_flow.py +++ b/documentation/plots/commute_flow.py @@ -7,12 +7,18 @@ SAMPLING_RATE = 0.05 + def configure(context): context.config("hts") - context.stage("analysis.reference.od.commute_flow", alias = "census") - context.stage("analysis.reference.hts.commute_flow", alias = "hts") - context.stage("analysis.synthesis.commute_flow", dict(sampling_rate = SAMPLING_RATE), alias = "data") + context.stage("analysis.reference.od.commute_flow", alias="census") + context.stage("analysis.reference.hts.commute_flow", alias="hts") + context.stage( + "analysis.synthesis.commute_flow", + dict(sampling_rate=SAMPLING_RATE), + alias="data", + ) + def execute(context): plotting.setup() @@ -22,11 +28,11 @@ def execute(context): df_hts, df_correction = context.stage("hts") # PLOT: Work / education flows - plt.figure(figsize = plotting.WIDE_FIGSIZE) + plt.figure(figsize=plotting.WIDE_FIGSIZE) figures = [ - { "slot": "work", "title": "Work", "top": 12 }, - { "slot": "education", "title": "Education", "top": 12, "factor": 0.7 } + {"slot": "work", "title": "Work", "top": 12}, + {"slot": "education", "title": "Education", "top": 12, "factor": 0.7}, ] for index, figure in enumerate(figures): @@ -34,35 +40,77 @@ def execute(context): slot = figure["slot"] df = context.stage("data")[slot] - df = pd.merge(df, df_census[slot].rename(columns = { "weight": "reference" }), on = ["home", slot]) - df = pd.merge(df, 
df_correction[slot], on = "home") - df["scaled_reference"] = df["reference"] * (figure["factor"] if "factor" in figure else df["factor"]) + df = pd.merge( + df, + df_census[slot].rename(columns={"weight": "reference"}), + on=["home", slot], + ) + df = pd.merge(df, df_correction[slot], on="home") + df["scaled_reference"] = df["reference"] * ( + figure["factor"] if "factor" in figure else df["factor"] + ) count = figure["top"] - df = df.sort_values(by = "scaled_reference", ascending = False).head(count) - - plt.bar(np.arange(count), df["reference"], width = 0.4, align = "edge", linewidth = 0.5, edgecolor = "white", color = plotting.COLORS["census"], alpha = 0.25) - plt.bar(np.arange(count), df["scaled_reference"], width = 0.4, label = "Census", align = "edge", linewidth = 0.5, edgecolor = "white", color = plotting.COLORS["census"]) - plt.bar(np.arange(count) + 0.4, df["mean"] / SAMPLING_RATE, width = 0.4, label = "Synthetic", align = "edge", linewidth = 0.5, edgecolor = "white", color = plotting.COLORS["synthetic"]) + df = df.sort_values(by="scaled_reference", ascending=False).head(count) + + plt.bar( + np.arange(count), + df["reference"], + width=0.4, + align="edge", + linewidth=0.5, + edgecolor="white", + color=plotting.COLORS["census"], + alpha=0.25, + ) + plt.bar( + np.arange(count), + df["scaled_reference"], + width=0.4, + label="Census", + align="edge", + linewidth=0.5, + edgecolor="white", + color=plotting.COLORS["census"], + ) + plt.bar( + np.arange(count) + 0.4, + df["mean"] / SAMPLING_RATE, + width=0.4, + label="Synthetic", + align="edge", + linewidth=0.5, + edgecolor="white", + color=plotting.COLORS["synthetic"], + ) for index, (min, max) in enumerate(zip(df["min"].values, df["max"].values)): index += 0.4 + 0.2 - plt.plot([index, index], [min / SAMPLING_RATE, max / SAMPLING_RATE], color = 'k', linewidth = 1.0) + plt.plot( + [index, index], + [min / SAMPLING_RATE, max / SAMPLING_RATE], + color="k", + linewidth=1.0, + ) plt.grid() plt.gca().set_axisbelow(True) - plt.gca().xaxis.grid(alpha = 0.0) + plt.gca().xaxis.grid(alpha=0.0) plt.gca().yaxis.set_major_locator(tck.FixedLocator(np.arange(100) * 1e5)) - plt.gca().yaxis.set_major_formatter(tck.FuncFormatter(lambda x,p: "%d" % (x * 1e-3,))) + plt.gca().yaxis.set_major_formatter( + tck.FuncFormatter(lambda x, p: "%d" % (x * 1e-3,)) + ) origins, destinations = df["home"].values, df[figure["slot"]].values plt.gca().xaxis.set_major_locator(tck.FixedLocator(np.arange(count) + 0.4)) - plt.gca().xaxis.set_major_formatter(tck.FixedFormatter(["%s\n%s" % item for item in zip(origins, destinations)])) + plt.gca().xaxis.set_major_formatter( + tck.FixedFormatter(["%s\n%s" % item for item in zip(origins, destinations)]) + ) plt.ylabel("Commuters [x1000]") - plt.legend(loc = "best") + plt.legend(loc="best") plt.title(figure["title"]) plt.tight_layout() @@ -70,11 +118,17 @@ def execute(context): plt.close() # PLOT: Scatter - plt.figure(figsize = plotting.SHORT_FIGSIZE) + plt.figure(figsize=plotting.SHORT_FIGSIZE) parts = [ - { "slot": "work", "title": "Work", "marker": ".", "color": "k" }, - { "slot": "education", "title": "Education", "factor": 0.7, "marker": ".", "color": plotting.COLORS[hts_name] } + {"slot": "work", "title": "Work", "marker": ".", "color": "k"}, + { + "slot": "education", + "title": "Education", + "factor": 0.7, + "marker": ".", + "color": plotting.COLORS[hts_name], + }, ] minimum = np.inf @@ -84,17 +138,33 @@ def execute(context): slot = part["slot"] df = context.stage("data")[slot] - df = pd.merge(df, 
df_census[slot].rename(columns = { "weight": "reference" }), on = ["home", slot]) - df = pd.merge(df, df_correction[slot], on = "home") - df["scaled_reference"] = df["reference"] * (part["factor"] if "factor" in part else df["factor"]) - - plt.loglog(df["scaled_reference"], df["mean"] / SAMPLING_RATE, markersize = 2, marker = part["marker"], color = part["color"], linestyle = "none", label = part["title"]) + df = pd.merge( + df, + df_census[slot].rename(columns={"weight": "reference"}), + on=["home", slot], + ) + df = pd.merge(df, df_correction[slot], on="home") + df["scaled_reference"] = df["reference"] * ( + part["factor"] if "factor" in part else df["factor"] + ) + + plt.loglog( + df["scaled_reference"], + df["mean"] / SAMPLING_RATE, + markersize=2, + marker=part["marker"], + color=part["color"], + linestyle="none", + label=part["title"], + ) minimum = np.minimum(minimum, df["scaled_reference"].min() * 0.9) maximum = np.maximum(maximum, df["scaled_reference"].max() * 1.1) x = np.linspace(minimum, maximum, 100) - plt.fill_between(x, x * 0.8, x * 1.2, color = "k", alpha = 0.2, linewidth = 0.0, label = r"20% Error") + plt.fill_between( + x, x * 0.8, x * 1.2, color="k", alpha=0.2, linewidth=0.0, label=r"20% Error" + ) plt.xlim([minimum, maximum]) plt.ylim([minimum, maximum]) @@ -111,37 +181,60 @@ def execute(context): plt.close() # PLOT: Histogram - plt.figure(figsize = plotting.SHORT_FIGSIZE) + plt.figure(figsize=plotting.SHORT_FIGSIZE) parts = [ - { "slot": "work", "title": "Work" }, - { "slot": "education", "title": "Education", "factor": 0.7 } + {"slot": "work", "title": "Work"}, + {"slot": "education", "title": "Education", "factor": 0.7}, ] for index, part in enumerate(parts): slot = part["slot"] df = context.stage("data")[slot] - df = pd.merge(df, df_census[slot].rename(columns = { "weight": "reference" }), on = ["home", slot]) - df = pd.merge(df, df_correction[slot], on = "home") - df["scaled_reference"] = df["reference"] * (part["factor"] if "factor" in part else df["factor"]) - - df["difference"] = 100 * (df["mean"] / SAMPLING_RATE - df["scaled_reference"]) / df["scaled_reference"] + df = pd.merge( + df, + df_census[slot].rename(columns={"weight": "reference"}), + on=["home", slot], + ) + df = pd.merge(df, df_correction[slot], on="home") + df["scaled_reference"] = df["reference"] * ( + part["factor"] if "factor" in part else df["factor"] + ) + + df["difference"] = ( + 100 + * (df["mean"] / SAMPLING_RATE - df["scaled_reference"]) + / df["scaled_reference"] + ) min = df["difference"].min() max = df["difference"].max() mean = df["difference"].mean() values = df["difference"].values - outliers = values # values[(values < min) | (values > max)] - - plt.plot([index - 0.2, index + 0.2], [min, min], color = "k", linewidth = 1.0) - plt.plot([index - 0.2, index + 0.2], [max, max], color = "k", linewidth = 1.0) - plt.plot([index - 0.2, index + 0.2], [mean, mean], color = "k", linewidth = 1.0, linestyle = ":") - plt.plot([index - 0.2, index - 0.2], [min, max], color = "k", linewidth = 1.0) - plt.plot([index + 0.2, index + 0.2], [min, max], color = "k", linewidth = 1.0) - - plt.plot([index] * len(outliers), outliers, color = "k", marker = ".", markersize = 2, linestyle = "none") + outliers = values # values[(values < min) | (values > max)] + + plt.plot([index - 0.2, index + 0.2], [min, min], color="k", linewidth=1.0) + plt.plot([index - 0.2, index + 0.2], [max, max], color="k", linewidth=1.0) + plt.plot( + [index - 0.2, index + 0.2], + [mean, mean], + color="k", + linewidth=1.0, + 
linestyle=":", + ) + plt.plot([index - 0.2, index - 0.2], [min, max], color="k", linewidth=1.0) + plt.plot([index + 0.2, index + 0.2], [min, max], color="k", linewidth=1.0) + + plt.plot( + [index] * len(outliers), + outliers, + color="k", + marker=".", + markersize=2, + linestyle="none", + ) plt.gca().xaxis.set_major_locator(tck.FixedLocator([0, 1])) plt.gca().xaxis.set_major_formatter(tck.FixedFormatter(["Work", "Education"])) @@ -151,12 +244,14 @@ def execute(context): plt.xlim([-0.5, 1.5]) plt.grid() plt.gca().set_axisbelow(True) - plt.gca().xaxis.grid(alpha = 0.0) + plt.gca().xaxis.grid(alpha=0.0) - plt.bar([np.nan], [np.nan], color = "none", edgecolor = "k", linewidth = 1.0, label = "5% - 95%") - plt.plot([np.nan], color = "k", linestyle = ":", label = "Mean") + plt.bar( + [np.nan], [np.nan], color="none", edgecolor="k", linewidth=1.0, label="5% - 95%" + ) + plt.plot([np.nan], color="k", linestyle=":", label="Mean") - plt.legend(loc = "best") + plt.legend(loc="best") plt.tight_layout() plt.savefig("%s/commute_flow_boxplot.pdf" % context.path()) diff --git a/documentation/plots/data/hts_chains.py b/documentation/plots/data/hts_chains.py index 3da51ecf..1b47c36b 100644 --- a/documentation/plots/data/hts_chains.py +++ b/documentation/plots/data/hts_chains.py @@ -5,46 +5,74 @@ import matplotlib.ticker as tck import documentation.plotting as plotting + def configure(context): - context.stage("analysis.reference.hts.chains", { "hts": "egt" }, alias = "egt") - context.stage("analysis.reference.hts.chains", { "hts": "entd" }, alias = "entd") + context.stage("analysis.reference.hts.chains", {"hts": "egt"}, alias="egt") + context.stage("analysis.reference.hts.chains", {"hts": "entd"}, alias="entd") + def execute(context): plotting.setup() marginal = ("age_range", "sex", "chain") - df_egt = context.stage("egt")[marginal].rename(columns = { "weight": "egt" }) - df_entd = context.stage("entd")[marginal].rename(columns = { "weight": "entd" }) + df_egt = context.stage("egt")[marginal].rename(columns={"weight": "egt"}) + df_entd = context.stage("entd")[marginal].rename(columns={"weight": "entd"}) - df = pd.merge(df_egt, df_entd, on = ["age_range", "sex", "chain"]) + df = pd.merge(df_egt, df_entd, on=["age_range", "sex", "chain"]) df = df[df["age_range"]] - df_female = df[df["sex"] == "female"].sort_values(by = "egt", ascending = False).head(10) - df_male = df[df["sex"] == "male"].sort_values(by = "egt", ascending = False).head(10) + df_female = ( + df[df["sex"] == "female"].sort_values(by="egt", ascending=False).head(10) + ) + df_male = df[df["sex"] == "male"].sort_values(by="egt", ascending=False).head(10) - plt.figure(figsize = plotting.WIDE_FIGSIZE) + plt.figure(figsize=plotting.WIDE_FIGSIZE) - for index, (df, title) in enumerate(zip([df_male, df_female], ["Male (18-40)", "Female (18-40)"])): + for index, (df, title) in enumerate( + zip([df_male, df_female], ["Male (18-40)", "Female (18-40)"]) + ): plt.subplot(1, 2, index + 1) - plt.bar(np.arange(10), df["egt"], width = 0.4, label = "EGT", align = "edge", linewidth = 0.5, edgecolor = "white", color = plotting.COLORS["egt"]) - plt.bar(np.arange(10) + 0.4, df["entd"], width = 0.4, label = "ENTD", align = "edge", linewidth = 0.5, edgecolor = "white", color = plotting.COLORS["entd"]) + plt.bar( + np.arange(10), + df["egt"], + width=0.4, + label="EGT", + align="edge", + linewidth=0.5, + edgecolor="white", + color=plotting.COLORS["egt"], + ) + plt.bar( + np.arange(10) + 0.4, + df["entd"], + width=0.4, + label="ENTD", + align="edge", + linewidth=0.5, + 
edgecolor="white", + color=plotting.COLORS["entd"], + ) plt.grid() plt.gca().set_axisbelow(True) - plt.gca().xaxis.grid(alpha = 0.0) + plt.gca().xaxis.grid(alpha=0.0) plt.gca().yaxis.set_major_locator(tck.FixedLocator(np.arange(100) * 1e5)) - plt.gca().yaxis.set_major_formatter(tck.FuncFormatter(lambda x,p: "%d" % (x * 1e-3,))) + plt.gca().yaxis.set_major_formatter( + tck.FuncFormatter(lambda x, p: "%d" % (x * 1e-3,)) + ) plt.gca().xaxis.set_major_locator(tck.FixedLocator(np.arange(10) + 0.4)) - plt.gca().xaxis.set_major_formatter(tck.FuncFormatter(lambda x,p: "\n".join(df["chain"].values[p]).upper())) + plt.gca().xaxis.set_major_formatter( + tck.FuncFormatter(lambda x, p: "\n".join(df["chain"].values[p]).upper()) + ) if index == 1: plt.gca().yaxis.set_major_formatter(tck.FixedFormatter([""] * 1000)) plt.gca().yaxis.get_label().set_visible(False) - plt.legend(loc = "best", title = title) + plt.legend(loc="best", title=title) if index == 0: plt.ylabel("Number of persons [x1000]") diff --git a/documentation/plots/data/hts_comparison.py b/documentation/plots/data/hts_comparison.py index 4cb2d35e..c98186d0 100644 --- a/documentation/plots/data/hts_comparison.py +++ b/documentation/plots/data/hts_comparison.py @@ -6,9 +6,11 @@ import documentation.plotting as plotting + def configure(context): context.stage("data.hts.comparison") + def execute(context): plotting.setup() @@ -22,20 +24,37 @@ def execute(context): plt.figure() - plt.bar(df_distance[f_entd]["distance_class"].values, df_distance[f_entd]["trip_weight"].values / 1e6, width = 0.4, label = "ENTD (Routed)", align = "edge", color = plotting.COLORS["entd"], linewidth = 0.5, edgecolor = "white") - plt.bar(df_distance[f_egt]["distance_class"].values + 0.4, df_distance[f_egt]["trip_weight"].values / 1e6, width = 0.4, label = "EGT (Euclidean)", align = "edge", color = plotting.COLORS["egt"], linewidth = 0.5, edgecolor = "white") + plt.bar( + df_distance[f_entd]["distance_class"].values, + df_distance[f_entd]["trip_weight"].values / 1e6, + width=0.4, + label="ENTD (Routed)", + align="edge", + color=plotting.COLORS["entd"], + linewidth=0.5, + edgecolor="white", + ) + plt.bar( + df_distance[f_egt]["distance_class"].values + 0.4, + df_distance[f_egt]["trip_weight"].values / 1e6, + width=0.4, + label="EGT (Euclidean)", + align="edge", + color=plotting.COLORS["egt"], + linewidth=0.5, + edgecolor="white", + ) plt.gca().xaxis.set_major_locator(tck.FixedLocator(np.arange(0, 10, 2) + 0.4)) - plt.gca().xaxis.set_major_formatter(tck.FixedFormatter(["<%dkm" % d for d in np.arange(1, 10, 2)])) - - plt.gca().annotate( - r"≥10 km", - xy = (10.0, 8.0), xycoords = 'data', ha = "right" + plt.gca().xaxis.set_major_formatter( + tck.FixedFormatter(["<%dkm" % d for d in np.arange(1, 10, 2)]) ) + plt.gca().annotate(r"≥10 km", xy=(10.0, 8.0), xycoords="data", ha="right") + plt.grid() plt.gca().set_axisbelow(True) - plt.gca().xaxis.grid(alpha = 0.0) + plt.gca().xaxis.grid(alpha=0.0) plt.xlabel("Trip distance") plt.ylabel("Number of trips [$10^6$]") @@ -55,37 +74,70 @@ def execute(context): plt.figure() - plt.bar(df_age[f_census]["age_class"].values, df_age[f_census]["person_weight"].values / 1e6, width = 0.25, label = "Census", align = "edge", color = plotting.COLORS["census"], linewidth = 0.5, edgecolor = "white") - plt.bar(df_age[f_entd]["age_class"].values + 0.25, df_age[f_entd]["person_weight"].values / 1e6, width = 0.25, label = "ENTD", align = "edge", color = plotting.COLORS["entd"], linewidth = 0.5, edgecolor = "white") - 
plt.bar(df_age[f_egt]["age_class"].values + 0.5, df_age[f_egt]["person_weight"].values / 1e6, width = 0.25, label = "EGT", align = "edge", color = plotting.COLORS["egt"], linewidth = 0.5, edgecolor = "white") + plt.bar( + df_age[f_census]["age_class"].values, + df_age[f_census]["person_weight"].values / 1e6, + width=0.25, + label="Census", + align="edge", + color=plotting.COLORS["census"], + linewidth=0.5, + edgecolor="white", + ) + plt.bar( + df_age[f_entd]["age_class"].values + 0.25, + df_age[f_entd]["person_weight"].values / 1e6, + width=0.25, + label="ENTD", + align="edge", + color=plotting.COLORS["entd"], + linewidth=0.5, + edgecolor="white", + ) + plt.bar( + df_age[f_egt]["age_class"].values + 0.5, + df_age[f_egt]["person_weight"].values / 1e6, + width=0.25, + label="EGT", + align="edge", + color=plotting.COLORS["egt"], + linewidth=0.5, + edgecolor="white", + ) plt.gca().xaxis.set_major_locator(tck.FixedLocator(np.arange(1000) + 0.75 / 2)) - plt.gca().xaxis.set_major_formatter(tck.FixedFormatter(["%d0s" % d for d in np.arange(1, 10, 2)])) + plt.gca().xaxis.set_major_formatter( + tck.FixedFormatter(["%d0s" % d for d in np.arange(1, 10, 2)]) + ) AGE_BOUNDS = ["<15", "15-29", "30-44", "45-59", "60-74", ">75"] plt.gca().xaxis.set_major_formatter(tck.FixedFormatter(AGE_BOUNDS)) plt.gca().annotate( "A", - xy = (1.5 + 0.5 * 0.25, 2.0), xycoords='data', - xytext = (1.5 + 0.5 * 0.25, 2.35), textcoords='data', - arrowprops = { "arrowstyle": "-|>", "facecolor": "black", "linewidth": 0.5 }, - bbox = { "pad": 0.0, "linewidth": 0.0, "facecolor": (1.0, 0.0, 0.0, 0.0) }, - ha = 'center' + xy=(1.5 + 0.5 * 0.25, 2.0), + xycoords="data", + xytext=(1.5 + 0.5 * 0.25, 2.35), + textcoords="data", + arrowprops={"arrowstyle": "-|>", "facecolor": "black", "linewidth": 0.5}, + bbox={"pad": 0.0, "linewidth": 0.0, "facecolor": (1.0, 0.0, 0.0, 0.0)}, + ha="center", ) plt.gca().annotate( "B", - xy = (4.25 + 0.5 * 0.25, 1.3), xycoords='data', - xytext = (4.25 + 0.5 * 0.25, 1.65), textcoords='data', - arrowprops = { "arrowstyle": "-|>", "facecolor": "black", "linewidth": 0.5 }, - bbox = { "pad": 0.0, "linewidth": 0.0, "facecolor": (1.0, 0.0, 0.0, 0.0) }, - ha = 'center' + xy=(4.25 + 0.5 * 0.25, 1.3), + xycoords="data", + xytext=(4.25 + 0.5 * 0.25, 1.65), + textcoords="data", + arrowprops={"arrowstyle": "-|>", "facecolor": "black", "linewidth": 0.5}, + bbox={"pad": 0.0, "linewidth": 0.0, "facecolor": (1.0, 0.0, 0.0, 0.0)}, + ha="center", ) plt.grid() plt.gca().set_axisbelow(True) - plt.gca().xaxis.grid(alpha = 0.0) + plt.gca().xaxis.grid(alpha=0.0) plt.xlabel("Age") plt.ylabel("Number of persons [x$10^6$]") diff --git a/documentation/plots/income.py b/documentation/plots/income.py index f24ae137..b5029c5b 100644 --- a/documentation/plots/income.py +++ b/documentation/plots/income.py @@ -8,18 +8,24 @@ SAMPLING_RATE = 0.05 + def configure(context): context.stage("data.income.municipality") - context.stage("analysis.synthesis.income", dict(sampling_rate = SAMPLING_RATE), alias = "data") + context.stage( + "analysis.synthesis.income", dict(sampling_rate=SAMPLING_RATE), alias="data" + ) context.stage("analysis.reference.income") + def execute(context): plotting.setup() # Income imputation df_income = context.stage("data.income.municipality") - df_income = df_income[(df_income["attribute"] == "all") & (df_income["value"] == "all")] + df_income = df_income[ + (df_income["attribute"] == "all") & (df_income["value"] == "all") + ] df_imputed = df_income[df_income["is_imputed"]] plt.figure() @@ -29,8 +35,21 @@ def 
execute(context): plt.plot([minimum, maximum], [minimum, maximum], "k--") f = ~df_imputed["is_missing"] - plt.plot(df_imputed[f]["reference_median"] * 1e-3, df_imputed[f]["q5"] * 1e-3, '.', markersize = 3, color = plotting.COLORSET[0], label = "y") - plt.plot(df_imputed[~f]["reference_median"] * 1e-3, df_imputed[~f]["q5"] * 1e-3, 'x', markersize = 3, color = plotting.COLORSET[1]) + plt.plot( + df_imputed[f]["reference_median"] * 1e-3, + df_imputed[f]["q5"] * 1e-3, + ".", + markersize=3, + color=plotting.COLORSET[0], + label="y", + ) + plt.plot( + df_imputed[~f]["reference_median"] * 1e-3, + df_imputed[~f]["q5"] * 1e-3, + "x", + markersize=3, + color=plotting.COLORSET[1], + ) plt.xlabel("Reference median income [1000 EUR]") plt.ylabel("Imputed median income [1000 EUR]") @@ -47,23 +66,57 @@ def execute(context): df_reference = context.stage("analysis.reference.income") f = df_reference["source"] == "entd" - plt.plot(df_reference[f]["income"].values * 1e-3, df_reference[f]["cdf"].values, color = plotting.COLORS["entd"], label = "ENTD", linewidth = 1.0) + plt.plot( + df_reference[f]["income"].values * 1e-3, + df_reference[f]["cdf"].values, + color=plotting.COLORS["entd"], + label="ENTD", + linewidth=1.0, + ) f = df_reference["source"] == "egt" - plt.plot(df_reference[f]["income"].values * 1e-3, df_reference[f]["cdf"].values, color = plotting.COLORS["egt"], label = "EGT", linewidth = 1.0) + plt.plot( + df_reference[f]["income"].values * 1e-3, + df_reference[f]["cdf"].values, + color=plotting.COLORS["egt"], + label="EGT", + linewidth=1.0, + ) f = df_reference["source"] == "filo" - plt.plot(df_reference[f]["income"].values * 1e-3, df_reference[f]["cdf"].values, color = plotting.COLORS["census"], label = "Tax data", linewidth = 1.0, marker = ".", markersize = 3) - - plt.plot(df_data["mean"].values * 1e-3, df_data["cdf"].values, color = "k", label = "Synthetic", linewidth = 1.0, linestyle = ":") - plt.fill_betweenx(df_data["cdf"].values, df_data["min"].values * 1e-3, df_data["max"].values * 1e-3, color = "k", linewidth = 0.0, alpha = 0.25) + plt.plot( + df_reference[f]["income"].values * 1e-3, + df_reference[f]["cdf"].values, + color=plotting.COLORS["census"], + label="Tax data", + linewidth=1.0, + marker=".", + markersize=3, + ) + + plt.plot( + df_data["mean"].values * 1e-3, + df_data["cdf"].values, + color="k", + label="Synthetic", + linewidth=1.0, + linestyle=":", + ) + plt.fill_betweenx( + df_data["cdf"].values, + df_data["min"].values * 1e-3, + df_data["max"].values * 1e-3, + color="k", + linewidth=0.0, + alpha=0.25, + ) plt.xlim([0, 60]) plt.xlabel("Household income [1000 EUR]") plt.ylabel("Cumulative density") - plt.legend(loc = "lower right") + plt.legend(loc="lower right") plt.grid() plt.tight_layout() diff --git a/documentation/plots/language.py b/documentation/plots/language.py index 03131b97..3d1f9d43 100644 --- a/documentation/plots/language.py +++ b/documentation/plots/language.py @@ -1,5 +1,7 @@ - def get_source(source): - if source == "egt": return "EGT" - if source == "entd": return "ENTD" - if source == "census": return "Census" + if source == "egt": + return "EGT" + if source == "entd": + return "ENTD" + if source == "census": + return "Census" diff --git a/documentation/plots/matching.py b/documentation/plots/matching.py index d1777a8e..05ae8b00 100644 --- a/documentation/plots/matching.py +++ b/documentation/plots/matching.py @@ -7,19 +7,25 @@ SAMPLING_RATE = 0.05 POPULATION_SAMPLES = 200 + def configure(context): - context.stage("analysis.matching", { - "sampling_rate": 
SAMPLING_RATE, - "analysis_populations": POPULATION_SAMPLES, - }, alias = "data") + context.stage( + "analysis.matching", + { + "sampling_rate": SAMPLING_RATE, + "analysis_populations": POPULATION_SAMPLES, + }, + alias="data", + ) + def execute(context): data = context.stage("data") variables = max(data.keys()) + 1 means = [np.mean(data[v] / data[0]) for v in range(variables)] - #mins = [np.percentile(data[v] / data[0], 10) for v in range(variables)] - #maxs = [np.percentile(data[v] / data[0], 90) for v in range(variables)] + # mins = [np.percentile(data[v] / data[0], 10) for v in range(variables)] + # maxs = [np.percentile(data[v] / data[0], 90) for v in range(variables)] mins = [np.min(data[v] / data[0]) for v in range(variables)] maxs = [np.max(data[v] / data[0]) for v in range(variables)] @@ -28,16 +34,27 @@ def execute(context): plotting.setup() plt.figure() - plt.bar(range(variables), means, color = plotting.COLORS["synthetic"]) + plt.bar(range(variables), means, color=plotting.COLORS["synthetic"]) for v, min, max in zip(range(variables), mins, maxs): - plt.plot([v, v,], [min, max], linewidth = 1, label = "90% Conf.", color = "k") + plt.plot( + [ + v, + v, + ], + [min, max], + linewidth=1, + label="90% Conf.", + color="k", + ) plt.xlabel("Variables") plt.ylabel("Matching rate") plt.gca().yaxis.set_major_locator(tck.FixedLocator(np.arange(100) * 0.2)) - plt.gca().yaxis.set_major_formatter(tck.FuncFormatter(lambda x,p: "%d%%" % (100 * x,))) + plt.gca().yaxis.set_major_formatter( + tck.FuncFormatter(lambda x, p: "%d%%" % (100 * x,)) + ) plt.tight_layout() plt.savefig("%s/matching_rate.pdf" % context.path()) diff --git a/documentation/plots/monte_carlo.py b/documentation/plots/monte_carlo.py index 122f11e4..cd71c018 100644 --- a/documentation/plots/monte_carlo.py +++ b/documentation/plots/monte_carlo.py @@ -9,10 +9,12 @@ from analysis.synthesis.statistics.monte_carlo import SAMPLING_RATES from analysis.synthesis.statistics.monte_carlo import ACQUISITION_SAMPLE_SIZE + def configure(context): context.stage("analysis.reference.census.sociodemographics") context.stage("analysis.synthesis.statistics.monte_carlo") + SELECTED_MARGINAL = ("age_class", "employed") SELECTED_VALUES = (3, True) @@ -24,14 +26,13 @@ def configure(context): "studies", ] -ADDITIONAL_VALUES = [ - (3, True), (4, True), (5, True) -] +ADDITIONAL_VALUES = [(3, True), (4, True), (5, True)] from analysis.marginals import AGE_CLASS_LABELS ADDITIONAL_LABELS = AGE_CLASS_LABELS[3:6] + def select(reference, data, marginal, values): df_marginal = data[marginal] df_reference = reference[marginal] @@ -44,6 +45,7 @@ def select(reference, data, marginal, values): return df_marginal, reference_value + def execute(context): data = context.stage("analysis.synthesis.statistics.monte_carlo") @@ -55,13 +57,15 @@ def execute(context): values = np.sort(df_marginal[(marginal,)].drop_duplicates().values) for value in values: - row = { "marginal": marginal, "value": value } + row = {"marginal": marginal, "value": value} df_value = df_marginal[df_marginal[marginal] == value] df_value = df_value[df_value["samples"] == ACQUISITION_SAMPLE_SIZE] assert len(df_value) == len(SAMPLING_RATES) - probabilities = df_value.sort_values(by = ["sampling_rate", "samples"])["error_probability"].values[:,0] + probabilities = df_value.sort_values(by=["sampling_rate", "samples"])[ + "error_probability" + ].values[:, 0] for sampling_rate, probability in zip(SAMPLING_RATES, probabilities): row[sampling_rate] = probability @@ -70,7 +74,7 @@ def execute(context): df_table 
= pd.DataFrame.from_records(df_table) df_table = create_table(df_table) - df_table.to_latex("%s/monte_carlo_table.tex" % context.path(), escape = False) + df_table.to_latex("%s/monte_carlo_table.tex" % context.path(), escape=False) # Prepare data for plotting reference = context.stage("analysis.reference.census.sociodemographics")["person"] @@ -78,52 +82,100 @@ def execute(context): # Perform plotting plotting.setup() - plt.figure(figsize = plotting.WIDE_FIGSIZE) + plt.figure(figsize=plotting.WIDE_FIGSIZE) # ... subplot on nominal stratum values plt.subplot(1, 2, 1) - plt.title("(a) Monte Carlo analysis", fontsize = plotting.FONT_SIZE) + plt.title("(a) Monte Carlo analysis", fontsize=plotting.FONT_SIZE) - df_marginal, reference_value = select(reference, data, SELECTED_MARGINAL, SELECTED_VALUES) + df_marginal, reference_value = select( + reference, data, SELECTED_MARGINAL, SELECTED_VALUES + ) assert len(df_marginal) == ACQUISITION_SAMPLE_SIZE * len(SAMPLING_RATES) display_sampling_rates = [0.001, 0.01, 0.05] for index, sampling_rate in enumerate([0.001, 0.01, 0.05]): df_rate = df_marginal[df_marginal["sampling_rate"] == sampling_rate] - df_rate = df_rate.sort_values(by = "samples") - plt.fill_between(df_rate["samples"], df_rate[("weight", "q5")], df_rate[("weight", "q95")], alpha = 0.25 + index * 0.2, color = plotting.COLORSET[0], linewidth = 0.0) - - plt.plot([1, ACQUISITION_SAMPLE_SIZE], [reference_value] * 2, 'k--', label = "Ref. $y$", linewidth = 1.0) - plt.plot([1, ACQUISITION_SAMPLE_SIZE], [reference_value * 0.99] * 2, 'k:', label = "1% Err.", linewidth = 1.0) - plt.plot([1, ACQUISITION_SAMPLE_SIZE], [reference_value * 1.01] * 2, 'k:', linewidth = 1.0) + df_rate = df_rate.sort_values(by="samples") + plt.fill_between( + df_rate["samples"], + df_rate[("weight", "q5")], + df_rate[("weight", "q95")], + alpha=0.25 + index * 0.2, + color=plotting.COLORSET[0], + linewidth=0.0, + ) + + plt.plot( + [1, ACQUISITION_SAMPLE_SIZE], + [reference_value] * 2, + "k--", + label="Ref. $y$", + linewidth=1.0, + ) + plt.plot( + [1, ACQUISITION_SAMPLE_SIZE], + [reference_value * 0.99] * 2, + "k:", + label="1% Err.", + linewidth=1.0, + ) + plt.plot( + [1, ACQUISITION_SAMPLE_SIZE], [reference_value * 1.01] * 2, "k:", linewidth=1.0 + ) plt.xlabel("Sample size $N$") plt.ylabel("Stratum weight") - plt.gca().yaxis.set_major_formatter(tck.FuncFormatter(lambda x, p: "%.2fM" % (x * 1e-6,))) + plt.gca().yaxis.set_major_formatter( + tck.FuncFormatter(lambda x, p: "%.2fM" % (x * 1e-6,)) + ) plt.grid() plt.gca().set_axisbelow(True) plt.xlim([1, ACQUISITION_SAMPLE_SIZE]) - plt.fill_between([np.nan], [np.nan], [np.nan], color = plotting.COLORSET[0], alpha = 0.25, label = "90% Conf.") - plt.legend(loc = "lower center", ncol = 2) + plt.fill_between( + [np.nan], + [np.nan], + [np.nan], + color=plotting.COLORSET[0], + alpha=0.25, + label="90% Conf.", + ) + plt.legend(loc="lower center", ncol=2) # ... 
subplot on nominal stratum values plt.subplot(1, 2, 2) - plt.title("(b) Error probability", fontsize = plotting.FONT_SIZE) + plt.title("(b) Error probability", fontsize=plotting.FONT_SIZE) for index, values in enumerate(ADDITIONAL_VALUES): - df_marginal, reference_value = select(reference, data, SELECTED_MARGINAL, values) + df_marginal, reference_value = select( + reference, data, SELECTED_MARGINAL, values + ) assert len(df_marginal) == ACQUISITION_SAMPLE_SIZE * len(SAMPLING_RATES) df_max = df_marginal[df_marginal["samples"] == ACQUISITION_SAMPLE_SIZE] - df_max = df_max.sort_values(by = "sampling_rate") - - plt.plot(100 * np.array(SAMPLING_RATES), df_max[("error_probability", "mean")], color = plotting.COLORSET[index], label = "Age %s" % ADDITIONAL_LABELS[index], marker = ".", markersize = 3.0, linewidth = 1.0) - - plt.plot([0, 100 * max(SAMPLING_RATES)], [0.9] * 2, 'k:', label = "90% Prob.", linewidth = 1.0) + df_max = df_max.sort_values(by="sampling_rate") + + plt.plot( + 100 * np.array(SAMPLING_RATES), + df_max[("error_probability", "mean")], + color=plotting.COLORSET[index], + label="Age %s" % ADDITIONAL_LABELS[index], + marker=".", + markersize=3.0, + linewidth=1.0, + ) + + plt.plot( + [0, 100 * max(SAMPLING_RATES)], + [0.9] * 2, + "k:", + label="90% Prob.", + linewidth=1.0, + ) plt.xlim([0, 100 * max(SAMPLING_RATES)]) plt.ylim([0, 1.0]) @@ -133,14 +185,16 @@ def execute(context): plt.grid() plt.gca().set_axisbelow(True) - plt.legend(loc = "center", ncol = 1) + plt.legend(loc="center", ncol=1) plt.tight_layout() plt.savefig("%s/monte_carlo.pdf" % context.path()) plt.close() + import analysis.marginals + def label_row(row): if row["marginal"] == "age_class": return analysis.marginals.AGE_CLASS_LABELS[row["value"]] @@ -157,28 +211,36 @@ def label_row(row): elif row["marginal"] == "socioprofessional_class": return analysis.marginals.SOCIOPROFESIONAL_CLASS_LABELS[row["value"]] + def bold_probability(x): if x >= 0.9: return "\\textbf{%.2f}" % x else: return "%.2f" % x -def create_table(df_table): - df_table["value"] = df_table.apply(label_row, axis = 1, raw = False) - df_table["marginal"] = df_table["marginal"].map({ - "age_class": "Age", - "sex": "Sex", - "employed": "Employed", - "studies": "Studies", - "socioprofessional_class": "Socioprof. Cat." - }) +def create_table(df_table): + df_table["value"] = df_table.apply(label_row, axis=1, raw=False) + + df_table["marginal"] = df_table["marginal"].map( + { + "age_class": "Age", + "sex": "Sex", + "employed": "Employed", + "studies": "Studies", + "socioprofessional_class": "Socioprof. 
Cat.", + } + ) for sampling_rate in SAMPLING_RATES: df_table[sampling_rate] = df_table[sampling_rate].apply(bold_probability) - df_table.columns = ["Variable", "Stratum"] + ["%.1f%%" % (100 * s,) for s in SAMPLING_RATES] + df_table.columns = ["Variable", "Stratum"] + [ + "%.1f%%" % (100 * s,) for s in SAMPLING_RATES + ] df_table = df_table.set_index(["Variable", "Stratum"]) - df_table.columns = pd.MultiIndex.from_tuples([("Sampling rate $s$", str(s)) for s in SAMPLING_RATES]) + df_table.columns = pd.MultiIndex.from_tuples( + [("Sampling rate $s$", str(s)) for s in SAMPLING_RATES] + ) return df_table diff --git a/documentation/plots/secondary_locations.py b/documentation/plots/secondary_locations.py index 296fb335..8b70da8d 100644 --- a/documentation/plots/secondary_locations.py +++ b/documentation/plots/secondary_locations.py @@ -3,6 +3,7 @@ import matplotlib.ticker as tck import documentation.plotting as plotting + def configure(context): context.stage("synthesis.population.spatial.secondary.distance_distributions") @@ -11,17 +12,20 @@ def configure(context): context.config("hts") + def execute(context): plotting.setup() hts_name = context.config("hts") # PLOT: Input distributions - distributions = context.stage("synthesis.population.spatial.secondary.distance_distributions") + distributions = context.stage( + "synthesis.population.spatial.secondary.distance_distributions" + ) plt.figure() modes = list(context.stage("analysis.reference.hts.mode_distances").keys()) - #modes = ["car", "car_passenger", "pt", "bike", "walk"] + # modes = ["car", "car_passenger", "pt", "bike", "walk"] for index, mode in enumerate(modes): mode_distribution = distributions[mode] @@ -36,21 +40,40 @@ def execute(context): weights = distribution["weights"] / np.sum(distribution["weights"]) means.append(np.sum(weights * distribution["values"])) - q10.append(distribution["values"][np.count_nonzero(distribution["cdf"] < 0.1)]) - q90.append(distribution["values"][np.count_nonzero(distribution["cdf"] < 0.9)]) + q10.append( + distribution["values"][np.count_nonzero(distribution["cdf"] < 0.1)] + ) + q90.append( + distribution["values"][np.count_nonzero(distribution["cdf"] < 0.9)] + ) if mode in ("car", "pt"): - plt.fill_between([0.0] + list(bounds), q10, q90, color = plotting.COLORSET5[index], alpha = 0.25, linewidth = 0.0) - - plt.plot([0.0] + list(bounds), means, label = "%s (%d)" % (plotting.MODE_LABELS[mode], len(bounds)), linewidth = 1.0, marker = ".", markersize = 3, color = plotting.COLORSET5[index]) + plt.fill_between( + [0.0] + list(bounds), + q10, + q90, + color=plotting.COLORSET5[index], + alpha=0.25, + linewidth=0.0, + ) + + plt.plot( + [0.0] + list(bounds), + means, + label="%s (%d)" % (plotting.MODE_LABELS[mode], len(bounds)), + linewidth=1.0, + marker=".", + markersize=3, + color=plotting.COLORSET5[index], + ) plt.gca().xaxis.set_major_locator(tck.FixedLocator(np.arange(100) * 60 * 20)) - plt.gca().xaxis.set_major_formatter(tck.FuncFormatter(lambda x,p: str(x // 60))) + plt.gca().xaxis.set_major_formatter(tck.FuncFormatter(lambda x, p: str(x // 60))) plt.gca().yaxis.set_major_locator(tck.FixedLocator(np.arange(100) * 5 * 1000)) - plt.gca().yaxis.set_major_formatter(tck.FuncFormatter(lambda x,p: str(x // 1000))) + plt.gca().yaxis.set_major_formatter(tck.FuncFormatter(lambda x, p: str(x // 1000))) - plt.legend(loc = "upper left") + plt.legend(loc="upper left") plt.xlim([0, 90 * 60 if hts_name == "egt" else 50 * 60]) plt.ylim([0, 45 * 1000 if hts_name == "egt" else 25 * 1000]) @@ -67,29 +90,49 @@ def 
execute(context): df_synthetic = context.stage("analysis.synthesis.mode_distances") reference_data = context.stage("analysis.reference.hts.mode_distances") - plt.figure(figsize = (6.0, 2.5), dpi = 100) # 2.5 * 2.5 + plt.figure(figsize=(6.0, 2.5), dpi=100) # 2.5 * 2.5 limits = dict( - car = 20 * 1e3, car_passenger = 20 * 1e3, pt = 20 * 1e3, - bike = 6 * 1e3, walk = 1 * 1e3 + car=20 * 1e3, car_passenger=20 * 1e3, pt=20 * 1e3, bike=6 * 1e3, walk=1 * 1e3 ) - modes = ["car", "bike" if "bike" in modes else "walk" ] + modes = ["car", "bike" if "bike" in modes else "walk"] for index, mode in enumerate(modes): plt.subplot(1, 2, index + 1) mode_reference = reference_data[mode] - plt.plot(mode_reference["values"] * 1e-3, mode_reference["cdf"], linestyle = '--', color = "k", linewidth = 1.0, label = "HTS") + plt.plot( + mode_reference["values"] * 1e-3, + mode_reference["cdf"], + linestyle="--", + color="k", + linewidth=1.0, + label="HTS", + ) df_mode = df_synthetic[df_synthetic["mode"] == mode] - plt.fill_betweenx(df_mode["cdf"], df_mode["min"]* 1e-3, df_mode["max"] * 1e-3, linewidth = 0.0, color = plotting.COLORS[hts_name], alpha = 0.25, label = "Range") - plt.plot(df_mode["mean"] * 1e-3, df_mode["cdf"], color = plotting.COLORS[hts_name], linewidth = 1.0, label = "Synthetic") + plt.fill_betweenx( + df_mode["cdf"], + df_mode["min"] * 1e-3, + df_mode["max"] * 1e-3, + linewidth=0.0, + color=plotting.COLORS[hts_name], + alpha=0.25, + label="Range", + ) + plt.plot( + df_mode["mean"] * 1e-3, + df_mode["cdf"], + color=plotting.COLORS[hts_name], + linewidth=1.0, + label="Synthetic", + ) plt.xlim([0, limits[mode] * 1e-3]) plt.ylim([0, 1]) - plt.title(plotting.MODE_LABELS[mode], fontsize = plotting.FONT_SIZE) + plt.title(plotting.MODE_LABELS[mode], fontsize=plotting.FONT_SIZE) plt.xlabel("Euclidean distance [km]") plt.grid() @@ -97,7 +140,7 @@ def execute(context): plt.ylabel("Cumulative density") if index % 2 == 1: - plt.legend(loc = "best") + plt.legend(loc="best") plt.tight_layout() plt.savefig("%s/distance_distributions.pdf" % context.path()) diff --git a/documentation/plots/sociodemographics/chains.py b/documentation/plots/sociodemographics/chains.py index 6632e6de..fbed851c 100644 --- a/documentation/plots/sociodemographics/chains.py +++ b/documentation/plots/sociodemographics/chains.py @@ -7,16 +7,19 @@ SAMPLING_RATE = 0.05 + def configure(context): context.stage("analysis.reference.hts.chains") context.stage( "analysis.synthesis.sociodemographics.chains", - dict(sampling_rate = SAMPLING_RATE), alias = "data" + dict(sampling_rate=SAMPLING_RATE), + alias="data", ) context.config("hts") + def execute(context): plotting.setup() @@ -26,41 +29,76 @@ def execute(context): # PLOT: Activity chains by sex marginal = ("age_range", "sex", "chain") - df = pd.merge(data[marginal], reference[marginal].rename(columns = { "weight": "reference" })) + df = pd.merge( + data[marginal], reference[marginal].rename(columns={"weight": "reference"}) + ) df = df[df["age_range"]] - df_female = df[df["sex"] == "female"].sort_values(by = "reference", ascending = False).head(10) - df_male = df[df["sex"] == "male"].sort_values(by = "reference", ascending = False).head(10) + df_female = ( + df[df["sex"] == "female"].sort_values(by="reference", ascending=False).head(10) + ) + df_male = ( + df[df["sex"] == "male"].sort_values(by="reference", ascending=False).head(10) + ) - plt.figure(figsize = plotting.WIDE_FIGSIZE) + plt.figure(figsize=plotting.WIDE_FIGSIZE) hts_name = context.config("hts") - for index, (df, title) in 
enumerate(zip([df_male, df_female], ["Male (18-40)", "Female (18-40)"])): + for index, (df, title) in enumerate( + zip([df_male, df_female], ["Male (18-40)", "Female (18-40)"]) + ): plt.subplot(1, 2, index + 1) - plt.bar(np.arange(10), df["reference"], width = 0.4, label = "HTS", align = "edge", linewidth = 0.5, edgecolor = "white", color = plotting.COLORS[hts_name]) - plt.bar(np.arange(10) + 0.4, df["mean"] / SAMPLING_RATE, width = 0.4, label = "Synthetic", align = "edge", linewidth = 0.5, edgecolor = "white", color = plotting.COLORS["synthetic"]) + plt.bar( + np.arange(10), + df["reference"], + width=0.4, + label="HTS", + align="edge", + linewidth=0.5, + edgecolor="white", + color=plotting.COLORS[hts_name], + ) + plt.bar( + np.arange(10) + 0.4, + df["mean"] / SAMPLING_RATE, + width=0.4, + label="Synthetic", + align="edge", + linewidth=0.5, + edgecolor="white", + color=plotting.COLORS["synthetic"], + ) for location, (min, max) in enumerate(zip(df["min"].values, df["max"].values)): location += 0.4 + 0.2 - plt.plot([location, location], [min / SAMPLING_RATE, max / SAMPLING_RATE], "k", linewidth = 1) + plt.plot( + [location, location], + [min / SAMPLING_RATE, max / SAMPLING_RATE], + "k", + linewidth=1, + ) plt.grid() plt.gca().set_axisbelow(True) - plt.gca().xaxis.grid(alpha = 0.0) + plt.gca().xaxis.grid(alpha=0.0) if hts_name == "egt": plt.ylim([0, 3.5e5]) else: plt.ylim([0, 5e5]) - plt.plot([np.nan], color = "k", linewidth = 1, label = "Range") + plt.plot([np.nan], color="k", linewidth=1, label="Range") plt.gca().yaxis.set_major_locator(tck.FixedLocator(np.arange(100) * 1e5)) - plt.gca().yaxis.set_major_formatter(tck.FuncFormatter(lambda x,p: "%d" % (x * 1e-3,))) + plt.gca().yaxis.set_major_formatter( + tck.FuncFormatter(lambda x, p: "%d" % (x * 1e-3,)) + ) plt.gca().xaxis.set_major_locator(tck.FixedLocator(np.arange(10) + 0.4)) - plt.gca().xaxis.set_major_formatter(tck.FuncFormatter(lambda x,p: "\n".join(df["chain"].values[p]).upper())) + plt.gca().xaxis.set_major_formatter( + tck.FuncFormatter(lambda x, p: "\n".join(df["chain"].values[p]).upper()) + ) if index == 1: plt.gca().yaxis.set_major_formatter(tck.FixedFormatter([""] * 1000)) @@ -69,7 +107,7 @@ def execute(context): handles, labels = plt.gca().get_legend_handles_labels() handles = [handles[-2], handles[-1], handles[-3]] labels = [labels[-2], labels[-1], labels[-3]] - plt.legend(handles = handles, labels = labels, loc = "best", title = title) + plt.legend(handles=handles, labels=labels, loc="best", title=title) if index == 0: plt.ylabel("Number of persons [x1000]") diff --git a/documentation/plots/sociodemographics/general.py b/documentation/plots/sociodemographics/general.py index 869f0604..005451ae 100644 --- a/documentation/plots/sociodemographics/general.py +++ b/documentation/plots/sociodemographics/general.py @@ -9,6 +9,7 @@ SAMPLING_RATE = 0.05 + def configure(context): context.config("hts") @@ -17,15 +18,18 @@ def configure(context): context.stage( "analysis.synthesis.sociodemographics.general", - dict(sampling_rate = SAMPLING_RATE), alias = "data" + dict(sampling_rate=SAMPLING_RATE), + alias="data", ) + def get_reference(level, marginal, census, hts): if (marginal,) in census[level]: return census[level][(marginal,)] else: return hts[level][(marginal,)] + def prepare_reference(hts_marginals, census_marginals, level, marginal): if (marginal,) in census_marginals[level]: df = census_marginals[level][(marginal,)] @@ -34,26 +38,34 @@ def prepare_reference(hts_marginals, census_marginals, level, marginal): df = 
hts_marginals[level][(marginal,)] df["reference_source"] = "hts" - df = df.copy().rename(columns = { marginal: "value", "weight": "reference" }) + df = df.copy().rename(columns={marginal: "value", "weight": "reference"}) df = df[["value", "reference", "reference_source"]] - df = df.sort_values(by = "value") + df = df.sort_values(by="value") return df -def prepare_marginal(data_marginals, hts_marginals, census_marginals, level, marginal, sampling_rate): - df = data_marginals[level][(marginal,)].copy().rename(columns = { marginal: "value" }) + +def prepare_marginal( + data_marginals, hts_marginals, census_marginals, level, marginal, sampling_rate +): + df = data_marginals[level][(marginal,)].copy().rename(columns={marginal: "value"}) df["attribute"] = marginal df = df[["attribute", "value", "mean", "min", "max"]] - df = df.sort_values(by = "value") + df = df.sort_values(by="value") df["mean"] /= sampling_rate df["min"] /= sampling_rate df["max"] /= sampling_rate - df = pd.merge(df, prepare_reference(hts_marginals, census_marginals, level, marginal), on = "value") + df = pd.merge( + df, + prepare_reference(hts_marginals, census_marginals, level, marginal), + on="value", + ) return df + def label(row): if row["attribute"] == "age_class": return "Age %s" % analysis.marginals.AGE_CLASS_LABELS[row["value"]] @@ -77,22 +89,43 @@ def label(row): return "SC %s" % analysis.marginals.SOCIOPROFESIONAL_CLASS_LABELS[row["value"]] elif row["attribute"] == "household_size_class": - return "Household size %s" % analysis.marginals.HOUSEHOLD_SIZE_LABELS[row["value"]] + return ( + "Household size %s" % analysis.marginals.HOUSEHOLD_SIZE_LABELS[row["value"]] + ) elif row["attribute"] == "number_of_vehicles_class": - return "No. vehicles %s" % analysis.marginals.NUMBER_OF_VEHICLES_LABELS[row["value"]] + return ( + "No. vehicles %s" + % analysis.marginals.NUMBER_OF_VEHICLES_LABELS[row["value"]] + ) elif row["attribute"] == "number_of_bikes_class": - return "No. bicycles %s" % analysis.marginals.NUMBER_OF_BIKES_LABELS[row["value"]] + return ( + "No. 
bicycles %s" % analysis.marginals.NUMBER_OF_BIKES_LABELS[row["value"]] + ) + def add_labels(df_figure): - df_figure["label"] = df_figure.apply(label, axis = 1, raw = False) + df_figure["label"] = df_figure.apply(label, axis=1, raw=False) + + +def prepare_data( + data_marginals, hts_marginals, census_marginals, level, marginals, sampling_rate +): + return pd.concat( + [ + prepare_marginal( + data_marginals, + hts_marginals, + census_marginals, + level, + marginal, + sampling_rate, + ) + for marginal in marginals + ] + ) -def prepare_data(data_marginals, hts_marginals, census_marginals, level, marginals, sampling_rate): - return pd.concat([ - prepare_marginal(data_marginals, hts_marginals, census_marginals, level, marginal, sampling_rate) - for marginal in marginals - ]) def reweight_hts(df_figure, hts_marginals, census_marginals, level): hts_total = hts_marginals[level][tuple()]["weight"].values[0] @@ -101,6 +134,7 @@ def reweight_hts(df_figure, hts_marginals, census_marginals, level): f = df_figure["reference_source"] == "hts" df_figure.loc[f, "reference"] *= census_total / hts_total + def execute(context): plotting.setup() @@ -110,19 +144,37 @@ def execute(context): figures = [ dict( - level = "person", label = "Number of persons", size = (6.0, 5.0), - marginals = ["age_class", "sex", "employed", "studies", "has_license", "has_pt_subscription", "socioprofessional_class"] + level="person", + label="Number of persons", + size=(6.0, 5.0), + marginals=[ + "age_class", + "sex", + "employed", + "studies", + "has_license", + "has_pt_subscription", + "socioprofessional_class", + ], ), dict( - level = "household", label = "Number of households", size = plotting.WIDE_FIGSIZE, - marginals = ["household_size_class", "number_of_vehicles_class", "number_of_bikes_class"] - ) + level="household", + label="Number of households", + size=plotting.WIDE_FIGSIZE, + marginals=[ + "household_size_class", + "number_of_vehicles_class", + "number_of_bikes_class", + ], + ), ] for figure in figures: - plt.figure(figsize = figure["size"]) + plt.figure(figsize=figure["size"]) - df_figure = prepare_data(data, hts, census, figure["level"], figure["marginals"], SAMPLING_RATE) + df_figure = prepare_data( + data, hts, census, figure["level"], figure["marginals"], SAMPLING_RATE + ) reweight_hts(df_figure, hts, census, figure["level"]) add_labels(df_figure) @@ -130,32 +182,80 @@ def execute(context): locations = np.arange(len(df_figure)) f = (df_figure["reference_source"] == "census").values - plt.barh(locations[f], df_figure["reference"].values[f], height = 0.4, label = "Census", align = "edge", linewidth = 0.5, edgecolor = "white", color = plotting.COLORS["census"]) - plt.barh(locations[f] + 0.4, df_figure["mean"].values[f], height = 0.4, label = "Synthetic", align = "edge", linewidth = 0.5, edgecolor = "white", color = plotting.COLORS["synthetic"]) + plt.barh( + locations[f], + df_figure["reference"].values[f], + height=0.4, + label="Census", + align="edge", + linewidth=0.5, + edgecolor="white", + color=plotting.COLORS["census"], + ) + plt.barh( + locations[f] + 0.4, + df_figure["mean"].values[f], + height=0.4, + label="Synthetic", + align="edge", + linewidth=0.5, + edgecolor="white", + color=plotting.COLORS["synthetic"], + ) f = (df_figure["reference_source"] == "hts").values hts_name = context.config("hts") - plt.barh(locations[f], df_figure["reference"].values[f], height = 0.4, label = "HTS", align = "edge", linewidth = 0.5, edgecolor = "white", color = plotting.COLORS[hts_name]) - plt.barh(locations[f] + 0.4, 
df_figure["mean"].values[f], height = 0.4, label = None, align = "edge", linewidth = 0.5, edgecolor = "white", color = plotting.COLORS["synthetic"]) + plt.barh( + locations[f], + df_figure["reference"].values[f], + height=0.4, + label="HTS", + align="edge", + linewidth=0.5, + edgecolor="white", + color=plotting.COLORS[hts_name], + ) + plt.barh( + locations[f] + 0.4, + df_figure["mean"].values[f], + height=0.4, + label=None, + align="edge", + linewidth=0.5, + edgecolor="white", + color=plotting.COLORS["synthetic"], + ) - for index, (min, max) in enumerate(zip(df_figure["min"].values, df_figure["max"].values)): + for index, (min, max) in enumerate( + zip(df_figure["min"].values, df_figure["max"].values) + ): location = index + 0.4 + 0.2 - plt.plot([min, max], [location, location], "k", linewidth = 1, label = "Range") + plt.plot([min, max], [location, location], "k", linewidth=1, label="Range") plt.gca().yaxis.set_major_locator(tck.FixedLocator(locations + 0.4)) - plt.gca().yaxis.set_major_formatter(tck.FixedFormatter(df_figure["label"].values)) + plt.gca().yaxis.set_major_formatter( + tck.FixedFormatter(df_figure["label"].values) + ) if figure["level"] == "person": - plt.gca().xaxis.set_major_locator(tck.FixedLocator(np.arange(1, 100) * 1e6 * 2)) - plt.gca().xaxis.set_major_formatter(tck.FuncFormatter(lambda x,p: "%dM" % (x / 1e6,))) + plt.gca().xaxis.set_major_locator( + tck.FixedLocator(np.arange(1, 100) * 1e6 * 2) + ) + plt.gca().xaxis.set_major_formatter( + tck.FuncFormatter(lambda x, p: "%dM" % (x / 1e6,)) + ) if figure["level"] == "household": - plt.gca().xaxis.set_major_locator(tck.FixedLocator(np.arange(1, 100) * 1e6 * 0.5)) - plt.gca().xaxis.set_major_formatter(tck.FuncFormatter(lambda x,p: "%.1fM" % (x / 1e6,))) + plt.gca().xaxis.set_major_locator( + tck.FixedLocator(np.arange(1, 100) * 1e6 * 0.5) + ) + plt.gca().xaxis.set_major_formatter( + tck.FuncFormatter(lambda x, p: "%.1fM" % (x / 1e6,)) + ) plt.grid() plt.gca().set_axisbelow(True) - plt.gca().yaxis.grid(alpha = 0.0) + plt.gca().yaxis.grid(alpha=0.0) plt.gca().invert_yaxis() plt.xlabel(figure["label"]) @@ -163,7 +263,7 @@ def execute(context): handles, labels = plt.gca().get_legend_handles_labels() handles = [handles[-2], handles[-1], handles[-3], handles[-4]] labels = [labels[-2], labels[-1], labels[-3], labels[-4]] - plt.legend(handles = handles, labels = labels, loc = "best") + plt.legend(handles=handles, labels=labels, loc="best") plt.tight_layout() plt.savefig("%s/%s.pdf" % (context.path(), figure["level"])) diff --git a/documentation/plots/sociodemographics/local.py b/documentation/plots/sociodemographics/local.py index 6d831c12..1e0e9584 100644 --- a/documentation/plots/sociodemographics/local.py +++ b/documentation/plots/sociodemographics/local.py @@ -9,15 +9,18 @@ SAMPLING_RATE = 0.05 + def configure(context): context.stage("analysis.reference.census.sociodemographics") context.stage( "analysis.synthesis.sociodemographics.spatial", - dict(sampling_rate = SAMPLING_RATE), alias = "data" + dict(sampling_rate=SAMPLING_RATE), + alias="data", ) -def filter_commune(marginals, commune_id, levels = ["person", "household"]): + +def filter_commune(marginals, commune_id, levels=["person", "household"]): result = {} for level in levels: @@ -26,7 +29,7 @@ def filter_commune(marginals, commune_id, levels = ["person", "household"]): for attributes, df_marginal in marginals[level].items(): if "commune_id" in attributes: f = df_marginal["commune_id"] == str(commune_id) - df_marginal = df_marginal[f].drop(columns = ["commune_id"]) + 
df_marginal = df_marginal[f].drop(columns=["commune_id"]) attributes = list(attributes) attributes.remove("commune_id") @@ -37,6 +40,7 @@ def filter_commune(marginals, commune_id, levels = ["person", "household"]): return result + def execute(context): plotting.setup() @@ -44,20 +48,36 @@ def execute(context): data = context.stage("data") cases = [ - dict(commune = 75113, title = "13th Arrondissement"), - dict(commune = 94028, title = "Alfortville"), + dict(commune=75113, title="13th Arrondissement"), + dict(commune=94028, title="Alfortville"), ] - plt.figure(figsize = plotting.WIDE_FIGSIZE) + plt.figure(figsize=plotting.WIDE_FIGSIZE) for case_index, case in enumerate(cases): case_census = filter_commune(census, case["commune"]) case_data = filter_commune(data, case["commune"]) - df_case = pd.concat([ - prepare_data(case_data, case_census, case_census, "household", ["household_size_class"], SAMPLING_RATE), - prepare_data(case_data, case_census, case_census, "person", ["age_class"], SAMPLING_RATE), - ]) + df_case = pd.concat( + [ + prepare_data( + case_data, + case_census, + case_census, + "household", + ["household_size_class"], + SAMPLING_RATE, + ), + prepare_data( + case_data, + case_census, + case_census, + "person", + ["age_class"], + SAMPLING_RATE, + ), + ] + ) add_labels(df_case) @@ -67,36 +87,60 @@ def execute(context): reference_values = df_case["reference"].values mean_values = df_case["mean"].values - plt.barh(locations, df_case["reference"].values, height = 0.4, label = "Census", align = "edge", linewidth = 0.5, edgecolor = "white", color = plotting.COLORS["census"]) - plt.barh(locations + 0.4, df_case["mean"].values, height = 0.4, label = "Synthetic", align = "edge", linewidth = 0.5, edgecolor = "white", color = plotting.COLORS["synthetic"]) - - for index, (min, max) in enumerate(zip(df_case["min"].values, df_case["max"].values)): + plt.barh( + locations, + df_case["reference"].values, + height=0.4, + label="Census", + align="edge", + linewidth=0.5, + edgecolor="white", + color=plotting.COLORS["census"], + ) + plt.barh( + locations + 0.4, + df_case["mean"].values, + height=0.4, + label="Synthetic", + align="edge", + linewidth=0.5, + edgecolor="white", + color=plotting.COLORS["synthetic"], + ) + + for index, (min, max) in enumerate( + zip(df_case["min"].values, df_case["max"].values) + ): location = index + 0.4 + 0.2 - plt.plot([min, max], [location, location], "k", linewidth = 1, label = "Range") + plt.plot([min, max], [location, location], "k", linewidth=1, label="Range") plt.gca().yaxis.set_major_locator(tck.FixedLocator(locations + 0.4)) if case_index == 0: - plt.gca().yaxis.set_major_formatter(tck.FixedFormatter(df_case["label"].values)) + plt.gca().yaxis.set_major_formatter( + tck.FixedFormatter(df_case["label"].values) + ) else: plt.gca().yaxis.set_major_formatter(tck.FixedFormatter([""] * 100)) - plt.gca().xaxis.set_major_formatter(tck.FuncFormatter(lambda x,p: "%dk" % (x // 1000,))) + plt.gca().xaxis.set_major_formatter( + tck.FuncFormatter(lambda x, p: "%dk" % (x // 1000,)) + ) plt.grid() plt.gca().set_axisbelow(True) - plt.gca().yaxis.grid(alpha = 0.0) + plt.gca().yaxis.grid(alpha=0.0) plt.gca().invert_yaxis() plt.xlabel("Number of persons / households") plt.title(case["title"]) - #plt.ylim([len(locations) + 2.5, -0.5]) + # plt.ylim([len(locations) + 2.5, -0.5]) if case_index == 1: handles, labels = plt.gca().get_legend_handles_labels() handles = [handles[-2], handles[-1], handles[-3]] labels = [labels[-2], labels[-1], labels[-3]] - plt.legend(handles = handles, 
labels = labels, loc = (0.05, 0.32), framealpha = 1.0) + plt.legend(handles=handles, labels=labels, loc=(0.05, 0.32), framealpha=1.0) plt.tight_layout() plt.savefig("%s/comparison.pdf" % (context.path(),)) diff --git a/documentation/plots/sociodemographics/utils.py b/documentation/plots/sociodemographics/utils.py index 14bfcb45..849aacb6 100644 --- a/documentation/plots/sociodemographics/utils.py +++ b/documentation/plots/sociodemographics/utils.py @@ -1,15 +1,20 @@ def create_labels(df, marginals): - labels = df.apply(lambda x: "%s %s" % ( - marginals[x["marginal"]]["category_label"], - marginals[x["marginal"]]["label"] - ), axis = 1) + labels = df.apply( + lambda x: "%s %s" + % ( + marginals[x["marginal"]]["category_label"], + marginals[x["marginal"]]["label"], + ), + axis=1, + ) labels = labels.str.replace("Number of", "No.") labels = labels.str.replace("Socioprof. Cat.", "SC") return labels.values -def filter_marginals(df, marginal_level, marginals, blacklist = set()): + +def filter_marginals(df, marginal_level, marginals, blacklist=set()): df = df[df["marginal_level"] == marginal_level] df = df[df["marginal"].isin(marginals.keys())] df = df[~df["marginal"].isin(blacklist)] diff --git a/documentation/plots/theory/sampling_error.py b/documentation/plots/theory/sampling_error.py index 9dca3897..dc6ae222 100644 --- a/documentation/plots/theory/sampling_error.py +++ b/documentation/plots/theory/sampling_error.py @@ -4,6 +4,7 @@ import scipy.stats as stats import documentation.plotting as plotting + def get_count_distribution(ns, w, s): l, u = np.floor(w), np.ceil(w) p = w - l @@ -13,6 +14,7 @@ def get_count_distribution(ns, w, s): return p * Fu + (1 - p) * Fl + def get_error_probability(ws, s, q): probabilities = [] @@ -25,23 +27,25 @@ def get_error_probability(ws, s, q): return probabilities + def configure(context): pass + def execute(context): plotting.setup() q = 0.01 - plt.figure(figsize = plotting.WIDE_FIGSIZE) + plt.figure(figsize=plotting.WIDE_FIGSIZE) for s, color in zip([0.01, 0.1, 0.25], ["#000000", "#777777", "#cccccc"]): ws = np.linspace(0, 2000, 10000) probs = get_error_probability(ws, s, q) - plt.plot(ws, probs, ".", label = "s = %.2f" % s, color = color, markersize = 2) + plt.plot(ws, probs, ".", label="s = %.2f" % s, color=color, markersize=2) - plt.legend(loc = "best") + plt.legend(loc="best") plt.grid() plt.xlabel("Reference weight") plt.ylabel("Probability") diff --git a/documentation/plotting.py b/documentation/plotting.py index 7900d8de..a23b8102 100644 --- a/documentation/plotting.py +++ b/documentation/plotting.py @@ -10,27 +10,28 @@ DPI = 300 FONT_SIZE = 8 -COLORSET = palettable.colorbrewer.qualitative.Set2_4.mpl_colors -COLORSET5 = palettable.colorbrewer.qualitative.Set2_5.mpl_colors +COLORSET = palettable.colorbrewer.qualitative.Set2_4.mpl_colors +COLORSET5 = palettable.colorbrewer.qualitative.Set2_5.mpl_colors COLORS = { "census": COLORSET[2], "entd": COLORSET[0], "egt": COLORSET[1], - "synthetic": "#cccccc", #COLORSET[3] + "synthetic": "#cccccc", # COLORSET[3] } MODE_LABELS = dict( - car = "Car driver", - car_passenger = "Car passenger", - pt = "Public transport", - bike = "Bicycle", - walk = "Walking" + car="Car driver", + car_passenger="Car passenger", + pt="Public transport", + bike="Bicycle", + walk="Walking", ) + def setup(): - plt.rc("font", family = "serif", size = FONT_SIZE) - plt.rc("figure", dpi = DPI, figsize = SHORT_FIGSIZE) - plt.rc("legend", fontsize = FONT_SIZE, loc = "best", fancybox = False) - plt.rc("grid", linewidth = 0.5) - plt.rc("patch", 
linewidth = 0.5) - plt.rc("mathtext", fontset = "cm") + plt.rc("font", family="serif", size=FONT_SIZE) + plt.rc("figure", dpi=DPI, figsize=SHORT_FIGSIZE) + plt.rc("legend", fontsize=FONT_SIZE, loc="best", fancybox=False) + plt.rc("grid", linewidth=0.5) + plt.rc("patch", linewidth=0.5) + plt.rc("mathtext", fontset="cm") diff --git a/documentation/shapes.py b/documentation/shapes.py index dd92ff63..a57d8ec5 100644 --- a/documentation/shapes.py +++ b/documentation/shapes.py @@ -4,28 +4,34 @@ import matplotlib.ticker as tck import palettable + def configure(context): context.stage("data.income.municipality") context.stage("data.spatial.municipalities") context.stage("data.bpe.cleaned") + def execute(context): df_communes = context.stage("data.spatial.municipalities") # Spatial income distribution df_income = context.stage("data.income.municipality") - df_income = df_income[(df_income["attribute"] == "all") & (df_income["value"] == "all")] - df_income = pd.merge(df_communes, df_income, how = "inner", on = "commune_id") + df_income = df_income[ + (df_income["attribute"] == "all") & (df_income["value"] == "all") + ] + df_income = pd.merge(df_communes, df_income, how="inner", on="commune_id") df_income["is_imputed"] = df_income["is_imputed"].astype(int) df_income["commune_id"] = df_income["commune_id"].astype(str) df_income["departement_id"] = df_income["departement_id"].astype(str) - df_income.to_file("%s/income.geojson" % context.cache_path, driver = "GeoJSON") + df_income.to_file("%s/income.geojson" % context.cache_path, driver="GeoJSON") # Enterprises - df_bpe = context.stage("data.bpe.cleaned")[["enterprise_id", "geometry", "imputed", "commune_id"]].copy() + df_bpe = context.stage("data.bpe.cleaned")[ + ["enterprise_id", "geometry", "imputed", "commune_id"] + ].copy() df_bpe["imputed"] = df_bpe["imputed"].astype(int) df_bpe["commune_id"] = df_bpe["commune_id"].astype(str) - df_bpe = df_bpe.iloc[np.random.choice(len(df_bpe), size = 10000, replace = False)] + df_bpe = df_bpe.iloc[np.random.choice(len(df_bpe), size=10000, replace=False)] df_bpe.to_file("%s/bpe.shp" % context.cache_path) return context.cache_path diff --git a/matsim/output.py b/matsim/output.py index 2f616403..520a9a19 100644 --- a/matsim/output.py +++ b/matsim/output.py @@ -1,10 +1,11 @@ import shutil + def configure(context): if context.config("run_matsim", True): # allow disabling performing one run of the simulation context.stage("matsim.simulation.run") - + context.stage("matsim.simulation.prepare") context.stage("matsim.runtime.eqasim") @@ -14,14 +15,14 @@ def configure(context): need_osm = context.config("export_detailed_network", False) if need_osm: context.stage("matsim.scenario.supply.osm") - context.stage("documentation.meta_output") + def execute(context): config_path = "%s/%s" % ( context.path("matsim.simulation.prepare"), - context.stage("matsim.simulation.prepare") + context.stage("matsim.simulation.prepare"), ) file_names = [ @@ -32,23 +33,33 @@ def execute(context): "%snetwork.xml.gz" % context.config("output_prefix"), "%stransit_schedule.xml.gz" % context.config("output_prefix"), "%stransit_vehicles.xml.gz" % context.config("output_prefix"), - "%sconfig.xml" % context.config("output_prefix") + "%sconfig.xml" % context.config("output_prefix"), ] for name in file_names: shutil.copy( "%s/%s" % (context.path("matsim.simulation.prepare"), name), - "%s/%s" % (context.config("output_path"), name) + "%s/%s" % (context.config("output_path"), name), ) if context.config("export_detailed_network"): shutil.copy( - "%s/%s" % 
(context.path("matsim.scenario.supply.osm"), "detailed_network.csv"), - "%s/%s" % (context.config("output_path"), "%sdetailed_network.csv" % context.config("output_prefix")) + "%s/%s" + % (context.path("matsim.scenario.supply.osm"), "detailed_network.csv"), + "%s/%s" + % ( + context.config("output_path"), + "%sdetailed_network.csv" % context.config("output_prefix"), + ), ) - + if context.config("write_jar"): shutil.copy( - "%s/%s" % (context.path("matsim.runtime.eqasim"), context.stage("matsim.runtime.eqasim")), - "%s/%srun.jar" % (context.config("output_path"), context.config("output_prefix")) + "%s/%s" + % ( + context.path("matsim.runtime.eqasim"), + context.stage("matsim.runtime.eqasim"), + ), + "%s/%srun.jar" + % (context.config("output_path"), context.config("output_prefix")), ) diff --git a/matsim/runtime/eqasim.py b/matsim/runtime/eqasim.py index 72e4846e..6a315f67 100644 --- a/matsim/runtime/eqasim.py +++ b/matsim/runtime/eqasim.py @@ -9,6 +9,7 @@ DEFAULT_EQASIM_BRANCH = "develop" DEFAULT_EQASIM_COMMIT = "ece4932" + def configure(context): context.stage("matsim.runtime.git") context.stage("matsim.runtime.java") @@ -20,6 +21,7 @@ def configure(context): context.config("eqasim_repository", "https://github.com/eqasim-org/eqasim-java.git") context.config("eqasim_path", "") + def run(context, command, arguments): version = context.config("eqasim_version") @@ -27,10 +29,12 @@ def run(context, command, arguments): context.stage("matsim.runtime.eqasim") jar_path = "%s/eqasim-java/ile_de_france/target/ile_de_france-%s.jar" % ( - context.path("matsim.runtime.eqasim"), version + context.path("matsim.runtime.eqasim"), + version, ) java.run(context, command, arguments, jar_path) + def execute(context): version = context.config("eqasim_version") @@ -39,33 +43,61 @@ def execute(context): # Clone repository and checkout version branch = context.config("eqasim_branch") - git.run(context, [ - "clone", "--single-branch", "-b", branch, - context.config("eqasim_repository"), "eqasim-java" - ]) + git.run( + context, + [ + "clone", + "--single-branch", + "-b", + branch, + context.config("eqasim_repository"), + "eqasim-java", + ], + ) # Select the configured commit or tag commit = context.config("eqasim_commit") - git.run(context, [ - "checkout", commit - ], cwd = "{}/eqasim-java".format(context.path())) + git.run( + context, ["checkout", commit], cwd="{}/eqasim-java".format(context.path()) + ) # Build eqasim - maven.run(context, ["-Pstandalone", "--projects", "ile_de_france", "--also-make", "package", "-DskipTests=true"], cwd = "%s/eqasim-java" % context.path()) - - if not os.path.exists("{}/eqasim-java/ile_de_france/target/ile_de_france-{}.jar".format(context.path(), version)): - raise RuntimeError("The JAR was not created correctly. Wrong eqasim_version specified?") + maven.run( + context, + [ + "-Pstandalone", + "--projects", + "ile_de_france", + "--also-make", + "package", + "-DskipTests=true", + ], + cwd="%s/eqasim-java" % context.path(), + ) + + if not os.path.exists( + "{}/eqasim-java/ile_de_france/target/ile_de_france-{}.jar".format( + context.path(), version + ) + ): + raise RuntimeError( + "The JAR was not created correctly. Wrong eqasim_version specified?" + ) # Special case: We provide the jar directly. This is mainly used for # creating input to unit tests of the eqasim-java package. 
else: os.makedirs("%s/eqasim-java/ile_de_france/target" % context.path()) - shutil.copy(context.config("eqasim_path"), - "%s/eqasim-java/ile_de_france/target/ile_de_france-%s.jar" % (context.path(), version)) + shutil.copy( + context.config("eqasim_path"), + "%s/eqasim-java/ile_de_france/target/ile_de_france-%s.jar" + % (context.path(), version), + ) return "eqasim-java/ile_de_france/target/ile_de_france-%s.jar" % version + def validate(context): path = context.config("eqasim_path") @@ -74,12 +106,14 @@ def validate(context): if not os.path.exists(path): raise RuntimeError("Cannot find eqasim at: %s" % path) - + if context.config("eqasim_tag") is None: if context.config("eqasim_commit") is None: raise RuntimeError("Either eqasim commit or tag must be defined") - - if (context.config("eqasim_tag") is None) == (context.config("eqasim_commit") is None): + + if (context.config("eqasim_tag") is None) == ( + context.config("eqasim_commit") is None + ): raise RuntimeError("Eqasim commit and tag must not be defined at the same time") return os.path.getmtime(path) diff --git a/matsim/runtime/git.py b/matsim/runtime/git.py index 08585b00..27e5551f 100644 --- a/matsim/runtime/git.py +++ b/matsim/runtime/git.py @@ -1,12 +1,14 @@ import subprocess as sp import shutil + def configure(context): context.config("git_binary", "git") -def run(context, arguments = [], cwd = None, catch_output = False): + +def run(context, arguments=[], cwd=None, catch_output=False): """ - This function calls git. + This function calls git. """ # Make sure there is a dependency context.stage("matsim.runtime.git") @@ -14,28 +16,29 @@ def run(context, arguments = [], cwd = None, catch_output = False): if cwd is None: cwd = context.path() - command_line = [ - shutil.which(context.config("git_binary")) - ] + arguments + command_line = [shutil.which(context.config("git_binary"))] + arguments if catch_output: - return sp.check_output(command_line, cwd = cwd).decode("utf-8").strip() + return sp.check_output(command_line, cwd=cwd).decode("utf-8").strip() else: - return_code = sp.check_call(command_line, cwd = cwd) + return_code = sp.check_call(command_line, cwd=cwd) if not return_code == 0: raise RuntimeError("Git return code: %d" % return_code) + def validate(context): if shutil.which(context.config("git_binary")) in ["", None]: - raise RuntimeError("Cannot find git binary at: %s" % context.config("git_binary")) + raise RuntimeError( + "Cannot find git binary at: %s" % context.config("git_binary") + ) - if not b"2." in sp.check_output([ - shutil.which(context.config("git_binary")), - "--version" - ], stderr = sp.STDOUT): + if not b"2." in sp.check_output( + [shutil.which(context.config("git_binary")), "--version"], stderr=sp.STDOUT + ): print("WARNING! Git of at least version 2.x.x is recommended!") + def execute(context): pass diff --git a/matsim/runtime/java.py b/matsim/runtime/java.py index 1cfe5702..94c0f1a5 100644 --- a/matsim/runtime/java.py +++ b/matsim/runtime/java.py @@ -1,16 +1,27 @@ import subprocess as sp import os, shutil + def configure(context): context.config("java_binary", "java") context.config("java_memory", "50G") -def run(context, entry_point, arguments = [], class_path = None, vm_arguments = [], cwd = None, memory = None, mode = "raise"): + +def run( + context, + entry_point, + arguments=[], + class_path=None, + vm_arguments=[], + cwd=None, + memory=None, + mode="raise", +): """ - This function calls java code. 
There are three modes: - - return_code: Returns the return code of the Java call - - output: Returns the output of the Java call - - raise (default): Raises an exception if the return code is not zero + This function calls java code. There are three modes: + - return_code: Returns the return code of the Java call + - output: Returns the output of the Java call + - raise (default): Raises an exception if the return code is not zero """ # Make sure there is a dependency context.stage("matsim.runtime.java") @@ -25,7 +36,7 @@ def run(context, entry_point, arguments = [], class_path = None, vm_arguments = vm_arguments = [ "-Xmx" + memory, "-Djava.io.tmpdir=%s" % temp_path, - "-Dmatsim.useLocalDtds=true" + "-Dmatsim.useLocalDtds=true", ] + vm_arguments # Prepare classpath @@ -37,38 +48,43 @@ def run(context, entry_point, arguments = [], class_path = None, vm_arguments = cwd = context.path() # Prepare command line - command_line = [ - shutil.which(context.config("java_binary")), - "-cp", class_path - ] + vm_arguments + [ - entry_point - ] + arguments + command_line = ( + [shutil.which(context.config("java_binary")), "-cp", class_path] + + vm_arguments + + [entry_point] + + arguments + ) command_line = list(map(str, command_line)) print("Executing java:", " ".join(command_line)) if mode == "raise" or mode == "return_code": - return_code = sp.check_call(command_line, cwd = cwd) + return_code = sp.check_call(command_line, cwd=cwd) if not return_code == 0: raise RuntimeError("Java return code: %d" % return_code) return return_code elif mode == "output": - return sp.check_output(command_line, cwd = cwd) + return sp.check_output(command_line, cwd=cwd) else: - raise RuntimeError("Mode is expected to be one of 'raise', 'return_code' or 'output'") + raise RuntimeError( + "Mode is expected to be one of 'raise', 'return_code' or 'output'" + ) + def validate(context): if shutil.which(context.config("java_binary")) in ["", None]: - raise RuntimeError("Cannot find Java binary at: %s" % context.config("java_binary")) + raise RuntimeError( + "Cannot find Java binary at: %s" % context.config("java_binary") + ) - if not b"11" in sp.check_output([ - shutil.which(context.config("java_binary")), - "-version" - ], stderr = sp.STDOUT): + if not b"11" in sp.check_output( + [shutil.which(context.config("java_binary")), "-version"], stderr=sp.STDOUT + ): print("WARNING! A Java JDK of at least version 11 is recommended.") + def execute(context): pass diff --git a/matsim/runtime/maven.py b/matsim/runtime/maven.py index a4832617..587dec63 100644 --- a/matsim/runtime/maven.py +++ b/matsim/runtime/maven.py @@ -1,13 +1,15 @@ import subprocess as sp import os, shutil + def configure(context): context.config("maven_binary", "mvn") context.config("maven_skip_tests", False) -def run(context, arguments = [], cwd = None): + +def run(context, arguments=[], cwd=None): """ - This function calls Maven. + This function calls Maven. 
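+
+    A typical invocation (mirroring the pt2matsim build step further below) is
+    maven.run(context, ["package", "-DskipTests=true"], cwd=...). When the
+    maven_skip_tests option is enabled, -DskipTests=true is additionally appended
+    to every call.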
""" # Make sure there is a dependency context.stage("matsim.runtime.maven") @@ -20,31 +22,32 @@ def run(context, arguments = [], cwd = None): if not os.path.exists(temp_path): os.mkdir(temp_path) - vm_arguments = [ - "-Djava.io.tmpdir=%s" % temp_path - ] + vm_arguments = ["-Djava.io.tmpdir=%s" % temp_path] if context.config("maven_skip_tests"): vm_arguments.append("-DskipTests=true") - command_line = [ - shutil.which(context.config("maven_binary")) - ] + vm_arguments + arguments + command_line = ( + [shutil.which(context.config("maven_binary"))] + vm_arguments + arguments + ) - return_code = sp.check_call(command_line, cwd = cwd) + return_code = sp.check_call(command_line, cwd=cwd) if not return_code == 0: raise RuntimeError("Maven return code: %d" % return_code) + def validate(context): if shutil.which(context.config("maven_binary")) in ["", None]: - raise RuntimeError("Cannot find Maven binary at: %s" % context.config("maven_binary")) + raise RuntimeError( + "Cannot find Maven binary at: %s" % context.config("maven_binary") + ) - if not b"3." in sp.check_output([ - shutil.which(context.config("maven_binary")), - "-version" - ], stderr = sp.STDOUT): + if not b"3." in sp.check_output( + [shutil.which(context.config("maven_binary")), "-version"], stderr=sp.STDOUT + ): print("WARNING! Maven of at least version 3.x.x is recommended!") + def execute(context): pass diff --git a/matsim/runtime/pt2matsim.py b/matsim/runtime/pt2matsim.py index ef837fd6..62573a33 100644 --- a/matsim/runtime/pt2matsim.py +++ b/matsim/runtime/pt2matsim.py @@ -5,6 +5,7 @@ import matsim.runtime.java as java import matsim.runtime.maven as maven + def configure(context): context.stage("matsim.runtime.git") context.stage("matsim.runtime.java") @@ -13,6 +14,7 @@ def configure(context): context.config("pt2matsim_version", "22.3") context.config("pt2matsim_branch", "v22.3") + def run(context, command, arguments, vm_arguments=[]): version = context.config("pt2matsim_version") @@ -20,29 +22,43 @@ def run(context, command, arguments, vm_arguments=[]): context.stage("matsim.runtime.pt2matsim") jar_path = "%s/pt2matsim/target/pt2matsim-%s-shaded.jar" % ( - context.path("matsim.runtime.pt2matsim"), version + context.path("matsim.runtime.pt2matsim"), + version, ) java.run(context, command, arguments, jar_path, vm_arguments) + def execute(context): version = context.config("pt2matsim_version") branch = context.config("pt2matsim_branch") # Clone repository and checkout version - git.run(context, [ - "clone", "https://github.com/matsim-org/pt2matsim.git", - "--branch", branch, - "--single-branch", "pt2matsim", - "--depth", "1" - ]) + git.run( + context, + [ + "clone", + "https://github.com/matsim-org/pt2matsim.git", + "--branch", + branch, + "--single-branch", + "pt2matsim", + "--depth", + "1", + ], + ) # Build pt2matsim - maven.run(context, ["package", "-DskipTests=true"], cwd = "%s/pt2matsim" % context.path()) + maven.run( + context, ["package", "-DskipTests=true"], cwd="%s/pt2matsim" % context.path() + ) jar_path = "%s/pt2matsim/target/pt2matsim-%s-shaded.jar" % (context.path(), version) # Test pt2matsim - java.run(context, "org.matsim.pt2matsim.run.CreateDefaultOsmConfig", [ - "test_config.xml" - ], jar_path) + java.run( + context, + "org.matsim.pt2matsim.run.CreateDefaultOsmConfig", + ["test_config.xml"], + jar_path, + ) assert os.path.exists("%s/test_config.xml" % context.path()) diff --git a/matsim/scenario/facilities.py b/matsim/scenario/facilities.py index 4bc223a9..f721fa8c 100644 --- a/matsim/scenario/facilities.py +++ 
b/matsim/scenario/facilities.py @@ -5,28 +5,31 @@ import matsim.writers as writers + def configure(context): context.stage("synthesis.locations.secondary") context.stage("synthesis.population.spatial.home.locations") context.stage("synthesis.population.spatial.primary.locations") -HOME_FIELDS = [ - "household_id", "geometry" -] -PRIMARY_FIELDS = [ - "location_id", "geometry", "is_work" -] +HOME_FIELDS = ["household_id", "geometry"] + +PRIMARY_FIELDS = ["location_id", "geometry", "is_work"] SECONDARY_FIELDS = [ - "location_id", "geometry", "offers_leisure", "offers_shop", "offers_other" + "location_id", + "geometry", + "offers_leisure", + "offers_shop", + "offers_other", ] + def execute(context): output_path = "%s/facilities.xml.gz" % context.path() - with gzip.open(output_path, 'wb+') as writer: - with io.BufferedWriter(writer, buffer_size = 2 * 1024**3) as writer: + with gzip.open(output_path, "wb+") as writer: + with io.BufferedWriter(writer, buffer_size=2 * 1024**3) as writer: writer = writers.FacilitiesWriter(writer) writer.start_facilities() @@ -35,13 +38,16 @@ def execute(context): df_homes = context.stage("synthesis.population.spatial.home.locations") df_homes = df_homes[HOME_FIELDS] - with context.progress(total = len(df_homes), label = "Writing home facilities ...") as progress: - for item in df_homes.itertuples(index = False): + with context.progress( + total=len(df_homes), label="Writing home facilities ..." + ) as progress: + for item in df_homes.itertuples(index=False): geometry = item[HOME_FIELDS.index("geometry")] writer.start_facility( "home_%s" % item[HOME_FIELDS.index("household_id")], - geometry.x, geometry.y + geometry.x, + geometry.y, ) writer.add_activity("home") @@ -49,7 +55,9 @@ def execute(context): # Write primary - df_work, df_education = context.stage("synthesis.population.spatial.primary.locations") + df_work, df_education = context.stage( + "synthesis.population.spatial.primary.locations" + ) df_work = df_work.drop_duplicates("location_id").copy() df_education = df_education.drop_duplicates("location_id").copy() @@ -60,16 +68,21 @@ def execute(context): df_locations = pd.concat([df_work, df_education]) df_locations = df_locations[PRIMARY_FIELDS] - with context.progress(total = len(df_locations), label = "Writing primary facilities ...") as progress: - for item in df_locations.itertuples(index = False): + with context.progress( + total=len(df_locations), label="Writing primary facilities ..." + ) as progress: + for item in df_locations.itertuples(index=False): geometry = item[PRIMARY_FIELDS.index("geometry")] writer.start_facility( str(item[PRIMARY_FIELDS.index("location_id")]), - geometry.x, geometry.y + geometry.x, + geometry.y, ) - writer.add_activity("work" if item[PRIMARY_FIELDS.index("is_work")] else "education") + writer.add_activity( + "work" if item[PRIMARY_FIELDS.index("is_work")] else "education" + ) writer.end_facility() # Write secondary @@ -77,13 +90,16 @@ def execute(context): df_locations = context.stage("synthesis.locations.secondary") df_locations = df_locations[SECONDARY_FIELDS] - with context.progress(total = len(df_locations), label = "Writing secondary facilities ...") as progress: - for item in df_locations.itertuples(index = False): + with context.progress( + total=len(df_locations), label="Writing secondary facilities ..." 
+ ) as progress: + for item in df_locations.itertuples(index=False): geometry = item[SECONDARY_FIELDS.index("geometry")] writer.start_facility( item[SECONDARY_FIELDS.index("location_id")], - geometry.x, geometry.y + geometry.x, + geometry.y, ) for purpose in ("shop", "leisure", "other"): diff --git a/matsim/scenario/households.py b/matsim/scenario/households.py index 2f47cfee..0d33bd48 100644 --- a/matsim/scenario/households.py +++ b/matsim/scenario/households.py @@ -5,42 +5,69 @@ import matsim.writers as writers + def configure(context): context.stage("synthesis.population.enriched") -FIELDS = ["household_id", "person_id", "household_income", "car_availability", "bike_availability", "census_household_id"] + +FIELDS = [ + "household_id", + "person_id", + "household_income", + "car_availability", + "bike_availability", + "census_household_id", +] + def add_household(writer, household, member_ids): writer.start_household(household[FIELDS.index("household_id")]) writer.add_members(member_ids) writer.start_attributes() - writer.add_attribute("carAvailability", "java.lang.String", household[FIELDS.index("car_availability")]) - writer.add_attribute("bikeAvailability", "java.lang.String", household[FIELDS.index("bike_availability")]) - writer.add_attribute("household_income", "java.lang.Double", household[FIELDS.index("household_income")]) - writer.add_attribute("censusId", "java.lang.Long", household[FIELDS.index("census_household_id")]) + writer.add_attribute( + "carAvailability", + "java.lang.String", + household[FIELDS.index("car_availability")], + ) + writer.add_attribute( + "bikeAvailability", + "java.lang.String", + household[FIELDS.index("bike_availability")], + ) + writer.add_attribute( + "household_income", + "java.lang.Double", + household[FIELDS.index("household_income")], + ) + writer.add_attribute( + "censusId", "java.lang.Long", household[FIELDS.index("census_household_id")] + ) writer.end_attributes() writer.end_household() + def execute(context): output_path = "%s/households.xml.gz" % context.path() df_persons = context.stage("synthesis.population.enriched") - df_persons = df_persons.sort_values(by = ["household_id", "person_id"]) + df_persons = df_persons.sort_values(by=["household_id", "person_id"]) df_persons = df_persons[FIELDS] current_members = [] current_household_id = None current_household = None - with gzip.open(output_path, 'wb+') as writer: - with io.BufferedWriter(writer, buffer_size = 2 * 1024**3) as writer: + with gzip.open(output_path, "wb+") as writer: + with io.BufferedWriter(writer, buffer_size=2 * 1024**3) as writer: writer = writers.HouseholdsWriter(writer) writer.start_households() - with context.progress(total = len(df_persons), label = "Writing households ...") as progress: - for item in df_persons.itertuples(index = False): + with context.progress( + total=len(df_persons), label="Writing households ..." 
+ ) as progress: + for item in df_persons.itertuples(index=False): if current_household_id != item[FIELDS.index("household_id")]: if not current_household_id is None: add_household(writer, current_household, current_members) diff --git a/matsim/scenario/population.py b/matsim/scenario/population.py index 2fc0fa4d..2f7e52eb 100644 --- a/matsim/scenario/population.py +++ b/matsim/scenario/population.py @@ -7,6 +7,7 @@ import matsim.writers as writers from matsim.writers import backlog_iterator + def configure(context): context.stage("synthesis.population.enriched") @@ -16,59 +17,125 @@ def configure(context): context.stage("synthesis.population.trips") context.stage("synthesis.vehicles.vehicles") + PERSON_FIELDS = [ - "person_id", "household_income", "car_availability", "bike_availability", - "census_household_id", "census_person_id", "household_id", - "has_license", "has_pt_subscription", "is_passenger", - "hts_id", "hts_household_id", - "age", "employed", "sex" + "person_id", + "household_income", + "car_availability", + "bike_availability", + "census_household_id", + "census_person_id", + "household_id", + "has_license", + "has_pt_subscription", + "is_passenger", + "hts_id", + "hts_household_id", + "age", + "employed", + "sex", ] ACTIVITY_FIELDS = [ - "person_id", "start_time", "end_time", "purpose", "geometry", "location_id" + "person_id", + "start_time", + "end_time", + "purpose", + "geometry", + "location_id", ] -TRIP_FIELDS = [ - "person_id", "mode", "departure_time", "travel_time" -] +TRIP_FIELDS = ["person_id", "mode", "departure_time", "travel_time"] + +VEHICLE_FIELDS = ["owner_id", "vehicle_id", "mode"] -VEHICLE_FIELDS = [ - "owner_id", "vehicle_id", "mode" -] def add_person(writer, person, activities, trips, vehicles): writer.start_person(person[PERSON_FIELDS.index("person_id")]) writer.start_attributes() - writer.add_attribute("householdId", "java.lang.Integer", person[PERSON_FIELDS.index("household_id")]) - writer.add_attribute("householdIncome", "java.lang.Double", person[PERSON_FIELDS.index("household_income")]) - - writer.add_attribute("carAvailability", "java.lang.String", person[PERSON_FIELDS.index("car_availability")]) - writer.add_attribute("bikeAvailability", "java.lang.String", person[PERSON_FIELDS.index("bike_availability")]) - - writer.add_attribute("censusHouseholdId", "java.lang.Long", person[PERSON_FIELDS.index("census_household_id")]) - writer.add_attribute("censusPersonId", "java.lang.Long", person[PERSON_FIELDS.index("census_person_id")]) - - writer.add_attribute("htsHouseholdId", "java.lang.Long", person[PERSON_FIELDS.index("hts_household_id")]) - writer.add_attribute("htsPersonId", "java.lang.Long", person[PERSON_FIELDS.index("hts_id")]) - - writer.add_attribute("hasPtSubscription", "java.lang.Boolean", person[PERSON_FIELDS.index("has_pt_subscription")]) - writer.add_attribute("hasLicense", "java.lang.String", writer.yes_no(person[PERSON_FIELDS.index("has_license")])) - - writer.add_attribute("isPassenger", "java.lang.Boolean", person[PERSON_FIELDS.index("is_passenger")]) + writer.add_attribute( + "householdId", "java.lang.Integer", person[PERSON_FIELDS.index("household_id")] + ) + writer.add_attribute( + "householdIncome", + "java.lang.Double", + person[PERSON_FIELDS.index("household_income")], + ) + + writer.add_attribute( + "carAvailability", + "java.lang.String", + person[PERSON_FIELDS.index("car_availability")], + ) + writer.add_attribute( + "bikeAvailability", + "java.lang.String", + person[PERSON_FIELDS.index("bike_availability")], + ) + + 
writer.add_attribute( + "censusHouseholdId", + "java.lang.Long", + person[PERSON_FIELDS.index("census_household_id")], + ) + writer.add_attribute( + "censusPersonId", + "java.lang.Long", + person[PERSON_FIELDS.index("census_person_id")], + ) + + writer.add_attribute( + "htsHouseholdId", + "java.lang.Long", + person[PERSON_FIELDS.index("hts_household_id")], + ) + writer.add_attribute( + "htsPersonId", "java.lang.Long", person[PERSON_FIELDS.index("hts_id")] + ) + + writer.add_attribute( + "hasPtSubscription", + "java.lang.Boolean", + person[PERSON_FIELDS.index("has_pt_subscription")], + ) + writer.add_attribute( + "hasLicense", + "java.lang.String", + writer.yes_no(person[PERSON_FIELDS.index("has_license")]), + ) + + writer.add_attribute( + "isPassenger", "java.lang.Boolean", person[PERSON_FIELDS.index("is_passenger")] + ) writer.add_attribute("age", "java.lang.Integer", person[PERSON_FIELDS.index("age")]) - writer.add_attribute("employed", "java.lang.String", person[PERSON_FIELDS.index("employed")]) - writer.add_attribute("sex", "java.lang.String", person[PERSON_FIELDS.index("sex")][0]) - - writer.add_attribute("vehicles", "org.matsim.vehicles.PersonVehicles", "{{{content}}}".format(content = ",".join([ - "\"{mode}\":\"{id}\"".format(mode = v[VEHICLE_FIELDS.index("mode")], id = v[VEHICLE_FIELDS.index("vehicle_id")]) - for v in vehicles - ]))) + writer.add_attribute( + "employed", "java.lang.String", person[PERSON_FIELDS.index("employed")] + ) + writer.add_attribute( + "sex", "java.lang.String", person[PERSON_FIELDS.index("sex")][0] + ) + + writer.add_attribute( + "vehicles", + "org.matsim.vehicles.PersonVehicles", + "{{{content}}}".format( + content=",".join( + [ + '"{mode}":"{id}"'.format( + mode=v[VEHICLE_FIELDS.index("mode")], + id=v[VEHICLE_FIELDS.index("vehicle_id")], + ) + for v in vehicles + ] + ) + ), + ) writer.end_attributes() - writer.start_plan(selected = True) + writer.start_plan(selected=True) for activity, trip in itertools.zip_longest(activities, trips): start_time = activity[ACTIVITY_FIELDS.index("start_time")] @@ -80,58 +147,71 @@ def add_person(writer, person, activities, trips, vehicles): location_id = "home_%s" % person[PERSON_FIELDS.index("household_id")] location = writer.location( - geometry.x, geometry.y, - None if location_id == -1 else location_id + geometry.x, geometry.y, None if location_id == -1 else location_id ) writer.add_activity( - type = activity[ACTIVITY_FIELDS.index("purpose")], - location = location, - start_time = None if np.isnan(start_time) else start_time, - end_time = None if np.isnan(end_time) else end_time + type=activity[ACTIVITY_FIELDS.index("purpose")], + location=location, + start_time=None if np.isnan(start_time) else start_time, + end_time=None if np.isnan(end_time) else end_time, ) if not trip is None: writer.add_leg( - mode = trip[TRIP_FIELDS.index("mode")], - departure_time = trip[TRIP_FIELDS.index("departure_time")], - travel_time = trip[TRIP_FIELDS.index("travel_time")] + mode=trip[TRIP_FIELDS.index("mode")], + departure_time=trip[TRIP_FIELDS.index("departure_time")], + travel_time=trip[TRIP_FIELDS.index("travel_time")], ) writer.end_plan() writer.end_person() + def execute(context): output_path = "%s/population.xml.gz" % context.path() df_persons = context.stage("synthesis.population.enriched") - df_persons = df_persons.sort_values(by = ["household_id", "person_id"]) + df_persons = df_persons.sort_values(by=["household_id", "person_id"]) df_persons = df_persons[PERSON_FIELDS] - df_activities = 
context.stage("synthesis.population.activities").sort_values(by = ["person_id", "activity_index"]) - df_locations = context.stage("synthesis.population.spatial.locations")[[ - "person_id", "activity_index", "geometry", "location_id"]].sort_values(by = ["person_id", "activity_index"]) + df_activities = context.stage("synthesis.population.activities").sort_values( + by=["person_id", "activity_index"] + ) + df_locations = context.stage("synthesis.population.spatial.locations")[ + ["person_id", "activity_index", "geometry", "location_id"] + ].sort_values(by=["person_id", "activity_index"]) - df_activities = pd.merge(df_activities, df_locations, how = "left", on = ["person_id", "activity_index"]) - #df_activities["location_id"] = df_activities["location_id"].fillna(-1).astype(int) + df_activities = pd.merge( + df_activities, df_locations, how="left", on=["person_id", "activity_index"] + ) + # df_activities["location_id"] = df_activities["location_id"].fillna(-1).astype(int) df_trips = context.stage("synthesis.population.trips") df_trips["travel_time"] = df_trips["arrival_time"] - df_trips["departure_time"] df_vehicles = context.stage("synthesis.vehicles.vehicles")[1] - df_vehicles = df_vehicles.sort_values(by = ["owner_id"]) + df_vehicles = df_vehicles.sort_values(by=["owner_id"]) - with gzip.open(output_path, 'wb+') as writer: - with io.BufferedWriter(writer, buffer_size = 2 * 1024**3) as writer: + with gzip.open(output_path, "wb+") as writer: + with io.BufferedWriter(writer, buffer_size=2 * 1024**3) as writer: writer = writers.PopulationWriter(writer) writer.start_population() - activity_iterator = backlog_iterator(iter(df_activities[ACTIVITY_FIELDS].itertuples(index = False))) - trip_iterator = backlog_iterator(iter(df_trips[TRIP_FIELDS].itertuples(index = False))) - vehicle_iterator = backlog_iterator(iter(df_vehicles[VEHICLE_FIELDS].itertuples(index = False))) + activity_iterator = backlog_iterator( + iter(df_activities[ACTIVITY_FIELDS].itertuples(index=False)) + ) + trip_iterator = backlog_iterator( + iter(df_trips[TRIP_FIELDS].itertuples(index=False)) + ) + vehicle_iterator = backlog_iterator( + iter(df_vehicles[VEHICLE_FIELDS].itertuples(index=False)) + ) - with context.progress(total = len(df_persons), label = "Writing population ...") as progress: - for person in df_persons.itertuples(index = False): + with context.progress( + total=len(df_persons), label="Writing population ..." 
+ ) as progress: + for person in df_persons.itertuples(index=False): person_id = person[PERSON_FIELDS.index("person_id")] activities = [] @@ -142,7 +222,10 @@ def execute(context): while activity_iterator.has_next(): activity = activity_iterator.next() - if not activity[ACTIVITY_FIELDS.index("person_id")] == person_id: + if ( + not activity[ACTIVITY_FIELDS.index("person_id")] + == person_id + ): activity_iterator.previous() break else: diff --git a/matsim/scenario/supply/gtfs.py b/matsim/scenario/supply/gtfs.py index 0635cc0f..83991cdb 100644 --- a/matsim/scenario/supply/gtfs.py +++ b/matsim/scenario/supply/gtfs.py @@ -2,6 +2,7 @@ import matsim.runtime.pt2matsim as pt2matsim + def configure(context): context.stage("matsim.runtime.java") context.stage("matsim.runtime.pt2matsim") @@ -10,21 +11,26 @@ def configure(context): context.config("gtfs_date", "dayWithMostServices") + def execute(context): gtfs_path = "%s/output" % context.path("data.gtfs.cleaned") crs = context.stage("synthesis.population.spatial.home.locations").crs - pt2matsim.run(context, "org.matsim.pt2matsim.run.Gtfs2TransitSchedule", [ - gtfs_path, - context.config("gtfs_date"), crs, - "%s/transit_schedule.xml.gz" % context.path(), - "%s/transit_vehicles.xml.gz" % context.path() - ]) + pt2matsim.run( + context, + "org.matsim.pt2matsim.run.Gtfs2TransitSchedule", + [ + gtfs_path, + context.config("gtfs_date"), + crs, + "%s/transit_schedule.xml.gz" % context.path(), + "%s/transit_vehicles.xml.gz" % context.path(), + ], + ) - assert(os.path.exists("%s/transit_schedule.xml.gz" % context.path())) - assert(os.path.exists("%s/transit_vehicles.xml.gz" % context.path())) + assert os.path.exists("%s/transit_schedule.xml.gz" % context.path()) + assert os.path.exists("%s/transit_vehicles.xml.gz" % context.path()) return dict( - schedule_path = "transit_schedule.xml.gz", - vehicles_path = "transit_vehicles.xml.gz" + schedule_path="transit_schedule.xml.gz", vehicles_path="transit_vehicles.xml.gz" ) diff --git a/matsim/scenario/supply/osm.py b/matsim/scenario/supply/osm.py index f723104e..f9ea0485 100644 --- a/matsim/scenario/supply/osm.py +++ b/matsim/scenario/supply/osm.py @@ -2,6 +2,7 @@ import matsim.runtime.pt2matsim as pt2matsim + def configure(context): context.stage("matsim.runtime.java") context.stage("matsim.runtime.pt2matsim") @@ -10,12 +11,15 @@ def configure(context): context.config("export_detailed_network", False) + def execute(context): osm_path = "%s/output.osm.gz" % context.path("data.osm.cleaned") crs = context.stage("data.spatial.iris").crs - pt2matsim.run(context, "org.matsim.pt2matsim.run.CreateDefaultOsmConfig", - arguments=["config_template.xml"] + pt2matsim.run( + context, + "org.matsim.pt2matsim.run.CreateDefaultOsmConfig", + arguments=["config_template.xml"], ) with open("%s/config_template.xml" % context.path()) as f_read: @@ -23,17 +27,17 @@ def execute(context): content = content.replace( '', - '' % osm_path + '' % osm_path, ) content = content.replace( '', - ''.format(crs) + ''.format(crs), ) content = content.replace( '', - '' + '', ) if context.config("export_detailed_network"): @@ -43,22 +47,24 @@ def execute(context): ) content = content.replace( - '', + "", """ - """ + """, ) with open("%s/config.xml" % context.path(), "w+") as f_write: f_write.write(content) - pt2matsim.run(context, "org.matsim.pt2matsim.run.Osm2MultimodalNetwork", - arguments=["config.xml"] + pt2matsim.run( + context, + "org.matsim.pt2matsim.run.Osm2MultimodalNetwork", + arguments=["config.xml"], ) - 
assert(os.path.exists("%s/network.xml.gz" % context.path())) + assert os.path.exists("%s/network.xml.gz" % context.path()) return "network.xml.gz" diff --git a/matsim/scenario/supply/processed.py b/matsim/scenario/supply/processed.py index f75fc130..448df94f 100644 --- a/matsim/scenario/supply/processed.py +++ b/matsim/scenario/supply/processed.py @@ -2,6 +2,7 @@ import matsim.runtime.pt2matsim as pt2matsim + def configure(context): context.stage("matsim.runtime.java") context.stage("matsim.runtime.pt2matsim") @@ -12,68 +13,71 @@ def configure(context): context.config("data_path") context.config("processes") + def execute(context): # Prepare input paths network_path = "%s/%s" % ( context.path("matsim.scenario.supply.osm"), - context.stage("matsim.scenario.supply.osm") + context.stage("matsim.scenario.supply.osm"), ) schedule_path = "%s/%s" % ( context.path("matsim.scenario.supply.gtfs"), - context.stage("matsim.scenario.supply.gtfs")["schedule_path"] + context.stage("matsim.scenario.supply.gtfs")["schedule_path"], ) # Create and modify config file - pt2matsim.run(context, "org.matsim.pt2matsim.run.CreateDefaultPTMapperConfig", [ - "config_template.xml" - ]) + pt2matsim.run( + context, + "org.matsim.pt2matsim.run.CreateDefaultPTMapperConfig", + ["config_template.xml"], + ) with open("%s/config_template.xml" % context.path()) as f_read: content = f_read.read() content = content.replace( '', - '' % network_path + '' % network_path, ) content = content.replace( '', - '' % schedule_path + '' % schedule_path, ) content = content.replace( '', - '' % context.config("processes") + '' % context.config("processes"), ) content = content.replace( '', - '' + '', ) content = content.replace( '', - '' + '', ) content = content.replace( '', - '' + '', ) content = content.replace( '', - '' + '', ) with open("%s/config.xml" % context.path(), "w+") as f_write: f_write.write(content) # Run mapping process - pt2matsim.run(context, "org.matsim.pt2matsim.run.PublicTransitMapper", [ - "config.xml" - ]) + pt2matsim.run( + context, "org.matsim.pt2matsim.run.PublicTransitMapper", ["config.xml"] + ) - assert(os.path.exists("%s/network.xml.gz" % context.path())) - assert(os.path.exists("%s/schedule.xml.gz" % context.path())) + assert os.path.exists("%s/network.xml.gz" % context.path()) + assert os.path.exists("%s/schedule.xml.gz" % context.path()) return dict( - network_path = "network.xml.gz", - schedule_path = "schedule.xml.gz", - #plausibility_path = "allPlausibilityWarnings.xml.gz" + network_path="network.xml.gz", + schedule_path="schedule.xml.gz", + # plausibility_path = "allPlausibilityWarnings.xml.gz" ) diff --git a/matsim/scenario/vehicles.py b/matsim/scenario/vehicles.py index 63205fc3..9530bbdc 100644 --- a/matsim/scenario/vehicles.py +++ b/matsim/scenario/vehicles.py @@ -5,52 +5,59 @@ import matsim.writers as writers + def configure(context): context.stage("synthesis.vehicles.vehicles") + TYPE_FIELDS = ["type_id", "nb_seats", "length", "width", "pce", "mode"] VEHICLE_FIELDS = ["vehicle_id", "type_id", "critair", "technology", "age", "euro"] + def execute(context): output_path = "%s/vehicles.xml.gz" % context.path() df_vehicle_types, df_vehicles = context.stage("synthesis.vehicles.vehicles") - with gzip.open(output_path, 'wb+') as writer: - with io.BufferedWriter(writer, buffer_size = 2 * 1024**3) as writer: + with gzip.open(output_path, "wb+") as writer: + with io.BufferedWriter(writer, buffer_size=2 * 1024**3) as writer: writer = writers.VehiclesWriter(writer) writer.start_vehicles() - with 
context.progress(total = len(df_vehicle_types), label = "Writing vehicles types ...") as progress: + with context.progress( + total=len(df_vehicle_types), label="Writing vehicles types ..." + ) as progress: for type in df_vehicle_types.to_dict(orient="records"): writer.add_type( type["type_id"], length=type["length"], width=type["width"], - engine_attributes = { + engine_attributes={ "HbefaVehicleCategory": type["hbefa_cat"], "HbefaTechnology": type["hbefa_tech"], "HbefaSizeClass": type["hbefa_size"], - "HbefaEmissionsConcept": type["hbefa_emission"] - } + "HbefaEmissionsConcept": type["hbefa_emission"], + }, ) progress.update() - with context.progress(total = len(df_vehicles), label = "Writing vehicles ...") as progress: + with context.progress( + total=len(df_vehicles), label="Writing vehicles ..." + ) as progress: for vehicle in df_vehicles.to_dict(orient="records"): writer.add_vehicle( vehicle["vehicle_id"], vehicle["type_id"], - attributes = { + attributes={ "critair": vehicle["critair"], "technology": vehicle["technology"], "age": vehicle["age"], - "euro": vehicle["euro"] - } + "euro": vehicle["euro"], + }, ) progress.update() writer.end_vehicles() - return "vehicles.xml.gz" \ No newline at end of file + return "vehicles.xml.gz" diff --git a/matsim/simulation/prepare.py b/matsim/simulation/prepare.py index 7a73e6d8..39176e31 100644 --- a/matsim/simulation/prepare.py +++ b/matsim/simulation/prepare.py @@ -3,9 +3,10 @@ import matsim.runtime.eqasim as eqasim + def configure(context): context.config("mode_choice", False) - + context.stage("matsim.scenario.population") context.stage("matsim.scenario.households") context.stage("matsim.scenario.vehicles") @@ -26,148 +27,245 @@ def configure(context): context.config("output_prefix", "ile_de_france_") + def execute(context): # Prepare input files facilities_path = "%s/%s" % ( context.path("matsim.scenario.facilities"), - context.stage("matsim.scenario.facilities") + context.stage("matsim.scenario.facilities"), ) population_path = "%s/%s" % ( context.path("matsim.scenario.population"), - context.stage("matsim.scenario.population") + context.stage("matsim.scenario.population"), ) network_path = "%s/%s" % ( context.path("matsim.scenario.supply.processed"), - context.stage("matsim.scenario.supply.processed")["network_path"] + context.stage("matsim.scenario.supply.processed")["network_path"], ) - eqasim.run(context, "org.eqasim.core.scenario.preparation.RunPreparation", [ - "--input-facilities-path", facilities_path, - "--output-facilities-path", "%sfacilities.xml.gz" % context.config("output_prefix"), - "--input-population-path", population_path, - "--output-population-path", "prepared_population.xml.gz", - "--input-network-path", network_path, - "--output-network-path", "%snetwork.xml.gz" % context.config("output_prefix"), - "--threads", context.config("processes") - ]) + eqasim.run( + context, + "org.eqasim.core.scenario.preparation.RunPreparation", + [ + "--input-facilities-path", + facilities_path, + "--output-facilities-path", + "%sfacilities.xml.gz" % context.config("output_prefix"), + "--input-population-path", + population_path, + "--output-population-path", + "prepared_population.xml.gz", + "--input-network-path", + network_path, + "--output-network-path", + "%snetwork.xml.gz" % context.config("output_prefix"), + "--threads", + context.config("processes"), + ], + ) - assert os.path.exists("%s/%sfacilities.xml.gz" % (context.path(), context.config("output_prefix"))) + assert os.path.exists( + "%s/%sfacilities.xml.gz" % (context.path(), 
context.config("output_prefix")) + ) assert os.path.exists("%s/prepared_population.xml.gz" % context.path()) - assert os.path.exists("%s/%snetwork.xml.gz" % (context.path(), context.config("output_prefix"))) + assert os.path.exists( + "%s/%snetwork.xml.gz" % (context.path(), context.config("output_prefix")) + ) # Copy remaining input files households_path = "%s/%s" % ( context.path("matsim.scenario.households"), - context.stage("matsim.scenario.households") + context.stage("matsim.scenario.households"), + ) + shutil.copy( + households_path, + "%s/%shouseholds.xml.gz" + % (context.cache_path, context.config("output_prefix")), ) - shutil.copy(households_path, "%s/%shouseholds.xml.gz" % (context.cache_path, context.config("output_prefix"))) transit_schedule_path = "%s/%s" % ( context.path("matsim.scenario.supply.processed"), - context.stage("matsim.scenario.supply.processed")["schedule_path"] + context.stage("matsim.scenario.supply.processed")["schedule_path"], + ) + shutil.copy( + transit_schedule_path, + "%s/%stransit_schedule.xml.gz" + % (context.cache_path, context.config("output_prefix")), ) - shutil.copy(transit_schedule_path, "%s/%stransit_schedule.xml.gz" % (context.cache_path, context.config("output_prefix"))) transit_vehicles_path = "%s/%s" % ( context.path("matsim.scenario.supply.gtfs"), - context.stage("matsim.scenario.supply.gtfs")["vehicles_path"] + context.stage("matsim.scenario.supply.gtfs")["vehicles_path"], + ) + shutil.copy( + transit_vehicles_path, + "%s/%stransit_vehicles.xml.gz" + % (context.cache_path, context.config("output_prefix")), ) - shutil.copy(transit_vehicles_path, "%s/%stransit_vehicles.xml.gz" % (context.cache_path, context.config("output_prefix"))) vehicles_path = "%s/%s" % ( context.path("matsim.scenario.vehicles"), - context.stage("matsim.scenario.vehicles") + context.stage("matsim.scenario.vehicles"), + ) + shutil.copy( + vehicles_path, + "%s/%svehicles.xml.gz" % (context.cache_path, context.config("output_prefix")), ) - shutil.copy(vehicles_path, "%s/%svehicles.xml.gz" % (context.cache_path, context.config("output_prefix"))) # Generate base configuration - eqasim.run(context, "org.eqasim.core.scenario.config.RunGenerateConfig", [ - "--sample-size", context.config("sampling_rate"), - "--threads", context.config("processes"), - "--prefix", context.config("output_prefix"), - "--random-seed", context.config("random_seed"), - "--output-path", "generic_config.xml" - ]) + eqasim.run( + context, + "org.eqasim.core.scenario.config.RunGenerateConfig", + [ + "--sample-size", + context.config("sampling_rate"), + "--threads", + context.config("processes"), + "--prefix", + context.config("output_prefix"), + "--random-seed", + context.config("random_seed"), + "--output-path", + "generic_config.xml", + ], + ) assert os.path.exists("%s/generic_config.xml" % context.path()) # Adapt config for Île-de-France - eqasim.run(context, "org.eqasim.ile_de_france.scenario.RunAdaptConfig", [ - "--input-path", "generic_config.xml", - "--output-path", "%sconfig.xml" % context.config("output_prefix"), - "--prefix", context.config("output_prefix") - ]) - assert os.path.exists("%s/%sconfig.xml" % (context.path(), context.config("output_prefix"))) + eqasim.run( + context, + "org.eqasim.ile_de_france.scenario.RunAdaptConfig", + [ + "--input-path", + "generic_config.xml", + "--output-path", + "%sconfig.xml" % context.config("output_prefix"), + "--prefix", + context.config("output_prefix"), + ], + ) + assert os.path.exists( + "%s/%sconfig.xml" % (context.path(), 
context.config("output_prefix")) + ) # Add urban attributes to population and network # (but only if Paris is included in the scenario!) df_codes = context.stage("data.spatial.codes") if "75" in df_codes["departement_id"].unique().astype(str): - df_shape = context.stage("data.spatial.departments")[["departement_id", "geometry"]].rename( - columns = dict(departement_id = "id") - ) + df_shape = context.stage("data.spatial.departments")[ + ["departement_id", "geometry"] + ].rename(columns=dict(departement_id="id")) df_shape["id"] = df_shape["id"].astype(str) if "75" in df_shape["id"].unique(): df_shape.to_file("%s/departments.shp" % context.path()) - eqasim.run(context, "org.eqasim.core.scenario.spatial.RunImputeSpatialAttribute", [ - "--input-population-path", "prepared_population.xml.gz", - "--output-population-path", "prepared_population.xml.gz", - "--input-network-path", "%snetwork.xml.gz" % context.config("output_prefix"), - "--output-network-path", "%snetwork.xml.gz" % context.config("output_prefix"), - "--shape-path", "departments.shp", - "--shape-attribute", "id", - "--shape-value", "75", - "--attribute", "isUrban" - ]) - - eqasim.run(context, "org.eqasim.core.scenario.spatial.RunAdjustCapacity", [ - "--input-path", "%snetwork.xml.gz" % context.config("output_prefix"), - "--output-path", "%snetwork.xml.gz" % context.config("output_prefix"), - "--shape-path", "departments.shp", - "--shape-attribute", "id", - "--shape-value", "75", - "--factor", str(0.8) - ]) - - + eqasim.run( + context, + "org.eqasim.core.scenario.spatial.RunImputeSpatialAttribute", + [ + "--input-population-path", + "prepared_population.xml.gz", + "--output-population-path", + "prepared_population.xml.gz", + "--input-network-path", + "%snetwork.xml.gz" % context.config("output_prefix"), + "--output-network-path", + "%snetwork.xml.gz" % context.config("output_prefix"), + "--shape-path", + "departments.shp", + "--shape-attribute", + "id", + "--shape-value", + "75", + "--attribute", + "isUrban", + ], + ) + + eqasim.run( + context, + "org.eqasim.core.scenario.spatial.RunAdjustCapacity", + [ + "--input-path", + "%snetwork.xml.gz" % context.config("output_prefix"), + "--output-path", + "%snetwork.xml.gz" % context.config("output_prefix"), + "--shape-path", + "departments.shp", + "--shape-attribute", + "id", + "--shape-value", + "75", + "--factor", + str(0.8), + ], + ) + # Optionally, perform mode choice if context.config("mode_choice"): - eqasim.run(context, "org.eqasim.core.standalone_mode_choice.RunStandaloneModeChoice", [ - "--config-path", "%sconfig.xml" % context.config("output_prefix"), - "--config:standaloneModeChoice.outputDirectory", "mode_choice", - "--config:global.numberOfThreads", context.config("processes"), - "--write-output-csv-trips", "true", - "--skip-scenario-check", "true", - "--config:plans.inputPlansFile", "prepared_population.xml.gz", - "--eqasim-configurator-class", "org.eqasim.ile_de_france.IDFConfigurator", - "--mode-choice-configurator-class", "org.eqasim.ile_de_france.IDFStandaloneModeChoiceConfigurator" - ]) + eqasim.run( + context, + "org.eqasim.core.standalone_mode_choice.RunStandaloneModeChoice", + [ + "--config-path", + "%sconfig.xml" % context.config("output_prefix"), + "--config:standaloneModeChoice.outputDirectory", + "mode_choice", + "--config:global.numberOfThreads", + context.config("processes"), + "--write-output-csv-trips", + "true", + "--skip-scenario-check", + "true", + "--config:plans.inputPlansFile", + "prepared_population.xml.gz", + "--eqasim-configurator-class", + 
"org.eqasim.ile_de_france.IDFConfigurator", + "--mode-choice-configurator-class", + "org.eqasim.ile_de_france.IDFStandaloneModeChoiceConfigurator", + ], + ) assert os.path.exists("%s/mode_choice/output_plans.xml.gz" % context.path()) assert os.path.exists("%s/mode_choice/output_trips.csv" % context.path()) assert os.path.exists("%s/mode_choice/output_pt_legs.csv" % context.path()) - shutil.copy("%s/mode_choice/output_plans.xml.gz" % context.path(), - "%s/%spopulation.xml.gz" % (context.path(), context.config("output_prefix"))) + shutil.copy( + "%s/mode_choice/output_plans.xml.gz" % context.path(), + "%s/%spopulation.xml.gz" + % (context.path(), context.config("output_prefix")), + ) else: # Route population - eqasim.run(context, "org.eqasim.core.scenario.routing.RunPopulationRouting", [ - "--config-path", "%sconfig.xml" % context.config("output_prefix"), - "--output-path", "%spopulation.xml.gz" % context.config("output_prefix"), - "--threads", context.config("processes"), - "--config:plans.inputPlansFile", "prepared_population.xml.gz" - ]) + eqasim.run( + context, + "org.eqasim.core.scenario.routing.RunPopulationRouting", + [ + "--config-path", + "%sconfig.xml" % context.config("output_prefix"), + "--output-path", + "%spopulation.xml.gz" % context.config("output_prefix"), + "--threads", + context.config("processes"), + "--config:plans.inputPlansFile", + "prepared_population.xml.gz", + ], + ) - assert os.path.exists("%s/%spopulation.xml.gz" % (context.path(), context.config("output_prefix"))) + assert os.path.exists( + "%s/%spopulation.xml.gz" % (context.path(), context.config("output_prefix")) + ) # Validate scenario - eqasim.run(context, "org.eqasim.core.scenario.validation.RunScenarioValidator", [ - "--config-path", "%sconfig.xml" % context.config("output_prefix") - ]) + eqasim.run( + context, + "org.eqasim.core.scenario.validation.RunScenarioValidator", + ["--config-path", "%sconfig.xml" % context.config("output_prefix")], + ) # Cleanup os.remove("%s/prepared_population.xml.gz" % context.path()) diff --git a/matsim/simulation/run.py b/matsim/simulation/run.py index fb6773b2..69da3376 100644 --- a/matsim/simulation/run.py +++ b/matsim/simulation/run.py @@ -3,23 +3,33 @@ import matsim.runtime.eqasim as eqasim + def configure(context): context.stage("matsim.simulation.prepare") context.stage("matsim.runtime.java") context.stage("matsim.runtime.eqasim") + def execute(context): config_path = "%s/%s" % ( context.path("matsim.simulation.prepare"), - context.stage("matsim.simulation.prepare") + context.stage("matsim.simulation.prepare"), ) # Run routing - eqasim.run(context, "org.eqasim.ile_de_france.RunSimulation", [ - "--config-path", config_path, - "--config:controler.lastIteration", str(1), - "--config:controler.writeEventsInterval", str(1), - "--config:controler.writePlansInterval", str(1), - ]) + eqasim.run( + context, + "org.eqasim.ile_de_france.RunSimulation", + [ + "--config-path", + config_path, + "--config:controler.lastIteration", + str(1), + "--config:controler.writeEventsInterval", + str(1), + "--config:controler.writePlansInterval", + str(1), + ], + ) assert os.path.exists("%s/simulation_output/output_events.xml.gz" % context.path()) diff --git a/matsim/writers.py b/matsim/writers.py index da99084f..94a5ced9 100644 --- a/matsim/writers.py +++ b/matsim/writers.py @@ -1,6 +1,7 @@ import numpy as np from xml.sax.saxutils import escape + class XmlWriter: def __init__(self, writer): self.writer = writer @@ -37,22 +38,26 @@ def time(self, time): time = int(time) hours = time // 3600 
minutes = (time % 3600) // 60 - seconds = (time % 60) + seconds = time % 60 return "%02d:%02d:%02d" % (hours, minutes, seconds) - def location(self, x, y, facility_id = None): + def location(self, x, y, facility_id=None): return (x, y, None if facility_id is None else facility_id) + def _write_preface_attributes(writer, attributes): if len(attributes) > 0: - writer._write_line('') + writer._write_line("") writer.indent += 1 for item in attributes.items(): - writer._write_line('%s' % item) + writer._write_line( + '%s' % item + ) writer.indent -= 1 - writer._write_line('') + writer._write_line("") + class PopulationWriter(XmlWriter): POPULATION_SCOPE = 0 @@ -64,11 +69,13 @@ class PopulationWriter(XmlWriter): def __init__(self, writer): XmlWriter.__init__(self, writer) - def start_population(self, attributes = {}): + def start_population(self, attributes={}): self._require_scope(None) self._write_line('') - self._write_line('') - self._write_line('') + self._write_line( + '' + ) + self._write_line("") self.scope = self.POPULATION_SCOPE self.indent += 1 @@ -78,7 +85,7 @@ def start_population(self, attributes = {}): def end_population(self): self._require_scope(self.POPULATION_SCOPE) self.indent -= 1 - self._write_line('') + self._write_line("") self.scope = self.FINISHED_SCOPE def start_person(self, person_id): @@ -91,11 +98,11 @@ def end_person(self): self._require_scope(self.PERSON_SCOPE) self.indent -= 1 self.scope = self.POPULATION_SCOPE - self._write_line('') + self._write_line("") def start_attributes(self): # We don't require any scope here because attributes can be almost anywhere - self._write_line('') + self._write_line("") self.indent += 1 # And we need to remember which scope we were in before starting the attributes self._pre_attributes_scope = self.scope @@ -106,13 +113,13 @@ def end_attributes(self): self.indent -= 1 # Resetting the scope that we were in before starting the attributes self.scope = self._pre_attributes_scope - self._write_line('') + self._write_line("") def add_attribute(self, name, type, value): self._require_scope(self.ATTRIBUTES_SCOPE) - self._write_line('%s' % ( - name, type, value - )) + self._write_line( + '%s' % (name, type, value) + ) def start_plan(self, selected): self._require_scope(self.PERSON_SCOPE) @@ -124,33 +131,37 @@ def end_plan(self): self._require_scope(self.PLAN_SCOPE) self.indent -= 1 self.scope = self.PERSON_SCOPE - self._write_line('') + self._write_line("") - def add_activity(self, type, location, start_time = None, end_time = None): + def add_activity(self, type, location, start_time=None, end_time=None): self._require_scope(self.PLAN_SCOPE) self._write_indent() - self._write('\n') + if location[2] is not None: + self._write('facility="%s" ' % str(location[2])) + if start_time is not None: + self._write('start_time="%s" ' % self.time(start_time)) + if end_time is not None: + self._write('end_time="%s" ' % self.time(end_time)) + self._write("/>\n") def add_leg(self, mode, departure_time, travel_time): self._require_scope(self.PLAN_SCOPE) self._write_indent() - self._write('\n') + self._write(">\n") self.start_attributes() - self.add_attribute('routingMode', 'java.lang.String', mode) + self.add_attribute("routingMode", "java.lang.String", mode) self.end_attributes() - self._write_line('') + self._write_line("") + class HouseholdsWriter(XmlWriter): HOUSEHOLDS_SCOPE = 0 @@ -161,10 +172,12 @@ class HouseholdsWriter(XmlWriter): def __init__(self, writer): XmlWriter.__init__(self, writer) - def start_households(self, attributes = {}): + def 
start_households(self, attributes={}): self._require_scope(None) self._write_line('') - self._write_line('') + self._write_line( + '' + ) self.scope = self.HOUSEHOLDS_SCOPE self.indent += 1 @@ -173,7 +186,7 @@ def start_households(self, attributes = {}): def end_households(self): self._require_scope(self.HOUSEHOLDS_SCOPE) - self._write_line('') + self._write_line("") self.scope = self.FINISHED_SCOPE def start_household(self, household_id): @@ -186,11 +199,11 @@ def end_household(self): self._require_scope(self.HOUSEHOLD_SCOPE) self.indent -= 1 self.scope = self.HOUSEHOLDS_SCOPE - self._write_line('') + self._write_line("") def start_attributes(self): self._require_scope(self.HOUSEHOLD_SCOPE) - self._write_line('') + self._write_line("") self.indent += 1 self.scope = self.ATTRIBUTES_SCOPE @@ -198,26 +211,28 @@ def end_attributes(self): self._require_scope(self.ATTRIBUTES_SCOPE) self.indent -= 1 self.scope = self.HOUSEHOLD_SCOPE - self._write_line('') + self._write_line("") def add_attribute(self, name, type, value): self._require_scope(self.ATTRIBUTES_SCOPE) - self._write_line('%s' % ( - name, type, value - )) + self._write_line( + '%s' % (name, type, value) + ) def add_members(self, person_ids): self._require_scope(self.HOUSEHOLD_SCOPE) - self._write_line('') + self._write_line("") self.indent += 1 - for person_id in person_ids: self._write_line('' % person_id) + for person_id in person_ids: + self._write_line('' % person_id) self.indent -= 1 - self._write_line('') + self._write_line("") def add_income(self, income): self._require_scope(self.HOUSEHOLD_SCOPE) self._write_line('%f' % income) + class FacilitiesWriter(XmlWriter): FACILITIES_SCOPE = 0 FINISHED_SCOPE = 1 @@ -226,11 +241,13 @@ class FacilitiesWriter(XmlWriter): def __init__(self, writer): XmlWriter.__init__(self, writer) - def start_facilities(self, attributes = {}): + def start_facilities(self, attributes={}): self._require_scope(None) self._write_line('') - self._write_line('') - self._write_line('') + self._write_line( + '' + ) + self._write_line("") self.scope = self.FACILITIES_SCOPE self.indent += 1 @@ -240,14 +257,12 @@ def start_facilities(self, attributes = {}): def end_facilities(self): self._require_scope(self.FACILITIES_SCOPE) self.indent -= 1 - self._write_line('') + self._write_line("") self.scope = self.FINISHED_SCOPE def start_facility(self, facility_id, x, y): self._require_scope(self.FACILITIES_SCOPE) - self._write_line('' % ( - str(facility_id), x, y - )) + self._write_line('' % (str(facility_id), x, y)) self.indent += 1 self.scope = self.FACILITY_SCOPE @@ -256,7 +271,7 @@ def end_facility(self): self._require_scope(self.FACILITY_SCOPE) self.indent -= 1 self.scope = self.FACILITIES_SCOPE - self._write_line('') + self._write_line("") def add_activity(self, purpose): self._require_scope(self.FACILITY_SCOPE) @@ -270,10 +285,12 @@ class VehiclesWriter(XmlWriter): def __init__(self, writer): XmlWriter.__init__(self, writer) - def start_vehicles(self, attributes = {}): + def start_vehicles(self, attributes={}): self._require_scope(None) self._write_line('') - self._write_line('') + self._write_line( + '' + ) self.scope = self.VEHICLES_SCOPE self.indent += 1 @@ -283,40 +300,58 @@ def start_vehicles(self, attributes = {}): def end_vehicles(self): self._require_scope(self.VEHICLES_SCOPE) self.indent -= 1 - self._write_line('') + self._write_line("") self.scope = self.FINISHED_SCOPE - def add_type(self, vehicle_type_id, nb_seats = 4, length = 5.0, width = 1.0, pce = 1.0, mode = "car", attributes = {}, engine_attributes = 
{}): + def add_type( + self, + vehicle_type_id, + nb_seats=4, + length=5.0, + width=1.0, + pce=1.0, + mode="car", + attributes={}, + engine_attributes={}, + ): self._require_scope(self.VEHICLES_SCOPE) self._write_line('' % str(vehicle_type_id)) self.indent += 1 if len(attributes) > 0: - self._write_line('') + self._write_line("") self.indent += 1 for key, item in attributes.items(): - self._write_line('%s' % (key, escape(item))) + self._write_line( + '%s' + % (key, escape(item)) + ) self.indent -= 1 - self._write_line('') + self._write_line("") if not np.isnan(nb_seats): - self._write_line('' % nb_seats) + self._write_line( + '' % nb_seats + ) self._write_line('' % length) self._write_line('' % width) if len(engine_attributes) > 0: - self._write_line('') + self._write_line("") self.indent += 1 - self._write_line('') + self._write_line("") self.indent += 1 for key, item in engine_attributes.items(): - self._write_line('%s' % (key, escape(item))) + self._write_line( + '%s' + % (key, escape(item)) + ) self.indent -= 1 - self._write_line('') + self._write_line("") self.indent -= 1 - self._write_line('') + self._write_line("") if not np.isnan(pce): self._write_line('' % pce) @@ -324,29 +359,35 @@ def add_type(self, vehicle_type_id, nb_seats = 4, length = 5.0, width = 1.0, pce self._write_line('' % mode) self.indent -= 1 - self._write_line('') - + self._write_line("") - def add_vehicle(self, vehicle_id, type_id, attributes = {}): + def add_vehicle(self, vehicle_id, type_id, attributes={}): self._require_scope(self.VEHICLES_SCOPE) if len(attributes) > 0: - self._write_line('' % (str(vehicle_id), str(type_id))) + self._write_line( + '' % (str(vehicle_id), str(type_id)) + ) self.indent += 1 - self._write_line('') + self._write_line("") self.indent += 1 for key, item in attributes.items(): - self._write_line('%s' % (str(key), str(item))) + self._write_line( + '%s' + % (str(key), str(item)) + ) self.indent -= 1 - self._write_line('') + self._write_line("") self.indent -= 1 - self._write_line('') + self._write_line("") else: - self._write_line('' % (str(vehicle_id), str(type_id))) + self._write_line( + '' % (str(vehicle_id), str(type_id)) + ) class backlog_iterator: - def __init__(self, iterable, backlog = 1): + def __init__(self, iterable, backlog=1): self.iterable = iterable self.forward_log = [] self.backward_log = [None] * (backlog + 1) diff --git a/scripts/verify_data.py b/scripts/verify_data.py index 93b77d4f..55a37b2f 100644 --- a/scripts/verify_data.py +++ b/scripts/verify_data.py @@ -1,34 +1,43 @@ import requests import time -# The goal of this script is to verify the availability of the data +# The goal of this script is to verify the availability of the data # that is needed to set up the pipeline -sleep_time = 5 # seconds -timeout = 30 # seconds +sleep_time = 5 # seconds +timeout = 30 # seconds retries = 3 + class Report: def __init__(self): self.sources = [] def register(self, name, url): - self.sources.append({ "name": name, "url": url }) + self.sources.append({"name": name, "url": url}) def validate(self): failed = [] with requests.Session() as session: - session.headers.update({ "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:130.0) Gecko/20100101 Firefox/130.0" }) + session.headers.update( + { + "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:130.0) Gecko/20100101 Firefox/130.0" + } + ) for index, source in enumerate(self.sources): - print("[{}/{}] Checking {} ...".format(index + 1, len(self.sources), source["name"])) - + print( + "[{}/{}] Checking {} ...".format( 
+ index + 1, len(self.sources), source["name"] + ) + ) + retry = 0 success = False while not success and retry < retries: try: - response = session.head(source["url"], timeout = timeout) + response = session.head(source["url"], timeout=timeout) source["status"] = response.status_code success = True except TimeoutError: @@ -38,54 +47,59 @@ def validate(self): print(e) retry += 1 - print(" Status {} (retry {}/{})".format(source["status"], retry, retries)) - + print( + " Status {} (retry {}/{})".format( + source["status"], retry, retries + ) + ) + time.sleep(sleep_time) if source["status"] != 200: failed.append(source["name"]) - + print("Done.") print("Missing: ", len(failed)) print(failed) return len(failed) == 0 + report = Report() report.register( "Census data (RP 2019)", - "https://www.insee.fr/fr/statistiques/fichier/6544333/RP2019_INDCVI_csv.zip" + "https://www.insee.fr/fr/statistiques/fichier/6544333/RP2019_INDCVI_csv.zip", ) report.register( "Population totals (RP 2019)", - "https://www.insee.fr/fr/statistiques/fichier/6543200/base-ic-evol-struct-pop-2019.zip" + "https://www.insee.fr/fr/statistiques/fichier/6543200/base-ic-evol-struct-pop-2019.zip", ) report.register( "Origin-destination data (RP-MOBPRO 2019)", - "https://www.insee.fr/fr/statistiques/fichier/6456056/RP2019_mobpro_csv.zip" + "https://www.insee.fr/fr/statistiques/fichier/6456056/RP2019_mobpro_csv.zip", ) report.register( "Origin-destination data (RP-MOBSCO 2019)", - "https://www.insee.fr/fr/statistiques/fichier/6456052/RP2019_mobsco_csv.zip" + "https://www.insee.fr/fr/statistiques/fichier/6456052/RP2019_mobsco_csv.zip", ) report.register( "Income tax data (Filosofi 2019), municipalities", - "https://www.insee.fr/fr/statistiques/fichier/6036907/indic-struct-distrib-revenu-2019-COMMUNES.zip" + "https://www.insee.fr/fr/statistiques/fichier/6036907/indic-struct-distrib-revenu-2019-COMMUNES.zip", ) report.register( "Income tax data (Filosofi 2019), administrative", - "https://www.insee.fr/fr/statistiques/fichier/6036907/indic-struct-distrib-revenu-2019-SUPRA.zip" + "https://www.insee.fr/fr/statistiques/fichier/6036907/indic-struct-distrib-revenu-2019-SUPRA.zip", ) report.register( "Service and facility census (BPE 2021)", - "https://www.insee.fr/fr/statistiques/fichier/3568638/bpe21_ensemble_xy_csv.zip" + "https://www.insee.fr/fr/statistiques/fichier/3568638/bpe21_ensemble_xy_csv.zip", ) entd_sources = [ @@ -94,60 +108,66 @@ def validate(self): (2556, "Q_menage"), (2565, "Q_individu"), (2566, "Q_ind_lieu_teg"), - (2568, "K_deploc") + (2568, "K_deploc"), ] for identifier, name in entd_sources: report.register( "National household travel survey (ENTD 2008), {}".format(name), - "https://www.statistiques.developpement-durable.gouv.fr/media/{}/download?inline".format(identifier) + "https://www.statistiques.developpement-durable.gouv.fr/media/{}/download?inline".format( + identifier + ), ) report.register( "IRIS zoning system (2021)", - "https://data.geopf.fr/telechargement/download/CONTOURS-IRIS/CONTOURS-IRIS_2-1__SHP__FRA_2021-01-01/CONTOURS-IRIS_2-1__SHP__FRA_2021-01-01.7z" + "https://data.geopf.fr/telechargement/download/CONTOURS-IRIS/CONTOURS-IRIS_2-1__SHP__FRA_2021-01-01/CONTOURS-IRIS_2-1__SHP__FRA_2021-01-01.7z", ) report.register( "Zoning registry (2021)", - "https://www.insee.fr/fr/statistiques/fichier/7708995/reference_IRIS_geo2021.zip" + "https://www.insee.fr/fr/statistiques/fichier/7708995/reference_IRIS_geo2021.zip", ) report.register( "Enterprise census (SIRENE), Etablissement", - 
"https://files.data.gouv.fr/insee-sirene/StockEtablissement_utf8.zip" + "https://files.data.gouv.fr/insee-sirene/StockEtablissement_utf8.zip", ) report.register( "Enterprise census (SIRENE), Unité Legale", - "https://files.data.gouv.fr/insee-sirene/StockUniteLegale_utf8.zip" + "https://files.data.gouv.fr/insee-sirene/StockUniteLegale_utf8.zip", ) report.register( "Enterprise census (SIRENE), Géolocalisé", - "https://files.data.gouv.fr/insee-sirene-geo/GeolocalisationEtablissement_Sirene_pour_etudes_statistiques_utf8.zip" + "https://files.data.gouv.fr/insee-sirene-geo/GeolocalisationEtablissement_Sirene_pour_etudes_statistiques_utf8.zip", ) for department in (75, 77, 78, 91, 92, 93, 94, 95): report.register( "Buildings database (BD TOPO), {}".format(department), - "https://data.geopf.fr/telechargement/download/BDTOPO/BDTOPO_3-0_TOUSTHEMES_GPKG_LAMB93_D0{}_2022-03-15/BDTOPO_3-0_TOUSTHEMES_GPKG_LAMB93_D0{}_2022-03-15.7z".format(department, department) + "https://data.geopf.fr/telechargement/download/BDTOPO/BDTOPO_3-0_TOUSTHEMES_GPKG_LAMB93_D0{}_2022-03-15/BDTOPO_3-0_TOUSTHEMES_GPKG_LAMB93_D0{}_2022-03-15.7z".format( + department, department + ), ) for department in (75, 77, 78, 91, 92, 93, 94, 95): report.register( "Adresses database (BAN), {}".format(department), - "https://adresse.data.gouv.fr/data/ban/adresses/latest/csv/adresses-{}.csv.gz".format(department) + "https://adresse.data.gouv.fr/data/ban/adresses/latest/csv/adresses-{}.csv.gz".format( + department + ), ) report.register( "Population projections", - "https://www.insee.fr/fr/statistiques/fichier/5894093/00_central.xlsx" + "https://www.insee.fr/fr/statistiques/fichier/5894093/00_central.xlsx", ) report.register( "Urban type", - "https://www.insee.fr/fr/statistiques/fichier/4802589/UU2020_au_01-01-2023.zip" + "https://www.insee.fr/fr/statistiques/fichier/4802589/UU2020_au_01-01-2023.zip", ) exit(0 if report.validate() else 1) diff --git a/synthesis/locations/education.py b/synthesis/locations/education.py index 45a32a70..3a0998b6 100644 --- a/synthesis/locations/education.py +++ b/synthesis/locations/education.py @@ -3,13 +3,15 @@ import pandas as pd import geopandas as gpd + def configure(context): context.stage("data.spatial.municipalities") - if context.config("education_location_source","bpe") == "addresses": - context.stage("data.external.education", alias = "location_source") + if context.config("education_location_source", "bpe") == "addresses": + context.stage("data.external.education", alias="location_source") else: - context.stage("data.bpe.cleaned", alias = "location_source") + context.stage("data.bpe.cleaned", alias="location_source") + EDUCATION_WEIGHT_MAP = [ ("C101", 100), # Preschools @@ -25,6 +27,7 @@ def configure(context): ("C501", 2000), # University ] + def fake_education(missing_communes, c, df_locations, df_zones): # Fake education destinations as the centroid of zones that have no other destinations print( @@ -50,49 +53,80 @@ def fake_education(missing_communes, c, df_locations, df_zones): return df_added + def execute(context): df_locations = context.stage("location_source") df_locations = df_locations[df_locations["activity_type"] == "education"] - df_locations = df_locations[["education_type", "commune_id","weight", "geometry"]].copy() + df_locations = df_locations[ + ["education_type", "commune_id", "weight", "geometry"] + ].copy() df_locations["fake"] = False # Add education destinations to the centroid of zones that have no other destinations df_zones = context.stage("data.spatial.municipalities") - 
required_communes = set(df_zones["commune_id"].unique()) - - if context.config("education_location_source") != 'bpe': # either weighted or addresses + required_communes = set(df_zones["commune_id"].unique()) + + if ( + context.config("education_location_source") != "bpe" + ): # either weighted or addresses for prefix, weight in EDUCATION_WEIGHT_MAP: - df_locations.loc[df_locations["education_type"]==prefix, "weight"] = ( + df_locations.loc[df_locations["education_type"] == prefix, "weight"] = ( weight - ) - if context.config("education_location_source") != 'bpe' : + ) + if context.config("education_location_source") != "bpe": - # Add education destinations in function of level education for c in ["C1", "C2", "C3"]: - missing_communes = required_communes - set(df_locations[df_locations["education_type"].str.startswith(c)]["commune_id"].unique()) + missing_communes = required_communes - set( + df_locations[df_locations["education_type"].str.startswith(c)][ + "commune_id" + ].unique() + ) if len(missing_communes) > 0: - df_locations = pd.concat([df_locations,fake_education(missing_communes, c, df_locations, df_zones)]) - + df_locations = pd.concat( + [ + df_locations, + fake_education(missing_communes, c, df_locations, df_zones), + ] + ) + # Add education destinations for last level education - missing_communes = required_communes - set(df_locations[~(df_locations["education_type"].str.startswith(("C1", "C2", "C3")))]["commune_id"].unique()) + missing_communes = required_communes - set( + df_locations[ + ~(df_locations["education_type"].str.startswith(("C1", "C2", "C3"))) + ]["commune_id"].unique() + ) if len(missing_communes) > 0: - df_locations = pd.concat([df_locations,fake_education(missing_communes, "C4", df_locations, df_zones)]) - else : + df_locations = pd.concat( + [ + df_locations, + fake_education(missing_communes, "C4", df_locations, df_zones), + ] + ) + else: missing_communes = required_communes - set(df_locations["commune_id"].unique()) if len(missing_communes) > 0: - df_locations = pd.concat([df_locations,fake_education(missing_communes, "C0", df_locations, df_zones)]) - df_locations["education_type"] = df_locations["education_type"].str[:2].astype("category") + df_locations = pd.concat( + [ + df_locations, + fake_education(missing_communes, "C0", df_locations, df_zones), + ] + ) + df_locations["education_type"] = ( + df_locations["education_type"].str[:2].astype("category") + ) # Define identifiers - df_locations["location_id"]= np.arange(len(df_locations)) + df_locations["location_id"] = np.arange(len(df_locations)) df_locations["location_id"] = "edu_" + df_locations["location_id"].astype(str) - - return df_locations[["location_id","education_type", "commune_id","weight","fake", "geometry"]] + + return df_locations[ + ["location_id", "education_type", "commune_id", "weight", "fake", "geometry"] + ] diff --git a/synthesis/locations/home/addresses.py b/synthesis/locations/home/addresses.py index 01410a13..34a29455 100644 --- a/synthesis/locations/home/addresses.py +++ b/synthesis/locations/home/addresses.py @@ -18,57 +18,65 @@ If no adresses matches a building, its centroid is taken as the unique address. 
""" + def configure(context): context.stage("data.bdtopo.raw") - + context.config("home_address_buffer", 5.0) context.config("home_location_weight", "housing") if context.config("home_location_source", "addresses") == "addresses": context.stage("data.ban.raw") + def execute(context): # Load buildings df_buildings = context.stage("data.bdtopo.raw") - print("Number of buildings:", + len(df_buildings)) + print("Number of buildings:", +len(df_buildings)) if context.config("home_location_source") == "buildings": - df_addresses = pd.DataFrame({ - "building_id": [], "housing": [], "geometry": [] - }) + df_addresses = pd.DataFrame({"building_id": [], "housing": [], "geometry": []}) - else: # addresses + else: # addresses # Load addresses df_addresses = context.stage("data.ban.raw")[["geometry"]].copy() - print("Number of addresses:", + len(df_addresses)) + print("Number of addresses:", +len(df_addresses)) # Buffer buildings to capture adresses in their vicinity df_buffer = df_buildings[["building_id", "housing", "geometry"]].copy() df_buffer["geometry"] = df_buffer.buffer(context.config("home_address_buffer")) # Find close-by addresses - df_addresses = gpd.sjoin(df_addresses, df_buffer, predicate = "within")[[ - "building_id", "housing", "geometry"]] - + df_addresses = gpd.sjoin(df_addresses, df_buffer, predicate="within")[ + ["building_id", "housing", "geometry"] + ] + # Create missing addresses by using centroids - df_missing = df_buildings[~df_buildings["building_id"].isin(df_addresses["building_id"])].copy() + df_missing = df_buildings[ + ~df_buildings["building_id"].isin(df_addresses["building_id"]) + ].copy() df_missing["geometry"] = df_missing["geometry"].centroid df_missing = df_missing[["building_id", "housing", "geometry"]] # Put together matched and missing addresses df_addresses = pd.concat([df_addresses, df_missing]) - df_addresses = gpd.GeoDataFrame(df_addresses, crs = df_buildings.crs).rename(columns={"building_id":"home_location_id"}) + df_addresses = gpd.GeoDataFrame(df_addresses, crs=df_buildings.crs).rename( + columns={"building_id": "home_location_id"} + ) # Obtain weights for all addresses if context.config("home_location_weight") == "housing": - df_count = df_addresses.groupby("home_location_id").size().reset_index(name = "count") - df_addresses = pd.merge(df_addresses, df_count, on = "home_location_id") + df_count = ( + df_addresses.groupby("home_location_id").size().reset_index(name="count") + ) + df_addresses = pd.merge(df_addresses, df_count, on="home_location_id") df_addresses["weight"] = df_addresses["housing"] / df_addresses["count"] else: df_addresses["weight"] = 1.0 - + return df_addresses[["home_location_id", "weight", "geometry"]] + def validate(context): - assert context.config("home_location_source") in ("addresses", "buildings","tiles") + assert context.config("home_location_source") in ("addresses", "buildings", "tiles") assert context.config("home_location_weight") in ("uniform", "housing") diff --git a/synthesis/locations/home/locations.py b/synthesis/locations/home/locations.py index 391748ec..40b012d2 100644 --- a/synthesis/locations/home/locations.py +++ b/synthesis/locations/home/locations.py @@ -7,27 +7,30 @@ home activities. 
""" + def configure(context): context.stage("data.spatial.iris") if context.config("home_location_source", "addresses") == "tiles": - context.stage("data.tiles.raw", alias = "location_source") + context.stage("data.tiles.raw", alias="location_source") else: - context.stage("synthesis.locations.home.addresses", alias = "location_source") + context.stage("synthesis.locations.home.addresses", alias="location_source") + def execute(context): # Find required IRIS df_iris = context.stage("data.spatial.iris") required_iris = set(df_iris["iris_id"].unique()) - + # Load all addresses and add IRIS information df_addresses = context.stage("location_source") print("Imputing IRIS into addresses ...") - - df_addresses = gpd.sjoin(df_addresses, - df_iris[["iris_id", "commune_id", "geometry"]], predicate = "within") + + df_addresses = gpd.sjoin( + df_addresses, df_iris[["iris_id", "commune_id", "geometry"]], predicate="within" + ) del df_addresses["index_right"] - + df_addresses.loc[df_addresses["iris_id"].isna(), "iris_id"] = "unknown" df_addresses["iris_id"] = df_addresses["iris_id"].astype("category") @@ -37,21 +40,30 @@ def execute(context): missing_iris = required_iris - set(df_addresses["iris_id"].unique()) if len(missing_iris) > 0: - print("Adding homes at the centroid of %d/%d IRIS without BDTOPO observations" % ( - len(missing_iris), len(required_iris))) + print( + "Adding homes at the centroid of %d/%d IRIS without BDTOPO observations" + % (len(missing_iris), len(required_iris)) + ) df_added = [] for iris_id in sorted(missing_iris): - centroid = df_iris[df_iris["iris_id"] == iris_id]["geometry"].centroid.iloc[0] + centroid = df_iris[df_iris["iris_id"] == iris_id]["geometry"].centroid.iloc[ + 0 + ] - df_added.append({ - "iris_id": iris_id, "geometry": centroid, - "commune_id": iris_id[:5], - "weight" : 1, - "home_location_id": -1 - }) + df_added.append( + { + "iris_id": iris_id, + "geometry": centroid, + "commune_id": iris_id[:5], + "weight": 1, + "home_location_id": -1, + } + ) - df_added = gpd.GeoDataFrame(pd.DataFrame.from_records(df_added), crs = df_addresses.crs) + df_added = gpd.GeoDataFrame( + pd.DataFrame.from_records(df_added), crs=df_addresses.crs + ) df_added["fake"] = True df_addresses = pd.concat([df_addresses, df_added]) diff --git a/synthesis/locations/home/output.py b/synthesis/locations/home/output.py index 54c33ec6..926170bd 100644 --- a/synthesis/locations/home/output.py +++ b/synthesis/locations/home/output.py @@ -1,5 +1,6 @@ import geopandas as gpd + def configure(context): context.config("output_path") context.config("output_prefix", "ile_de_france_") @@ -7,19 +8,26 @@ def configure(context): context.stage("data.bdtopo.raw") context.stage("synthesis.locations.home.locations") + def execute(context): # Load data - df_buildings = context.stage("data.bdtopo.raw")[[ - "building_id", "housing", "geometry"]] - - df_locations = context.stage("synthesis.locations.home.locations")[[ - "location_id", "weight", "building_id", "geometry"]] + df_buildings = context.stage("data.bdtopo.raw")[ + ["building_id", "housing", "geometry"] + ] + + df_locations = context.stage("synthesis.locations.home.locations")[ + ["location_id", "weight", "building_id", "geometry"] + ] # Write into same file with multiple layers - df_buildings.to_file("%s/%shousing.gpkg" % ( - context.config("output_path"), context.config("output_prefix") - ), layer = "buildings") + df_buildings.to_file( + "%s/%shousing.gpkg" + % (context.config("output_path"), context.config("output_prefix")), + layer="buildings", + ) - 
df_locations.to_file("%s/%shousing.gpkg" % ( - context.config("output_path"), context.config("output_prefix") - ), layer = "addresses") + df_locations.to_file( + "%s/%shousing.gpkg" + % (context.config("output_path"), context.config("output_prefix")), + layer="addresses", + ) diff --git a/synthesis/locations/secondary.py b/synthesis/locations/secondary.py index c5446359..40bef2c1 100644 --- a/synthesis/locations/secondary.py +++ b/synthesis/locations/secondary.py @@ -3,20 +3,24 @@ import pandas as pd import geopandas as gpd + def configure(context): context.stage("data.bpe.cleaned") context.stage("data.spatial.municipalities") + def execute(context): - df_locations = context.stage("data.bpe.cleaned")[[ - "enterprise_id", "activity_type", "commune_id", "geometry" - ]].copy() + df_locations = context.stage("data.bpe.cleaned")[ + ["enterprise_id", "activity_type", "commune_id", "geometry"] + ].copy() df_locations["destination_id"] = np.arange(len(df_locations)) # Attach attributes for activity types df_locations["offers_leisure"] = df_locations["activity_type"] == "leisure" df_locations["offers_shop"] = df_locations["activity_type"] == "shop" - df_locations["offers_other"] = ~(df_locations["offers_leisure"] | df_locations["offers_shop"]) + df_locations["offers_other"] = ~( + df_locations["offers_leisure"] | df_locations["offers_shop"] + ) # Define new IDs df_locations["location_id"] = np.arange(len(df_locations)) diff --git a/synthesis/locations/work.py b/synthesis/locations/work.py index 0fc9bcee..c4178244 100644 --- a/synthesis/locations/work.py +++ b/synthesis/locations/work.py @@ -11,14 +11,16 @@ place at their centroid to be in line with INSEE OD data. """ + def configure(context): context.stage("data.sirene.localized") context.stage("data.spatial.municipalities") + def execute(context): - df_workplaces = context.stage("data.sirene.localized")[[ - "commune_id", "minimum_employees", "maximum_employees", "geometry" - ]].copy() + df_workplaces = context.stage("data.sirene.localized")[ + ["commune_id", "minimum_employees", "maximum_employees", "geometry"] + ].copy() # Use minimum number of employees as weight df_workplaces["employees"] = df_workplaces["minimum_employees"] @@ -30,19 +32,29 @@ def execute(context): missing_communes = required_communes - set(df_workplaces["commune_id"].unique()) if len(missing_communes) > 0: - print("Adding work places at the centroid of %d/%d communes without SIRENE observations" % ( - len(missing_communes), len(required_communes))) + print( + "Adding work places at the centroid of %d/%d communes without SIRENE observations" + % (len(missing_communes), len(required_communes)) + ) df_added = [] for commune_id in missing_communes: - centroid = df_zones[df_zones["commune_id"] == commune_id]["geometry"].centroid.iloc[0] + centroid = df_zones[df_zones["commune_id"] == commune_id][ + "geometry" + ].centroid.iloc[0] - df_added.append({ - "commune_id": commune_id, "employees": 1.0, "geometry": centroid, - }) + df_added.append( + { + "commune_id": commune_id, + "employees": 1.0, + "geometry": centroid, + } + ) - df_added = gpd.GeoDataFrame(pd.DataFrame.from_records(df_added), crs = df_workplaces.crs) + df_added = gpd.GeoDataFrame( + pd.DataFrame.from_records(df_added), crs=df_workplaces.crs + ) df_added["fake"] = True df_workplaces = pd.concat([df_workplaces, df_added]) diff --git a/synthesis/output.py b/synthesis/output.py index 84c52a36..eeea93fc 100644 --- a/synthesis/output.py +++ b/synthesis/output.py @@ -7,6 +7,7 @@ import math import numpy as np + def 
configure(context): context.stage("synthesis.population.enriched") @@ -22,7 +23,7 @@ def configure(context): context.config("output_path") context.config("output_prefix", "ile_de_france_") context.config("output_formats", ["csv", "gpkg"]) - + if context.config("mode_choice", False): context.stage("matsim.simulation.prepare") @@ -33,8 +34,9 @@ def validate(context): if not os.path.isdir(output_path): raise RuntimeError("Output directory must exist: %s" % output_path) + def clean_gpkg(path): - ''' + """ Make GPKG files time and OS independent. In GeoPackage metadata: @@ -42,21 +44,28 @@ def clean_gpkg(path): - round coordinates. This allow for comparison of output digests between runs and between OS. - ''' + """ conn = sqlite3.connect(path) cur = conn.cursor() for table_name, min_x, min_y, max_x, max_y in cur.execute( "SELECT table_name, min_x, min_y, max_x, max_y FROM gpkg_contents" ): cur.execute( - "UPDATE gpkg_contents " + - "SET last_change='2000-01-01T00:00:00Z', min_x=?, min_y=?, max_x=?, max_y=? " + - "WHERE table_name=?", - (math.floor(min_x), math.floor(min_y), math.ceil(max_x), math.ceil(max_y), table_name) + "UPDATE gpkg_contents " + + "SET last_change='2000-01-01T00:00:00Z', min_x=?, min_y=?, max_x=?, max_y=? " + + "WHERE table_name=?", + ( + math.floor(min_x), + math.floor(min_y), + math.ceil(max_x), + math.ceil(max_y), + table_name, + ), ) conn.commit() conn.close() + def execute(context): output_path = context.config("output_path") output_prefix = context.config("output_prefix") @@ -64,121 +73,237 @@ def execute(context): # Prepare persons df_persons = context.stage("synthesis.population.enriched").rename( - columns = { "has_license": "has_driving_license" } + columns={"has_license": "has_driving_license"} ) - df_persons = df_persons[[ - "person_id", "household_id", - "age", "employed", "sex", "socioprofessional_class", - "has_driving_license", "has_pt_subscription", - "census_person_id", "hts_id" - ]] + df_persons = df_persons[ + [ + "person_id", + "household_id", + "age", + "employed", + "sex", + "socioprofessional_class", + "has_driving_license", + "has_pt_subscription", + "census_person_id", + "hts_id", + ] + ] if "csv" in output_formats: - df_persons.to_csv("%s/%spersons.csv" % (output_path, output_prefix), sep = ";", index = None, lineterminator = "\n") + df_persons.to_csv( + "%s/%spersons.csv" % (output_path, output_prefix), + sep=";", + index=None, + lineterminator="\n", + ) if "parquet" in output_formats: df_persons.to_parquet("%s/%spersons.parquet" % (output_path, output_prefix)) # Prepare activities df_activities = context.stage("synthesis.population.activities").rename( - columns = { "trip_index": "following_trip_index" } + columns={"trip_index": "following_trip_index"} ) df_activities = pd.merge( - df_activities, df_persons[["person_id", "household_id"]], on = "person_id") + df_activities, df_persons[["person_id", "household_id"]], on="person_id" + ) - df_activities["preceding_trip_index"] = df_activities["following_trip_index"].shift(1) + df_activities["preceding_trip_index"] = df_activities["following_trip_index"].shift( + 1 + ) df_activities.loc[df_activities["is_first"], "preceding_trip_index"] = -1 - df_activities["preceding_trip_index"] = df_activities["preceding_trip_index"].astype(int) + df_activities["preceding_trip_index"] = df_activities[ + "preceding_trip_index" + ].astype(int) # Prepare spatial data sets - df_locations = context.stage("synthesis.population.spatial.locations")[[ - "person_id", "iris_id", 
"commune_id","departement_id","region_id","activity_index", "geometry" - ]] + df_locations = context.stage("synthesis.population.spatial.locations")[ + [ + "person_id", + "iris_id", + "commune_id", + "departement_id", + "region_id", + "activity_index", + "geometry", + ] + ] - df_activities = pd.merge(df_activities, df_locations[[ - "person_id", "iris_id", "commune_id","departement_id","region_id","activity_index", "geometry" - ]], how = "left", on = ["person_id", "activity_index"]) + df_activities = pd.merge( + df_activities, + df_locations[ + [ + "person_id", + "iris_id", + "commune_id", + "departement_id", + "region_id", + "activity_index", + "geometry", + ] + ], + how="left", + on=["person_id", "activity_index"], + ) # Prepare spatial activities - df_spatial = gpd.GeoDataFrame(df_activities[[ - "person_id", "household_id", "activity_index", - "iris_id", "commune_id","departement_id","region_id", - "preceding_trip_index", "following_trip_index", - "purpose", "start_time", "end_time", - "is_first", "is_last", "geometry" - ]], crs = df_locations.crs) - df_spatial = df_spatial.astype({'purpose': 'str', "departement_id": 'str'}) + df_spatial = gpd.GeoDataFrame( + df_activities[ + [ + "person_id", + "household_id", + "activity_index", + "iris_id", + "commune_id", + "departement_id", + "region_id", + "preceding_trip_index", + "following_trip_index", + "purpose", + "start_time", + "end_time", + "is_first", + "is_last", + "geometry", + ] + ], + crs=df_locations.crs, + ) + df_spatial = df_spatial.astype({"purpose": "str", "departement_id": "str"}) # Write activities - df_activities = df_activities[[ - "person_id", "household_id", "activity_index", - "iris_id", "commune_id","departement_id","region_id", - "preceding_trip_index", "following_trip_index", - "purpose", "start_time", "end_time", - "is_first", "is_last" - ]] + df_activities = df_activities[ + [ + "person_id", + "household_id", + "activity_index", + "iris_id", + "commune_id", + "departement_id", + "region_id", + "preceding_trip_index", + "following_trip_index", + "purpose", + "start_time", + "end_time", + "is_first", + "is_last", + ] + ] if "csv" in output_formats: - df_activities.to_csv("%s/%sactivities.csv" % (output_path, output_prefix), sep = ";", index = None, lineterminator = "\n") + df_activities.to_csv( + "%s/%sactivities.csv" % (output_path, output_prefix), + sep=";", + index=None, + lineterminator="\n", + ) if "parquet" in output_formats: - df_activities.to_parquet("%s/%sactivities.parquet" % (output_path, output_prefix)) + df_activities.to_parquet( + "%s/%sactivities.parquet" % (output_path, output_prefix) + ) # Prepare households - df_households = context.stage("synthesis.population.enriched").rename( - columns = { "household_income": "income" } - ).drop_duplicates("household_id") - - df_households = pd.merge(df_households,df_activities[df_activities["purpose"] == "home"][["household_id", - "iris_id", "commune_id","departement_id","region_id"]].drop_duplicates("household_id"),how="left") - df_households = df_households[[ - "household_id","iris_id", "commune_id", "departement_id","region_id", - "car_availability", "bike_availability", - "number_of_vehicles", "number_of_bikes", - "income", - "census_household_id" - ]] + df_households = ( + context.stage("synthesis.population.enriched") + .rename(columns={"household_income": "income"}) + .drop_duplicates("household_id") + ) + + df_households = pd.merge( + df_households, + df_activities[df_activities["purpose"] == "home"][ + ["household_id", "iris_id", "commune_id", 
"departement_id", "region_id"] + ].drop_duplicates("household_id"), + how="left", + ) + df_households = df_households[ + [ + "household_id", + "iris_id", + "commune_id", + "departement_id", + "region_id", + "car_availability", + "bike_availability", + "number_of_vehicles", + "number_of_bikes", + "income", + "census_household_id", + ] + ] if "csv" in output_formats: - df_households.to_csv("%s/%shouseholds.csv" % (output_path, output_prefix), sep = ";", index = None, lineterminator = "\n") + df_households.to_csv( + "%s/%shouseholds.csv" % (output_path, output_prefix), + sep=";", + index=None, + lineterminator="\n", + ) if "parquet" in output_formats: - df_households.to_parquet("%s/%shouseholds.parquet" % (output_path, output_prefix)) + df_households.to_parquet( + "%s/%shouseholds.parquet" % (output_path, output_prefix) + ) # Prepare trips df_trips = context.stage("synthesis.population.trips").rename( - columns = { - "is_first_trip": "is_first", - "is_last_trip": "is_last" - } + columns={"is_first_trip": "is_first", "is_last_trip": "is_last"} ) df_trips["preceding_activity_index"] = df_trips["trip_index"] df_trips["following_activity_index"] = df_trips["trip_index"] + 1 - df_trips = df_trips[[ - "person_id", "trip_index", - "preceding_activity_index", "following_activity_index", - "departure_time", "arrival_time", - "preceding_purpose", "following_purpose", - "is_first", "is_last" - ]] + df_trips = df_trips[ + [ + "person_id", + "trip_index", + "preceding_activity_index", + "following_activity_index", + "departure_time", + "arrival_time", + "preceding_purpose", + "following_purpose", + "is_first", + "is_last", + ] + ] if context.config("mode_choice"): df_mode_choice = pd.read_csv( - "{}/mode_choice/output_trips.csv".format(context.path("matsim.simulation.prepare"), output_prefix), - delimiter = ";") + "{}/mode_choice/output_trips.csv".format( + context.path("matsim.simulation.prepare"), output_prefix + ), + delimiter=";", + ) df_mode_choice = df_mode_choice.rename(columns={"person_trip_id": "trip_index"}) columns_to_keep = ["person_id", "trip_index"] - columns_to_keep.extend([c for c in df_trips.columns if c not in df_mode_choice.columns]) + columns_to_keep.extend( + [c for c in df_trips.columns if c not in df_mode_choice.columns] + ) df_trips = df_trips[columns_to_keep] - df_trips = pd.merge(df_trips, df_mode_choice, on = [ - "person_id", "trip_index"], how="left", validate = "one_to_one") + df_trips = pd.merge( + df_trips, + df_mode_choice, + on=["person_id", "trip_index"], + how="left", + validate="one_to_one", + ) - shutil.copy("%s/mode_choice/output_pt_legs.csv" % (context.path("matsim.simulation.prepare")), - "%s/%spt_legs.csv" % (output_path, output_prefix)) + shutil.copy( + "%s/mode_choice/output_pt_legs.csv" + % (context.path("matsim.simulation.prepare")), + "%s/%spt_legs.csv" % (output_path, output_prefix), + ) - assert not np.any(df_trips["mode"].isna()) + assert not np.any(df_trips["mode"].isna()) if "csv" in output_formats: - df_trips.to_csv("%s/%strips.csv" % (output_path, output_prefix), sep = ";", index = None, lineterminator = "\n") + df_trips.to_csv( + "%s/%strips.csv" % (output_path, output_prefix), + sep=";", + index=None, + lineterminator="\n", + ) if "parquet" in output_formats: df_trips.to_csv("%s/%strips.parquet" % (output_path, output_prefix)) @@ -186,30 +311,48 @@ def execute(context): df_vehicle_types, df_vehicles = context.stage("synthesis.vehicles.vehicles") if "csv" in output_formats: - df_vehicle_types.to_csv("%s/%svehicle_types.csv" % (output_path, 
output_prefix), sep = ";", index = None, lineterminator = "\n") - df_vehicles.to_csv("%s/%svehicles.csv" % (output_path, output_prefix), sep = ";", index = None, lineterminator = "\n") + df_vehicle_types.to_csv( + "%s/%svehicle_types.csv" % (output_path, output_prefix), + sep=";", + index=None, + lineterminator="\n", + ) + df_vehicles.to_csv( + "%s/%svehicles.csv" % (output_path, output_prefix), + sep=";", + index=None, + lineterminator="\n", + ) if "parquet" in output_formats: - df_vehicle_types.to_parquet("%s/%svehicle_types.parquet" % (output_path, output_prefix)) + df_vehicle_types.to_parquet( + "%s/%svehicle_types.parquet" % (output_path, output_prefix) + ) df_vehicles.to_parquet("%s/%svehicles.parquet" % (output_path, output_prefix)) - if "gpkg" in output_formats: path = "%s/%sactivities.gpkg" % (output_path, output_prefix) - df_spatial.to_file(path, driver = "GPKG") + df_spatial.to_file(path, driver="GPKG") clean_gpkg(path) if "geoparquet" in output_formats: path = "%s/%sactivities.geoparquet" % (output_path, output_prefix) df_spatial.to_parquet(path) # Write spatial homes - df_spatial_homes = df_spatial[ - df_spatial["purpose"] == "home" - ].drop_duplicates("household_id")[[ - "household_id","iris_id", "commune_id","departement_id","region_id", "geometry" - ]] + df_spatial_homes = df_spatial[df_spatial["purpose"] == "home"].drop_duplicates( + "household_id" + )[ + [ + "household_id", + "iris_id", + "commune_id", + "departement_id", + "region_id", + "geometry", + ] + ] if "gpkg" in output_formats: path = "%s/%shomes.gpkg" % (output_path, output_prefix) - df_spatial_homes.to_file(path, driver = "GPKG") + df_spatial_homes.to_file(path, driver="GPKG") clean_gpkg(path) if "geoparquet" in output_formats: path = "%s/%shomes.geoparquet" % (output_path, output_prefix) @@ -217,8 +360,12 @@ def execute(context): # Write spatial commutes df_spatial = pd.merge( - df_spatial[df_spatial["purpose"] == "home"].drop_duplicates("person_id")[["person_id", "geometry"]].rename(columns = { "geometry": "home_geometry" }), - df_spatial[df_spatial["purpose"] == "work"].drop_duplicates("person_id")[["person_id", "geometry"]].rename(columns = { "geometry": "work_geometry" }) + df_spatial[df_spatial["purpose"] == "home"] + .drop_duplicates("person_id")[["person_id", "geometry"]] + .rename(columns={"geometry": "home_geometry"}), + df_spatial[df_spatial["purpose"] == "work"] + .drop_duplicates("person_id")[["person_id", "geometry"]] + .rename(columns={"geometry": "work_geometry"}), ) df_spatial["geometry"] = [ @@ -226,38 +373,50 @@ def execute(context): for od in zip(df_spatial["home_geometry"], df_spatial["work_geometry"]) ] - df_spatial = df_spatial.drop(columns = ["home_geometry", "work_geometry"]) + df_spatial = df_spatial.drop(columns=["home_geometry", "work_geometry"]) if "gpkg" in output_formats: path = "%s/%scommutes.gpkg" % (output_path, output_prefix) - df_spatial.to_file(path, driver = "GPKG") + df_spatial.to_file(path, driver="GPKG") clean_gpkg(path) if "geoparquet" in output_formats: path = "%s/%scommutes.geoparquet" % (output_path, output_prefix) df_spatial.to_parquet(path) # Write spatial trips - df_spatial = pd.merge(df_trips, df_locations[[ - "person_id", "activity_index", "geometry" - ]].rename(columns = { - "activity_index": "preceding_activity_index", - "geometry": "preceding_geometry" - }), how = "left", on = ["person_id", "preceding_activity_index"]) - - df_spatial = pd.merge(df_spatial, df_locations[[ - "person_id", "activity_index", "geometry" - ]].rename(columns = { - "activity_index": 
"following_activity_index", - "geometry": "following_geometry" - }), how = "left", on = ["person_id", "following_activity_index"]) + df_spatial = pd.merge( + df_trips, + df_locations[["person_id", "activity_index", "geometry"]].rename( + columns={ + "activity_index": "preceding_activity_index", + "geometry": "preceding_geometry", + } + ), + how="left", + on=["person_id", "preceding_activity_index"], + ) + + df_spatial = pd.merge( + df_spatial, + df_locations[["person_id", "activity_index", "geometry"]].rename( + columns={ + "activity_index": "following_activity_index", + "geometry": "following_geometry", + } + ), + how="left", + on=["person_id", "following_activity_index"], + ) df_spatial["geometry"] = [ geo.LineString(od) - for od in zip(df_spatial["preceding_geometry"], df_spatial["following_geometry"]) + for od in zip( + df_spatial["preceding_geometry"], df_spatial["following_geometry"] + ) ] - df_spatial = df_spatial.drop(columns = ["preceding_geometry", "following_geometry"]) + df_spatial = df_spatial.drop(columns=["preceding_geometry", "following_geometry"]) - df_spatial = gpd.GeoDataFrame(df_spatial, crs = df_locations.crs) + df_spatial = gpd.GeoDataFrame(df_spatial, crs=df_locations.crs) df_spatial["following_purpose"] = df_spatial["following_purpose"].astype(str) df_spatial["preceding_purpose"] = df_spatial["preceding_purpose"].astype(str) @@ -266,7 +425,7 @@ def execute(context): if "gpkg" in output_formats: path = "%s/%strips.gpkg" % (output_path, output_prefix) - df_spatial.to_file(path, driver = "GPKG") + df_spatial.to_file(path, driver="GPKG") clean_gpkg(path) if "geoparquet" in output_formats: path = "%s/%strips.geoparquet" % (output_path, output_prefix) diff --git a/synthesis/population/activities.py b/synthesis/population/activities.py index 27d3367a..e0c9590f 100644 --- a/synthesis/population/activities.py +++ b/synthesis/population/activities.py @@ -7,15 +7,22 @@ Transforms the synthetic trip table into a synthetic activity table. 
""" + def configure(context): context.stage("synthesis.population.enriched") context.stage("synthesis.population.trips") + def execute(context): df_activities = context.stage("synthesis.population.trips") # Add trip count - counts = df_activities.groupby("person_id").size().reset_index(name = "trip_count")["trip_count"].values + counts = ( + df_activities.groupby("person_id") + .size() + .reset_index(name="trip_count")["trip_count"] + .values + ) df_activities["trip_count"] = np.hstack([[count] * count for count in counts]) # Shift times and types of trips to arrive at activities @@ -43,14 +50,40 @@ def execute(context): df_last["activity_index"] = df_last["trip_count"] df_last["trip_index"] = -1 - df_activities = pd.concat([ - df_activities[["person_id", "activity_index", "trip_index", "purpose", "start_time", "end_time", "is_first", "is_last"]], - df_last[["person_id", "activity_index", "trip_index", "purpose", "start_time", "end_time", "is_first", "is_last"]] - ]).sort_values(by = ["person_id", "activity_index"]) + df_activities = pd.concat( + [ + df_activities[ + [ + "person_id", + "activity_index", + "trip_index", + "purpose", + "start_time", + "end_time", + "is_first", + "is_last", + ] + ], + df_last[ + [ + "person_id", + "activity_index", + "trip_index", + "purpose", + "start_time", + "end_time", + "is_first", + "is_last", + ] + ], + ] + ).sort_values(by=["person_id", "activity_index"]) # Add activities for people without trips df_missing = context.stage("synthesis.population.enriched") - df_missing = df_missing[~df_missing["person_id"].isin(df_activities["person_id"])][["person_id"]] + df_missing = df_missing[~df_missing["person_id"].isin(df_activities["person_id"])][ + ["person_id"] + ] df_missing["activity_index"] = 0 df_missing["trip_index"] = -1 diff --git a/synthesis/population/enriched.py b/synthesis/population/enriched.py index 15fc5649..22d83427 100644 --- a/synthesis/population/enriched.py +++ b/synthesis/population/enriched.py @@ -13,27 +13,38 @@ This stage fuses census data with HTS data. 
""" + def configure(context): context.stage("synthesis.population.matched") context.stage("synthesis.population.sampled") context.stage("synthesis.population.income.selected") hts = context.config("hts") - context.stage("data.hts.selected", alias = "hts") + context.stage("data.hts.selected", alias="hts") + def execute(context): # Select population columns - df_population = context.stage("synthesis.population.sampled")[[ - "person_id", "household_id", - "census_person_id", "census_household_id", - "age", "sex", "employed", "studies", - "number_of_vehicles", "household_size", "consumption_units", - "socioprofessional_class" - ]] + df_population = context.stage("synthesis.population.sampled")[ + [ + "person_id", + "household_id", + "census_person_id", + "census_household_id", + "age", + "sex", + "employed", + "studies", + "number_of_vehicles", + "household_size", + "consumption_units", + "socioprofessional_class", + ] + ] # Attach matching information df_matching = context.stage("synthesis.population.matched") - df_population = pd.merge(df_population, df_matching, on = "person_id") + df_population = pd.merge(df_population, df_matching, on="person_id") initial_size = len(df_population) initial_person_ids = len(df_population["person_id"].unique()) @@ -41,22 +52,40 @@ def execute(context): # Attach person and household attributes from HTS df_hts_households, df_hts_persons, _ = context.stage("hts") - df_hts_persons = df_hts_persons.rename(columns = { "person_id": "hts_id", "household_id": "hts_household_id" }) - df_hts_households = df_hts_households.rename(columns = { "household_id": "hts_household_id" }) - - df_population = pd.merge(df_population, df_hts_persons[[ - "hts_id", "hts_household_id", "has_license", "has_pt_subscription", "is_passenger" - ]], on = "hts_id") - - df_population = pd.merge(df_population, df_hts_households[[ - "hts_household_id", "number_of_bikes" - ]], on = "hts_household_id") + df_hts_persons = df_hts_persons.rename( + columns={"person_id": "hts_id", "household_id": "hts_household_id"} + ) + df_hts_households = df_hts_households.rename( + columns={"household_id": "hts_household_id"} + ) + + df_population = pd.merge( + df_population, + df_hts_persons[ + [ + "hts_id", + "hts_household_id", + "has_license", + "has_pt_subscription", + "is_passenger", + ] + ], + on="hts_id", + ) + + df_population = pd.merge( + df_population, + df_hts_households[["hts_household_id", "number_of_bikes"]], + on="hts_household_id", + ) # Attach income df_income = context.stage("synthesis.population.income.selected") - df_population = pd.merge(df_population, df_income[[ - "household_id", "household_income" - ]], on = "household_id") + df_population = pd.merge( + df_population, + df_income[["household_id", "household_income"]], + on="household_id", + ) # Check consistency final_size = len(df_population) @@ -68,28 +97,55 @@ def execute(context): assert initial_household_ids == final_household_ids # Add car availability - df_number_of_cars = df_population[["household_id", "number_of_vehicles"]].drop_duplicates("household_id") - df_number_of_licenses = df_population[["household_id", "has_license"]].groupby("household_id").sum().reset_index().rename(columns = { "has_license": "number_of_licenses" }) + df_number_of_cars = df_population[ + ["household_id", "number_of_vehicles"] + ].drop_duplicates("household_id") + df_number_of_licenses = ( + df_population[["household_id", "has_license"]] + .groupby("household_id") + .sum() + .reset_index() + .rename(columns={"has_license": "number_of_licenses"}) + ) 
df_car_availability = pd.merge(df_number_of_cars, df_number_of_licenses) df_car_availability["car_availability"] = "all" - df_car_availability.loc[df_car_availability["number_of_vehicles"] < df_car_availability["number_of_licenses"], "car_availability"] = "some" - df_car_availability.loc[df_car_availability["number_of_vehicles"] == 0, "car_availability"] = "none" - df_car_availability["car_availability"] = df_car_availability["car_availability"].astype("category") - - df_population = pd.merge(df_population, df_car_availability[["household_id", "car_availability"]]) + df_car_availability.loc[ + df_car_availability["number_of_vehicles"] + < df_car_availability["number_of_licenses"], + "car_availability", + ] = "some" + df_car_availability.loc[ + df_car_availability["number_of_vehicles"] == 0, "car_availability" + ] = "none" + df_car_availability["car_availability"] = df_car_availability[ + "car_availability" + ].astype("category") + + df_population = pd.merge( + df_population, df_car_availability[["household_id", "car_availability"]] + ) # Add bike availability df_population["bike_availability"] = "all" - df_population.loc[df_population["number_of_bikes"] < df_population["household_size"], "bike_availability"] = "some" - df_population.loc[df_population["number_of_bikes"] == 0, "bike_availability"] = "none" - df_population["bike_availability"] = df_population["bike_availability"].astype("category") - + df_population.loc[ + df_population["number_of_bikes"] < df_population["household_size"], + "bike_availability", + ] = "some" + df_population.loc[df_population["number_of_bikes"] == 0, "bike_availability"] = ( + "none" + ) + df_population["bike_availability"] = df_population["bike_availability"].astype( + "category" + ) + # Add age range for education df_population["age_range"] = "higher_education" - df_population.loc[df_population["age"]<=10,"age_range"] = "primary_school" - df_population.loc[df_population["age"].between(11,14),"age_range"] = "middle_school" - df_population.loc[df_population["age"].between(15,17),"age_range"] = "high_school" + df_population.loc[df_population["age"] <= 10, "age_range"] = "primary_school" + df_population.loc[df_population["age"].between(11, 14), "age_range"] = ( + "middle_school" + ) + df_population.loc[df_population["age"].between(15, 17), "age_range"] = "high_school" df_population["age_range"] = df_population["age_range"].astype("category") - + return df_population diff --git a/synthesis/population/income/bhepop2.py b/synthesis/population/income/bhepop2.py index 6aa6b7fb..17f3ae28 100644 --- a/synthesis/population/income/bhepop2.py +++ b/synthesis/population/income/bhepop2.py @@ -1,6 +1,9 @@ import numpy as np import pandas as pd -from synthesis.population.income.utils import income_uniform_sample, MAXIMUM_INCOME_FACTOR +from synthesis.population.income.utils import ( + income_uniform_sample, + MAXIMUM_INCOME_FACTOR, +) from bhepop2.tools import add_household_size_attribute, add_household_type_attribute from bhepop2.sources.marginal_distributions import QuantitativeMarginalDistributions from bhepop2.enrichment.bhepop2 import Bhepop2Enrichment @@ -55,15 +58,17 @@ def _sample_income(context, args): "Filosofi", attribute_selection=[ "size", # modalities: ["1_pers", "2_pers", "3_pers", "4_pers", "5_pers_or_more"] - "family_comp" # modalities: ["Single_man", "Single_wom", "Couple_without_child", "Couple_with_child", "Single_parent", "complex_hh"] + "family_comp", # modalities: ["Single_man", "Single_wom", "Couple_without_child", "Couple_with_child", "Single_parent", 
"complex_hh"] ], abs_minimum=0, relative_maximum=MAXIMUM_INCOME_FACTOR, - delta_min=1000 + delta_min=1000, ) # create enrichment class - enrich_class = Bhepop2Enrichment(df_selected, source, feature_name=INCOME_COLUMN, seed=random_seed) + enrich_class = Bhepop2Enrichment( + df_selected, source, feature_name=INCOME_COLUMN, seed=random_seed + ) # evaluate feature values on the population pop = enrich_class.assign_feature_values() @@ -84,7 +89,12 @@ def _sample_income(context, args): # get global distribution of the commune distrib_all = distribs[distribs["modality"] == "all"] assert len(distrib_all) == 1 - centiles = list(distrib_all[["D1", "D2", "D3", "D4", "D5", "D6", "D7", "D8", "D9"]].iloc[0].values / 12) + centiles = list( + distrib_all[["D1", "D2", "D3", "D4", "D5", "D6", "D7", "D8", "D9"]] + .iloc[0] + .values + / 12 + ) incomes = income_uniform_sample(random, centiles, len(df_selected)) @@ -102,29 +112,39 @@ def execute(context): df_population = add_household_size_attribute(df_population) df_population = add_household_type_attribute(df_population) - df_households = df_population[[ - "household_id", "consumption_units", "size", "family_comp" - ]].drop_duplicates("household_id") + df_households = df_population[ + ["household_id", "consumption_units", "size", "family_comp"] + ].drop_duplicates("household_id") - df_homes = context.stage("synthesis.population.spatial.home.zones")[[ - "household_id", "commune_id" - ]] + df_homes = context.stage("synthesis.population.spatial.home.zones")[ + ["household_id", "commune_id"] + ] df_households = pd.merge(df_households, df_homes) commune_ids = df_households["commune_id"].unique() - random_seeds = random.randint(10000, size = len(commune_ids)) + random_seeds = random.randint(10000, size=len(commune_ids)) # Perform sampling per commune - with context.progress(label = "Imputing income ...", total = len(commune_ids)) as progress: - with context.parallel(dict(households = df_households, income = df_income)) as parallel: - - for f, incomes, method in parallel.imap(_sample_income, zip(commune_ids, random_seeds)): - df_households.loc[f, "household_income"] = incomes * df_households.loc[f, "consumption_units"] + with context.progress( + label="Imputing income ...", total=len(commune_ids) + ) as progress: + with context.parallel( + dict(households=df_households, income=df_income) + ) as parallel: + + for f, incomes, method in parallel.imap( + _sample_income, zip(commune_ids, random_seeds) + ): + df_households.loc[f, "household_income"] = ( + incomes * df_households.loc[f, "consumption_units"] + ) df_households.loc[f, "method"] = method # Cleanup - df_households = df_households[["household_id", "household_income", "consumption_units"]] + df_households = df_households[ + ["household_id", "household_income", "consumption_units"] + ] assert len(df_households) == len(df_households["household_id"].unique()) return df_households diff --git a/synthesis/population/income/selected.py b/synthesis/population/income/selected.py index 24d9abc5..43395d57 100644 --- a/synthesis/population/income/selected.py +++ b/synthesis/population/income/selected.py @@ -1,14 +1,13 @@ - def configure(context): method = context.config("income_assignation_method", "uniform") if method == "uniform": - context.stage("synthesis.population.income.uniform", alias = "income") + context.stage("synthesis.population.income.uniform", alias="income") elif method == "bhepop2": - context.stage("synthesis.population.income.bhepop2", alias = "income") + 
context.stage("synthesis.population.income.bhepop2", alias="income") else: raise RuntimeError("Unknown income assignation method : %s" % method) + def execute(context): return context.stage("income") - diff --git a/synthesis/population/income/uniform.py b/synthesis/population/income/uniform.py index f3fdd758..918f2aaf 100644 --- a/synthesis/population/income/uniform.py +++ b/synthesis/population/income/uniform.py @@ -12,6 +12,7 @@ income distribution and a random income within the selected stratum is chosen. """ + def configure(context): context.stage("data.income.municipality") context.stage("synthesis.population.sampled") @@ -29,38 +30,56 @@ def _sample_income(context, args): f = df_households["commune_id"] == commune_id df_selected = df_households[f] - centiles = list(df_income[df_income["commune_id"] == commune_id][["q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9"]].iloc[0].values / 12) + centiles = list( + df_income[df_income["commune_id"] == commune_id][ + ["q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9"] + ] + .iloc[0] + .values + / 12 + ) incomes = income_uniform_sample(random, centiles, len(df_selected)) return f, incomes + def execute(context): random = np.random.RandomState(context.config("random_seed")) # Load data df_income = context.stage("data.income.municipality") - df_income = df_income[(df_income["attribute"] == "all") & (df_income["value"] == "all")] + df_income = df_income[ + (df_income["attribute"] == "all") & (df_income["value"] == "all") + ] - df_households = context.stage("synthesis.population.sampled")[[ - "household_id", "consumption_units" - ]].drop_duplicates("household_id") + df_households = context.stage("synthesis.population.sampled")[ + ["household_id", "consumption_units"] + ].drop_duplicates("household_id") - df_homes = context.stage("synthesis.population.spatial.home.zones")[[ - "household_id", "commune_id" - ]] + df_homes = context.stage("synthesis.population.spatial.home.zones")[ + ["household_id", "commune_id"] + ] df_households = pd.merge(df_households, df_homes) # Perform sampling per commune - with context.parallel(dict(households = df_households, income = df_income)) as parallel: + with context.parallel(dict(households=df_households, income=df_income)) as parallel: commune_ids = df_households["commune_id"].unique() - random_seeds = random.randint(10000, size = len(commune_ids)) + random_seeds = random.randint(10000, size=len(commune_ids)) - for f, incomes in context.progress(parallel.imap(_sample_income, zip(commune_ids, random_seeds)), label = "Imputing income ...", total = len(commune_ids)): - df_households.loc[f, "household_income"] = incomes * df_households.loc[f, "consumption_units"] + for f, incomes in context.progress( + parallel.imap(_sample_income, zip(commune_ids, random_seeds)), + label="Imputing income ...", + total=len(commune_ids), + ): + df_households.loc[f, "household_income"] = ( + incomes * df_households.loc[f, "consumption_units"] + ) # Cleanup - df_households = df_households[["household_id", "household_income", "consumption_units"]] + df_households = df_households[ + ["household_id", "household_income", "consumption_units"] + ] assert len(df_households) == len(df_households["household_id"].unique()) return df_households diff --git a/synthesis/population/income/utils.py b/synthesis/population/income/utils.py index b937417b..22a0ea98 100644 --- a/synthesis/population/income/utils.py +++ b/synthesis/population/income/utils.py @@ -23,6 +23,8 @@ def income_uniform_sample(random_state, deciles, size): indices = 
random_state.randint(10, size=size) lower_bounds, upper_bounds = deciles[indices], deciles[indices + 1] - incomes = lower_bounds + random_state.random_sample(size=size) * (upper_bounds - lower_bounds) + incomes = lower_bounds + random_state.random_sample(size=size) * ( + upper_bounds - lower_bounds + ) return incomes diff --git a/synthesis/population/matched.py b/synthesis/population/matched.py index 5ab5bed0..09022a6a 100644 --- a/synthesis/population/matched.py +++ b/synthesis/population/matched.py @@ -20,10 +20,14 @@ } DEFAULT_MATCHING_ATTRIBUTES = [ - "sex", "any_cars", "age_class", "socioprofessional_class", - "departement_id" + "sex", + "any_cars", + "age_class", + "socioprofessional_class", + "departement_id", ] + def configure(context): context.config("processes") context.config("random_seed") @@ -34,9 +38,10 @@ def configure(context): context.stage("synthesis.population.income.selected") hts = context.config("hts") - context.stage("data.hts.selected", alias = "hts") + context.stage("data.hts.selected", alias="hts") + -@numba.jit(nopython = True) # Already parallelized parallel = True) +@numba.jit(nopython=True) # Already parallelized parallel = True) def sample_indices(uniform, cdf, selected_indices): indices = np.arange(len(uniform)) @@ -45,7 +50,18 @@ def sample_indices(uniform, cdf, selected_indices): return selected_indices[indices] -def statistical_matching(progress, df_source, source_identifier, weight, df_target, target_identifier, columns, random_seed = 0, minimum_observations = 0): + +def statistical_matching( + progress, + df_source, + source_identifier, + weight, + df_target, + target_identifier, + columns, + random_seed=0, + minimum_observations=0, +): random = np.random.RandomState(random_seed) # Reduce data frames @@ -53,21 +69,27 @@ def statistical_matching(progress, df_source, source_identifier, weight, df_targ df_target = df_target[[target_identifier] + columns].copy() # Sort data frames - df_source = df_source.sort_values(by = columns) - df_target = df_target.sort_values(by = columns) + df_source = df_source.sort_values(by=columns) + df_target = df_target.sort_values(by=columns) # Find unique values for all columns unique_values = {} for column in columns: - unique_values[column] = list(sorted(set(df_source[column].unique()) | set(df_target[column].unique()))) + unique_values[column] = list( + sorted(set(df_source[column].unique()) | set(df_target[column].unique())) + ) # Generate filters for all columns and values source_filters, target_filters = {}, {} for column, column_unique_values in unique_values.items(): - source_filters[column] = [df_source[column].values == value for value in column_unique_values] - target_filters[column] = [df_target[column].values == value for value in column_unique_values] + source_filters[column] = [ + df_source[column].values == value for value in column_unique_values + ] + target_filters[column] = [ + df_target[column].values == value for value in column_unique_values + ] # Define search order source_filters = [source_filters[column] for column in columns] @@ -75,10 +97,10 @@ def statistical_matching(progress, df_source, source_identifier, weight, df_targ # Perform matching weights = df_source[weight].values - assigned_indices = np.ones((len(df_target),), dtype = int) * -1 - unassigned_mask = np.ones((len(df_target),), dtype = bool) - assigned_levels = np.ones((len(df_target),), dtype = int) * -1 - uniform = random.random_sample(size = (len(df_target),)) + assigned_indices = np.ones((len(df_target),), dtype=int) * -1 + 
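# Illustrative, standalone sketch (toy weights, not pipeline data) of the
# weight-proportional draw that sample_indices performs further below on the
# normalized cumulative weights: np.searchsorted on the cdf is equivalent (up to
# ties) to counting how many cdf entries lie below each uniform draw.
import numpy as np

random_example = np.random.RandomState(0)
weights_example = np.array([4.0, 1.0, 5.0])    # weights of the source observations
cdf_example = np.cumsum(weights_example)
cdf_example /= cdf_example[-1]                 # -> [0.4, 0.5, 1.0]

uniform_example = random_example.random_sample(size=10)   # one draw per target row
indices_example = np.searchsorted(cdf_example, uniform_example)
# each source index is drawn with probability proportional to its weight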
unassigned_mask = np.ones((len(df_target),), dtype=bool) + assigned_levels = np.ones((len(df_target),), dtype=int) * -1 + uniform = random.random_sample(size=(len(df_target),)) column_indices = [np.arange(len(unique_values[column])) for column in columns] @@ -87,8 +109,13 @@ def statistical_matching(progress, df_source, source_identifier, weight, df_targ if np.count_nonzero(unassigned_mask) > 0: for column_index in itertools.product(*level_column_indices): - f_source = np.logical_and.reduce([source_filters[i][k] for i, k in enumerate(column_index)]) - f_target = np.logical_and.reduce([target_filters[i][k] for i, k in enumerate(column_index)] + [unassigned_mask]) + f_source = np.logical_and.reduce( + [source_filters[i][k] for i, k in enumerate(column_index)] + ) + f_target = np.logical_and.reduce( + [target_filters[i][k] for i, k in enumerate(column_index)] + + [unassigned_mask] + ) selected_indices = np.nonzero(f_source)[0] requested_samples = np.count_nonzero(f_target) @@ -103,7 +130,9 @@ def statistical_matching(progress, df_source, source_identifier, weight, df_targ cdf = np.cumsum(selected_weights) cdf /= cdf[-1] - assigned_indices[f_target] = sample_indices(uniform[f_target], cdf, selected_indices) + assigned_indices[f_target] = sample_indices( + uniform[f_target], cdf, selected_indices + ) assigned_levels[f_target] = level unassigned_mask[f_target] = False @@ -113,13 +142,17 @@ def statistical_matching(progress, df_source, source_identifier, weight, df_targ cdf = np.cumsum(weights) cdf /= cdf[-1] - assigned_indices[unassigned_mask] = sample_indices(uniform[unassigned_mask], cdf, np.arange(len(weights))) + assigned_indices[unassigned_mask] = sample_indices( + uniform[unassigned_mask], cdf, np.arange(len(weights)) + ) assigned_levels[unassigned_mask] = 0 progress.update(np.count_nonzero(unassigned_mask)) if np.count_nonzero(unassigned_mask) > 0: - raise RuntimeError("Some target observations could not be matched. Minimum observations configured too high?") + raise RuntimeError( + "Some target observations could not be matched. Minimum observations configured too high?" 
+ ) assert np.count_nonzero(unassigned_mask) == 0 assert np.count_nonzero(assigned_indices == -1) == 0 @@ -130,6 +163,7 @@ def statistical_matching(progress, df_source, source_identifier, weight, df_targ return df_target, assigned_levels + def _run_parallel_statistical_matching(context, args): # Pass arguments df_target, random_seed = args @@ -142,28 +176,56 @@ def _run_parallel_statistical_matching(context, args): columns = context.data("columns") minimum_observations = context.data("minimum_observations") - return statistical_matching(context.progress, df_source, source_identifier, weight, df_target, target_identifier, columns, random_seed, minimum_observations) - -def parallel_statistical_matching(context, df_source, source_identifier, weight, df_target, target_identifier, columns, minimum_observations = 0): + return statistical_matching( + context.progress, + df_source, + source_identifier, + weight, + df_target, + target_identifier, + columns, + random_seed, + minimum_observations, + ) + + +def parallel_statistical_matching( + context, + df_source, + source_identifier, + weight, + df_target, + target_identifier, + columns, + minimum_observations=0, +): random_seed = context.config("random_seed") processes = context.config("processes") random = np.random.RandomState(random_seed) chunks = np.array_split(df_target, processes) - with context.progress(label = "Statistical matching ...", total = len(df_target)): - with context.parallel({ - "df_source": df_source, "source_identifier": source_identifier, "weight": weight, - "target_identifier": target_identifier, "columns": columns, - "minimum_observations": minimum_observations - }) as parallel: - random_seeds = random.randint(10000, size = len(chunks)) - results = parallel.map(_run_parallel_statistical_matching, zip(chunks, random_seeds)) + with context.progress(label="Statistical matching ...", total=len(df_target)): + with context.parallel( + { + "df_source": df_source, + "source_identifier": source_identifier, + "weight": weight, + "target_identifier": target_identifier, + "columns": columns, + "minimum_observations": minimum_observations, + } + ) as parallel: + random_seeds = random.randint(10000, size=len(chunks)) + results = parallel.map( + _run_parallel_statistical_matching, zip(chunks, random_seeds) + ) + + levels = np.hstack([r[1] for r in results]) + df_target = pd.concat([r[0] for r in results]) - levels = np.hstack([r[1] for r in results]) - df_target = pd.concat([r[0] for r in results]) + return df_target, levels - return df_target, levels def execute(context): hts = context.config("hts") @@ -178,18 +240,25 @@ def execute(context): try: default_index = columns.index("*default*") - columns[default_index:default_index + 1] = DEFAULT_MATCHING_ATTRIBUTES - except ValueError: pass + columns[default_index : default_index + 1] = DEFAULT_MATCHING_ATTRIBUTES + except ValueError: + pass # Define matching attributes AGE_BOUNDARIES = [14, 29, 44, 59, 74, 1000] if "age_class" in columns: - df_target["age_class"] = np.digitize(df_target["age"], AGE_BOUNDARIES, right = True) - df_source["age_class"] = np.digitize(df_source["age"], AGE_BOUNDARIES, right = True) + df_target["age_class"] = np.digitize( + df_target["age"], AGE_BOUNDARIES, right=True + ) + df_source["age_class"] = np.digitize( + df_source["age"], AGE_BOUNDARIES, right=True + ) if "income_class" in columns: - df_income = context.stage("synthesis.population.income.selected")[["household_id", "household_income"]] + df_income = context.stage("synthesis.population.income.selected")[ + 
["household_id", "household_income"] + ] df_target = pd.merge(df_target, df_income) df_target["income_class"] = INCOME_CLASS[hts](df_target) @@ -199,30 +268,47 @@ def execute(context): df_source["any_cars"] = df_source["number_of_vehicles"] > 0 # Perform statistical matching - df_source = df_source.rename(columns = { "person_id": "hts_id" }) + df_source = df_source.rename(columns={"person_id": "hts_id"}) for column in columns: if not column in df_source: - raise RuntimeError("Attribute not available in source (HTS) for matching: {}".format(column)) + raise RuntimeError( + "Attribute not available in source (HTS) for matching: {}".format( + column + ) + ) if not column in df_target: - raise RuntimeError("Attribute not available in target (census) for matching: {}".format(column)) + raise RuntimeError( + "Attribute not available in target (census) for matching: {}".format( + column + ) + ) df_assignment, levels = parallel_statistical_matching( context, - df_source, "hts_id", "person_weight", - df_target, "person_id", + df_source, + "hts_id", + "person_weight", + df_target, + "person_id", columns, - minimum_observations = context.config("matching_minimum_observations")) + minimum_observations=context.config("matching_minimum_observations"), + ) - df_target = pd.merge(df_target, df_assignment, on = "person_id") + df_target = pd.merge(df_target, df_assignment, on="person_id") assert len(df_target) == len(df_assignment) - context.set_info("matched_counts", { - count: np.count_nonzero(levels >= count) for count in range(len(columns) + 1) - }) + context.set_info( + "matched_counts", + {count: np.count_nonzero(levels >= count) for count in range(len(columns) + 1)}, + ) for count in range(len(columns) + 1): - print("%d matched levels:" % count, np.count_nonzero(levels >= count), "%.2f%%" % (100 * np.count_nonzero(levels >= count) / len(df_target),)) + print( + "%d matched levels:" % count, + np.count_nonzero(levels >= count), + "%.2f%%" % (100 * np.count_nonzero(levels >= count) / len(df_target),), + ) return df_target[["person_id", "hts_id"]] diff --git a/synthesis/population/projection/ipu.py b/synthesis/population/projection/ipu.py index 580ce007..e19bc82a 100644 --- a/synthesis/population/projection/ipu.py +++ b/synthesis/population/projection/ipu.py @@ -5,10 +5,12 @@ This stage reweights the census data set according to the projection data for a different year. """ + def configure(context): context.stage("data.census.cleaned") context.stage("data.census.projection") + def execute(context): df_census = context.stage("data.census.cleaned") projection = context.stage("data.census.projection") @@ -17,7 +19,9 @@ def execute(context): adjust_projection(projection) # Prepare indexing - df_households = df_census[["household_id", "household_size", "weight"]].drop_duplicates("household_id") + df_households = df_census[ + ["household_id", "household_size", "weight"] + ].drop_duplicates("household_id") df_households["household_index"] = np.arange(len(df_households)) df_census = pd.merge(df_census, df_households[["household_id", "household_index"]]) @@ -33,7 +37,11 @@ def execute(context): # Proccesing age ... 
df_marginal = projection["age"] - for index, row in context.progress(df_marginal.iterrows(), label = "Processing attribute: age", total = len(df_marginal)): + for index, row in context.progress( + df_marginal.iterrows(), + label="Processing attribute: age", + total=len(df_marginal), + ): f = df_census["age"] == row["age"] assert np.count_nonzero(f) > 0 @@ -42,10 +50,14 @@ def execute(context): attribute_membership.append(df_counts.index.values) attribute_counts.append(df_counts.values) attributes.append("age={}".format(row["age"])) - + # Processing sex ... df_marginal = projection["sex"] - for index, row in context.progress(df_marginal.iterrows(), label = "Processing attribute: sex", total = len(df_marginal)): + for index, row in context.progress( + df_marginal.iterrows(), + label="Processing attribute: sex", + total=len(df_marginal), + ): f = df_census["sex"] == row["sex"] f &= (df_census["age"] > 0) & (df_census["age"] <= 104) assert np.count_nonzero(f) > 0 @@ -58,7 +70,11 @@ def execute(context): # Processing age x sex ... df_marginal = projection["cross"] - for index, row in context.progress(df_marginal.iterrows(), label = "Processing attributes: sex x age", total = len(df_marginal)): + for index, row in context.progress( + df_marginal.iterrows(), + label="Processing attributes: sex x age", + total=len(df_marginal), + ): f = (df_census["sex"] == row["sex"]) & (df_census["age"] == row["age"]) assert np.count_nonzero(f) > 0 @@ -71,7 +87,7 @@ def execute(context): # Processing total ... f = (df_census["age"] > 0) & (df_census["age"] <= 104) assert np.count_nonzero(f) > 0 - + df_counts = df_census.loc[f, "household_index"].value_counts() attribute_targets.append(projection["total"]["projection"].values[0]) attribute_membership.append(df_counts.index.values) @@ -86,37 +102,49 @@ def execute(context): maximum_iterations = 100 for iteration in range(maximum_iterations): - factors = [] + factors = [] for k in np.arange(len(attributes)): selection = attribute_membership[k] - + target = attribute_targets[k] - current = np.sum(update[selection] * household_weights[selection] * attribute_counts[k]) - + current = np.sum( + update[selection] * household_weights[selection] * attribute_counts[k] + ) + factor = target / current factors.append(factor) - + update[selection] *= factor - print("IPU it={} min={} max={}".format(iteration, np.min(factors), np.max(factors))) + print( + "IPU it={} min={} max={}".format( + iteration, np.min(factors), np.max(factors) + ) + ) converged = np.abs(1 - np.max(factors)) < convergence_threshold converged &= np.abs(1 - np.min(factors)) < convergence_threshold - if converged: break + if converged: + break # Check that the applied factors in the last iteration are sufficiently small assert converged - print("IPF updates min={} max={} mean={}".format(np.min(update), np.max(update), np.mean(update))) + print( + "IPF updates min={} max={} mean={}".format( + np.min(update), np.max(update), np.mean(update) + ) + ) # Update the weights df_households["weight"] *= update - + return df_households[["household_id", "weight"]] + def adjust_projection(projection): # The projection data contains information on zero-year old persons. However, there is a big difference between the - # RP data and the projection, probably because RP is fixed to a certain reference date and not all of them are + # RP data and the projection, probably because RP is fixed to a certain reference date and not all of them are # registered. 
We, in particular, see that there is a large jump between 0 years and 1 years. # Therefore, we exclude the zero-year persons from the projection. This, however, means adapting all the marginals. # Also, exclude everything that is 105+ @@ -131,18 +159,16 @@ def adjust_projection(projection): if row["age"] == 0 or row["age"] == "105+": f_sex = df_sex["sex"] == row["sex"] - df_sex.loc[f_sex, "projection"] = df_sex.loc[f_sex, "projection"] - row["projection"] + df_sex.loc[f_sex, "projection"] = ( + df_sex.loc[f_sex, "projection"] - row["projection"] + ) df_total["projection"] = df_total["projection"] - row["projection"] - + projection["sex"] = df_sex projection["total"] = df_total # Remove zero old years from cross distribution - projection["cross"] = df_cross[ - (df_cross["age"] != 0) & (df_cross["age"] != "105+") - ] + projection["cross"] = df_cross[(df_cross["age"] != 0) & (df_cross["age"] != "105+")] # Remove zero old years from age distribution - projection["age"] = df_age[ - (df_age["age"] != 0) & (df_age["age"] != "105+") - ] + projection["age"] = df_age[(df_age["age"] != 0) & (df_age["age"] != "105+")] diff --git a/synthesis/population/projection/reweighted.py b/synthesis/population/projection/reweighted.py index 9863e6a3..5450c4e3 100644 --- a/synthesis/population/projection/reweighted.py +++ b/synthesis/population/projection/reweighted.py @@ -5,18 +5,20 @@ This stage reweights the census data set according to the projection data for a different year. """ + def configure(context): context.stage("data.census.filtered") context.stage("synthesis.population.projection.ipu") + def execute(context): df_census = context.stage("data.census.filtered") df_weights = context.stage("synthesis.population.projection.ipu") initial_size = len(df_census) - df_census = df_census.drop(columns = "weight") - df_census = pd.merge(df_census, df_weights, on = "household_id") + df_census = df_census.drop(columns="weight") + df_census = pd.merge(df_census, df_weights, on="household_id") final_size = len(df_census) assert initial_size == final_size diff --git a/synthesis/population/sampled.py b/synthesis/population/sampled.py index c4a33592..a2a7ae55 100644 --- a/synthesis/population/sampled.py +++ b/synthesis/population/sampled.py @@ -8,25 +8,31 @@ through the 'sampling_rate' configuration option. 
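For illustration (made-up weight, not additional functionality): a census
household with weight 2.3 is replicated twice with probability 0.7 and three
times with probability 0.3, so the expected number of copies equals its weight:

    import numpy as np

    random = np.random.RandomState(0)
    weight = 2.3
    base = int(np.floor(weight))
    copies = base + int(random.random_sample() <= weight - base)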
""" + def configure(context): if context.config("projection_year", None) is None: - context.stage("data.census.filtered", alias = "source") + context.stage("data.census.filtered", alias="source") else: - context.stage("synthesis.population.projection.reweighted", alias = "source") + context.stage("synthesis.population.projection.reweighted", alias="source") context.config("random_seed") context.config("sampling_rate") + def execute(context): - df_census = context.stage("source").sort_values(by = "household_id").copy() + df_census = context.stage("source").sort_values(by="household_id").copy() sampling_rate = context.config("sampling_rate") random = np.random.RandomState(context.config("random_seed")) # Perform stochastic rounding for the population (and scale weights) - df_rounding = df_census[["household_id", "weight", "household_size"]].drop_duplicates("household_id") + df_rounding = df_census[ + ["household_id", "weight", "household_size"] + ].drop_duplicates("household_id") df_rounding["multiplicator"] = np.floor(df_rounding["weight"]) - df_rounding["multiplicator"] += random.random_sample(len(df_rounding)) <= (df_rounding["weight"] - df_rounding["multiplicator"]) + df_rounding["multiplicator"] += random.random_sample(len(df_rounding)) <= ( + df_rounding["weight"] - df_rounding["multiplicator"] + ) df_rounding["multiplicator"] = df_rounding["multiplicator"].astype(int) # Multiply households (use same multiplicator for all household members) @@ -50,7 +56,9 @@ def execute(context): household_sizes = np.repeat(household_sizes, household_multiplicators) household_count = np.sum(household_multiplicators) - df_census.loc[:, "household_id"] = np.repeat(np.arange(household_count), household_sizes) + df_census.loc[:, "household_id"] = np.repeat( + np.arange(household_count), household_sizes + ) # Select sample from 100% population selector = random.random_sample(household_count) < sampling_rate diff --git a/synthesis/population/spatial/commute_distance.py b/synthesis/population/spatial/commute_distance.py index 49064d89..c9f0c2e6 100644 --- a/synthesis/population/spatial/commute_distance.py +++ b/synthesis/population/spatial/commute_distance.py @@ -1,28 +1,32 @@ import pandas as pd + def configure(context): context.stage("synthesis.population.enriched") context.stage("data.hts.commute_distance") + def execute(context): df_matching = context.stage("synthesis.population.enriched") df_commute_distance = context.stage("data.hts.commute_distance") df_work = pd.merge( df_matching[["person_id", "hts_id"]], - df_commute_distance["work"][["person_id", "commute_distance"]].rename(columns = dict(person_id = "hts_id")), - how = "left" + df_commute_distance["work"][["person_id", "commute_distance"]].rename( + columns=dict(person_id="hts_id") + ), + how="left", ) df_education = pd.merge( df_matching[["person_id", "hts_id"]], - df_commute_distance["education"][["person_id", "commute_distance"]].rename(columns = dict(person_id = "hts_id")), - how = "left" + df_commute_distance["education"][["person_id", "commute_distance"]].rename( + columns=dict(person_id="hts_id") + ), + how="left", ) assert len(df_work) == len(df_matching) assert len(df_education) == len(df_matching) - return dict( - work = df_work, education = df_education - ) + return dict(work=df_work, education=df_education) diff --git a/synthesis/population/spatial/home/locations.py b/synthesis/population/spatial/home/locations.py index 9347e5ec..0604941b 100644 --- a/synthesis/population/spatial/home/locations.py +++ 
b/synthesis/population/spatial/home/locations.py @@ -3,13 +3,15 @@ import pandas as pd import geopandas as gpd + def configure(context): context.stage("synthesis.population.spatial.home.zones") context.stage("synthesis.locations.home.locations") context.config("home_location_source", "addresses") - + context.config("random_seed") + def _sample_locations(context, args): # Extract data sets df_locations = context.data("df_locations") @@ -35,33 +37,39 @@ def _sample_locations(context, args): cdf = np.cumsum(df_locations["weight"].values) cdf /= cdf[-1] - indices = np.array([np.count_nonzero(cdf < u) - for u in random.random_sample(size = home_count)]) - + indices = np.array( + [np.count_nonzero(cdf < u) for u in random.random_sample(size=home_count)] + ) + # Apply selection df_homes["geometry"] = df_locations.iloc[indices]["geometry"].values df_homes["home_location_id"] = df_locations.iloc[indices]["home_location_id"].values - + # Update progress context.progress.update() - return gpd.GeoDataFrame(df_homes, crs = df_locations.crs) + return gpd.GeoDataFrame(df_homes, crs=df_locations.crs) + def execute(context): random = np.random.RandomState(context.config("random_seed")) df_homes = context.stage("synthesis.population.spatial.home.zones") df_locations = context.stage("synthesis.locations.home.locations") - + # Sample locations for home unique_iris_ids = sorted(set(df_homes["iris_id"].unique())) - with context.progress(label = "Sampling home locations ...", total = len(unique_iris_ids)): - with context.parallel(dict( - df_locations = df_locations, df_homes = df_homes - )) as parallel: - seeds = random.randint(10000, size = len(unique_iris_ids)) - df_homes = pd.concat(parallel.map(_sample_locations, zip(unique_iris_ids, seeds))) + with context.progress( + label="Sampling home locations ...", total=len(unique_iris_ids) + ): + with context.parallel( + dict(df_locations=df_locations, df_homes=df_homes) + ) as parallel: + seeds = random.randint(10000, size=len(unique_iris_ids)) + df_homes = pd.concat( + parallel.map(_sample_locations, zip(unique_iris_ids, seeds)) + ) out = ["household_id", "commune_id", "home_location_id", "geometry"] - + return df_homes[out] diff --git a/synthesis/population/spatial/home/zones.py b/synthesis/population/spatial/home/zones.py index 2964fdc5..4ae6da82 100644 --- a/synthesis/population/spatial/home/zones.py +++ b/synthesis/population/spatial/home/zones.py @@ -12,6 +12,7 @@ has less than 200 inhabitants to the second case. 
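As an illustrative sketch (made-up candidate zones and counts, not pipeline
data), the imputation below draws target zones proportionally to their
population with a single multinomial draw and repeats the candidate indices
accordingly:

    import numpy as np

    random = np.random.RandomState(0)
    population = np.array([100.0, 50.0, 850.0])   # candidate zone populations
    weights = population / population.sum()
    counts = random.multinomial(20, weights)      # 20 households to place
    indices = np.repeat(np.arange(len(weights)), counts)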
""" + def configure(context): context.stage("synthesis.population.sampled") @@ -21,66 +22,100 @@ def configure(context): context.config("random_seed") + def execute(context): random = np.random.RandomState(context.config("random_seed")) - df_households = context.stage("synthesis.population.sampled").drop_duplicates("household_id")[[ - "household_id", "commune_id", "iris_id", "departement_id" - ]].copy().set_index("household_id") + df_households = ( + context.stage("synthesis.population.sampled") + .drop_duplicates("household_id")[ + ["household_id", "commune_id", "iris_id", "departement_id"] + ] + .copy() + .set_index("household_id") + ) f_has_commune = df_households["commune_id"] != "undefined" f_has_iris = df_households["iris_id"] != "undefined" # Fix missing communes (we select from those without IRIS) - df_municipalities = context.stage("data.spatial.municipalities").set_index("commune_id") - df_municipalities["population"] = context.stage("data.spatial.population").groupby("commune_id")["population"].sum() + df_municipalities = context.stage("data.spatial.municipalities").set_index( + "commune_id" + ) + df_municipalities["population"] = ( + context.stage("data.spatial.population") + .groupby("commune_id")["population"] + .sum() + ) df_households["commune_id"] = df_households["commune_id"].cat.add_categories( - sorted(set(df_municipalities.index.unique()) - set(df_households["commune_id"].cat.categories))) + sorted( + set(df_municipalities.index.unique()) + - set(df_households["commune_id"].cat.categories) + ) + ) departements = df_households[~f_has_commune]["departement_id"].unique() - for departement_id in context.progress(departements, label = "Fixing missing communes ..."): + for departement_id in context.progress( + departements, label="Fixing missing communes ..." 
+ ): df_candidates = df_municipalities[ - ~df_municipalities["has_iris"] & - (df_municipalities["departement_id"].astype(str) == departement_id)] + ~df_municipalities["has_iris"] + & (df_municipalities["departement_id"].astype(str) == departement_id) + ] df_target = df_households[ - ~f_has_commune & - (df_households["departement_id"] == departement_id)].copy() + ~f_has_commune & (df_households["departement_id"] == departement_id) + ].copy() weights = df_candidates["population"].values.astype(float) weights /= np.sum(weights) - indices = np.repeat(np.arange(weights.shape[0]), random.multinomial(len(df_target), weights)) - df_target["commune_id"] = df_candidates.reset_index()["commune_id"].iloc[indices].values + indices = np.repeat( + np.arange(weights.shape[0]), random.multinomial(len(df_target), weights) + ) + df_target["commune_id"] = ( + df_candidates.reset_index()["commune_id"].iloc[indices].values + ) df_households.loc[df_target.index, "commune_id"] = df_target["commune_id"] # Fix missing IRIS (we select from those with <200 inhabitants) df_iris = context.stage("data.spatial.iris").set_index("iris_id") - df_iris["population"] = context.stage("data.spatial.population").set_index("iris_id")["population"] + df_iris["population"] = context.stage("data.spatial.population").set_index( + "iris_id" + )["population"] df_households["iris_id"] = df_households["iris_id"].cat.add_categories( - sorted(set(df_iris.index.unique()) - set(df_households["iris_id"].cat.categories))) + sorted( + set(df_iris.index.unique()) - set(df_households["iris_id"].cat.categories) + ) + ) communes = df_households[~f_has_iris & f_has_commune]["commune_id"].unique() - for commune_id in context.progress(communes, label = "Fixing missing IRIS ..."): + for commune_id in context.progress(communes, label="Fixing missing IRIS ..."): df_candidates = df_iris[ - (df_iris["population"] <= 200) & - (df_iris["commune_id"].astype(str) == commune_id)] + (df_iris["population"] <= 200) + & (df_iris["commune_id"].astype(str) == commune_id) + ] df_target = df_households[ - f_has_commune & ~f_has_iris & - (df_households["commune_id"] == commune_id)].copy() + f_has_commune & ~f_has_iris & (df_households["commune_id"] == commune_id) + ].copy() weights = df_candidates["population"].values.astype(float) - if (weights == 0.0).all(): weights += 1.0 + if (weights == 0.0).all(): + weights += 1.0 weights /= np.sum(weights) - indices = np.repeat(np.arange(weights.shape[0]), random.multinomial(len(df_target), weights)) - df_target["iris_id"] = df_candidates.reset_index()["iris_id"].iloc[indices].values + indices = np.repeat( + np.arange(weights.shape[0]), random.multinomial(len(df_target), weights) + ) + df_target["iris_id"] = ( + df_candidates.reset_index()["iris_id"].iloc[indices].values + ) df_households.loc[df_target.index, "iris_id"] = df_target["iris_id"] @@ -90,14 +125,20 @@ def execute(context): # Now there are some people left who don't have an IRIS, because the commune # is not covered in IRIS. Hence, we drive the commune-based IRIS for them. 
f = df_households["iris_id"] == "undefined" - df_households.loc[f, "iris_id"] = df_households.loc[f, "commune_id"].astype(str) + "0000" + df_households.loc[f, "iris_id"] = ( + df_households.loc[f, "commune_id"].astype(str) + "0000" + ) # Finally, make sure that we have no invalid codes - invalid_communes = set(df_households["commune_id"].unique()) - set(df_municipalities.index.unique()) + invalid_communes = set(df_households["commune_id"].unique()) - set( + df_municipalities.index.unique() + ) invalid_iris = set(df_households["iris_id"].unique()) - set(df_iris.index.unique()) assert len(invalid_communes) == 0 assert len(invalid_iris) == 0 assert np.count_nonzero(df_households["iris_id"] == "undefined") == 0 - return df_households.reset_index()[["household_id", "departement_id", "commune_id", "iris_id"]] + return df_households.reset_index()[ + ["household_id", "departement_id", "commune_id", "iris_id"] + ] diff --git a/synthesis/population/spatial/locations.py b/synthesis/population/spatial/locations.py index 2397e095..adc9bb2d 100644 --- a/synthesis/population/spatial/locations.py +++ b/synthesis/population/spatial/locations.py @@ -2,6 +2,7 @@ import geopandas as gpd import numpy as np + def configure(context): context.stage("synthesis.population.spatial.home.locations") context.stage("synthesis.population.spatial.primary.locations") @@ -11,57 +12,94 @@ def configure(context): context.stage("synthesis.population.sampled") context.stage("data.spatial.iris") + def execute(context): df_home = context.stage("synthesis.population.spatial.home.locations") - df_work, df_education = context.stage("synthesis.population.spatial.primary.locations") + df_work, df_education = context.stage( + "synthesis.population.spatial.primary.locations" + ) df_secondary = context.stage("synthesis.population.spatial.secondary.locations")[0] - df_persons = context.stage("synthesis.population.sampled")[["person_id", "household_id"]] - df_locations = context.stage("synthesis.population.activities")[["person_id", "activity_index", "purpose"]] + df_persons = context.stage("synthesis.population.sampled")[ + ["person_id", "household_id"] + ] + df_locations = context.stage("synthesis.population.activities")[ + ["person_id", "activity_index", "purpose"] + ] # Home locations df_home_locations = df_locations[df_locations["purpose"] == "home"] - df_home_locations = pd.merge(df_home_locations, df_persons, on = "person_id") - df_home_locations = pd.merge(df_home_locations, df_home[["household_id", "geometry"]], on = "household_id") + df_home_locations = pd.merge(df_home_locations, df_persons, on="person_id") + df_home_locations = pd.merge( + df_home_locations, df_home[["household_id", "geometry"]], on="household_id" + ) df_home_locations["location_id"] = -1 - df_home_locations = df_home_locations[["person_id", "activity_index", "location_id", "geometry"]] + df_home_locations = df_home_locations[ + ["person_id", "activity_index", "location_id", "geometry"] + ] # Work locations df_work_locations = df_locations[df_locations["purpose"] == "work"] - df_work_locations = pd.merge(df_work_locations, df_work[["person_id", "location_id", "geometry"]], on = "person_id") - df_work_locations = df_work_locations[["person_id", "activity_index", "location_id", "geometry"]] + df_work_locations = pd.merge( + df_work_locations, + df_work[["person_id", "location_id", "geometry"]], + on="person_id", + ) + df_work_locations = df_work_locations[ + ["person_id", "activity_index", "location_id", "geometry"] + ] assert not 
df_work_locations["geometry"].isna().any() # Education locations df_education_locations = df_locations[df_locations["purpose"] == "education"] - df_education_locations = pd.merge(df_education_locations, df_education[["person_id", "location_id", "geometry"]], on = "person_id") - df_education_locations = df_education_locations[["person_id", "activity_index", "location_id", "geometry"]] + df_education_locations = pd.merge( + df_education_locations, + df_education[["person_id", "location_id", "geometry"]], + on="person_id", + ) + df_education_locations = df_education_locations[ + ["person_id", "activity_index", "location_id", "geometry"] + ] assert not df_education_locations["geometry"].isna().any() # Secondary locations - df_secondary_locations = df_locations[~df_locations["purpose"].isin(("home", "work", "education"))].copy() - df_secondary_locations = pd.merge(df_secondary_locations, df_secondary[[ - "person_id", "activity_index", "location_id", "geometry" - ]], on = ["person_id", "activity_index"], how = "left") - df_secondary_locations = df_secondary_locations[["person_id", "activity_index", "location_id", "geometry"]] + df_secondary_locations = df_locations[ + ~df_locations["purpose"].isin(("home", "work", "education")) + ].copy() + df_secondary_locations = pd.merge( + df_secondary_locations, + df_secondary[["person_id", "activity_index", "location_id", "geometry"]], + on=["person_id", "activity_index"], + how="left", + ) + df_secondary_locations = df_secondary_locations[ + ["person_id", "activity_index", "location_id", "geometry"] + ] assert not df_secondary_locations["geometry"].isna().any() # Validation initial_count = len(df_locations) - df_locations = pd.concat([df_home_locations, df_work_locations, df_education_locations, df_secondary_locations]) + df_locations = pd.concat( + [ + df_home_locations, + df_work_locations, + df_education_locations, + df_secondary_locations, + ] + ) - df_locations = df_locations.sort_values(by = ["person_id", "activity_index"]) + df_locations = df_locations.sort_values(by=["person_id", "activity_index"]) final_count = len(df_locations) assert initial_count == final_count assert not df_locations["geometry"].isna().any() - df_locations = gpd.GeoDataFrame(df_locations, crs = df_home.crs) + df_locations = gpd.GeoDataFrame(df_locations, crs=df_home.crs) # add municipalities df_iris = context.stage("data.spatial.iris") - df_iris = gpd.GeoDataFrame(df_iris, crs = df_home.crs) + df_iris = gpd.GeoDataFrame(df_iris, crs=df_home.crs) - df_locations = gpd.sjoin(df_locations,df_iris,how="left") + df_locations = gpd.sjoin(df_locations, df_iris, how="left") return df_locations diff --git a/synthesis/population/spatial/primary/candidates.py b/synthesis/population/spatial/primary/candidates.py index 7af9963c..811bb5be 100644 --- a/synthesis/population/spatial/primary/candidates.py +++ b/synthesis/population/spatial/primary/candidates.py @@ -1,6 +1,7 @@ import pandas as pd import numpy as np + def configure(context): context.stage("data.od.weighted") @@ -15,11 +16,14 @@ def configure(context): context.config("random_seed") context.config("education_location_source", "bpe") + EDUCATION_MAPPING = { "primary_school": ["C1"], "middle_school": ["C2"], "high_school": ["C3"], - "higher_education": ["C4", "C5", "C6"]} + "higher_education": ["C4", "C5", "C6"], +} + def sample_destination_municipalities(context, arguments): # Load data @@ -37,6 +41,7 @@ def sample_destination_municipalities(context, arguments): context.progress.update() return df_od[["origin_id", "destination_id", 
"count"]] + def sample_locations(context, arguments): # Load data destination_id, random_seed = arguments @@ -45,7 +50,7 @@ def sample_locations(context, arguments): # Prepare state random = np.random.RandomState(random_seed) df_locations = df_locations[df_locations["commune_id"] == destination_id] - + # Determine demand df_flow = df_flow[df_flow["destination_id"] == destination_id] count = df_flow["count"].sum() @@ -55,40 +60,45 @@ def sample_locations(context, arguments): if "weight" in df_locations: weight = df_locations["weight"].values / df_locations["weight"].sum() - + location_counts = random.multinomial(count, weight) location_ids = df_locations["location_id"].values location_ids = np.repeat(location_ids, location_counts) - # Shuffle, as otherwise it is likely that *all* copies + # Shuffle, as otherwise it is likely that *all* copies # of the first location id go to the first origin, and so on random.shuffle(location_ids) # Construct a data set for all commutes to this zone origin_id = np.repeat(df_flow["origin_id"].values, df_flow["count"].values) - df_result = pd.DataFrame.from_records(dict( - origin_id = origin_id, - location_id = location_ids - )) + df_result = pd.DataFrame.from_records( + dict(origin_id=origin_id, location_id=location_ids) + ) df_result["destination_id"] = destination_id return df_result -def process(context, purpose, random, df_persons, df_od, df_locations,step_name): + +def process(context, purpose, random, df_persons, df_od, df_locations, step_name): df_persons = df_persons[df_persons["has_%s_trip" % purpose]] # Sample commute flows based on population - df_demand = df_persons.groupby("commune_id").size().reset_index(name = "count") + df_demand = df_persons.groupby("commune_id").size().reset_index(name="count") df_demand["random_seed"] = random.randint(0, int(1e6), len(df_demand)) df_demand = df_demand[["commune_id", "count", "random_seed"]] df_demand = df_demand[df_demand["count"] > 0] df_flow = [] - with context.progress(label = "Sampling %s municipalities" % step_name, total = len(df_demand)) as progress: - with context.parallel(dict(df_od = df_od)) as parallel: - for df_partial in parallel.imap_unordered(sample_destination_municipalities, df_demand.itertuples(index = False, name = None)): + with context.progress( + label="Sampling %s municipalities" % step_name, total=len(df_demand) + ) as progress: + with context.parallel(dict(df_od=df_od)) as parallel: + for df_partial in parallel.imap_unordered( + sample_destination_municipalities, + df_demand.itertuples(index=False, name=None), + ): df_flow.append(df_partial) df_flow = pd.concat(df_flow).sort_values(["origin_id", "destination_id"]) @@ -99,30 +109,45 @@ def process(context, purpose, random, df_persons, df_od, df_locations,step_name) df_result = [] - with context.progress(label = "Sampling %s destinations" % purpose, total = len(df_demand)) as progress: - with context.parallel(dict(df_locations = df_locations, df_flow = df_flow)) as parallel: - for df_partial in parallel.imap_unordered(sample_locations, zip(unique_ids, random_seeds)): + with context.progress( + label="Sampling %s destinations" % purpose, total=len(df_demand) + ) as progress: + with context.parallel( + dict(df_locations=df_locations, df_flow=df_flow) + ) as parallel: + for df_partial in parallel.imap_unordered( + sample_locations, zip(unique_ids, random_seeds) + ): df_result.append(df_partial) df_result = pd.concat(df_result).sort_values(["origin_id", "destination_id"]) return df_result[["origin_id", "destination_id", "location_id"]] + 
def execute(context): # Prepare population data - df_persons = context.stage("synthesis.population.enriched")[["person_id", "household_id", "age_range"]].copy() + df_persons = context.stage("synthesis.population.enriched")[ + ["person_id", "household_id", "age_range"] + ].copy() df_trips = context.stage("synthesis.population.trips") - df_persons["has_work_trip"] = df_persons["person_id"].isin(df_trips[ - (df_trips["following_purpose"] == "work") | (df_trips["preceding_purpose"] == "work") - ]["person_id"]) - - df_persons["has_education_trip"] = df_persons["person_id"].isin(df_trips[ - (df_trips["following_purpose"] == "education") | (df_trips["preceding_purpose"] == "education") - ]["person_id"]) + df_persons["has_work_trip"] = df_persons["person_id"].isin( + df_trips[ + (df_trips["following_purpose"] == "work") + | (df_trips["preceding_purpose"] == "work") + ]["person_id"] + ) + + df_persons["has_education_trip"] = df_persons["person_id"].isin( + df_trips[ + (df_trips["following_purpose"] == "education") + | (df_trips["preceding_purpose"] == "education") + ]["person_id"] + ) df_homes = context.stage("synthesis.population.spatial.home.zones") - df_persons = pd.merge(df_persons, df_homes, on = "household_id") + df_persons = pd.merge(df_persons, df_homes, on="household_id") # Prepare spatial data df_work_od, df_education_od = context.stage("data.od.weighted") @@ -132,27 +157,50 @@ def execute(context): df_locations = context.stage("synthesis.locations.work") df_locations["weight"] = df_locations["employees"] - df_work = process(context, "work", random, df_persons, - df_work_od, df_locations, "work" + df_work = process( + context, "work", random, df_persons, df_work_od, df_locations, "work" ) df_locations = context.stage("synthesis.locations.education") - if context.config("education_location_source") == 'bpe': - df_education = process(context, "education", random, df_persons, df_education_od, df_locations,"education") - else : + if context.config("education_location_source") == "bpe": + df_education = process( + context, + "education", + random, + df_persons, + df_education_od, + df_locations, + "education", + ) + else: df_education = [] for prefix, education_type in EDUCATION_MAPPING.items(): df_education.append( - process(context, "education", random, - df_persons[df_persons["age_range"]==prefix], - df_education_od[df_education_od["age_range"]==prefix],df_locations[df_locations["education_type"].isin(education_type)],prefix) + process( + context, + "education", + random, + df_persons[df_persons["age_range"] == prefix], + df_education_od[df_education_od["age_range"] == prefix], + df_locations[df_locations["education_type"].isin(education_type)], + prefix, + ) ) df_education = pd.concat(df_education) return dict( - work_candidates = df_work, - education_candidates = df_education, - persons = df_persons[df_persons["has_work_trip"] | df_persons["has_education_trip"]][[ - "person_id", "household_id", "age_range", "commune_id", "has_work_trip", "has_education_trip" - ]] + work_candidates=df_work, + education_candidates=df_education, + persons=df_persons[ + df_persons["has_work_trip"] | df_persons["has_education_trip"] + ][ + [ + "person_id", + "household_id", + "age_range", + "commune_id", + "has_work_trip", + "has_education_trip", + ] + ], ) diff --git a/synthesis/population/spatial/primary/locations.py b/synthesis/population/spatial/primary/locations.py index 136e18ac..92c1c5f1 100644 --- a/synthesis/population/spatial/primary/locations.py +++ 
b/synthesis/population/spatial/primary/locations.py @@ -3,6 +3,7 @@ import geopandas as gpd from .candidates import EDUCATION_MAPPING + def configure(context): context.stage("synthesis.population.spatial.primary.candidates") context.stage("synthesis.population.spatial.commute_distance") @@ -16,17 +17,20 @@ def configure(context): def define_distance_ordering(df_persons, df_candidates, progress): indices = [] - f_available = np.ones((len(df_candidates),), dtype = bool) + f_available = np.ones((len(df_candidates),), dtype=bool) costs = np.ones((len(df_candidates),)) * np.inf - commute_coordinates = np.vstack([ - df_candidates["geometry"].x.values, - df_candidates["geometry"].y.values - ]).T + commute_coordinates = np.vstack( + [df_candidates["geometry"].x.values, df_candidates["geometry"].y.values] + ).T - for home_coordinate, commute_distance in zip(df_persons["home_location"], df_persons["commute_distance"]): + for home_coordinate, commute_distance in zip( + df_persons["home_location"], df_persons["commute_distance"] + ): home_coordinate = np.array([home_coordinate.x, home_coordinate.y]) - distances = np.sqrt(np.sum((commute_coordinates[f_available] - home_coordinate)**2, axis = 1)) + distances = np.sqrt( + np.sum((commute_coordinates[f_available] - home_coordinate) ** 2, axis=1) + ) costs[f_available] = np.abs(distances - commute_distance) selected_index = np.argmin(costs) @@ -40,20 +44,25 @@ def define_distance_ordering(df_persons, df_candidates, progress): return indices + def define_random_ordering(df_persons, df_candidates, progress): progress.update(len(df_candidates)) return np.arange(len(df_candidates)) + define_ordering = define_distance_ordering + def process_municipality(context, origin_id): # Load data - df_candidates, df_persons = context.data("df_candidates"), context.data("df_persons") + df_candidates, df_persons = context.data("df_candidates"), context.data( + "df_persons" + ) # Find relevant records - df_persons = df_persons[df_persons["commune_id"] == origin_id][[ - "person_id", "home_location", "commute_distance" - ]].copy() + df_persons = df_persons[df_persons["commune_id"] == origin_id][ + ["person_id", "home_location", "commute_distance"] + ].copy() df_candidates = df_candidates[df_candidates["origin_id"] == origin_id] # From previous step, this should be equal! 
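# Illustration (not part of the patch): define_distance_ordering above performs a
# greedy matching. For each person in turn it measures the Euclidean distance from
# the home coordinate to every still-available candidate and keeps the candidate
# whose distance deviates least from the person's target commute distance, then
# removes it from the pool. A compact sketch of the same idea with made-up inputs:
import numpy as np

homes = np.array([[0.0, 0.0], [1.0, 0.0]])       # home coordinates (x, y)
targets = np.array([5.0, 2.0])                    # desired commute distances
candidates = np.array([[4.0, 3.0], [3.0, 0.0]])   # candidate destination coordinates

available = np.ones(len(candidates), dtype=bool)
order = []
for home, target in zip(homes, targets):
    distances = np.linalg.norm(candidates - home, axis=1)
    costs = np.where(available, np.abs(distances - target), np.inf)
    chosen = int(np.argmin(costs))   # best remaining match for this person
    order.append(chosen)
    available[chosen] = False        # every candidate is assigned exactly once
# order == [0, 1]: person 1 gets the location 5 units away, person 2 the one 2 units away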
@@ -63,22 +72,28 @@ def process_municipality(context, origin_id): df_candidates = df_candidates.iloc[indices] df_candidates["person_id"] = df_persons["person_id"].values - df_candidates = df_candidates.rename(columns = dict(destination_id = "commune_id")) + df_candidates = df_candidates.rename(columns=dict(destination_id="commune_id")) return df_candidates[["person_id", "commune_id", "location_id", "geometry"]] + def process(context, purpose, df_persons, df_candidates): unique_ids = df_candidates["origin_id"].unique() df_result = [] - with context.progress(label = "Distributing %s destinations" % purpose, total = len(df_persons)) as progress: - with context.parallel(dict(df_persons = df_persons, df_candidates = df_candidates)) as parallel: + with context.progress( + label="Distributing %s destinations" % purpose, total=len(df_persons) + ) as progress: + with context.parallel( + dict(df_persons=df_persons, df_candidates=df_candidates) + ) as parallel: for df_partial in parallel.imap_unordered(process_municipality, unique_ids): df_result.append(df_partial) return pd.concat(df_result).sort_index() + def execute(context): data = context.stage("synthesis.population.spatial.primary.candidates") df_persons = data["persons"] @@ -90,38 +105,69 @@ def execute(context): # Attach home locations df_home = context.stage("synthesis.population.spatial.home.locations") - df_work = pd.merge(df_work, df_home[["household_id", "geometry"]].rename(columns = { - "geometry": "home_location" - }), how = "left", on = "household_id") - - df_education = pd.merge(df_education, df_home[["household_id", "geometry"]].rename(columns = { - "geometry": "home_location" - }), how = "left", on = "household_id") + df_work = pd.merge( + df_work, + df_home[["household_id", "geometry"]].rename( + columns={"geometry": "home_location"} + ), + how="left", + on="household_id", + ) + + df_education = pd.merge( + df_education, + df_home[["household_id", "geometry"]].rename( + columns={"geometry": "home_location"} + ), + how="left", + on="household_id", + ) # Attach commute distances df_commute_distance = context.stage("synthesis.population.spatial.commute_distance") - df_work = pd.merge(df_work, df_commute_distance["work"], how = "left", on = "person_id") - df_education = pd.merge(df_education, df_commute_distance["education"], how = "left", on = "person_id") + df_work = pd.merge(df_work, df_commute_distance["work"], how="left", on="person_id") + df_education = pd.merge( + df_education, df_commute_distance["education"], how="left", on="person_id" + ) # Attach geometry - df_locations = context.stage("synthesis.locations.work")[["location_id", "geometry"]] + df_locations = context.stage("synthesis.locations.work")[ + ["location_id", "geometry"] + ] df_work_candidates = data["work_candidates"] - df_work_candidates = pd.merge(df_work_candidates, df_locations, how = "left", on = "location_id") + df_work_candidates = pd.merge( + df_work_candidates, df_locations, how="left", on="location_id" + ) df_work_candidates = gpd.GeoDataFrame(df_work_candidates) - df_locations = context.stage("synthesis.locations.education")[["education_type", "location_id", "geometry"]] + df_locations = context.stage("synthesis.locations.education")[ + ["education_type", "location_id", "geometry"] + ] df_education_candidates = data["education_candidates"] - df_education_candidates = pd.merge(df_education_candidates, df_locations, how = "left", on = "location_id") + df_education_candidates = pd.merge( + df_education_candidates, df_locations, how="left", on="location_id" 
+ ) df_education_candidates = gpd.GeoDataFrame(df_education_candidates) # Assign destinations df_work = process(context, "work", df_work, df_work_candidates) - if context.config("education_location_source") == 'bpe': - df_education = process(context, "education", df_education, df_education_candidates) - else : + if context.config("education_location_source") == "bpe": + df_education = process( + context, "education", df_education, df_education_candidates + ) + else: education = [] for prefix, education_type in EDUCATION_MAPPING.items(): - education.append(process(context, prefix,df_education[df_education["age_range"]==prefix],df_education_candidates[df_education_candidates["education_type"].isin(education_type)])) + education.append( + process( + context, + prefix, + df_education[df_education["age_range"] == prefix], + df_education_candidates[ + df_education_candidates["education_type"].isin(education_type) + ], + ) + ) df_education = pd.concat(education).sort_index() return df_work, df_education diff --git a/synthesis/population/spatial/secondary/components.py b/synthesis/population/spatial/secondary/components.py index d16a8877..49627650 100644 --- a/synthesis/population/spatial/secondary/components.py +++ b/synthesis/population/spatial/secondary/components.py @@ -2,9 +2,12 @@ import sklearn.neighbors import numpy as np + class CustomDistanceSampler(rda.FeasibleDistanceSampler): - def __init__(self, random, distributions, maximum_iterations = 1000): - rda.FeasibleDistanceSampler.__init__(self, random = random, maximum_iterations = maximum_iterations) + def __init__(self, random, distributions, maximum_iterations=1000): + rda.FeasibleDistanceSampler.__init__( + self, random=random, maximum_iterations=maximum_iterations + ) self.random = random self.distributions = distributions @@ -12,7 +15,9 @@ def __init__(self, random, distributions, maximum_iterations = 1000): def sample_distances(self, problem): distances = np.zeros((len(problem["modes"]))) - for index, (mode, travel_time) in enumerate(zip(problem["modes"], problem["travel_times"])): + for index, (mode, travel_time) in enumerate( + zip(problem["modes"], problem["travel_times"]) + ): mode_distribution = self.distributions[mode] bound_index = np.count_nonzero(travel_time > mode_distribution["bounds"]) @@ -24,6 +29,7 @@ def sample_distances(self, problem): return distances + class CandidateIndex: def __init__(self, data): self.data = data @@ -34,7 +40,9 @@ def __init__(self, data): self.indices[purpose] = sklearn.neighbors.KDTree(data["locations"]) def query(self, purpose, location): - index = self.indices[purpose].query(location.reshape(1, -1), return_distance = False)[0][0] + index = self.indices[purpose].query( + location.reshape(1, -1), return_distance=False + )[0][0] identifier = self.data[purpose]["identifiers"][index] location = self.data[purpose]["locations"][index] return identifier, location @@ -45,6 +53,7 @@ def sample(self, purpose, random): location = self.data[purpose]["locations"][index] return identifier, location + class CustomDiscretizationSolver(rda.DiscretizationSolver): def __init__(self, index): self.index = index @@ -62,9 +71,12 @@ def solve(self, problem, locations): assert len(discretized_locations) == problem["size"] return dict( - valid = True, locations = np.vstack(discretized_locations), identifiers = discretized_identifiers + valid=True, + locations=np.vstack(discretized_locations), + identifiers=discretized_identifiers, ) + class CustomFreeChainSolver(rda.RelaxationSolver): def __init__(self, random, 
index): self.random = random @@ -76,4 +88,4 @@ def solve(self, problem, distances): locations = np.vstack((anchor, locations)) assert len(locations) == len(distances) + 1 - return dict(valid = True, locations = locations) + return dict(valid=True, locations=locations) diff --git a/synthesis/population/spatial/secondary/distance_distributions.py b/synthesis/population/spatial/secondary/distance_distributions.py index 7fb7273b..fbf31424 100644 --- a/synthesis/population/spatial/secondary/distance_distributions.py +++ b/synthesis/population/spatial/secondary/distance_distributions.py @@ -1,8 +1,10 @@ import numpy as np import pandas as pd + def configure(context): - context.stage("data.hts.selected", alias = "hts") + context.stage("data.hts.selected", alias="hts") + def calculate_bounds(values, bin_size): values = np.sort(values) @@ -26,25 +28,44 @@ def calculate_bounds(values, bin_size): bounds[-1] = np.inf else: bounds.append(np.inf) - + return bounds + def execute(context): # Prepare data df_households, df_persons, df_trips = context.stage("hts") - df_trips = pd.merge(df_trips, df_persons[["person_id", "person_weight"]].rename(columns = { "person_weight": "weight" })) + df_trips = pd.merge( + df_trips, + df_persons[["person_id", "person_weight"]].rename( + columns={"person_weight": "weight"} + ), + ) df_trips["travel_time"] = df_trips["arrival_time"] - df_trips["departure_time"] - distance_column = "euclidean_distance" if "euclidean_distance" in df_trips else "routed_distance" - df = df_trips[["mode", "travel_time", distance_column, "weight", "preceding_purpose", "following_purpose"]].rename(columns = { distance_column: "distance" }) + distance_column = ( + "euclidean_distance" if "euclidean_distance" in df_trips else "routed_distance" + ) + df = df_trips[ + [ + "mode", + "travel_time", + distance_column, + "weight", + "preceding_purpose", + "following_purpose", + ] + ].rename(columns={distance_column: "distance"}) # Filtering primary_activities = ["home", "work", "education"] - df = df[~( - df["preceding_purpose"].isin(primary_activities) & - df["following_purpose"].isin(primary_activities) - )] + df = df[ + ~( + df["preceding_purpose"].isin(primary_activities) + & df["following_purpose"].isin(primary_activities) + ) + ] # Calculate distributions modes = df["mode"].unique() @@ -57,11 +78,13 @@ def execute(context): f_mode = df["mode"] == mode bounds = calculate_bounds(df[f_mode]["travel_time"].values, bin_size) - distributions[mode] = dict(bounds = np.array(bounds), distributions = []) + distributions[mode] = dict(bounds=np.array(bounds), distributions=[]) # Second, calculate distribution per band for lower_bound, upper_bound in zip([-np.inf] + bounds[:-1], bounds): - f_bound = (df["travel_time"] > lower_bound) & (df["travel_time"] <= upper_bound) + f_bound = (df["travel_time"] > lower_bound) & ( + df["travel_time"] <= upper_bound + ) # Set up distribution values = df[f_mode & f_bound]["distance"].values @@ -75,6 +98,8 @@ def execute(context): cdf /= cdf[-1] # Write distribution - distributions[mode]["distributions"].append(dict(cdf = cdf, values = values, weights = weights)) + distributions[mode]["distributions"].append( + dict(cdf=cdf, values=values, weights=weights) + ) return distributions diff --git a/synthesis/population/spatial/secondary/locations.py b/synthesis/population/spatial/secondary/locations.py index b36fb214..f6871e79 100644 --- a/synthesis/population/spatial/secondary/locations.py +++ b/synthesis/population/spatial/secondary/locations.py @@ -6,6 +6,7 @@ from 
synthesis.population.spatial.secondary.problems import find_assignment_problems + def configure(context): context.stage("synthesis.population.trips") @@ -21,72 +22,113 @@ def configure(context): context.config("secloc_maximum_iterations", np.inf) + def prepare_locations(context): # Load persons and their primary locations df_home = context.stage("synthesis.population.spatial.home.locations") - df_work, df_education = context.stage("synthesis.population.spatial.primary.locations") + df_work, df_education = context.stage( + "synthesis.population.spatial.primary.locations" + ) crs = df_home.crs - df_home = df_home.rename(columns = { "geometry": "home" }) - df_work = df_work.rename(columns = { "geometry": "work" }) - df_education = df_education.rename(columns = { "geometry": "education" }) + df_home = df_home.rename(columns={"geometry": "home"}) + df_work = df_work.rename(columns={"geometry": "work"}) + df_education = df_education.rename(columns={"geometry": "education"}) + + df_locations = context.stage("synthesis.population.sampled")[ + ["person_id", "household_id"] + ] + df_locations = pd.merge( + df_locations, df_home[["household_id", "home"]], how="left", on="household_id" + ) + df_locations = pd.merge( + df_locations, df_work[["person_id", "work"]], how="left", on="person_id" + ) + df_locations = pd.merge( + df_locations, + df_education[["person_id", "education"]], + how="left", + on="person_id", + ) - df_locations = context.stage("synthesis.population.sampled")[["person_id", "household_id"]] - df_locations = pd.merge(df_locations, df_home[["household_id", "home"]], how = "left", on = "household_id") - df_locations = pd.merge(df_locations, df_work[["person_id", "work"]], how = "left", on = "person_id") - df_locations = pd.merge(df_locations, df_education[["person_id", "education"]], how = "left", on = "person_id") + return ( + df_locations[["person_id", "home", "work", "education"]].sort_values( + by="person_id" + ), + crs, + ) - return df_locations[["person_id", "home", "work", "education"]].sort_values(by = "person_id"), crs def prepare_destinations(context): df_locations = context.stage("synthesis.locations.secondary") identifiers = df_locations["location_id"].values - locations = np.vstack(df_locations["geometry"].apply(lambda x: np.array([x.x, x.y])).values) + locations = np.vstack( + df_locations["geometry"].apply(lambda x: np.array([x.x, x.y])).values + ) data = {} for purpose in ("shop", "leisure", "other"): f = df_locations["offers_%s" % purpose].values - data[purpose] = dict( - identifiers = identifiers[f], - locations = locations[f] - ) + data[purpose] = dict(identifiers=identifiers[f], locations=locations[f]) return data + def resample_cdf(cdf, factor): if factor >= 0.0: cdf = cdf * (1.0 + factor * np.arange(1, len(cdf) + 1) / len(cdf)) else: - cdf = cdf * (1.0 + abs(factor) - abs(factor) * np.arange(1, len(cdf) + 1) / len(cdf)) + cdf = cdf * ( + 1.0 + abs(factor) - abs(factor) * np.arange(1, len(cdf) + 1) / len(cdf) + ) cdf /= cdf[-1] return cdf + def resample_distributions(distributions, factors): for mode, mode_distributions in distributions.items(): for distribution in mode_distributions["distributions"]: distribution["cdf"] = resample_cdf(distribution["cdf"], factors[mode]) -from synthesis.population.spatial.secondary.rda import AssignmentSolver, DiscretizationErrorObjective, GravityChainSolver, AngularTailSolver, GeneralRelaxationSolver -from synthesis.population.spatial.secondary.components import CustomDistanceSampler, CustomDiscretizationSolver, CandidateIndex, 
CustomFreeChainSolver + +from synthesis.population.spatial.secondary.rda import ( + AssignmentSolver, + DiscretizationErrorObjective, + GravityChainSolver, + AngularTailSolver, + GeneralRelaxationSolver, +) +from synthesis.population.spatial.secondary.components import ( + CustomDistanceSampler, + CustomDiscretizationSolver, + CandidateIndex, + CustomFreeChainSolver, +) + def execute(context): # Load trips and primary locations - df_trips = context.stage("synthesis.population.trips").sort_values(by = ["person_id", "trip_index"]) + df_trips = context.stage("synthesis.population.trips").sort_values( + by=["person_id", "trip_index"] + ) df_trips["travel_time"] = df_trips["arrival_time"] - df_trips["departure_time"] df_primary, crs = prepare_locations(context) # Prepare data - distance_distributions = context.stage("synthesis.population.spatial.secondary.distance_distributions") + distance_distributions = context.stage( + "synthesis.population.spatial.secondary.distance_distributions" + ) destinations = prepare_destinations(context) # Resampling for calibration - resample_distributions(distance_distributions, dict( - car = 0.0, car_passenger = 0.1, pt = 0.5, bike = 0.0, walk = -0.5 - )) + resample_distributions( + distance_distributions, + dict(car=0.0, car_passenger=0.1, pt=0.5, bike=0.0, walk=-0.5), + ) # Segment into subsamples processes = context.config("processes") @@ -96,108 +138,133 @@ def execute(context): unique_person_ids = np.array_split(unique_person_ids, processes) random = np.random.RandomState(context.config("random_seed")) - random_seeds = random.randint(10000, size = processes) + random_seeds = random.randint(10000, size=processes) # Create batch problems for parallelization batches = [] for index in range(processes): - batches.append(( - df_trips[df_trips["person_id"].isin(unique_person_ids[index])], - df_primary[df_primary["person_id"].isin(unique_person_ids[index])], - random_seeds[index], crs - )) + batches.append( + ( + df_trips[df_trips["person_id"].isin(unique_person_ids[index])], + df_primary[df_primary["person_id"].isin(unique_person_ids[index])], + random_seeds[index], + crs, + ) + ) # Run algorithm in parallel - with context.progress(label = "Assigning secondary locations to persons", total = number_of_persons): - with context.parallel(processes = processes, data = dict( - distance_distributions = distance_distributions, - destinations = destinations - )) as parallel: + with context.progress( + label="Assigning secondary locations to persons", total=number_of_persons + ): + with context.parallel( + processes=processes, + data=dict( + distance_distributions=distance_distributions, destinations=destinations + ), + ) as parallel: df_locations, df_convergence = [], [] - for df_locations_item, df_convergence_item in parallel.imap_unordered(process, batches): + for df_locations_item, df_convergence_item in parallel.imap_unordered( + process, batches + ): df_locations.append(df_locations_item) df_convergence.append(df_convergence_item) - df_locations = pd.concat(df_locations).sort_values(by = ["person_id", "activity_index"]) + df_locations = pd.concat(df_locations).sort_values( + by=["person_id", "activity_index"] + ) df_convergence = pd.concat(df_convergence) print("Success rate:", df_convergence["valid"].mean()) return df_locations, df_convergence + def process(context, arguments): - df_trips, df_primary, random_seed, crs = arguments - - # Set up RNG - random = np.random.RandomState(random_seed) - maximum_iterations = context.config("secloc_maximum_iterations") - - # Set 
up discretization solver - destinations = context.data("destinations") - candidate_index = CandidateIndex(destinations) - discretization_solver = CustomDiscretizationSolver(candidate_index) - - # Set up distance sampler - distance_distributions = context.data("distance_distributions") - distance_sampler = CustomDistanceSampler( - maximum_iterations = min(1000, maximum_iterations), - random = random, - distributions = distance_distributions) - - # Set up relaxation solver; currently, we do not consider tail problems. - chain_solver = GravityChainSolver( - random = random, eps = 10.0, lateral_deviation = 10.0, alpha = 0.1, - maximum_iterations = min(1000, maximum_iterations) + df_trips, df_primary, random_seed, crs = arguments + + # Set up RNG + random = np.random.RandomState(random_seed) + maximum_iterations = context.config("secloc_maximum_iterations") + + # Set up discretization solver + destinations = context.data("destinations") + candidate_index = CandidateIndex(destinations) + discretization_solver = CustomDiscretizationSolver(candidate_index) + + # Set up distance sampler + distance_distributions = context.data("distance_distributions") + distance_sampler = CustomDistanceSampler( + maximum_iterations=min(1000, maximum_iterations), + random=random, + distributions=distance_distributions, ) - tail_solver = AngularTailSolver(random = random) - free_solver = CustomFreeChainSolver(random, candidate_index) + # Set up relaxation solver; currently, we do not consider tail problems. + chain_solver = GravityChainSolver( + random=random, + eps=10.0, + lateral_deviation=10.0, + alpha=0.1, + maximum_iterations=min(1000, maximum_iterations), + ) + + tail_solver = AngularTailSolver(random=random) + free_solver = CustomFreeChainSolver(random, candidate_index) - relaxation_solver = GeneralRelaxationSolver(chain_solver, tail_solver, free_solver) + relaxation_solver = GeneralRelaxationSolver(chain_solver, tail_solver, free_solver) - # Set up assignment solver - thresholds = dict( - car = 200.0, car_passenger = 200.0, pt = 200.0, - bike = 100.0, walk = 100.0 - ) + # Set up assignment solver + thresholds = dict(car=200.0, car_passenger=200.0, pt=200.0, bike=100.0, walk=100.0) - assignment_objective = DiscretizationErrorObjective(thresholds = thresholds) - assignment_solver = AssignmentSolver( - distance_sampler = distance_sampler, - relaxation_solver = relaxation_solver, - discretization_solver = discretization_solver, - objective = assignment_objective, - maximum_iterations = min(20, maximum_iterations) - ) + assignment_objective = DiscretizationErrorObjective(thresholds=thresholds) + assignment_solver = AssignmentSolver( + distance_sampler=distance_sampler, + relaxation_solver=relaxation_solver, + discretization_solver=discretization_solver, + objective=assignment_objective, + maximum_iterations=min(20, maximum_iterations), + ) - df_locations = [] - df_convergence = [] + df_locations = [] + df_convergence = [] - last_person_id = None + last_person_id = None - for problem in find_assignment_problems(df_trips, df_primary): - result = assignment_solver.solve(problem) + for problem in find_assignment_problems(df_trips, df_primary): + result = assignment_solver.solve(problem) - starting_activity_index = problem["activity_index"] + starting_activity_index = problem["activity_index"] - for index, (identifier, location) in enumerate(zip(result["discretization"]["identifiers"], result["discretization"]["locations"])): - df_locations.append(( - problem["person_id"], starting_activity_index + index, identifier, 
geo.Point(location) - )) + for index, (identifier, location) in enumerate( + zip( + result["discretization"]["identifiers"], + result["discretization"]["locations"], + ) + ): + df_locations.append( + ( + problem["person_id"], + starting_activity_index + index, + identifier, + geo.Point(location), + ) + ) - df_convergence.append(( - result["valid"], problem["size"] - )) + df_convergence.append((result["valid"], problem["size"])) - if problem["person_id"] != last_person_id: - last_person_id = problem["person_id"] - context.progress.update() + if problem["person_id"] != last_person_id: + last_person_id = problem["person_id"] + context.progress.update() - df_locations = pd.DataFrame.from_records(df_locations, columns = ["person_id", "activity_index", "location_id", "geometry"]) - df_locations = gpd.GeoDataFrame(df_locations, crs = crs) - assert not df_locations["geometry"].isna().any() + df_locations = pd.DataFrame.from_records( + df_locations, columns=["person_id", "activity_index", "location_id", "geometry"] + ) + df_locations = gpd.GeoDataFrame(df_locations, crs=crs) + assert not df_locations["geometry"].isna().any() - df_convergence = pd.DataFrame.from_records(df_convergence, columns = ["valid", "size"]) - return df_locations, df_convergence + df_convergence = pd.DataFrame.from_records( + df_convergence, columns=["valid", "size"] + ) + return df_locations, df_convergence diff --git a/synthesis/population/spatial/secondary/problems.py b/synthesis/population/spatial/secondary/problems.py index b4f7295e..ae60266c 100644 --- a/synthesis/population/spatial/secondary/problems.py +++ b/synthesis/population/spatial/secondary/problems.py @@ -1,14 +1,29 @@ import numpy as np import pandas as pd -FIELDS = ["person_id", "trip_index", "preceding_purpose", "following_purpose", "mode", "travel_time"] +FIELDS = [ + "person_id", + "trip_index", + "preceding_purpose", + "following_purpose", + "mode", + "travel_time", +] FIXED_PURPOSES = ["home", "work", "education"] + def find_bare_assignment_problems(df): problem = None - for row in df[FIELDS].itertuples(index = False): - person_id, trip_index, preceding_purpose, following_purpose, mode, travel_time = row + for row in df[FIELDS].itertuples(index=False): + ( + person_id, + trip_index, + preceding_purpose, + following_purpose, + mode, + travel_time, + ) = row if not problem is None and person_id != problem["person_id"]: # We switch person, but we're still tracking a problem. This is a tail! 
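# Illustration (not part of the patch): for one person with the trip sequence
# home -> shop -> leisure -> work, the generator above yields a single "problem"
# spanning the two secondary activities between the fixed anchors. After the
# enrichment in find_assignment_problems below it would look roughly as follows
# (all values are made up; in the real data origin and destination are 1x2
# coordinate arrays taken from the person's home and work locations):
example_problem = {
    "person_id": 42,
    "trip_index": 0,                   # index of the first trip of the chain
    "purposes": ["shop", "leisure"],   # only the variable activities remain
    "modes": ["car", "car", "car"],    # one mode per trip of the chain
    "travel_times": [600.0, 300.0, 900.0],
    "size": 2,                         # number of secondary locations to place
    "origin": [[0.0, 0.0]],            # home coordinate (fixed anchor)
    "destination": [[1200.0, 800.0]],  # work coordinate (fixed anchor)
}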
@@ -18,8 +33,11 @@ def find_bare_assignment_problems(df): if problem is None: # Start a new problem problem = dict( - person_id = person_id, trip_index = trip_index, purposes = [preceding_purpose], - modes = [], travel_times = [] + person_id=person_id, + trip_index=trip_index, + purposes=[preceding_purpose], + modes=[], + travel_times=[], ) problem["purposes"].append(following_purpose) @@ -34,16 +52,18 @@ def find_bare_assignment_problems(df): if not problem is None: yield problem + LOCATION_FIELDS = ["person_id", "home", "work", "education"] + def find_assignment_problems(df, df_locations): """ - Enriches assignment problems with: - - Locations of the fixed activities - - Size of the problem - - Reduces purposes to the variable ones + Enriches assignment problems with: + - Locations of the fixed activities + - Size of the problem + - Reduces purposes to the variable ones """ - location_iterator = df_locations[LOCATION_FIELDS].itertuples(index = False) + location_iterator = df_locations[LOCATION_FIELDS].itertuples(index=False) current_location = None for problem in find_bare_assignment_problems(df): @@ -61,13 +81,13 @@ def find_assignment_problems(df, df_locations): problem["purposes"] = problem["purposes"][:-1] else: - pass # Neither chain nor tail + pass # Neither chain nor tail # Define size problem["size"] = len(problem["purposes"]) if problem["size"] == 0: - continue # We can skip if there are no variable activities + continue # We can skip if there are no variable activities # Advance location iterator until we arrive at the current problem's person while current_location is None or current_location[0] != problem["person_id"]: @@ -78,12 +98,18 @@ def find_assignment_problems(df, df_locations): problem["destination"] = None if origin_purpose in FIXED_PURPOSES: - problem["origin"] = current_location[LOCATION_FIELDS.index(origin_purpose)] # Shapely POINT + problem["origin"] = current_location[ + LOCATION_FIELDS.index(origin_purpose) + ] # Shapely POINT problem["origin"] = np.array([[problem["origin"].x, problem["origin"].y]]) if destination_purpose in FIXED_PURPOSES: - problem["destination"] = current_location[LOCATION_FIELDS.index(destination_purpose)] # Shapely POINT - problem["destination"] = np.array([[problem["destination"].x, problem["destination"].y]]) + problem["destination"] = current_location[ + LOCATION_FIELDS.index(destination_purpose) + ] # Shapely POINT + problem["destination"] = np.array( + [[problem["destination"].x, problem["destination"].y]] + ) if problem["origin"] is None: problem["activity_index"] = problem["trip_index"] diff --git a/synthesis/population/spatial/secondary/rda.py b/synthesis/population/spatial/secondary/rda.py index 232d1c86..3eaddc1c 100644 --- a/synthesis/population/spatial/secondary/rda.py +++ b/synthesis/population/spatial/secondary/rda.py @@ -1,10 +1,15 @@ import numpy as np import numpy.linalg as la -def check_feasibility(distances, direct_distance, consider_total_distance = True): - return calculate_feasibility(distances, direct_distance, consider_total_distance) == 0.0 -def calculate_feasibility(distances, direct_distance, consider_total_distance = True): +def check_feasibility(distances, direct_distance, consider_total_distance=True): + return ( + calculate_feasibility(distances, direct_distance, consider_total_distance) + == 0.0 + ) + + +def calculate_feasibility(distances, direct_distance, consider_total_distance=True): total_distance = np.sum(distances) delta_distance = 0.0 @@ -16,24 +21,38 @@ def calculate_feasibility(distances, 
direct_distance, consider_total_distance = return float(max(delta, 0)) + class DiscretizationSolver: def solve(self, problem, locations): raise NotImplementedError() + class RelaxationSolver: def solve(self, problem, distances): raise NotImplementedError() + class DistanceSampler: def sample(self, problem): raise NotImplementedError() + class AssignmentObjective: - def evaluate(self, problem, distance_result, relaxation_result, discretization_result): + def evaluate( + self, problem, distance_result, relaxation_result, discretization_result + ): raise NotImplementedError() + class AssignmentSolver: - def __init__(self, distance_sampler, relaxation_solver, discretization_solver, objective, maximum_iterations = 1000): + def __init__( + self, + distance_sampler, + relaxation_solver, + discretization_solver, + objective, + maximum_iterations=1000, + ): self.maximum_iterations = maximum_iterations self.relaxation_solver = relaxation_solver @@ -47,12 +66,21 @@ def solve(self, problem): for assignment_iteration in range(self.maximum_iterations): distance_result = self.distance_sampler.sample(problem) - relaxation_result = self.relaxation_solver.solve(problem, distance_result["distances"]) - discretization_result = self.discretization_solver.solve(problem, relaxation_result["locations"]) + relaxation_result = self.relaxation_solver.solve( + problem, distance_result["distances"] + ) + discretization_result = self.discretization_solver.solve( + problem, relaxation_result["locations"] + ) - assignment_result = self.objective.evaluate(problem, distance_result, relaxation_result, discretization_result) + assignment_result = self.objective.evaluate( + problem, distance_result, relaxation_result, discretization_result + ) - if best_result is None or assignment_result["objective"] < best_result["objective"]: + if ( + best_result is None + or assignment_result["objective"] < best_result["objective"] + ): best_result = assignment_result assignment_result["distance"] = distance_result @@ -65,8 +93,9 @@ def solve(self, problem): return best_result + class GeneralRelaxationSolver(RelaxationSolver): - def __init__(self, chain_solver, tail_solver = None, free_solver = None): + def __init__(self, chain_solver, tail_solver=None, free_solver=None): self.chain_solver = chain_solver self.tail_solver = tail_solver self.free_solver = free_solver @@ -81,6 +110,7 @@ def solve(self, problem, distances): else: return self.chain_solver.solve(problem, distances) + def sample_tail(random, anchor, distances): angles = random.random_sample(len(distances)) * 2.0 * np.pi offsets = np.vstack([np.cos(angles), np.sin(angles)]).T * distances[:, np.newaxis] @@ -92,6 +122,7 @@ def sample_tail(random, anchor, distances): return np.vstack(locations[1:]) + class AngularTailSolver(RelaxationSolver): def __init__(self, random): self.random = random @@ -111,26 +142,38 @@ def solve(self, problem, distances): raise RuntimeError("Invalid chain for AngularTailSolver") locations = sample_tail(self.random, anchor, distances) - if reverse: locations = locations[::-1,:] + if reverse: + locations = locations[::-1, :] assert len(locations) == len(distances) - return dict(valid = True, locations = locations) + return dict(valid=True, locations=locations) + class GravityChainSolver: - def __init__(self, random, alpha = 0.3, eps = 1.0, maximum_iterations = 1000, lateral_deviation = None): + def __init__( + self, + random, + alpha=0.3, + eps=1.0, + maximum_iterations=1000, + lateral_deviation=None, + ): self.alpha = 0.3 self.eps = 1e-2 
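# Note: the alpha and eps constructor arguments are shadowed by the hard-coded
# assignments above, so the solver always runs with alpha = 0.3 and eps = 1e-2;
# the values passed in secondary/locations.py (alpha=0.1, eps=10.0) currently
# have no effect.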
self.maximum_iterations = maximum_iterations self.random = random self.lateral_deviation = lateral_deviation - def solve_two_points(self, problem, origin, destination, distances, direction, direct_distance): + def solve_two_points( + self, problem, origin, destination, distances, direction, direct_distance + ): if direct_distance == 0.0: location = origin + direction * distances[0] return dict( - valid = distances[0] == distances[1], - locations = location.reshape(-1, 2), iterations = None + valid=distances[0] == distances[1], + locations=location.reshape(-1, 2), + iterations=None, ) elif direct_distance > np.sum(distances): @@ -141,9 +184,7 @@ def solve_two_points(self, problem, origin, destination, distances, direction, d location = origin + direction * ratio * direct_distance - return dict( - valid = False, locations = location.reshape(-1, 2), iterations = None - ) + return dict(valid=False, locations=location.reshape(-1, 2), iterations=None) elif direct_distance < np.abs(distances[0] - distances[1]): ratio = 1.0 @@ -154,24 +195,24 @@ def solve_two_points(self, problem, origin, destination, distances, direction, d maximum_distance = max(distances) location = origin + direction * ratio * maximum_distance - return dict( - valid = False, locations = location.reshape(-1, 2), iterations = None - ) + return dict(valid=False, locations=location.reshape(-1, 2), iterations=None) else: - A = 0.5 * ( distances[0]**2 - distances[1]**2 + direct_distance**2 ) / direct_distance - H = np.sqrt(max(0, distances[0]**2 - A**2)) + A = ( + 0.5 + * (distances[0] ** 2 - distances[1] ** 2 + direct_distance**2) + / direct_distance + ) + H = np.sqrt(max(0, distances[0] ** 2 - A**2)) r = self.random.random_sample() center = origin + direction * A offset = direction * H - offset = np.array([offset[0,1], -offset[0,0]]) + offset = np.array([offset[0, 1], -offset[0, 0]]) location = center + (1.0 if r < 0.5 else -1.0) * offset - return dict( - valid = True, locations = location.reshape(-1, 2), iterations = None - ) + return dict(valid=True, locations=location.reshape(-1, 2), iterations=None) def solve(self, problem, distances): origin, destination = problem["origin"], problem["destination"] @@ -182,21 +223,23 @@ def solve(self, problem, distances): # Prepare direction and normal direction direct_distance = la.norm(destination - origin) - if direct_distance < 1e-12: # We have a zero direct distance, choose a direction randomly + if ( + direct_distance < 1e-12 + ): # We have a zero direct distance, choose a direction randomly angle = self.random.random() * np.pi * 2.0 - direction = np.array([ - np.cos(angle), np.sin(angle) - ]).reshape((1, 2)) + direction = np.array([np.cos(angle), np.sin(angle)]).reshape((1, 2)) else: direction = (destination - origin) / direct_distance - normal = np.array([direction[0,1], -direction[0,0]]) + normal = np.array([direction[0, 1], -direction[0, 0]]) # If we have only one variable point, take a short cut if problem["size"] == 1: - return self.solve_two_points(problem, origin, destination, distances, direction, direct_distance) + return self.solve_two_points( + problem, origin, destination, distances, direction, direct_distance + ) # Prepare initial locations if np.sum(distances) < 1e-12: @@ -208,52 +251,76 @@ def solve(self, problem, distances): locations = np.vstack([origin, locations, destination]) if not check_feasibility(distances, direct_distance): - return dict( # We still return some locations although they may not be perfect - valid = False, locations = locations[1:-1], iterations = 
None + return ( + dict( # We still return some locations although they may not be perfect + valid=False, locations=locations[1:-1], iterations=None + ) ) # Add lateral devations - lateral_deviation = self.lateral_deviation if not self.lateral_deviation is None else max(direct_distance, 1.0) - locations[1:-1] += normal * 2.0 * (self.random.normal(size = len(distances) - 1)[:, np.newaxis] - 0.5) * lateral_deviation + lateral_deviation = ( + self.lateral_deviation + if not self.lateral_deviation is None + else max(direct_distance, 1.0) + ) + locations[1:-1] += ( + normal + * 2.0 + * (self.random.normal(size=len(distances) - 1)[:, np.newaxis] - 0.5) + * lateral_deviation + ) # Prepare gravity simulation valid = False origin_weights = np.ones((len(distances) - 1, 2)) - origin_weights[0,:] = 2.0 + origin_weights[0, :] = 2.0 destination_weights = np.ones((len(distances) - 1, 2)) - destination_weights[-1,:] = 2.0 + destination_weights[-1, :] = 2.0 # Run gravity simulation for k in range(self.maximum_iterations): directions = locations[:-1] - locations[1:] - lengths = la.norm(directions, axis = 1) + lengths = la.norm(directions, axis=1) offset = distances - lengths lengths[lengths < 1.0] = 1.0 directions /= lengths[:, np.newaxis] - if np.all(np.abs(offset) < self.eps): # Check if we have converged + if np.all(np.abs(offset) < self.eps): # Check if we have converged valid = True break # Apply adjustment to locations adjustment = np.zeros((len(distances) - 1, 2)) - adjustment -= 0.5 * self.alpha * offset[:-1, np.newaxis] * directions[:-1] * origin_weights - adjustment += 0.5 * self.alpha * offset[1:, np.newaxis] * directions[1:] * destination_weights + adjustment -= ( + 0.5 + * self.alpha + * offset[:-1, np.newaxis] + * directions[:-1] + * origin_weights + ) + adjustment += ( + 0.5 + * self.alpha + * offset[1:, np.newaxis] + * directions[1:] + * destination_weights + ) locations[1:-1] += adjustment if np.isnan(locations).any() or np.isinf(locations).any(): - raise RuntimeError("NaN/Inf value encountered during gravity simulation") + raise RuntimeError( + "NaN/Inf value encountered during gravity simulation" + ) + + return dict(valid=valid, locations=locations[1:-1], iterations=k) - return dict( - valid = valid, locations = locations[1:-1], iterations = k - ) class FeasibleDistanceSampler(DistanceSampler): - def __init__(self, random, maximum_iterations = 1000): + def __init__(self, random, maximum_iterations=1000): self.maximum_iterations = maximum_iterations self.random = random @@ -264,26 +331,26 @@ def sample_distances(self, problem): def sample(self, problem): origin, destination = problem["origin"], problem["destination"] - if origin is None and destination is None: # This is a free chain + if origin is None and destination is None: # This is a free chain distances = self.sample_distances(problem) - return dict(valid = True, distances = distances, iterations = None) + return dict(valid=True, distances=distances, iterations=None) - elif origin is None: # This is a left tail + elif origin is None: # This is a left tail distances = self.sample_distances(problem) - return dict(valid = True, distances = distances, iterations = None) + return dict(valid=True, distances=distances, iterations=None) - elif destination is None: # This is a right tail + elif destination is None: # This is a right tail distances = self.sample_distances(problem) - return dict(valid = True, distances = distances, iterations = None) + return dict(valid=True, distances=distances, iterations=None) - direct_distance = 
la.norm(destination - origin, axis = 1) + direct_distance = la.norm(destination - origin, axis=1) # One point and two trips if direct_distance < 1e-3 and problem["size"] == 1: distances = self.sample_distances(problem) distances = np.array([distances[0], distances[0]]) - return dict(valid = True, distances = distances, iterations = None) + return dict(valid=True, distances=distances, iterations=None) # This is the general case best_distances = None @@ -300,32 +367,35 @@ def sample(self, problem): if delta == 0.0: break - return dict( - valid = best_delta == 0.0, - distances = best_distances, - iterations = k - ) + return dict(valid=best_delta == 0.0, distances=best_distances, iterations=k) + class DiscretizationErrorObjective(AssignmentObjective): def __init__(self, thresholds): self.thresholds = thresholds - def evaluate(self, problem, distance_result, relaxation_result, discretization_result): + def evaluate( + self, problem, distance_result, relaxation_result, discretization_result + ): sampled_distances = distance_result["distances"] discretized_locations = [] - if not problem["origin"] is None: discretized_locations.append(problem["origin"]) + if not problem["origin"] is None: + discretized_locations.append(problem["origin"]) discretized_locations.append(discretization_result["locations"]) - if not problem["destination"] is None: discretized_locations.append(problem["destination"]) + if not problem["destination"] is None: + discretized_locations.append(problem["destination"]) discretized_locations = np.vstack(discretized_locations) - discretized_distances = la.norm(discretized_locations[:-1] - discretized_locations[1:], axis = 1) + discretized_distances = la.norm( + discretized_locations[:-1] - discretized_locations[1:], axis=1 + ) discretization_error = np.abs(sampled_distances - discretized_distances) objective = 0.0 for error, mode in zip(discretization_error, problem["modes"]): target_error = self.thresholds[mode] - excess_error = max(0.0, error - target_error ) + excess_error = max(0.0, error - target_error) objective = max(objective, excess_error) valid = objective == 0.0 @@ -333,4 +403,4 @@ def evaluate(self, problem, distance_result, relaxation_result, discretization_r valid &= relaxation_result["valid"] valid &= discretization_result["valid"] - return dict(valid = valid, objective = objective) + return dict(valid=valid, objective=objective) diff --git a/synthesis/population/trips.py b/synthesis/population/trips.py index 7a76af96..c4483743 100644 --- a/synthesis/population/trips.py +++ b/synthesis/population/trips.py @@ -7,12 +7,14 @@ This stage duplicates trips and attaches them to the synthetic population. 
""" + def configure(context): context.stage("synthesis.population.matched") context.config("random_seed") hts = context.config("hts") - context.stage("data.hts.selected", alias = "hts") + context.stage("data.hts.selected", alias="hts") + def execute(context): # Load data @@ -20,23 +22,39 @@ def execute(context): # Duplicate with synthetic persons df_matching = context.stage("synthesis.population.matched") - df_trips = df_trips.rename(columns = { "person_id": "hts_id" }) - df_trips = pd.merge(df_matching, df_trips, on = "hts_id") - df_trips = df_trips.sort_values(by = ["person_id", "trip_id"]) + df_trips = df_trips.rename(columns={"person_id": "hts_id"}) + df_trips = pd.merge(df_matching, df_trips, on="hts_id") + df_trips = df_trips.sort_values(by=["person_id", "trip_id"]) # Define trip index - df_count = df_trips.groupby("person_id").size().reset_index(name = "count") - df_trips["trip_index"] = np.hstack([np.arange(count) for count in df_count["count"].values]) - df_trips = df_trips.sort_values(by = ["person_id", "trip_index"]) + df_count = df_trips.groupby("person_id").size().reset_index(name="count") + df_trips["trip_index"] = np.hstack( + [np.arange(count) for count in df_count["count"].values] + ) + df_trips = df_trips.sort_values(by=["person_id", "trip_index"]) # Diversify departure times random = np.random.RandomState(context.config("random_seed")) - counts = df_trips[["person_id"]].groupby("person_id").size().reset_index(name = "count")["count"].values + counts = ( + df_trips[["person_id"]] + .groupby("person_id") + .size() + .reset_index(name="count")["count"] + .values + ) - interval = df_trips[["person_id", "departure_time"]].groupby("person_id").min().reset_index()["departure_time"].values - interval = np.minimum(1800.0, interval) # If first departure time is just 5min after midnight, we only add a deviation of 5min + interval = ( + df_trips[["person_id", "departure_time"]] + .groupby("person_id") + .min() + .reset_index()["departure_time"] + .values + ) + interval = np.minimum( + 1800.0, interval + ) # If first departure time is just 5min after midnight, we only add a deviation of 5min - offset = random.random_sample(size = (len(counts), )) * interval * 2.0 - interval + offset = random.random_sample(size=(len(counts),)) * interval * 2.0 - interval offset = np.repeat(offset, counts) df_trips["departure_time"] += offset @@ -47,11 +65,18 @@ def execute(context): assert (df_trips["departure_time"] >= 0.0).all() assert (df_trips["arrival_time"] >= 0.0).all() - return df_trips[[ - "person_id", "trip_index", - "departure_time", "arrival_time", - "preceding_purpose", "following_purpose", - "is_first_trip", "is_last_trip", - "trip_duration", "activity_duration", - "mode" - ]] + return df_trips[ + [ + "person_id", + "trip_index", + "departure_time", + "arrival_time", + "preceding_purpose", + "following_purpose", + "is_first_trip", + "is_last_trip", + "trip_duration", + "activity_duration", + "mode", + ] + ] diff --git a/synthesis/vehicles/cars/default.py b/synthesis/vehicles/cars/default.py index 1bf32836..0ebf92e4 100644 --- a/synthesis/vehicles/cars/default.py +++ b/synthesis/vehicles/cars/default.py @@ -5,20 +5,34 @@ Creates a vehicle fleet based on a default vehicle type """ + def configure(context): context.stage("synthesis.population.enriched") + def execute(context): df_persons = context.stage("synthesis.population.enriched") - df_vehicle_types = pd.DataFrame.from_records([{ - "type_id": "default_car", "nb_seats": 4, "length": 5.0, "width": 1.0, "pce": 1.0, "mode": "car", - 
"hbefa_cat": "PASSENGER_CAR", "hbefa_tech": "average", "hbefa_size": "average", "hbefa_emission": "average", - }]) + df_vehicle_types = pd.DataFrame.from_records( + [ + { + "type_id": "default_car", + "nb_seats": 4, + "length": 5.0, + "width": 1.0, + "pce": 1.0, + "mode": "car", + "hbefa_cat": "PASSENGER_CAR", + "hbefa_tech": "average", + "hbefa_size": "average", + "hbefa_emission": "average", + } + ] + ) df_vehicles = df_persons[["person_id"]].copy() - df_vehicles = df_vehicles.rename(columns = { "person_id": "owner_id" }) - + df_vehicles = df_vehicles.rename(columns={"person_id": "owner_id"}) + df_vehicles["mode"] = "car" df_vehicles["vehicle_id"] = df_vehicles["owner_id"].astype(str) + ":car" @@ -28,4 +42,4 @@ def execute(context): df_vehicles["age"] = 0 df_vehicles["euro"] = 6 - return df_vehicle_types, df_vehicles \ No newline at end of file + return df_vehicle_types, df_vehicles diff --git a/synthesis/vehicles/cars/fleet_sampling.py b/synthesis/vehicles/cars/fleet_sampling.py index dcd20a5a..269a2404 100644 --- a/synthesis/vehicles/cars/fleet_sampling.py +++ b/synthesis/vehicles/cars/fleet_sampling.py @@ -7,6 +7,7 @@ Creates the synthetic vehicle fleet """ + def configure(context): context.stage("synthesis.population.enriched") context.stage("synthesis.population.spatial.home.zones") @@ -15,21 +16,30 @@ def configure(context): context.config("vehicles_year", 2021) + def _sample_vehicle(context, args): vehicle = args year = context.config("vehicles_year") - df_vehicle_fleet_counts, df_vehicle_age_counts = context.data("fleet"), context.data("age") + df_vehicle_fleet_counts, df_vehicle_age_counts = context.data( + "fleet" + ), context.data("age") commune_id = vehicle["commune_id"] - if commune_id in df_vehicle_fleet_counts["commune_id"].unique(): - fleet = df_vehicle_fleet_counts.loc[df_vehicle_fleet_counts["commune_id"] == commune_id] + if commune_id in df_vehicle_fleet_counts["commune_id"].unique(): + fleet = df_vehicle_fleet_counts.loc[ + df_vehicle_fleet_counts["commune_id"] == commune_id + ] choice = fleet.sample(weights="fleet") critair = choice["critair"].values[0] technology = choice["technology"].values[0] - age_mask = (df_vehicle_age_counts["critair"] == critair) & (df_vehicle_age_counts["technology"] == technology) - age = df_vehicle_age_counts.loc[age_mask].sample(weights="fleet")["age"].values[0] + age_mask = (df_vehicle_age_counts["critair"] == critair) & ( + df_vehicle_age_counts["technology"] == technology + ) + age = ( + df_vehicle_age_counts.loc[age_mask].sample(weights="fleet")["age"].values[0] + ) else: choice = df_vehicle_age_counts.sample(weights="fleet") critair = choice["critair"].values[0] @@ -53,16 +63,24 @@ def _sample_vehicle(context, args): context.progress.update() return vehicle + def _get_euro_from_critair(vehicle, year): - critair = vehicle["critair"] # Crit'air 1, Crit'air 2, ..., Crit'air 5, Crit'air E, Non classée - technology = vehicle["technology"] # Gazole, Essence, Electrique et hydrogène, Essence hybride rechargeable, Gaz, Gazole hybride rechargeable - age = vehicle["age"] # 0 ans, 1 ans, ..., 19 ans, >20 ans + critair = vehicle[ + "critair" + ] # Crit'air 1, Crit'air 2, ..., Crit'air 5, Crit'air E, Non classée + technology = vehicle[ + "technology" + ] # Gazole, Essence, Electrique et hydrogène, Essence hybride rechargeable, Gaz, Gazole hybride rechargeable + age = vehicle["age"] # 0 ans, 1 ans, ..., 19 ans, >20 ans # we are using the following table : https://www.ecologie.gouv.fr/sites/default/files/Tableau_classification_des_vehicules.pdf - 
age_num = re.findall(r'\d+', age) + age_num = re.findall(r"\d+", age) if len(age_num) == 0: - raise RuntimeError("Badly formatted 'age' variable found for vehicle (id: %s) : %s" % (age, vehicle["vehicle_id"])) + raise RuntimeError( + "Badly formatted 'age' variable found for vehicle (id: %s) : %s" + % (age, vehicle["vehicle_id"]) + ) birthday = int(year) - int(age_num[0]) @@ -92,7 +110,7 @@ def _get_euro_from_critair(vehicle, year): if critair == "Crit'air 2" and technology == "Gazole": euro = max(euro, 5) # or 6 in table if critair == "Crit'air 3" and technology == "Essence": - euro = max(euro, 2) # or 3 in table + euro = max(euro, 2) # or 3 in table if critair == "Crit'air 3" and technology == "Gazole": euro = max(euro, 4) if critair == "Crit'air 4" and technology == "Gazole": @@ -103,14 +121,15 @@ def _get_euro_from_critair(vehicle, year): euro = max(euro, 1) euro = str(euro) - if euro == '6': + if euro == "6": if 2016 <= birthday < 2019: - euro = '6ab' + euro = "6ab" else: - euro = '6c' + euro = "6c" return euro + def execute(context): df_vehicle_types = context.stage("data.vehicles.types") @@ -118,11 +137,15 @@ def execute(context): df_persons = context.stage("synthesis.population.enriched") df_homes = context.stage("synthesis.population.spatial.home.zones") - df_vehicles = pd.merge(df_persons[["household_id", "person_id"]], df_homes[["household_id", "commune_id"]], on = "household_id") + df_vehicles = pd.merge( + df_persons[["household_id", "person_id"]], + df_homes[["household_id", "commune_id"]], + on="household_id", + ) - df_vehicles = df_vehicles.rename(columns = { "person_id": "owner_id" }) + df_vehicles = df_vehicles.rename(columns={"person_id": "owner_id"}) df_vehicles["vehicle_id"] = df_vehicles["owner_id"].astype(str) + ":car" - df_vehicles = df_vehicles.drop_duplicates("vehicle_id") # is this needed? + df_vehicles = df_vehicles.drop_duplicates("vehicle_id") # is this needed? 
df_vehicles["type_id"] = "default_car" df_vehicles["mode"] = "car" @@ -130,11 +153,17 @@ def execute(context): res = [] - with context.progress(label = "Processing vehicles data ...", total = len(df_vehicles)) as progress: - with context.parallel(dict(fleet = df_vehicle_fleet_counts, age = df_vehicle_age_counts)) as parallel: - for df_partial in parallel.imap(_sample_vehicle, df_vehicles.to_dict(orient="records")): + with context.progress( + label="Processing vehicles data ...", total=len(df_vehicles) + ) as progress: + with context.parallel( + dict(fleet=df_vehicle_fleet_counts, age=df_vehicle_age_counts) + ) as parallel: + for df_partial in parallel.imap( + _sample_vehicle, df_vehicles.to_dict(orient="records") + ): res.append(df_partial) df_vehicles = pd.DataFrame.from_dict(res) - return df_vehicle_types, df_vehicles \ No newline at end of file + return df_vehicle_types, df_vehicles diff --git a/synthesis/vehicles/passengers/default.py b/synthesis/vehicles/passengers/default.py index 6916f5bb..4ed9249f 100644 --- a/synthesis/vehicles/passengers/default.py +++ b/synthesis/vehicles/passengers/default.py @@ -5,20 +5,34 @@ Creates a vehicle fleet based on a default vehicle type for the dummy passenger mode """ + def configure(context): context.stage("synthesis.population.enriched") + def execute(context): df_persons = context.stage("synthesis.population.enriched") - df_vehicle_types = pd.DataFrame.from_records([{ - "type_id": "default_car_passenger", "nb_seats": 4, "length": 5.0, "width": 1.0, "pce": 1.0, "mode": "car_passenger", - "hbefa_cat": "PASSENGER_CAR", "hbefa_tech": "average", "hbefa_size": "average", "hbefa_emission": "average", - }]) + df_vehicle_types = pd.DataFrame.from_records( + [ + { + "type_id": "default_car_passenger", + "nb_seats": 4, + "length": 5.0, + "width": 1.0, + "pce": 1.0, + "mode": "car_passenger", + "hbefa_cat": "PASSENGER_CAR", + "hbefa_tech": "average", + "hbefa_size": "average", + "hbefa_emission": "average", + } + ] + ) df_vehicles = df_persons[["person_id"]].copy() - df_vehicles = df_vehicles.rename(columns = { "person_id": "owner_id" }) - + df_vehicles = df_vehicles.rename(columns={"person_id": "owner_id"}) + df_vehicles["mode"] = "car_passenger" df_vehicles["vehicle_id"] = df_vehicles["owner_id"].astype(str) + ":car_passenger" @@ -28,4 +42,4 @@ def execute(context): df_vehicles["age"] = 0 df_vehicles["euro"] = 6 - return df_vehicle_types, df_vehicles \ No newline at end of file + return df_vehicle_types, df_vehicles diff --git a/synthesis/vehicles/vehicles.py b/synthesis/vehicles/vehicles.py index 922cd36c..14cf552b 100644 --- a/synthesis/vehicles/vehicles.py +++ b/synthesis/vehicles/vehicles.py @@ -1,20 +1,24 @@ import pandas as pd + def configure(context): method = context.config("vehicles_method", "default") if method == "default": - context.stage("synthesis.vehicles.cars.default", alias = "cars") + context.stage("synthesis.vehicles.cars.default", alias="cars") elif method == "fleet_sample": - context.stage("synthesis.vehicles.cars.fleet_sampling", alias = "cars") + context.stage("synthesis.vehicles.cars.fleet_sampling", alias="cars") else: raise RuntimeError("Unknown vehicles generation method : %s" % method) - + context.stage("synthesis.vehicles.passengers.default") + def execute(context): df_car_types, df_cars = context.stage("cars") - df_passenger_types, df_passengers = context.stage("synthesis.vehicles.passengers.default") + df_passenger_types, df_passengers = context.stage( + "synthesis.vehicles.passengers.default" + ) df_vehicles = 
pd.concat([df_cars, df_passengers]) df_types = pd.concat([df_car_types, df_passenger_types]) diff --git a/tests/test_determinism.py b/tests/test_determinism.py index e2755d7a..233c7934 100644 --- a/tests/test_determinism.py +++ b/tests/test_determinism.py @@ -4,6 +4,7 @@ from . import testdata import sqlite3 + def hash_sqlite_db(path): """ Hash SQLite database file from its dump. @@ -37,6 +38,7 @@ def hash_file(file): f.close() return hash.hexdigest() + def test_determinism(tmpdir): data_path = str(tmpdir.mkdir("data")) testdata.create(data_path) @@ -44,51 +46,62 @@ def test_determinism(tmpdir): for index in range(2): _test_determinism(index, data_path, tmpdir) + def _test_determinism(index, data_path, tmpdir): print("Running index %d" % index) cache_path = str(tmpdir.mkdir("cache_%d" % index)) output_path = str(tmpdir.mkdir("output_%d" % index)) config = dict( - data_path = data_path, output_path = output_path, - regions = [10, 11], sampling_rate = 1.0, hts = "entd", - random_seed = 1000, processes = 1, - secloc_maximum_iterations = 10, - maven_skip_tests = True, - matching_attributes = [ - "sex", "any_cars", "age_class", "socioprofessional_class", - "income_class", "departement_id" - ] + data_path=data_path, + output_path=output_path, + regions=[10, 11], + sampling_rate=1.0, + hts="entd", + random_seed=1000, + processes=1, + secloc_maximum_iterations=10, + maven_skip_tests=True, + matching_attributes=[ + "sex", + "any_cars", + "age_class", + "socioprofessional_class", + "income_class", + "departement_id", + ], ) stages = [ - dict(descriptor = "synthesis.output"), + dict(descriptor="synthesis.output"), ] - synpp.run(stages, config, working_directory = cache_path) + synpp.run(stages, config, working_directory=cache_path) REFERENCE_CSV_HASHES = { - "ile_de_france_activities.csv": "53c44fb4026d2037729ee8ff1c8fb93f", - "ile_de_france_households.csv": "ca2a29ef13467326f937638f1ff8be1a", - "ile_de_france_persons.csv": "ddbe9b418c915b14e888b54efbdf9b1e", - "ile_de_france_trips.csv": "6c5f3427e41e683da768eeb53796a806", - "ile_de_france_vehicle_types.csv": "00bee1ea6d7bc9af43ae6c7101dd75da", - "ile_de_france_vehicles.csv": "3567b0f29e51d521b13d91c82c77cecb", + "ile_de_france_activities.csv": "53c44fb4026d2037729ee8ff1c8fb93f", + "ile_de_france_households.csv": "ca2a29ef13467326f937638f1ff8be1a", + "ile_de_france_persons.csv": "ddbe9b418c915b14e888b54efbdf9b1e", + "ile_de_france_trips.csv": "6c5f3427e41e683da768eeb53796a806", + "ile_de_france_vehicle_types.csv": "00bee1ea6d7bc9af43ae6c7101dd75da", + "ile_de_france_vehicles.csv": "3567b0f29e51d521b13d91c82c77cecb", } REFERENCE_GPKG_HASHES = { - "ile_de_france_activities.gpkg": "884eec1fd0c29904284eb4362ff89be1", - "ile_de_france_commutes.gpkg": "5a4180390a69349cc655c07c5671e8d3", - "ile_de_france_homes.gpkg": "a85e973f0e2f51031cd60170d351845e", - "ile_de_france_trips.gpkg": "d0aec4033cfc184bf1b91ae13a537ef8", + "ile_de_france_activities.gpkg": "884eec1fd0c29904284eb4362ff89be1", + "ile_de_france_commutes.gpkg": "5a4180390a69349cc655c07c5671e8d3", + "ile_de_france_homes.gpkg": "a85e973f0e2f51031cd60170d351845e", + "ile_de_france_trips.gpkg": "d0aec4033cfc184bf1b91ae13a537ef8", } generated_csv_hashes = { - file: hash_file("%s/%s" % (output_path, file)) for file in REFERENCE_CSV_HASHES.keys() + file: hash_file("%s/%s" % (output_path, file)) + for file in REFERENCE_CSV_HASHES.keys() } generated_gpkg_hashes = { - file: hash_sqlite_db("%s/%s" % (output_path, file)) for file in REFERENCE_GPKG_HASHES.keys() + file: hash_sqlite_db("%s/%s" % (output_path, 
file)) + for file in REFERENCE_GPKG_HASHES.keys() } print("Generated CSV hashes: ", generated_csv_hashes) @@ -100,6 +113,7 @@ def _test_determinism(index, data_path, tmpdir): for file in REFERENCE_GPKG_HASHES.keys(): assert REFERENCE_GPKG_HASHES[file] == generated_gpkg_hashes[file] + def test_determinism_matsim(tmpdir): data_path = str(tmpdir.mkdir("data")) testdata.create(data_path) @@ -107,36 +121,45 @@ def test_determinism_matsim(tmpdir): for index in range(2): _test_determinism_matsim(index, data_path, tmpdir) + def _test_determinism_matsim(index, data_path, tmpdir): print("Running index %d" % index) cache_path = str(tmpdir.mkdir("cache_%d" % index)) output_path = str(tmpdir.mkdir("output_%d" % index)) config = dict( - data_path = data_path, output_path = output_path, - regions = [10, 11], sampling_rate = 1.0, hts = "entd", - random_seed = 1000, processes = 1, - secloc_maximum_iterations = 10, - maven_skip_tests = True, - matching_attributes = [ - "sex", "any_cars", "age_class", "socioprofessional_class", - "income_class", "departement_id" - ] + data_path=data_path, + output_path=output_path, + regions=[10, 11], + sampling_rate=1.0, + hts="entd", + random_seed=1000, + processes=1, + secloc_maximum_iterations=10, + maven_skip_tests=True, + matching_attributes=[ + "sex", + "any_cars", + "age_class", + "socioprofessional_class", + "income_class", + "departement_id", + ], ) stages = [ - dict(descriptor = "matsim.output"), + dict(descriptor="matsim.output"), ] - synpp.run(stages, config, working_directory = cache_path) + synpp.run(stages, config, working_directory=cache_path) REFERENCE_HASHES = { - #"ile_de_france_population.xml.gz": "e1407f918cb92166ebf46ad769d8d085", - #"ile_de_france_network.xml.gz": "5f10ec295b49d2bb768451c812955794", - "ile_de_france_households.xml.gz": "64a0c9fab72aad51bc6adb926a1c9d44", - #"ile_de_france_facilities.xml.gz": "5ad41afff9ae5c470082510b943e6778", - "ile_de_france_config.xml": "30871dfbbd2b5bf6922be1dfe20ffe73", - "ile_de_france_vehicles.xml.gz": "d7c8d0dba531a21dc83355b2f82778c2" + # "ile_de_france_population.xml.gz": "e1407f918cb92166ebf46ad769d8d085", + # "ile_de_france_network.xml.gz": "5f10ec295b49d2bb768451c812955794", + "ile_de_france_households.xml.gz": "64a0c9fab72aad51bc6adb926a1c9d44", + # "ile_de_france_facilities.xml.gz": "5ad41afff9ae5c470082510b943e6778", + "ile_de_france_config.xml": "30871dfbbd2b5bf6922be1dfe20ffe73", + "ile_de_france_vehicles.xml.gz": "d7c8d0dba531a21dc83355b2f82778c2", } # activities.gpkg, trips.gpkg, meta.json, @@ -147,7 +170,8 @@ def _test_determinism_matsim(index, data_path, tmpdir): # detailed inspection of meta.json would make sense! generated_hashes = { - file: hash_file("%s/%s" % (output_path, file)) for file in REFERENCE_HASHES.keys() + file: hash_file("%s/%s" % (output_path, file)) + for file in REFERENCE_HASHES.keys() } print("Generated hashes: ", generated_hashes) diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index d9856f52..384f8242 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -4,6 +4,7 @@ from . 
import testdata import pandas as pd + def test_data(tmpdir): data_path = str(tmpdir.mkdir("data")) testdata.create(data_path) @@ -11,50 +12,55 @@ def test_data(tmpdir): cache_path = str(tmpdir.mkdir("cache")) output_path = str(tmpdir.mkdir("output")) config = dict( - data_path = data_path, output_path = output_path, - regions = [10, 11], hts = "entd") + data_path=data_path, output_path=output_path, regions=[10, 11], hts="entd" + ) stages = [ - dict(descriptor = "data.spatial.iris"), - dict(descriptor = "data.spatial.codes"), - dict(descriptor = "data.spatial.population"), - dict(descriptor = "data.bpe.cleaned"), - dict(descriptor = "data.income.municipality"), - dict(descriptor = "data.hts.entd.cleaned"), - dict(descriptor = "data.hts.egt.cleaned"), - dict(descriptor = "data.census.cleaned"), - dict(descriptor = "data.od.cleaned"), - dict(descriptor = "data.hts.output"), - dict(descriptor = "data.sirene.output"), + dict(descriptor="data.spatial.iris"), + dict(descriptor="data.spatial.codes"), + dict(descriptor="data.spatial.population"), + dict(descriptor="data.bpe.cleaned"), + dict(descriptor="data.income.municipality"), + dict(descriptor="data.hts.entd.cleaned"), + dict(descriptor="data.hts.egt.cleaned"), + dict(descriptor="data.census.cleaned"), + dict(descriptor="data.od.cleaned"), + dict(descriptor="data.hts.output"), + dict(descriptor="data.sirene.output"), ] - synpp.run(stages, config, working_directory = cache_path) + synpp.run(stages, config, working_directory=cache_path) assert os.path.isfile("%s/ile_de_france_hts_households.csv" % output_path) assert os.path.isfile("%s/ile_de_france_hts_persons.csv" % output_path) assert os.path.isfile("%s/ile_de_france_hts_trips.csv" % output_path) assert os.path.isfile("%s/ile_de_france_sirene.gpkg" % output_path) -def run_population(tmpdir, hts, update = {}): + +def run_population(tmpdir, hts, update={}): data_path = str(tmpdir.mkdir("data")) testdata.create(data_path) cache_path = str(tmpdir.mkdir("cache")) output_path = str(tmpdir.mkdir("output")) config = dict( - data_path = data_path, output_path = output_path, - regions = [10, 11], sampling_rate = 1.0, hts = hts, - random_seed = 1000, processes = 1, - secloc_maximum_iterations = 10, - maven_skip_tests = True + data_path=data_path, + output_path=output_path, + regions=[10, 11], + sampling_rate=1.0, + hts=hts, + random_seed=1000, + processes=1, + secloc_maximum_iterations=10, + maven_skip_tests=True, ) config.update(update) stages = [ - dict(descriptor = "synthesis.output"), + dict(descriptor="synthesis.output"), ] - synpp.run(stages, config, working_directory = cache_path) + synpp.run(stages, config, working_directory=cache_path) assert os.path.isfile("%s/ile_de_france_activities.csv" % output_path) assert os.path.isfile("%s/ile_de_france_persons.csv" % output_path) @@ -63,50 +69,94 @@ def run_population(tmpdir, hts, update = {}): assert os.path.isfile("%s/ile_de_france_trips.gpkg" % output_path) assert os.path.isfile("%s/ile_de_france_meta.json" % output_path) - assert 2235 == len(pd.read_csv("%s/ile_de_france_activities.csv" % output_path, usecols = ["household_id"], sep = ";")) - assert 447 == len(pd.read_csv("%s/ile_de_france_persons.csv" % output_path, usecols = ["household_id"], sep = ";")) - assert 149 == len(pd.read_csv("%s/ile_de_france_households.csv" % output_path, usecols = ["household_id"], sep = ";")) - - assert 447 * 2 == len(pd.read_csv("%s/ile_de_france_vehicles.csv" % output_path, usecols = ["vehicle_id"], sep = ";")) + assert 2235 == len( + pd.read_csv( + 
"%s/ile_de_france_activities.csv" % output_path, + usecols=["household_id"], + sep=";", + ) + ) + assert 447 == len( + pd.read_csv( + "%s/ile_de_france_persons.csv" % output_path, + usecols=["household_id"], + sep=";", + ) + ) + assert 149 == len( + pd.read_csv( + "%s/ile_de_france_households.csv" % output_path, + usecols=["household_id"], + sep=";", + ) + ) + + assert 447 * 2 == len( + pd.read_csv( + "%s/ile_de_france_vehicles.csv" % output_path, + usecols=["vehicle_id"], + sep=";", + ) + ) if "vehicles_method" in update and update["vehicles_method"] == "fleet_sample": - assert 17 + 1 == len(pd.read_csv("%s/ile_de_france_vehicle_types.csv" % output_path, usecols = ["type_id"], sep = ";")) + assert 17 + 1 == len( + pd.read_csv( + "%s/ile_de_france_vehicle_types.csv" % output_path, + usecols=["type_id"], + sep=";", + ) + ) else: - assert 2 == len(pd.read_csv("%s/ile_de_france_vehicle_types.csv" % output_path, usecols = ["type_id"], sep = ";")) + assert 2 == len( + pd.read_csv( + "%s/ile_de_france_vehicle_types.csv" % output_path, + usecols=["type_id"], + sep=";", + ) + ) + def test_population_with_entd(tmpdir): run_population(tmpdir, "entd") + def test_population_with_egt(tmpdir): run_population(tmpdir, "egt") + def test_population_with_mode_choice(tmpdir): - run_population(tmpdir, "entd", { "mode_choice": True }) + run_population(tmpdir, "entd", {"mode_choice": True}) + def test_population_with_fleet_sample(tmpdir): - run_population(tmpdir, "entd", { - "vehicles_method": "fleet_sample", - "vehicles_year": 2021 - }) + run_population( + tmpdir, "entd", {"vehicles_method": "fleet_sample", "vehicles_year": 2021} + ) + def test_population_with_bhepop2_income(tmpdir): - run_population(tmpdir, "egt", { - "income_assignation_method": "bhepop2" - }) + run_population(tmpdir, "egt", {"income_assignation_method": "bhepop2"}) + def test_population_with_urban_type(tmpdir): - run_population(tmpdir, "entd", { - "use_urban_type": True, - "matching_attributes": [ - "urban_type", "*default*" - ], - "matching_minimum_observations": 5 - }) + run_population( + tmpdir, + "entd", + { + "use_urban_type": True, + "matching_attributes": ["urban_type", "*default*"], + "matching_minimum_observations": 5, + }, + ) + def test_population_with_urban_type_and_egt(tmpdir): - run_population(tmpdir, "egt", { - "use_urban_type": True, - "matching_attributes": [ - "urban_type", "*default*" - ], - "matching_minimum_observations": 5 - }) + run_population( + tmpdir, + "egt", + { + "use_urban_type": True, + "matching_attributes": ["urban_type", "*default*"], + "matching_minimum_observations": 5, + }, + ) diff --git a/tests/test_simulation.py b/tests/test_simulation.py index e31d6be9..baf1a2bc 100644 --- a/tests/test_simulation.py +++ b/tests/test_simulation.py @@ -3,6 +3,7 @@ import hashlib from . 
import testdata + def test_simulation(tmpdir): data_path = str(tmpdir.mkdir("data")) testdata.create(data_path) @@ -11,18 +12,20 @@ def test_simulation(tmpdir): output_path = str(tmpdir.mkdir("output")) config = dict( - data_path = data_path, output_path = output_path, - regions = [10, 11], sampling_rate = 1.0, hts = "entd", - random_seed = 1000, processes = 1, - secloc_maximum_iterations = 10, - maven_skip_tests = True + data_path=data_path, + output_path=output_path, + regions=[10, 11], + sampling_rate=1.0, + hts="entd", + random_seed=1000, + processes=1, + secloc_maximum_iterations=10, + maven_skip_tests=True, ) - stages = [ - dict(descriptor = "matsim.output") - ] + stages = [dict(descriptor="matsim.output")] - synpp.run(stages, config, working_directory = cache_path) + synpp.run(stages, config, working_directory=cache_path) assert os.path.isfile("%s/ile_de_france_population.xml.gz" % output_path) assert os.path.isfile("%s/ile_de_france_network.xml.gz" % output_path) diff --git a/tests/testdata.py b/tests/testdata.py index 6e75f71d..1ef1fae5 100644 --- a/tests/testdata.py +++ b/tests/testdata.py @@ -7,6 +7,7 @@ import glob import subprocess + def create(output_path): """ This script creates test fixtures for the Île-de-France / France pipeline. @@ -76,23 +77,27 @@ def create(output_path): print("Creating zoning system ...") df = [] - WITH_IRIS = set([ - "1B013", "1B014", "1B018", "1B019", - "2D007", "2D008", "2D012", "2D013" - ]) + WITH_IRIS = set( + ["1B013", "1B014", "1B018", "1B019", "2D007", "2D008", "2D012", "2D013"] + ) for region_column in np.arange(2): region_prefix = region_column + 1 - region_number = region_prefix * 10 # TODO: This means we will have 10 and 20, but the unit tests define 10 and 11 (so only 10 is used -> TODO) + region_number = ( + region_prefix * 10 + ) # TODO: This means we will have 10 and 20, but the unit tests define 10 and 11 (so only 10 is used -> TODO) region_x = anchor_x + region_column * REGION_LENGTH region_y = anchor_y + 0 for department_row in np.arange(2): for department_column in np.arange(2): - department_letter = { (0, 0): "A", (0, 1): "B", (1, 0): "C", (1, 1): "D" }[( - department_row, department_column - )] + department_letter = { + (0, 0): "A", + (0, 1): "B", + (1, 0): "C", + (1, 1): "D", + }[(department_row, department_column)] department_name = "%d%s" % (region_prefix, department_letter) @@ -100,13 +105,20 @@ def create(output_path): department_y = region_y - department_row * DEPARTMENT_LENGTH for municipality_index in np.arange(25): - municipality_name = "%s%03d" % (department_name, municipality_index + 1) + municipality_name = "%s%03d" % ( + department_name, + municipality_index + 1, + ) municipality_row = municipality_index // 5 municipality_column = municipality_index % 5 - municipality_x = department_x + municipality_column * MUNICIPALITY_LENGTH - municipality_y = department_y - municipality_row * MUNICIPALITY_LENGTH + municipality_x = ( + department_x + municipality_column * MUNICIPALITY_LENGTH + ) + municipality_y = ( + department_y - municipality_row * MUNICIPALITY_LENGTH + ) if municipality_name in WITH_IRIS: for iris_index in np.arange(100): @@ -118,48 +130,61 @@ def create(output_path): iris_x = municipality_x + iris_column * IRIS_LENGTH iris_y = municipality_y - iris_row * IRIS_LENGTH - iris_polygon = geo.Polygon([ - (iris_x, iris_y), (iris_x + IRIS_LENGTH, iris_y), - (iris_x + IRIS_LENGTH, iris_y - IRIS_LENGTH), - (iris_x, iris_y - IRIS_LENGTH) - ]) - - df.append(dict( - region = region_number, - department = department_name, 
- municipality = municipality_name, - iris = iris_name, - geometry = iris_polygon - )) + iris_polygon = geo.Polygon( + [ + (iris_x, iris_y), + (iris_x + IRIS_LENGTH, iris_y), + (iris_x + IRIS_LENGTH, iris_y - IRIS_LENGTH), + (iris_x, iris_y - IRIS_LENGTH), + ] + ) + + df.append( + dict( + region=region_number, + department=department_name, + municipality=municipality_name, + iris=iris_name, + geometry=iris_polygon, + ) + ) else: - municipality_polygon = geo.Polygon([ - (municipality_x, municipality_y), (municipality_x + MUNICIPALITY_LENGTH, municipality_y), - (municipality_x + MUNICIPALITY_LENGTH, municipality_y - MUNICIPALITY_LENGTH), - (municipality_x, municipality_y - MUNICIPALITY_LENGTH) - ]) + municipality_polygon = geo.Polygon( + [ + (municipality_x, municipality_y), + (municipality_x + MUNICIPALITY_LENGTH, municipality_y), + ( + municipality_x + MUNICIPALITY_LENGTH, + municipality_y - MUNICIPALITY_LENGTH, + ), + (municipality_x, municipality_y - MUNICIPALITY_LENGTH), + ] + ) iris_name = "%s0000" % municipality_name - df.append(dict( - region = region_number, - department = department_name, - municipality = municipality_name, - iris = iris_name, - geometry = municipality_polygon - )) + df.append( + dict( + region=region_number, + department=department_name, + municipality=municipality_name, + iris=iris_name, + geometry=municipality_polygon, + ) + ) df = pd.DataFrame.from_records(df) - df = gpd.GeoDataFrame(df, crs = "EPSG:2154") - + df = gpd.GeoDataFrame(df, crs="EPSG:2154") + # Dataset: IRIS zones # Required attributes: CODE_IRIS, INSEE_COM, geometry print("Creating IRIS zones ...") df_iris = df.copy() - df_iris = df_iris[["iris", "municipality", "geometry"]].rename(columns = dict( - iris = "CODE_IRIS", municipality = "INSEE_COM" - )) + df_iris = df_iris[["iris", "municipality", "geometry"]].rename( + columns=dict(iris="CODE_IRIS", municipality="INSEE_COM") + ) os.mkdir("%s/iris_2021" % output_path) df_iris.to_file("%s/iris_2021/CONTOURS-IRIS.shp" % output_path) @@ -174,17 +199,20 @@ def create(output_path): print("Creating codes ...") df_codes = df.copy() - df_codes = df_codes[["iris", "municipality", "department", "region"]].rename(columns = dict( - iris = "CODE_IRIS", municipality = "DEPCOM", department = "DEP", region = "REG" - )) + df_codes = df_codes[["iris", "municipality", "department", "region"]].rename( + columns=dict( + iris="CODE_IRIS", municipality="DEPCOM", department="DEP", region="REG" + ) + ) os.mkdir("%s/codes_2021" % output_path) - with zipfile.ZipFile("%s/codes_2021/reference_IRIS_geo2021.zip" % output_path, "w") as archive: + with zipfile.ZipFile( + "%s/codes_2021/reference_IRIS_geo2021.zip" % output_path, "w" + ) as archive: with archive.open("reference_IRIS_geo2021.xlsx", "w") as f: df_codes.to_excel( - f, sheet_name = "Emboitements_IRIS", - startrow = 5, index = False + f, sheet_name="Emboitements_IRIS", startrow=5, index=False ) # Dataset: Aggregate census @@ -192,20 +220,22 @@ def create(output_path): print("Creating aggregate census ...") df_population = df.copy() - df_population = df_population[["iris", "municipality", "department", "region"]].rename(columns = dict( - iris = "IRIS", municipality = "COM", department = "DEP", region = "REG" - )) + df_population = df_population[ + ["iris", "municipality", "department", "region"] + ].rename( + columns=dict(iris="IRIS", municipality="COM", department="DEP", region="REG") + ) # Set all population to fixed number df_population["P19_POP"] = 120.0 os.mkdir("%s/rp_2019" % output_path) - with 
zipfile.ZipFile("%s/rp_2019/base-ic-evol-struct-pop-2019.zip" % output_path, "w") as archive: + with zipfile.ZipFile( + "%s/rp_2019/base-ic-evol-struct-pop-2019.zip" % output_path, "w" + ) as archive: with archive.open("base-ic-evol-struct-pop-2019.xlsx", "w") as f: - df_population.to_excel( - f, sheet_name = "IRIS", startrow = 5, index = False - ) + df_population.to_excel(f, sheet_name="IRIS", startrow=5, index=False) # Dataset: BPE # Required attributes: DCIRIS, LAMBERT_X, LAMBERT_Y, TYPEQU, DEPCOM, DEP @@ -215,13 +245,15 @@ def create(output_path): observations = BPE_OBSERVATIONS categories = np.array(["A", "B", "C", "D", "E", "F", "G"]) - df_selection = df.iloc[random.randint(0, len(df), size = observations)].copy() + df_selection = df.iloc[random.randint(0, len(df), size=observations)].copy() df_selection["DCIRIS"] = df_selection["iris"] df_selection["DEPCOM"] = df_selection["municipality"] df_selection["DEP"] = df_selection["department"] df_selection["LAMBERT_X"] = df_selection["geometry"].centroid.x df_selection["LAMBERT_Y"] = df_selection["geometry"].centroid.y - df_selection["TYPEQU"] = categories[random.randint(0, len(categories), size = len(df_selection))] + df_selection["TYPEQU"] = categories[ + random.randint(0, len(categories), size=len(df_selection)) + ] # Deliberately set coordinates for some to NaN df_selection.iloc[-10:, df_selection.columns.get_loc("LAMBERT_X")] = np.nan @@ -231,10 +263,11 @@ def create(output_path): os.mkdir("%s/bpe_2021" % output_path) - with zipfile.ZipFile("%s/bpe_2021/bpe21_ensemble_xy_csv.zip" % output_path, "w") as archive: + with zipfile.ZipFile( + "%s/bpe_2021/bpe21_ensemble_xy_csv.zip" % output_path, "w" + ) as archive: with archive.open("bpe21_ensemble_xy.csv", "w") as f: - df_selection[columns].to_csv(f, - sep = ";", index = False) + df_selection[columns].to_csv(f, sep=";", index=False) # Dataset: Tax data # Required attributes: CODGEO, D115, ..., D915 @@ -244,23 +277,80 @@ def create(output_path): filosofi_year = "19" income_data = { "househod_size": [ - {"name": "1_pers", "sheet": "TAILLEM_1", "col_pattern": "TME1", "data": [9820,13380,15730,18140,20060,22050,24710,28120,34150]}, - {"name": "2_pers", "sheet": "TAILLEM_2", "col_pattern": "TME2", "data": [12950,16840,19920,22660,25390,28500,32080,37030,45910]}, - {"name": "3_pers", "sheet": "TAILLEM_3", "col_pattern": "TME3", "data": [11440,14850,18070,21040,23960,27190,30930,36130,45680]}, - {"name": "4_pers", "sheet": "TAILLEM_4", "col_pattern": "TME4", "data": [11920,15720,19130,22440,25540,28750,32400,37520,46870]}, - {"name": "5_pers_or_more", "sheet": "TAILLEM_5", "col_pattern": "TME5", "data": [9320,11510,13580,16180,19920,24570,29180,35460,46370]}, + { + "name": "1_pers", + "sheet": "TAILLEM_1", + "col_pattern": "TME1", + "data": [9820, 13380, 15730, 18140, 20060, 22050, 24710, 28120, 34150], + }, + { + "name": "2_pers", + "sheet": "TAILLEM_2", + "col_pattern": "TME2", + "data": [12950, 16840, 19920, 22660, 25390, 28500, 32080, 37030, 45910], + }, + { + "name": "3_pers", + "sheet": "TAILLEM_3", + "col_pattern": "TME3", + "data": [11440, 14850, 18070, 21040, 23960, 27190, 30930, 36130, 45680], + }, + { + "name": "4_pers", + "sheet": "TAILLEM_4", + "col_pattern": "TME4", + "data": [11920, 15720, 19130, 22440, 25540, 28750, 32400, 37520, 46870], + }, + { + "name": "5_pers_or_more", + "sheet": "TAILLEM_5", + "col_pattern": "TME5", + "data": [9320, 11510, 13580, 16180, 19920, 24570, 29180, 35460, 46370], + }, ], "family_comp": [ - {"name": "Single_man", "sheet": "TYPMENR_1", "col_pattern": 
"TYM1", "data": [9180,12830,15100,17740,19800,21890,24780,28290,34850]}, - {"name": "Single_wom", "sheet": "TYPMENR_2", "col_pattern": "TYM2", "data": [10730,13730,16220,18420,20260,22160,24680,27990,33570]}, - {"name": "Couple_without_child", "sheet": "TYPMENR_3", "col_pattern": "TYM3", "data": [15360,19560,22600,25260,27990,30980,34710,39640,49110]}, - {"name": "Couple_with_child", "sheet": "TYPMENR_4", "col_pattern": "TYM4", "data": [11790,15540,19240,22670,25850,29180,33090,38570,48700]}, - {"name": "Single_parent", "sheet": "TYPMENR_5", "col_pattern": "TYM5", "data": [9350,11150,12830,14660,16640,18760,21230,24700,31170]}, - {"name": "complex_hh", "sheet": "TYPMENR_6", "col_pattern": "TYM6", "data": [9280,11850,14100,16740,19510,22480,26100,30640,38970]}, - ] + { + "name": "Single_man", + "sheet": "TYPMENR_1", + "col_pattern": "TYM1", + "data": [9180, 12830, 15100, 17740, 19800, 21890, 24780, 28290, 34850], + }, + { + "name": "Single_wom", + "sheet": "TYPMENR_2", + "col_pattern": "TYM2", + "data": [10730, 13730, 16220, 18420, 20260, 22160, 24680, 27990, 33570], + }, + { + "name": "Couple_without_child", + "sheet": "TYPMENR_3", + "col_pattern": "TYM3", + "data": [15360, 19560, 22600, 25260, 27990, 30980, 34710, 39640, 49110], + }, + { + "name": "Couple_with_child", + "sheet": "TYPMENR_4", + "col_pattern": "TYM4", + "data": [11790, 15540, 19240, 22670, 25850, 29180, 33090, 38570, 48700], + }, + { + "name": "Single_parent", + "sheet": "TYPMENR_5", + "col_pattern": "TYM5", + "data": [9350, 11150, 12830, 14660, 16640, 18760, 21230, 24700, 31170], + }, + { + "name": "complex_hh", + "sheet": "TYPMENR_6", + "col_pattern": "TYM6", + "data": [9280, 11850, 14100, 16740, 19510, 22480, 26100, 30640, 38970], + }, + ], } - df_income = df.drop_duplicates("municipality")[["municipality"]].rename(columns = dict(municipality = "CODGEO")) + df_income = df.drop_duplicates("municipality")[["municipality"]].rename( + columns=dict(municipality="CODGEO") + ) df_income_ensemble = df_income.copy() @@ -276,9 +366,9 @@ def create(output_path): df_income_ensemble["D919"] = 32303.0 # Deliberately remove some of them - df_income_ensemble = df_income_ensemble[~df_income_ensemble["CODGEO"].isin([ - "1A015", "1A016" - ])] + df_income_ensemble = df_income_ensemble[ + ~df_income_ensemble["CODGEO"].isin(["1A015", "1A016"]) + ] # Deliberately only provide median for some f = df_income_ensemble["CODGEO"].isin(["1D002", "1D005"]) @@ -288,17 +378,25 @@ def create(output_path): value["df"] = df_income.copy() col_pattern = value["col_pattern"] columns = [ - "%sD%d" % (col_pattern, q) + filosofi_year if q != 5 else col_pattern + "Q2" + filosofi_year + ( + "%sD%d" % (col_pattern, q) + filosofi_year + if q != 5 + else col_pattern + "Q2" + filosofi_year + ) for q in range(1, 10) ] for i, column in enumerate(columns): value["df"][column] = value["data"][i] - + for value in income_data["family_comp"]: value["df"] = df_income.copy() col_pattern = value["col_pattern"] columns = [ - "%sD%d" % (col_pattern, q) + filosofi_year if q != 5 else col_pattern + "Q2" + filosofi_year + ( + "%sD%d" % (col_pattern, q) + filosofi_year + if q != 5 + else col_pattern + "Q2" + filosofi_year + ) for q in range(1, 10) ] for i, column in enumerate(columns): @@ -306,30 +404,33 @@ def create(output_path): os.mkdir("%s/filosofi_2019" % output_path) - with zipfile.ZipFile("%s/filosofi_2019/indic-struct-distrib-revenu-2019-COMMUNES.zip" % output_path, "w") as archive: + with zipfile.ZipFile( + "%s/filosofi_2019/indic-struct-distrib-revenu-2019-COMMUNES.zip" % 
output_path, + "w", + ) as archive: with archive.open("FILO2019_DISP_COM.xlsx", "w") as f: - with pd.ExcelWriter(f) as writer: + with pd.ExcelWriter(f) as writer: df_income_ensemble.to_excel( - writer, sheet_name = "ENSEMBLE", startrow = 5, index = False + writer, sheet_name="ENSEMBLE", startrow=5, index=False ) for value in income_data["househod_size"]: value["df"].to_excel( - writer, sheet_name = value["sheet"], startrow = 5, index = False + writer, sheet_name=value["sheet"], startrow=5, index=False ) for value in income_data["family_comp"]: value["df"].to_excel( - writer, sheet_name = value["sheet"], startrow = 5, index = False + writer, sheet_name=value["sheet"], startrow=5, index=False ) # Data set: ENTD print("Creating ENTD ...") data = dict( - Q_MENAGE = [], - Q_TCM_MENAGE = [], - Q_INDIVIDU = [], - Q_TCM_INDIVIDU = [], - K_DEPLOC = [], + Q_MENAGE=[], + Q_TCM_MENAGE=[], + Q_INDIVIDU=[], + Q_TCM_INDIVIDU=[], + K_DEPLOC=[], ) for household_index in range(HTS_HOUSEHOLDS): @@ -338,107 +439,182 @@ def create(output_path): region = random.choice([10, 20]) department = "%d%s" % (region // 10, random.choice(["A", "B", "C", "D"])) - data["Q_MENAGE"].append(dict( - DEP = department, idENT_MEN = household_id, PONDV1 = 1.0, - RG = region, V1_JNBVELOADT = random.randint(4), - V1_JNBVEH = random.randint(3), V1_JNBMOTO = random.randint(2), - V1_JNBCYCLO = 0 - )) - - data["Q_TCM_MENAGE"].append(dict( - NPERS = 3, PONDV1 = 1.0, DEP = department, - idENT_MEN = household_id, RG = region, - TrancheRevenuMensuel = random.choice([ - "Moins de 400", "De 400", "De 600", "De 800", - "De 1 000", "De 1 200", "De 1 500", "De 1800", - "De 2 000", "De 2 500", "De 3 000", "De 4 000", - "De 6 000", "10 000" - ]), numcom_UU2010 = ["B", "C", "I", "R"][household_index % 4] - )) + data["Q_MENAGE"].append( + dict( + DEP=department, + idENT_MEN=household_id, + PONDV1=1.0, + RG=region, + V1_JNBVELOADT=random.randint(4), + V1_JNBVEH=random.randint(3), + V1_JNBMOTO=random.randint(2), + V1_JNBCYCLO=0, + ) + ) + + data["Q_TCM_MENAGE"].append( + dict( + NPERS=3, + PONDV1=1.0, + DEP=department, + idENT_MEN=household_id, + RG=region, + TrancheRevenuMensuel=random.choice( + [ + "Moins de 400", + "De 400", + "De 600", + "De 800", + "De 1 000", + "De 1 200", + "De 1 500", + "De 1800", + "De 2 000", + "De 2 500", + "De 3 000", + "De 4 000", + "De 6 000", + "10 000", + ] + ), + numcom_UU2010=["B", "C", "I", "R"][household_index % 4], + ) + ) for person_index in range(HTS_HOUSEHOLD_MEMBERS): person_id = household_id * 1000 + person_index studies = random.random_sample() < 0.3 - data["Q_INDIVIDU"].append(dict( - IDENT_IND = person_id, idENT_MEN = household_id, - RG = region, - V1_GPERMIS = random.choice([1, 2]), V1_GPERMIS2R = random.choice([1, 2]), - V1_ICARTABON = random.choice([1, 2]), - )) - - data["Q_TCM_INDIVIDU"].append(dict( - AGE = random.randint(90), SEXE = random.choice([1, 2]), - CS24 = random.randint(8) * 10, DEP = department, - ETUDES = 1 if studies else 2, IDENT_IND = person_id, - IDENT_MEN = household_id, PONDV1 = 1.0, - SITUA = random.choice([1, 2]) - )) - - if person_index == 0: # Only one person per household has activity chain + data["Q_INDIVIDU"].append( + dict( + IDENT_IND=person_id, + idENT_MEN=household_id, + RG=region, + V1_GPERMIS=random.choice([1, 2]), + V1_GPERMIS2R=random.choice([1, 2]), + V1_ICARTABON=random.choice([1, 2]), + ) + ) + + data["Q_TCM_INDIVIDU"].append( + dict( + AGE=random.randint(90), + SEXE=random.choice([1, 2]), + CS24=random.randint(8) * 10, + DEP=department, + ETUDES=1 if studies else 
2, + IDENT_IND=person_id, + IDENT_MEN=household_id, + PONDV1=1.0, + SITUA=random.choice([1, 2]), + ) + ) + + if person_index == 0: # Only one person per household has activity chain home_department = department work_department = random.choice(df["department"].unique()) purpose = "1.11" if studies else "9" mode = random.choice(["1", "2", "2.20", "2.23", "4"]) - data["K_DEPLOC"].append(dict( - IDENT_IND = person_id, V2_MMOTIFDES = purpose, V2_MMOTIFORI = 1, - V2_TYPJOUR = 1, V2_MORIHDEP = "08:00:00", V2_MDESHARR = "09:00:00", - V2_MDISTTOT = 3, # km - IDENT_JOUR = 1, V2_MTP = mode, - V2_MDESDEP = work_department, - V2_MORIDEP = home_department, - NDEP = 4, V2_MOBILREF = 1, PONDKI = 3.0 - )) - - data["K_DEPLOC"].append(dict( - IDENT_IND = person_id, V2_MMOTIFDES = 2, V2_MMOTIFORI = purpose, - V2_TYPJOUR = 1, V2_MORIHDEP = "17:00:00", V2_MDESHARR = "17:30:00", - V2_MDISTTOT = 3, # km - IDENT_JOUR = 1, V2_MTP = mode, - V2_MDESDEP = home_department, - V2_MORIDEP = work_department, - NDEP = 4, V2_MOBILREF = 1, PONDKI = 3.0 - )) - - data["K_DEPLOC"].append(dict( - IDENT_IND = person_id, V2_MMOTIFDES = 1, V2_MMOTIFORI = 2, - V2_TYPJOUR = 1, V2_MORIHDEP = "18:00:00", V2_MDESHARR = "19:00:00", - V2_MDISTTOT = 3, # km - IDENT_JOUR = 1, V2_MTP = mode, - V2_MDESDEP = home_department, - V2_MORIDEP = home_department, - NDEP = 4, V2_MOBILREF = 1, PONDKI = 3.0 - )) + data["K_DEPLOC"].append( + dict( + IDENT_IND=person_id, + V2_MMOTIFDES=purpose, + V2_MMOTIFORI=1, + V2_TYPJOUR=1, + V2_MORIHDEP="08:00:00", + V2_MDESHARR="09:00:00", + V2_MDISTTOT=3, # km + IDENT_JOUR=1, + V2_MTP=mode, + V2_MDESDEP=work_department, + V2_MORIDEP=home_department, + NDEP=4, + V2_MOBILREF=1, + PONDKI=3.0, + ) + ) + + data["K_DEPLOC"].append( + dict( + IDENT_IND=person_id, + V2_MMOTIFDES=2, + V2_MMOTIFORI=purpose, + V2_TYPJOUR=1, + V2_MORIHDEP="17:00:00", + V2_MDESHARR="17:30:00", + V2_MDISTTOT=3, # km + IDENT_JOUR=1, + V2_MTP=mode, + V2_MDESDEP=home_department, + V2_MORIDEP=work_department, + NDEP=4, + V2_MOBILREF=1, + PONDKI=3.0, + ) + ) + + data["K_DEPLOC"].append( + dict( + IDENT_IND=person_id, + V2_MMOTIFDES=1, + V2_MMOTIFORI=2, + V2_TYPJOUR=1, + V2_MORIHDEP="18:00:00", + V2_MDESHARR="19:00:00", + V2_MDISTTOT=3, # km + IDENT_JOUR=1, + V2_MTP=mode, + V2_MDESDEP=home_department, + V2_MORIDEP=home_department, + NDEP=4, + V2_MOBILREF=1, + PONDKI=3.0, + ) + ) # Add a tail - data["K_DEPLOC"].append(dict( - IDENT_IND = person_id, V2_MMOTIFDES = 2, V2_MMOTIFORI = 1, - V2_TYPJOUR = 1, V2_MORIHDEP = "21:00:00", V2_MDESHARR = "22:00:00", - V2_MDISTTOT = 3, # km - IDENT_JOUR = 1, V2_MTP = mode, - V2_MDESDEP = home_department, - V2_MORIDEP = home_department, - NDEP = 4, V2_MOBILREF = 1, PONDKI = 3.0 - )) + data["K_DEPLOC"].append( + dict( + IDENT_IND=person_id, + V2_MMOTIFDES=2, + V2_MMOTIFORI=1, + V2_TYPJOUR=1, + V2_MORIHDEP="21:00:00", + V2_MDESHARR="22:00:00", + V2_MDISTTOT=3, # km + IDENT_JOUR=1, + V2_MTP=mode, + V2_MDESDEP=home_department, + V2_MORIDEP=home_department, + NDEP=4, + V2_MOBILREF=1, + PONDKI=3.0, + ) + ) os.mkdir("%s/entd_2008" % output_path) - pd.DataFrame.from_records(data["Q_MENAGE"]).to_csv("%s/entd_2008/Q_menage.csv" % output_path, index = False, sep = ";") - pd.DataFrame.from_records(data["Q_TCM_MENAGE"]).to_csv("%s/entd_2008/Q_tcm_menage_0.csv" % output_path, index = False, sep = ";") - pd.DataFrame.from_records(data["Q_INDIVIDU"]).to_csv("%s/entd_2008/Q_individu.csv" % output_path, index = False, sep = ";") - pd.DataFrame.from_records(data["Q_TCM_INDIVIDU"]).to_csv("%s/entd_2008/Q_tcm_individu.csv" % output_path, 
index = False, sep = ";") - pd.DataFrame.from_records(data["K_DEPLOC"]).to_csv("%s/entd_2008/K_deploc.csv" % output_path, index = False, sep = ";") - + pd.DataFrame.from_records(data["Q_MENAGE"]).to_csv( + "%s/entd_2008/Q_menage.csv" % output_path, index=False, sep=";" + ) + pd.DataFrame.from_records(data["Q_TCM_MENAGE"]).to_csv( + "%s/entd_2008/Q_tcm_menage_0.csv" % output_path, index=False, sep=";" + ) + pd.DataFrame.from_records(data["Q_INDIVIDU"]).to_csv( + "%s/entd_2008/Q_individu.csv" % output_path, index=False, sep=";" + ) + pd.DataFrame.from_records(data["Q_TCM_INDIVIDU"]).to_csv( + "%s/entd_2008/Q_tcm_individu.csv" % output_path, index=False, sep=";" + ) + pd.DataFrame.from_records(data["K_DEPLOC"]).to_csv( + "%s/entd_2008/K_deploc.csv" % output_path, index=False, sep=";" + ) # Data set: EGT print("Creating EGT ...") - data = dict( - households = [], - persons = [], - trips = [] - ) + data = dict(households=[], persons=[], trips=[]) person_index = 0 for household_index in range(HTS_HOUSEHOLDS): @@ -448,30 +624,50 @@ def create(output_path): region = df[df["municipality"] == municipality]["region"].values[0] department = df[df["municipality"] == municipality]["department"].values[0] - data["households"].append(dict( - RESDEP = department, NQUEST = household_id, POIDSM = 1.0, - NB_VELO = random.randint(3), NB_VD = random.randint(2), - RESCOMM = municipality, NB_2RM = 0, - MNP = 3, REVENU = random.randint(12) - )) + data["households"].append( + dict( + RESDEP=department, + NQUEST=household_id, + POIDSM=1.0, + NB_VELO=random.randint(3), + NB_VD=random.randint(2), + RESCOMM=municipality, + NB_2RM=0, + MNP=3, + REVENU=random.randint(12), + ) + ) for person_id in range(1, HTS_HOUSEHOLD_MEMBERS + 1): studies = random.random_sample() < 0.3 - data["persons"].append(dict( - RESDEP = department, NP = person_id, POIDSP = 1.0, - NQUEST = household_id, SEXE = random.choice([1, 2]), - AGE = random.randint(90), PERMVP = random.choice([1, 2]), - ABONTC = random.choice([1, 2]), OCCP = 3 if studies else 2, - PERM2RM = random.choice([1, 2]), NBDEPL = 2, CS8 = random.randint(9) - )) + data["persons"].append( + dict( + RESDEP=department, + NP=person_id, + POIDSP=1.0, + NQUEST=household_id, + SEXE=random.choice([1, 2]), + AGE=random.randint(90), + PERMVP=random.choice([1, 2]), + ABONTC=random.choice([1, 2]), + OCCP=3 if studies else 2, + PERM2RM=random.choice([1, 2]), + NBDEPL=2, + CS8=random.randint(9), + ) + ) home_department = department home_municipality = municipality work_municipality = random.choice(df["municipality"].unique()) - work_region = df[df["municipality"] == work_municipality]["region"].values[0] - work_department = df[df["municipality"] == work_municipality]["department"].values[0] + work_region = df[df["municipality"] == work_municipality]["region"].values[ + 0 + ] + work_department = df[df["municipality"] == work_municipality][ + "department" + ].values[0] purpose = 4 if studies else 2 mode = random.choice([1, 2, 3, 5, 7]) @@ -484,43 +680,97 @@ def create(output_path): origin_hour = 0 origin_minute = 12 - data["trips"].append(dict( - NQUEST = household_id, NP = person_id, - ND = 1, ORDEP = home_department, DESTDEP = work_department, - ORH = origin_hour, ORM = origin_minute, DESTH = 9, DESTM = 0, ORCOMM = home_municipality, - DESTCOMM = work_municipality, DPORTEE = 3, MODP_H7 = 2, - DESTMOT_H9 = purpose, ORMOT_H9 = 1 - )) - - data["trips"].append(dict( - NQUEST = household_id, NP = person_id, - ND = 2, ORDEP = work_department, DESTDEP = home_department, - ORH = 8, ORM = 0, DESTH = 9, 
DESTM = 0, ORCOMM = work_municipality, - DESTCOMM = home_municipality, DPORTEE = 3, MODP_H7 = 2, - DESTMOT_H9 = 5, ORMOT_H9 = purpose - )) - - data["trips"].append(dict( - NQUEST = household_id, NP = person_id, - ND = 3, ORDEP = home_department, DESTDEP = home_department, - ORH = 17, ORM = 0, DESTH = 18, DESTM = 0, ORCOMM = home_municipality, - DESTCOMM = home_municipality, DPORTEE = 3, MODP_H7 = 2, - DESTMOT_H9 = 1, ORMOT_H9 = 5 - )) + data["trips"].append( + dict( + NQUEST=household_id, + NP=person_id, + ND=1, + ORDEP=home_department, + DESTDEP=work_department, + ORH=origin_hour, + ORM=origin_minute, + DESTH=9, + DESTM=0, + ORCOMM=home_municipality, + DESTCOMM=work_municipality, + DPORTEE=3, + MODP_H7=2, + DESTMOT_H9=purpose, + ORMOT_H9=1, + ) + ) + + data["trips"].append( + dict( + NQUEST=household_id, + NP=person_id, + ND=2, + ORDEP=work_department, + DESTDEP=home_department, + ORH=8, + ORM=0, + DESTH=9, + DESTM=0, + ORCOMM=work_municipality, + DESTCOMM=home_municipality, + DPORTEE=3, + MODP_H7=2, + DESTMOT_H9=5, + ORMOT_H9=purpose, + ) + ) + + data["trips"].append( + dict( + NQUEST=household_id, + NP=person_id, + ND=3, + ORDEP=home_department, + DESTDEP=home_department, + ORH=17, + ORM=0, + DESTH=18, + DESTM=0, + ORCOMM=home_municipality, + DESTCOMM=home_municipality, + DPORTEE=3, + MODP_H7=2, + DESTMOT_H9=1, + ORMOT_H9=5, + ) + ) # Tail - data["trips"].append(dict( - NQUEST = household_id, NP = person_id, - ND = 4, ORDEP = home_department, DESTDEP = home_department, - ORH = 22, ORM = 0, DESTH = 21, DESTM = 0, ORCOMM = home_municipality, - DESTCOMM = home_municipality, DPORTEE = 3, MODP_H7 = 2, - DESTMOT_H9 = 5, ORMOT_H9 = 1 - )) + data["trips"].append( + dict( + NQUEST=household_id, + NP=person_id, + ND=4, + ORDEP=home_department, + DESTDEP=home_department, + ORH=22, + ORM=0, + DESTH=21, + DESTM=0, + ORCOMM=home_municipality, + DESTCOMM=home_municipality, + DPORTEE=3, + MODP_H7=2, + DESTMOT_H9=5, + ORMOT_H9=1, + ) + ) os.mkdir("%s/egt_2010" % output_path) - pd.DataFrame.from_records(data["households"]).to_csv("%s/egt_2010/Menages_semaine.csv" % output_path, index = False, sep = ",") - pd.DataFrame.from_records(data["persons"]).to_csv("%s/egt_2010/Personnes_semaine.csv" % output_path, index = False, sep = ",") - pd.DataFrame.from_records(data["trips"]).to_csv("%s/egt_2010/Deplacements_semaine.csv" % output_path, index = False, sep = ",") + pd.DataFrame.from_records(data["households"]).to_csv( + "%s/egt_2010/Menages_semaine.csv" % output_path, index=False, sep="," + ) + pd.DataFrame.from_records(data["persons"]).to_csv( + "%s/egt_2010/Personnes_semaine.csv" % output_path, index=False, sep="," + ) + pd.DataFrame.from_records(data["trips"]).to_csv( + "%s/egt_2010/Deplacements_semaine.csv" % output_path, index=False, sep="," + ) # Data set: Census print("Creating census ...") @@ -532,40 +782,68 @@ def create(output_path): iris = df["iris"].iloc[random.randint(len(df))] department = iris[:2] - if iris.endswith("0000"): iris = iris[:-4] + "XXXX" + if iris.endswith("0000"): + iris = iris[:-4] + "XXXX" - if random.random_sample() < 0.1: # For some, commune is not known + if random.random_sample() < 0.1: # For some, commune is not known iris = "ZZZZZZZZZ" destination_municipality = random.choice(df["municipality"].unique()) - destination_department = df[df["municipality"] == destination_municipality]["department"].values[0] + destination_department = df[df["municipality"] == destination_municipality][ + "department" + ].values[0] for person_index in range(CENSUS_HOUSEHOLD_MEMBERS): - 
persons.append(dict( - CANTVILLE = "ABCE", NUMMI = household_id, - AGED = "%03d" % random.randint(90), COUPLE = random.choice([1, 2]), - CS1 = random.randint(9), - DEPT = department, IRIS = iris, REGION = region, ETUD = random.choice([1, 2]), - ILETUD = 4 if department != destination_department else 0, - ILT = 4 if department != destination_department else 0, - IPONDI = float(1.0), - SEXE = random.choice([1, 2]), - TACT = random.choice([1, 2]), - TRANS = 4, VOIT = random.randint(3), DEROU = random.randint(2) - )) + persons.append( + dict( + CANTVILLE="ABCE", + NUMMI=household_id, + AGED="%03d" % random.randint(90), + COUPLE=random.choice([1, 2]), + CS1=random.randint(9), + DEPT=department, + IRIS=iris, + REGION=region, + ETUD=random.choice([1, 2]), + ILETUD=4 if department != destination_department else 0, + ILT=4 if department != destination_department else 0, + IPONDI=float(1.0), + SEXE=random.choice([1, 2]), + TACT=random.choice([1, 2]), + TRANS=4, + VOIT=random.randint(3), + DEROU=random.randint(2), + ) + ) columns = [ - "CANTVILLE", "NUMMI", "AGED", "COUPLE", "CS1", "DEPT", "IRIS", "REGION", - "ETUD", "ILETUD", "ILT", "IPONDI", - "SEXE", "TACT", "TRANS", "VOIT", "DEROU" + "CANTVILLE", + "NUMMI", + "AGED", + "COUPLE", + "CS1", + "DEPT", + "IRIS", + "REGION", + "ETUD", + "ILETUD", + "ILT", + "IPONDI", + "SEXE", + "TACT", + "TRANS", + "VOIT", + "DEROU", ] df_persons = pd.DataFrame.from_records(persons)[columns] df_persons.columns = columns - with zipfile.ZipFile("%s/rp_2019/RP2019_INDCVI_csv.zip" % output_path, "w") as archive: + with zipfile.ZipFile( + "%s/rp_2019/RP2019_INDCVI_csv.zip" % output_path, "w" + ) as archive: with archive.open("FD_INDCVI_2019.csv", "w") as f: - df_persons.to_csv(f, sep = ";") + df_persons.to_csv(f, sep=";") # Data set: commute flows print("Creating commute flows ...") @@ -574,11 +852,15 @@ def create(output_path): observations = COMMUTE_FLOW_OBSERVATIONS # ... work - df_work = pd.DataFrame(dict( - COMMUNE = municipalities[random.randint(0, len(municipalities), observations)], - DCLT = municipalities[random.randint(0, len(municipalities), observations)], - TRANS = random.randint(1, 6, size = (observations,)) - )) + df_work = pd.DataFrame( + dict( + COMMUNE=municipalities[ + random.randint(0, len(municipalities), observations) + ], + DCLT=municipalities[random.randint(0, len(municipalities), observations)], + TRANS=random.randint(1, 6, size=(observations,)), + ) + ) df_work["ARM"] = "Z" df_work["IPONDI"] = 1.0 @@ -586,25 +868,33 @@ def create(output_path): columns = ["COMMUNE", "DCLT", "TRANS", "ARM", "IPONDI"] df_work.columns = columns - with zipfile.ZipFile("%s/rp_2019/RP2019_MOBPRO_csv.zip" % output_path, "w") as archive: + with zipfile.ZipFile( + "%s/rp_2019/RP2019_MOBPRO_csv.zip" % output_path, "w" + ) as archive: with archive.open("FD_MOBPRO_2019.csv", "w") as f: - df_work.to_csv(f, sep = ";") + df_work.to_csv(f, sep=";") # ... 
education - df_education = pd.DataFrame(dict( - COMMUNE = municipalities[random.randint(0, len(municipalities), observations)], - DCETUF = municipalities[random.randint(0, len(municipalities), observations)] - )) + df_education = pd.DataFrame( + dict( + COMMUNE=municipalities[ + random.randint(0, len(municipalities), observations) + ], + DCETUF=municipalities[random.randint(0, len(municipalities), observations)], + ) + ) df_education["ARM"] = "Z" df_education["IPONDI"] = 1.0 df_education["AGEREV10"] = 1 - columns = ["COMMUNE", "DCETUF", "ARM", "IPONDI","AGEREV10"] + columns = ["COMMUNE", "DCETUF", "ARM", "IPONDI", "AGEREV10"] df_education.columns = columns - with zipfile.ZipFile("%s/rp_2019/RP2019_MOBSCO_csv.zip" % output_path, "w") as archive: + with zipfile.ZipFile( + "%s/rp_2019/RP2019_MOBSCO_csv.zip" % output_path, "w" + ) as archive: with archive.open("FD_MOBSCO_2019.csv", "w") as f: - df_education.to_csv(f, sep = ";") + df_education.to_csv(f, sep=";") # Data set: BD-TOPO print("Creating BD-TOPO ...") @@ -615,43 +905,56 @@ def create(output_path): x = df_selection["geometry"].centroid.x.values y = df_selection["geometry"].centroid.y.values - z = random.randint(100, 400, observations) # Not used but keeping unit test hashes constant + z = random.randint( + 100, 400, observations + ) # Not used but keeping unit test hashes constant ids = [ - "BATIMENT{:016d}".format(n) for n in random.randint(1000, 1000000, observations) + "BATIMENT{:016d}".format(n) for n in random.randint(1000, 1000000, observations) ] - - ids[0] = ids[1] # setting multiple adresses for 1 building usecase - - df_bdtopo = gpd.GeoDataFrame({ - "nombre_de_logements": random.randint(0, 10, observations), - "cleabs": ids, - "geometry": [ - geo.Point(x, y) for x, y in zip(x, y) - ] - }, crs = "EPSG:2154") + + ids[0] = ids[1] # setting multiple adresses for 1 building usecase + + df_bdtopo = gpd.GeoDataFrame( + { + "nombre_de_logements": random.randint(0, 10, observations), + "cleabs": ids, + "geometry": [geo.Point(x, y) for x, y in zip(x, y)], + }, + crs="EPSG:2154", + ) # polygons as buildings from iris centroid points - df_bdtopo.set_geometry(df_bdtopo.buffer(40),inplace=True,drop=True,crs="EPSG:2154") + df_bdtopo.set_geometry( + df_bdtopo.buffer(40), inplace=True, drop=True, crs="EPSG:2154" + ) os.mkdir("{}/bdtopo_idf".format(output_path)) - df_bdtopo.to_file("{}/bdtopo_idf/content.gpkg".format(output_path), layer = "batiment") + df_bdtopo.to_file( + "{}/bdtopo_idf/content.gpkg".format(output_path), layer="batiment" + ) bdtopo_date = "2022-03-15" bdtopo_departments = ["1A", "1B", "1C", "1D", "2A", "2B", "2C", "2D"] - with py7zr.SevenZipFile("{}/bdtopo_idf/bdtopo.7z".format(output_path), "w") as archive: - archive.write("{}/bdtopo_idf/content.gpkg".format(output_path), "content/content.gpkg") + with py7zr.SevenZipFile( + "{}/bdtopo_idf/bdtopo.7z".format(output_path), "w" + ) as archive: + archive.write( + "{}/bdtopo_idf/content.gpkg".format(output_path), "content/content.gpkg" + ) os.remove("{}/bdtopo_idf/content.gpkg".format(output_path)) - + for department in bdtopo_departments: shutil.copyfile( - "{}/bdtopo_idf/bdtopo.7z".format(output_path), + "{}/bdtopo_idf/bdtopo.7z".format(output_path), "{}/bdtopo_idf/BDTOPO_3-0_TOUSTHEMES_GPKG_LAMB93_D0{}_{}.7z".format( - output_path, department, bdtopo_date)) - + output_path, department, bdtopo_date + ), + ) + os.remove("{}/bdtopo_idf/bdtopo.7z".format(output_path)) - + # Data set: BAN print("Creating BAN ...") @@ -663,16 +966,26 @@ def create(output_path): y = 
df_selection["geometry"].centroid.y.values municipality = df["municipality"].unique() - df_ban = pd.DataFrame({ - "code_insee": municipality[random.randint(0, len(municipality), observations)], - "x": x, - "y": y}) + df_ban = pd.DataFrame( + { + "code_insee": municipality[ + random.randint(0, len(municipality), observations) + ], + "x": x, + "y": y, + } + ) - df_ban = df_ban[:round(len(x)*.8)] + df_ban = df_ban[: round(len(x) * 0.8)] os.mkdir("%s/ban_idf" % output_path) for dep in df["department"].unique(): - df_ban.to_csv("%s/ban_idf/adresses-%s.csv.gz" % (output_path, dep), compression='gzip', sep=";", index=False) + df_ban.to_csv( + "%s/ban_idf/adresses-%s.csv.gz" % (output_path, dep), + compression="gzip", + sep=";", + index=False, + ) # Data set: SIRENE print("Creating SIRENE ...") @@ -681,25 +994,35 @@ def create(output_path): identifiers = random.randint(0, 99999999, observations) - df_sirene = pd.DataFrame({ - "siren": identifiers, - "siret": identifiers, - "codeCommuneEtablissement": municipalities[random.randint(0, len(municipalities), observations)], - "etatAdministratifEtablissement": "A" - }) + df_sirene = pd.DataFrame( + { + "siren": identifiers, + "siret": identifiers, + "codeCommuneEtablissement": municipalities[ + random.randint(0, len(municipalities), observations) + ], + "etatAdministratifEtablissement": "A", + } + ) df_sirene["activitePrincipaleEtablissement"] = "52.1" df_sirene["trancheEffectifsEtablissement"] = "03" - os.mkdir("%s/sirene" % output_path) - df_sirene.to_csv(output_path + "/sirene/StockEtablissement_utf8.zip", index = False, compression={'method': 'zip', 'archive_name': 'StockEtablissement_utf8.csv'}) - + df_sirene.to_csv( + output_path + "/sirene/StockEtablissement_utf8.zip", + index=False, + compression={"method": "zip", "archive_name": "StockEtablissement_utf8.csv"}, + ) df_sirene = df_sirene[["siren"]].copy() df_sirene["categorieJuridiqueUniteLegale"] = "1000" - df_sirene.to_csv(output_path + "/sirene/StockUniteLegale_utf8.zip", index = False, compression={'method': 'zip', 'archive_name': 'StockUniteLegale_utf8.csv'}) + df_sirene.to_csv( + output_path + "/sirene/StockUniteLegale_utf8.zip", + index=False, + compression={"method": "zip", "archive_name": "StockUniteLegale_utf8.csv"}, + ) # Data set: SIRENE GEOLOCATION print("Creating SIRENE GEOLOCATION...") @@ -708,32 +1031,53 @@ def create(output_path): x = df_selection["geometry"].centroid.x.values y = df_selection["geometry"].centroid.y.values - codes_com = df_codes["DEPCOM"].iloc[random.randint(0, len(df_iris), observations)] + codes_com = df_codes["DEPCOM"].iloc[random.randint(0, len(df_iris), observations)] + + df_sirene_geoloc = pd.DataFrame( + { + "siret": identifiers, + "x": x, + "y": y, + "plg_code_commune": codes_com, + } + ) - df_sirene_geoloc = pd.DataFrame({ - "siret": identifiers, - "x": x, - "y": y, - "plg_code_commune":codes_com, - }) - - df_sirene_geoloc.to_csv("%s/sirene/GeolocalisationEtablissement_Sirene_pour_etudes_statistiques_utf8.zip" % output_path, index = False, sep=";", compression={'method': 'zip', 'archive_name': 'GeolocalisationEtablissement_Sirene_pour_etudes_statistiques_utf8.csv'}) + df_sirene_geoloc.to_csv( + "%s/sirene/GeolocalisationEtablissement_Sirene_pour_etudes_statistiques_utf8.zip" + % output_path, + index=False, + sep=";", + compression={ + "method": "zip", + "archive_name": "GeolocalisationEtablissement_Sirene_pour_etudes_statistiques_utf8.csv", + }, + ) # Data set: Urban type print("Creating urban type ...") - df_urban_type = 
df_codes[["DEPCOM"]].copy().rename(columns = { "DEPCOM": "CODGEO" }) + df_urban_type = df_codes[["DEPCOM"]].copy().rename(columns={"DEPCOM": "CODGEO"}) df_urban_type = df_urban_type.drop_duplicates() - df_urban_type["STATUT_2017"] = [["B", "C", "I", "H"][k % 4] for k in range(len(df_urban_type))] + df_urban_type["STATUT_2017"] = [ + ["B", "C", "I", "H"][k % 4] for k in range(len(df_urban_type)) + ] - df_urban_type = pd.concat([df_urban_type, pd.DataFrame({ - "CODGEO": ["75056", "69123", "13055"], - "STATUT_2017": ["C", "C", "C"] - })]) + df_urban_type = pd.concat( + [ + df_urban_type, + pd.DataFrame( + {"CODGEO": ["75056", "69123", "13055"], "STATUT_2017": ["C", "C", "C"]} + ), + ] + ) os.mkdir("%s/urban_type" % output_path) - with zipfile.ZipFile("%s/urban_type/UU2020_au_01-01-2023.zip" % output_path, "w") as archive: + with zipfile.ZipFile( + "%s/urban_type/UU2020_au_01-01-2023.zip" % output_path, "w" + ) as archive: with archive.open("UU2020_au_01-01-2023.xlsx", "w") as f: - df_urban_type.to_excel(f, startrow = 5, sheet_name = "Composition_communale", index = False) + df_urban_type.to_excel( + f, startrow=5, sheet_name="Composition_communale", index=False + ) # Data set: OSM # We add add a road grid of 500m @@ -754,10 +1098,14 @@ def create(output_path): for i in range(lengthx): for j in range(lengthy): - df_nodes.append(dict( - id = node_index, - geometry = geo.Point(anchor_x + 500 * i + 250, anchor_y - 500 * j - 250) - )) + df_nodes.append( + dict( + id=node_index, + geometry=geo.Point( + anchor_x + 500 * i + 250, anchor_y - 500 * j - 250 + ), + ) + ) if j < lengthy - 1: links.append([node_index, node_index + 1]) @@ -767,75 +1115,117 @@ def create(output_path): node_index += 1 - df_nodes = gpd.GeoDataFrame(df_nodes, crs = "EPSG:2154") + df_nodes = gpd.GeoDataFrame(df_nodes, crs="EPSG:2154") df_nodes = df_nodes.to_crs("EPSG:4326") for row in df_nodes.itertuples(): - osm.append('' % ( - row[1], row[2].y, row[2].x - )) + osm.append( + '' + % (row[1], row[2].y, row[2].x) + ) for index, link in enumerate(links): - osm.append('' % (index + 1)) + osm.append( + '' % (index + 1) + ) osm.append('' % link[0]) osm.append('' % link[1]) osm.append('') - osm.append('') + osm.append("") - osm.append('') + osm.append("") import gzip + os.mkdir("%s/osm_idf" % output_path) with gzip.open("%s/osm_idf/ile-de-france-220101.osm.gz" % output_path, "wb+") as f: f.write(bytes("\n".join(osm), "utf-8")) - import subprocess - subprocess.check_call([ - shutil.which("osmosis"), "--read-xml", "%s/osm_idf/ile-de-france-220101.osm.gz" % output_path, - "--write-pbf", "%s/osm_idf/ile-de-france-220101.osm.pbf" % output_path - ]) - + subprocess.check_call( + [ + shutil.which("osmosis"), + "--read-xml", + "%s/osm_idf/ile-de-france-220101.osm.gz" % output_path, + "--write-pbf", + "%s/osm_idf/ile-de-france-220101.osm.pbf" % output_path, + ] + ) # Data set: GTFS print("Creating GTFS ...") feed = {} - feed["agency"] = pd.DataFrame.from_records([dict( - agency_id = 1, agency_name = "eqasim", agency_timezone = "Europe/Paris", - agency_url = "https://eqasim.org" - )]) + feed["agency"] = pd.DataFrame.from_records( + [ + dict( + agency_id=1, + agency_name="eqasim", + agency_timezone="Europe/Paris", + agency_url="https://eqasim.org", + ) + ] + ) - feed["calendar"] = pd.DataFrame.from_records([dict( - service_id = 1, monday = 1, tuesday = 1, wednesday = 1, - thursday = 1, friday = 1, saturday = 1, sunday = 1, start_date = "20100101", - end_date = "20500101" - )]) + feed["calendar"] = pd.DataFrame.from_records( + [ + dict( + 
service_id=1, + monday=1, + tuesday=1, + wednesday=1, + thursday=1, + friday=1, + saturday=1, + sunday=1, + start_date="20100101", + end_date="20500101", + ) + ] + ) - feed["routes"] = pd.DataFrame.from_records([dict( - route_id = 1, agency_id = 1, route_short_name = "EQ", - route_long_name = "The eqasim train", route_desc = "", - route_type = 2 - )]) + feed["routes"] = pd.DataFrame.from_records( + [ + dict( + route_id=1, + agency_id=1, + route_short_name="EQ", + route_long_name="The eqasim train", + route_desc="", + route_type=2, + ) + ] + ) df_stops = df[df["municipality"].isin(["1B019", "2D007"])].copy() df_stops = df_stops.to_crs("EPSG:4326") - feed["stops"] = pd.DataFrame.from_records([dict( - stop_id = "A", stop_code = "A", stop_name = "A", - stop_desc = "", - stop_lat = df_stops["geometry"].iloc[0].centroid.y, - stop_lon = df_stops["geometry"].iloc[0].centroid.x, - location_type = 1, parent_station = None - ), dict( - stop_id = "B", stop_code = "B", stop_name = "B", - stop_desc = "", - stop_lat = df_stops["geometry"].iloc[1].centroid.y, - stop_lon = df_stops["geometry"].iloc[1].centroid.x, - location_type = 1, parent_station = None - )]) + feed["stops"] = pd.DataFrame.from_records( + [ + dict( + stop_id="A", + stop_code="A", + stop_name="A", + stop_desc="", + stop_lat=df_stops["geometry"].iloc[0].centroid.y, + stop_lon=df_stops["geometry"].iloc[0].centroid.x, + location_type=1, + parent_station=None, + ), + dict( + stop_id="B", + stop_code="B", + stop_name="B", + stop_desc="", + stop_lat=df_stops["geometry"].iloc[1].centroid.y, + stop_lon=df_stops["geometry"].iloc[1].centroid.x, + location_type=1, + parent_station=None, + ), + ] + ) trips = [] times = [] @@ -844,19 +1234,27 @@ def create(output_path): for origin, destination in [("A", "B"), ("B", "A")]: for hour in np.arange(1, 24): - trips.append(dict( - route_id = 1, service_id = 1, trip_id = trip_id - )) - - times.append(dict( - trip_id = trip_id, arrival_time = "%02d:00:00" % hour, - departure_time = "%02d:00:00" % hour, stop_id = origin, stop_sequence = 1 - )) + trips.append(dict(route_id=1, service_id=1, trip_id=trip_id)) + + times.append( + dict( + trip_id=trip_id, + arrival_time="%02d:00:00" % hour, + departure_time="%02d:00:00" % hour, + stop_id=origin, + stop_sequence=1, + ) + ) - times.append(dict( - trip_id = trip_id, arrival_time = "%02d:00:00" % (hour + 1), - departure_time = "%02d:00:00" % (hour + 1), stop_id = destination, stop_sequence = 2 - )) + times.append( + dict( + trip_id=trip_id, + arrival_time="%02d:00:00" % (hour + 1), + departure_time="%02d:00:00" % (hour + 1), + stop_id=destination, + stop_sequence=2, + ) + ) trip_id += 1 @@ -864,28 +1262,39 @@ def create(output_path): feed["stop_times"] = pd.DataFrame.from_records(times) # Transfers - feed["transfers"] = pd.DataFrame(dict( - from_stop_id = [], to_stop_id = [], transfer_type = [] - )) + feed["transfers"] = pd.DataFrame( + dict(from_stop_id=[], to_stop_id=[], transfer_type=[]) + ) os.mkdir("%s/gtfs_idf" % output_path) import data.gtfs.utils + data.gtfs.utils.write_feed(feed, "%s/gtfs_idf/IDFM-gtfs.zip" % output_path) # Dataset: Parc automobile - df_vehicles_region = pd.DataFrame(index = pd.MultiIndex.from_product([ - df["region"].unique(), - np.arange(20), - ], names = [ - "Code région", "Age au 01/01/2021" - ])).reset_index() + df_vehicles_region = pd.DataFrame( + index=pd.MultiIndex.from_product( + [ + df["region"].unique(), + np.arange(20), + ], + names=["Code région", "Age au 01/01/2021"], + ) + ).reset_index() # to enforce string - df_vehicles_region 
= pd.concat([df_vehicles_region, pd.DataFrame({ - "Code région": ["AB"], - "Age au 01/01/2021": [0], - })]) + df_vehicles_region = pd.concat( + [ + df_vehicles_region, + pd.DataFrame( + { + "Code région": ["AB"], + "Age au 01/01/2021": [0], + } + ), + ] + ) df_vehicles_region["Code région"] = df_vehicles_region["Code région"].astype(str) @@ -893,44 +1302,58 @@ def create(output_path): df_vehicles_region["Energie"] = "Gazole" df_vehicles_region["Vignette crit'air"] = "Crit'air 1" - df_vehicles_region["Age au 01/01/2021"] = df_vehicles_region["Age au 01/01/2021"].astype(str) - df_vehicles_region["Age au 01/01/2021"] = df_vehicles_region["Age au 01/01/2021"].replace("20", ">20") - df_vehicles_region["Age au 01/01/2021"] = df_vehicles_region["Age au 01/01/2021"] + " ans" + df_vehicles_region["Age au 01/01/2021"] = df_vehicles_region[ + "Age au 01/01/2021" + ].astype(str) + df_vehicles_region["Age au 01/01/2021"] = df_vehicles_region[ + "Age au 01/01/2021" + ].replace("20", ">20") + df_vehicles_region["Age au 01/01/2021"] = ( + df_vehicles_region["Age au 01/01/2021"] + " ans" + ) - df_vehicles_commune = pd.DataFrame({ - "municipality": df["municipality"].unique() - }) + df_vehicles_commune = pd.DataFrame({"municipality": df["municipality"].unique()}) df_vehicles_commune["Parc au 01/01/2021"] = 100 df_vehicles_commune["Energie"] = "Gazole" df_vehicles_commune["Vignette Crit'air"] = "Crit'air 1" - df_vehicles_commune = pd.merge(df_vehicles_commune, df[[ - "municipality", "region", "department" - ]], on = "municipality") + df_vehicles_commune = pd.merge( + df_vehicles_commune, + df[["municipality", "region", "department"]], + on="municipality", + ) - df_vehicles_commune = df_vehicles_commune.rename(columns = { - "municipality": "Code commune", - "department": "Code départment", - "region": "Code région", - }) + df_vehicles_commune = df_vehicles_commune.rename( + columns={ + "municipality": "Code commune", + "department": "Code départment", + "region": "Code région", + } + ) os.mkdir("%s/vehicles" % output_path) - - with zipfile.ZipFile("%s/vehicles/parc_vp_regions.zip" % output_path, "w") as archive: + + with zipfile.ZipFile( + "%s/vehicles/parc_vp_regions.zip" % output_path, "w" + ) as archive: with archive.open("Parc_VP_Regions_2021.xlsx", "w") as f: df_vehicles_region.to_excel(f) - with zipfile.ZipFile("%s/vehicles/parc_vp_communes.zip" % output_path, "w") as archive: + with zipfile.ZipFile( + "%s/vehicles/parc_vp_communes.zip" % output_path, "w" + ) as archive: with archive.open("Parc_VP_Communes_2021.xlsx", "w") as f: df_vehicles_commune.to_excel(f) + if __name__ == "__main__": import shutil import sys import os + folder = sys.argv[1] os.makedirs(folder, exist_ok=True) for dir in os.listdir(folder): - shutil.rmtree(os.path.join(folder,dir)) + shutil.rmtree(os.path.join(folder, dir)) create(sys.argv[1])
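
Usage note: the script reformatted above generates mock input data sets (EGT 2010, census, MOBPRO/MOBSCO commute flows, BD-TOPO, BAN, SIRENE, urban type, OSM, GTFS and vehicle fleet files) into a target folder, and its __main__ block clears any existing subfolders before regenerating them. Below is a minimal usage sketch, assuming the file is importable as tests.testdata (an assumption; the real path appears in the diff header for this file) and that the osmosis binary is available on the PATH, since create() shells out to it for the OSM .pbf conversion.

    # Minimal sketch: regenerate the test fixtures into a scratch folder.
    # Assumptions: the module path tests.testdata and the folder name are
    # illustrative only; "osmosis" must be on the PATH for the OSM step.
    import os
    import shutil

    from tests.testdata import create  # assumed module path

    folder = "cache/test_data"  # hypothetical output location
    os.makedirs(folder, exist_ok=True)

    # Mirror the __main__ block: remove leftover subdirectories so the
    # os.mkdir() calls inside create() do not fail on a second run.
    for entry in os.listdir(folder):
        path = os.path.join(folder, entry)
        if os.path.isdir(path):
            shutil.rmtree(path)

    create(folder)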