diff --git a/analysis/bootstrapping.py b/analysis/bootstrapping.py
index 0544e2d1..f042052c 100644
--- a/analysis/bootstrapping.py
+++ b/analysis/bootstrapping.py
@@ -2,10 +2,12 @@
import copy
import analysis.statistics
+
def get_seeds(number_of_seeds):
return np.arange(1, number_of_seeds + 1) * 1000
-def configure(context, stage, sample_size, parameters = {}, alias = None, ephemeral = True):
+
+def configure(context, stage, sample_size, parameters={}, alias=None, ephemeral=True):
if alias is None:
alias = stage
@@ -15,11 +17,18 @@ def configure(context, stage, sample_size, parameters = {}, alias = None, epheme
sample_parameters = copy.copy(parameters)
sample_parameters["random_seed"] = int(random_seed)
- context.stage(stage, sample_parameters, alias = "bootstrap_%s_%d" % (alias, index), ephemeral = ephemeral)
+ context.stage(
+ stage,
+ sample_parameters,
+ alias="bootstrap_%s_%d" % (alias, index),
+ ephemeral=ephemeral,
+ )
+
def get_stage(context, alias, index):
return context.stage("bootstrap_%s_%d" % (alias, index))
+
def get_stages(context, alias, sample_size):
for index in range(sample_size):
yield get_stage(context, alias, index)
diff --git a/analysis/chains.py b/analysis/chains.py
index 886bd765..66a809d6 100644
--- a/analysis/chains.py
+++ b/analysis/chains.py
@@ -9,22 +9,30 @@
("chain", "sex"),
("chain_length_class", "age_class"),
("chain_length_class", "sex"),
- ("chain",), ("chain_length_class",),
+ ("chain",),
+ ("chain_length_class",),
("age_range", "sex", "chain"),
- ("age_range", "sex", "chain_length_class")
+ ("age_range", "sex", "chain_length_class"),
]
PURPOSE_MAPPING = {
- "home": "h", "work": "w", "education": "e",
- "shop": "s", "leisure": "l", "other": "o"
+ "home": "h",
+ "work": "w",
+ "education": "e",
+ "shop": "s",
+ "leisure": "l",
+ "other": "o",
}
+
def aggregate_chains(df_chains):
current_person_id = None
current_chain = None
records = []
- for person_id, purpose in zip(df_chains["person_id"].values, df_chains["purpose"].values):
+ for person_id, purpose in zip(
+ df_chains["person_id"].values, df_chains["purpose"].values
+ ):
if not person_id == current_person_id:
if not current_person_id is None:
records.append((current_person_id, current_chain))
@@ -36,11 +44,11 @@ def aggregate_chains(df_chains):
records.append((current_person_id, current_chain))
- df_chains = pd.DataFrame.from_records(records, columns = ["person_id", "chain"])
+ df_chains = pd.DataFrame.from_records(records, columns=["person_id", "chain"])
- #df_chains["chain"] = df_chains["chain"].apply(lambda x: re.sub(r"w+", "w", x))
- #df_chains["chain"] = df_chains["chain"].apply(lambda x: re.sub(r"e+", "e", x))
- #df_chains["chain"] = df_chains["chain"].apply(lambda x: re.sub(r"h+", "h", x))
+ # df_chains["chain"] = df_chains["chain"].apply(lambda x: re.sub(r"w+", "w", x))
+ # df_chains["chain"] = df_chains["chain"].apply(lambda x: re.sub(r"e+", "e", x))
+ # df_chains["chain"] = df_chains["chain"].apply(lambda x: re.sub(r"h+", "h", x))
df_chains["chain_length"] = df_chains["chain"].str.len()
diff --git a/analysis/debug/sc.py b/analysis/debug/sc.py
index 2f73f125..8bbeeaab 100644
--- a/analysis/debug/sc.py
+++ b/analysis/debug/sc.py
@@ -1,11 +1,13 @@
import numpy as np
import pandas as pd
+
def configure(context):
- context.stage("data.census.filtered", alias = "census")
- context.stage("data.hts.selected", alias = "hts")
+ context.stage("data.census.filtered", alias="census")
+ context.stage("data.hts.selected", alias="hts")
context.config("output_path")
+
def execute(context):
df_census = context.stage("census")
df_hts = context.stage("hts")[1]
@@ -19,14 +21,16 @@ def execute(context):
f_census = df_census["socioprofessional_class"] == value
f_hts = df_hts["socioprofessional_class"] == value
- df_output.append({
- "value": value,
- "census_count": np.count_nonzero(f_census),
- "hts_count": np.count_nonzero(f_hts),
- "census_weight": df_census[f_census]["weight"].sum(),
- "hts_weight": df_hts[f_hts]["person_weight"].sum()
- })
+ df_output.append(
+ {
+ "value": value,
+ "census_count": np.count_nonzero(f_census),
+ "hts_count": np.count_nonzero(f_hts),
+ "census_weight": df_census[f_census]["weight"].sum(),
+ "hts_weight": df_hts[f_hts]["person_weight"].sum(),
+ }
+ )
pd.DataFrame.from_records(df_output).to_csv(
- "{}/debug_sc.csv".format(context.config("output_path")),
- sep = ";", index = False)
+ "{}/debug_sc.csv".format(context.config("output_path")), sep=";", index=False
+ )
diff --git a/analysis/grid/comparison_flow_volume.py b/analysis/grid/comparison_flow_volume.py
index b2506ea1..4201d3d6 100644
--- a/analysis/grid/comparison_flow_volume.py
+++ b/analysis/grid/comparison_flow_volume.py
@@ -1,116 +1,268 @@
import pandas as pd
import geopandas as gpd
-import plotly.express as px
+import plotly.express as px
SAMPLING_RATE = 0.05
+
def configure(context):
- if not context.config("analysis_from_file",False) :
+ if not context.config("analysis_from_file", False):
context.stage("synthesis.population.trips")
context.stage("synthesis.population.spatial.locations")
context.stage("synthesis.population.enriched")
context.stage("data.spatial.departments")
- context.config("comparison_file_prefix",None)
+ context.config("comparison_file_prefix", None)
context.config("output_prefix", "ile_de_france_")
context.config("output_formats", ["csv", "gpkg"])
context.config("output_path")
context.config("data_path")
-def stat_grid(df_trips,df_locations,df_persons,df_grid):
-
+
+def stat_grid(df_trips, df_locations, df_persons, df_grid):
+
# Write spatial trips
- df_spatial = pd.merge(df_trips, df_locations[[
- "person_id", "activity_index", "geometry"
- ]].rename(columns = {
- "activity_index": "following_activity_index",
- }), how = "left", on = ["person_id", "following_activity_index"])
- df_spatial = pd.merge(df_spatial,df_persons,how = "left", on = ["person_id",])
- df_spatial = gpd.GeoDataFrame(df_spatial, crs = "EPSG:2154").to_crs("4326")
-
- df_stats = gpd.sjoin(df_grid,df_spatial,how="left")
- return df_stats[['id_carr_1km', 'geometry','person_id', 'following_purpose', 'household_id', 'age']]
+ df_spatial = pd.merge(
+ df_trips,
+ df_locations[["person_id", "activity_index", "geometry"]].rename(
+ columns={
+ "activity_index": "following_activity_index",
+ }
+ ),
+ how="left",
+ on=["person_id", "following_activity_index"],
+ )
+ df_spatial = pd.merge(
+ df_spatial,
+ df_persons,
+ how="left",
+ on=[
+ "person_id",
+ ],
+ )
+ df_spatial = gpd.GeoDataFrame(df_spatial, crs="EPSG:2154").to_crs("4326")
+
+ df_stats = gpd.sjoin(df_grid, df_spatial, how="left")
+ return df_stats[
+ [
+ "id_carr_1km",
+ "geometry",
+ "person_id",
+ "following_purpose",
+ "household_id",
+ "age",
+ ]
+ ]
+
+
def execute(context):
-
+
figures = {
- "Yrs:0-10":{"min_age": 0, "max_age": 10,},
- "Yrs:11-14":{"min_age": 11, "max_age": 14,},
- "Yrs:15-18":{"min_age": 15, "max_age": 17,},
- "Yrs:18-25":{"min_age": 18, "max_age": 25,},
- "Yrs:25-50":{"min_age": 26, "max_age": 50,},
- "Yrs:50-65":{"min_age": 51, "max_age": 65,},
- "Yrs:65-75":{"min_age": 66, "max_age": 75,},
- "Yrs:75+":{"min_age": 76, "max_age": 110,},}
- comparison_file = context.config("output_prefix") if context.config("comparison_file_prefix") is None else context.config("comparison_file_prefix")
-
+ "Yrs:0-10": {
+ "min_age": 0,
+ "max_age": 10,
+ },
+ "Yrs:11-14": {
+ "min_age": 11,
+ "max_age": 14,
+ },
+ "Yrs:15-18": {
+ "min_age": 15,
+ "max_age": 17,
+ },
+ "Yrs:18-25": {
+ "min_age": 18,
+ "max_age": 25,
+ },
+ "Yrs:25-50": {
+ "min_age": 26,
+ "max_age": 50,
+ },
+ "Yrs:50-65": {
+ "min_age": 51,
+ "max_age": 65,
+ },
+ "Yrs:65-75": {
+ "min_age": 66,
+ "max_age": 75,
+ },
+ "Yrs:75+": {
+ "min_age": 76,
+ "max_age": 110,
+ },
+ }
+ comparison_file = (
+ context.config("output_prefix")
+ if context.config("comparison_file_prefix") is None
+ else context.config("comparison_file_prefix")
+ )
+
if not context.config("analysis_from_file"):
print("Récupération simu données ...")
# from simulation cache
df_trips = context.stage("synthesis.population.trips")
- df_persons = context.stage("synthesis.population.enriched")[["person_id", "household_id","age"]]
- df_locations = context.stage("synthesis.population.spatial.locations")[[
- "person_id", "activity_index", "geometry"
- ]]
+ df_persons = context.stage("synthesis.population.enriched")[
+ ["person_id", "household_id", "age"]
+ ]
+ df_locations = context.stage("synthesis.population.spatial.locations")[
+ ["person_id", "activity_index", "geometry"]
+ ]
df_trips["preceding_activity_index"] = df_trips["trip_index"]
df_trips["following_activity_index"] = df_trips["trip_index"] + 1
- else :
+ else:
# from file trips, activites and person
print("Récupération données ...")
- df_trips = pd.read_csv(f'{context.config("output_path")}/{context.config("output_prefix")}trips.csv',sep=';')[["person_id","trip_index" ,"following_activity_index","following_purpose"]]
- df_locations = gpd.read_parquet(f'{context.config("output_path")}/{context.config("output_prefix")}activities.geoparquet') if "geoparquet" in context.config("output_formats") else gpd.read_file(f'{context.config("output_path")}/{context.config("output_prefix")}activities.gpkg')
- df_persons = pd.read_csv(f'{context.config("output_path")}/{context.config("output_prefix")}persons.csv',sep=';')[["person_id", "household_id","age"]]
+ df_trips = pd.read_csv(
+ f'{context.config("output_path")}/{context.config("output_prefix")}trips.csv',
+ sep=";",
+ )[["person_id", "trip_index", "following_activity_index", "following_purpose"]]
+ df_locations = (
+ gpd.read_parquet(
+ f'{context.config("output_path")}/{context.config("output_prefix")}activities.geoparquet'
+ )
+ if "geoparquet" in context.config("output_formats")
+ else gpd.read_file(
+ f'{context.config("output_path")}/{context.config("output_prefix")}activities.gpkg'
+ )
+ )
+ df_persons = pd.read_csv(
+ f'{context.config("output_path")}/{context.config("output_prefix")}persons.csv',
+ sep=";",
+ )[["person_id", "household_id", "age"]]
print("Récupération comp données ...")
- df_trips_comp = pd.read_csv(f'{context.config("output_path")}/{comparison_file}trips.csv',sep=';')[["person_id","trip_index" ,"following_activity_index","following_purpose"]]
- df_locations_comp = gpd.read_parquet(f'{context.config("output_path")}/{comparison_file}activities.geoparquet') if "geoparquet" in context.config("output_formats") else gpd.read_file(f'{context.config("output_path")}/{comparison_file}activities.gpkg')
- df_persons_comp = pd.read_csv(f'{context.config("output_path")}/{comparison_file}persons.csv',sep=';')[["person_id", "household_id","age"]]
-
+ df_trips_comp = pd.read_csv(
+ f'{context.config("output_path")}/{comparison_file}trips.csv', sep=";"
+ )[["person_id", "trip_index", "following_activity_index", "following_purpose"]]
+ df_locations_comp = (
+ gpd.read_parquet(
+ f'{context.config("output_path")}/{comparison_file}activities.geoparquet'
+ )
+ if "geoparquet" in context.config("output_formats")
+ else gpd.read_file(
+ f'{context.config("output_path")}/{comparison_file}activities.gpkg'
+ )
+ )
+ df_persons_comp = pd.read_csv(
+ f'{context.config("output_path")}/{comparison_file}persons.csv', sep=";"
+ )[["person_id", "household_id", "age"]]
+
list_purpose = list(df_trips["following_purpose"].unique())
# grid 1km of location data
df_departments = context.stage("data.spatial.departments")
poly_dep = df_departments.unary_union
df_grids = gpd.read_file(
- f'{context.config("data_path")}/grid/grille200m_metropole.gpkg',
- mask=poly_dep,
- )
+ f'{context.config("data_path")}/grid/grille200m_metropole.gpkg',
+ mask=poly_dep,
+ )
df_grids = df_grids.to_crs("4326")
- df_grid = df_grids[["id_carr_1km","geometry"]].dissolve(by="id_carr_1km").reset_index()
+ df_grid = (
+ df_grids[["id_carr_1km", "geometry"]].dissolve(by="id_carr_1km").reset_index()
+ )
- df_stats = stat_grid(df_trips,df_locations,df_persons,df_grid)
- df_grids = stat_grid(df_trips_comp,df_locations_comp,df_persons_comp,df_grid)
- point = df_grid.unary_union.centroid # a changé avec ploy_dep
+ df_stats = stat_grid(df_trips, df_locations, df_persons, df_grid)
+ df_grids = stat_grid(df_trips_comp, df_locations_comp, df_persons_comp, df_grid)
+    point = df_grid.unary_union.centroid  # changed with poly_dep
print("Printing grids...")
for prefix, figure in figures.items():
- df_select_age = df_stats[df_stats["age"].between(figure["min_age"],figure["max_age"])]
- df_select_age = df_select_age.dissolve(by=["id_carr_1km","following_purpose"],aggfunc="count").reset_index()
+ df_select_age = df_stats[
+ df_stats["age"].between(figure["min_age"], figure["max_age"])
+ ]
+ df_select_age = df_select_age.dissolve(
+ by=["id_carr_1km", "following_purpose"], aggfunc="count"
+ ).reset_index()
df_select_age = df_select_age[~(df_select_age["geometry"].isna())]
- df_select_age["following_purpose"] = df_select_age["following_purpose"].astype('str')
+ df_select_age["following_purpose"] = df_select_age["following_purpose"].astype(
+ "str"
+ )
- df_grids_age = df_grids[df_grids["age"].between(figure["min_age"],figure["max_age"])]
- df_grids_age = df_grids_age.dissolve(by=["id_carr_1km","following_purpose"],aggfunc="count").reset_index()
+ df_grids_age = df_grids[
+ df_grids["age"].between(figure["min_age"], figure["max_age"])
+ ]
+ df_grids_age = df_grids_age.dissolve(
+ by=["id_carr_1km", "following_purpose"], aggfunc="count"
+ ).reset_index()
df_grids_age = df_grids_age[~(df_grids_age["geometry"].isna())]
- df_grids_age["following_purpose"] = df_grids_age["following_purpose"].astype('str')
-
- for purpose in list_purpose :
- df_select = df_select_age[df_select_age["following_purpose"]==purpose].rename(columns={"person_id":"count"})
- df_grids_select = df_grids_age[df_grids_age["following_purpose"]==purpose].rename(columns={"person_id":"count"})
- if context.config("output_prefix") == comparison_file :
- df_select = gpd.sjoin(df_select,df_grid,how='right',predicate="contains").fillna(0)
- df_select = df_select[df_select["count"] != 0]
- fig = px.choropleth_mapbox(df_select,geojson=df_select.geometry,locations=df_select.index,color="count", opacity= 0.7,color_continuous_scale='reds',
- mapbox_style = 'open-street-map',center=dict(lat= point.y,lon=point.x),title=f"Localisation flow distribution for {prefix} group with {purpose} purpose")
- fig.write_html(f'{context.config("output_path")}/{context.config("output_prefix")}{prefix}_{purpose}.html')
- else :
- df_grids_select = gpd.sjoin(df_grids_select,df_grid,how='right',predicate="contains").fillna(0)
- df_select = gpd.sjoin(df_select,df_grids_select.drop(columns=[ 'index_left']),how='right',predicate="contains").rename(columns={"count_left":"volume_studied_simu","count_right":"volume_compared_simu"}).fillna(0)
- df_select["volume_difference"] = df_select["volume_studied_simu"] - df_select["volume_compared_simu"]
- df_select = df_select[(df_select["volume_studied_simu"] != 0 )| (df_select["volume_compared_simu"] != 0)]
- df_select["pourcentage_vol"] = df_select["volume_difference"] / df_select["volume_compared_simu"]
- px.choropleth_mapbox(df_select,geojson=df_select.geometry,locations=df_select.index,color="volume_difference", opacity= 0.7,color_continuous_scale="picnic", color_continuous_midpoint= 0,hover_name="id_carr_1km_right", hover_data=["volume_studied_simu", "volume_compared_simu","pourcentage_vol"],
- mapbox_style = 'open-street-map',center=dict(lat= point.y,lon=point.x),title=f"Comparison flow distribution with previous simulation for {prefix} group with {purpose} purpose").write_html(f'{context.config("output_path")}/{context.config("output_prefix")}{prefix}_{purpose}.html')
-
-
\ No newline at end of file
+ df_grids_age["following_purpose"] = df_grids_age["following_purpose"].astype(
+ "str"
+ )
+
+ for purpose in list_purpose:
+ df_select = df_select_age[
+ df_select_age["following_purpose"] == purpose
+ ].rename(columns={"person_id": "count"})
+ df_grids_select = df_grids_age[
+ df_grids_age["following_purpose"] == purpose
+ ].rename(columns={"person_id": "count"})
+ if context.config("output_prefix") == comparison_file:
+ df_select = gpd.sjoin(
+ df_select, df_grid, how="right", predicate="contains"
+ ).fillna(0)
+ df_select = df_select[df_select["count"] != 0]
+ fig = px.choropleth_mapbox(
+ df_select,
+ geojson=df_select.geometry,
+ locations=df_select.index,
+ color="count",
+ opacity=0.7,
+ color_continuous_scale="reds",
+ mapbox_style="open-street-map",
+ center=dict(lat=point.y, lon=point.x),
+                    title=f"Location flow distribution for {prefix} group with {purpose} purpose",
+ )
+ fig.write_html(
+ f'{context.config("output_path")}/{context.config("output_prefix")}{prefix}_{purpose}.html'
+ )
+ else:
+ df_grids_select = gpd.sjoin(
+ df_grids_select, df_grid, how="right", predicate="contains"
+ ).fillna(0)
+ df_select = (
+ gpd.sjoin(
+ df_select,
+ df_grids_select.drop(columns=["index_left"]),
+ how="right",
+ predicate="contains",
+ )
+ .rename(
+ columns={
+ "count_left": "volume_studied_simu",
+ "count_right": "volume_compared_simu",
+ }
+ )
+ .fillna(0)
+ )
+ df_select["volume_difference"] = (
+ df_select["volume_studied_simu"] - df_select["volume_compared_simu"]
+ )
+ df_select = df_select[
+ (df_select["volume_studied_simu"] != 0)
+ | (df_select["volume_compared_simu"] != 0)
+ ]
+ df_select["pourcentage_vol"] = (
+ df_select["volume_difference"] / df_select["volume_compared_simu"]
+ )
+ px.choropleth_mapbox(
+ df_select,
+ geojson=df_select.geometry,
+ locations=df_select.index,
+ color="volume_difference",
+ opacity=0.7,
+ color_continuous_scale="picnic",
+ color_continuous_midpoint=0,
+ hover_name="id_carr_1km_right",
+ hover_data=[
+ "volume_studied_simu",
+ "volume_compared_simu",
+ "pourcentage_vol",
+ ],
+ mapbox_style="open-street-map",
+ center=dict(lat=point.y, lon=point.x),
+ title=f"Comparison flow distribution with previous simulation for {prefix} group with {purpose} purpose",
+ ).write_html(
+ f'{context.config("output_path")}/{context.config("output_prefix")}{prefix}_{purpose}.html'
+ )
diff --git a/analysis/marginals.py b/analysis/marginals.py
index 98baf359..7e78720f 100644
--- a/analysis/marginals.py
+++ b/analysis/marginals.py
@@ -20,25 +20,44 @@
CENSUS_PERSON_MARGINALS = GENERAL_PERSON_MARGINALS + [("socioprofessional_class",)]
CENSUS_HOUSEHOLD_MARGINALS = GENERAL_HOUSEHOLD_MARGINALS
-HTS_PERSON_MARGINALS = GENERAL_PERSON_MARGINALS + [("has_license",), ("has_pt_subscription",)]
+HTS_PERSON_MARGINALS = GENERAL_PERSON_MARGINALS + [
+ ("has_license",),
+ ("has_pt_subscription",),
+]
HTS_HOUSEHOLD_MARGINALS = GENERAL_HOUSEHOLD_MARGINALS + [("number_of_bikes_class",)]
SOCIOPROFESIONAL_CLASS_LABELS = [
- "???", "Agriculture", "Independent", "Science", "Intermediate", "Employee", "Worker", "Retired", "Other"
+ "???",
+ "Agriculture",
+ "Independent",
+ "Science",
+ "Intermediate",
+ "Employee",
+ "Worker",
+ "Retired",
+ "Other",
]
+
def prepare_classes(df):
if "age" in df:
- df["age_class"] = np.digitize(df["age"], AGE_CLASS_BOUNDS, right = True)
+ df["age_class"] = np.digitize(df["age"], AGE_CLASS_BOUNDS, right=True)
if "household_size" in df:
- df["household_size_class"] = np.digitize(df["household_size"], HOUSEHOLD_SIZE_BOUNDS, right = True)
+ df["household_size_class"] = np.digitize(
+ df["household_size"], HOUSEHOLD_SIZE_BOUNDS, right=True
+ )
if "number_of_vehicles" in df:
- df["number_of_vehicles_class"] = np.digitize(df["number_of_vehicles"], NUMBER_OF_VEHICLES_BOUNDS, right = True)
+ df["number_of_vehicles_class"] = np.digitize(
+ df["number_of_vehicles"], NUMBER_OF_VEHICLES_BOUNDS, right=True
+ )
if "number_of_bikes" in df:
- df["number_of_bikes_class"] = np.digitize(df["number_of_bikes"], NUMBER_OF_BIKES_BOUNDS, right = True)
+ df["number_of_bikes_class"] = np.digitize(
+ df["number_of_bikes"], NUMBER_OF_BIKES_BOUNDS, right=True
+ )
+
def cross(*marginals):
result = []
@@ -56,6 +75,7 @@ def cross(*marginals):
return list(set(result))
+
def combine(*marginals):
result = []
@@ -64,21 +84,22 @@ def combine(*marginals):
return list(set(result))
+
ALL_PERSON_MARGINALS = combine(CENSUS_PERSON_MARGINALS, HTS_PERSON_MARGINALS)
ALL_HOUSEHOLD_MARGINALS = combine(CENSUS_HOUSEHOLD_MARGINALS, HTS_HOUSEHOLD_MARGINALS)
SPATIAL_MARGINALS = [("departement_id",), ("commune_id",)]
ANALYSIS_PERSON_MARGINALS = combine(
- ALL_PERSON_MARGINALS, ALL_HOUSEHOLD_MARGINALS,
+ ALL_PERSON_MARGINALS,
+ ALL_HOUSEHOLD_MARGINALS,
cross(ALL_PERSON_MARGINALS, ALL_PERSON_MARGINALS),
cross(ALL_HOUSEHOLD_MARGINALS, ALL_HOUSEHOLD_MARGINALS),
- cross(ALL_PERSON_MARGINALS, ALL_HOUSEHOLD_MARGINALS)
+ cross(ALL_PERSON_MARGINALS, ALL_HOUSEHOLD_MARGINALS),
)
ANALYSIS_HOUSEHOLD_MARGINALS = combine(
- ALL_HOUSEHOLD_MARGINALS,
- cross(ALL_HOUSEHOLD_MARGINALS, ALL_HOUSEHOLD_MARGINALS)
+ ALL_HOUSEHOLD_MARGINALS, cross(ALL_HOUSEHOLD_MARGINALS, ALL_HOUSEHOLD_MARGINALS)
)
SPATIAL_PERSON_MARGINALS = combine(
diff --git a/analysis/methods/income/compare_methods.py b/analysis/methods/income/compare_methods.py
index d8573521..f4f545cd 100644
--- a/analysis/methods/income/compare_methods.py
+++ b/analysis/methods/income/compare_methods.py
@@ -29,10 +29,17 @@ def execute(context):
df_population = add_household_type_attribute(df_population)
# get most populated commune
- commune_id = df_population.groupby(["commune_id"], observed=True)["commune_id"].count().drop("undefined").idxmax()
+ commune_id = (
+ df_population.groupby(["commune_id"], observed=True)["commune_id"]
+ .count()
+ .drop("undefined")
+ .idxmax()
+ )
# get income distributions by attributes
- income_df = context.stage("data.income.municipality").query(f"commune_id == '{commune_id}'")
+ income_df = context.stage("data.income.municipality").query(
+ f"commune_id == '{commune_id}'"
+ )
income_df = income_df.rename(
columns={
"value": "modality",
@@ -48,22 +55,22 @@ def execute(context):
}
)
- households_with_attributes = df_population[[
- "household_id", "commune_id", "size", "family_comp"
- ]].drop_duplicates("household_id")
+ households_with_attributes = df_population[
+ ["household_id", "commune_id", "size", "family_comp"]
+ ].drop_duplicates("household_id")
# get enriched population with different methods
uniform_pop_df = context.stage("uniform")
uniform_pop_df = uniform_pop_df.merge(households_with_attributes, on="household_id")
uniform_pop_df["household_income"] = (
- uniform_pop_df["household_income"] * 12 / uniform_pop_df["consumption_units"]
+ uniform_pop_df["household_income"] * 12 / uniform_pop_df["consumption_units"]
)
uniform_pop_df = uniform_pop_df.query(f"commune_id == '{commune_id}'")
bhepop2_pop_df = context.stage("bhepop2")
bhepop2_pop_df = bhepop2_pop_df.merge(households_with_attributes, on="household_id")
bhepop2_pop_df["household_income"] = (
- bhepop2_pop_df["household_income"] * 12 / bhepop2_pop_df["consumption_units"]
+ bhepop2_pop_df["household_income"] * 12 / bhepop2_pop_df["consumption_units"]
)
bhepop2_pop_df = bhepop2_pop_df.query(f"commune_id == '{commune_id}'")
@@ -76,28 +83,29 @@ def execute(context):
["size", "family_comp"],
0,
relative_maximum=MAXIMUM_INCOME_FACTOR,
- delta_min=1000
+ delta_min=1000,
)
# check output folder existence
- compare_output_path = os.path.join(context.config("output_path"), COMPARE_INCOME_FOLDER)
+ compare_output_path = os.path.join(
+ context.config("output_path"), COMPARE_INCOME_FOLDER
+ )
if not os.path.exists(compare_output_path):
os.mkdir(compare_output_path)
# create an analysis instance
analysis = marginal_distributions_source.compare_with_populations(
- {
- "Uniform": uniform_pop_df,
- "Bhepop2": bhepop2_pop_df
- },
+ {"Uniform": uniform_pop_df, "Bhepop2": bhepop2_pop_df},
feature_name="household_income",
- output_folder=compare_output_path
+ output_folder=compare_output_path,
+ )
+ analysis.plot_title_format = (
+ analysis.plot_title_format + f" \n(commune={commune_id})"
)
- analysis.plot_title_format = analysis.plot_title_format + f" \n(commune={commune_id})"
analysis.generate_analysis_plots()
analysis.generate_analysis_error_table()
- print(f"Generated compared analysis of income assignation methods in {compare_output_path}")
-
-
+ print(
+        f"Generated comparative analysis of income assignment methods in {compare_output_path}"
+ )
diff --git a/analysis/reference/census/sociodemographics.py b/analysis/reference/census/sociodemographics.py
index 47c6204d..203c8d4f 100644
--- a/analysis/reference/census/sociodemographics.py
+++ b/analysis/reference/census/sociodemographics.py
@@ -1,34 +1,39 @@
import analysis.statistics as stats
import analysis.marginals as marginals
+
def configure(context):
context.stage("data.census.filtered")
+
def execute(context):
person_marginals = marginals.combine(
marginals.TOTAL_MARGINAL,
-
marginals.CENSUS_PERSON_MARGINALS,
marginals.CENSUS_HOUSEHOLD_MARGINALS,
-
- marginals.cross(marginals.CENSUS_PERSON_MARGINALS, marginals.CENSUS_PERSON_MARGINALS),
- marginals.cross(marginals.CENSUS_HOUSEHOLD_MARGINALS, marginals.CENSUS_HOUSEHOLD_MARGINALS),
-
- marginals.cross(marginals.CENSUS_PERSON_MARGINALS, marginals.CENSUS_HOUSEHOLD_MARGINALS),
-
+ marginals.cross(
+ marginals.CENSUS_PERSON_MARGINALS, marginals.CENSUS_PERSON_MARGINALS
+ ),
+ marginals.cross(
+ marginals.CENSUS_HOUSEHOLD_MARGINALS, marginals.CENSUS_HOUSEHOLD_MARGINALS
+ ),
+ marginals.cross(
+ marginals.CENSUS_PERSON_MARGINALS, marginals.CENSUS_HOUSEHOLD_MARGINALS
+ ),
marginals.SPATIAL_MARGINALS,
- marginals.cross(marginals.SPATIAL_MARGINALS, marginals.CENSUS_PERSON_MARGINALS)
+ marginals.cross(marginals.SPATIAL_MARGINALS, marginals.CENSUS_PERSON_MARGINALS),
)
household_marginals = marginals.combine(
marginals.TOTAL_MARGINAL,
-
marginals.CENSUS_HOUSEHOLD_MARGINALS,
-
- marginals.cross(marginals.CENSUS_HOUSEHOLD_MARGINALS, marginals.CENSUS_HOUSEHOLD_MARGINALS),
-
+ marginals.cross(
+ marginals.CENSUS_HOUSEHOLD_MARGINALS, marginals.CENSUS_HOUSEHOLD_MARGINALS
+ ),
marginals.SPATIAL_MARGINALS,
- marginals.cross(marginals.SPATIAL_MARGINALS, marginals.CENSUS_HOUSEHOLD_MARGINALS)
+ marginals.cross(
+ marginals.SPATIAL_MARGINALS, marginals.CENSUS_HOUSEHOLD_MARGINALS
+ ),
)
df_persons = context.stage("data.census.filtered")
@@ -37,6 +42,6 @@ def execute(context):
df_households = df_persons.drop_duplicates("household_id").copy()
return dict(
- person = stats.marginalize(df_persons, person_marginals),
- household = stats.marginalize(df_households, household_marginals)
+ person=stats.marginalize(df_persons, person_marginals),
+ household=stats.marginalize(df_households, household_marginals),
)
diff --git a/analysis/reference/hts/activities.py b/analysis/reference/hts/activities.py
index f1268709..b098e95a 100644
--- a/analysis/reference/hts/activities.py
+++ b/analysis/reference/hts/activities.py
@@ -1,14 +1,21 @@
import pandas as pd
import numpy as np
+
def configure(context):
- context.stage("data.hts.selected", alias = "hts")
+ context.stage("data.hts.selected", alias="hts")
+
PURPOSE_MAPPING = {
- "home": "h", "work": "w", "education": "e",
- "shop": "s", "leisure": "l", "other": "o"
+ "home": "h",
+ "work": "w",
+ "education": "e",
+ "shop": "s",
+ "leisure": "l",
+ "other": "o",
}
+
def execute(context):
df_households, df_persons, df_activities = context.stage("hts")
@@ -36,13 +43,37 @@ def execute(context):
df_last["is_first"] = False
df_last["is_last"] = True
- df_activities = pd.concat([
- df_activities[["person_id", "activity_id", "purpose", "start_time", "end_time", "is_first", "is_last"]],
- df_last[["person_id", "activity_id", "purpose", "start_time", "end_time", "is_first", "is_last"]]
- ]).sort_values(by = ["person_id", "activity_id"])
+ df_activities = pd.concat(
+ [
+ df_activities[
+ [
+ "person_id",
+ "activity_id",
+ "purpose",
+ "start_time",
+ "end_time",
+ "is_first",
+ "is_last",
+ ]
+ ],
+ df_last[
+ [
+ "person_id",
+ "activity_id",
+ "purpose",
+ "start_time",
+ "end_time",
+ "is_first",
+ "is_last",
+ ]
+ ],
+ ]
+ ).sort_values(by=["person_id", "activity_id"])
# Add activities for people without trips
- df_missing = df_persons[~df_persons["person_id"].isin(df_activities["person_id"])][["person_id"]]
+ df_missing = df_persons[~df_persons["person_id"].isin(df_activities["person_id"])][
+ ["person_id"]
+ ]
df_missing["activity_id"] = 0
df_missing["purpose"] = "home"
diff --git a/analysis/reference/hts/chains.py b/analysis/reference/hts/chains.py
index 9f7cd0dc..b10730d5 100644
--- a/analysis/reference/hts/chains.py
+++ b/analysis/reference/hts/chains.py
@@ -5,29 +5,47 @@
import analysis.statistics as stats
import analysis.marginals as marginals
-from analysis.chains import aggregate_chains, CHAIN_MARGINALS, CHAIN_LENGTH_LIMIT, CHAIN_TOP_K
+from analysis.chains import (
+ aggregate_chains,
+ CHAIN_MARGINALS,
+ CHAIN_LENGTH_LIMIT,
+ CHAIN_TOP_K,
+)
+
def configure(context):
context.stage("analysis.reference.hts.activities")
- context.stage("data.hts.selected", alias = "hts")
+ context.stage("data.hts.selected", alias="hts")
+
def execute(context):
- df_chains = context.stage("analysis.reference.hts.activities")[[
- "person_id", "activity_id", "purpose"
- ]].sort_values(by = ["person_id", "activity_id"])
+ df_chains = context.stage("analysis.reference.hts.activities")[
+ ["person_id", "activity_id", "purpose"]
+ ].sort_values(by=["person_id", "activity_id"])
df_chains = aggregate_chains(df_chains)
df_population = context.stage("hts")[1]
marginals.prepare_classes(df_population)
- df_chains = pd.merge(df_population[["person_id", "age_class", "sex", "person_weight", "age"]], df_chains, on = "person_id")
- df_chains["chain_length_class"] = np.minimum(df_chains["chain_length"], CHAIN_LENGTH_LIMIT)
-
- top_k_chains = df_chains.groupby("chain")["person_weight"].sum().reset_index().sort_values(
- by = "person_weight", ascending = False
- ).head(CHAIN_TOP_K)["chain"].values
+ df_chains = pd.merge(
+ df_population[["person_id", "age_class", "sex", "person_weight", "age"]],
+ df_chains,
+ on="person_id",
+ )
+ df_chains["chain_length_class"] = np.minimum(
+ df_chains["chain_length"], CHAIN_LENGTH_LIMIT
+ )
+
+ top_k_chains = (
+ df_chains.groupby("chain")["person_weight"]
+ .sum()
+ .reset_index()
+ .sort_values(by="person_weight", ascending=False)
+ .head(CHAIN_TOP_K)["chain"]
+ .values
+ )
df_chains = df_chains[df_chains["chain"].isin(top_k_chains)]
df_chains["age_range"] = (df_chains["age"] >= 18) & (df_chains["age"] <= 40)
- return stats.marginalize(df_chains, CHAIN_MARGINALS, weight_column = "person_weight")
+ return stats.marginalize(df_chains, CHAIN_MARGINALS, weight_column="person_weight")
diff --git a/analysis/reference/hts/commute_distance.py b/analysis/reference/hts/commute_distance.py
index 70cd8931..72897de6 100644
--- a/analysis/reference/hts/commute_distance.py
+++ b/analysis/reference/hts/commute_distance.py
@@ -5,33 +5,49 @@
import analysis.statistics as stats
import analysis.marginals as marginals
+
def configure(context):
- context.stage("data.hts.selected", alias = "hts")
+ context.stage("data.hts.selected", alias="hts")
+
def execute(context):
- df_weight = context.stage("hts")[1][["person_id", "person_weight"]].rename(columns = { "person_weight": "weight" })
- df_trips = pd.merge(context.stage("hts")[2], df_weight, on = "person_id")
+ df_weight = context.stage("hts")[1][["person_id", "person_weight"]].rename(
+ columns={"person_weight": "weight"}
+ )
+ df_trips = pd.merge(context.stage("hts")[2], df_weight, on="person_id")
# Prepare data frames
df_work = df_trips[
- ((df_trips["preceding_purpose"] == "home") & (df_trips["following_purpose"] == "work")) |
- ((df_trips["preceding_purpose"] == "work") & (df_trips["following_purpose"] == "home"))
- ].drop_duplicates("person_id", keep = "first")[["euclidean_distance", "weight"]]
+ (
+ (df_trips["preceding_purpose"] == "home")
+ & (df_trips["following_purpose"] == "work")
+ )
+ | (
+ (df_trips["preceding_purpose"] == "work")
+ & (df_trips["following_purpose"] == "home")
+ )
+ ].drop_duplicates("person_id", keep="first")[["euclidean_distance", "weight"]]
df_education = df_trips[
- ((df_trips["preceding_purpose"] == "home") & (df_trips["following_purpose"] == "education")) |
- ((df_trips["preceding_purpose"] == "education") & (df_trips["following_purpose"] == "home"))
- ].drop_duplicates("person_id", keep = "first")[["euclidean_distance", "weight"]]
+ (
+ (df_trips["preceding_purpose"] == "home")
+ & (df_trips["following_purpose"] == "education")
+ )
+ | (
+ (df_trips["preceding_purpose"] == "education")
+ & (df_trips["following_purpose"] == "home")
+ )
+ ].drop_duplicates("person_id", keep="first")[["euclidean_distance", "weight"]]
# Prepare distributions
- df_work = df_work.sort_values(by = "euclidean_distance")
+ df_work = df_work.sort_values(by="euclidean_distance")
df_work["cdf"] = np.cumsum(df_work["weight"])
df_work["cdf"] /= df_work["cdf"].max()
df_work = df_work[["euclidean_distance", "cdf"]]
- df_education = df_education.sort_values(by = "euclidean_distance")
+ df_education = df_education.sort_values(by="euclidean_distance")
df_education["cdf"] = np.cumsum(df_education["weight"])
df_education["cdf"] /= df_education["cdf"].max()
df_education = df_education[["euclidean_distance", "cdf"]]
- return dict(work = df_work, education = df_education)
+ return dict(work=df_work, education=df_education)
diff --git a/analysis/reference/hts/commute_flow.py b/analysis/reference/hts/commute_flow.py
index 5a922409..6cf2722f 100644
--- a/analysis/reference/hts/commute_flow.py
+++ b/analysis/reference/hts/commute_flow.py
@@ -5,64 +5,115 @@
import analysis.statistics as stats
import analysis.marginals as marginals
+
def configure(context):
- context.stage("data.hts.selected", alias = "hts")
+ context.stage("data.hts.selected", alias="hts")
-def execute(context):
- df_weight = context.stage("hts")[1][["person_id", "person_weight"]].rename(columns = { "person_weight": "weight" })
- df_trips = context.stage("hts")[2][[
- "person_id", "origin_departement_id", "destination_departement_id",
- "preceding_purpose", "following_purpose"
- ]]
+def execute(context):
+ df_weight = context.stage("hts")[1][["person_id", "person_weight"]].rename(
+ columns={"person_weight": "weight"}
+ )
+
+ df_trips = context.stage("hts")[2][
+ [
+ "person_id",
+ "origin_departement_id",
+ "destination_departement_id",
+ "preceding_purpose",
+ "following_purpose",
+ ]
+ ]
# Prepare homes
- df_homes = df_trips[df_trips["preceding_purpose"] == "home"][["person_id", "origin_departement_id"]].rename(
- columns = { "origin_departement_id": "home" }
- ).drop_duplicates("person_id")
+ df_homes = (
+ df_trips[df_trips["preceding_purpose"] == "home"][
+ ["person_id", "origin_departement_id"]
+ ]
+ .rename(columns={"origin_departement_id": "home"})
+ .drop_duplicates("person_id")
+ )
# Calculate work
- df_work = df_trips[df_trips["following_purpose"] == "work"][["person_id", "destination_departement_id"]].rename(
- columns = { "destination_departement_id": "work" }
- ).drop_duplicates("person_id")
+ df_work = (
+ df_trips[df_trips["following_purpose"] == "work"][
+ ["person_id", "destination_departement_id"]
+ ]
+ .rename(columns={"destination_departement_id": "work"})
+ .drop_duplicates("person_id")
+ )
- df_work = pd.merge(df_homes, df_work, on = "person_id")
- df_work = pd.merge(df_work, df_weight, on = "person_id", how = "left")
+ df_work = pd.merge(df_homes, df_work, on="person_id")
+ df_work = pd.merge(df_work, df_weight, on="person_id", how="left")
df_work = df_work.groupby(["home", "work"])["weight"].sum()
df_work = df_work.reset_index()
# Calculate education
- df_education = df_trips[df_trips["following_purpose"] == "education"][["person_id", "destination_departement_id"]].rename(
- columns = { "destination_departement_id": "education" }
- ).drop_duplicates("person_id")
+ df_education = (
+ df_trips[df_trips["following_purpose"] == "education"][
+ ["person_id", "destination_departement_id"]
+ ]
+ .rename(columns={"destination_departement_id": "education"})
+ .drop_duplicates("person_id")
+ )
- df_education = pd.merge(df_homes, df_education, on = "person_id")
- df_education = pd.merge(df_education, df_weight, on = "person_id", how = "left")
+ df_education = pd.merge(df_homes, df_education, on="person_id")
+ df_education = pd.merge(df_education, df_weight, on="person_id", how="left")
df_education = df_education.groupby(["home", "education"])["weight"].sum()
df_education = df_education.reset_index()
# Calculate corrections for employed non-movers
- df_existing = context.stage("hts")[1][["employed", "departement_id", "person_weight"]].rename(columns = { "person_weight": "weight", "departement_id": "home" })
+ df_existing = context.stage("hts")[1][
+ ["employed", "departement_id", "person_weight"]
+ ].rename(columns={"person_weight": "weight", "departement_id": "home"})
df_existing = df_existing[df_existing["employed"]]
- df_existing = df_existing.groupby("home")["weight"].sum().reset_index().rename(columns = { "weight": "existing" })
-
- df_origin = df_work.groupby("home")["weight"].sum().reset_index().rename(columns = { "weight": "active" })
-
- df_work_correction = pd.merge(df_existing, df_origin, on = "home")
- df_work_correction["factor"] = df_work_correction["active"] / df_work_correction["existing"]
+ df_existing = (
+ df_existing.groupby("home")["weight"]
+ .sum()
+ .reset_index()
+ .rename(columns={"weight": "existing"})
+ )
+
+ df_origin = (
+ df_work.groupby("home")["weight"]
+ .sum()
+ .reset_index()
+ .rename(columns={"weight": "active"})
+ )
+
+ df_work_correction = pd.merge(df_existing, df_origin, on="home")
+ df_work_correction["factor"] = (
+ df_work_correction["active"] / df_work_correction["existing"]
+ )
df_work_correction = df_work_correction[["home", "factor"]]
# Calculate corrections for studying non-movers
- df_existing = context.stage("hts")[1][["studies", "departement_id", "person_weight"]].rename(columns = { "person_weight": "weight", "departement_id": "home" })
+ df_existing = context.stage("hts")[1][
+ ["studies", "departement_id", "person_weight"]
+ ].rename(columns={"person_weight": "weight", "departement_id": "home"})
df_existing = df_existing[df_existing["studies"]]
- df_existing = df_existing.groupby("home")["weight"].sum().reset_index().rename(columns = { "weight": "existing" })
-
- df_origin = df_education.groupby("home")["weight"].sum().reset_index().rename(columns = { "weight": "active" })
-
- df_education_correction = pd.merge(df_existing, df_origin, on = "home")
- df_education_correction["factor"] = df_education_correction["active"] / df_education_correction["existing"]
+ df_existing = (
+ df_existing.groupby("home")["weight"]
+ .sum()
+ .reset_index()
+ .rename(columns={"weight": "existing"})
+ )
+
+ df_origin = (
+ df_education.groupby("home")["weight"]
+ .sum()
+ .reset_index()
+ .rename(columns={"weight": "active"})
+ )
+
+ df_education_correction = pd.merge(df_existing, df_origin, on="home")
+ df_education_correction["factor"] = (
+ df_education_correction["active"] / df_education_correction["existing"]
+ )
df_education_correction = df_education_correction[["home", "factor"]]
- return dict(work = df_work, education = df_education), dict(work = df_work_correction, education = df_education_correction)
+ return dict(work=df_work, education=df_education), dict(
+ work=df_work_correction, education=df_education_correction
+ )
diff --git a/analysis/reference/hts/mode_distances.py b/analysis/reference/hts/mode_distances.py
index 6b556bd5..f347a72c 100644
--- a/analysis/reference/hts/mode_distances.py
+++ b/analysis/reference/hts/mode_distances.py
@@ -1,9 +1,11 @@
import pandas as pd
import numpy as np
+
def configure(context):
context.stage("data.hts.selected")
+
def execute(context):
df_households, df_persons, df_trips = context.stage("data.hts.selected")
df = pd.merge(df_trips, df_persons[["person_id", "person_weight"]])
@@ -12,11 +14,13 @@ def execute(context):
df["travel_time"] = df["arrival_time"] - df["departure_time"]
primary_activities = ["home", "work", "education"]
- #primary_activities = []
- df = df[~(
- df["preceding_purpose"].isin(primary_activities) &
- df["following_purpose"].isin(primary_activities)
- )]
+ # primary_activities = []
+ df = df[
+ ~(
+ df["preceding_purpose"].isin(primary_activities)
+ & df["following_purpose"].isin(primary_activities)
+ )
+ ]
data = dict()
@@ -32,6 +36,6 @@ def execute(context):
cdf = np.cumsum(weights[sorter])
cdf /= cdf[-1]
- data[mode] = dict(values = values, cdf = cdf)
+ data[mode] = dict(values=values, cdf=cdf)
return data
diff --git a/analysis/reference/hts/sociodemographics.py b/analysis/reference/hts/sociodemographics.py
index d6acb58f..ad64a5d9 100644
--- a/analysis/reference/hts/sociodemographics.py
+++ b/analysis/reference/hts/sociodemographics.py
@@ -2,8 +2,10 @@
import analysis.marginals as marginals
import pandas as pd
+
def configure(context):
- context.stage("data.hts.selected", alias = "hts")
+ context.stage("data.hts.selected", alias="hts")
+
def execute(context):
df_households, df_persons, _ = context.stage("hts")
@@ -13,7 +15,7 @@ def execute(context):
household_columns -= person_columns
household_columns.add("household_id")
- df = pd.merge(df_persons, df_households[household_columns], on = "household_id")
+ df = pd.merge(df_persons, df_households[household_columns], on="household_id")
assert len(df_persons) == len(df)
df_persons = df
@@ -21,36 +23,36 @@ def execute(context):
person_marginals = marginals.combine(
marginals.TOTAL_MARGINAL,
-
marginals.HTS_PERSON_MARGINALS,
marginals.HTS_HOUSEHOLD_MARGINALS,
-
marginals.cross(marginals.HTS_PERSON_MARGINALS, marginals.HTS_PERSON_MARGINALS),
- marginals.cross(marginals.HTS_HOUSEHOLD_MARGINALS, marginals.HTS_HOUSEHOLD_MARGINALS),
-
- marginals.cross(marginals.HTS_PERSON_MARGINALS, marginals.HTS_HOUSEHOLD_MARGINALS),
-
+ marginals.cross(
+ marginals.HTS_HOUSEHOLD_MARGINALS, marginals.HTS_HOUSEHOLD_MARGINALS
+ ),
+ marginals.cross(
+ marginals.HTS_PERSON_MARGINALS, marginals.HTS_HOUSEHOLD_MARGINALS
+ ),
spatial_marginals,
- marginals.cross(spatial_marginals, marginals.HTS_PERSON_MARGINALS)
+ marginals.cross(spatial_marginals, marginals.HTS_PERSON_MARGINALS),
)
household_marginals = marginals.combine(
marginals.TOTAL_MARGINAL,
-
marginals.HTS_HOUSEHOLD_MARGINALS,
- marginals.cross(marginals.HTS_HOUSEHOLD_MARGINALS, marginals.HTS_HOUSEHOLD_MARGINALS),
-
+ marginals.cross(
+ marginals.HTS_HOUSEHOLD_MARGINALS, marginals.HTS_HOUSEHOLD_MARGINALS
+ ),
spatial_marginals,
- marginals.cross(spatial_marginals, marginals.HTS_HOUSEHOLD_MARGINALS)
+ marginals.cross(spatial_marginals, marginals.HTS_HOUSEHOLD_MARGINALS),
)
marginals.prepare_classes(df_persons)
df_households = df_persons.drop_duplicates("household_id").copy()
- df_persons = df_persons.rename(columns = { "person_weight": "weight" })
- df_households = df_households.rename(columns = { "household_weight": "weight" })
+ df_persons = df_persons.rename(columns={"person_weight": "weight"})
+ df_households = df_households.rename(columns={"household_weight": "weight"})
return dict(
- person = stats.marginalize(df_persons, person_marginals),
- household = stats.marginalize(df_households, household_marginals)
+ person=stats.marginalize(df_persons, person_marginals),
+ household=stats.marginalize(df_households, household_marginals),
)
diff --git a/analysis/reference/income.py b/analysis/reference/income.py
index 5b4e068d..c75e184a 100644
--- a/analysis/reference/income.py
+++ b/analysis/reference/income.py
@@ -4,11 +4,13 @@
import data.hts.egt.cleaned
import data.hts.entd.cleaned
+
def configure(context):
context.stage("data.hts.entd.cleaned")
context.stage("data.hts.egt.cleaned")
context.stage("data.income.region")
+
def calculate_cdf(df):
weights = df["household_weight"].values
incomes = df["income"].values
@@ -16,33 +18,53 @@ def calculate_cdf(df):
sorter = np.argsort(incomes)
cdf = np.cumsum(weights[sorter]) / np.sum(weights)
- return dict(income = incomes[sorter], cdf = cdf)
+ return dict(income=incomes[sorter], cdf=cdf)
+
def execute(context):
# Calculate ENTD income distribution
- df_entd = context.stage("data.hts.entd.cleaned")[0][["household_weight", "income_class", "consumption_units"]].copy()
+ df_entd = context.stage("data.hts.entd.cleaned")[0][
+ ["household_weight", "income_class", "consumption_units"]
+ ].copy()
entd_upper_bounds = data.hts.entd.cleaned.INCOME_CLASS_BOUNDS
entd_lower_bounds = [0] + entd_upper_bounds[:-1]
- df_entd["income"] = 12 * 0.5 * df_entd["income_class"].apply(lambda k: entd_lower_bounds[k] + entd_upper_bounds[k] if k >= 0 else np.nan)
+ df_entd["income"] = (
+ 12
+ * 0.5
+ * df_entd["income_class"].apply(
+ lambda k: entd_lower_bounds[k] + entd_upper_bounds[k] if k >= 0 else np.nan
+ )
+ )
df_entd = pd.DataFrame(calculate_cdf(df_entd))
df_entd["source"] = "entd"
# Calculate EGT income distribution
- df_egt = context.stage("data.hts.egt.cleaned")[0][["household_weight", "income_class", "consumption_units"]].copy()
+ df_egt = context.stage("data.hts.egt.cleaned")[0][
+ ["household_weight", "income_class", "consumption_units"]
+ ].copy()
egt_upper_bounds = data.hts.egt.cleaned.INCOME_CLASS_BOUNDS
egt_lower_bounds = [0] + egt_upper_bounds[:-1]
- df_egt["income"] = 12 * 0.5 * df_egt["income_class"].apply(lambda k: egt_lower_bounds[k] + egt_upper_bounds[k] if k >= 0 else np.nan)
+ df_egt["income"] = (
+ 12
+ * 0.5
+ * df_egt["income_class"].apply(
+ lambda k: egt_lower_bounds[k] + egt_upper_bounds[k] if k >= 0 else np.nan
+ )
+ )
df_egt["income"] /= df_egt["consumption_units"]
df_egt = pd.DataFrame(calculate_cdf(df_egt))
df_egt["source"] = "egt"
# Calcultae FiLo income distribution
df_filo = context.stage("data.income.region")
- df_filo = pd.DataFrame(dict(
- income = np.array([0.0] + df_filo.tolist()), cdf = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])
- ))
+ df_filo = pd.DataFrame(
+ dict(
+ income=np.array([0.0] + df_filo.tolist()),
+ cdf=np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]),
+ )
+ )
df_filo["source"] = "filo"
return pd.concat([df_entd, df_egt, df_filo])
diff --git a/analysis/reference/od/commute_distance.py b/analysis/reference/od/commute_distance.py
index 14a04eae..9bd3b13f 100644
--- a/analysis/reference/od/commute_distance.py
+++ b/analysis/reference/od/commute_distance.py
@@ -1,19 +1,21 @@
import pandas as pd
import numpy as np
+
def configure(context):
context.stage("data.od.cleaned")
context.stage("data.spatial.centroid_distances")
+
def execute(context):
df_distances = context.stage("data.spatial.centroid_distances")
result = {}
for df_data, name in zip(context.stage("data.od.cleaned"), ("work", "education")):
- df_data = pd.merge(df_data, df_distances, on = ["origin_id", "destination_id"])
+ df_data = pd.merge(df_data, df_distances, on=["origin_id", "destination_id"])
df_data = df_data[["centroid_distance", "weight"]]
- df_data = df_data.sort_values(by = "centroid_distance")
+ df_data = df_data.sort_values(by="centroid_distance")
df_data["cdf"] = np.cumsum(df_data["weight"])
df_data["cdf"] /= df_data["cdf"].max()
df_data = df_data[["centroid_distance", "cdf"]]
diff --git a/analysis/reference/od/commute_flow.py b/analysis/reference/od/commute_flow.py
index 0a693a0f..1a8b1981 100644
--- a/analysis/reference/od/commute_flow.py
+++ b/analysis/reference/od/commute_flow.py
@@ -1,36 +1,56 @@
import pandas as pd
+
def configure(context):
context.stage("data.od.cleaned")
context.stage("data.spatial.municipalities")
+
def execute(context):
- df_codes = context.stage("data.spatial.municipalities")[[
- "commune_id", "departement_id"
- ]]
+ df_codes = context.stage("data.spatial.municipalities")[
+ ["commune_id", "departement_id"]
+ ]
result = {}
for df_data, name in zip(context.stage("data.od.cleaned"), ("work", "education")):
df_data["origin_id"] = df_data["origin_id"].cat.remove_unused_categories()
- df_data["destination_id"] = df_data["destination_id"].cat.remove_unused_categories()
-
- df_data = pd.merge(df_data, df_codes.rename(columns = {
- "commune_id": "origin_id",
- "departement_id": "origin_departement_id"
- }), how = "left", on = "origin_id")
-
- df_data = pd.merge(df_data, df_codes.rename(columns = {
- "commune_id": "destination_id",
- "departement_id": "destination_departement_id"
- }), how = "left", on = "destination_id")
-
- df_data = df_data[[
- "origin_departement_id", "destination_departement_id", "weight"
- ]].rename(columns = {
- "origin_departement_id": "home",
- "destination_departement_id": name
- })
+ df_data["destination_id"] = df_data[
+ "destination_id"
+ ].cat.remove_unused_categories()
+
+ df_data = pd.merge(
+ df_data,
+ df_codes.rename(
+ columns={
+ "commune_id": "origin_id",
+ "departement_id": "origin_departement_id",
+ }
+ ),
+ how="left",
+ on="origin_id",
+ )
+
+ df_data = pd.merge(
+ df_data,
+ df_codes.rename(
+ columns={
+ "commune_id": "destination_id",
+ "departement_id": "destination_departement_id",
+ }
+ ),
+ how="left",
+ on="destination_id",
+ )
+
+ df_data = df_data[
+ ["origin_departement_id", "destination_departement_id", "weight"]
+ ].rename(
+ columns={
+ "origin_departement_id": "home",
+ "destination_departement_id": name,
+ }
+ )
df_data["home"] = df_data["home"].cat.remove_unused_categories()
df_data[name] = df_data[name].cat.remove_unused_categories()
diff --git a/analysis/statistics.py b/analysis/statistics.py
index 498b9bff..1f577e42 100644
--- a/analysis/statistics.py
+++ b/analysis/statistics.py
@@ -4,11 +4,13 @@
import numpy as np
import pandas as pd
-@numba.jit(nopython = True, parallel = True)
+
+@numba.jit(nopython=True, parallel=True)
def _combine_filter(filters):
return np.logical_and.reduce(filters)
-def marginalize(df, marginals, weight_column = "weight", count_column = "weight"):
+
+def marginalize(df, marginals, weight_column="weight", count_column="weight"):
"""
This function takes a data frame and a list of marginals in the form
@@ -58,16 +60,26 @@ def marginalize(df, marginals, weight_column = "weight", count_column = "weight"
results = {}
for columns in marginals:
- if len(columns) == 0: # Total is requested
+ if len(columns) == 0: # Total is requested
total = len(df) if weight_column is None else df[weight_column].sum()
- results[columns] = pd.DataFrame.from_records([["value", total]], columns = ["total", count_column])
+ results[columns] = pd.DataFrame.from_records(
+ [["value", total]], columns=["total", count_column]
+ )
else:
marginal_records = []
- value_index_lists = [np.arange(len(unique_values[column])) for column in columns]
+ value_index_lists = [
+ np.arange(len(unique_values[column])) for column in columns
+ ]
for value_indices in itertools.product(*value_index_lists):
- marginal_values = [unique_values[column][value_index] for column, value_index in zip(columns, value_indices)]
- marginal_filters = [filters[column][value_index] for column, value_index in zip(columns, value_indices)]
+ marginal_values = [
+ unique_values[column][value_index]
+ for column, value_index in zip(columns, value_indices)
+ ]
+ marginal_filters = [
+ filters[column][value_index]
+ for column, value_index in zip(columns, value_indices)
+ ]
f = np.logical_and.reduce(marginal_filters)
if weight_column is None:
@@ -77,18 +89,19 @@ def marginalize(df, marginals, weight_column = "weight", count_column = "weight"
marginal_records.append(marginal_values + [marginal_count])
- marginal_records = pd.DataFrame.from_records(marginal_records, columns = list(columns) + [count_column])
+ marginal_records = pd.DataFrame.from_records(
+ marginal_records, columns=list(columns) + [count_column]
+ )
results[columns] = marginal_records
return results
+
def apply_per_marginal(marginals, f):
- return {
- marginal: f(df)
- for marginal, df in marginals.items()
- }
+ return {marginal: f(df) for marginal, df in marginals.items()}
+
-def collect_sample(dfs, column = "realization"):
+def collect_sample(dfs, column="realization"):
"""
This function combines multiple structurally equal data frames into one
by adding an additional column denoting the number of the realization.
@@ -108,7 +121,8 @@ def collect_sample(dfs, column = "realization"):
return pd.concat(new_dfs)
-def combine_marginals(realizations, column = "realization"):
+
+def combine_marginals(realizations, column="realization"):
"""
This function combines multiple realizations of the "marginalize" output into
a new data structure that is equivalent to the one of "marginalize", but with
@@ -117,7 +131,9 @@ def combine_marginals(realizations, column = "realization"):
assert len(realizations) > 0
marginals = realizations[0].keys()
- marginal_columns = { marginal: list(realizations[0][marginal].columns) for marginal in marginals }
+ marginal_columns = {
+ marginal: list(realizations[0][marginal].columns) for marginal in marginals
+ }
# Check that all realizations have the same structure as the first
for realization in realizations:
@@ -130,21 +146,33 @@ def combine_marginals(realizations, column = "realization"):
sample = {}
for marginal in marginals:
- sample[marginal] = collect_sample([realization[marginal] for realization in realizations], column)
+ sample[marginal] = collect_sample(
+ [realization[marginal] for realization in realizations], column
+ )
return sample
-def bootstrap(df, bootstrap_size, random, realization_column = "realization", bootstrap_sample_size = None):
+
+def bootstrap(
+ df,
+ bootstrap_size,
+ random,
+ realization_column="realization",
+ bootstrap_sample_size=None,
+):
unique_realizations = np.unique(df[realization_column])
realizations = df[realization_column].values
- indices = [list(np.where(realizations == realization)[0]) for realization in unique_realizations]
+ indices = [
+ list(np.where(realizations == realization)[0])
+ for realization in unique_realizations
+ ]
lengths = [len(i) for i in indices]
if bootstrap_sample_size is None:
bootstrap_sample_size = len(indices)
- counts = random.randint(len(indices), size = (bootstrap_size, bootstrap_sample_size))
+ counts = random.randint(len(indices), size=(bootstrap_size, bootstrap_sample_size))
for selection in counts:
selection_indices = []
@@ -159,17 +187,23 @@ def bootstrap(df, bootstrap_size, random, realization_column = "realization", bo
yield df_sample
-def apply_bootstrap(df, bootstrap_size, random, f, realization_column = "realization"):
+
+def apply_bootstrap(df, bootstrap_size, random, f, realization_column="realization"):
df_bootstrap = []
- for bootstrap_realization, df_sample in enumerate(bootstrap(df, bootstrap_size, random, realization_column)):
+ for bootstrap_realization, df_sample in enumerate(
+ bootstrap(df, bootstrap_size, random, realization_column)
+ ):
df_sample = f(df_sample)
df_sample[realization_column] = bootstrap_realization
df_bootstrap.append(df_sample)
return pd.concat(df_bootstrap)
-def analyze_sample(df, realization_column = "realization", columns = ["weight"], statistics = None):
+
+def analyze_sample(
+ df, realization_column="realization", columns=["weight"], statistics=None
+):
assert realization_column in df
if columns is None or len(columns) == 0:
@@ -183,128 +217,171 @@ def analyze_sample(df, realization_column = "realization", columns = ["weight"],
assert column in df.columns
group_columns = list(df.columns)
- for column in columns: group_columns.remove(column)
+ for column in columns:
+ group_columns.remove(column)
group_columns.remove(realization_column)
if statistics is None:
statistics = {
column: [
- ("mean", "mean"), ("median", "median"), ("min", "min"), ("max", "max"),
- ("q10", lambda x: x.quantile(0.1)), ("q90", lambda x: x.quantile(0.9)),
- ("q5", lambda x: x.quantile(0.05)), ("q95", lambda x: x.quantile(0.95))
+ ("mean", "mean"),
+ ("median", "median"),
+ ("min", "min"),
+ ("max", "max"),
+ ("q10", lambda x: x.quantile(0.1)),
+ ("q90", lambda x: x.quantile(0.9)),
+ ("q5", lambda x: x.quantile(0.05)),
+ ("q95", lambda x: x.quantile(0.95)),
]
for column in columns
}
- df = df[group_columns + columns].groupby(group_columns).aggregate(statistics).reset_index()
+ df = (
+ df[group_columns + columns]
+ .groupby(group_columns)
+ .aggregate(statistics)
+ .reset_index()
+ )
return df
-def analyze_sample_and_flatten(df, realization_column = "realization", columns = ["weight"], statistics = None):
+
+def analyze_sample_and_flatten(
+ df, realization_column="realization", columns=["weight"], statistics=None
+):
df = analyze_sample(df, realization_column, columns, statistics)
df.columns = [c[1] if c[0] == "weight" else c[0] for c in df.columns]
return df
-def sample_subsets(df, subset_size, random, realization_column = "realization"):
+
+def sample_subsets(df, subset_size, random, realization_column="realization"):
realizations = len(np.unique(df[realization_column]))
return bootstrap(df, realizations, random, realization_column, subset_size)
-def average_subsets(df, subset_size, random, realization_column = "realization", weight_column = "weight"):
+
+def average_subsets(
+ df, subset_size, random, realization_column="realization", weight_column="weight"
+):
df_output = []
- for realization, df_subset in enumerate(sample_subsets(df, subset_size, random, realization_column)):
- df_subset = analyze_sample(df_subset, realization_column, weight_column, [("weight", "mean")])
+ for realization, df_subset in enumerate(
+ sample_subsets(df, subset_size, random, realization_column)
+ ):
+ df_subset = analyze_sample(
+ df_subset, realization_column, weight_column, [("weight", "mean")]
+ )
df_subset[realization_column] = realization
df_output.append(df_subset)
return pd.concat(df_output)
+
if __name__ == "__main__":
+
def create_sample(random_seed):
random = np.random.RandomState(random_seed)
index = np.arange(100)
- ages = random.randint(10, size = 100) * 10
- gender = random.randint(2, size = 100)
+ ages = random.randint(10, size=100) * 10
+ gender = random.randint(2, size=100)
- df = pd.DataFrame.from_records(zip(index, ages, gender), columns = ["person", "age", "gender"])
- df["gender"] = df["gender"].map({ 0: "male", 1: "female" }).astype("category")
+ df = pd.DataFrame.from_records(
+ zip(index, ages, gender), columns=["person", "age", "gender"]
+ )
+ df["gender"] = df["gender"].map({0: "male", 1: "female"}).astype("category")
df["weight"] = 1.0
return df
- df = pd.DataFrame.from_records([
- { "age": 20, "weight": 10.0, "abc": 10.0, "realization": 0 },
- { "age": 50, "weight": 50.0, "abc": 50.0, "realization": 0 },
- { "age": 20, "weight": 20.0, "abc": 20.0, "realization": 1 },
- { "age": 50, "weight": 60.0, "abc": 60.0, "realization": 1 },
- ])
+ df = pd.DataFrame.from_records(
+ [
+ {"age": 20, "weight": 10.0, "abc": 10.0, "realization": 0},
+ {"age": 50, "weight": 50.0, "abc": 50.0, "realization": 0},
+ {"age": 20, "weight": 20.0, "abc": 20.0, "realization": 1},
+ {"age": 50, "weight": 60.0, "abc": 60.0, "realization": 1},
+ ]
+ )
random = np.random.RandomState(0)
statistics = {
"weight": [("mean", "mean")],
- "abc": [("q95", lambda x: x.quantile(0.95))]
+ "abc": [("q95", lambda x: x.quantile(0.95))],
}
- df = apply_bootstrap(df, 100, random, lambda df: analyze_sample(df, statistics = statistics, columns = ["weight", "abc"]))
-
- df = df.groupby("age").aggregate([
- ("mean", "mean"),
- ("q10", lambda x: x.quantile(0.1)),
- ("q90", lambda x: x.quantile(0.9))
- ]).reset_index()
+ df = apply_bootstrap(
+ df,
+ 100,
+ random,
+ lambda df: analyze_sample(df, statistics=statistics, columns=["weight", "abc"]),
+ )
+
+ df = (
+ df.groupby("age")
+ .aggregate(
+ [
+ ("mean", "mean"),
+ ("q10", lambda x: x.quantile(0.1)),
+ ("q90", lambda x: x.quantile(0.9)),
+ ]
+ )
+ .reset_index()
+ )
print(df)
-
-
-
exit()
random = np.random.RandomState(0)
- #for df_subset in sample_subsets(df, 3, random):
+ # for df_subset in sample_subsets(df, 3, random):
# print(df_subset)
print(average_subsets(df, 3, random))
- print(apply_bootstrap(average_subsets(df, 3, random), 100, random, lambda df: analyze_sample(df)))
+ print(
+ apply_bootstrap(
+ average_subsets(df, 3, random), 100, random, lambda df: analyze_sample(df)
+ )
+ )
exit()
- #print(analyze(df))
+ # print(analyze(df))
- #for df_sample in bootstrap(df, 100, random):
+ # for df_sample in bootstrap(df, 100, random):
# df_sample = analyze(df_sample)
# print(df_sample)
- statistics = [
- ("precision", lambda x: np.mean(x < 55.0))
- ]
-
- df = apply_bootstrap(df, 100, random, lambda df: analyze_sample(df, statistics = statistics))
- df = df.groupby(["age"]).aggregate([
- ("mean", "mean"),
- ("q10", lambda x: x.quantile(0.1)),
- ("q90", lambda x: x.quantile(0.9))
- ]).reset_index()
-
-
+ statistics = [("precision", lambda x: np.mean(x < 55.0))]
+
+ df = apply_bootstrap(
+ df, 100, random, lambda df: analyze_sample(df, statistics=statistics)
+ )
+ df = (
+ df.groupby(["age"])
+ .aggregate(
+ [
+ ("mean", "mean"),
+ ("q10", lambda x: x.quantile(0.1)),
+ ("q90", lambda x: x.quantile(0.9)),
+ ]
+ )
+ .reset_index()
+ )
print(df)
exit()
print()
-
exit()
sample = [create_sample(R) for R in range(2)]
random = np.random.RandomState(5)
- #marginals = [marginalize(df, [("age",), ("gender",), ("age", "gender"), tuple()]) for df in sample]
+ # marginals = [marginalize(df, [("age",), ("gender",), ("age", "gender"), tuple()]) for df in sample]
marginals = [marginalize(df, [("gender",)]) for df in sample]
marginals = collect_marginalized_sample(marginals)
- metrics = bootstrap_sampled_marginals(marginals, 100, subset_size = 2, random = random)
+ metrics = bootstrap_sampled_marginals(marginals, 100, subset_size=2, random=random)
print(metrics[("gender",)])
diff --git a/analysis/synthesis/commute_distance.py b/analysis/synthesis/commute_distance.py
index b8a83a8e..ec9f8946 100644
--- a/analysis/synthesis/commute_distance.py
+++ b/analysis/synthesis/commute_distance.py
@@ -6,30 +6,52 @@
import analysis.statistics as stats
import analysis.marginals as marginals
+
def configure(context):
acquisition_sample_size = context.config("acquisition_sample_size")
- bs.configure(context, "synthesis.population.spatial.home.locations", acquisition_sample_size)
- bs.configure(context, "synthesis.population.spatial.primary.locations", acquisition_sample_size)
+ bs.configure(
+ context, "synthesis.population.spatial.home.locations", acquisition_sample_size
+ )
+ bs.configure(
+ context,
+ "synthesis.population.spatial.primary.locations",
+ acquisition_sample_size,
+ )
bs.configure(context, "synthesis.population.sampled", acquisition_sample_size)
+
def execute(context):
acquisition_sample_size = context.config("acquisition_sample_size")
feeder = zip(
- bs.get_stages(context, "synthesis.population.spatial.home.locations", acquisition_sample_size),
- bs.get_stages(context, "synthesis.population.spatial.primary.locations", acquisition_sample_size),
+ bs.get_stages(
+ context,
+ "synthesis.population.spatial.home.locations",
+ acquisition_sample_size,
+ ),
+ bs.get_stages(
+ context,
+ "synthesis.population.spatial.primary.locations",
+ acquisition_sample_size,
+ ),
bs.get_stages(context, "synthesis.population.sampled", acquisition_sample_size),
)
probabilities = np.linspace(0.0, 1.0, 20)
- quantiles = { "work": [], "education": [] }
+ quantiles = {"work": [], "education": []}
- with context.progress(label = "Processing commute data ...", total = acquisition_sample_size) as progress:
+ with context.progress(
+ label="Processing commute data ...", total=acquisition_sample_size
+ ) as progress:
for df_home, df_spatial, df_persons in feeder:
# Prepare home
- df_home = pd.merge(df_home, df_persons[["person_id", "household_id"]], on = "household_id")
- df_home = df_home[["person_id", "geometry"]].set_index("person_id").sort_index()
+ df_home = pd.merge(
+ df_home, df_persons[["person_id", "household_id"]], on="household_id"
+ )
+ df_home = (
+ df_home[["person_id", "geometry"]].set_index("person_id").sort_index()
+ )
assert len(df_home) == len(df_persons)
for index, name in enumerate(("work", "education")):
@@ -40,12 +62,11 @@ def execute(context):
df_compare = df_home.loc[df_destination.index]
assert len(df_destination) == len(df_compare)
- distances = df_destination["geometry"].distance(df_compare["geometry"]) * 1e-3
+ distances = (
+ df_destination["geometry"].distance(df_compare["geometry"]) * 1e-3
+ )
- quantiles[name].append([
- distances.quantile(p)
- for p in probabilities
- ])
+ quantiles[name].append([distances.quantile(p) for p in probabilities])
progress.update()
@@ -54,11 +75,11 @@ def execute(context):
for name in ("work", "education"):
data = np.array(quantiles[name])
- mean = np.mean(data, axis = 0)
- min = np.min(data, axis = 0)
- max = np.max(data, axis = 0)
+ mean = np.mean(data, axis=0)
+ min = np.min(data, axis=0)
+ max = np.max(data, axis=0)
- df = pd.DataFrame(dict(mean = mean, min = min, max = max, cdf = probabilities))
+ df = pd.DataFrame(dict(mean=mean, min=min, max=max, cdf=probabilities))
result[name] = df
return result
diff --git a/analysis/synthesis/commute_flow.py b/analysis/synthesis/commute_flow.py
index 82119898..c96cd61b 100644
--- a/analysis/synthesis/commute_flow.py
+++ b/analysis/synthesis/commute_flow.py
@@ -5,56 +5,94 @@
import analysis.statistics as stats
import analysis.marginals as marginals
+
def configure(context):
acquisition_sample_size = context.config("acquisition_sample_size")
- bs.configure(context, "synthesis.population.spatial.home.zones", acquisition_sample_size)
- bs.configure(context, "synthesis.population.spatial.primary.locations", acquisition_sample_size)
+ bs.configure(
+ context, "synthesis.population.spatial.home.zones", acquisition_sample_size
+ )
+ bs.configure(
+ context,
+ "synthesis.population.spatial.primary.locations",
+ acquisition_sample_size,
+ )
bs.configure(context, "synthesis.population.sampled", acquisition_sample_size)
context.stage("data.spatial.municipalities")
+
def execute(context):
- df_codes = context.stage("data.spatial.municipalities")[[
- "commune_id", "departement_id"
- ]]
+ df_codes = context.stage("data.spatial.municipalities")[
+ ["commune_id", "departement_id"]
+ ]
acquisition_sample_size = context.config("acquisition_sample_size")
feeder = zip(
- bs.get_stages(context, "synthesis.population.spatial.home.zones", acquisition_sample_size),
- bs.get_stages(context, "synthesis.population.spatial.primary.locations", acquisition_sample_size),
+ bs.get_stages(
+ context, "synthesis.population.spatial.home.zones", acquisition_sample_size
+ ),
+ bs.get_stages(
+ context,
+ "synthesis.population.spatial.primary.locations",
+ acquisition_sample_size,
+ ),
bs.get_stages(context, "synthesis.population.sampled", acquisition_sample_size),
)
work_flows = []
education_flows = []
- with context.progress(label = "Processing commute data ...", total = acquisition_sample_size) as progress:
+ with context.progress(
+ label="Processing commute data ...", total=acquisition_sample_size
+ ) as progress:
for realization, (df_home, df_spatial, df_persons) in enumerate(feeder):
# Prepare home
- df_home = pd.merge(df_persons[["person_id", "household_id"]], df_home, on = "household_id")
- df_home = df_home[["person_id", "departement_id"]].rename(columns = { "departement_id": "home" })
+ df_home = pd.merge(
+ df_persons[["person_id", "household_id"]], df_home, on="household_id"
+ )
+ df_home = df_home[["person_id", "departement_id"]].rename(
+ columns={"departement_id": "home"}
+ )
# Prepare work
df_work = df_spatial[0]
- df_work = pd.merge(df_work, df_codes, how = "left", on = "commune_id")
- df_work["departement_id"] = df_work["departement_id"].cat.remove_unused_categories()
- df_work = df_work[["person_id", "departement_id"]].rename(columns = { "departement_id": "work" })
+ df_work = pd.merge(df_work, df_codes, how="left", on="commune_id")
+ df_work["departement_id"] = df_work[
+ "departement_id"
+ ].cat.remove_unused_categories()
+ df_work = df_work[["person_id", "departement_id"]].rename(
+ columns={"departement_id": "work"}
+ )
# Calculate work
- df_work = pd.merge(df_home, df_work, on = "person_id").groupby(["home", "work"]).size().reset_index(name = "weight")
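+            # Count commuters per (home, work) department pair for this realization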
+ df_work = (
+ pd.merge(df_home, df_work, on="person_id")
+ .groupby(["home", "work"])
+ .size()
+ .reset_index(name="weight")
+ )
df_work["realization"] = realization
work_flows.append(df_work)
            # Prepare education
df_education = df_spatial[1]
- df_education = pd.merge(df_education, df_codes, how = "left", on = "commune_id")
- df_education["departement_id"] = df_education["departement_id"].cat.remove_unused_categories()
- df_education = df_education[["person_id", "departement_id"]].rename(columns = { "departement_id": "education" })
+ df_education = pd.merge(df_education, df_codes, how="left", on="commune_id")
+ df_education["departement_id"] = df_education[
+ "departement_id"
+ ].cat.remove_unused_categories()
+ df_education = df_education[["person_id", "departement_id"]].rename(
+ columns={"departement_id": "education"}
+ )
# Calculate education
- df_education = pd.merge(df_home, df_education, on = "person_id").groupby(["home", "education"]).size().reset_index(name = "weight")
+ df_education = (
+ pd.merge(df_home, df_education, on="person_id")
+ .groupby(["home", "education"])
+ .size()
+ .reset_index(name="weight")
+ )
df_education["realization"] = realization
education_flows.append(df_education)
@@ -66,4 +104,4 @@ def execute(context):
df_work = stats.analyze_sample_and_flatten(df_work)
df_education = stats.analyze_sample_and_flatten(df_education)
- return dict(work = df_work, education = df_education)
+ return dict(work=df_work, education=df_education)
diff --git a/analysis/synthesis/income.py b/analysis/synthesis/income.py
index f37131e4..1a49af17 100644
--- a/analysis/synthesis/income.py
+++ b/analysis/synthesis/income.py
@@ -6,9 +6,13 @@
import analysis.statistics as stats
import analysis.marginals as marginals
+
def configure(context):
acquisition_sample_size = context.config("acquisition_sample_size")
- bs.configure(context, "synthesis.population.income.selected", acquisition_sample_size)
+ bs.configure(
+ context, "synthesis.population.income.selected", acquisition_sample_size
+ )
+
def execute(context):
acquisition_sample_size = context.config("acquisition_sample_size")
@@ -16,16 +20,20 @@ def execute(context):
probabilities = np.linspace(0.0, 1.0, 20)
quantiles = []
- with context.progress(label = "Processing commute data ...", total = acquisition_sample_size) as progress:
- for df_income in bs.get_stages(context, "synthesis.population.income.selected", acquisition_sample_size):
+ with context.progress(
+ label="Processing commute data ...", total=acquisition_sample_size
+ ) as progress:
+ for df_income in bs.get_stages(
+ context, "synthesis.population.income.selected", acquisition_sample_size
+ ):
income = 12 * df_income["household_income"] / df_income["consumption_units"]
quantiles.append([income.quantile(p) for p in probabilities])
progress.update()
quantiles = np.array(quantiles)
- mean = np.mean(quantiles, axis = 0)
- min = np.min(quantiles, axis = 0)
- max = np.max(quantiles, axis = 0)
+ mean = np.mean(quantiles, axis=0)
+ min = np.min(quantiles, axis=0)
+ max = np.max(quantiles, axis=0)
- return pd.DataFrame(dict(mean = mean, min = min, max = max, cdf = probabilities))
+ return pd.DataFrame(dict(mean=mean, min=min, max=max, cdf=probabilities))
diff --git a/analysis/synthesis/matching.py b/analysis/synthesis/matching.py
index 1c66c14b..f3b33884 100644
--- a/analysis/synthesis/matching.py
+++ b/analysis/synthesis/matching.py
@@ -2,15 +2,21 @@
import analysis.statistics as stats
import analysis.marginals as marginals
+
def configure(context):
acquisition_sample_size = context.config("acquisition_sample_size")
random_seeds = (np.arange(acquisition_sample_size) * 1000 + 1000).astype(int)
for index, random_seed in enumerate(random_seeds):
- context.stage("synthesis.population.matched", {
- "random_seed": int(random_seed),
- "sampling_rate": context.config("sampling_rate")
- }, alias = "seed_%d" % index)
+ context.stage(
+ "synthesis.population.matched",
+ {
+ "random_seed": int(random_seed),
+ "sampling_rate": context.config("sampling_rate"),
+ },
+ alias="seed_%d" % index,
+ )
+
def execute(context):
acquisition_sample_size = context.config("acquisition_sample_size")
@@ -26,6 +32,6 @@ def execute(context):
aggregated[key].append(value)
- aggregated = { k: np.array(v) for k, v in aggregated.items() }
+ aggregated = {k: np.array(v) for k, v in aggregated.items()}
return aggregated
diff --git a/analysis/synthesis/mode_distances.py b/analysis/synthesis/mode_distances.py
index eea6b36d..e5f6a686 100644
--- a/analysis/synthesis/mode_distances.py
+++ b/analysis/synthesis/mode_distances.py
@@ -5,50 +5,80 @@
import analysis.statistics as stats
import analysis.marginals as marginals
+
def configure(context):
acquisition_sample_size = context.config("acquisition_sample_size")
- bs.configure(context, "synthesis.population.spatial.locations", acquisition_sample_size)
+ bs.configure(
+ context, "synthesis.population.spatial.locations", acquisition_sample_size
+ )
bs.configure(context, "synthesis.population.trips", acquisition_sample_size)
+
def execute(context):
acquisition_sample_size = context.config("acquisition_sample_size")
probabilities = np.linspace(0.0, 1.0, 20)
modes = ["car", "car_passenger", "pt", "bike", "walk"]
- quantiles = { mode : [] for mode in modes }
+ quantiles = {mode: [] for mode in modes}
generator = zip(
- bs.get_stages(context, "synthesis.population.spatial.locations", acquisition_sample_size),
- bs.get_stages(context, "synthesis.population.trips", acquisition_sample_size)
+ bs.get_stages(
+ context, "synthesis.population.spatial.locations", acquisition_sample_size
+ ),
+ bs.get_stages(context, "synthesis.population.trips", acquisition_sample_size),
)
- with context.progress(label = "Processing distance data ...", total = acquisition_sample_size) as progress:
+ with context.progress(
+ label="Processing distance data ...", total=acquisition_sample_size
+ ) as progress:
for df_locations, df_trips in generator:
# Load locations and calculate euclidean distances
- df_locations = df_locations[["person_id", "activity_index", "geometry"]].rename(columns = { "activity_index": "trip_index" })
- df_locations["euclidean_distance"] = df_locations["geometry"].distance(df_locations["geometry"].shift(-1))
+ df_locations = df_locations[
+ ["person_id", "activity_index", "geometry"]
+ ].rename(columns={"activity_index": "trip_index"})
+ df_locations["euclidean_distance"] = df_locations["geometry"].distance(
+ df_locations["geometry"].shift(-1)
+ )
# Merge mode into distances
df_trips = pd.merge(
- df_trips[["person_id", "trip_index", "mode", "preceding_purpose", "following_purpose", "departure_time", "arrival_time"]],
- df_locations, on = ["person_id", "trip_index"], how = "inner"
+ df_trips[
+ [
+ "person_id",
+ "trip_index",
+ "mode",
+ "preceding_purpose",
+ "following_purpose",
+ "departure_time",
+ "arrival_time",
+ ]
+ ],
+ df_locations,
+ on=["person_id", "trip_index"],
+ how="inner",
+ )
+ df_trips["travel_time"] = (
+ df_trips["arrival_time"] - df_trips["departure_time"]
)
- df_trips["travel_time"] = df_trips["arrival_time"] - df_trips["departure_time"]
# Filter trips
primary_activities = ["home", "work", "education"]
- #primary_activities = []
- df_trips = df_trips[~(
- df_trips["preceding_purpose"].isin(primary_activities) &
- df_trips["following_purpose"].isin(primary_activities)
- )]
+ # primary_activities = []
+ df_trips = df_trips[
+ ~(
+ df_trips["preceding_purpose"].isin(primary_activities)
+ & df_trips["following_purpose"].isin(primary_activities)
+ )
+ ]
# Calculate quantiles
for mode in modes:
df_mode = df_trips[df_trips["mode"] == mode]
- quantiles[mode].append([df_mode["euclidean_distance"].quantile(p) for p in probabilities])
+ quantiles[mode].append(
+ [df_mode["euclidean_distance"].quantile(p) for p in probabilities]
+ )
progress.update()
@@ -58,14 +88,16 @@ def execute(context):
df_data = []
for mode in modes:
- mean = np.mean(quantiles[mode], axis = 0)
- #min = np.percentile(quantiles[mode], 5, axis = 0)
- #max = np.percentile(quantiles[mode], 95, axis = 0)
+ mean = np.mean(quantiles[mode], axis=0)
+ # min = np.percentile(quantiles[mode], 5, axis = 0)
+ # max = np.percentile(quantiles[mode], 95, axis = 0)
- min = np.min(quantiles[mode], axis = 0)
- max = np.max(quantiles[mode], axis = 0)
+ min = np.min(quantiles[mode], axis=0)
+ max = np.max(quantiles[mode], axis=0)
- df_data.append(pd.DataFrame(dict(mean = mean, min = min, max = max, cdf = probabilities)))
+ df_data.append(
+ pd.DataFrame(dict(mean=mean, min=min, max=max, cdf=probabilities))
+ )
df_data[-1]["mode"] = mode
return pd.concat(df_data)
diff --git a/analysis/synthesis/sociodemographics/chains.py b/analysis/synthesis/sociodemographics/chains.py
index 3c90e9bc..c90c95c8 100644
--- a/analysis/synthesis/sociodemographics/chains.py
+++ b/analysis/synthesis/sociodemographics/chains.py
@@ -5,7 +5,13 @@
import analysis.statistics as stats
import analysis.marginals as marginals
-from analysis.chains import aggregate_chains, CHAIN_MARGINALS, CHAIN_LENGTH_LIMIT, CHAIN_TOP_K
+from analysis.chains import (
+ aggregate_chains,
+ CHAIN_MARGINALS,
+ CHAIN_LENGTH_LIMIT,
+ CHAIN_TOP_K,
+)
+
def configure(context):
acquisition_sample_size = context.config("acquisition_sample_size")
@@ -13,26 +19,41 @@ def configure(context):
bs.configure(context, "synthesis.population.sampled", acquisition_sample_size)
bs.configure(context, "synthesis.population.activities", acquisition_sample_size)
+
def execute_parallel(context, data):
acquisition_sample_size = context.config("acquisition_sample_size")
df_population, df_chains = data
- df_chains = df_chains[["person_id", "activity_index", "purpose"]].sort_values(by = ["person_id", "activity_index"])
+ df_chains = df_chains[["person_id", "activity_index", "purpose"]].sort_values(
+ by=["person_id", "activity_index"]
+ )
df_chains = aggregate_chains(df_chains)
marginals.prepare_classes(df_population)
- df_chains = pd.merge(df_population[["person_id", "age_class", "sex", "age"]], df_chains, on = "person_id")
- df_chains["chain_length_class"] = np.minimum(df_chains["chain_length"], CHAIN_LENGTH_LIMIT)
+ df_chains = pd.merge(
+ df_population[["person_id", "age_class", "sex", "age"]],
+ df_chains,
+ on="person_id",
+ )
+ df_chains["chain_length_class"] = np.minimum(
+ df_chains["chain_length"], CHAIN_LENGTH_LIMIT
+ )
- top_k_chains = df_chains.groupby("chain").size().reset_index(name = "weight").sort_values(
- by = "weight", ascending = False
- ).head(CHAIN_TOP_K)["chain"].values
+ top_k_chains = (
+ df_chains.groupby("chain")
+ .size()
+ .reset_index(name="weight")
+ .sort_values(by="weight", ascending=False)
+ .head(CHAIN_TOP_K)["chain"]
+ .values
+ )
df_chains = df_chains[df_chains["chain"].isin(top_k_chains)]
df_chains["age_range"] = (df_chains["age"] >= 18) & (df_chains["age"] <= 40)
context.progress.update()
- return stats.marginalize(df_chains, CHAIN_MARGINALS, weight_column = None)
+ return stats.marginalize(df_chains, CHAIN_MARGINALS, weight_column=None)
+
def execute(context):
acquisition_sample_size = context.config("acquisition_sample_size")
@@ -41,10 +62,14 @@ def execute(context):
feeder = zip(
bs.get_stages(context, "synthesis.population.sampled", acquisition_sample_size),
- bs.get_stages(context, "synthesis.population.activities", acquisition_sample_size)
+ bs.get_stages(
+ context, "synthesis.population.activities", acquisition_sample_size
+ ),
)
- with context.progress(label = "Marginalizing chain data ...", total = acquisition_sample_size):
+ with context.progress(
+ label="Marginalizing chain data ...", total=acquisition_sample_size
+ ):
with context.parallel() as parallel:
data = list(parallel.imap_unordered(execute_parallel, feeder))
diff --git a/analysis/synthesis/sociodemographics/general.py b/analysis/synthesis/sociodemographics/general.py
index c396231f..854e4360 100644
--- a/analysis/synthesis/sociodemographics/general.py
+++ b/analysis/synthesis/sociodemographics/general.py
@@ -2,26 +2,44 @@
import analysis.statistics as stats
import analysis.marginals as marginals
+
def configure(context):
acquisition_sample_size = context.config("acquisition_sample_size")
bs.configure(context, "synthesis.population.enriched", acquisition_sample_size)
+
def execute(context):
acquisition_sample_size = context.config("acquisition_sample_size")
person_marginals = []
household_marginals = []
- for df in bs.get_stages(context, "synthesis.population.enriched", acquisition_sample_size):
+ for df in bs.get_stages(
+ context, "synthesis.population.enriched", acquisition_sample_size
+ ):
marginals.prepare_classes(df)
- person_marginals.append(stats.marginalize(df, marginals.ANALYSIS_PERSON_MARGINALS, weight_column = None))
- household_marginals.append(stats.marginalize(df.drop_duplicates("household_id"), marginals.ANALYSIS_HOUSEHOLD_MARGINALS, weight_column = None))
+ person_marginals.append(
+ stats.marginalize(
+ df, marginals.ANALYSIS_PERSON_MARGINALS, weight_column=None
+ )
+ )
+ household_marginals.append(
+ stats.marginalize(
+ df.drop_duplicates("household_id"),
+ marginals.ANALYSIS_HOUSEHOLD_MARGINALS,
+ weight_column=None,
+ )
+ )
person_marginals = stats.combine_marginals(person_marginals)
household_marginals = stats.combine_marginals(household_marginals)
- person_marginals = stats.apply_per_marginal(person_marginals, stats.analyze_sample_and_flatten)
- household_marginals = stats.apply_per_marginal(household_marginals, stats.analyze_sample_and_flatten)
+ person_marginals = stats.apply_per_marginal(
+ person_marginals, stats.analyze_sample_and_flatten
+ )
+ household_marginals = stats.apply_per_marginal(
+ household_marginals, stats.analyze_sample_and_flatten
+ )
- return dict(person = person_marginals, household = household_marginals)
+ return dict(person=person_marginals, household=household_marginals)
diff --git a/analysis/synthesis/sociodemographics/spatial.py b/analysis/synthesis/sociodemographics/spatial.py
index baba7e07..3204eea8 100644
--- a/analysis/synthesis/sociodemographics/spatial.py
+++ b/analysis/synthesis/sociodemographics/spatial.py
@@ -4,11 +4,15 @@
import pandas as pd
+
def configure(context):
acquisition_sample_size = context.config("acquisition_sample_size")
bs.configure(context, "synthesis.population.enriched", acquisition_sample_size)
- bs.configure(context, "synthesis.population.spatial.home.zones", acquisition_sample_size)
+ bs.configure(
+ context, "synthesis.population.spatial.home.zones", acquisition_sample_size
+ )
+
def execute(context):
acquisition_sample_size = context.config("acquisition_sample_size")
@@ -17,21 +21,39 @@ def execute(context):
household_marginals = []
feeder = zip(
- bs.get_stages(context, "synthesis.population.enriched", acquisition_sample_size),
- bs.get_stages(context, "synthesis.population.spatial.home.zones", acquisition_sample_size)
+ bs.get_stages(
+ context, "synthesis.population.enriched", acquisition_sample_size
+ ),
+ bs.get_stages(
+ context, "synthesis.population.spatial.home.zones", acquisition_sample_size
+ ),
)
for df, df_home in feeder:
df = pd.merge(df, df_home[["household_id", "departement_id", "commune_id"]])
marginals.prepare_classes(df)
- person_marginals.append(stats.marginalize(df, marginals.SPATIAL_PERSON_MARGINALS, weight_column = None))
- household_marginals.append(stats.marginalize(df.drop_duplicates("household_id"), marginals.SPATIAL_HOUSEHOLD_MARGINALS, weight_column = None))
+ person_marginals.append(
+ stats.marginalize(
+ df, marginals.SPATIAL_PERSON_MARGINALS, weight_column=None
+ )
+ )
+ household_marginals.append(
+ stats.marginalize(
+ df.drop_duplicates("household_id"),
+ marginals.SPATIAL_HOUSEHOLD_MARGINALS,
+ weight_column=None,
+ )
+ )
person_marginals = stats.combine_marginals(person_marginals)
household_marginals = stats.combine_marginals(household_marginals)
- person_marginals = stats.apply_per_marginal(person_marginals, stats.analyze_sample_and_flatten)
- household_marginals = stats.apply_per_marginal(household_marginals, stats.analyze_sample_and_flatten)
+ person_marginals = stats.apply_per_marginal(
+ person_marginals, stats.analyze_sample_and_flatten
+ )
+ household_marginals = stats.apply_per_marginal(
+ household_marginals, stats.analyze_sample_and_flatten
+ )
- return dict(person = person_marginals, household = household_marginals)
+ return dict(person=person_marginals, household=household_marginals)
diff --git a/analysis/synthesis/statistics/marginal.py b/analysis/synthesis/statistics/marginal.py
index 7e140d5c..8afcec37 100644
--- a/analysis/synthesis/statistics/marginal.py
+++ b/analysis/synthesis/statistics/marginal.py
@@ -5,17 +5,25 @@
import analysis.statistics as stats
MARGINALS = [
- ("age_class",), ("sex",), ("employed",), ("studies",),
- ("socioprofessional_class",), ("age_class", "employed")
+ ("age_class",),
+ ("sex",),
+ ("employed",),
+ ("studies",),
+ ("socioprofessional_class",),
+ ("age_class", "employed"),
]
+
def configure(context):
context.config("random_seed")
- context.stage("synthesis.population.sampled", dict(
- random_seed = context.config("random_seed")
- ), alias = "sample")
+ context.stage(
+ "synthesis.population.sampled",
+ dict(random_seed=context.config("random_seed")),
+ alias="sample",
+ )
+
def execute(context):
df = context.stage("sample")
marginals.prepare_classes(df)
- return stats.marginalize(df, MARGINALS, weight_column = None)
+ return stats.marginalize(df, MARGINALS, weight_column=None)
diff --git a/analysis/synthesis/statistics/monte_carlo.py b/analysis/synthesis/statistics/monte_carlo.py
index 23b9892d..84d0bd8c 100644
--- a/analysis/synthesis/statistics/monte_carlo.py
+++ b/analysis/synthesis/statistics/monte_carlo.py
@@ -11,23 +11,33 @@
from analysis.synthesis.statistics.marginal import MARGINALS
+
def configure(context):
context.stage("analysis.reference.census.sociodemographics")
for sampling_rate in SAMPLING_RATES:
- bt.configure(context, "analysis.synthesis.statistics.marginal", ACQUISITION_SAMPLE_SIZE, dict(
- sampling_rate = sampling_rate
- ), alias = "sample_%f" % sampling_rate)
+ bt.configure(
+ context,
+ "analysis.synthesis.statistics.marginal",
+ ACQUISITION_SAMPLE_SIZE,
+ dict(sampling_rate=sampling_rate),
+ alias="sample_%f" % sampling_rate,
+ )
+
STATISTICS = [
- ("mean", "mean"), ("q5", lambda x: x.quantile(0.05)), ("q95", lambda x: x.quantile(0.95))
+ ("mean", "mean"),
+ ("q5", lambda x: x.quantile(0.05)),
+ ("q95", lambda x: x.quantile(0.95)),
]
STATISTICS = {
- "weight": STATISTICS, "error": STATISTICS,
- "error_probability": [("mean", "mean")]
+ "weight": STATISTICS,
+ "error": STATISTICS,
+ "error_probability": [("mean", "mean")],
}
+
def process(context, k):
reference = context.data("reference")
partial_marginals = context.data("partial_marginals")
@@ -40,12 +50,23 @@ def process(context, k):
df_marginal = k_marginals[marginal]
df_reference = reference[marginal]
- df_marginal = pd.merge(df_marginal, df_reference.rename(columns = { "weight": "reference" }), on = marginal)
+ df_marginal = pd.merge(
+ df_marginal,
+ df_reference.rename(columns={"weight": "reference"}),
+ on=marginal,
+ )
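+        # Scale sampled weights to population level and compute the relative error against the census reference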
df_marginal["weight"] /= sampling_rate
df_marginal["error"] = df_marginal["weight"] / df_marginal["reference"] - 1
- df_marginal["error_probability"] = np.abs(df_marginal["error"]) <= ERROR_THRESHOLD
+ df_marginal["error_probability"] = (
+ np.abs(df_marginal["error"]) <= ERROR_THRESHOLD
+ )
- df = df_marginal[list(marginal) + ["weight", "error", "error_probability"]].groupby(list(marginal)).aggregate(STATISTICS).reset_index()
+ df = (
+ df_marginal[list(marginal) + ["weight", "error", "error_probability"]]
+ .groupby(list(marginal))
+ .aggregate(STATISTICS)
+ .reset_index()
+ )
df["samples"] = k
df["sampling_rate"] = sampling_rate
@@ -55,19 +76,36 @@ def process(context, k):
return output
+
def execute(context):
reference = context.stage("analysis.reference.census.sociodemographics")["person"]
- output = { marginal: [] for marginal in MARGINALS }
+ output = {marginal: [] for marginal in MARGINALS}
total = len(SAMPLING_RATES) * len(MARGINALS) * ACQUISITION_SAMPLE_SIZE
- with context.progress(label = "Running Monte Carlo analysis ...", total = total) as progress:
+ with context.progress(
+ label="Running Monte Carlo analysis ...", total=total
+ ) as progress:
for sampling_rate in SAMPLING_RATES:
- partial_marginals = list(bt.get_stages(context, "sample_%f" % sampling_rate, sample_size = ACQUISITION_SAMPLE_SIZE))
-
- with context.parallel(data = dict(partial_marginals = partial_marginals, reference = reference, sampling_rate = sampling_rate)) as parallel:
-
- for partial_output in parallel.imap_unordered(process, np.arange(1, ACQUISITION_SAMPLE_SIZE + 1)):
+ partial_marginals = list(
+ bt.get_stages(
+ context,
+ "sample_%f" % sampling_rate,
+ sample_size=ACQUISITION_SAMPLE_SIZE,
+ )
+ )
+
+ with context.parallel(
+ data=dict(
+ partial_marginals=partial_marginals,
+ reference=reference,
+ sampling_rate=sampling_rate,
+ )
+ ) as parallel:
+
+ for partial_output in parallel.imap_unordered(
+ process, np.arange(1, ACQUISITION_SAMPLE_SIZE + 1)
+ ):
for marginal in MARGINALS:
output[marginal].append(partial_output[marginal])
diff --git a/data/ban/raw.py b/data/ban/raw.py
index 764c6c8f..7f97064f 100644
--- a/data/ban/raw.py
+++ b/data/ban/raw.py
@@ -7,17 +7,16 @@
This stage loads the raw data from the new French address registry (BAN).
"""
+
def configure(context):
context.stage("data.spatial.codes")
context.config("data_path")
context.config("ban_path", "ban_idf")
-BAN_DTYPES = {
- "code_insee": str,
- "x": float,
- "y": float
-}
+
+BAN_DTYPES = {"code_insee": str, "x": float, "y": float}
+
def execute(context):
# Find relevant departments
@@ -27,12 +26,19 @@ def execute(context):
# Load BAN
df_ban = []
- for source_path in find_ban("{}/{}".format(context.config("data_path"), context.config("ban_path"))):
+ for source_path in find_ban(
+ "{}/{}".format(context.config("data_path"), context.config("ban_path"))
+ ):
print("Reading {} ...".format(source_path))
- df_partial = pd.read_csv(source_path,
- compression = "gzip", sep = ";", usecols = BAN_DTYPES.keys(), dtype = BAN_DTYPES)
-
+ df_partial = pd.read_csv(
+ source_path,
+ compression="gzip",
+ sep=";",
+ usecols=BAN_DTYPES.keys(),
+ dtype=BAN_DTYPES,
+ )
+
# Filter by departments
df_partial["department_id"] = df_partial["code_insee"].str[:2]
df_partial = df_partial[["department_id", "x", "y"]]
@@ -40,25 +46,30 @@ def execute(context):
if len(df_partial) > 0:
df_ban.append(df_partial)
-
+
df_ban = pd.concat(df_ban)
df_ban = gpd.GeoDataFrame(
- df_ban, geometry = gpd.points_from_xy(df_ban.x, df_ban.y), crs = "EPSG:2154")
-
+ df_ban, geometry=gpd.points_from_xy(df_ban.x, df_ban.y), crs="EPSG:2154"
+ )
+
# Check that we cover all requested departments at least once
for department_id in requested_departments:
assert np.count_nonzero(df_ban["department_id"] == department_id) > 0
return df_ban[["geometry"]]
+
def find_ban(path):
candidates = sorted(list(glob.glob("{}/*.csv.gz".format(path))))
if len(candidates) == 0:
raise RuntimeError("BAN data is not available in {}".format(path))
-
+
return candidates
+
def validate(context):
- paths = find_ban("{}/{}".format(context.config("data_path"), context.config("ban_path")))
+ paths = find_ban(
+ "{}/{}".format(context.config("data_path"), context.config("ban_path"))
+ )
return sum([os.path.getsize(path) for path in paths])
diff --git a/data/bdtopo/output.py b/data/bdtopo/output.py
index 214fd465..c1bb95c8 100644
--- a/data/bdtopo/output.py
+++ b/data/bdtopo/output.py
@@ -1,14 +1,17 @@
import geopandas as gpd
+
def configure(context):
context.config("output_path")
context.config("output_prefix", "ile_de_france_")
context.stage("data.bdtopo.raw")
+
def execute(context):
df_buildings = context.stage("data.bdtopo.raw")
- df_buildings.to_file("%s/%sbdtopo.gpkg" % (
- context.config("output_path"), context.config("output_prefix")
- ))
+ df_buildings.to_file(
+ "%s/%sbdtopo.gpkg"
+ % (context.config("output_path"), context.config("output_prefix"))
+ )
diff --git a/data/bdtopo/raw.py b/data/bdtopo/raw.py
index 354545ec..e794f895 100644
--- a/data/bdtopo/raw.py
+++ b/data/bdtopo/raw.py
@@ -11,13 +11,15 @@
"""
This stage loads the raw data from the French building registry (BD-TOPO).
"""
-
+
+
def configure(context):
context.config("data_path")
context.config("bdtopo_path", "bdtopo_idf")
context.stage("data.spatial.departments")
+
def get_department_string(department_id):
department_id = str(department_id)
@@ -28,11 +30,14 @@ def get_department_string(department_id):
else:
raise RuntimeError("Department identifier should have at least two characters")
+
def execute(context):
df_departments = context.stage("data.spatial.departments")
print("Expecting data for {} departments".format(len(df_departments)))
-
- source_paths = find_bdtopo("{}/{}".format(context.config("data_path"), context.config("bdtopo_path")))
+
+ source_paths = find_bdtopo(
+ "{}/{}".format(context.config("data_path"), context.config("bdtopo_path"))
+ )
df_bdtopo = []
known_ids = set()
@@ -43,8 +48,10 @@ def execute(context):
with py7zr.SevenZipFile(source_path) as archive:
# Find the path inside the archive
- internal_path = [path for path in archive.getnames() if path.endswith(".gpkg")]
-
+ internal_path = [
+ path for path in archive.getnames() if path.endswith(".gpkg")
+ ]
+
if len(internal_path) != 1:
print(" Skipping: No unambiguous geometry source found!")
@@ -54,20 +61,26 @@ def execute(context):
geometry_path = "{}/{}".format(context.path(), internal_path[0])
if geometry_path is not None:
- with context.progress(label = " Reading ...") as progress:
- data = { "cleabs": [], "nombre_de_logements": [], "geometry": [] }
- with fiona.open(geometry_path, layer = "batiment") as package:
+ with context.progress(label=" Reading ...") as progress:
+ data = {"cleabs": [], "nombre_de_logements": [], "geometry": []}
+ with fiona.open(geometry_path, layer="batiment") as package:
for item in package:
data["cleabs"].append(item["properties"]["cleabs"])
- data["nombre_de_logements"].append(item["properties"]["nombre_de_logements"])
+ data["nombre_de_logements"].append(
+ item["properties"]["nombre_de_logements"]
+ )
data["geometry"].append(geo.shape(item["geometry"]))
progress.update()
df_buildings = pd.DataFrame(data)
- df_buildings = gpd.GeoDataFrame(df_buildings, crs = "EPSG:2154")
-
- df_buildings["building_id"] = df_buildings["cleabs"].apply(lambda x: int(x[8:]))
- df_buildings["housing"] = df_buildings["nombre_de_logements"].fillna(0).astype(int)
+ df_buildings = gpd.GeoDataFrame(df_buildings, crs="EPSG:2154")
+
+ df_buildings["building_id"] = df_buildings["cleabs"].apply(
+ lambda x: int(x[8:])
+ )
+ df_buildings["housing"] = (
+ df_buildings["nombre_de_logements"].fillna(0).astype(int)
+ )
df_buildings["centroid"] = df_buildings["geometry"].centroid
df_buildings = df_buildings.set_geometry("centroid")
@@ -77,22 +90,36 @@ def execute(context):
initial_count = len(df_buildings)
df_buildings = df_buildings[df_buildings["housing"] > 0]
final_count = len(df_buildings)
- print(" {}/{} filtered by dwellings".format(initial_count - final_count, initial_count))
+ print(
+ " {}/{} filtered by dwellings".format(
+ initial_count - final_count, initial_count
+ )
+ )
initial_count = len(df_buildings)
df_buildings = df_buildings[~df_buildings["building_id"].isin(known_ids)]
final_count = len(df_buildings)
- print(" {}/{} filtered duplicates".format(initial_count - final_count, initial_count))
+ print(
+ " {}/{} filtered duplicates".format(
+ initial_count - final_count, initial_count
+ )
+ )
initial_count = len(df_buildings)
- df_buildings = gpd.sjoin(df_buildings, df_departments, predicate = "within")
+ df_buildings = gpd.sjoin(df_buildings, df_departments, predicate="within")
final_count = len(df_buildings)
- print(" {}/{} filtered spatially".format(initial_count - final_count, initial_count))
+ print(
+ " {}/{} filtered spatially".format(
+ initial_count - final_count, initial_count
+ )
+ )
df_buildings["department_id"] = df_buildings["departement_id"]
df_buildings = df_buildings.set_geometry("geometry")
- df_bdtopo.append(df_buildings[["building_id", "housing", "department_id", "geometry"]])
+ df_bdtopo.append(
+ df_buildings[["building_id", "housing", "department_id", "geometry"]]
+ )
known_ids |= set(df_buildings["building_id"].unique())
os.remove(geometry_path)
@@ -104,14 +131,18 @@ def execute(context):
return df_bdtopo[["building_id", "housing", "geometry"]]
+
def find_bdtopo(path):
candidates = sorted(list(glob.glob("{}/*.7z".format(path))))
if len(candidates) == 0:
raise RuntimeError("BD TOPO data is not available in {}".format(path))
-
+
return candidates
+
def validate(context):
- paths = find_bdtopo("{}/{}".format(context.config("data_path"), context.config("bdtopo_path")))
+ paths = find_bdtopo(
+ "{}/{}".format(context.config("data_path"), context.config("bdtopo_path"))
+ )
return sum([os.path.getsize(path) for path in paths])
diff --git a/data/bpe/cleaned.py b/data/bpe/cleaned.py
index 30e1cad3..9797729e 100644
--- a/data/bpe/cleaned.py
+++ b/data/bpe/cleaned.py
@@ -10,6 +10,7 @@
- Simplify activity types for all enterprises
"""
+
def configure(context):
context.stage("data.bpe.raw")
@@ -18,32 +19,38 @@ def configure(context):
context.config("bpe_random_seed", 0)
+
ACTIVITY_TYPE_MAP = [
- ("A", "other"), # Police, post office, etc ...
- ("A504", "leisure"), # Restaurant
- ("B", "shop"), # Shopping
- ("C", "education"), # Education
- ("D", "other"), # Health
- ("E", "other"), # Transport
- ("F", "leisure"), # Sports & Culture
- ("G", "other"), # Tourism, hotels, etc. (Hôtel = G102)
+ ("A", "other"), # Police, post office, etc ...
+ ("A504", "leisure"), # Restaurant
+ ("B", "shop"), # Shopping
+ ("C", "education"), # Education
+ ("D", "other"), # Health
+ ("E", "other"), # Transport
+ ("F", "leisure"), # Sports & Culture
+ ("G", "other"), # Tourism, hotels, etc. (Hôtel = G102)
]
+
def find_outside(context, commune_id):
df_municipalities = context.data("df_municipalities")
df = context.data("df")
df = df[df["commune_id"] == commune_id]
- zone = df_municipalities[df_municipalities["commune_id"] == commune_id]["geometry"].values[0]
+ zone = df_municipalities[df_municipalities["commune_id"] == commune_id][
+ "geometry"
+ ].values[0]
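+    # Collect facilities whose coordinates fall outside the polygon of their declared commune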
indices = [
- index for index, x, y in df[["x", "y"]].itertuples()
+ index
+ for index, x, y in df[["x", "y"]].itertuples()
if not zone.contains(geo.Point(x, y))
]
context.progress.update()
return indices
+
def execute(context):
df = context.stage("data.bpe.raw")
@@ -57,9 +64,9 @@ def execute(context):
df["activity_type"] = df["activity_type"].astype("category")
- #Add
- df = df.rename(columns={"TYPEQU":"education_type"})
- df["weight"] = 500
+    # Keep the BPE equipment code as education_type and assign a default weight
+ df = df.rename(columns={"TYPEQU": "education_type"})
+ df["weight"] = 500
# Clean coordinates
df["x"] = df["LAMBERT_X"].astype(str).str.replace(",", ".").astype(float)
df["y"] = df["LAMBERT_Y"].astype(str).str.replace(",", ".").astype(float)
@@ -77,20 +84,29 @@ def execute(context):
df["commune_id"] = df["DEPCOM"].astype("category")
- print("Found %d/%d (%.2f%%) observations without IRIS" % (
- (df["iris_id"] == "undefined").sum(), len(df), 100 * (df["iris_id"] == "undefined").mean()
- ))
+ print(
+ "Found %d/%d (%.2f%%) observations without IRIS"
+ % (
+ (df["iris_id"] == "undefined").sum(),
+ len(df),
+ 100 * (df["iris_id"] == "undefined").mean(),
+ )
+ )
# Check whether all communes in BPE are within our set of requested data
df_municipalities = context.stage("data.spatial.municipalities")
- excess_communes = set(df["commune_id"].unique()) - set(df_municipalities["commune_id"].unique())
+ excess_communes = set(df["commune_id"].unique()) - set(
+ df_municipalities["commune_id"].unique()
+ )
if len(excess_communes) > 0:
raise RuntimeError("Found additional communes: %s" % excess_communes)
    # We notice that we have some additional IRIS. Make sure they will be placed randomly in their commune later.
df_iris = context.stage("data.spatial.iris")
- excess_iris = set(df[df["iris_id"] != "undefined"]["iris_id"].unique()) - set(df_iris["iris_id"].unique())
+ excess_iris = set(df[df["iris_id"] != "undefined"]["iris_id"].unique()) - set(
+ df_iris["iris_id"].unique()
+ )
df.loc[df["iris_id"].isin(excess_iris), "iris_id"] = "undefined"
print("Excess IRIS without valid code:", excess_iris)
@@ -100,19 +116,42 @@ def execute(context):
f_undefined = df["iris_id"] == "undefined"
f_missing = df["x"].isna()
- print("Found %d/%d (%.2f%%) observations without coordinate" % (
- ((f_missing & ~f_undefined).sum(), len(df), 100 * (f_missing & ~f_undefined).mean()
- )))
+ print(
+ "Found %d/%d (%.2f%%) observations without coordinate"
+ % (
+ (
+ (f_missing & ~f_undefined).sum(),
+ len(df),
+ 100 * (f_missing & ~f_undefined).mean(),
+ )
+ )
+ )
if np.count_nonzero(f_missing & ~f_undefined) > 0:
# Impute missing coordinates for known IRIS
- df.update(spatial_utils.sample_from_zones(
- context, df_iris, df[f_missing & ~f_undefined], "iris_id", random, label = "Imputing IRIS coordinates ..."))
+ df.update(
+ spatial_utils.sample_from_zones(
+ context,
+ df_iris,
+ df[f_missing & ~f_undefined],
+ "iris_id",
+ random,
+ label="Imputing IRIS coordinates ...",
+ )
+ )
if np.count_nonzero(f_missing & f_undefined) > 0:
# Impute missing coordinates for unknown IRIS
- df.update(spatial_utils.sample_from_zones(
- context, df_municipalities, df[f_missing & f_undefined], "commune_id", random, label = "Imputing municipality coordinates ..."))
+ df.update(
+ spatial_utils.sample_from_zones(
+ context,
+ df_municipalities,
+ df[f_missing & f_undefined],
+ "commune_id",
+ random,
+ label="Imputing municipality coordinates ...",
+ )
+ )
# Consolidate
df["imputed"] = f_missing
@@ -122,8 +161,12 @@ def execute(context):
# the respective municipality. Find them and move them back in.
outside_indices = []
- with context.progress(label = "Finding outside observations ...", total = len(df["commune_id"].unique())):
- with context.parallel(dict(df = df, df_municipalities = df_municipalities)) as parallel:
+ with context.progress(
+ label="Finding outside observations ...", total=len(df["commune_id"].unique())
+ ):
+ with context.parallel(
+ dict(df=df, df_municipalities=df_municipalities)
+ ) as parallel:
for partial in parallel.imap(find_outside, df["commune_id"].unique()):
outside_indices += partial
@@ -131,14 +174,33 @@ def execute(context):
df.loc[outside_indices, "x"] = np.nan
df.loc[outside_indices, "y"] = np.nan
- df.update(spatial_utils.sample_from_zones(
- context, df_municipalities, df.loc[outside_indices], "commune_id", random, label = "Fixing outside locations ..."))
+ df.update(
+ spatial_utils.sample_from_zones(
+ context,
+ df_municipalities,
+ df.loc[outside_indices],
+ "commune_id",
+ random,
+ label="Fixing outside locations ...",
+ )
+ )
df.loc[outside_indices, "imputed"] = True
# Package up data set
- df = df[["enterprise_id", "activity_type","education_type", "commune_id", "imputed", "x", "y","weight"]]
+ df = df[
+ [
+ "enterprise_id",
+ "activity_type",
+ "education_type",
+ "commune_id",
+ "imputed",
+ "x",
+ "y",
+ "weight",
+ ]
+ ]
- df = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.x, df.y),crs="EPSG:2154")
+ df = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.x, df.y), crs="EPSG:2154")
return df
diff --git a/data/bpe/raw.py b/data/bpe/raw.py
index 98135631..95429e10 100644
--- a/data/bpe/raw.py
+++ b/data/bpe/raw.py
@@ -6,27 +6,38 @@
This stage loads the raw data from the French service registry.
"""
+
def configure(context):
context.config("data_path")
context.config("bpe_path", "bpe_2021/bpe21_ensemble_xy_csv.zip")
context.config("bpe_csv", "bpe21_ensemble_xy.csv")
context.stage("data.spatial.codes")
+
def execute(context):
df_records = []
df_codes = context.stage("data.spatial.codes")
requested_departements = df_codes["departement_id"].unique()
- with context.progress(label = "Reading BPE ...") as progress:
- with zipfile.ZipFile("{}/{}".format(context.config("data_path"), context.config("bpe_path"))) as archive:
+ with context.progress(label="Reading BPE ...") as progress:
+ with zipfile.ZipFile(
+ "{}/{}".format(context.config("data_path"), context.config("bpe_path"))
+ ) as archive:
with archive.open(context.config("bpe_csv")) as f:
- csv = pd.read_csv(f, usecols = [
- "DCIRIS", "LAMBERT_X", "LAMBERT_Y",
- "TYPEQU", "DEPCOM", "DEP"
- ], sep = ";",
- dtype = dict(DEPCOM = str, DEP = str, DCIRIS = str),
- chunksize = 10240
+ csv = pd.read_csv(
+ f,
+ usecols=[
+ "DCIRIS",
+ "LAMBERT_X",
+ "LAMBERT_Y",
+ "TYPEQU",
+ "DEPCOM",
+ "DEP",
+ ],
+ sep=";",
+ dtype=dict(DEPCOM=str, DEP=str, DCIRIS=str),
+ chunksize=10240,
)
for df_chunk in csv:
@@ -39,8 +50,13 @@ def execute(context):
return pd.concat(df_records)
+
def validate(context):
- if not os.path.exists("%s/%s" % (context.config("data_path"), context.config("bpe_path"))):
+ if not os.path.exists(
+ "%s/%s" % (context.config("data_path"), context.config("bpe_path"))
+ ):
raise RuntimeError("BPE data is not available")
- return os.path.getsize("%s/%s" % (context.config("data_path"), context.config("bpe_path")))
+ return os.path.getsize(
+ "%s/%s" % (context.config("data_path"), context.config("bpe_path"))
+ )
diff --git a/data/census/cleaned.py b/data/census/cleaned.py
index 789d0adb..1d4d4f95 100644
--- a/data/census/cleaned.py
+++ b/data/census/cleaned.py
@@ -9,6 +9,7 @@
- Clean up spatial information and sociodemographic attributes
"""
+
def configure(context):
context.stage("data.census.raw")
context.stage("data.spatial.codes")
@@ -16,27 +17,32 @@ def configure(context):
if context.config("use_urban_type", False):
context.stage("data.spatial.urban_type")
+
def execute(context):
df = context.stage("data.census.raw")
# Construct household IDs for persons with NUMMI != Z
df_household_ids = df[["CANTVILLE", "NUMMI"]]
df_household_ids = df_household_ids[df_household_ids["NUMMI"] != "Z"]
- df_household_ids["temporary"] = df_household_ids["CANTVILLE"] + df_household_ids["NUMMI"]
+ df_household_ids["temporary"] = (
+ df_household_ids["CANTVILLE"] + df_household_ids["NUMMI"]
+ )
df_household_ids = df_household_ids.drop_duplicates("temporary")
df_household_ids["household_id"] = np.arange(len(df_household_ids))
- df = pd.merge(df, df_household_ids, on = ["CANTVILLE", "NUMMI"], how = "left")
+ df = pd.merge(df, df_household_ids, on=["CANTVILLE", "NUMMI"], how="left")
# Fill up undefined household ids (those where NUMMI == Z)
f = np.isnan(df["household_id"])
- df.loc[f, "household_id"] = np.arange(np.count_nonzero(f)) + df["household_id"].max() + 1
+ df.loc[f, "household_id"] = (
+ np.arange(np.count_nonzero(f)) + df["household_id"].max() + 1
+ )
df["household_id"] = df["household_id"].astype(int)
# Put person IDs
df["person_id"] = np.arange(len(df))
# Sorting
- df = df.sort_values(by = ["household_id", "person_id"])
+ df = df.sort_values(by=["household_id", "person_id"])
# Spatial information
df["departement_id"] = df["DEPT"].astype("category")
@@ -52,7 +58,9 @@ def execute(context):
df["iris_id"] = df["iris_id"].astype("category")
# Age
- df["age"] = df["AGED"].apply(lambda x: "0" if x == "000" else x.lstrip("0")).astype(int)
+ df["age"] = (
+ df["AGED"].apply(lambda x: "0" if x == "000" else x.lstrip("0")).astype(int)
+ )
# Clean COUPLE
df["couple"] = df["COUPLE"] == "1"
@@ -81,42 +89,63 @@ def execute(context):
df["studies"] = df["ETUD"] == "1"
# Number of vehicles
- df["number_of_vehicles"] = df["VOIT"].apply(
- lambda x: str(x).replace("Z", "0").replace("X", "0")
- ).astype(int)
-
- df["number_of_vehicles"] += df["DEROU"].apply(
- lambda x: str(x).replace("U", "0").replace("Z", "0").replace("X", "0")
- ).astype(int)
+ df["number_of_vehicles"] = (
+ df["VOIT"]
+ .apply(lambda x: str(x).replace("Z", "0").replace("X", "0"))
+ .astype(int)
+ )
+
+ df["number_of_vehicles"] += (
+ df["DEROU"]
+ .apply(lambda x: str(x).replace("U", "0").replace("Z", "0").replace("X", "0"))
+ .astype(int)
+ )
# Household size
- df_size = df[["household_id"]].groupby("household_id").size().reset_index(name = "household_size")
+ df_size = (
+ df[["household_id"]]
+ .groupby("household_id")
+ .size()
+ .reset_index(name="household_size")
+ )
df = pd.merge(df, df_size)
# Socioprofessional category
df["socioprofessional_class"] = df["CS1"].astype(int)
# Consumption units
- df = pd.merge(df, hts.calculate_consumption_units(df), on = "household_id")
-
- df = df[[
- "person_id", "household_id", "weight",
- "iris_id", "commune_id", "departement_id",
- "age", "sex", "couple",
- "commute_mode", "employed",
- "studies", "number_of_vehicles", "household_size",
- "consumption_units", "socioprofessional_class"
- ]]
+ df = pd.merge(df, hts.calculate_consumption_units(df), on="household_id")
+
+ df = df[
+ [
+ "person_id",
+ "household_id",
+ "weight",
+ "iris_id",
+ "commune_id",
+ "departement_id",
+ "age",
+ "sex",
+ "couple",
+ "commute_mode",
+ "employed",
+ "studies",
+ "number_of_vehicles",
+ "household_size",
+ "consumption_units",
+ "socioprofessional_class",
+ ]
+ ]
if context.config("use_urban_type"):
- df_urban_type = context.stage("data.spatial.urban_type")[[
- "commune_id", "urban_type"
- ]]
-
+ df_urban_type = context.stage("data.spatial.urban_type")[
+ ["commune_id", "urban_type"]
+ ]
+
# Impute urban type
- df = pd.merge(df, df_urban_type, on = "commune_id", how = "left")
+ df = pd.merge(df, df_urban_type, on="commune_id", how="left")
df.loc[df["commune_id"] == "undefined", "urban_type"] = "none"
df["commune_id"] = df["commune_id"].astype("category")
- assert ~np.any(df["urban_type"].isna())
+ assert ~np.any(df["urban_type"].isna())
return df
diff --git a/data/census/filtered.py b/data/census/filtered.py
index ecd3bdcd..ac1f5771 100644
--- a/data/census/filtered.py
+++ b/data/census/filtered.py
@@ -7,10 +7,12 @@
Île-de-France.
"""
+
def configure(context):
context.stage("data.census.cleaned")
context.stage("data.spatial.codes")
+
def execute(context):
df = context.stage("data.census.cleaned")
@@ -20,7 +22,9 @@ def execute(context):
requested_departements = df_codes["departement_id"].unique()
df = df[df["departement_id"].isin(requested_departements)]
- excess_communes = set(df["commune_id"].unique()) - set(df_codes["commune_id"].unique())
+ excess_communes = set(df["commune_id"].unique()) - set(
+ df_codes["commune_id"].unique()
+ )
if not excess_communes == {"undefined"}:
raise RuntimeError("Found additional communes: %s" % excess_communes)
diff --git a/data/census/projection.py b/data/census/projection.py
index dc9a8f9f..8a26b816 100644
--- a/data/census/projection.py
+++ b/data/census/projection.py
@@ -5,28 +5,32 @@
This stage loads and cleans projection data about the French population.
"""
+
def configure(context):
context.config("data_path")
context.config("projection_path", "projection_2021")
context.config("projection_scenario", "00_central")
context.config("projection_year", None)
+
def execute(context):
source_path = "{}/{}/{}.xlsx".format(
- context.config("data_path"),
- context.config("projection_path"),
- context.config("projection_scenario"))
-
+ context.config("data_path"),
+ context.config("projection_path"),
+ context.config("projection_scenario"),
+ )
+
projection_year = int(context.config("projection_year"))
- df_all = pd.read_excel(
- source_path, sheet_name = "population", skiprows = 1).iloc[:107]
-
- df_male = pd.read_excel(
- source_path, sheet_name = "populationH", skiprows = 1).iloc[:107]
-
- df_female = pd.read_excel(
- source_path, sheet_name = "populationF", skiprows = 1).iloc[:107]
+ df_all = pd.read_excel(source_path, sheet_name="population", skiprows=1).iloc[:107]
+
+ df_male = pd.read_excel(source_path, sheet_name="populationH", skiprows=1).iloc[
+ :107
+ ]
+
+ df_female = pd.read_excel(source_path, sheet_name="populationF", skiprows=1).iloc[
+ :107
+ ]
df_male["sex"] = "male"
df_female["sex"] = "female"
@@ -35,10 +39,9 @@ def execute(context):
assert df_male["Âge au 1er janvier"].iloc[-1] == "Total des hommes"
assert df_female["Âge au 1er janvier"].iloc[-1] == "Total des femmes"
- df_sex = pd.concat([
- df_male.iloc[-1:],
- df_female.iloc[-1:]
- ]).drop(columns = ["Âge au 1er janvier"])[["sex", projection_year]]
+ df_sex = pd.concat([df_male.iloc[-1:], df_female.iloc[-1:]]).drop(
+ columns=["Âge au 1er janvier"]
+ )[["sex", projection_year]]
df_sex.columns = ["sex", "projection"]
df_age = df_all[["Âge au 1er janvier", projection_year]].iloc[:-1]
@@ -48,28 +51,28 @@ def execute(context):
df_female = df_female[["Âge au 1er janvier", "sex", projection_year]].iloc[:-1]
df_male.columns = ["age", "sex", "projection"]
- df_female.columns = ["age","sex", "projection"]
+ df_female.columns = ["age", "sex", "projection"]
df_cross = pd.concat([df_male, df_female])
df_cross["sex"] = df_cross["sex"].astype("category")
- df_total = df_all.iloc[-1:].drop(columns = ["Âge au 1er janvier"])[[projection_year]]
+ df_total = df_all.iloc[-1:].drop(columns=["Âge au 1er janvier"])[[projection_year]]
df_total.columns = ["projection"]
- return {
- "total": df_total, "sex": df_sex, "age": df_age, "cross": df_cross
- }
+ return {"total": df_total, "sex": df_sex, "age": df_age, "cross": df_cross}
+
def validate(context):
if context.config("projection_year") is not None:
source_path = "{}/{}/{}.xlsx".format(
- context.config("data_path"),
- context.config("projection_path"),
- context.config("projection_scenario"))
+ context.config("data_path"),
+ context.config("projection_path"),
+ context.config("projection_scenario"),
+ )
if not os.path.exists(source_path):
raise RuntimeError("Projection data is not available")
return os.path.getsize(source_path)
-
+
return 0
diff --git a/data/census/raw.py b/data/census/raw.py
index 73eebd4a..93099f36 100644
--- a/data/census/raw.py
+++ b/data/census/raw.py
@@ -6,6 +6,7 @@
This stage loads the raw data from the French population census.
"""
+
def configure(context):
context.stage("data.spatial.codes")
@@ -15,24 +16,26 @@ def configure(context):
context.config("projection_year", None)
+
COLUMNS_DTYPES = {
- "CANTVILLE":"str",
- "NUMMI":"str",
- "AGED":"str",
- "COUPLE":"str",
- "CS1":"str",
- "DEPT":"str",
- "ETUD":"str",
- "IPONDI":"str",
- "IRIS":"str",
- "REGION":"str",
- "SEXE":"str",
- "TACT":"str",
- "TRANS":"str",
- "VOIT":"str",
- "DEROU":"str"
+ "CANTVILLE": "str",
+ "NUMMI": "str",
+ "AGED": "str",
+ "COUPLE": "str",
+ "CS1": "str",
+ "DEPT": "str",
+ "ETUD": "str",
+ "IPONDI": "str",
+ "IRIS": "str",
+ "REGION": "str",
+ "SEXE": "str",
+ "TACT": "str",
+ "TRANS": "str",
+ "VOIT": "str",
+ "DEROU": "str",
}
+
def execute(context):
df_records = []
df_codes = context.stage("data.spatial.codes")
@@ -42,20 +45,26 @@ def execute(context):
# only pre-filter if we don't need to reweight the census later
prefilter_departments = context.config("projection_year") is None
- with context.progress(label = "Reading census ...") as progress:
+ with context.progress(label="Reading census ...") as progress:
with zipfile.ZipFile(
- "{}/{}".format(context.config("data_path"), context.config("census_path"))) as archive:
+ "{}/{}".format(context.config("data_path"), context.config("census_path"))
+ ) as archive:
with archive.open(context.config("census_csv")) as f:
- csv = pd.read_csv(f,
- usecols = COLUMNS_DTYPES.keys(), sep = ";",
- dtype = COLUMNS_DTYPES,
- chunksize = 10240)
-
+ csv = pd.read_csv(
+ f,
+ usecols=COLUMNS_DTYPES.keys(),
+ sep=";",
+ dtype=COLUMNS_DTYPES,
+ chunksize=10240,
+ )
+
for df_chunk in csv:
progress.update(len(df_chunk))
-
+
if prefilter_departments:
- df_chunk = df_chunk[df_chunk["DEPT"].isin(requested_departements)]
+ df_chunk = df_chunk[
+ df_chunk["DEPT"].isin(requested_departements)
+ ]
if len(df_chunk) > 0:
df_records.append(df_chunk)
@@ -64,7 +73,11 @@ def execute(context):
def validate(context):
- if not os.path.exists("{}/{}".format(context.config("data_path"), context.config("census_path"))):
+ if not os.path.exists(
+ "{}/{}".format(context.config("data_path"), context.config("census_path"))
+ ):
raise RuntimeError("RP 2019 data is not available")
- return os.path.getsize("{}/{}".format(context.config("data_path"), context.config("census_path")))
+ return os.path.getsize(
+ "{}/{}".format(context.config("data_path"), context.config("census_path"))
+ )
diff --git a/data/external/education.py b/data/external/education.py
index 78950ce1..7c384a65 100644
--- a/data/external/education.py
+++ b/data/external/education.py
@@ -3,6 +3,7 @@
import pandas as pd
import geopandas as gpd
+
def configure(context):
context.stage("data.bpe.cleaned")
context.stage("data.spatial.municipalities")
@@ -10,24 +11,35 @@ def configure(context):
context.config("data_path")
context.config("education_file", "education/education_addresses.geojson")
+
def execute(context):
- df_locations = context.stage("data.bpe.cleaned")[[
- "activity_type", "education_type", "commune_id","weight", "geometry"
- ]]
+ df_locations = context.stage("data.bpe.cleaned")[
+ ["activity_type", "education_type", "commune_id", "weight", "geometry"]
+ ]
df_locations = df_locations[df_locations["activity_type"] == "education"]
- df_locations = df_locations[["activity_type","education_type", "commune_id", "geometry"]].copy()
- df_locations["fake"] = False
-
- df_zones = context.stage("data.spatial.municipalities")
- required_communes = set(df_zones["commune_id"].unique())
+ df_locations = df_locations[
+ ["activity_type", "education_type", "commune_id", "geometry"]
+ ].copy()
+ df_locations["fake"] = False
+ df_zones = context.stage("data.spatial.municipalities")
+ required_communes = set(df_zones["commune_id"].unique())
- df_education = gpd.read_file("{}/{}".format(context.config("data_path"), context.config("education_file")))[["education_type", "commune_id","weight", "geometry"]]
+ df_education = gpd.read_file(
+ "{}/{}".format(context.config("data_path"), context.config("education_file"))
+ )[["education_type", "commune_id", "weight", "geometry"]]
df_education["fake"] = False
df_education = df_education.to_crs("2154")
df_education["activity_type"] = "education"
list_type = set(df_education["education_type"].unique())
- df_locations = pd.concat([df_locations[~(df_locations["education_type"].str.startswith(tuple(list_type)))],df_education[df_education["commune_id"].isin(required_communes)]])
+ df_locations = pd.concat(
+ [
+ df_locations[
+ ~(df_locations["education_type"].str.startswith(tuple(list_type)))
+ ],
+ df_education[df_education["commune_id"].isin(required_communes)],
+ ]
+ )
return df_locations
diff --git a/data/gtfs/cleaned.py b/data/gtfs/cleaned.py
index 81d0475e..f883fe08 100644
--- a/data/gtfs/cleaned.py
+++ b/data/gtfs/cleaned.py
@@ -6,15 +6,19 @@
selected regions and departments) and merges them together.
"""
+
def configure(context):
context.config("data_path")
context.config("gtfs_path", "gtfs_idf")
context.stage("data.spatial.municipalities")
+
def execute(context):
- input_files = get_input_files("{}/{}".format(context.config("data_path"), context.config("gtfs_path")))
-
+ input_files = get_input_files(
+ "{}/{}".format(context.config("data_path"), context.config("gtfs_path"))
+ )
+
# Prepare bounding area
df_area = context.stage("data.spatial.municipalities")
@@ -25,7 +29,9 @@ def execute(context):
feed = gtfs.cut_feed(feed, df_area)
        # This was fixed in pt2matsim, so we can remove this once a new release (> 20.7) is available.
- feed = gtfs.despace_stop_ids(feed) # Necessary as MATSim does not like stops/links with spaces
+ feed = gtfs.despace_stop_ids(
+ feed
+ ) # Necessary as MATSim does not like stops/links with spaces
feeds.append(feed)
@@ -34,10 +40,27 @@ def execute(context):
# Fix for pt2matsim (will be fixed after PR #173)
# Order of week days must be fixed
- days = ["monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday"]
+ days = [
+ "monday",
+ "tuesday",
+ "wednesday",
+ "thursday",
+ "friday",
+ "saturday",
+ "sunday",
+ ]
columns = list(merged_feed["calendar"].columns)
- for day in days: columns.remove(day)
- columns += ["monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday"]
+ for day in days:
+ columns.remove(day)
+ columns += [
+ "monday",
+ "tuesday",
+ "wednesday",
+ "thursday",
+ "friday",
+ "saturday",
+ "sunday",
+ ]
merged_feed["calendar"] = merged_feed["calendar"][columns]
# Write feed (not as a ZIP, but as files, for pt2matsim)
@@ -45,6 +68,7 @@ def execute(context):
return "gtfs"
+
def get_input_files(base_path):
gtfs_paths = [
str(child)
@@ -54,11 +78,14 @@ def get_input_files(base_path):
if len(gtfs_paths) == 0:
raise RuntimeError("Did not find any GTFS data (.zip) in {}".format(base_path))
-
+
return gtfs_paths
+
def validate(context):
- input_files = get_input_files("{}/{}".format(context.config("data_path"), context.config("gtfs_path")))
+ input_files = get_input_files(
+ "{}/{}".format(context.config("data_path"), context.config("gtfs_path"))
+ )
total_size = 0
for path in input_files:
diff --git a/data/gtfs/output.py b/data/gtfs/output.py
index 68c98ca9..f9fbf66f 100644
--- a/data/gtfs/output.py
+++ b/data/gtfs/output.py
@@ -4,18 +4,22 @@
Writes out the consolidated GTFS feed
"""
+
def configure(context):
context.config("output_path")
context.config("output_prefix")
context.stage("data.gtfs.cleaned")
+
def execute(context):
source_path = "%s/output" % context.path("data.gtfs.cleaned")
output_path = "%s/%sgtfs.zip" % (
- context.config("output_path"), context.config("output_prefix"))
+ context.config("output_path"),
+ context.config("output_prefix"),
+ )
- f = zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED)
+ f = zipfile.ZipFile(output_path, "w", zipfile.ZIP_DEFLATED)
print(source_path)
for path in sorted(list(glob.glob("%s/*.txt" % source_path))):
diff --git a/data/gtfs/utils.py b/data/gtfs/utils.py
index 10585a7c..a95ba1e5 100644
--- a/data/gtfs/utils.py
+++ b/data/gtfs/utils.py
@@ -5,16 +5,24 @@
import os
import numpy as np
-REQUIRED_SLOTS = [
- "agency", "stops", "routes", "trips", "stop_times"
-]
+REQUIRED_SLOTS = ["agency", "stops", "routes", "trips", "stop_times"]
OPTIONAL_SLOTS = [
- "calendar", "calendar_dates", "fare_attributes", "fare_rules",
- "shapes", "frequencies", "transfers", "pathways", "levels",
- "feed_info", "translations", "attributions"
+ "calendar",
+ "calendar_dates",
+ "fare_attributes",
+ "fare_rules",
+ "shapes",
+ "frequencies",
+ "transfers",
+ "pathways",
+ "levels",
+ "feed_info",
+ "translations",
+ "attributions",
]
+
def read_feed(path):
feed = {}
@@ -38,8 +46,13 @@ def read_feed(path):
if not "%s%s.txt" % (prefix, slot) in available_slots:
raise RuntimeError("Missing GTFS information: %s" % slot)
- if not "%scalendar.txt" % prefix in available_slots and not "%scalendar_dates.txt" % prefix in available_slots:
- raise RuntimeError("At least calendar.txt or calendar_dates.txt must be specified.")
+ if (
+ not "%scalendar.txt" % prefix in available_slots
+ and not "%scalendar_dates.txt" % prefix in available_slots
+ ):
+ raise RuntimeError(
+ "At least calendar.txt or calendar_dates.txt must be specified."
+ )
print("Loading GTFS data from %s ..." % path)
@@ -48,22 +61,27 @@ def read_feed(path):
print(" Loading %s.txt ..." % slot)
with zip.open("%s%s.txt" % (prefix, slot)) as f:
- feed[slot] = pd.read_csv(f, skipinitialspace = True)
+ feed[slot] = pd.read_csv(f, skipinitialspace=True)
else:
print(" Not loading %s.txt" % slot)
# Some cleanup
for slot in ("calendar", "calendar_dates", "trips"):
- if slot in feed and "service_id" in feed[slot] and pd.api.types.is_string_dtype(feed[slot]["service_id"]):
+ if (
+ slot in feed
+ and "service_id" in feed[slot]
+ and pd.api.types.is_string_dtype(feed[slot]["service_id"])
+ ):
initial_count = len(feed[slot])
feed[slot] = feed[slot][feed[slot]["service_id"].str.len() > 0]
final_count = len(feed[slot])
if final_count != initial_count:
- print("WARNING Removed %d/%d entries from %s with empty service_id" % (
- initial_count - final_count, initial_count, slot
- ))
+ print(
+ "WARNING Removed %d/%d entries from %s with empty service_id"
+ % (initial_count - final_count, initial_count, slot)
+ )
if "stops" in feed:
df_stops = feed["stops"]
@@ -83,7 +101,9 @@ def read_feed(path):
print("WARNING NaN numbers for min_transfer_time in transfers")
df_transfers = df_transfers[~f]
- df_transfers["min_transfer_time"] = df_transfers["min_transfer_time"].astype(int)
+ df_transfers["min_transfer_time"] = df_transfers["min_transfer_time"].astype(
+ int
+ )
feed["transfers"] = df_transfers
if "agency" in feed:
@@ -99,17 +119,19 @@ def read_feed(path):
df_routes.loc[df_routes["agency_id"].isna(), "agency_id"] = agency_id
- if "shapes" in feed: del feed["shapes"]
+ if "shapes" in feed:
+ del feed["shapes"]
feed["trips"]["shape_id"] = np.nan
# Fixes for Nantes PDL
for item in feed.keys():
- feed[item] = feed[item].drop(columns = [
- c for c in feed[item].columns if c.startswith("ext_")
- ])
+ feed[item] = feed[item].drop(
+ columns=[c for c in feed[item].columns if c.startswith("ext_")]
+ )
return feed
+
def write_feed(feed, path):
print("Writing GTFS data to %s ..." % path)
@@ -121,7 +143,7 @@ def write_feed(feed, path):
# We cannot write directly to the file handle as it
# is binary, but pandas only writes in text mode.
- zip.writestr("%s.txt" % slot, feed[slot].to_csv(index = None))
+ zip.writestr("%s.txt" % slot, feed[slot].to_csv(index=None))
else:
if not os.path.exists(path):
@@ -134,9 +156,10 @@ def write_feed(feed, path):
if slot in feed:
with open("%s/%s.txt" % (path, slot), "w+", encoding="utf-8") as f:
print(" Writing %s.txt ..." % slot)
- feed[slot].to_csv(f, index = None, lineterminator='\n')
+ feed[slot].to_csv(f, index=None, lineterminator="\n")
+
-def cut_feed(feed, df_area, crs = None):
+def cut_feed(feed, df_area, crs=None):
feed = copy_feed(feed)
df_stops = feed["stops"]
@@ -148,11 +171,10 @@ def cut_feed(feed, df_area, crs = None):
df_stations = df_stops[df_stops["location_type"] == 1].copy()
df_stations["geometry"] = [
- geo.Point(*xy)
- for xy in zip(df_stations["stop_lon"], df_stations["stop_lat"])
+ geo.Point(*xy) for xy in zip(df_stations["stop_lon"], df_stations["stop_lat"])
]
- df_stations = gpd.GeoDataFrame(df_stations, crs = "EPSG:4326")
+ df_stations = gpd.GeoDataFrame(df_stations, crs="EPSG:4326")
if not crs is None:
print("Converting stops to custom CRS", crs)
@@ -164,20 +186,22 @@ def cut_feed(feed, df_area, crs = None):
print("Filtering stations ...")
initial_count = len(df_stations)
- df_stations = gpd.sjoin(df_stations, df_area, predicate = "within")
+ df_stations = gpd.sjoin(df_stations, df_area, predicate="within")
final_count = len(df_stations)
- print("Found %d/%d stations inside the specified area" % (final_count, initial_count))
+ print(
+ "Found %d/%d stations inside the specified area" % (final_count, initial_count)
+ )
inside_stations = df_stations["stop_id"]
    # 1) Remove stops that are not inside the area and do not have a parent station inside the area
df_stops = feed["stops"]
df_stops = df_stops[
- df_stops["parent_station"].isin(inside_stations) |
- (
- df_stops["parent_station"].isna() &
- df_stops["stop_id"].isin(inside_stations)
+ df_stops["parent_station"].isin(inside_stations)
+ | (
+ df_stops["parent_station"].isna()
+ & df_stops["stop_id"].isin(inside_stations)
)
]
@@ -186,15 +210,17 @@ def cut_feed(feed, df_area, crs = None):
# 2) Remove stop times
df_times = feed["stop_times"]
- df_times = df_times[df_times["stop_id"].astype(str).isin(remaining_stops.astype(str))]
+ df_times = df_times[
+ df_times["stop_id"].astype(str).isin(remaining_stops.astype(str))
+ ]
feed["stop_times"] = df_times.copy()
# 3) Remove transfers
if "transfers" in feed:
df_transfers = feed["transfers"]
df_transfers = df_transfers[
- df_transfers["from_stop_id"].isin(remaining_stops) &
- df_transfers["to_stop_id"].isin(remaining_stops)
+ df_transfers["from_stop_id"].isin(remaining_stops)
+ & df_transfers["to_stop_id"].isin(remaining_stops)
]
feed["transfers"] = df_transfers.copy()
@@ -202,8 +228,8 @@ def cut_feed(feed, df_area, crs = None):
if "pathways" in feed:
df_pathways = feed["pathways"]
df_pathways = df_pathways[
- df_pathways["from_stop_id"].isin(remaining_stops) &
- df_pathways["to_stop_id"].isin(remaining_stops)
+ df_pathways["from_stop_id"].isin(remaining_stops)
+ & df_pathways["to_stop_id"].isin(remaining_stops)
]
feed["pathways"] = df_pathways.copy()
@@ -212,9 +238,7 @@ def cut_feed(feed, df_area, crs = None):
remaining_trips = trip_counts[trip_counts > 1].index.values
df_trips = feed["trips"]
- df_trips = df_trips[
- df_trips["trip_id"].isin(remaining_trips)
- ]
+ df_trips = df_trips[df_trips["trip_id"].isin(remaining_trips)]
feed["trips"] = df_trips.copy()
feed["stop_times"] = feed["stop_times"][
@@ -224,44 +248,73 @@ def cut_feed(feed, df_area, crs = None):
# 6) Remove frequencies
if "frequencies" in feed:
df_frequencies = feed["frequencies"]
- df_frequencies = df_frequencies[
- df_frequencies["trip_id"].isin(remaining_trips)
- ]
+ df_frequencies = df_frequencies[df_frequencies["trip_id"].isin(remaining_trips)]
feed["frequencies"] = df_frequencies.copy()
return feed
+
SLOT_COLLISIONS = [
- { "slot": "agency", "identifier": "agency_id", "references": [
- ("routes", "agency_id"), ("fare_attributes", "agency_id")] },
- { "slot": "stops", "identifier": "stop_id", "references": [
- ("stops", "parent_station"), ("stop_times", "stop_id"),
- ("transfers", "from_stop_id"), ("transfers", "to_stop_id"),
- ("pathways", "from_stop_id"), ("pathways", "to_stop_id")] },
- { "slot": "routes", "identifier": "route_id", "references": [
- ("trips", "route_id"), ("fare_rules", "route_id"),
- ("attributions", "route_id")] },
- { "slot": "trips", "identifier": "trip_id", "references": [
- ("stop_times", "trip_id"), ("frequencies", "trip_id"),
- ("attributions", "trip_id")] },
- { "slot": "calendar", "identifier": "service_id", "references": [
- ("calendar_dates", "service_id"), ("trips", "service_id")] },
- { "slot": "calendar_dates", "identifier": "service_id", "references": [
- ("trips", "service_id"), ("calendar", "service_id")] },
- { "slot": "fare_attributes", "identifier": "fare_id", "references": [
- ("fare_rules", "fare_id")] },
- { "slot": "shapes", "identifier": "shape_id", "references": [
- ("trips", "shape_id")] },
- { "slot": "pathways", "identifier": "pathway_id", "references": [] },
- { "slot": "levels", "identifier": "level_id", "references": [
- ("stops", "level_id")] },
- { "slot": "attributions", "identifier": "attribution_id" },
+ {
+ "slot": "agency",
+ "identifier": "agency_id",
+ "references": [("routes", "agency_id"), ("fare_attributes", "agency_id")],
+ },
+ {
+ "slot": "stops",
+ "identifier": "stop_id",
+ "references": [
+ ("stops", "parent_station"),
+ ("stop_times", "stop_id"),
+ ("transfers", "from_stop_id"),
+ ("transfers", "to_stop_id"),
+ ("pathways", "from_stop_id"),
+ ("pathways", "to_stop_id"),
+ ],
+ },
+ {
+ "slot": "routes",
+ "identifier": "route_id",
+ "references": [
+ ("trips", "route_id"),
+ ("fare_rules", "route_id"),
+ ("attributions", "route_id"),
+ ],
+ },
+ {
+ "slot": "trips",
+ "identifier": "trip_id",
+ "references": [
+ ("stop_times", "trip_id"),
+ ("frequencies", "trip_id"),
+ ("attributions", "trip_id"),
+ ],
+ },
+ {
+ "slot": "calendar",
+ "identifier": "service_id",
+ "references": [("calendar_dates", "service_id"), ("trips", "service_id")],
+ },
+ {
+ "slot": "calendar_dates",
+ "identifier": "service_id",
+ "references": [("trips", "service_id"), ("calendar", "service_id")],
+ },
+ {
+ "slot": "fare_attributes",
+ "identifier": "fare_id",
+ "references": [("fare_rules", "fare_id")],
+ },
+ {"slot": "shapes", "identifier": "shape_id", "references": [("trips", "shape_id")]},
+ {"slot": "pathways", "identifier": "pathway_id", "references": []},
+ {"slot": "levels", "identifier": "level_id", "references": [("stops", "level_id")]},
+ {"slot": "attributions", "identifier": "attribution_id"},
]
+
def copy_feed(feed):
- return {
- slot: feed[slot].copy() for slot in feed
- }
+ return {slot: feed[slot].copy() for slot in feed}
+
def merge_feeds(feeds):
result = {}
@@ -271,7 +324,8 @@ def merge_feeds(feeds):
return result
-def merge_two_feeds(first, second, suffix = "_merged"):
+
+def merge_two_feeds(first, second, suffix="_merged"):
feed = {}
print("Merging GTFS data ...")
@@ -284,35 +338,52 @@ def merge_two_feeds(first, second, suffix = "_merged"):
df_first = first[collision["slot"]]
df_second = second[collision["slot"]]
- df_first[collision["identifier"]] = df_first[collision["identifier"]].astype(str)
- df_second[collision["identifier"]] = df_second[collision["identifier"]].astype(str)
-
- df_concat = pd.concat([df_first, df_second], sort = True).drop_duplicates()
- duplicate_ids = list(df_concat[df_concat[collision["identifier"]].duplicated()][
- collision["identifier"]].astype(str).unique())
+ df_first[collision["identifier"]] = df_first[
+ collision["identifier"]
+ ].astype(str)
+ df_second[collision["identifier"]] = df_second[
+ collision["identifier"]
+ ].astype(str)
+
+ df_concat = pd.concat([df_first, df_second], sort=True).drop_duplicates()
+ duplicate_ids = list(
+ df_concat[df_concat[collision["identifier"]].duplicated()][
+ collision["identifier"]
+ ]
+ .astype(str)
+ .unique()
+ )
if len(duplicate_ids) > 0:
- print(" Found %d duplicate identifiers in %s" % (
- len(duplicate_ids), collision["slot"]))
+ print(
+ " Found %d duplicate identifiers in %s"
+ % (len(duplicate_ids), collision["slot"])
+ )
replacement_ids = [str(id) + suffix for id in duplicate_ids]
- df_second[collision["identifier"]] = df_second[collision["identifier"]].replace(
- duplicate_ids, replacement_ids
- )
+ df_second[collision["identifier"]] = df_second[
+ collision["identifier"]
+ ].replace(duplicate_ids, replacement_ids)
for ref_slot, ref_identifier in collision["references"]:
if ref_slot in first and ref_slot in second:
- first[ref_slot][ref_identifier] = first[ref_slot][ref_identifier].astype(str)
- second[ref_slot][ref_identifier] = second[ref_slot][ref_identifier].astype(str)
+ first[ref_slot][ref_identifier] = first[ref_slot][
+ ref_identifier
+ ].astype(str)
+ second[ref_slot][ref_identifier] = second[ref_slot][
+ ref_identifier
+ ].astype(str)
- second[ref_slot][ref_identifier] = second[ref_slot][ref_identifier].replace(
- duplicate_ids, replacement_ids
- )
+ second[ref_slot][ref_identifier] = second[ref_slot][
+ ref_identifier
+ ].replace(duplicate_ids, replacement_ids)
for slot in REQUIRED_SLOTS + OPTIONAL_SLOTS:
if slot in first and slot in second:
- feed[slot] = pd.concat([first[slot], second[slot]], sort = True).drop_duplicates()
+ feed[slot] = pd.concat(
+ [first[slot], second[slot]], sort=True
+ ).drop_duplicates()
elif slot in first:
feed[slot] = first[slot].copy()
elif slot in second:
@@ -320,7 +391,8 @@ def merge_two_feeds(first, second, suffix = "_merged"):
return feed
-def despace_stop_ids(feed, replacement = ":::"):
+
+def despace_stop_ids(feed, replacement=":::"):
feed = copy_feed(feed)
references = None
@@ -332,14 +404,20 @@ def despace_stop_ids(feed, replacement = ":::"):
df_stops = feed["stops"]
df_stops["stop_id"] = df_stops["stop_id"].astype(str)
- search_ids = list(df_stops[df_stops["stop_id"].str.contains(" ")]["stop_id"].unique())
+ search_ids = list(
+ df_stops[df_stops["stop_id"].str.contains(" ")]["stop_id"].unique()
+ )
replacement_ids = [item.replace(" ", replacement) for item in search_ids]
df_stops["stop_id"] = df_stops["stop_id"].replace(search_ids, replacement_ids)
for reference_slot, reference_field in references:
if reference_slot in feed:
- feed[reference_slot][reference_field] = feed[reference_slot][reference_field].astype(str).replace(search_ids, replacement_ids)
+ feed[reference_slot][reference_field] = (
+ feed[reference_slot][reference_field]
+ .astype(str)
+ .replace(search_ids, replacement_ids)
+ )
print("De-spaced %d/%d stops" % (len(search_ids), len(df_stops)))
diff --git a/data/hts/commute_distance.py b/data/hts/commute_distance.py
index 2a83893d..249201ef 100644
--- a/data/hts/commute_distance.py
+++ b/data/hts/commute_distance.py
@@ -1,25 +1,37 @@
import pandas as pd
import numpy as np
+
def configure(context):
context.config("random_seed")
context.stage("data.hts.selected")
+
def get_commuting_distance(df_persons, df_trips, activity_type, random):
if "euclidean_distance" in df_trips:
distance_slot = "euclidean_distance"
distance_factor = 1.0
else:
distance_slot = "routed_distance"
- distance_factor = 1.0 # / 1.3
+ distance_factor = 1.0 # / 1.3
# Add commuting distances
- df_commute_distance = df_trips[
- ((df_trips["preceding_purpose"] == "home") & (df_trips["following_purpose"] == activity_type)) |
- ((df_trips["preceding_purpose"] == activity_type) & (df_trips["following_purpose"] == "home"))
- ].drop_duplicates("person_id", keep = "first")[["person_id", distance_slot]].rename(columns = { distance_slot: "commute_distance" })
+ df_commute_distance = (
+ df_trips[
+ (
+ (df_trips["preceding_purpose"] == "home")
+ & (df_trips["following_purpose"] == activity_type)
+ )
+ | (
+ (df_trips["preceding_purpose"] == activity_type)
+ & (df_trips["following_purpose"] == "home")
+ )
+ ]
+ .drop_duplicates("person_id", keep="first")[["person_id", distance_slot]]
+ .rename(columns={distance_slot: "commute_distance"})
+ )
- df_persons = pd.merge(df_persons, df_commute_distance, on = "person_id", how = "left")
+ df_persons = pd.merge(df_persons, df_commute_distance, on="person_id", how="left")
# For the ones without commuting distance, sample from the distribution
f_missing = df_persons["commute_distance"].isna()
@@ -39,7 +51,7 @@ def get_commuting_distance(df_persons, df_trips, activity_type, random):
indices = [
np.searchsorted(cdf, r)
- for r in random.random_sample(size = np.count_nonzero(f_missing))
+ for r in random.random_sample(size=np.count_nonzero(f_missing))
]
df_persons.loc[f_missing, "commute_distance"] = values[indices]
@@ -50,17 +62,19 @@ def get_commuting_distance(df_persons, df_trips, activity_type, random):
# Attach euclidean factor
df_persons["commute_distance"] *= distance_factor
- print("Missing %s commute distances: %.2f%%" % (
- activity_type, 100 * np.count_nonzero(f_missing) / len(f_missing)
- ))
+ print(
+ "Missing %s commute distances: %.2f%%"
+ % (activity_type, 100 * np.count_nonzero(f_missing) / len(f_missing))
+ )
return df_persons
+
def execute(context):
df_households, df_persons, df_trips = context.stage("data.hts.selected")
random = np.random.RandomState(context.config("random_seed"))
return dict(
- work = get_commuting_distance(df_persons, df_trips, "work", random),
- education = get_commuting_distance(df_persons, df_trips, "education", random)
+ work=get_commuting_distance(df_persons, df_trips, "work", random),
+ education=get_commuting_distance(df_persons, df_trips, "education", random),
)
diff --git a/data/hts/comparison.py b/data/hts/comparison.py
index 3b59979a..891d4c41 100644
--- a/data/hts/comparison.py
+++ b/data/hts/comparison.py
@@ -7,28 +7,34 @@
Comparison of various attributes between EGT, ENTD and census.
"""
+
def configure(context):
context.stage("data.hts.egt.filtered")
context.stage("data.hts.entd.filtered")
context.stage("data.census.filtered")
+
def combine(htss):
households, persons, trips = [], [], []
for name, (df_hts_households, df_hts_persons, df_hts_trips) in htss.items():
- df_hts_households = pd.DataFrame(df_hts_households, copy = True)
- df_hts_persons = pd.DataFrame(df_hts_persons, copy = True)
- df_hts_trips = pd.DataFrame(df_hts_trips, copy = True)
+ df_hts_households = pd.DataFrame(df_hts_households, copy=True)
+ df_hts_persons = pd.DataFrame(df_hts_persons, copy=True)
+ df_hts_trips = pd.DataFrame(df_hts_trips, copy=True)
df_hts_households["hts"] = name
df_hts_persons["hts"] = name
df_hts_trips["hts"] = name
if "routed_distance" in df_hts_trips:
- df_hts_trips = df_hts_trips.rename(columns = { "routed_distance": "hts_distance" })
+ df_hts_trips = df_hts_trips.rename(
+ columns={"routed_distance": "hts_distance"}
+ )
df_hts_trips["distance_type"] = "routed"
elif "euclidean_distance" in df_hts_trips:
- df_hts_trips = df_hts_trips.rename(columns = { "euclidean_distance": "hts_distance" })
+ df_hts_trips = df_hts_trips.rename(
+ columns={"euclidean_distance": "hts_distance"}
+ )
df_hts_trips["distance_type"] = "euclidean"
else:
raise RuntimeError("No distance slot available")
@@ -39,11 +45,12 @@ def combine(htss):
return pd.concat(households), pd.concat(persons), pd.concat(trips)
+
def execute(context):
egt = context.stage("data.hts.egt.filtered")
entd = context.stage("data.hts.entd.filtered")
- htss = dict(egt = egt, entd = entd)
+ htss = dict(egt=egt, entd=entd)
names = sorted(list(htss.keys()))
# Make data set of all HTS
@@ -61,41 +68,80 @@ def execute(context):
"number_of_households": np.count_nonzero(f_hts_households),
"number_of_persons": np.count_nonzero(f_hts_persons),
"number_of_trips": np.count_nonzero(f_hts_trips),
- "weighted_number_of_households": df_households[f_hts_households]["household_weight"].sum(),
- "weighted_number_of_persons": df_persons[f_hts_persons]["person_weight"].sum(),
+ "weighted_number_of_households": df_households[f_hts_households][
+ "household_weight"
+ ].sum(),
+ "weighted_number_of_persons": df_persons[f_hts_persons][
+ "person_weight"
+ ].sum(),
"weighted_number_of_trips": df_trips[f_hts_trips]["trip_weight"].sum(),
- "weighted_number_of_trips_per_mobile_person": (df_persons[f_hts_persons & f_any_trips]["number_of_trips"] * df_persons[f_hts_persons & f_any_trips]["trip_weight"]).sum() / df_persons[f_hts_persons & f_any_trips]["trip_weight"].sum(),
- "share_of_students": (df_persons[f_hts_persons]["studies"] * df_persons[f_hts_persons]["person_weight"]).sum() / df_persons[f_hts_persons]["person_weight"].sum(),
- "share_of_employed": (df_persons[f_hts_persons]["employed"] * df_persons[f_hts_persons]["person_weight"]).sum() / df_persons[f_hts_persons]["person_weight"].sum(),
- "number_of_activity_chains": len(df_trips[f_hts_trips]["person_id"].unique()),
- "number_of_activity_chains": len(df_trips[f_hts_trips]["person_id"].unique()),
+ "weighted_number_of_trips_per_mobile_person": (
+ df_persons[f_hts_persons & f_any_trips]["number_of_trips"]
+ * df_persons[f_hts_persons & f_any_trips]["trip_weight"]
+ ).sum()
+ / df_persons[f_hts_persons & f_any_trips]["trip_weight"].sum(),
+ "share_of_students": (
+ df_persons[f_hts_persons]["studies"]
+ * df_persons[f_hts_persons]["person_weight"]
+ ).sum()
+ / df_persons[f_hts_persons]["person_weight"].sum(),
+ "share_of_employed": (
+ df_persons[f_hts_persons]["employed"]
+ * df_persons[f_hts_persons]["person_weight"]
+ ).sum()
+ / df_persons[f_hts_persons]["person_weight"].sum(),
+ "number_of_activity_chains": len(
+ df_trips[f_hts_trips]["person_id"].unique()
+ ),
+ "number_of_activity_chains": len(
+ df_trips[f_hts_trips]["person_id"].unique()
+ ),
}
# Trip distance distribution
- df_trips["distance_class"] = np.digitize(df_trips["hts_distance"], np.arange(1, 10) * 1000)
- df_distance = df_trips.groupby(["hts", "distance_class"])["trip_weight"].sum().reset_index(name = "trip_weight")
+ df_trips["distance_class"] = np.digitize(
+ df_trips["hts_distance"], np.arange(1, 10) * 1000
+ )
+ df_distance = (
+ df_trips.groupby(["hts", "distance_class"])["trip_weight"]
+ .sum()
+ .reset_index(name="trip_weight")
+ )
# Age distribution
AGE_BOUNDS = [14, 29, 44, 59, 74, 1000]
- df_persons["age_class"] = np.digitize(df_persons["age"], AGE_BOUNDS, right = True)
- df_age = df_persons.groupby(["hts", "age_class"])["person_weight"].sum().reset_index(name = "person_weight")
-
- df_census = pd.DataFrame(context.stage("data.census.filtered")[["age", "studies", "weight", "employed"]], copy = True)
+ df_persons["age_class"] = np.digitize(df_persons["age"], AGE_BOUNDS, right=True)
+ df_age = (
+ df_persons.groupby(["hts", "age_class"])["person_weight"]
+ .sum()
+ .reset_index(name="person_weight")
+ )
+
+ df_census = pd.DataFrame(
+ context.stage("data.census.filtered")[["age", "studies", "weight", "employed"]],
+ copy=True,
+ )
df_census["hts"] = "census"
- df_census["age_class"] = np.digitize(df_census["age"], AGE_BOUNDS, right = True)
- df_age_census = df_census.groupby(["hts", "age_class"])["weight"].sum().reset_index(name = "person_weight")
+ df_census["age_class"] = np.digitize(df_census["age"], AGE_BOUNDS, right=True)
+ df_age_census = (
+ df_census.groupby(["hts", "age_class"])["weight"]
+ .sum()
+ .reset_index(name="person_weight")
+ )
df_age = pd.concat([df_age, df_age_census])
# Add student and employment share for census
info["census"] = {
- "share_of_students": (df_census["studies"] * df_census["weight"]).sum() / df_census["weight"].sum(),
- "share_of_employed": (df_census["employed"] * df_census["weight"]).sum() / df_census["weight"].sum()
+ "share_of_students": (df_census["studies"] * df_census["weight"]).sum()
+ / df_census["weight"].sum(),
+ "share_of_employed": (df_census["employed"] * df_census["weight"]).sum()
+ / df_census["weight"].sum(),
}
return {
"info": info,
"distance_distribution": df_distance,
- "age_distribution": df_age
+ "age_distribution": df_age,
}
diff --git a/data/hts/edgt_44/cleaned.py b/data/hts/edgt_44/cleaned.py
index 1fa9b526..f9a95888 100644
--- a/data/hts/edgt_44/cleaned.py
+++ b/data/hts/edgt_44/cleaned.py
@@ -6,26 +6,54 @@
This stage cleans the Loire Atlantique EDGT.
"""
+
def configure(context):
context.stage("data.hts.edgt_44.raw")
+
PURPOSE_MAP = {
"home": [1, 2],
"work": [11, 12, 13, 81],
"education": [21, 22, 23, 24, 25, 26, 27, 28, 29],
"shop": [30, 31, 32, 33, 34, 35, 82],
"leisure": [51, 52, 53, 54],
- "other": [41, 42, 43, 44, 45, 61, 62, 63, 64, 71, 72, 73, 74, 91]
+ "other": [41, 42, 43, 44, 45, 61, 62, 63, 64, 71, 72, 73, 74, 91],
}
MODES_MAP = {
"car": [13, 15, 21, 81],
"car_passenger": [14, 16, 22, 82],
- "pt": [30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 51, 52, 53, 61, 71, 72, 73, 91, 92, 94, 95],
+ "pt": [
+ 30,
+ 31,
+ 32,
+ 33,
+ 34,
+ 35,
+ 36,
+ 37,
+ 38,
+ 39,
+ 40,
+ 41,
+ 42,
+ 51,
+ 52,
+ 53,
+ 61,
+ 71,
+ 72,
+ 73,
+ 91,
+ 92,
+ 94,
+ 95,
+ ],
"bike": [11, 17, 12, 18, 93, 19],
- "walk": [1, 2] # Actually, 2 is not really explained, but we assume it is walk
+ "walk": [1, 2], # Actually, 2 is not really explained, but we assume it is walk
}
+
def execute(context):
df_households, df_persons, df_trips = context.stage("data.hts.edgt_44.raw")
@@ -33,9 +61,13 @@ def execute(context):
df_households["departement_id"] = "44"
    # Transform original IDs to integer (they are hierarchical)
- df_households["edgt_household_id"] = (df_households["ECH"] + df_households["MTIR"]).astype(int)
+ df_households["edgt_household_id"] = (
+ df_households["ECH"] + df_households["MTIR"]
+ ).astype(int)
df_persons["edgt_person_id"] = df_persons["PER"].astype(int)
- df_persons["edgt_household_id"] = (df_persons["ECH"] + df_persons["PTIR"]).astype(int)
+ df_persons["edgt_household_id"] = (df_persons["ECH"] + df_persons["PTIR"]).astype(
+ int
+ )
df_trips["edgt_person_id"] = df_trips["PER"].astype(int)
df_trips["edgt_household_id"] = (df_trips["ECH"] + df_trips["DTIR"]).astype(int)
df_trips["edgt_trip_id"] = df_trips["NDEP"].astype(int)
@@ -44,15 +76,19 @@ def execute(context):
df_households["household_id"] = np.arange(len(df_households))
df_persons = pd.merge(
- df_persons, df_households[["edgt_household_id", "household_id", "departement_id"]],
- on = ["edgt_household_id"]
- ).sort_values(by = ["household_id", "edgt_person_id"])
+ df_persons,
+ df_households[["edgt_household_id", "household_id", "departement_id"]],
+ on=["edgt_household_id"],
+ ).sort_values(by=["household_id", "edgt_person_id"])
df_persons["person_id"] = np.arange(len(df_persons))
df_trips = pd.merge(
- df_trips, df_persons[["edgt_person_id", "edgt_household_id", "person_id", "household_id"]],
- on = ["edgt_person_id", "edgt_household_id"]
- ).sort_values(by = ["household_id", "person_id", "edgt_trip_id"])
+ df_trips,
+ df_persons[
+ ["edgt_person_id", "edgt_household_id", "person_id", "household_id"]
+ ],
+ on=["edgt_person_id", "edgt_household_id"],
+ ).sort_values(by=["household_id", "person_id", "edgt_trip_id"])
df_trips["trip_id"] = np.arange(len(df_trips))
# Trip flags
@@ -71,8 +107,10 @@ def execute(context):
df_persons["sex"] = df_persons["sex"].astype("category")
# Household size
- df_size = df_persons.groupby("household_id").size().reset_index(name = "household_size")
- df_households = pd.merge(df_households, df_size, on = "household_id")
+ df_size = (
+ df_persons.groupby("household_id").size().reset_index(name="household_size")
+ )
+ df_households = pd.merge(df_households, df_size, on="household_id")
# Clean departement
df_trips["origin_departement_id"] = "44"
@@ -80,8 +118,12 @@ def execute(context):
df_households["departement_id"] = df_households["departement_id"].astype("category")
df_persons["departement_id"] = df_persons["departement_id"].astype("category")
- df_trips["origin_departement_id"] = df_trips["origin_departement_id"].astype("category")
- df_trips["destination_departement_id"] = df_trips["destination_departement_id"].astype("category")
+ df_trips["origin_departement_id"] = df_trips["origin_departement_id"].astype(
+ "category"
+ )
+ df_trips["destination_departement_id"] = df_trips[
+ "destination_departement_id"
+ ].astype("category")
# Clean employment
df_persons["employed"] = df_persons["P7"].isin(["1", "2"])
@@ -91,7 +133,9 @@ def execute(context):
# Number of vehicles
df_households["number_of_vehicles"] = df_households["M6"] + df_households["M5"]
- df_households["number_of_vehicles"] = df_households["number_of_vehicles"].astype(int)
+ df_households["number_of_vehicles"] = df_households["number_of_vehicles"].astype(
+ int
+ )
df_households["number_of_bikes"] = df_households["M7"].astype(int)
# License
@@ -100,7 +144,7 @@ def execute(context):
    # Has subscription (not available in EDGT 44)
df_persons["has_pt_subscription"] = False
- # Survey respondents
+ # Survey respondents
    # PENQ 1 : fully answered the travel questionnaire section, having a chain or being a non-mover
    # PENQ 2 : non-respondent of the travel questionnaire section
df_persons.loc[df_persons["PENQ"] == 1, "travel_respondent"] = True
@@ -138,13 +182,13 @@ def execute(context):
df_trips["routed_distance"] = df_trips["DIST"]
# Trip times
- df_trips["departure_time"] = 3600.0 * df_trips["D4A"] # hour
- df_trips["departure_time"] += 60.0 * df_trips["D4B"] # minute
+ df_trips["departure_time"] = 3600.0 * df_trips["D4A"] # hour
+ df_trips["departure_time"] += 60.0 * df_trips["D4B"] # minute
- df_trips["arrival_time"] = 3600.0 * df_trips["D8A"] # hour
- df_trips["arrival_time"] += 60.0 * df_trips["D8B"] # minute
+ df_trips["arrival_time"] = 3600.0 * df_trips["D8A"] # hour
+ df_trips["arrival_time"] += 60.0 * df_trips["D8B"] # minute
- df_trips = df_trips.sort_values(by = ["household_id", "person_id", "trip_id"])
+ df_trips = df_trips.sort_values(by=["household_id", "person_id", "trip_id"])
df_trips = hts.fix_trip_times(df_trips)
# Durations
@@ -153,16 +197,25 @@ def execute(context):
# Add weight to trips
df_trips = pd.merge(
- df_trips, df_persons[["person_id", "COEQ"]], on = "person_id", how = "left"
- ).rename(columns = { "COEQ": "trip_weight" })
+ df_trips, df_persons[["person_id", "COEQ"]], on="person_id", how="left"
+ ).rename(columns={"COEQ": "trip_weight"})
df_persons["trip_weight"] = df_persons["COEQ"]
# Chain length
- df_count = df_trips[["person_id"]].groupby("person_id").size().reset_index(name = "number_of_trips")
+ df_count = (
+ df_trips[["person_id"]]
+ .groupby("person_id")
+ .size()
+ .reset_index(name="number_of_trips")
+ )
# People with at least one trip (number_of_trips > 0)
- df_persons = pd.merge(df_persons, df_count, on = "person_id", how = "left")
+ df_persons = pd.merge(df_persons, df_count, on="person_id", how="left")
    # People that answered the travel questionnaire section but stayed at home (number_of_trips = 0)
- df_persons.loc[(df_persons["travel_respondent"] == True) & (df_persons["number_of_trips"].isna()), "number_of_trips"] = 0
+ df_persons.loc[
+ (df_persons["travel_respondent"] == True)
+ & (df_persons["number_of_trips"].isna()),
+ "number_of_trips",
+ ] = 0
    # Non-respondents of the travel questionnaire section (number_of_trips = -1)
df_persons["number_of_trips"] = df_persons["number_of_trips"].fillna(-1).astype(int)
@@ -173,11 +226,15 @@ def execute(context):
# Calculate consumption units
hts.check_household_size(df_households, df_persons)
- df_households = pd.merge(df_households, hts.calculate_consumption_units(df_persons), on = "household_id")
+ df_households = pd.merge(
+ df_households, hts.calculate_consumption_units(df_persons), on="household_id"
+ )
# Socioprofessional class
df_persons["socioprofessional_class"] = df_persons["P9"].fillna(8).astype(int)
- df_persons.loc[df_persons["socioprofessional_class"] > 6, "socioprofessional_class"] = 8
+ df_persons.loc[
+ df_persons["socioprofessional_class"] > 6, "socioprofessional_class"
+ ] = 8
df_persons.loc[df_persons["P7"] == "7", "socioprofessional_class"] = 7
# Check departure and arrival times
diff --git a/data/hts/edgt_44/filtered.py b/data/hts/edgt_44/filtered.py
index df52ab89..cebf650a 100644
--- a/data/hts/edgt_44/filtered.py
+++ b/data/hts/edgt_44/filtered.py
@@ -5,17 +5,20 @@
This stage filters out observations of people who live or work outside of the area.
"""
+
def configure(context):
context.stage("data.hts.edgt_44.cleaned")
context.stage("data.spatial.codes")
-
- context.config("filter_hts",True)
+
+ context.config("filter_hts", True)
+
+
def execute(context):
- filter_edgt = context.config("filter_hts")
+ filter_edgt = context.config("filter_hts")
df_codes = context.stage("data.spatial.codes")
df_households, df_persons, df_trips = context.stage("data.hts.edgt_44.cleaned")
- if filter_edgt :
+ if filter_edgt:
# Filter for non-residents
requested_departments = df_codes["departement_id"].unique()
f = df_persons["departement_id"].astype(str).isin(requested_departments)
@@ -24,15 +27,26 @@ def execute(context):
# Filter for people going outside of the area
remove_ids = set()
- remove_ids |= set(df_trips[
- ~df_trips["origin_departement_id"].astype(str).isin(requested_departments) | ~df_trips["destination_departement_id"].astype(str).isin(requested_departments)
- ]["person_id"].unique())
+ remove_ids |= set(
+ df_trips[
+ ~df_trips["origin_departement_id"]
+ .astype(str)
+ .isin(requested_departments)
+ | ~df_trips["destination_departement_id"]
+ .astype(str)
+ .isin(requested_departments)
+ ]["person_id"].unique()
+ )
df_persons = df_persons[~df_persons["person_id"].isin(remove_ids)]
# Only keep trips and households that still have a person
- df_trips = df_trips[df_trips["person_id"].isin(df_persons["person_id"].unique())]
- df_households = df_households[df_households["household_id"].isin(df_persons["household_id"])]
+ df_trips = df_trips[
+ df_trips["person_id"].isin(df_persons["person_id"].unique())
+ ]
+ df_households = df_households[
+ df_households["household_id"].isin(df_persons["household_id"])
+ ]
# Finish up
df_households = df_households[hts.HOUSEHOLD_COLUMNS]
diff --git a/data/hts/edgt_44/format.py b/data/hts/edgt_44/format.py
index 0dd224e1..402a5e59 100644
--- a/data/hts/edgt_44/format.py
+++ b/data/hts/edgt_44/format.py
@@ -24,12 +24,22 @@
(50, 1, "M6", "NOMBRE DE DEUX OU TROIS ROUES MOTORISES A DISPOSITION"),
(51, 2, "M7", "NOMBRE DE VELOS A DISPOSITION"),
(53, 4, "MLA", "ANNEE INSTALLATION DANS LE LOGEMENT"),
- (57, 2, "MLB1", "PREMIER CRITERE DE CHOIX DE LA RESIDENCE ACTUELLE (sans classement)"),
- (59, 2, "MLB2", "DEUXIEME CRITERE DE CHOIX DE LA RESIDENCE ACTUELLE (sans classement)"),
+ (
+ 57,
+ 2,
+ "MLB1",
+ "PREMIER CRITERE DE CHOIX DE LA RESIDENCE ACTUELLE (sans classement)",
+ ),
+ (
+ 59,
+ 2,
+ "MLB2",
+ "DEUXIEME CRITERE DE CHOIX DE LA RESIDENCE ACTUELLE (sans classement)",
+ ),
(61, 5, "MLC", "ANCIENNE COMMUNE DE RESIDENCE"),
(66, 1, "MLD", "TYPE D'OCCUPATION DE L'ANCIEN LOGEMENT (le ménage était-il ?)"),
(67, 8, "COEM", "COEFFICIENT DE REDRESSEMENT MENAGE"),
- (75, 1, "MFIN", "FIN FICHIER MENAGE")
+ (75, 1, "MFIN", "FIN FICHIER MENAGE"),
]
PERSON_FORMAT = [
@@ -50,15 +60,35 @@
(24, 1, "P9", "PCS"),
(25, 1, "P12", "TRAVAIL OU ETUDES A DOMICILE"),
(26, 6, "P13A", "LIEU DE TRAVAIL OU D'ETUDES (OCCUPATION PRINCIPALE)"),
- (32, 1, "P15", "DISPOSITION D'UNE VOITURE EN GÉNÉRAL (DÉPLACEMENTS DOMICILE TRAVAIL OU ÉTUDES)"),
- (33, 1, "P17", "PROBLÈMES DE STATIONNEMENT EN GÉNÉRAL (SUR LIEU DE TRAVAIL OU D'ÉTUDES)"),
- (34, 1, "P17A", "DIFFICULTÉS DE STATIONNEMENT SUR OU À PROXIMITÉ DE VOTRE LIEU DE TRAVAIL OU DE VOTRE LIEU D'ÉTUDES"),
+ (
+ 32,
+ 1,
+ "P15",
+ "DISPOSITION D'UNE VOITURE EN GÉNÉRAL (DÉPLACEMENTS DOMICILE TRAVAIL OU ÉTUDES)",
+ ),
+ (
+ 33,
+ 1,
+ "P17",
+ "PROBLÈMES DE STATIONNEMENT EN GÉNÉRAL (SUR LIEU DE TRAVAIL OU D'ÉTUDES)",
+ ),
+ (
+ 34,
+ 1,
+ "P17A",
+ "DIFFICULTÉS DE STATIONNEMENT SUR OU À PROXIMITÉ DE VOTRE LIEU DE TRAVAIL OU DE VOTRE LIEU D'ÉTUDES",
+ ),
(35, 1, "P23A", "FRÉQUENCE D'UTILISATION EN SEMAINE : MARCHE A PIED"),
(36, 1, "P20", "FRÉQUENCE D'UTILISATION EN SEMAINE : BICYCLETTE"),
(37, 1, "P21", "FRÉQUENCE D'UTILISATION EN SEMAINE : 2 ROUES À MOTEUR CONDUCTEUR"),
(38, 1, "P23", "FRÉQUENCE D'UTILISATION EN SEMAINE : VOITURE CONDUCTEUR"),
(39, 1, "P24", "FRÉQUENCE D'UTILISATION EN SEMAINE : VOITURE PASSAGER"),
- (40, 1, "P25", "FRÉQUENCE D'UTILISATION EN SEMAINE : RESEAUX DE TRANSPORT EN COMMUN (TRAM, BUS, CAR…)"),
+ (
+ 40,
+ 1,
+ "P25",
+ "FRÉQUENCE D'UTILISATION EN SEMAINE : RESEAUX DE TRANSPORT EN COMMUN (TRAM, BUS, CAR…)",
+ ),
(41, 1, "P19", "SITUATION DE LA PERSONNE LA VEILLE"),
(42, 1, "P19A", "SITUATION DES ACTIFS LA VEILLE"),
(43, 1, "PL27", "FRÉQUENCE D'UTILISATION EN SEMAINE : TRAIN"),
@@ -67,7 +97,7 @@
(46, 6, "DP13", "Distance DOMICILE-TRAVAIL"),
(52, 8, "COEP", "COEFFICIENT DE REDRESSEMENT TOUTES PERSONNES"),
(60, 8, "COEQ", "COEFFICIENT DE REDRESSEMENT PERSONNES ENQUETEES"),
- (68, 1, "PFIN", "FIN FICHIER PERSONNE")
+ (68, 1, "PFIN", "FIN FICHIER PERSONNE"),
]
TRIP_FORMAT = [
@@ -95,5 +125,5 @@
(54, 8, "DOIB", "DISTANCE VOL OISEAU (en mètres)"),
(62, 8, "DIST", "DiSTANCE PARCOURUE (en mètres)"),
(70, 8, "DISP", "DiSTANCE PARCOURUE dans périmètre (en mètres)"),
- (78, 1, "DFIN", "FIN FICHIER DEPLACEMENT")
+ (78, 1, "DFIN", "FIN FICHIER DEPLACEMENT"),
]
diff --git a/data/hts/edgt_44/raw.py b/data/hts/edgt_44/raw.py
index cb58bd27..5a55d8cb 100644
--- a/data/hts/edgt_44/raw.py
+++ b/data/hts/edgt_44/raw.py
@@ -10,35 +10,66 @@
Adapted from the first implementation by Valentin Le Besond (IFSTTAR Nantes)
"""
+
def configure(context):
context.config("data_path")
+
from .format import HOUSEHOLD_FORMAT, PERSON_FORMAT, TRIP_FORMAT
HOUSEHOLD_COLUMNS = {
- "MP2": str, "MTIR": str, "ECH": str, "COEM": float,
- "M6": int, "M7": int, "M5": int
+ "MP2": str,
+ "MTIR": str,
+ "ECH": str,
+ "COEM": float,
+ "M6": int,
+ "M7": int,
+ "M5": int,
}
PERSON_COLUMNS = {
- "ECH": str, "PTIR": str, "PER": int, "PP2": str, "PENQ": int,
- "P3": int, "P2": int, "P4": int,
- "P7": str, "P12": str,
- "P9": str, "P5": str,
- "COEP": float, "COEQ": float, "P1": int
+ "ECH": str,
+ "PTIR": str,
+ "PER": int,
+ "PP2": str,
+ "PENQ": int,
+ "P3": int,
+ "P2": int,
+ "P4": int,
+ "P7": str,
+ "P12": str,
+ "P9": str,
+ "P5": str,
+ "COEP": float,
+ "COEQ": float,
+ "P1": int,
}
TRIP_COLUMNS = {
- "ECH": str, "DTIR": str, "PER": int, "NDEP": int, "DP2": str,
- "D2A": int, "D5A": int, "D3": str, "D4A": int, "D4B": int,
- "D7": str, "D8A": int, "D8B": int,
- "D8C": int, "MODP": int, "DOIB": int, "DIST": int
+ "ECH": str,
+ "DTIR": str,
+ "PER": int,
+ "NDEP": int,
+ "DP2": str,
+ "D2A": int,
+ "D5A": int,
+ "D3": str,
+ "D4A": int,
+ "D4B": int,
+ "D7": str,
+ "D8A": int,
+ "D8B": int,
+ "D8C": int,
+ "MODP": int,
+ "DOIB": int,
+ "DIST": int,
}
+
def execute(context):
# Load households
df_household_dictionary = pd.DataFrame.from_records(
- HOUSEHOLD_FORMAT, columns = ["position", "size", "variable", "description"]
+ HOUSEHOLD_FORMAT, columns=["position", "size", "variable", "description"]
)
column_widths = df_household_dictionary["size"].values
@@ -46,13 +77,17 @@ def execute(context):
df_households = pd.read_fwf(
"%s/edgt_44_2015/02a_EDGT_44_MENAGE_FAF_TEL_2015-08-07_modifZF.txt"
- % context.config("data_path"), widths = column_widths, header = None,
- names = column_names, usecols = list(HOUSEHOLD_COLUMNS.keys()), dtype = HOUSEHOLD_COLUMNS
+ % context.config("data_path"),
+ widths=column_widths,
+ header=None,
+ names=column_names,
+ usecols=list(HOUSEHOLD_COLUMNS.keys()),
+ dtype=HOUSEHOLD_COLUMNS,
)
# Load persons
df_person_dictionary = pd.DataFrame.from_records(
- PERSON_FORMAT, columns = ["position", "size", "variable", "description"]
+ PERSON_FORMAT, columns=["position", "size", "variable", "description"]
)
column_widths = df_person_dictionary["size"].values
@@ -60,13 +95,17 @@ def execute(context):
df_persons = pd.read_fwf(
"%s/edgt_44_2015/02b_EDGT_44_PERSO_FAF_TEL_ModifPCS_2016-04-14.txt"
- % context.config("data_path"), widths = column_widths, header = None,
- names = column_names, usecols = list(PERSON_COLUMNS.keys()), dtype = PERSON_COLUMNS
+ % context.config("data_path"),
+ widths=column_widths,
+ header=None,
+ names=column_names,
+ usecols=list(PERSON_COLUMNS.keys()),
+ dtype=PERSON_COLUMNS,
)
# Load trips
df_trip_dictionary = pd.DataFrame.from_records(
- TRIP_FORMAT, columns = ["position", "size", "variable", "description"]
+ TRIP_FORMAT, columns=["position", "size", "variable", "description"]
)
column_widths = df_trip_dictionary["size"].values
@@ -74,21 +113,29 @@ def execute(context):
df_trips = pd.read_fwf(
"%s/edgt_44_2015/02c_EDGT_44_DEPLA_FAF_TEL_DIST_2015-11-10.txt"
- % context.config("data_path"), widths = column_widths, header = None,
- names = column_names, usecols = list(TRIP_COLUMNS.keys()), dtype = TRIP_COLUMNS
+ % context.config("data_path"),
+ widths=column_widths,
+ header=None,
+ names=column_names,
+ usecols=list(TRIP_COLUMNS.keys()),
+ dtype=TRIP_COLUMNS,
)
return df_households, df_persons, df_trips
+
FILES = [
"02a_EDGT_44_MENAGE_FAF_TEL_2015-08-07_modifZF.txt",
"02b_EDGT_44_PERSO_FAF_TEL_ModifPCS_2016-04-14.txt",
"02c_EDGT_44_DEPLA_FAF_TEL_DIST_2015-11-10.txt",
]
+
def validate(context):
for name in FILES:
- if not os.path.exists("%s/edgt_44_2015/%s" % (context.config("data_path"), name)):
+ if not os.path.exists(
+ "%s/edgt_44_2015/%s" % (context.config("data_path"), name)
+ ):
raise RuntimeError("File missing from EDGT: %s" % name)
return [
diff --git a/data/hts/edgt_44/reweighted.py b/data/hts/edgt_44/reweighted.py
index 1bbcbd4d..647ccdef 100644
--- a/data/hts/edgt_44/reweighted.py
+++ b/data/hts/edgt_44/reweighted.py
@@ -1,8 +1,10 @@
import numpy as np
+
def configure(context):
context.stage("data.hts.edgt_44.filtered")
+
def execute(context):
df_households, df_persons, df_trips = context.stage("data.hts.edgt_44.filtered")
diff --git a/data/hts/edgt_lyon/cleaned_adisp.py b/data/hts/edgt_lyon/cleaned_adisp.py
index eed34608..905df318 100644
--- a/data/hts/edgt_lyon/cleaned_adisp.py
+++ b/data/hts/edgt_lyon/cleaned_adisp.py
@@ -7,43 +7,76 @@
This stage cleans the Lyon EDGT.
"""
+
def configure(context):
context.stage("data.hts.edgt_lyon.raw_adisp")
+
PURPOSE_MAP = {
"home": [1, 2],
"work": [11, 12, 13, 14, 81],
"education": [21, 22, 23, 24, 25, 26, 27, 28, 29, 96, 97],
"shop": [30, 31, 32, 33, 34, 35, 82, 98],
"leisure": [51, 52, 53, 54],
- "other": [41, 42, 43, 61, 62, 63, 64, 71, 72, 73, 74, 91]
+ "other": [41, 42, 43, 61, 62, 63, 64, 71, 72, 73, 74, 91],
}
MODES_MAP = {
- "car": [10, 13, 15, 21, 81], # 10 is (driving) an ambulance
+ "car": [10, 13, 15, 21, 81], # 10 is (driving) an ambulance
"car_passenger": [14, 16, 22, 82],
- "pt": [31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 51, 52, 53, 61, 71, 91, 92, 94, 95],
+ "pt": [
+ 31,
+ 32,
+ 33,
+ 34,
+ 35,
+ 36,
+ 37,
+ 38,
+ 39,
+ 40,
+ 41,
+ 42,
+ 51,
+ 52,
+ 53,
+ 61,
+ 71,
+ 91,
+ 92,
+ 94,
+ 95,
+ ],
"bike": [11, 17, 12, 18, 93],
- "walk": [1, 2] # Actually, 2 is not really explained, but we assume it is walk
+ "walk": [1, 2], # Actually, 2 is not really explained, but we assume it is walk
}
+
def execute(context):
- df_households, df_persons, df_trips, df_spatial = context.stage("data.hts.edgt_lyon.raw_adisp")
+ df_households, df_persons, df_trips, df_spatial = context.stage(
+ "data.hts.edgt_lyon.raw_adisp"
+ )
# Merge departement into households
df_spatial = df_spatial[["ZF__2015", "DepCom"]].copy()
- df_spatial["ZFM"] = df_spatial["ZF__2015"].astype(str).str.pad(width=8, side='left', fillchar='0')
+ df_spatial["ZFM"] = (
+ df_spatial["ZF__2015"].astype(str).str.pad(width=8, side="left", fillchar="0")
+ )
df_spatial["departement_id"] = df_spatial["DepCom"].str[:2]
df_spatial = df_spatial[["ZFM", "departement_id"]]
# Attention, some households get lost here!
- df_households = pd.merge(df_households, df_spatial, on = "ZFM", how = "left")
+ df_households = pd.merge(df_households, df_spatial, on="ZFM", how="left")
df_households["departement_id"] = df_households["departement_id"].fillna("unknown")
    # Transform original IDs to integer (they are hierarchical)
- df_households["edgt_household_id"] = (df_households["ZFM"] + df_households["ECH"]).astype(int)
+ df_households["edgt_household_id"] = (
+ df_households["ZFM"] + df_households["ECH"]
+ ).astype(int)
df_persons["edgt_person_id"] = df_persons["PER"].astype(int)
- df_persons["edgt_household_id"] = (df_persons["ZFP"] + df_persons["ECH"]).astype(int)
+ df_persons["edgt_household_id"] = (df_persons["ZFP"] + df_persons["ECH"]).astype(
+ int
+ )
df_trips["edgt_person_id"] = df_trips["PER"].astype(int)
df_trips["edgt_household_id"] = (df_trips["ZFD"] + df_trips["ECH"]).astype(int)
df_trips["edgt_trip_id"] = df_trips["NDEP"].astype(int)
@@ -52,15 +85,19 @@ def execute(context):
df_households["household_id"] = np.arange(len(df_households))
df_persons = pd.merge(
- df_persons, df_households[["edgt_household_id", "household_id", "departement_id"]],
- on = ["edgt_household_id"]
- ).sort_values(by = ["household_id", "edgt_person_id"])
+ df_persons,
+ df_households[["edgt_household_id", "household_id", "departement_id"]],
+ on=["edgt_household_id"],
+ ).sort_values(by=["household_id", "edgt_person_id"])
df_persons["person_id"] = np.arange(len(df_persons))
df_trips = pd.merge(
- df_trips, df_persons[["edgt_person_id", "edgt_household_id", "person_id", "household_id"]],
- on = ["edgt_person_id", "edgt_household_id"]
- ).sort_values(by = ["household_id", "person_id", "edgt_trip_id"])
+ df_trips,
+ df_persons[
+ ["edgt_person_id", "edgt_household_id", "person_id", "household_id"]
+ ],
+ on=["edgt_person_id", "edgt_household_id"],
+ ).sort_values(by=["household_id", "person_id", "edgt_trip_id"])
df_trips["trip_id"] = np.arange(len(df_trips))
# Trip flags
@@ -79,25 +116,45 @@ def execute(context):
df_persons["sex"] = df_persons["sex"].astype("category")
# Household size
- df_size = df_persons.groupby("household_id").size().reset_index(name = "household_size")
- df_households = pd.merge(df_households, df_size, on = "household_id")
+ df_size = (
+ df_persons.groupby("household_id").size().reset_index(name="household_size")
+ )
+ df_households = pd.merge(df_households, df_size, on="household_id")
# Clean departement
- df_trips = pd.merge(df_trips, df_spatial.rename(columns = {
- "ZFM": "D3", "departement_id": "origin_departement_id"
- }), on = "D3", how = "left")
+ df_trips = pd.merge(
+ df_trips,
+ df_spatial.rename(
+ columns={"ZFM": "D3", "departement_id": "origin_departement_id"}
+ ),
+ on="D3",
+ how="left",
+ )
- df_trips = pd.merge(df_trips, df_spatial.rename(columns = {
- "ZFM": "D7", "departement_id": "destination_departement_id"
- }), on = "D7", how = "left")
+ df_trips = pd.merge(
+ df_trips,
+ df_spatial.rename(
+ columns={"ZFM": "D7", "departement_id": "destination_departement_id"}
+ ),
+ on="D7",
+ how="left",
+ )
- df_trips["origin_departement_id"] = df_trips["origin_departement_id"].fillna("unknown")
- df_trips["destination_departement_id"] = df_trips["destination_departement_id"].fillna("unknown")
+ df_trips["origin_departement_id"] = df_trips["origin_departement_id"].fillna(
+ "unknown"
+ )
+ df_trips["destination_departement_id"] = df_trips[
+ "destination_departement_id"
+ ].fillna("unknown")
df_households["departement_id"] = df_households["departement_id"].astype("category")
df_persons["departement_id"] = df_persons["departement_id"].astype("category")
- df_trips["origin_departement_id"] = df_trips["origin_departement_id"].astype("category")
- df_trips["destination_departement_id"] = df_trips["destination_departement_id"].astype("category")
+ df_trips["origin_departement_id"] = df_trips["origin_departement_id"].astype(
+ "category"
+ )
+ df_trips["destination_departement_id"] = df_trips[
+ "destination_departement_id"
+ ].astype("category")
# Clean employment
df_persons["employed"] = df_persons["P9"].isin(["1", "2"])
@@ -107,16 +164,20 @@ def execute(context):
# Number of vehicles
df_households["number_of_vehicles"] = df_households["M6"] + df_households["M14"]
- df_households["number_of_vehicles"] = df_households["number_of_vehicles"].astype(int)
+ df_households["number_of_vehicles"] = df_households["number_of_vehicles"].astype(
+ int
+ )
df_households["number_of_bikes"] = df_households["M21"].astype(int)
# License
df_persons["has_license"] = df_persons["P7"] == "1"
# Has subscription
- df_persons["has_pt_subscription"] = df_persons["P12"].isin(["1", "2", "3", "5", "6"])
+ df_persons["has_pt_subscription"] = df_persons["P12"].isin(
+ ["1", "2", "3", "5", "6"]
+ )
- # Survey respondents
+ # Survey respondents
    # PENQ 1 : fully answered the travel questionnaire section, having a chain or being a non-mover
    # PENQ 2 : non-respondent of the travel questionnaire section
df_persons["PENQ"] = df_persons["PENQ"].fillna("2").astype("int")
@@ -151,13 +212,13 @@ def execute(context):
df_trips["routed_distance"] = df_trips["D12"]
# Trip times
- df_trips["departure_time"] = 3600.0 * (df_trips["D4"] // 100) # hour
- df_trips["departure_time"] += 60.0 * (df_trips["D4"] % 100) # minute
+ df_trips["departure_time"] = 3600.0 * (df_trips["D4"] // 100) # hour
+ df_trips["departure_time"] += 60.0 * (df_trips["D4"] % 100) # minute
- df_trips["arrival_time"] = 3600.0 * (df_trips["D8"] // 100) # hour
- df_trips["arrival_time"] += 60.0 * (df_trips["D8"] % 100) # minute
+ df_trips["arrival_time"] = 3600.0 * (df_trips["D8"] // 100) # hour
+ df_trips["arrival_time"] += 60.0 * (df_trips["D8"] % 100) # minute
- df_trips = df_trips.sort_values(by = ["household_id", "person_id", "trip_id"])
+ df_trips = df_trips.sort_values(by=["household_id", "person_id", "trip_id"])
df_trips = hts.fix_trip_times(df_trips)
# Durations
@@ -166,16 +227,25 @@ def execute(context):
# Add weight to trips
df_trips = pd.merge(
- df_trips, df_persons[["person_id", "COE1"]], on = "person_id", how = "left"
- ).rename(columns = { "COE1": "trip_weight" })
+ df_trips, df_persons[["person_id", "COE1"]], on="person_id", how="left"
+ ).rename(columns={"COE1": "trip_weight"})
df_persons["trip_weight"] = df_persons["COE1"]
# Chain length
- df_count = df_trips[["person_id"]].groupby("person_id").size().reset_index(name = "number_of_trips")
+ df_count = (
+ df_trips[["person_id"]]
+ .groupby("person_id")
+ .size()
+ .reset_index(name="number_of_trips")
+ )
# People with at least one trip (number_of_trips > 0)
- df_persons = pd.merge(df_persons, df_count, on = "person_id", how = "left")
+ df_persons = pd.merge(df_persons, df_count, on="person_id", how="left")
    # People that answered the travel questionnaire section but stayed at home (number_of_trips = 0)
- df_persons.loc[(df_persons["travel_respondent"] == True) & (df_persons["number_of_trips"].isna()), "number_of_trips"] = 0
+ df_persons.loc[
+ (df_persons["travel_respondent"] == True)
+ & (df_persons["number_of_trips"].isna()),
+ "number_of_trips",
+ ] = 0
    # Non-respondents of the travel questionnaire section (number_of_trips = -1)
df_persons["number_of_trips"] = df_persons["number_of_trips"].fillna(-1).astype(int)
@@ -186,7 +256,9 @@ def execute(context):
# Calculate consumption units
hts.check_household_size(df_households, df_persons)
- df_households = pd.merge(df_households, hts.calculate_consumption_units(df_persons), on = "household_id")
+ df_households = pd.merge(
+ df_households, hts.calculate_consumption_units(df_persons), on="household_id"
+ )
# Socioprofessional class
df_persons["socioprofessional_class"] = df_persons["PCSC"].fillna(8).astype(int)
diff --git a/data/hts/edgt_lyon/cleaned_cerema.py b/data/hts/edgt_lyon/cleaned_cerema.py
index d452820b..850a53ac 100644
--- a/data/hts/edgt_lyon/cleaned_cerema.py
+++ b/data/hts/edgt_lyon/cleaned_cerema.py
@@ -6,28 +6,55 @@
This stage cleans the Lyon EDGT.
"""
+
def configure(context):
context.stage("data.hts.edgt_lyon.raw_cerema")
+
PURPOSE_MAP = {
"home": [1, 2],
"work": [11, 12, 13, 81],
"education": [21, 22, 23, 24, 25, 26, 27, 28, 29],
"shop": [30, 31, 32, 33, 34, 35, 82],
"leisure": [51, 52, 53, 54],
- "other": [41, 42, 43, 61, 62, 63, 64, 71, 72, 73, 74, 91]
+ "other": [41, 42, 43, 61, 62, 63, 64, 71, 72, 73, 74, 91],
}
MODES_MAP = {
"car": [13, 15, 21, 81],
"car_passenger": [14, 16, 22, 82],
- "pt": [31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 51, 52, 53, 61, 71, 91, 92, 94, 95],
+ "pt": [
+ 31,
+ 32,
+ 33,
+ 34,
+ 35,
+ 36,
+ 37,
+ 38,
+ 39,
+ 40,
+ 41,
+ 42,
+ 51,
+ 52,
+ 53,
+ 61,
+ 71,
+ 91,
+ 92,
+ 94,
+ 95,
+ ],
"bike": [11, 17, 12, 18, 93],
- "walk": [1, 2] # Actually, 2 is not really explained, but we assume it is walk
+ "walk": [1, 2], # Actually, 2 is not really explained, but we assume it is walk
}
+
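# Illustration (not part of the patch): a minimal sketch of how a code-to-mode map like
# MODES_MAP can be applied, assuming the raw MODP column carries the integer codes listed
# above; the actual assignment happens later in this stage and is not shown in this hunk.
def apply_modes_map(df_trips, modes_map, default="pt"):
    df_trips["mode"] = default  # hypothetical default for unmapped codes
    for mode, codes in modes_map.items():
        df_trips.loc[df_trips["MODP"].isin(codes), "mode"] = mode
    df_trips["mode"] = df_trips["mode"].astype("category")
    return df_trips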
def execute(context):
- df_households, df_persons, df_trips, df_spatial = context.stage("data.hts.edgt_lyon.raw_cerema")
+ df_households, df_persons, df_trips, df_spatial = context.stage(
+ "data.hts.edgt_lyon.raw_cerema"
+ )
# Merge departement into households
df_spatial = df_spatial[["ZF__2015", "DepCom"]].copy()
@@ -36,13 +63,17 @@ def execute(context):
df_spatial = df_spatial[["MP2", "departement_id"]]
# Attention, some households get lost here!
- df_households = pd.merge(df_households, df_spatial, on = "MP2", how = "left")
+ df_households = pd.merge(df_households, df_spatial, on="MP2", how="left")
df_households["departement_id"] = df_households["departement_id"].fillna("unknown")
    # Transform original IDs to integer (they are hierarchical)
- df_households["edgt_household_id"] = (df_households["ECH"] + df_households["MP2"]).astype(int)
+ df_households["edgt_household_id"] = (
+ df_households["ECH"] + df_households["MP2"]
+ ).astype(int)
df_persons["edgt_person_id"] = df_persons["PER"].astype(int)
- df_persons["edgt_household_id"] = (df_persons["ECH"] + df_persons["PP2"]).astype(int)
+ df_persons["edgt_household_id"] = (df_persons["ECH"] + df_persons["PP2"]).astype(
+ int
+ )
df_trips["edgt_person_id"] = df_trips["PER"].astype(int)
df_trips["edgt_household_id"] = (df_trips["ECH"] + df_trips["DP2"]).astype(int)
df_trips["edgt_trip_id"] = df_trips["NDEP"].astype(int)
@@ -51,15 +82,19 @@ def execute(context):
df_households["household_id"] = np.arange(len(df_households))
df_persons = pd.merge(
- df_persons, df_households[["edgt_household_id", "household_id", "departement_id"]],
- on = ["edgt_household_id"]
- ).sort_values(by = ["household_id", "edgt_person_id"])
+ df_persons,
+ df_households[["edgt_household_id", "household_id", "departement_id"]],
+ on=["edgt_household_id"],
+ ).sort_values(by=["household_id", "edgt_person_id"])
df_persons["person_id"] = np.arange(len(df_persons))
df_trips = pd.merge(
- df_trips, df_persons[["edgt_person_id", "edgt_household_id", "person_id", "household_id"]],
- on = ["edgt_person_id", "edgt_household_id"]
- ).sort_values(by = ["household_id", "person_id", "edgt_trip_id"])
+ df_trips,
+ df_persons[
+ ["edgt_person_id", "edgt_household_id", "person_id", "household_id"]
+ ],
+ on=["edgt_person_id", "edgt_household_id"],
+ ).sort_values(by=["household_id", "person_id", "edgt_trip_id"])
df_trips["trip_id"] = np.arange(len(df_trips))
# Trip flags
@@ -78,25 +113,45 @@ def execute(context):
df_persons["sex"] = df_persons["sex"].astype("category")
# Household size
- df_size = df_persons.groupby("household_id").size().reset_index(name = "household_size")
- df_households = pd.merge(df_households, df_size, on = "household_id")
+ df_size = (
+ df_persons.groupby("household_id").size().reset_index(name="household_size")
+ )
+ df_households = pd.merge(df_households, df_size, on="household_id")
# Clean departement
- df_trips = pd.merge(df_trips, df_spatial.rename(columns = {
- "MP2": "D3", "departement_id": "origin_departement_id"
- }), on = "D3", how = "left")
+ df_trips = pd.merge(
+ df_trips,
+ df_spatial.rename(
+ columns={"MP2": "D3", "departement_id": "origin_departement_id"}
+ ),
+ on="D3",
+ how="left",
+ )
- df_trips = pd.merge(df_trips, df_spatial.rename(columns = {
- "MP2": "D7", "departement_id": "destination_departement_id"
- }), on = "D7", how = "left")
+ df_trips = pd.merge(
+ df_trips,
+ df_spatial.rename(
+ columns={"MP2": "D7", "departement_id": "destination_departement_id"}
+ ),
+ on="D7",
+ how="left",
+ )
- df_trips["origin_departement_id"] = df_trips["origin_departement_id"].fillna("unknown")
- df_trips["destination_departement_id"] = df_trips["destination_departement_id"].fillna("unknown")
+ df_trips["origin_departement_id"] = df_trips["origin_departement_id"].fillna(
+ "unknown"
+ )
+ df_trips["destination_departement_id"] = df_trips[
+ "destination_departement_id"
+ ].fillna("unknown")
df_households["departement_id"] = df_households["departement_id"].astype("category")
df_persons["departement_id"] = df_persons["departement_id"].astype("category")
- df_trips["origin_departement_id"] = df_trips["origin_departement_id"].astype("category")
- df_trips["destination_departement_id"] = df_trips["destination_departement_id"].astype("category")
+ df_trips["origin_departement_id"] = df_trips["origin_departement_id"].astype(
+ "category"
+ )
+ df_trips["destination_departement_id"] = df_trips[
+ "destination_departement_id"
+ ].astype("category")
# Clean employment
df_persons["employed"] = df_persons["P7"].isin(["1", "2"])
@@ -106,7 +161,9 @@ def execute(context):
# Number of vehicles
df_households["number_of_vehicles"] = df_households["M6"] + df_households["M5"]
- df_households["number_of_vehicles"] = df_households["number_of_vehicles"].astype(int)
+ df_households["number_of_vehicles"] = df_households["number_of_vehicles"].astype(
+ int
+ )
df_households["number_of_bikes"] = df_households["M7"].astype(int)
# License
@@ -115,7 +172,7 @@ def execute(context):
# Has subscription
df_persons["has_pt_subscription"] = df_persons["P10"].isin(["1", "2", "3"])
- # Survey respondents
+ # Survey respondents
    # PENQ 1 : fully answered the travel questionnaire section, having a trip chain or being a non-mover
    # PENQ 2 : non-respondent of the travel questionnaire section
df_persons["PENQ"] = df_persons["PENQ"].fillna("2").astype(int)
@@ -149,13 +206,13 @@ def execute(context):
df_trips["routed_distance"] = df_trips["DIST"]
# Trip times
- df_trips["departure_time"] = 3600.0 * (df_trips["D4"] // 100) # hour
- df_trips["departure_time"] += 60.0 * (df_trips["D4"] % 100) # minute
+ df_trips["departure_time"] = 3600.0 * (df_trips["D4"] // 100) # hour
+ df_trips["departure_time"] += 60.0 * (df_trips["D4"] % 100) # minute
- df_trips["arrival_time"] = 3600.0 * (df_trips["D8"] // 100) # hour
- df_trips["arrival_time"] += 60.0 * (df_trips["D8"] % 100) # minute
+ df_trips["arrival_time"] = 3600.0 * (df_trips["D8"] // 100) # hour
+ df_trips["arrival_time"] += 60.0 * (df_trips["D8"] % 100) # minute
- df_trips = df_trips.sort_values(by = ["household_id", "person_id", "trip_id"])
+ df_trips = df_trips.sort_values(by=["household_id", "person_id", "trip_id"])
df_trips = hts.fix_trip_times(df_trips)
# Durations
@@ -164,18 +221,26 @@ def execute(context):
# Add weight to trips
df_trips = pd.merge(
- df_trips, df_persons[["person_id", "COEQ"]], on = "person_id", how = "left"
- ).rename(columns = { "COEQ": "trip_weight" })
+ df_trips, df_persons[["person_id", "COEQ"]], on="person_id", how="left"
+ ).rename(columns={"COEQ": "trip_weight"})
df_persons["trip_weight"] = df_persons["COEQ"]
# Chain length
- df_count = df_trips[["person_id"]].groupby("person_id").size().reset_index(name = "number_of_trips")
+ df_count = (
+ df_trips[["person_id"]]
+ .groupby("person_id")
+ .size()
+ .reset_index(name="number_of_trips")
+ )
# People with at least one trip (number_of_trips > 0)
- df_persons = pd.merge(df_persons, df_count, on = "person_id", how = "left")
-
+ df_persons = pd.merge(df_persons, df_count, on="person_id", how="left")
+
    # People that answered the travel questionnaire section but stayed at home (number_of_trips = 0)
- df_persons.loc[df_persons["travel_respondent"] & df_persons["number_of_trips"].isna(), "number_of_trips"] = 0
+ df_persons.loc[
+ df_persons["travel_respondent"] & df_persons["number_of_trips"].isna(),
+ "number_of_trips",
+ ] = 0
    # Non-respondents of the travel questionnaire section (number_of_trips = -1)
df_persons["number_of_trips"] = df_persons["number_of_trips"].fillna(-1).astype(int)
@@ -187,11 +252,15 @@ def execute(context):
# Calculate consumption units
hts.check_household_size(df_households, df_persons)
- df_households = pd.merge(df_households, hts.calculate_consumption_units(df_persons), on = "household_id")
+ df_households = pd.merge(
+ df_households, hts.calculate_consumption_units(df_persons), on="household_id"
+ )
# Socioprofessional class
df_persons["socioprofessional_class"] = df_persons["P9"].fillna(8).astype(int)
- df_persons.loc[df_persons["socioprofessional_class"] > 6, "socioprofessional_class"] = 8
+ df_persons.loc[
+ df_persons["socioprofessional_class"] > 6, "socioprofessional_class"
+ ] = 8
df_persons.loc[df_persons["P7"] == "7", "socioprofessional_class"] = 7
# Check departure and arrival times
diff --git a/data/hts/edgt_lyon/filtered.py b/data/hts/edgt_lyon/filtered.py
index cf957685..bac0a1a3 100644
--- a/data/hts/edgt_lyon/filtered.py
+++ b/data/hts/edgt_lyon/filtered.py
@@ -5,27 +5,39 @@
This stage filters out observations which live or work outside of the area.
"""
+
def configure(context):
edgt_lyon_source = context.config("edgt_lyon_source", "unchosen")
if edgt_lyon_source == "unchosen":
- raise RuntimeError("Using 'hts: edgt_lyon' without specifying 'edgt_lyon_source' (either 'cerema' or 'adisp')")
+ raise RuntimeError(
+ "Using 'hts: edgt_lyon' without specifying 'edgt_lyon_source' (either 'cerema' or 'adisp')"
+ )
elif edgt_lyon_source == "adisp":
- context.stage("data.hts.edgt_lyon.cleaned_adisp", alias="data.hts.edgt_lyon.cleaned")
+ context.stage(
+ "data.hts.edgt_lyon.cleaned_adisp", alias="data.hts.edgt_lyon.cleaned"
+ )
elif edgt_lyon_source == "cerema":
- context.stage("data.hts.edgt_lyon.cleaned_cerema", alias="data.hts.edgt_lyon.cleaned")
+ context.stage(
+ "data.hts.edgt_lyon.cleaned_cerema", alias="data.hts.edgt_lyon.cleaned"
+ )
else:
- raise RuntimeError("Unknown Lyon EDGT source (only 'cerema' and 'adisp' are supported): %s" % edgt_lyon_source)
-
+ raise RuntimeError(
+ "Unknown Lyon EDGT source (only 'cerema' and 'adisp' are supported): %s"
+ % edgt_lyon_source
+ )
+
context.stage("data.spatial.codes")
-
- context.config("filter_hts",True)
+
+ context.config("filter_hts", True)
+
+
def execute(context):
filter_edgt = context.config("filter_hts")
df_codes = context.stage("data.spatial.codes")
df_households, df_persons, df_trips = context.stage("data.hts.edgt_lyon.cleaned")
- if filter_edgt :
+ if filter_edgt:
# Filter for non-residents
requested_departments = df_codes["departement_id"].unique()
f = df_persons["departement_id"].astype(str).isin(requested_departments)
@@ -34,15 +46,26 @@ def execute(context):
# Filter for people going outside of the area
remove_ids = set()
- remove_ids |= set(df_trips[
- ~df_trips["origin_departement_id"].astype(str).isin(requested_departments) | ~df_trips["destination_departement_id"].astype(str).isin(requested_departments)
- ]["person_id"].unique())
+ remove_ids |= set(
+ df_trips[
+ ~df_trips["origin_departement_id"]
+ .astype(str)
+ .isin(requested_departments)
+ | ~df_trips["destination_departement_id"]
+ .astype(str)
+ .isin(requested_departments)
+ ]["person_id"].unique()
+ )
df_persons = df_persons[~df_persons["person_id"].isin(remove_ids)]
# Only keep trips and households that still have a person
- df_trips = df_trips[df_trips["person_id"].isin(df_persons["person_id"].unique())]
- df_households = df_households[df_households["household_id"].isin(df_persons["household_id"])]
+ df_trips = df_trips[
+ df_trips["person_id"].isin(df_persons["person_id"].unique())
+ ]
+ df_households = df_households[
+ df_households["household_id"].isin(df_persons["household_id"])
+ ]
# Finish up
df_households = df_households[hts.HOUSEHOLD_COLUMNS]
diff --git a/data/hts/edgt_lyon/raw_adisp.py b/data/hts/edgt_lyon/raw_adisp.py
index 5ec5f111..31f9d877 100644
--- a/data/hts/edgt_lyon/raw_adisp.py
+++ b/data/hts/edgt_lyon/raw_adisp.py
@@ -10,77 +10,121 @@
Adapted from the first implementation by Valentin Le Besond (IFSTTAR Nantes)
"""
+
def configure(context):
context.config("data_path")
+
HOUSEHOLD_COLUMNS = {
- "ECH": str, "ZFM": str, # id
- "M6": int, "M21": int, "M14": int, # number_of_cars, number_of_bikes, number_of_motorbikes
- "COE0": float # weights
+ "ECH": str,
+ "ZFM": str, # id
+ "M6": int,
+ "M21": int,
+ "M14": int, # number_of_cars, number_of_bikes, number_of_motorbikes
+ "COE0": float, # weights
}
PERSON_COLUMNS = {
- "ECH": str, "PER": int, "ZFP": str, # id
- "PENQ": str, # respondents of travel questionary section
- "P2": int, "P4": int, # sex, age
- "P9": str, # employed, studies
- "P7": str, "P12": str, # has_license, has_pt_subscription
- "PCSC": str, # socioprofessional_class
- "COEP": float, "COE1": float # weights
+ "ECH": str,
+ "PER": int,
+ "ZFP": str, # id
+ "PENQ": str, # respondents of travel questionary section
+ "P2": int,
+ "P4": int, # sex, age
+ "P9": str, # employed, studies
+ "P7": str,
+ "P12": str, # has_license, has_pt_subscription
+ "PCSC": str, # socioprofessional_class
+ "COEP": float,
+ "COE1": float, # weights
}
TRIP_COLUMNS = {
- "ECH": str, "PER": int, "NDEP": int, "ZFD": str, # id
- "D2A": int, "D5A": int, # preceding_purpose, following_purpose
- "D3": str, "D7": str, # origin_zone, destination_zone
- "D4": int, "D8": int, # time_departure, time_arrival
- "MODP": int, "D11": int, "D12": int # mode, euclidean_distance, routed_distance
+ "ECH": str,
+ "PER": int,
+ "NDEP": int,
+ "ZFD": str, # id
+ "D2A": int,
+ "D5A": int, # preceding_purpose, following_purpose
+ "D3": str,
+ "D7": str, # origin_zone, destination_zone
+ "D4": int,
+ "D8": int, # time_departure, time_arrival
+ "MODP": int,
+ "D11": int,
+ "D12": int, # mode, euclidean_distance, routed_distance
}
+
def execute(context):
# Load households
- df_households = pd.concat([
- pd.read_csv(
- "%s/edgt_lyon_2015/lyon_2015_std_faf_men.csv"
- % context.config("data_path"), sep=";", usecols = list(HOUSEHOLD_COLUMNS.keys()), dtype = HOUSEHOLD_COLUMNS
- ),
- pd.read_csv(
- "%s/edgt_lyon_2015/lyon_2015_std_tel_men.csv"
- % context.config("data_path"), sep=";", usecols = list(HOUSEHOLD_COLUMNS.keys()), dtype = HOUSEHOLD_COLUMNS
- )
- ])
+ df_households = pd.concat(
+ [
+ pd.read_csv(
+ "%s/edgt_lyon_2015/lyon_2015_std_faf_men.csv"
+ % context.config("data_path"),
+ sep=";",
+ usecols=list(HOUSEHOLD_COLUMNS.keys()),
+ dtype=HOUSEHOLD_COLUMNS,
+ ),
+ pd.read_csv(
+ "%s/edgt_lyon_2015/lyon_2015_std_tel_men.csv"
+ % context.config("data_path"),
+ sep=";",
+ usecols=list(HOUSEHOLD_COLUMNS.keys()),
+ dtype=HOUSEHOLD_COLUMNS,
+ ),
+ ]
+ )
# Load persons
- df_persons = pd.concat([
- pd.read_csv(
- "%s/edgt_lyon_2015/lyon_2015_std_faf_pers.csv"
- % context.config("data_path"), sep=";", usecols = list(PERSON_COLUMNS.keys()), dtype = PERSON_COLUMNS
- ),
- pd.read_csv(
- "%s/edgt_lyon_2015/lyon_2015_std_tel_pers.csv"
- % context.config("data_path"), sep=";", usecols = list(PERSON_COLUMNS.keys()), dtype = PERSON_COLUMNS
- )
- ])
+ df_persons = pd.concat(
+ [
+ pd.read_csv(
+ "%s/edgt_lyon_2015/lyon_2015_std_faf_pers.csv"
+ % context.config("data_path"),
+ sep=";",
+ usecols=list(PERSON_COLUMNS.keys()),
+ dtype=PERSON_COLUMNS,
+ ),
+ pd.read_csv(
+ "%s/edgt_lyon_2015/lyon_2015_std_tel_pers.csv"
+ % context.config("data_path"),
+ sep=";",
+ usecols=list(PERSON_COLUMNS.keys()),
+ dtype=PERSON_COLUMNS,
+ ),
+ ]
+ )
# Load trips
- df_trips = pd.concat([
- pd.read_csv(
- "%s/edgt_lyon_2015/lyon_2015_std_faf_depl.csv"
- % context.config("data_path"), sep=";", usecols = list(TRIP_COLUMNS.keys()), dtype = TRIP_COLUMNS
- ),
- pd.read_csv(
- "%s/edgt_lyon_2015/lyon_2015_std_tel_depl.csv"
- % context.config("data_path"), sep=";", usecols = list(TRIP_COLUMNS.keys()), dtype = TRIP_COLUMNS
- )
- ])
+ df_trips = pd.concat(
+ [
+ pd.read_csv(
+ "%s/edgt_lyon_2015/lyon_2015_std_faf_depl.csv"
+ % context.config("data_path"),
+ sep=";",
+ usecols=list(TRIP_COLUMNS.keys()),
+ dtype=TRIP_COLUMNS,
+ ),
+ pd.read_csv(
+ "%s/edgt_lyon_2015/lyon_2015_std_tel_depl.csv"
+ % context.config("data_path"),
+ sep=";",
+ usecols=list(TRIP_COLUMNS.keys()),
+ dtype=TRIP_COLUMNS,
+ ),
+ ]
+ )
# Load spatial data
df_spatial = gpd.read_file(
- "%s/edgt_lyon_2015/EDGT_AML2015_ZF_GT.TAB"
- % context.config("data_path"))
+ "%s/edgt_lyon_2015/EDGT_AML2015_ZF_GT.TAB" % context.config("data_path")
+ )
return df_households, df_persons, df_trips, df_spatial
+
FILES = [
"lyon_2015_std_faf_men.csv",
"lyon_2015_std_tel_men.csv",
@@ -92,12 +136,15 @@ def execute(context):
"EDGT_AML2015_ZF_GT.ID",
"EDGT_AML2015_ZF_GT.IND",
"EDGT_AML2015_ZF_GT.MAP",
- "EDGT_AML2015_ZF_GT.TAB"
+ "EDGT_AML2015_ZF_GT.TAB",
]
+
def validate(context):
for name in FILES:
- if not os.path.exists("%s/edgt_lyon_2015/%s" % (context.config("data_path"), name)):
+ if not os.path.exists(
+ "%s/edgt_lyon_2015/%s" % (context.config("data_path"), name)
+ ):
raise RuntimeError("File missing from EDGT: %s" % name)
return [
diff --git a/data/hts/edgt_lyon/raw_cerema.py b/data/hts/edgt_lyon/raw_cerema.py
index c48b28a3..76e3835d 100644
--- a/data/hts/edgt_lyon/raw_cerema.py
+++ b/data/hts/edgt_lyon/raw_cerema.py
@@ -10,82 +10,134 @@
Adapted from the first implementation by Valentin Le Besond (IFSTTAR Nantes)
"""
+
def configure(context):
context.config("data_path")
+
HOUSEHOLD_COLUMNS = {
- "MP2": str, "ECH": str, "COEM": float,
- "M6": int, "M7": int, "M5": int
+ "MP2": str,
+ "ECH": str,
+ "COEM": float,
+ "M6": int,
+ "M7": int,
+ "M5": int,
}
PERSON_COLUMNS = {
- "ECH": str, "PER": int, "PP2": str, "PENQ": str,
- "P3": int, "P2": int, "P4": int,
- "P7": str, "P12": str,
- "P10": str, "P9": str, "P5": str,
- "COEP": float, "COEQ": float, "P1": int
+ "ECH": str,
+ "PER": int,
+ "PP2": str,
+ "PENQ": str,
+ "P3": int,
+ "P2": int,
+ "P4": int,
+ "P7": str,
+ "P12": str,
+ "P10": str,
+ "P9": str,
+ "P5": str,
+ "COEP": float,
+ "COEQ": float,
+ "P1": int,
}
TRIP_COLUMNS = {
- "ECH": str, "PER": int, "NDEP": int, "DP2": str,
- "D2A": int, "D5A": int, "D3": str, "D4": int,
- "D7": str, "D8": int,
- "D8C": int, "MODP": int, "DOIB": int, "DIST": int
+ "ECH": str,
+ "PER": int,
+ "NDEP": int,
+ "DP2": str,
+ "D2A": int,
+ "D5A": int,
+ "D3": str,
+ "D4": int,
+ "D7": str,
+ "D8": int,
+ "D8C": int,
+ "MODP": int,
+ "DOIB": int,
+ "DIST": int,
}
+
def execute(context):
# Load households
df_household_dictionary = pd.read_excel(
"%s/edgt_lyon_2015/EDGT-AML-2015_Total_Dessin&Dictionnaire.xls"
- % context.config("data_path"), skiprows = 1, nrows = 21,
- usecols = [1,2], names = ["size", "variable"])
+ % context.config("data_path"),
+ skiprows=1,
+ nrows=21,
+ usecols=[1, 2],
+ names=["size", "variable"],
+ )
column_widths = df_household_dictionary["size"].values
column_names = df_household_dictionary["variable"].values
df_households = pd.read_fwf(
"%s/edgt_lyon_2015/EDGT_AML_MENAGE_FAF_TEL_2015-08-03.txt"
- % context.config("data_path"), widths = column_widths, header = None,
- names = column_names, usecols = list(HOUSEHOLD_COLUMNS.keys()), dtype = HOUSEHOLD_COLUMNS
+ % context.config("data_path"),
+ widths=column_widths,
+ header=None,
+ names=column_names,
+ usecols=list(HOUSEHOLD_COLUMNS.keys()),
+ dtype=HOUSEHOLD_COLUMNS,
)
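# Illustration (not part of the patch): the Cerema export is a fixed-width text file, so the
# column sizes and variable names read from the "Dessin & Dictionnaire" sheet above are fed to
# pd.read_fwf, which slices every line at those offsets. A minimal sketch with made-up widths:
import io
import pandas as pd
demo = pd.read_fwf(
    io.StringIO("01A12\n02B34\n"),
    widths=[2, 1, 2],             # hypothetical column sizes
    header=None,
    names=["ECH", "ZONE", "M6"],  # hypothetical variable names
    dtype={"ECH": str, "ZONE": str, "M6": int},
)
# demo now holds two rows: ECH in {"01", "02"}, ZONE in {"A", "B"}, M6 in {12, 34}.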
# Load persons
df_person_dictionary = pd.read_excel(
"%s/edgt_lyon_2015/EDGT-AML-2015_Total_Dessin&Dictionnaire.xls"
- % context.config("data_path"), skiprows = 25, nrows = 34,
- usecols = [1,2], names = ["size", "variable"])
+ % context.config("data_path"),
+ skiprows=25,
+ nrows=34,
+ usecols=[1, 2],
+ names=["size", "variable"],
+ )
column_widths = df_person_dictionary["size"].values
column_names = df_person_dictionary["variable"].values
df_persons = pd.read_fwf(
"%s/edgt_lyon_2015/EDGT_AML_PERSO_DIST_DT_2015-10-27.txt"
- % context.config("data_path"), widths = column_widths, header = None,
- names = column_names, usecols = list(PERSON_COLUMNS.keys()), dtype = PERSON_COLUMNS
+ % context.config("data_path"),
+ widths=column_widths,
+ header=None,
+ names=column_names,
+ usecols=list(PERSON_COLUMNS.keys()),
+ dtype=PERSON_COLUMNS,
)
# Load trips
df_trip_dictionary = pd.read_excel(
"%s/edgt_lyon_2015/EDGT-AML-2015_Total_Dessin&Dictionnaire.xls"
- % context.config("data_path"), skiprows = 62, nrows = 24,
- usecols = [1,2], names = ["size", "variable"])
+ % context.config("data_path"),
+ skiprows=62,
+ nrows=24,
+ usecols=[1, 2],
+ names=["size", "variable"],
+ )
column_widths = df_trip_dictionary["size"].values
column_names = df_trip_dictionary["variable"].values
df_trips = pd.read_fwf(
"%s/edgt_lyon_2015/EDGT_AML_DEPLA_DIST_2015-10-27.txt"
- % context.config("data_path"), widths = column_widths, header = None,
- names = column_names, usecols = list(TRIP_COLUMNS.keys()), dtype = TRIP_COLUMNS
+ % context.config("data_path"),
+ widths=column_widths,
+ header=None,
+ names=column_names,
+ usecols=list(TRIP_COLUMNS.keys()),
+ dtype=TRIP_COLUMNS,
)
# Load spatial data
df_spatial = gpd.read_file(
- "%s/edgt_lyon_2015/EDGT_AML2015_ZF_GT.TAB"
- % context.config("data_path"))
+ "%s/edgt_lyon_2015/EDGT_AML2015_ZF_GT.TAB" % context.config("data_path")
+ )
return df_households, df_persons, df_trips, df_spatial
+
FILES = [
"EDGT_AML_MENAGE_FAF_TEL_2015-08-03.txt",
"EDGT_AML_PERSO_DIST_DT_2015-10-27.txt",
@@ -95,12 +147,15 @@ def execute(context):
"EDGT_AML2015_ZF_GT.ID",
"EDGT_AML2015_ZF_GT.IND",
"EDGT_AML2015_ZF_GT.MAP",
- "EDGT_AML2015_ZF_GT.TAB"
+ "EDGT_AML2015_ZF_GT.TAB",
]
+
def validate(context):
for name in FILES:
- if not os.path.exists("%s/edgt_lyon_2015/%s" % (context.config("data_path"), name)):
+ if not os.path.exists(
+ "%s/edgt_lyon_2015/%s" % (context.config("data_path"), name)
+ ):
raise RuntimeError("File missing from EDGT: %s" % name)
return [
diff --git a/data/hts/edgt_lyon/reweighted.py b/data/hts/edgt_lyon/reweighted.py
index f858d79b..368c5558 100644
--- a/data/hts/edgt_lyon/reweighted.py
+++ b/data/hts/edgt_lyon/reweighted.py
@@ -1,8 +1,10 @@
import numpy as np
+
def configure(context):
context.stage("data.hts.edgt_lyon.filtered")
+
def execute(context):
df_households, df_persons, df_trips = context.stage("data.hts.edgt_lyon.filtered")
diff --git a/data/hts/egt/cleaned.py b/data/hts/egt/cleaned.py
index 490320c9..7f54f317 100644
--- a/data/hts/egt/cleaned.py
+++ b/data/hts/egt/cleaned.py
@@ -7,43 +7,46 @@
This stage cleans the regional HTS.
"""
+
def configure(context):
context.stage("data.hts.egt.raw")
if context.config("use_urban_type", False):
context.stage("data.spatial.urban_type")
+
INCOME_CLASS_BOUNDS = [800, 1200, 1600, 2000, 2400, 3000, 3500, 4500, 5500, 1e6]
PURPOSE_MAP = {
- 1 : "home",
- 2 : "work",
- 3 : "work",
- 4 : "education",
- 5 : "shop",
- 6 : "other",
- 7 : "other",
- 8 : "leisure"
+ 1: "home",
+ 2: "work",
+ 3: "work",
+ 4: "education",
+ 5: "shop",
+ 6: "other",
+ 7: "other",
+ 8: "leisure",
# 9 : "other" # default
}
MODES_MAP = {
- 1 : "pt",
- 2 : "car",
- 3 : "car_passenger",
- 4 : "car",
- 5 : "bike",
- #6 : "pt", # default (other)
- 7 : "walk"
+ 1: "pt",
+ 2: "car",
+ 3: "car_passenger",
+ 4: "car",
+ 5: "bike",
+ # 6 : "pt", # default (other)
+ 7: "walk",
}
+
def execute(context):
df_households, df_persons, df_trips = context.stage("data.hts.egt.raw")
# Make copies
- df_households = pd.DataFrame(df_households, copy = True)
- df_persons = pd.DataFrame(df_persons, copy = True)
- df_trips = pd.DataFrame(df_trips, copy = True)
+ df_households = pd.DataFrame(df_households, copy=True)
+ df_persons = pd.DataFrame(df_persons, copy=True)
+ df_trips = pd.DataFrame(df_trips, copy=True)
    # Transform original IDs to integer (they are hierarchical)
df_households["egt_household_id"] = df_households["NQUEST"].astype(int)
@@ -57,14 +60,16 @@ def execute(context):
df_households["household_id"] = np.arange(len(df_households))
df_persons = pd.merge(
- df_persons, df_households[["egt_household_id", "household_id"]],
- on = "egt_household_id"
+ df_persons,
+ df_households[["egt_household_id", "household_id"]],
+ on="egt_household_id",
)
df_persons["person_id"] = np.arange(len(df_persons))
df_trips = pd.merge(
- df_trips, df_persons[["egt_person_id", "egt_household_id", "person_id", "household_id"]],
- on = ["egt_person_id", "egt_household_id"]
+ df_trips,
+ df_persons[["egt_person_id", "egt_household_id", "person_id", "household_id"]],
+ on=["egt_person_id", "egt_household_id"],
)
df_trips["trip_id"] = np.arange(len(df_trips))
@@ -88,9 +93,13 @@ def execute(context):
# Clean departement
df_persons["departement_id"] = df_persons["RESDEP"].astype(str).astype("category")
- df_households["departement_id"] = df_households["RESDEP"].astype(str).astype("category")
+ df_households["departement_id"] = (
+ df_households["RESDEP"].astype(str).astype("category")
+ )
df_trips["origin_departement_id"] = df_trips["ORDEP"].astype(str).astype("category")
- df_trips["destination_departement_id"] = df_trips["DESTDEP"].astype(str).astype("category")
+ df_trips["destination_departement_id"] = (
+ df_trips["DESTDEP"].astype(str).astype("category")
+ )
# Clean employment
df_persons["employed"] = df_persons["OCCP"].isin([1.0, 2.0])
@@ -99,38 +108,50 @@ def execute(context):
df_persons["studies"] = df_persons["OCCP"].isin([3.0, 4.0, 5.0])
# Number of vehicles
- df_households["number_of_vehicles"] = df_households["NB_2RM"] + df_households["NB_VD"]
- df_households["number_of_vehicles"] = df_households["number_of_vehicles"].astype(int)
+ df_households["number_of_vehicles"] = (
+ df_households["NB_2RM"] + df_households["NB_VD"]
+ )
+ df_households["number_of_vehicles"] = df_households["number_of_vehicles"].astype(
+ int
+ )
df_households["number_of_bikes"] = df_households["NB_VELO"].astype(int)
# License
- df_persons["has_license"] = (df_persons["PERMVP"] == 1) | (df_persons["PERM2RM"] == 1)
+ df_persons["has_license"] = (df_persons["PERMVP"] == 1) | (
+ df_persons["PERM2RM"] == 1
+ )
# Has subscription
df_persons["has_pt_subscription"] = df_persons["ABONTC"] > 1
# Household income
df_households["income_class"] = df_households["REVENU"] - 1
- df_households.loc[df_households["income_class"].isin([10.0, 11.0, np.nan]), "income_class"] = -1
+ df_households.loc[
+ df_households["income_class"].isin([10.0, 11.0, np.nan]), "income_class"
+ ] = -1
df_households["income_class"] = df_households["income_class"].astype(int)
# Impute urban type
if context.config("use_urban_type"):
- df_urban_type = context.stage("data.spatial.urban_type")[[
- "commune_id", "urban_type"
- ]]
+ df_urban_type = context.stage("data.spatial.urban_type")[
+ ["commune_id", "urban_type"]
+ ]
# Household municipality
df_households["commune_id"] = df_households["RESCOMM"].astype(str)
- df_persons = pd.merge(df_persons, df_households[["household_id", "commune_id"]], how = "left")
+ df_persons = pd.merge(
+ df_persons, df_households[["household_id", "commune_id"]], how="left"
+ )
assert np.all(~df_persons["commune_id"].isna())
-
+
# Impute urban type
- df_persons = pd.merge(df_persons, df_urban_type, on = "commune_id", how = "left")
- df_persons["urban_type"] = df_persons["urban_type"].fillna("none").astype("category")
+ df_persons = pd.merge(df_persons, df_urban_type, on="commune_id", how="left")
+ df_persons["urban_type"] = (
+ df_persons["urban_type"].fillna("none").astype("category")
+ )
- df_households.drop(columns = ["commune_id"])
- df_persons.drop(columns = ["commune_id"])
+ df_households.drop(columns=["commune_id"])
+ df_persons.drop(columns=["commune_id"])
# Trip purpose
df_trips["following_purpose"] = "other"
@@ -165,8 +186,8 @@ def execute(context):
# Add weight to trips
df_trips = pd.merge(
- df_trips, df_persons[["person_id", "person_weight"]], on = "person_id", how = "left"
- ).rename(columns = { "person_weight": "trip_weight" })
+ df_trips, df_persons[["person_id", "person_weight"]], on="person_id", how="left"
+ ).rename(columns={"person_weight": "trip_weight"})
df_persons["trip_weight"] = df_persons["person_weight"]
# Chain length
@@ -179,7 +200,9 @@ def execute(context):
# Calculate consumption units
hts.check_household_size(df_households, df_persons)
- df_households = pd.merge(df_households, hts.calculate_consumption_units(df_persons), on = "household_id")
+ df_households = pd.merge(
+ df_households, hts.calculate_consumption_units(df_persons), on="household_id"
+ )
# Socioprofessional class
df_persons["socioprofessional_class"] = df_persons["CS8"].fillna(8).astype(int)
@@ -194,19 +217,29 @@ def execute(context):
nan_count = np.count_nonzero(f)
total_count = len(df_persons)
- print("Dropping %d/%d persons because of NaN values in departure and arrival times" % (nan_count, total_count))
+ print(
+ "Dropping %d/%d persons because of NaN values in departure and arrival times"
+ % (nan_count, total_count)
+ )
df_persons = df_persons[~f]
df_trips = df_trips[df_trips["person_id"].isin(df_persons["person_id"].unique())]
- df_households = df_households[df_households["household_id"].isin(df_persons["household_id"])]
+ df_households = df_households[
+ df_households["household_id"].isin(df_persons["household_id"])
+ ]
    # Fix activity types (because of inconsistent EGT data and removals during the trip time fixing step)
hts.fix_activity_types(df_trips)
return df_households, df_persons, df_trips
+
def calculate_income_class(df):
assert "household_income" in df
assert "consumption_units" in df
- return np.digitize(df["household_income"] / df["consumption_units"], INCOME_CLASS_BOUNDS, right = True)
+ return np.digitize(
+ df["household_income"] / df["consumption_units"],
+ INCOME_CLASS_BOUNDS,
+ right=True,
+ )
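# Illustration (not part of the patch): with right=True, np.digitize returns the index of the
# first bound that the income per consumption unit does not exceed, so with the EGT bounds an
# equivalised income of 1000 lands in class 1 and one of 750 in class 0:
import numpy as np
assert np.digitize(1000, [800, 1200, 1600, 2000], right=True) == 1
assert np.digitize(750, [800, 1200, 1600, 2000], right=True) == 0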
diff --git a/data/hts/egt/filtered.py b/data/hts/egt/filtered.py
index 29f06604..54701c26 100644
--- a/data/hts/egt/filtered.py
+++ b/data/hts/egt/filtered.py
@@ -6,49 +6,70 @@
Île-de-France.
"""
+
def configure(context):
context.stage("data.hts.egt.cleaned")
context.stage("data.spatial.codes")
- context.config("filter_hts",True)
+ context.config("filter_hts", True)
+
+
def execute(context):
- filter_egt = context.config("filter_hts")
+ filter_egt = context.config("filter_hts")
df_codes = context.stage("data.spatial.codes")
df_households, df_persons, df_trips = context.stage("data.hts.egt.cleaned")
- if filter_egt :
+ if filter_egt:
# Filter for non-residents
requested_departments = df_codes["departement_id"].unique()
- f = df_persons["departement_id"].astype(str).isin(requested_departments) # pandas bug!
+ f = (
+ df_persons["departement_id"].astype(str).isin(requested_departments)
+ ) # pandas bug!
df_persons = df_persons[f]
# Filter for people going outside of the area (because they have NaN distances)
remove_ids = set()
- remove_ids |= set(df_trips[
- ~df_trips["origin_departement_id"].astype(str).isin(requested_departments) | ~df_trips["destination_departement_id"].astype(str).isin(requested_departments)
- ]["person_id"].unique())
+ remove_ids |= set(
+ df_trips[
+ ~df_trips["origin_departement_id"]
+ .astype(str)
+ .isin(requested_departments)
+ | ~df_trips["destination_departement_id"]
+ .astype(str)
+ .isin(requested_departments)
+ ]["person_id"].unique()
+ )
- remove_ids |= set(df_persons[
- ~df_persons["departement_id"].isin(requested_departments)
- ])
+ remove_ids |= set(
+ df_persons[~df_persons["departement_id"].isin(requested_departments)]
+ )
df_persons = df_persons[~df_persons["person_id"].isin(remove_ids)]
# Only keep trips and households that still have a person
- df_trips = df_trips[df_trips["person_id"].isin(df_persons["person_id"].unique())]
- df_households = df_households[df_households["household_id"].isin(df_persons["household_id"])]
+ df_trips = df_trips[
+ df_trips["person_id"].isin(df_persons["person_id"].unique())
+ ]
+ df_households = df_households[
+ df_households["household_id"].isin(df_persons["household_id"])
+ ]
# Finish up
household_columns = hts.HOUSEHOLD_COLUMNS + ["income_class"] + ["egt_household_id"]
df_households = df_households[household_columns]
-
+
person_columns = hts.PERSON_COLUMNS + ["egt_household_id", "egt_person_id"]
- if "urban_type" in df_persons: person_columns.append("urban_type")
+ if "urban_type" in df_persons:
+ person_columns.append("urban_type")
df_persons = df_persons[person_columns]
-
- trip_columns = hts.TRIP_COLUMNS + ["euclidean_distance"] + ["egt_household_id", "egt_person_id", "egt_trip_id"]
+
+ trip_columns = (
+ hts.TRIP_COLUMNS
+ + ["euclidean_distance"]
+ + ["egt_household_id", "egt_person_id", "egt_trip_id"]
+ )
df_trips = df_trips[trip_columns]
hts.check(df_households, df_persons, df_trips)
diff --git a/data/hts/egt/raw.py b/data/hts/egt/raw.py
index 53b88a21..2ce75ed8 100644
--- a/data/hts/egt/raw.py
+++ b/data/hts/egt/raw.py
@@ -7,49 +7,97 @@
"""
MENAGES_COLUMNS = [
- "RESDEP", "NQUEST", "POIDSM", "NB_VELO", "NB_VD", "REVENU", "RESCOMM",
- "NB_2RM", "MNP"
+ "RESDEP",
+ "NQUEST",
+ "POIDSM",
+ "NB_VELO",
+ "NB_VD",
+ "REVENU",
+ "RESCOMM",
+ "NB_2RM",
+ "MNP",
]
PERSONNES_COLUMNS = [
- "RESDEP", "NP", "POIDSP", "NQUEST", "SEXE", "AGE", "PERMVP",
- "ABONTC", "OCCP", "PERM2RM", "NBDEPL", "CS8"
+ "RESDEP",
+ "NP",
+ "POIDSP",
+ "NQUEST",
+ "SEXE",
+ "AGE",
+ "PERMVP",
+ "ABONTC",
+ "OCCP",
+ "PERM2RM",
+ "NBDEPL",
+ "CS8",
]
DEPLACEMENTS_COLUMNS = [
- "NQUEST", "NP", "ND",
- "ORDEP", "DESTDEP", "ORH", "DESTH", "ORM", "DESTM", "ORCOMM", "DESTCOMM",
- "DPORTEE", "MODP_H7", "DESTMOT_H9", "ORMOT_H9"
+ "NQUEST",
+ "NP",
+ "ND",
+ "ORDEP",
+ "DESTDEP",
+ "ORH",
+ "DESTH",
+ "ORM",
+ "DESTM",
+ "ORCOMM",
+ "DESTCOMM",
+ "DPORTEE",
+ "MODP_H7",
+ "DESTMOT_H9",
+ "ORMOT_H9",
]
+
def configure(context):
context.config("data_path")
+
def execute(context):
df_menages = pd.read_csv(
"%s/egt_2010/Menages_semaine.csv" % context.config("data_path"),
- sep = ",", encoding = "latin1", usecols = MENAGES_COLUMNS
+ sep=",",
+ encoding="latin1",
+ usecols=MENAGES_COLUMNS,
)
df_personnes = pd.read_csv(
"%s/egt_2010/Personnes_semaine.csv" % context.config("data_path"),
- sep = ",", encoding = "latin1", usecols = PERSONNES_COLUMNS
+ sep=",",
+ encoding="latin1",
+ usecols=PERSONNES_COLUMNS,
)
df_deplacements = pd.read_csv(
"%s/egt_2010/Deplacements_semaine.csv" % context.config("data_path"),
- sep = ",", encoding = "latin1", usecols = DEPLACEMENTS_COLUMNS
+ sep=",",
+ encoding="latin1",
+ usecols=DEPLACEMENTS_COLUMNS,
)
return df_menages, df_personnes, df_deplacements
+
def validate(context):
- for name in ("Menages_semaine.csv", "Personnes_semaine.csv", "Deplacements_semaine.csv"):
+ for name in (
+ "Menages_semaine.csv",
+ "Personnes_semaine.csv",
+ "Deplacements_semaine.csv",
+ ):
if not os.path.exists("%s/egt_2010/%s" % (context.config("data_path"), name)):
raise RuntimeError("File missing from EGT: %s" % name)
return [
- os.path.getsize("%s/egt_2010/Menages_semaine.csv" % context.config("data_path")),
- os.path.getsize("%s/egt_2010/Personnes_semaine.csv" % context.config("data_path")),
- os.path.getsize("%s/egt_2010/Deplacements_semaine.csv" % context.config("data_path"))
+ os.path.getsize(
+ "%s/egt_2010/Menages_semaine.csv" % context.config("data_path")
+ ),
+ os.path.getsize(
+ "%s/egt_2010/Personnes_semaine.csv" % context.config("data_path")
+ ),
+ os.path.getsize(
+ "%s/egt_2010/Deplacements_semaine.csv" % context.config("data_path")
+ ),
]
diff --git a/data/hts/entd/cleaned.py b/data/hts/entd/cleaned.py
index 51bfd966..51618eee 100644
--- a/data/hts/entd/cleaned.py
+++ b/data/hts/entd/cleaned.py
@@ -7,10 +7,27 @@
This stage cleans the national HTS.
"""
+
def configure(context):
context.stage("data.hts.entd.raw")
-INCOME_CLASS_BOUNDS = [400, 600, 800, 1000, 1200, 1500, 1800, 2000, 2500, 3000, 4000, 6000, 10000, 1e6]
+
+INCOME_CLASS_BOUNDS = [
+ 400,
+ 600,
+ 800,
+ 1000,
+ 1200,
+ 1500,
+ 1800,
+ 2000,
+ 2500,
+ 3000,
+ 4000,
+ 6000,
+ 10000,
+ 1e6,
+]
PURPOSE_MAP = [
("1", "home"),
@@ -22,38 +39,47 @@ def configure(context):
("6", "other"),
("7", "leisure"),
("8", "leisure"),
- ("9", "work")
+ ("9", "work"),
]
MODES_MAP = [
("1", "walk"),
- ("2", "car"), #
- ("2.20", "bike"), # bike
- ("2.23", "car_passenger"), # motorcycle passenger
- ("2.25", "car_passenger"), # same
+ ("2", "car"), #
+ ("2.20", "bike"), # bike
+ ("2.23", "car_passenger"), # motorcycle passenger
+ ("2.25", "car_passenger"), # same
("3", "car"),
("3.32", "car_passenger"),
- ("4", "pt"), # taxi
+ ("4", "pt"), # taxi
("5", "pt"),
("6", "pt"),
- ("7", "pt"), # Plane
- ("8", "pt"), # Boat
-# ("9", "pt") # Other
+ ("7", "pt"), # Plane
+ ("8", "pt"), # Boat
+ # ("9", "pt") # Other
]
+
def convert_time(x):
- return np.dot(np.array(x.split(":"), dtype = float), [3600.0, 60.0, 1.0])
+ return np.dot(np.array(x.split(":"), dtype=float), [3600.0, 60.0, 1.0])
+
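# Illustration (not part of the patch): convert_time turns an "HH:MM:SS" string into seconds
# after midnight through a dot product with [3600, 60, 1]:
import numpy as np
assert np.dot(np.array("14:35:00".split(":"), dtype=float), [3600.0, 60.0, 1.0]) == 52500.0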
def execute(context):
- df_individu, df_tcm_individu, df_menage, df_tcm_menage, df_deploc = context.stage("data.hts.entd.raw")
+ df_individu, df_tcm_individu, df_menage, df_tcm_menage, df_deploc = context.stage(
+ "data.hts.entd.raw"
+ )
# Make copies
- df_persons = pd.DataFrame(df_tcm_individu, copy = True)
- df_households = pd.DataFrame(df_tcm_menage, copy = True)
- df_trips = pd.DataFrame(df_deploc, copy = True)
+ df_persons = pd.DataFrame(df_tcm_individu, copy=True)
+ df_households = pd.DataFrame(df_tcm_menage, copy=True)
+ df_trips = pd.DataFrame(df_deploc, copy=True)
# Get weights for persons that actually have trips
- df_persons = pd.merge(df_persons, df_trips[["IDENT_IND", "PONDKI"]].drop_duplicates("IDENT_IND"), on = "IDENT_IND", how = "left")
+ df_persons = pd.merge(
+ df_persons,
+ df_trips[["IDENT_IND", "PONDKI"]].drop_duplicates("IDENT_IND"),
+ on="IDENT_IND",
+ how="left",
+ )
df_persons["is_kish"] = ~df_persons["PONDKI"].isna()
df_persons["trip_weight"] = df_persons["PONDKI"].fillna(0.0)
@@ -64,13 +90,21 @@ def execute(context):
print("Filtering out %d non-reference day trips" % np.count_nonzero(~f))
# Merge in additional information from ENTD
- df_households = pd.merge(df_households, df_menage[[
- "idENT_MEN", "V1_JNBVEH", "V1_JNBMOTO", "V1_JNBCYCLO", "V1_JNBVELOADT"
- ]], on = "idENT_MEN", how = "left")
+ df_households = pd.merge(
+ df_households,
+ df_menage[
+ ["idENT_MEN", "V1_JNBVEH", "V1_JNBMOTO", "V1_JNBCYCLO", "V1_JNBVELOADT"]
+ ],
+ on="idENT_MEN",
+ how="left",
+ )
- df_persons = pd.merge(df_persons, df_individu[[
- "IDENT_IND", "V1_GPERMIS", "V1_GPERMIS2R", "V1_ICARTABON"
- ]], on = "IDENT_IND", how = "left")
+ df_persons = pd.merge(
+ df_persons,
+ df_individu[["IDENT_IND", "V1_GPERMIS", "V1_GPERMIS2R", "V1_ICARTABON"]],
+ on="IDENT_IND",
+ how="left",
+ )
    # Transform original IDs to integer (they are hierarchical)
df_persons["entd_person_id"] = df_persons["IDENT_IND"].astype(int)
@@ -82,14 +116,16 @@ def execute(context):
df_households["household_id"] = np.arange(len(df_households))
df_persons = pd.merge(
- df_persons, df_households[["entd_household_id", "household_id"]],
- on = "entd_household_id"
+ df_persons,
+ df_households[["entd_household_id", "household_id"]],
+ on="entd_household_id",
)
df_persons["person_id"] = np.arange(len(df_persons))
df_trips = pd.merge(
- df_trips, df_persons[["entd_person_id", "person_id", "household_id"]],
- on = ["entd_person_id"]
+ df_trips,
+ df_persons[["entd_person_id", "person_id", "household_id"]],
+ on=["entd_person_id"],
)
df_trips["trip_id"] = np.arange(len(df_trips))
@@ -109,19 +145,24 @@ def execute(context):
df_households["household_size"] = df_households["NPERS"]
# Clean departement
- df_households["departement_id"] = df_households["DEP"].fillna("undefined").astype("category")
- df_persons["departement_id"] = df_persons["DEP"].fillna("undefined").astype("category")
+ df_households["departement_id"] = (
+ df_households["DEP"].fillna("undefined").astype("category")
+ )
+ df_persons["departement_id"] = (
+ df_persons["DEP"].fillna("undefined").astype("category")
+ )
- df_trips["origin_departement_id"] = df_trips["V2_MORIDEP"].fillna("undefined").astype("category")
- df_trips["destination_departement_id"] = df_trips["V2_MDESDEP"].fillna("undefined").astype("category")
+ df_trips["origin_departement_id"] = (
+ df_trips["V2_MORIDEP"].fillna("undefined").astype("category")
+ )
+ df_trips["destination_departement_id"] = (
+ df_trips["V2_MDESDEP"].fillna("undefined").astype("category")
+ )
# Clean urban type
- df_households["urban_type"] = df_households["numcom_UU2010"].replace({
- "B": "suburb",
- "C": "central_city",
- "I": "isolated_city",
- "R": "none"
- })
+ df_households["urban_type"] = df_households["numcom_UU2010"].replace(
+ {"B": "suburb", "C": "central_city", "I": "isolated_city", "R": "none"}
+ )
assert np.all(~df_households["urban_type"].isna())
df_households["urban_type"] = df_households["urban_type"].astype("category")
@@ -139,32 +180,67 @@ def execute(context):
df_households["number_of_vehicles"] += df_households["V1_JNBVEH"].fillna(0)
df_households["number_of_vehicles"] += df_households["V1_JNBMOTO"].fillna(0)
df_households["number_of_vehicles"] += df_households["V1_JNBCYCLO"].fillna(0)
- df_households["number_of_vehicles"] = df_households["number_of_vehicles"].astype(int)
+ df_households["number_of_vehicles"] = df_households["number_of_vehicles"].astype(
+ int
+ )
- df_households["number_of_bikes"] = df_households["V1_JNBVELOADT"].fillna(0).astype(int)
+ df_households["number_of_bikes"] = (
+ df_households["V1_JNBVELOADT"].fillna(0).astype(int)
+ )
# License
- df_persons["has_license"] = (df_persons["V1_GPERMIS"] == 1) | (df_persons["V1_GPERMIS2R"] == 1)
+ df_persons["has_license"] = (df_persons["V1_GPERMIS"] == 1) | (
+ df_persons["V1_GPERMIS2R"] == 1
+ )
# Has subscription
df_persons["has_pt_subscription"] = df_persons["V1_ICARTABON"] == 1
# Household income
df_households["income_class"] = -1
- df_households.loc[df_households["TrancheRevenuMensuel"].str.startswith("Moins de 400"), "income_class"] = 0
- df_households.loc[df_households["TrancheRevenuMensuel"].str.startswith("De 400"), "income_class"] = 1
- df_households.loc[df_households["TrancheRevenuMensuel"].str.startswith("De 600"), "income_class"] = 2
- df_households.loc[df_households["TrancheRevenuMensuel"].str.startswith("De 800"), "income_class"] = 3
- df_households.loc[df_households["TrancheRevenuMensuel"].str.startswith("De 1 000"), "income_class"] = 4
- df_households.loc[df_households["TrancheRevenuMensuel"].str.startswith("De 1 200"), "income_class"] = 5
- df_households.loc[df_households["TrancheRevenuMensuel"].str.startswith("De 1 500"), "income_class"] = 6
- df_households.loc[df_households["TrancheRevenuMensuel"].str.startswith("De 1 800"), "income_class"] = 7
- df_households.loc[df_households["TrancheRevenuMensuel"].str.startswith("De 2 000"), "income_class"] = 8
- df_households.loc[df_households["TrancheRevenuMensuel"].str.startswith("De 2 500"), "income_class"] = 9
- df_households.loc[df_households["TrancheRevenuMensuel"].str.startswith("De 3 000"), "income_class"] = 10
- df_households.loc[df_households["TrancheRevenuMensuel"].str.startswith("De 4 000"), "income_class"] = 11
- df_households.loc[df_households["TrancheRevenuMensuel"].str.startswith("De 6 000"), "income_class"] = 12
- df_households.loc[df_households["TrancheRevenuMensuel"].str.startswith("10 000"), "income_class"] = 13
+ df_households.loc[
+ df_households["TrancheRevenuMensuel"].str.startswith("Moins de 400"),
+ "income_class",
+ ] = 0
+ df_households.loc[
+ df_households["TrancheRevenuMensuel"].str.startswith("De 400"), "income_class"
+ ] = 1
+ df_households.loc[
+ df_households["TrancheRevenuMensuel"].str.startswith("De 600"), "income_class"
+ ] = 2
+ df_households.loc[
+ df_households["TrancheRevenuMensuel"].str.startswith("De 800"), "income_class"
+ ] = 3
+ df_households.loc[
+ df_households["TrancheRevenuMensuel"].str.startswith("De 1 000"), "income_class"
+ ] = 4
+ df_households.loc[
+ df_households["TrancheRevenuMensuel"].str.startswith("De 1 200"), "income_class"
+ ] = 5
+ df_households.loc[
+ df_households["TrancheRevenuMensuel"].str.startswith("De 1 500"), "income_class"
+ ] = 6
+ df_households.loc[
+ df_households["TrancheRevenuMensuel"].str.startswith("De 1 800"), "income_class"
+ ] = 7
+ df_households.loc[
+ df_households["TrancheRevenuMensuel"].str.startswith("De 2 000"), "income_class"
+ ] = 8
+ df_households.loc[
+ df_households["TrancheRevenuMensuel"].str.startswith("De 2 500"), "income_class"
+ ] = 9
+ df_households.loc[
+ df_households["TrancheRevenuMensuel"].str.startswith("De 3 000"), "income_class"
+ ] = 10
+ df_households.loc[
+ df_households["TrancheRevenuMensuel"].str.startswith("De 4 000"), "income_class"
+ ] = 11
+ df_households.loc[
+ df_households["TrancheRevenuMensuel"].str.startswith("De 6 000"), "income_class"
+ ] = 12
+ df_households.loc[
+ df_households["TrancheRevenuMensuel"].str.startswith("10 000"), "income_class"
+ ] = 13
df_households["income_class"] = df_households["income_class"].astype(int)
# Trip purpose
@@ -173,11 +249,13 @@ def execute(context):
for prefix, activity_type in PURPOSE_MAP:
df_trips.loc[
- df_trips["V2_MMOTIFDES"].astype(str).str.startswith(prefix), "following_purpose"
+ df_trips["V2_MMOTIFDES"].astype(str).str.startswith(prefix),
+ "following_purpose",
] = activity_type
df_trips.loc[
- df_trips["V2_MMOTIFORI"].astype(str).str.startswith(prefix), "preceding_purpose"
+ df_trips["V2_MMOTIFORI"].astype(str).str.startswith(prefix),
+ "preceding_purpose",
] = activity_type
df_trips["following_purpose"] = df_trips["following_purpose"].astype("category")
@@ -187,15 +265,17 @@ def execute(context):
df_trips["mode"] = "pt"
for prefix, mode in MODES_MAP:
- df_trips.loc[
- df_trips["V2_MTP"].astype(str).str.startswith(prefix), "mode"
- ] = mode
+ df_trips.loc[df_trips["V2_MTP"].astype(str).str.startswith(prefix), "mode"] = (
+ mode
+ )
df_trips["mode"] = df_trips["mode"].astype("category")
# Further trip attributes
df_trips["routed_distance"] = df_trips["V2_MDISTTOT"] * 1000.0
- df_trips["routed_distance"] = df_trips["routed_distance"].fillna(0.0) # This should be just one within Île-de-France
+ df_trips["routed_distance"] = df_trips["routed_distance"].fillna(
+ 0.0
+ ) # This should be just one within Île-de-France
# Only leave weekday trips
f = df_trips["V2_TYPJOUR"] == 1
@@ -205,10 +285,14 @@ def execute(context):
# Only leave one day per person
initial_count = len(df_trips)
- df_first_day = df_trips[["person_id", "IDENT_JOUR"]].sort_values(
- by = ["person_id", "IDENT_JOUR"]
- ).drop_duplicates("person_id")
- df_trips = pd.merge(df_trips, df_first_day, how = "inner", on = ["person_id", "IDENT_JOUR"])
+ df_first_day = (
+ df_trips[["person_id", "IDENT_JOUR"]]
+ .sort_values(by=["person_id", "IDENT_JOUR"])
+ .drop_duplicates("person_id")
+ )
+ df_trips = pd.merge(
+ df_trips, df_first_day, how="inner", on=["person_id", "IDENT_JOUR"]
+ )
final_count = len(df_trips)
print("Removed %d trips for non-primary days" % (initial_count - final_count))
@@ -217,7 +301,9 @@ def execute(context):
df_trips = hts.compute_first_last(df_trips)
# Trip times
- df_trips["departure_time"] = df_trips["V2_MORIHDEP"].apply(convert_time).astype(float)
+ df_trips["departure_time"] = (
+ df_trips["V2_MORIHDEP"].apply(convert_time).astype(float)
+ )
df_trips["arrival_time"] = df_trips["V2_MDESHARR"].apply(convert_time).astype(float)
df_trips = hts.fix_trip_times(df_trips)
@@ -230,11 +316,17 @@ def execute(context):
# Chain length
df_persons = pd.merge(
- df_persons, df_trips[["person_id", "NDEP"]].drop_duplicates("person_id").rename(columns = { "NDEP": "number_of_trips" }),
- on = "person_id", how = "left"
+ df_persons,
+ df_trips[["person_id", "NDEP"]]
+ .drop_duplicates("person_id")
+ .rename(columns={"NDEP": "number_of_trips"}),
+ on="person_id",
+ how="left",
)
df_persons["number_of_trips"] = df_persons["number_of_trips"].fillna(-1).astype(int)
- df_persons.loc[(df_persons["number_of_trips"] == -1) & df_persons["is_kish"], "number_of_trips"] = 0
+ df_persons.loc[
+ (df_persons["number_of_trips"] == -1) & df_persons["is_kish"], "number_of_trips"
+ ] = 0
# Passenger attribute
df_persons["is_passenger"] = df_persons["person_id"].isin(
@@ -243,18 +335,23 @@ def execute(context):
# Calculate consumption units
hts.check_household_size(df_households, df_persons)
- df_households = pd.merge(df_households, hts.calculate_consumption_units(df_persons), on = "household_id")
+ df_households = pd.merge(
+ df_households, hts.calculate_consumption_units(df_persons), on="household_id"
+ )
# Socioprofessional class
- df_persons["socioprofessional_class"] = df_persons["CS24"].fillna(80).astype(int) // 10
+ df_persons["socioprofessional_class"] = (
+ df_persons["CS24"].fillna(80).astype(int) // 10
+ )
    # Fix activity types (because of one inconsistent ENTD record)
hts.fix_activity_types(df_trips)
return df_households, df_persons, df_trips
+
def calculate_income_class(df):
assert "household_income" in df
assert "consumption_units" in df
- return np.digitize(df["household_income"], INCOME_CLASS_BOUNDS, right = True)
+ return np.digitize(df["household_income"], INCOME_CLASS_BOUNDS, right=True)
diff --git a/data/hts/entd/filtered.py b/data/hts/entd/filtered.py
index e9bb2ca1..71d36485 100644
--- a/data/hts/entd/filtered.py
+++ b/data/hts/entd/filtered.py
@@ -6,17 +6,20 @@
Île-de-France.
"""
+
def configure(context):
context.stage("data.hts.entd.cleaned")
context.stage("data.spatial.codes")
- context.config("filter_hts",True)
+ context.config("filter_hts", True)
+
+
def execute(context):
- filter_entd = context.config("filter_hts")
+ filter_entd = context.config("filter_hts")
df_codes = context.stage("data.spatial.codes")
df_households, df_persons, df_trips = context.stage("data.hts.entd.cleaned")
- if filter_entd :
+ if filter_entd:
# Filter for non-residents
requested_departments = df_codes["departement_id"].unique()
f = df_persons["departement_id"].astype(str).isin(requested_departments)
@@ -25,18 +28,31 @@ def execute(context):
# Filter for people going outside of the area (because they have NaN distances)
remove_ids = set()
- remove_ids |= set(df_trips[
- ~df_trips["origin_departement_id"].astype(str).isin(requested_departments) | ~df_trips["destination_departement_id"].astype(str).isin(requested_departments)
- ]["person_id"].unique())
+ remove_ids |= set(
+ df_trips[
+ ~df_trips["origin_departement_id"]
+ .astype(str)
+ .isin(requested_departments)
+ | ~df_trips["destination_departement_id"]
+ .astype(str)
+ .isin(requested_departments)
+ ]["person_id"].unique()
+ )
df_persons = df_persons[~df_persons["person_id"].isin(remove_ids)]
# Only keep trips and households that still have a person
- df_trips = df_trips[df_trips["person_id"].isin(df_persons["person_id"].unique())]
- df_households = df_households[df_households["household_id"].isin(df_persons["household_id"])]
+ df_trips = df_trips[
+ df_trips["person_id"].isin(df_persons["person_id"].unique())
+ ]
+ df_households = df_households[
+ df_households["household_id"].isin(df_persons["household_id"])
+ ]
# Finish up
- df_households = df_households[hts.HOUSEHOLD_COLUMNS + ["urban_type", "income_class"]]
+ df_households = df_households[
+ hts.HOUSEHOLD_COLUMNS + ["urban_type", "income_class"]
+ ]
df_persons = df_persons[hts.PERSON_COLUMNS]
df_trips = df_trips[hts.TRIP_COLUMNS + ["routed_distance"]]
diff --git a/data/hts/entd/raw.py b/data/hts/entd/raw.py
index f4bdd91a..16b1ab85 100644
--- a/data/hts/entd/raw.py
+++ b/data/hts/entd/raw.py
@@ -7,80 +7,132 @@
"""
Q_MENAGE_COLUMNS = [
- "DEP", "idENT_MEN", "PONDV1", "RG",
+ "DEP",
+ "idENT_MEN",
+ "PONDV1",
+ "RG",
"V1_JNBVELOADT",
- "V1_JNBVEH", "V1_JNBMOTO", "V1_JNBCYCLO"
+ "V1_JNBVEH",
+ "V1_JNBMOTO",
+ "V1_JNBCYCLO",
]
Q_TCM_MENAGE_COLUMNS = [
- "NPERS", "PONDV1", "TrancheRevenuMensuel",
- "DEP", "idENT_MEN", "RG", "numcom_UU2010"
+ "NPERS",
+ "PONDV1",
+ "TrancheRevenuMensuel",
+ "DEP",
+ "idENT_MEN",
+ "RG",
+ "numcom_UU2010",
]
Q_INDIVIDU_COLUMNS = [
- "IDENT_IND", "idENT_MEN",
- "RG", "V1_GPERMIS", "V1_ICARTABON",
- "V1_GPERMIS2R"
+ "IDENT_IND",
+ "idENT_MEN",
+ "RG",
+ "V1_GPERMIS",
+ "V1_ICARTABON",
+ "V1_GPERMIS2R",
]
Q_TCM_INDIVIDU_COLUMNS = [
- "AGE", "ETUDES", "IDENT_IND", "IDENT_MEN",
- "PONDV1", "CS24", "SEXE", "DEP", "SITUA",
+ "AGE",
+ "ETUDES",
+ "IDENT_IND",
+ "IDENT_MEN",
+ "PONDV1",
+ "CS24",
+ "SEXE",
+ "DEP",
+ "SITUA",
]
K_DEPLOC_COLUMNS = [
- "IDENT_IND", "V2_MMOTIFDES", "V2_MMOTIFORI",
- "V2_TYPJOUR", "V2_MORIHDEP", "V2_MDESHARR", "V2_MDISTTOT",
- "IDENT_JOUR", "V2_MTP",
- "V2_MDESDEP", "V2_MORIDEP", "NDEP", "V2_MOBILREF",
- "PONDKI"
+ "IDENT_IND",
+ "V2_MMOTIFDES",
+ "V2_MMOTIFORI",
+ "V2_TYPJOUR",
+ "V2_MORIHDEP",
+ "V2_MDESHARR",
+ "V2_MDISTTOT",
+ "IDENT_JOUR",
+ "V2_MTP",
+ "V2_MDESDEP",
+ "V2_MORIDEP",
+ "NDEP",
+ "V2_MOBILREF",
+ "PONDKI",
]
+
def configure(context):
context.config("data_path")
+
def execute(context):
df_individu = pd.read_csv(
"%s/entd_2008/Q_individu.csv" % context.config("data_path"),
- sep = ";", encoding = "latin1", usecols = Q_INDIVIDU_COLUMNS,
- dtype = { "DEP": str }
+ sep=";",
+ encoding="latin1",
+ usecols=Q_INDIVIDU_COLUMNS,
+ dtype={"DEP": str},
)
df_tcm_individu = pd.read_csv(
"%s/entd_2008/Q_tcm_individu.csv" % context.config("data_path"),
- sep = ";", encoding = "latin1", usecols = Q_TCM_INDIVIDU_COLUMNS,
- dtype = { "DEP": str }
+ sep=";",
+ encoding="latin1",
+ usecols=Q_TCM_INDIVIDU_COLUMNS,
+ dtype={"DEP": str},
)
df_menage = pd.read_csv(
"%s/entd_2008/Q_menage.csv" % context.config("data_path"),
- sep = ";", encoding = "latin1", usecols = Q_MENAGE_COLUMNS,
- dtype = { "DEP": str }
+ sep=";",
+ encoding="latin1",
+ usecols=Q_MENAGE_COLUMNS,
+ dtype={"DEP": str},
)
df_tcm_menage = pd.read_csv(
"%s/entd_2008/Q_tcm_menage_0.csv" % context.config("data_path"),
- sep = ";", encoding = "latin1", usecols = Q_TCM_MENAGE_COLUMNS,
- dtype = { "DEP": str }
+ sep=";",
+ encoding="latin1",
+ usecols=Q_TCM_MENAGE_COLUMNS,
+ dtype={"DEP": str},
)
df_deploc = pd.read_csv(
"%s/entd_2008/K_deploc.csv" % context.config("data_path"),
- sep = ";", encoding = "latin1", usecols = K_DEPLOC_COLUMNS,
- dtype = { "DEP": str, "V2_MTP": str }
+ sep=";",
+ encoding="latin1",
+ usecols=K_DEPLOC_COLUMNS,
+ dtype={"DEP": str, "V2_MTP": str},
)
return df_individu, df_tcm_individu, df_menage, df_tcm_menage, df_deploc
+
def validate(context):
- for name in ("Q_individu.csv", "Q_tcm_individu.csv", "Q_menage.csv", "Q_tcm_menage_0.csv", "K_deploc.csv"):
+ for name in (
+ "Q_individu.csv",
+ "Q_tcm_individu.csv",
+ "Q_menage.csv",
+ "Q_tcm_menage_0.csv",
+ "K_deploc.csv",
+ ):
if not os.path.exists("%s/entd_2008/%s" % (context.config("data_path"), name)):
raise RuntimeError("File missing from ENTD: %s" % name)
return [
os.path.getsize("%s/entd_2008/Q_individu.csv" % context.config("data_path")),
- os.path.getsize("%s/entd_2008/Q_tcm_individu.csv" % context.config("data_path")),
+ os.path.getsize(
+ "%s/entd_2008/Q_tcm_individu.csv" % context.config("data_path")
+ ),
os.path.getsize("%s/entd_2008/Q_menage.csv" % context.config("data_path")),
- os.path.getsize("%s/entd_2008/Q_tcm_menage_0.csv" % context.config("data_path")),
- os.path.getsize("%s/entd_2008/K_deploc.csv" % context.config("data_path"))
+ os.path.getsize(
+ "%s/entd_2008/Q_tcm_menage_0.csv" % context.config("data_path")
+ ),
+ os.path.getsize("%s/entd_2008/K_deploc.csv" % context.config("data_path")),
]
diff --git a/data/hts/entd/reweighted.py b/data/hts/entd/reweighted.py
index 517a3ca9..2367e68d 100644
--- a/data/hts/entd/reweighted.py
+++ b/data/hts/entd/reweighted.py
@@ -1,8 +1,10 @@
import numpy as np
+
def configure(context):
context.stage("data.hts.entd.filtered")
+
def execute(context):
df_households, df_persons, df_trips = context.stage("data.hts.entd.filtered")
diff --git a/data/hts/hts.py b/data/hts/hts.py
index 86bc0365..59ce73e7 100644
--- a/data/hts/hts.py
+++ b/data/hts/hts.py
@@ -1,6 +1,7 @@
import pandas as pd
import numpy as np
+
def swap_departure_arrival_times(df, f):
assert "arrival_time" in df
assert "departure_time" in df
@@ -11,6 +12,7 @@ def swap_departure_arrival_times(df, f):
df.loc[f, "departure_time"] = arrival_times
df.loc[f, "arrival_time"] = departure_times
+
def fix_trip_times(df_trips):
"""
- Negative duration:
@@ -22,7 +24,16 @@ def fix_trip_times(df_trips):
     - Intersecting trips
"""
- columns = ["trip_id", "person_id", "departure_time", "arrival_time", "preceding_purpose", "following_purpose", "is_first_trip", "is_last_trip"]
+ columns = [
+ "trip_id",
+ "person_id",
+ "departure_time",
+ "arrival_time",
+ "preceding_purpose",
+ "following_purpose",
+ "is_first_trip",
+ "is_last_trip",
+ ]
df_main = df_trips
df_next = df_main.shift(-1)
df_previous = df_main.shift(1)
@@ -33,9 +44,16 @@ def fix_trip_times(df_trips):
# 1.1) Departure and arrival time may have been swapped, and chain is consistent
f_swap = np.copy(f_negative)
- f_swap &= (df_main["arrival_time"] > df_previous["arrival_time"]) | df_main["is_first_trip"]
- f_swap &= (df_main["departure_time"] < df_next["departure_time"]) | df_main["is_last_trip"]
- print(" of which %d can swap departure and arrival time without conflicts with previous or following trip" % np.count_nonzero(f_swap))
+ f_swap &= (df_main["arrival_time"] > df_previous["arrival_time"]) | df_main[
+ "is_first_trip"
+ ]
+ f_swap &= (df_main["departure_time"] < df_next["departure_time"]) | df_main[
+ "is_last_trip"
+ ]
+ print(
+ " of which %d can swap departure and arrival time without conflicts with previous or following trip"
+ % np.count_nonzero(f_swap)
+ )
swap_departure_arrival_times(df_main, f_swap)
f_negative[f_swap] = False
@@ -44,13 +62,19 @@ def fix_trip_times(df_trips):
# However, the offset duration is unlikely to be a trip over midnight
offset = df_main["departure_time"] - df_main["arrival_time"]
f_swap = (offset > 0) & (offset < 10 * 3600)
- print(" of which %d are unlikely to cover midnight, so we swap arrival and departure time although there are conflicts" % np.count_nonzero(f_swap))
+ print(
+ " of which %d are unlikely to cover midnight, so we swap arrival and departure time although there are conflicts"
+ % np.count_nonzero(f_swap)
+ )
swap_departure_arrival_times(df_main, f_swap)
f_negative[f_swap] = False
# 1.3) Covering midnight -> Shift arrival time
- print(" of which %d seem to cover midnight, so we shift arrival time by 24h" % np.count_nonzero(f_negative))
+ print(
+ " of which %d seem to cover midnight, so we shift arrival time by 24h"
+ % np.count_nonzero(f_negative)
+ )
df_main.loc[f_negative, "arrival_time"] += 24 * 3600.0
# 2) Current trip is after following trip
@@ -83,10 +107,16 @@ def fix_trip_times(df_trips):
# Intersecting trips
f = ~df_main["is_last_trip"]
f &= df_main["arrival_time"] > df_next["departure_time"]
- print("Found %d occurences where current trip ends after next trip starts" % np.count_nonzero(f))
+ print(
+ "Found %d occurences where current trip ends after next trip starts"
+ % np.count_nonzero(f)
+ )
f &= df_main["departure_time"] <= df_next["departure_time"]
- print(" of which we're able to shorten %d to make it consistent" % np.count_nonzero(f))
+ print(
+ " of which we're able to shorten %d to make it consistent"
+ % np.count_nonzero(f)
+ )
df_main.loc[f, "arrival_time"] = df_next["departure_time"]
# Included trips (moving the first one to the start of the following trip and setting duration to zero)
@@ -95,10 +125,14 @@ def fix_trip_times(df_trips):
f &= df_main["arrival_time"] <= df_next["arrival_time"]
df_main.loc[f, "departure_time"] = df_next["departure_time"]
df_main.loc[f, "arrival_time"] = df_next["departure_time"]
- print("Found %d occurences where current trip is included in next trip" % np.count_nonzero(f))
+ print(
+ "Found %d occurences where current trip is included in next trip"
+ % np.count_nonzero(f)
+ )
return df_main
+
def check_trip_times(df_trips):
print("Validating trip times...")
any_errors = False
@@ -168,31 +202,43 @@ def check_trip_times(df_trips):
print(" => All trip times are consistent!")
return True
+
def fix_activity_types(df_trips):
- f = (df_trips["preceding_purpose"] != df_trips["following_purpose"].shift(1)) & ~df_trips["is_first_trip"]
- df_trips.loc[f, "preceding_purpose"] = df_trips.shift(1)["following_purpose"][f].values
+ f = (
+ df_trips["preceding_purpose"] != df_trips["following_purpose"].shift(1)
+ ) & ~df_trips["is_first_trip"]
+ df_trips.loc[f, "preceding_purpose"] = df_trips.shift(1)["following_purpose"][
+ f
+ ].values
print("Fixing %d inconsistent activity types" % np.count_nonzero(f))
check_activity_types(df_trips)
+
def check_activity_types(df_trips):
- f = (df_trips["following_purpose"] != df_trips["preceding_purpose"].shift(-1)) & ~df_trips["is_last_trip"]
- f |= (df_trips["following_purpose"].shift(1) != df_trips["preceding_purpose"]) & ~df_trips["is_first_trip"]
+ f = (
+ df_trips["following_purpose"] != df_trips["preceding_purpose"].shift(-1)
+ ) & ~df_trips["is_last_trip"]
+ f |= (
+ df_trips["following_purpose"].shift(1) != df_trips["preceding_purpose"]
+ ) & ~df_trips["is_first_trip"]
error_count = np.count_nonzero(f)
print("Trips with inconsistent activity types: %d" % error_count)
return error_count == 0
+
def compute_first_last(df_trips):
assert "person_id" in df_trips
- df_trips = df_trips.sort_values(by = ["person_id", "trip_id"])
+ df_trips = df_trips.sort_values(by=["person_id", "trip_id"])
df_trips["is_first_trip"] = df_trips["person_id"].ne(df_trips["person_id"].shift(1))
df_trips["is_last_trip"] = df_trips["person_id"].ne(df_trips["person_id"].shift(-1))
return df_trips
+
def compute_activity_duration(df_trips):
assert "departure_time" in df_trips
assert "arrival_time" in df_trips
@@ -201,13 +247,17 @@ def compute_activity_duration(df_trips):
df_trips["activity_duration"] = df_next["departure_time"] - df_trips["arrival_time"]
df_trips.loc[df_trips["is_last_trip"], "activity_duration"] = np.nan
+
def check_household_size(df_households, df_persons):
- df_size = df_persons.groupby("household_id").size().reset_index(name = "count")
- df_size = pd.merge(df_households[["household_id", "household_size"]], df_size, on = "household_id")
+ df_size = df_persons.groupby("household_id").size().reset_index(name="count")
+ df_size = pd.merge(
+ df_households[["household_id", "household_size"]], df_size, on="household_id"
+ )
assert len(df_size) == len(df_households)
assert (df_size["household_size"] == df_size["count"]).all()
+
def calculate_consumption_units(df_persons):
df_units = df_persons[["household_id", "age"]].copy()
df_units["under_14"] = df_units["age"] < 14
@@ -220,28 +270,52 @@ def calculate_consumption_units(df_persons):
return df_units[["household_id", "consumption_units"]]
+
HOUSEHOLD_COLUMNS = [
- "household_id", "household_weight", "household_size",
- "number_of_vehicles", "number_of_bikes", "departement_id",
- "consumption_units", # "income_class"
+ "household_id",
+ "household_weight",
+ "household_size",
+ "number_of_vehicles",
+ "number_of_bikes",
+ "departement_id",
+ "consumption_units", # "income_class"
]
PERSON_COLUMNS = [
- "person_id", "household_id", "person_weight",
- "age", "sex", "employed", "studies",
- "has_license", "has_pt_subscription",
- "number_of_trips", "departement_id", "trip_weight",
- "is_passenger", "socioprofessional_class"
+ "person_id",
+ "household_id",
+ "person_weight",
+ "age",
+ "sex",
+ "employed",
+ "studies",
+ "has_license",
+ "has_pt_subscription",
+ "number_of_trips",
+ "departement_id",
+ "trip_weight",
+ "is_passenger",
+ "socioprofessional_class",
]
TRIP_COLUMNS = [
- "person_id", "trip_id", "trip_weight",
- "departure_time", "arrival_time",
- "trip_duration", "activity_duration",
- "following_purpose", "preceding_purpose", "is_last_trip", "is_first_trip",
- "mode", "origin_departement_id", "destination_departement_id"
+ "person_id",
+ "trip_id",
+ "trip_weight",
+ "departure_time",
+ "arrival_time",
+ "trip_duration",
+ "activity_duration",
+ "following_purpose",
+ "preceding_purpose",
+ "is_last_trip",
+ "is_first_trip",
+ "mode",
+ "origin_departement_id",
+ "destination_departement_id",
]
+
def check(df_households, df_persons, df_trips):
assert check_trip_times(df_trips)
assert check_activity_types(df_trips)
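# --- Illustrative sketch (not part of the patch) ---------------------------------
# The two repair rules of fix_trip_times above on a toy trip table: a short negative
# duration is treated as swapped departure/arrival times, a long one as a trip over
# midnight whose arrival time is shifted by 24h. All values are invented.
import pandas as pd

df_toy = pd.DataFrame({
    "person_id": [1, 2],
    "departure_time": [9.0 * 3600, 23.5 * 3600],
    "arrival_time": [8.0 * 3600, 0.5 * 3600],
})

f_negative = df_toy["arrival_time"] < df_toy["departure_time"]

# Swap only where the implied duration is too short to plausibly cover midnight
offset = df_toy["departure_time"] - df_toy["arrival_time"]
f_swap = f_negative & (offset < 10 * 3600)
departure_times = df_toy.loc[f_swap, "departure_time"].values
df_toy.loc[f_swap, "departure_time"] = df_toy.loc[f_swap, "arrival_time"].values
df_toy.loc[f_swap, "arrival_time"] = departure_times

# Shift the remaining negative durations by 24h (trips over midnight)
f_midnight = f_negative & ~f_swap
df_toy.loc[f_midnight, "arrival_time"] += 24 * 3600.0
# Row 0 is now 8:00 -> 9:00, row 1 is 23:30 -> 24:30 of the same survey day.
# ----------------------------------------------------------------------------------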
diff --git a/data/hts/output.py b/data/hts/output.py
index cee14cad..1ee0eca3 100644
--- a/data/hts/output.py
+++ b/data/hts/output.py
@@ -9,23 +9,34 @@
pipeline.
"""
+
def configure(context):
context.stage("data.hts.selected")
context.config("output_path")
context.config("output_prefix", "ile_de_france_")
+
def execute(context):
df_households, df_persons, df_trips = context.stage("data.hts.selected")
- df_households.to_csv("%s/%shts_households.csv" % (
- context.config("output_path"), context.config("output_prefix")
- ), sep = ";", index = False)
+ df_households.to_csv(
+ "%s/%shts_households.csv"
+ % (context.config("output_path"), context.config("output_prefix")),
+ sep=";",
+ index=False,
+ )
- df_persons.to_csv("%s/%shts_persons.csv" % (
- context.config("output_path"), context.config("output_prefix")
- ), sep = ";", index = False)
+ df_persons.to_csv(
+ "%s/%shts_persons.csv"
+ % (context.config("output_path"), context.config("output_prefix")),
+ sep=";",
+ index=False,
+ )
- df_trips.to_csv("%s/%shts_trips.csv" % (
- context.config("output_path"), context.config("output_prefix")
- ), sep = ";", index = False)
+ df_trips.to_csv(
+ "%s/%shts_trips.csv"
+ % (context.config("output_path"), context.config("output_prefix")),
+ sep=";",
+ index=False,
+ )
diff --git a/data/hts/selected.py b/data/hts/selected.py
index d5c5bd43..1832fbc7 100644
--- a/data/hts/selected.py
+++ b/data/hts/selected.py
@@ -1,19 +1,21 @@
import pandas as pd
import numpy as np
+
def configure(context):
hts = context.config("hts")
if hts == "egt":
- context.stage("data.hts.egt.filtered", alias = "hts")
+ context.stage("data.hts.egt.filtered", alias="hts")
elif hts == "entd":
- context.stage("data.hts.entd.reweighted", alias = "hts")
+ context.stage("data.hts.entd.reweighted", alias="hts")
elif hts == "edgt_lyon":
- context.stage("data.hts.edgt_lyon.reweighted", alias = "hts")
+ context.stage("data.hts.edgt_lyon.reweighted", alias="hts")
elif hts == "edgt_44":
- context.stage("data.hts.edgt_44.reweighted", alias = "hts")
+ context.stage("data.hts.edgt_44.reweighted", alias="hts")
else:
raise RuntimeError("Unknown HTS: %s" % hts)
+
def execute(context):
return context.stage("hts")
diff --git a/data/income/municipality.py b/data/income/municipality.py
index 7bf65015..76fbb2f2 100644
--- a/data/income/municipality.py
+++ b/data/income/municipality.py
@@ -19,21 +19,44 @@
EQASIM_INCOME_ATTRIBUTES = ["size", "family_comp"]
# final columns of the income DataFrame
-INCOME_DF_COLUMNS = ["commune_id", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "attribute", "value", "is_imputed", "is_missing", "reference_median"]
+INCOME_DF_COLUMNS = [
+ "commune_id",
+ "q1",
+ "q2",
+ "q3",
+ "q4",
+ "q5",
+ "q6",
+ "q7",
+ "q8",
+ "q9",
+ "attribute",
+ "value",
+ "is_imputed",
+ "is_missing",
+ "reference_median",
+]
def configure(context):
context.config("data_path")
context.stage("data.spatial.municipalities")
- context.config("income_com_path", "filosofi_2019/indic-struct-distrib-revenu-2019-COMMUNES.zip")
+ context.config(
+ "income_com_path", "filosofi_2019/indic-struct-distrib-revenu-2019-COMMUNES.zip"
+ )
context.config("income_com_xlsx", "FILO2019_DISP_COM.xlsx")
context.config("income_year", 19)
-def _income_distributions_from_filosofi_ensemble_sheet(filsofi_sheets, year, df_municipalities):
+def _income_distributions_from_filosofi_ensemble_sheet(
+ filsofi_sheets, year, df_municipalities
+):
requested_communes = set(df_municipalities["commune_id"].unique())
- df = filsofi_sheets["ENSEMBLE"][["CODGEO"] + [("D%d" % q) + year if q != 5 else "Q2" + year for q in range(1, 10)]]
+ df = filsofi_sheets["ENSEMBLE"][
+ ["CODGEO"]
+ + [("D%d" % q) + year if q != 5 else "Q2" + year for q in range(1, 10)]
+ ]
df.columns = ["commune_id", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9"]
df.loc[:, "reference_median"] = df["q5"].values
@@ -42,13 +65,21 @@ def _income_distributions_from_filosofi_ensemble_sheet(filsofi_sheets, year, df_
# Find communes without data
df["commune_id"] = df["commune_id"].astype("category")
- missing_communes = set(df_municipalities["commune_id"].unique()) - set(df["commune_id"].unique())
- print("Found %d/%d municipalities that are missing" % (len(missing_communes), len(requested_communes)))
+ missing_communes = set(df_municipalities["commune_id"].unique()) - set(
+ df["commune_id"].unique()
+ )
+ print(
+ "Found %d/%d municipalities that are missing"
+ % (len(missing_communes), len(requested_communes))
+ )
# Find communes without full distribution
df["is_imputed"] = df["q2"].isna()
df["is_missing"] = False
- print("Found %d/%d municipalities which do not have full distribution" % (sum(df["is_imputed"]), len(requested_communes)))
+ print(
+ "Found %d/%d municipalities which do not have full distribution"
+ % (sum(df["is_imputed"]), len(requested_communes))
+ )
# First, find suitable distribution for incomplete cases by finding the one with the most similar median
incomplete_medians = df[df["is_imputed"]]["q5"].values
@@ -56,29 +87,44 @@ def _income_distributions_from_filosofi_ensemble_sheet(filsofi_sheets, year, df_
df_complete = df[~df["is_imputed"]]
complete_medians = df_complete["q5"].values
- indices = np.argmin(np.abs(complete_medians[:, np.newaxis] - incomplete_medians[np.newaxis, :]), axis = 0)
+ indices = np.argmin(
+ np.abs(complete_medians[:, np.newaxis] - incomplete_medians[np.newaxis, :]),
+ axis=0,
+ )
for k in range(1, 10):
- df.loc[df["is_imputed"], "q%d" % k] = df_complete.iloc[indices]["q%d" % k].values
+ df.loc[df["is_imputed"], "q%d" % k] = df_complete.iloc[indices][
+ "q%d" % k
+ ].values
     # Second, add missing municipalities by nearest neighbor
# ... build tree of existing communes
- df_existing = df_municipalities[df_municipalities["commune_id"].astype(str).isin(df["commune_id"])] # pandas Bug
- coordinates = np.vstack([df_existing["geometry"].centroid.x, df_existing["geometry"].centroid.y]).T
+ df_existing = df_municipalities[
+ df_municipalities["commune_id"].astype(str).isin(df["commune_id"])
+ ] # pandas Bug
+ coordinates = np.vstack(
+ [df_existing["geometry"].centroid.x, df_existing["geometry"].centroid.y]
+ ).T
kd_tree = KDTree(coordinates)
# ... query tree for missing communes
- df_missing = df_municipalities[df_municipalities["commune_id"].astype(str).isin(missing_communes)] # pandas Bug
+ df_missing = df_municipalities[
+ df_municipalities["commune_id"].astype(str).isin(missing_communes)
+ ] # pandas Bug
if len(df_missing) > 0:
- coordinates = np.vstack([df_missing["geometry"].centroid.x, df_missing["geometry"].centroid.y]).T
+ coordinates = np.vstack(
+ [df_missing["geometry"].centroid.x, df_missing["geometry"].centroid.y]
+ ).T
indices = kd_tree.query(coordinates)[1].flatten()
# ... build data frame of imputed communes
- df_reconstructed = pd.concat([
- df[df["commune_id"] == df_existing.iloc[index]["commune_id"]]
- for index in indices
- ])
+ df_reconstructed = pd.concat(
+ [
+ df[df["commune_id"] == df_existing.iloc[index]["commune_id"]]
+ for index in indices
+ ]
+ )
df_reconstructed["commune_id"] = df_missing["commune_id"].values
df_reconstructed["is_imputed"] = True
df_reconstructed["is_missing"] = True
@@ -97,11 +143,15 @@ def _income_distributions_from_filosofi_ensemble_sheet(filsofi_sheets, year, df_
return df[INCOME_DF_COLUMNS]
-def _income_distributions_from_filosofi_attribute_sheets(filsofi_sheets, year, df_municipalities, attributes):
+def _income_distributions_from_filosofi_attribute_sheets(
+ filsofi_sheets, year, df_municipalities, attributes
+):
requested_communes = set(df_municipalities["commune_id"].unique())
# read attributes
- df_with_attributes = read_filosofi_attributes(filsofi_sheets, year, attributes, requested_communes)
+ df_with_attributes = read_filosofi_attributes(
+ filsofi_sheets, year, attributes, requested_communes
+ )
df_with_attributes.rename(
columns={
@@ -139,8 +189,8 @@ def _read_filosofi_excel(context):
sheet_list = sheet_list + [x["sheet"] for x in attr["modalities"]]
# open and read income data file
- with zipfile.ZipFile("{}/{}".format(
- context.config("data_path"), context.config("income_com_path"))
+ with zipfile.ZipFile(
+ "{}/{}".format(context.config("data_path"), context.config("income_com_path"))
) as archive:
with archive.open(context.config("income_com_xlsx")) as f:
df = pd.read_excel(f, sheet_name=sheet_list, skiprows=5)
@@ -159,17 +209,25 @@ def execute(context):
filosofi_excel, attributes = _read_filosofi_excel(context)
# Read ENSEMBLE sheet: global distributions, by commune
- ensemble_distributions = _income_distributions_from_filosofi_ensemble_sheet(filosofi_excel, year, df_municipalities)
+ ensemble_distributions = _income_distributions_from_filosofi_ensemble_sheet(
+ filosofi_excel, year, df_municipalities
+ )
# Read attribute sheets: distributions on individuals with specific attribute values
# (ex: sheet TYPMENR_2 corresponds to households with `family_comp`=`Single_wom`)
- attribute_distributions = _income_distributions_from_filosofi_attribute_sheets(filosofi_excel, year, df_municipalities, attributes)
+ attribute_distributions = _income_distributions_from_filosofi_attribute_sheets(
+ filosofi_excel, year, df_municipalities, attributes
+ )
return pd.concat([ensemble_distributions, attribute_distributions])
def validate(context):
- if not os.path.exists("%s/%s" % (context.config("data_path"), context.config("income_com_path"))):
+ if not os.path.exists(
+ "%s/%s" % (context.config("data_path"), context.config("income_com_path"))
+ ):
raise RuntimeError("Municipality Filosofi data is not available")
- return os.path.getsize("%s/%s" % (context.config("data_path"), context.config("income_com_path")))
+ return os.path.getsize(
+ "%s/%s" % (context.config("data_path"), context.config("income_com_path"))
+ )
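# --- Illustrative sketch (not part of the patch) ---------------------------------
# The two imputation steps used above, on toy data. Step 1: a commune with an
# incomplete income distribution borrows the deciles of the complete commune whose
# median (q5) is closest. Step 2: a commune that is missing entirely copies the
# distribution of its nearest neighbour, found with a KDTree on municipality
# centroids. All numbers are invented.
import numpy as np
from scipy.spatial import KDTree

# Step 1: match by most similar median
complete_medians = np.array([18000.0, 21000.0, 26000.0])
incomplete_medians = np.array([20500.0, 25000.0])
indices = np.argmin(
    np.abs(complete_medians[:, np.newaxis] - incomplete_medians[np.newaxis, :]),
    axis=0,
)
# indices == [1, 2]: borrow from the communes with medians 21000 and 26000

# Step 2: match by nearest centroid
known_centroids = np.array([[0.0, 0.0], [10.0, 0.0], [0.0, 10.0]])
missing_centroids = np.array([[9.0, 1.0]])
kd_tree = KDTree(known_centroids)
neighbour_indices = kd_tree.query(missing_centroids)[1].flatten()
# neighbour_indices == [1]: copy the distribution of the second known commune
# ----------------------------------------------------------------------------------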
diff --git a/data/income/region.py b/data/income/region.py
index 29643d0c..bb062ae3 100644
--- a/data/income/region.py
+++ b/data/income/region.py
@@ -6,19 +6,22 @@
Loads the regional aggregated income distribution.
"""
+
def configure(context):
context.config("data_path")
- context.config("income_reg_path", "filosofi_2019/indic-struct-distrib-revenu-2019-SUPRA.zip")
+ context.config(
+ "income_reg_path", "filosofi_2019/indic-struct-distrib-revenu-2019-SUPRA.zip"
+ )
context.config("income_reg_xlsx", "FILO2019_DISP_REG.xlsx")
context.config("income_year", 19)
+
def execute(context):
- with zipfile.ZipFile("{}/{}".format(
- context.config("data_path"), context.config("income_reg_path"))) as archive:
+ with zipfile.ZipFile(
+ "{}/{}".format(context.config("data_path"), context.config("income_reg_path"))
+ ) as archive:
with archive.open(context.config("income_reg_xlsx")) as f:
- df = pd.read_excel(f,
- sheet_name = "ENSEMBLE", skiprows = 5
- )
+ df = pd.read_excel(f, sheet_name="ENSEMBLE", skiprows=5)
values = df[df["CODGEO"] == 11][
[
@@ -29,8 +32,13 @@ def execute(context):
return values
+
def validate(context):
- if not os.path.exists("%s/%s" % (context.config("data_path"), context.config("income_reg_path"))):
+ if not os.path.exists(
+ "%s/%s" % (context.config("data_path"), context.config("income_reg_path"))
+ ):
raise RuntimeError("Regional Filosofi data is not available")
- return os.path.getsize("%s/%s" % (context.config("data_path"), context.config("income_reg_path")))
+ return os.path.getsize(
+ "%s/%s" % (context.config("data_path"), context.config("income_reg_path"))
+ )
diff --git a/data/od/cleaned.py b/data/od/cleaned.py
index e13348f1..2690cab9 100644
--- a/data/od/cleaned.py
+++ b/data/od/cleaned.py
@@ -6,25 +6,34 @@
and education.
"""
+
def configure(context):
context.stage("data.od.raw")
context.stage("data.spatial.codes")
-RENAME = { "COMMUNE" : "origin_id", "DCLT" : "destination_id", "IPONDI" : "weight", "DCETUF" : "destination_id" }
+
+RENAME = {
+ "COMMUNE": "origin_id",
+ "DCLT": "destination_id",
+ "IPONDI": "weight",
+ "DCETUF": "destination_id",
+}
+
def execute(context):
-
-
+
# Load data
df_work, df_education = context.stage("data.od.raw")
# Renaming
- df_work = df_work.rename(RENAME, axis = 1)
- df_education = df_education.rename(RENAME, axis = 1)
+ df_work = df_work.rename(RENAME, axis=1)
+ df_education = df_education.rename(RENAME, axis=1)
# Fix arrondissements
df_work.loc[~df_work["ARM"].str.contains("Z"), "origin_id"] = df_work["ARM"]
- df_education.loc[~df_education["ARM"].str.contains("Z"), "origin_id"] = df_education["ARM"]
+ df_education.loc[~df_education["ARM"].str.contains("Z"), "origin_id"] = (
+ df_education["ARM"]
+ )
# Verify spatial data for work
df_codes = context.stage("data.spatial.codes")
@@ -32,7 +41,9 @@ def execute(context):
df_work["origin_id"] = df_work["origin_id"].astype("category")
df_work["destination_id"] = df_work["destination_id"].astype("category")
- excess_communes = (set(df_work["origin_id"].unique()) | set(df_work["destination_id"].unique())) - set(df_codes["commune_id"].unique())
+ excess_communes = (
+ set(df_work["origin_id"].unique()) | set(df_work["destination_id"].unique())
+ ) - set(df_codes["commune_id"].unique())
if len(excess_communes) > 0:
raise RuntimeError("Found additional communes: %s" % excess_communes)
@@ -42,7 +53,10 @@ def execute(context):
df_education["origin_id"] = df_education["origin_id"].astype("category")
df_education["destination_id"] = df_education["destination_id"].astype("category")
- excess_communes = (set(df_education["origin_id"].unique()) | set(df_education["destination_id"].unique())) - set(df_codes["commune_id"].unique())
+ excess_communes = (
+ set(df_education["origin_id"].unique())
+ | set(df_education["destination_id"].unique())
+ ) - set(df_codes["commune_id"].unique())
if len(excess_communes) > 0:
raise RuntimeError("Found additional communes: %s" % excess_communes)
@@ -55,7 +69,7 @@ def execute(context):
df_work.loc[df_work["TRANS"] == 5, "commute_mode"] = "car"
df_work.loc[df_work["TRANS"] == 6, "commute_mode"] = "pt"
df_work["commute_mode"] = df_work["commute_mode"].astype("category")
-
+
assert not np.any(df_work["commute_mode"].isna())
# Clean age range for education
@@ -65,15 +79,23 @@ def execute(context):
df_education.loc[df_education["AGEREV10"] == 15, "age_range"] = "high_school"
df_education.loc[df_education["AGEREV10"] >= 18, "age_range"] = "higher_education"
df_education["age_range"] = df_education["age_range"].astype("category")
-
+
assert not np.any(df_education["age_range"].isna())
# Aggregate the flows
print("Aggregating work ...")
- df_work = df_work.groupby(["origin_id", "destination_id", "commute_mode"])["weight"].sum().reset_index()
+ df_work = (
+ df_work.groupby(["origin_id", "destination_id", "commute_mode"])["weight"]
+ .sum()
+ .reset_index()
+ )
print("Aggregating education ...")
- df_education = df_education.groupby(["origin_id", "destination_id","age_range"])["weight"].sum().reset_index()
+ df_education = (
+ df_education.groupby(["origin_id", "destination_id", "age_range"])["weight"]
+ .sum()
+ .reset_index()
+ )
df_work["weight"] = df_work["weight"].fillna(0.0)
df_education["weight"] = df_education["weight"].fillna(0.0)
diff --git a/data/od/raw.py b/data/od/raw.py
index 41bc515b..fb70cce9 100644
--- a/data/od/raw.py
+++ b/data/od/raw.py
@@ -6,6 +6,7 @@
Loads raw OD data from French census data.
"""
+
def configure(context):
context.stage("data.spatial.codes")
context.config("data_path")
@@ -14,27 +15,34 @@ def configure(context):
context.config("od_pro_csv", "FD_MOBPRO_2019.csv")
context.config("od_sco_csv", "FD_MOBSCO_2019.csv")
+
def execute(context):
df_codes = context.stage("data.spatial.codes")
requested_communes = df_codes["commune_id"].unique()
# First, load work
- with context.progress(label = "Reading work flows ...") as progress:
+ with context.progress(label="Reading work flows ...") as progress:
df_records = []
COLUMNS_DTYPES = {
- "COMMUNE":"str",
- "ARM":"str",
- "TRANS":"int",
- "IPONDI":"float",
- "DCLT":"str"
+ "COMMUNE": "str",
+ "ARM": "str",
+ "TRANS": "int",
+ "IPONDI": "float",
+ "DCLT": "str",
}
with zipfile.ZipFile(
- "{}/{}".format(context.config("data_path"), context.config("od_pro_path"))) as archive:
+ "{}/{}".format(context.config("data_path"), context.config("od_pro_path"))
+ ) as archive:
with archive.open(context.config("od_pro_csv")) as f:
- csv = pd.read_csv(f, usecols = COLUMNS_DTYPES.keys(),
- dtype = COLUMNS_DTYPES, sep = ";",chunksize = 10240)
+ csv = pd.read_csv(
+ f,
+ usecols=COLUMNS_DTYPES.keys(),
+ dtype=COLUMNS_DTYPES,
+ sep=";",
+ chunksize=10240,
+ )
for df_chunk in csv:
progress.update(len(df_chunk))
@@ -50,22 +58,28 @@ def execute(context):
work = pd.concat(df_records)
# Second, load education
- with context.progress(label = "Reading education flows ...") as progress:
+ with context.progress(label="Reading education flows ...") as progress:
df_records = []
COLUMNS_DTYPES = {
- "COMMUNE":"str",
- "ARM":"str",
- "IPONDI":"float",
- "DCETUF":"str",
- "AGEREV10":"int"
+ "COMMUNE": "str",
+ "ARM": "str",
+ "IPONDI": "float",
+ "DCETUF": "str",
+ "AGEREV10": "int",
}
with zipfile.ZipFile(
- "{}/{}".format(context.config("data_path"), context.config("od_sco_path"))) as archive:
+ "{}/{}".format(context.config("data_path"), context.config("od_sco_path"))
+ ) as archive:
with archive.open(context.config("od_sco_csv")) as f:
- csv = pd.read_csv(f, usecols = COLUMNS_DTYPES.keys(),
- dtype = COLUMNS_DTYPES, sep = ";",chunksize = 10240)
+ csv = pd.read_csv(
+ f,
+ usecols=COLUMNS_DTYPES.keys(),
+ dtype=COLUMNS_DTYPES,
+ sep=";",
+ chunksize=10240,
+ )
for df_chunk in csv:
progress.update(len(df_chunk))
@@ -84,13 +98,21 @@ def execute(context):
def validate(context):
- if not os.path.exists("%s/%s" % (context.config("data_path"), context.config("od_pro_path"))):
+ if not os.path.exists(
+ "%s/%s" % (context.config("data_path"), context.config("od_pro_path"))
+ ):
raise RuntimeError("RP MOBPRO data is not available")
- if not os.path.exists("%s/%s" % (context.config("data_path"), context.config("od_sco_path"))):
+ if not os.path.exists(
+ "%s/%s" % (context.config("data_path"), context.config("od_sco_path"))
+ ):
raise RuntimeError("RP MOBSCO data is not available")
return [
- os.path.getsize("%s/%s" % (context.config("data_path"), context.config("od_pro_path"))),
- os.path.getsize("%s/%s" % (context.config("data_path"), context.config("od_sco_path")))
+ os.path.getsize(
+ "%s/%s" % (context.config("data_path"), context.config("od_pro_path"))
+ ),
+ os.path.getsize(
+ "%s/%s" % (context.config("data_path"), context.config("od_sco_path"))
+ ),
]
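# --- Illustrative sketch (not part of the patch) ---------------------------------
# The chunked-reading pattern used for MOBPRO/MOBSCO above, shown in isolation.
# Reading 10240 rows at a time keeps memory bounded, string dtypes preserve leading
# zeros in commune codes, and only rows whose origin commune starts with a requested
# departement code are kept. The archive path and departement list are placeholder
# assumptions; the member name matches the od_pro_csv default above.
import zipfile
import pandas as pd

ARCHIVE_PATH = "data/rp_2019/RP2019_mobpro_csv.zip"  # hypothetical path
REQUESTED_DEPARTEMENTS = ["75", "77"]                # hypothetical selection

COLUMNS_DTYPES = {"COMMUNE": "str", "DCLT": "str", "IPONDI": "float"}
df_records = []

with zipfile.ZipFile(ARCHIVE_PATH) as archive:
    with archive.open("FD_MOBPRO_2019.csv") as f:
        csv = pd.read_csv(
            f,
            usecols=COLUMNS_DTYPES.keys(),
            dtype=COLUMNS_DTYPES,
            sep=";",
            chunksize=10240,
        )
        for df_chunk in csv:
            f_keep = df_chunk["COMMUNE"].isna()  # just to get an all-False mask
            for departement in REQUESTED_DEPARTEMENTS:
                f_keep |= df_chunk["COMMUNE"].str.startswith(departement)
            if f_keep.any():
                df_records.append(df_chunk[f_keep])

df_work_raw = pd.concat(df_records)
# ----------------------------------------------------------------------------------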
diff --git a/data/od/weighted.py b/data/od/weighted.py
index f50702f6..e9c5e86e 100644
--- a/data/od/weighted.py
+++ b/data/od/weighted.py
@@ -9,13 +9,15 @@
Potential TODO: Do this by mode of transport!
"""
+
def configure(context):
context.stage("data.od.cleaned")
context.stage("data.spatial.codes")
- context.config("education_location_source","bpe")
+ context.config("education_location_source", "bpe")
+
-def fix_origins(df, commune_ids, purpose,category):
+def fix_origins(df, commune_ids, purpose, category):
existing_ids = set(np.unique(df["origin_id"]))
missing_ids = commune_ids - existing_ids
categories = set(np.unique(df[category]))
@@ -23,14 +25,27 @@ def fix_origins(df, commune_ids, purpose,category):
rows = []
for origin_id in missing_ids:
for destination_id in commune_ids:
- for category_name in categories :
- rows.append((origin_id, destination_id, category_name, 1.0 if origin_id == destination_id else 0.0))
+ for category_name in categories:
+ rows.append(
+ (
+ origin_id,
+ destination_id,
+ category_name,
+ 1.0 if origin_id == destination_id else 0.0,
+ )
+ )
print("Fixing %d origins for %s" % (len(missing_ids), purpose))
- return pd.concat([df, pd.DataFrame.from_records(
- rows, columns = ["origin_id", "destination_id", category, "weight"]
- )]).sort_values(["origin_id", "destination_id"])
+ return pd.concat(
+ [
+ df,
+ pd.DataFrame.from_records(
+ rows, columns=["origin_id", "destination_id", category, "weight"]
+ ),
+ ]
+ ).sort_values(["origin_id", "destination_id"])
+
def execute(context):
df_codes = context.stage("data.spatial.codes")
@@ -40,22 +55,44 @@ def execute(context):
df_work, df_education = context.stage("data.od.cleaned")
# Add missing origins
- df_work = fix_origins(df_work, commune_ids, "work","commute_mode")
- df_education = fix_origins(df_education, commune_ids, "education","age_range")
+ df_work = fix_origins(df_work, commune_ids, "work", "commute_mode")
+ df_education = fix_origins(df_education, commune_ids, "education", "age_range")
# Aggregate work (we do not consider different modes at the moment)
- df_work = df_work[["origin_id", "destination_id", "weight"]].groupby(["origin_id", "destination_id"]).sum().reset_index()
-
+ df_work = (
+ df_work[["origin_id", "destination_id", "weight"]]
+ .groupby(["origin_id", "destination_id"])
+ .sum()
+ .reset_index()
+ )
+
# Compute totals
- df_total = df_work[["origin_id", "weight"]].groupby("origin_id").sum().reset_index().rename({ "weight" : "total" }, axis = 1)
- df_work = pd.merge(df_work, df_total, on = "origin_id")
+ df_total = (
+ df_work[["origin_id", "weight"]]
+ .groupby("origin_id")
+ .sum()
+ .reset_index()
+ .rename({"weight": "total"}, axis=1)
+ )
+ df_work = pd.merge(df_work, df_total, on="origin_id")
+
+ df_total = (
+ df_education[["origin_id", "age_range", "weight"]]
+ .groupby(["origin_id", "age_range"])
+ .sum()
+ .reset_index()
+ .rename({"weight": "total"}, axis=1)
+ )
+ df_education = pd.merge(df_education, df_total, on=["origin_id", "age_range"])
- df_total = df_education[["origin_id","age_range", "weight"]].groupby(["origin_id","age_range"]).sum().reset_index().rename({ "weight" : "total" }, axis = 1)
- df_education = pd.merge(df_education, df_total, on = ["origin_id","age_range"])
-
- if context.config("education_location_source") == 'bpe':
+ if context.config("education_location_source") == "bpe":
# Aggregate education (we do not consider different age range with bpe source)
- df_education = df_education[["origin_id", "destination_id", "weight","total"]].groupby(["origin_id", "destination_id"]).sum().reset_index()
+ df_education = (
+ df_education[["origin_id", "destination_id", "weight", "total"]]
+ .groupby(["origin_id", "destination_id"])
+ .sum()
+ .reset_index()
+ )
# Compute weight
df_work["weight"] /= df_work["total"]
df_education["weight"] /= df_education["total"]
@@ -63,5 +100,5 @@ def execute(context):
del df_work["total"]
del df_education["total"]
df_education = df_education.fillna(0.0)
-
+
return df_work, df_education
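# --- Illustrative sketch (not part of the patch) ---------------------------------
# How the aggregated flows above are turned into per-origin shares: each origin's
# outgoing weights are summed, every flow is divided by that total, and the weights
# of one origin then sum to one. Toy data only.
import pandas as pd

df_flow = pd.DataFrame({
    "origin_id": ["A", "A", "B"],
    "destination_id": ["A", "B", "B"],
    "weight": [30.0, 10.0, 5.0],
})

df_total = (
    df_flow[["origin_id", "weight"]]
    .groupby("origin_id")
    .sum()
    .reset_index()
    .rename({"weight": "total"}, axis=1)
)
df_flow = pd.merge(df_flow, df_total, on="origin_id")
df_flow["weight"] /= df_flow["total"]
del df_flow["total"]
# Origin A now has shares 0.75 and 0.25, origin B has share 1.0.
# ----------------------------------------------------------------------------------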
diff --git a/data/osm/cleaned.py b/data/osm/cleaned.py
index c15de109..f7fb95cf 100644
--- a/data/osm/cleaned.py
+++ b/data/osm/cleaned.py
@@ -18,6 +18,7 @@
Additionally, the stage cuts the OSM data to the requested region of the pipeline.
"""
+
def configure(context):
context.config("data_path")
context.config("osm_path", "osm_idf")
@@ -28,11 +29,12 @@ def configure(context):
context.stage("data.osm.osmosis")
context.stage("data.spatial.municipalities")
-def write_poly(df, path, geometry_column = "geometry"):
+
+def write_poly(df, path, geometry_column="geometry"):
df = df.to_crs("EPSG:4326")
df["aggregate"] = 0
- area = df.dissolve(by = "aggregate")[geometry_column].values[0]
+ area = df.dissolve(by="aggregate")[geometry_column].values[0]
if not hasattr(area, "exterior"):
print("Selected area is not connected -> Using convex hull.")
@@ -51,9 +53,12 @@ def write_poly(df, path, geometry_column = "geometry"):
with open(path, "w+") as f:
f.write("\n".join(data))
+
def execute(context):
- input_files = get_input_files("{}/{}".format(context.config("data_path"), context.config("osm_path")))
-
+ input_files = get_input_files(
+ "{}/{}".format(context.config("data_path"), context.config("osm_path"))
+ )
+
# Prepare bounding area
df_area = context.stage("data.spatial.municipalities")
write_poly(df_area, "%s/boundary.poly" % context.path())
@@ -70,12 +75,22 @@ def execute(context):
absolute_path = os.path.abspath(path)
- data.osm.osmosis.run(context, [
- "--read-%s" % mode, absolute_path,
- "--tag-filter", "accept-ways", "highway=%s" % highway_tags, "railway=%s" % railway_tags,
- "--bounding-polygon", "file=%s/boundary.poly" % context.path(), "completeWays=yes",
- "--write-pbf", "filtered_%d.osm.pbf" % index
- ])
+ data.osm.osmosis.run(
+ context,
+ [
+ "--read-%s" % mode,
+ absolute_path,
+ "--tag-filter",
+ "accept-ways",
+ "highway=%s" % highway_tags,
+ "railway=%s" % railway_tags,
+ "--bounding-polygon",
+ "file=%s/boundary.poly" % context.path(),
+ "completeWays=yes",
+ "--write-pbf",
+ "filtered_%d.osm.pbf" % index,
+ ],
+ )
# Merge filtered files if there are multiple ones
print("Merging and compressing OSM data...")
@@ -98,17 +113,23 @@ def execute(context):
return "output.osm.gz"
+
def get_input_files(base_path):
osm_paths = sorted(list(glob.glob("{}/*.osm.pbf".format(base_path))))
osm_paths += sorted(list(glob.glob("{}/*.osm.xml".format(base_path))))
if len(osm_paths) == 0:
- raise RuntimeError("Did not find any OSM data (.osm.pbf) in {}".format(base_path))
-
+ raise RuntimeError(
+ "Did not find any OSM data (.osm.pbf) in {}".format(base_path)
+ )
+
return osm_paths
+
def validate(context):
- input_files = get_input_files("{}/{}".format(context.config("data_path"), context.config("osm_path")))
+ input_files = get_input_files(
+ "{}/{}".format(context.config("data_path"), context.config("osm_path"))
+ )
total_size = 0
for path in input_files:
diff --git a/data/osm/osmosis.py b/data/osm/osmosis.py
index 3913ddf4..39959d5e 100644
--- a/data/osm/osmosis.py
+++ b/data/osm/osmosis.py
@@ -1,15 +1,17 @@
import subprocess as sp
import shutil, os
+
def configure(context):
context.config("osmosis_binary", "osmosis")
context.config("java_binary", "java")
context.config("java_memory", "50G")
-def run(context, arguments = [], cwd = None):
+
+def run(context, arguments=[], cwd=None):
"""
- This function calls osmosis.
+ This function calls osmosis.
"""
# Make sure there is a dependency
context.stage("data.osm.osmosis")
@@ -18,9 +20,7 @@ def run(context, arguments = [], cwd = None):
cwd = context.path()
# Prepare command line
- command_line = [
- shutil.which(context.config("osmosis_binary"))
- ] + arguments
+ command_line = [shutil.which(context.config("osmosis_binary"))] + arguments
# Prepare environment
environment = os.environ.copy()
@@ -28,20 +28,23 @@ def run(context, arguments = [], cwd = None):
environment["JAVACMD_OPTIONS"] = "-Xmx%s" % context.config("java_memory")
# Run Osmosis
- return_code = sp.check_call(command_line, cwd = cwd, env = environment)
+ return_code = sp.check_call(command_line, cwd=cwd, env=environment)
if not return_code == 0:
raise RuntimeError("Osmosis return code: %d" % return_code)
+
def validate(context):
if shutil.which(context.config("osmosis_binary")) in ["", None]:
- raise RuntimeError("Cannot find Osmosis binary at: %s" % context.config("osmosis_binary"))
+ raise RuntimeError(
+ "Cannot find Osmosis binary at: %s" % context.config("osmosis_binary")
+ )
- if not b"0.48." in sp.check_output([
- shutil.which(context.config("osmosis_binary")),
- "-v"
- ], stderr = sp.STDOUT):
+ if not b"0.48." in sp.check_output(
+ [shutil.which(context.config("osmosis_binary")), "-v"], stderr=sp.STDOUT
+ ):
print("WARNING! Osmosis of at least version 0.48.x is recommended!")
+
def execute(context):
pass
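# --- Illustrative sketch (not part of the patch) ---------------------------------
# The subprocess pattern wrapped by the osmosis stage above, shown standalone. The
# Java heap limit is forwarded through JAVACMD_OPTIONS; note that check_call already
# raises CalledProcessError on a non-zero exit code, so the explicit return-code test
# in the stage is a redundant safeguard. Assumes an osmosis binary on the PATH.
import os
import shutil
import subprocess as sp

osmosis_binary = shutil.which("osmosis")  # assumption: osmosis is installed

environment = os.environ.copy()
environment["JAVACMD_OPTIONS"] = "-Xmx50G"  # mirrors the java_memory default above

sp.check_call([osmosis_binary, "-v"], env=environment)
# ----------------------------------------------------------------------------------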
diff --git a/data/sirene/cleaned.py b/data/sirene/cleaned.py
index 9bef6da5..0c5ef575 100644
--- a/data/sirene/cleaned.py
+++ b/data/sirene/cleaned.py
@@ -4,26 +4,27 @@
"""
Clean the SIRENE enterprise census.
"""
-
+
+
def configure(context):
- context.stage("data.sirene.raw_siren", ephemeral = True)
- context.stage("data.sirene.raw_siret", ephemeral = True)
+ context.stage("data.sirene.raw_siren", ephemeral=True)
+ context.stage("data.sirene.raw_siret", ephemeral=True)
context.stage("data.spatial.codes")
context.config("exclude_no_employee", False)
+
def execute(context):
df_sirene_establishments = context.stage("data.sirene.raw_siret")
df_sirene_headquarters = context.stage("data.sirene.raw_siren")
-
# Filter out establishments without a corresponding headquarter
- df_sirene = df_sirene_establishments[df_sirene_establishments["siren"].isin(df_sirene_headquarters["siren"])].copy()
+ df_sirene = df_sirene_establishments[
+ df_sirene_establishments["siren"].isin(df_sirene_headquarters["siren"])
+ ].copy()
# Remove inactive enterprises
- df_sirene = df_sirene[
- df_sirene["etatAdministratifEtablissement"] == "A"
- ].copy()
-
+ df_sirene = df_sirene[df_sirene["etatAdministratifEtablissement"] == "A"].copy()
+
if context.config("exclude_no_employee"):
# exclude "NN", "00", and NaN
df_sirene = df_sirene[
@@ -32,37 +33,93 @@ def execute(context):
].copy()
# Define work place weights by person under salary ....
- df_sirene["minimum_employees"] = 1 # Includes "NN", "00", and NaN
- df_sirene["maximum_employees"] = 1 # Includes "NN", "00", and NaN
-
- df_sirene.loc[df_sirene["trancheEffectifsEtablissement"] == "01", "minimum_employees"] = 1
- df_sirene.loc[df_sirene["trancheEffectifsEtablissement"] == "01", "maximum_employees"] = 2
- df_sirene.loc[df_sirene["trancheEffectifsEtablissement"] == "02", "minimum_employees"] = 3
- df_sirene.loc[df_sirene["trancheEffectifsEtablissement"] == "02", "maximum_employees"] = 5
- df_sirene.loc[df_sirene["trancheEffectifsEtablissement"] == "03", "minimum_employees"] = 6
- df_sirene.loc[df_sirene["trancheEffectifsEtablissement"] == "03", "maximum_employees"] = 9
- df_sirene.loc[df_sirene["trancheEffectifsEtablissement"] == "11", "minimum_employees"] = 10
- df_sirene.loc[df_sirene["trancheEffectifsEtablissement"] == "11", "maximum_employees"] = 19
- df_sirene.loc[df_sirene["trancheEffectifsEtablissement"] == "12", "minimum_employees"] = 20
- df_sirene.loc[df_sirene["trancheEffectifsEtablissement"] == "12", "maximum_employees"] = 49
- df_sirene.loc[df_sirene["trancheEffectifsEtablissement"] == "21", "minimum_employees"] = 50
- df_sirene.loc[df_sirene["trancheEffectifsEtablissement"] == "21", "maximum_employees"] = 99
- df_sirene.loc[df_sirene["trancheEffectifsEtablissement"] == "22", "minimum_employees"] = 100
- df_sirene.loc[df_sirene["trancheEffectifsEtablissement"] == "22", "maximum_employees"] = 199
- df_sirene.loc[df_sirene["trancheEffectifsEtablissement"] == "31", "minimum_employees"] = 200
- df_sirene.loc[df_sirene["trancheEffectifsEtablissement"] == "31", "maximum_employees"] = 249
- df_sirene.loc[df_sirene["trancheEffectifsEtablissement"] == "32", "minimum_employees"] = 250
- df_sirene.loc[df_sirene["trancheEffectifsEtablissement"] == "32", "maximum_employees"] = 499
- df_sirene.loc[df_sirene["trancheEffectifsEtablissement"] == "41", "minimum_employees"] = 500
- df_sirene.loc[df_sirene["trancheEffectifsEtablissement"] == "41", "maximum_employees"] = 999
- df_sirene.loc[df_sirene["trancheEffectifsEtablissement"] == "42", "minimum_employees"] = 1000
- df_sirene.loc[df_sirene["trancheEffectifsEtablissement"] == "42", "maximum_employees"] = 1999
- df_sirene.loc[df_sirene["trancheEffectifsEtablissement"] == "51", "minimum_employees"] = 2000
- df_sirene.loc[df_sirene["trancheEffectifsEtablissement"] == "51", "maximum_employees"] = 4999
- df_sirene.loc[df_sirene["trancheEffectifsEtablissement"] == "52", "minimum_employees"] = 5000
- df_sirene.loc[df_sirene["trancheEffectifsEtablissement"] == "52", "maximum_employees"] = 9999
- df_sirene.loc[df_sirene["trancheEffectifsEtablissement"] == "53", "minimum_employees"] = 10000
- df_sirene.loc[df_sirene["trancheEffectifsEtablissement"] == "53", "maximum_employees"] = np.inf
+ df_sirene["minimum_employees"] = 1 # Includes "NN", "00", and NaN
+ df_sirene["maximum_employees"] = 1 # Includes "NN", "00", and NaN
+
+ df_sirene.loc[
+ df_sirene["trancheEffectifsEtablissement"] == "01", "minimum_employees"
+ ] = 1
+ df_sirene.loc[
+ df_sirene["trancheEffectifsEtablissement"] == "01", "maximum_employees"
+ ] = 2
+ df_sirene.loc[
+ df_sirene["trancheEffectifsEtablissement"] == "02", "minimum_employees"
+ ] = 3
+ df_sirene.loc[
+ df_sirene["trancheEffectifsEtablissement"] == "02", "maximum_employees"
+ ] = 5
+ df_sirene.loc[
+ df_sirene["trancheEffectifsEtablissement"] == "03", "minimum_employees"
+ ] = 6
+ df_sirene.loc[
+ df_sirene["trancheEffectifsEtablissement"] == "03", "maximum_employees"
+ ] = 9
+ df_sirene.loc[
+ df_sirene["trancheEffectifsEtablissement"] == "11", "minimum_employees"
+ ] = 10
+ df_sirene.loc[
+ df_sirene["trancheEffectifsEtablissement"] == "11", "maximum_employees"
+ ] = 19
+ df_sirene.loc[
+ df_sirene["trancheEffectifsEtablissement"] == "12", "minimum_employees"
+ ] = 20
+ df_sirene.loc[
+ df_sirene["trancheEffectifsEtablissement"] == "12", "maximum_employees"
+ ] = 49
+ df_sirene.loc[
+ df_sirene["trancheEffectifsEtablissement"] == "21", "minimum_employees"
+ ] = 50
+ df_sirene.loc[
+ df_sirene["trancheEffectifsEtablissement"] == "21", "maximum_employees"
+ ] = 99
+ df_sirene.loc[
+ df_sirene["trancheEffectifsEtablissement"] == "22", "minimum_employees"
+ ] = 100
+ df_sirene.loc[
+ df_sirene["trancheEffectifsEtablissement"] == "22", "maximum_employees"
+ ] = 199
+ df_sirene.loc[
+ df_sirene["trancheEffectifsEtablissement"] == "31", "minimum_employees"
+ ] = 200
+ df_sirene.loc[
+ df_sirene["trancheEffectifsEtablissement"] == "31", "maximum_employees"
+ ] = 249
+ df_sirene.loc[
+ df_sirene["trancheEffectifsEtablissement"] == "32", "minimum_employees"
+ ] = 250
+ df_sirene.loc[
+ df_sirene["trancheEffectifsEtablissement"] == "32", "maximum_employees"
+ ] = 499
+ df_sirene.loc[
+ df_sirene["trancheEffectifsEtablissement"] == "41", "minimum_employees"
+ ] = 500
+ df_sirene.loc[
+ df_sirene["trancheEffectifsEtablissement"] == "41", "maximum_employees"
+ ] = 999
+ df_sirene.loc[
+ df_sirene["trancheEffectifsEtablissement"] == "42", "minimum_employees"
+ ] = 1000
+ df_sirene.loc[
+ df_sirene["trancheEffectifsEtablissement"] == "42", "maximum_employees"
+ ] = 1999
+ df_sirene.loc[
+ df_sirene["trancheEffectifsEtablissement"] == "51", "minimum_employees"
+ ] = 2000
+ df_sirene.loc[
+ df_sirene["trancheEffectifsEtablissement"] == "51", "maximum_employees"
+ ] = 4999
+ df_sirene.loc[
+ df_sirene["trancheEffectifsEtablissement"] == "52", "minimum_employees"
+ ] = 5000
+ df_sirene.loc[
+ df_sirene["trancheEffectifsEtablissement"] == "52", "maximum_employees"
+ ] = 9999
+ df_sirene.loc[
+ df_sirene["trancheEffectifsEtablissement"] == "53", "minimum_employees"
+ ] = 10000
+ df_sirene.loc[
+ df_sirene["trancheEffectifsEtablissement"] == "53", "maximum_employees"
+ ] = np.inf
# Add activity classification
df_sirene["ape"] = df_sirene["activitePrincipaleEtablissement"]
@@ -80,15 +137,24 @@ def execute(context):
if len(excess_communes) > 5:
raise RuntimeError("Found more than 5 excess municipalities in SIRENE data")
- df_sirene = df_sirene[["siren", "commune_id", "minimum_employees", "maximum_employees", "ape", "siret"]]
+ df_sirene = df_sirene[
+ [
+ "siren",
+ "commune_id",
+ "minimum_employees",
+ "maximum_employees",
+ "ape",
+ "siret",
+ ]
+ ]
# Add law status
initial_count = len(df_sirene)
- df_sirene = pd.merge(df_sirene, df_sirene_headquarters, on = "siren")
+ df_sirene = pd.merge(df_sirene, df_sirene_headquarters, on="siren")
df_sirene["law_status"] = df_sirene["categorieJuridiqueUniteLegale"]
- df_sirene = df_sirene.drop(columns = ["categorieJuridiqueUniteLegale", "siren"])
+ df_sirene = df_sirene.drop(columns=["categorieJuridiqueUniteLegale", "siren"])
final_count = len(df_sirene)
assert initial_count == final_count
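# --- Illustrative sketch (not part of the patch) ---------------------------------
# The long block of .loc assignments above encodes the INSEE size bands of
# trancheEffectifsEtablissement. An equivalent, more compact formulation maps each
# band code to a (minimum, maximum) employee pair; this is shown only to document
# the bands, not as the pipeline's implementation.
import numpy as np
import pandas as pd

EMPLOYEE_BANDS = {
    "01": (1, 2), "02": (3, 5), "03": (6, 9), "11": (10, 19), "12": (20, 49),
    "21": (50, 99), "22": (100, 199), "31": (200, 249), "32": (250, 499),
    "41": (500, 999), "42": (1000, 1999), "51": (2000, 4999),
    "52": (5000, 9999), "53": (10000, np.inf),
}

def employee_bounds(tranche):
    # "NN", "00" and missing values fall back to (1, 1), as in the stage above
    return EMPLOYEE_BANDS.get(tranche, (1, 1))

df_example = pd.DataFrame({"trancheEffectifsEtablissement": ["12", "NN", None]})
bounds = df_example["trancheEffectifsEtablissement"].map(employee_bounds)
df_example[["minimum_employees", "maximum_employees"]] = pd.DataFrame(
    bounds.tolist(), index=df_example.index
)
# ----------------------------------------------------------------------------------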
diff --git a/data/sirene/localized.py b/data/sirene/localized.py
index 243b51c7..e2111c1d 100644
--- a/data/sirene/localized.py
+++ b/data/sirene/localized.py
@@ -6,6 +6,8 @@
Should we consider using location accuracy variable to optimize process?
"""
+
+
def configure(context):
context.stage("data.sirene.cleaned")
context.stage("data.sirene.raw_geoloc")
@@ -15,19 +17,20 @@ def execute(context):
df_sirene = context.stage("data.sirene.cleaned")
df_siret_geoloc = context.stage("data.sirene.raw_geoloc")
-
# merging geographical SIREN file (containing only SIRET and location) with full SIREN file (all variables and processed)
- df_siret_geoloc.set_index(("siret"),inplace=True,verify_integrity=True)
- df_sirene.set_index(("siret"),inplace=True,verify_integrity=True)
+ df_siret_geoloc.set_index(("siret"), inplace=True, verify_integrity=True)
+ df_sirene.set_index(("siret"), inplace=True, verify_integrity=True)
df_siret_geoloc.sort_index(inplace=True)
df_sirene.sort_index(inplace=True)
- df_sirene = df_sirene.join(df_siret_geoloc,how="left")
- df_sirene.dropna(subset=['x', 'y'],inplace=True)
-
+ df_sirene = df_sirene.join(df_siret_geoloc, how="left")
+ df_sirene.dropna(subset=["x", "y"], inplace=True)
# convert to geopandas dataframe with Lambert 93, EPSG:2154 french official projection
- df_sirene = gpd.GeoDataFrame(df_sirene, geometry=gpd.points_from_xy(df_sirene.x, df_sirene.y),crs="EPSG:2154")
-
+ df_sirene = gpd.GeoDataFrame(
+ df_sirene,
+ geometry=gpd.points_from_xy(df_sirene.x, df_sirene.y),
+ crs="EPSG:2154",
+ )
return df_sirene
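# --- Illustrative sketch (not part of the patch) ---------------------------------
# Building a GeoDataFrame from x/y coordinates as done above. EPSG:2154 (Lambert 93)
# is the official French projection used by the geolocation file; the coordinates
# below are invented.
import geopandas as gpd
import pandas as pd

df_points = pd.DataFrame({
    "siret": [11111111100001, 22222222200001],
    "x": [652000.0, 653500.0],
    "y": [6862000.0, 6860000.0],
})

gdf_points = gpd.GeoDataFrame(
    df_points,
    geometry=gpd.points_from_xy(df_points["x"], df_points["y"]),
    crs="EPSG:2154",
)
# ----------------------------------------------------------------------------------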
diff --git a/data/sirene/output.py b/data/sirene/output.py
index a64a9a27..87de6fe9 100644
--- a/data/sirene/output.py
+++ b/data/sirene/output.py
@@ -3,16 +3,20 @@
makes it easy to extract the data set from the pipeline.
"""
+
def configure(context):
context.stage("data.sirene.localized")
context.config("output_path")
context.config("output_prefix", "ile_de_france_")
+
def execute(context):
df_sirene = context.stage("data.sirene.localized")
df_sirene["commune_id"] = df_sirene["commune_id"].astype(str)
- df_sirene.to_file("%s/%ssirene.gpkg" % (
- context.config("output_path"), context.config("output_prefix")), driver = "GPKG")
-
+ df_sirene.to_file(
+ "%s/%ssirene.gpkg"
+ % (context.config("output_path"), context.config("output_prefix")),
+ driver="GPKG",
+ )
diff --git a/data/sirene/raw_geoloc.py b/data/sirene/raw_geoloc.py
index 7887710c..5537f499 100644
--- a/data/sirene/raw_geoloc.py
+++ b/data/sirene/raw_geoloc.py
@@ -5,10 +5,14 @@
This stage loads the geolocalization data for the French enterprise registry.
"""
+
def configure(context):
context.config("data_path")
- context.config("siret_geo_path", "sirene/GeolocalisationEtablissement_Sirene_pour_etudes_statistiques_utf8.zip")
-
+ context.config(
+ "siret_geo_path",
+ "sirene/GeolocalisationEtablissement_Sirene_pour_etudes_statistiques_utf8.zip",
+ )
+
context.stage("data.spatial.codes")
@@ -16,37 +20,47 @@ def execute(context):
# Filter by departement
df_codes = context.stage("data.spatial.codes")
requested_departements = set(df_codes["departement_id"].unique())
-
+
COLUMNS_DTYPES = {
- "siret":"int64",
- "x":"float",
- "y":"float",
- "plg_code_commune":"str",
+ "siret": "int64",
+ "x": "float",
+ "y": "float",
+ "plg_code_commune": "str",
}
- df_siret_geoloc = pd.DataFrame(columns=["siret","x","y"])
-
- with context.progress(label = "Reading geolocalized SIRET ...") as progress:
- csv = pd.read_csv("%s/%s" % (context.config("data_path"), context.config("siret_geo_path")),
- usecols = COLUMNS_DTYPES.keys(), sep=";",dtype = COLUMNS_DTYPES,chunksize = 10240)
-
- for df_chunk in csv:
+ df_siret_geoloc = pd.DataFrame(columns=["siret", "x", "y"])
+
+ with context.progress(label="Reading geolocalized SIRET ...") as progress:
+ csv = pd.read_csv(
+ "%s/%s" % (context.config("data_path"), context.config("siret_geo_path")),
+ usecols=COLUMNS_DTYPES.keys(),
+ sep=";",
+ dtype=COLUMNS_DTYPES,
+ chunksize=10240,
+ )
+
+ for df_chunk in csv:
progress.update(len(df_chunk))
-
- f = df_chunk["siret"].isna() # Just to get a mask
-
+
+ f = df_chunk["siret"].isna() # Just to get a mask
+
for departement in requested_departements:
f |= df_chunk["plg_code_commune"].str.startswith(departement)
- df_siret_geoloc = pd.concat([df_siret_geoloc, df_chunk[f]],ignore_index=True)
+ df_siret_geoloc = pd.concat(
+ [df_siret_geoloc, df_chunk[f]], ignore_index=True
+ )
return df_siret_geoloc
-
def validate(context):
- if not os.path.exists("%s/%s" % (context.config("data_path"), context.config("siret_geo_path"))):
+ if not os.path.exists(
+ "%s/%s" % (context.config("data_path"), context.config("siret_geo_path"))
+ ):
raise RuntimeError("SIRENE: geolocaized SIRET data is not available")
- return os.path.getsize("%s/%s" % (context.config("data_path"), context.config("siret_geo_path")))
+ return os.path.getsize(
+ "%s/%s" % (context.config("data_path"), context.config("siret_geo_path"))
+ )
diff --git a/data/sirene/raw_siren.py b/data/sirene/raw_siren.py
index 0a7d0ae5..a612f1ce 100644
--- a/data/sirene/raw_siren.py
+++ b/data/sirene/raw_siren.py
@@ -5,41 +5,48 @@
This stage loads the raw data from the French enterprise registry.
"""
+
def configure(context):
context.config("data_path")
context.config("siren_path", "sirene/StockUniteLegale_utf8.zip")
context.stage("data.sirene.raw_siret")
+
def execute(context):
relevant_siren = context.stage("data.sirene.raw_siret")["siren"].unique()
df_siren = []
-
-
COLUMNS_DTYPES = {
- "siren":"int32",
- "categorieJuridiqueUniteLegale":"str",
+ "siren": "int32",
+ "categorieJuridiqueUniteLegale": "str",
}
-
- with context.progress(label = "Reading SIREN...") as progress:
- csv = pd.read_csv("%s/%s" % (context.config("data_path"), context.config("siren_path")),
- usecols = COLUMNS_DTYPES.keys(), dtype = COLUMNS_DTYPES,chunksize = 10240)
+
+ with context.progress(label="Reading SIREN...") as progress:
+ csv = pd.read_csv(
+ "%s/%s" % (context.config("data_path"), context.config("siren_path")),
+ usecols=COLUMNS_DTYPES.keys(),
+ dtype=COLUMNS_DTYPES,
+ chunksize=10240,
+ )
for df_chunk in csv:
progress.update(len(df_chunk))
- df_chunk = df_chunk[
- df_chunk["siren"].isin(relevant_siren)
- ]
+ df_chunk = df_chunk[df_chunk["siren"].isin(relevant_siren)]
if len(df_chunk) > 0:
df_siren.append(df_chunk)
return pd.concat(df_siren)
+
def validate(context):
- if not os.path.exists("%s/%s" % (context.config("data_path"), context.config("siren_path"))):
+ if not os.path.exists(
+ "%s/%s" % (context.config("data_path"), context.config("siren_path"))
+ ):
raise RuntimeError("SIRENE: SIREN data is not available")
- return os.path.getsize("%s/%s" % (context.config("data_path"), context.config("siren_path")))
+ return os.path.getsize(
+ "%s/%s" % (context.config("data_path"), context.config("siren_path"))
+ )
diff --git a/data/sirene/raw_siret.py b/data/sirene/raw_siret.py
index 7b10713a..0bbadbcd 100644
--- a/data/sirene/raw_siret.py
+++ b/data/sirene/raw_siret.py
@@ -5,12 +5,14 @@
This stage loads the raw data from the French enterprise registry.
"""
+
def configure(context):
context.config("data_path")
context.config("siret_path", "sirene/StockEtablissement_utf8.zip")
context.stage("data.spatial.codes")
+
def execute(context):
# Filter by departement
df_codes = context.stage("data.spatial.codes")
@@ -18,24 +20,27 @@ def execute(context):
df_siret = []
-
COLUMNS_DTYPES = {
- "siren":"int32",
- "siret":"int64",
- "codeCommuneEtablissement":"str",
- "activitePrincipaleEtablissement":"str",
- "trancheEffectifsEtablissement":"str",
- "etatAdministratifEtablissement":"str"
+ "siren": "int32",
+ "siret": "int64",
+ "codeCommuneEtablissement": "str",
+ "activitePrincipaleEtablissement": "str",
+ "trancheEffectifsEtablissement": "str",
+ "etatAdministratifEtablissement": "str",
}
-
- with context.progress(label = "Reading SIRET...") as progress:
- csv = pd.read_csv("%s/%s" % (context.config("data_path"), context.config("siret_path")),
- usecols = COLUMNS_DTYPES.keys(), dtype = COLUMNS_DTYPES,chunksize = 10240)
+
+ with context.progress(label="Reading SIRET...") as progress:
+ csv = pd.read_csv(
+ "%s/%s" % (context.config("data_path"), context.config("siret_path")),
+ usecols=COLUMNS_DTYPES.keys(),
+ dtype=COLUMNS_DTYPES,
+ chunksize=10240,
+ )
for df_chunk in csv:
progress.update(len(df_chunk))
- f = df_chunk["codeCommuneEtablissement"].isna() # Just to get a mask
+ f = df_chunk["codeCommuneEtablissement"].isna() # Just to get a mask
for departement in requested_departements:
f |= df_chunk["codeCommuneEtablissement"].str.startswith(departement)
@@ -46,11 +51,15 @@ def execute(context):
if len(df_chunk) > 0:
df_siret.append(df_chunk)
-
return pd.concat(df_siret)
+
def validate(context):
- if not os.path.exists("%s/%s" % (context.config("data_path"), context.config("siret_path"))):
+ if not os.path.exists(
+ "%s/%s" % (context.config("data_path"), context.config("siret_path"))
+ ):
raise RuntimeError("SIRENE: SIRET data is not available")
- return os.path.getsize("%s/%s" % (context.config("data_path"), context.config("siret_path")))
+ return os.path.getsize(
+ "%s/%s" % (context.config("data_path"), context.config("siret_path"))
+ )
diff --git a/data/spatial/centroid_distances.py b/data/spatial/centroid_distances.py
index b84d6371..ff8507de 100644
--- a/data/spatial/centroid_distances.py
+++ b/data/spatial/centroid_distances.py
@@ -1,18 +1,32 @@
import pandas as pd
+
def configure(context):
context.stage("data.spatial.municipalities")
+
def execute(context):
df = context.stage("data.spatial.municipalities")
records = []
- with context.progress(total = len(df)**2, label = "Calculating centroid distances ...") as progress:
+ with context.progress(
+ total=len(df) ** 2, label="Calculating centroid distances ..."
+ ) as progress:
for origin_id, origin_geometry in zip(df["commune_id"], df["geometry"]):
- for destination_id, destination_geometry in zip(df["commune_id"], df["geometry"]):
- records.append((
- origin_id, destination_id, origin_geometry.centroid.distance(destination_geometry.centroid)
- ))
+ for destination_id, destination_geometry in zip(
+ df["commune_id"], df["geometry"]
+ ):
+ records.append(
+ (
+ origin_id,
+ destination_id,
+ origin_geometry.centroid.distance(
+ destination_geometry.centroid
+ ),
+ )
+ )
progress.update()
- return pd.DataFrame.from_records(records, columns = ["origin_id", "destination_id", "centroid_distance"])
+ return pd.DataFrame.from_records(
+ records, columns=["origin_id", "destination_id", "centroid_distance"]
+ )
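# --- Illustrative sketch (not part of the patch) ---------------------------------
# The nested loop above is quadratic in the number of municipalities. The same
# origin/destination centroid distances can be obtained with one numpy broadcast
# over the centroid coordinates; shown on toy points as a note on the design, not
# as the pipeline's implementation.
import numpy as np
import pandas as pd

commune_ids = np.array(["A", "B", "C"])
centroids = np.array([[0.0, 0.0], [3.0, 4.0], [6.0, 8.0]])  # invented coordinates

deltas = centroids[:, np.newaxis, :] - centroids[np.newaxis, :, :]
distances = np.sqrt((deltas ** 2).sum(axis=-1))  # shape (n, n)

origin_index, destination_index = np.meshgrid(
    np.arange(len(commune_ids)), np.arange(len(commune_ids)), indexing="ij"
)
df_distances = pd.DataFrame({
    "origin_id": commune_ids[origin_index.ravel()],
    "destination_id": commune_ids[destination_index.ravel()],
    "centroid_distance": distances.ravel(),
})
# One row per (origin, destination) pair, e.g. A-B at distance 5.0.
# ----------------------------------------------------------------------------------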
diff --git a/data/spatial/code_changes.py b/data/spatial/code_changes.py
index a65df499..4c80a724 100644
--- a/data/spatial/code_changes.py
+++ b/data/spatial/code_changes.py
@@ -10,43 +10,58 @@
YEAR = 2021
SOURCE = "codes_%d/reference_IRIS_geo%d.xlsx" % (YEAR, YEAR)
+
def configure(context):
context.config("data_path")
context.config("regions", [11])
context.config("departments", [])
+
def execute(context):
# Load IRIS registry
df_modifications = pd.read_excel(
"%s/%s" % (context.config("data_path"), SOURCE),
- skiprows = 5, sheet_name = "Modifications_IRIS"
- )[["IRIS_INI", "IRIS_FIN", "COM_INI", "COM_FIN"]].rename(columns = {
- "IRIS_INI": "initial_iris", "IRIS_FIN": "final_iris",
- "COM_INI": "initial_commune", "COM_FIN": "final_commune"
- })
+ skiprows=5,
+ sheet_name="Modifications_IRIS",
+ )[["IRIS_INI", "IRIS_FIN", "COM_INI", "COM_FIN"]].rename(
+ columns={
+ "IRIS_INI": "initial_iris",
+ "IRIS_FIN": "final_iris",
+ "COM_INI": "initial_commune",
+ "COM_FIN": "final_commune",
+ }
+ )
- df_modifications["initial_iris"] = df_modifications["initial_iris"].astype("category")
+ df_modifications["initial_iris"] = df_modifications["initial_iris"].astype(
+ "category"
+ )
df_modifications["final_iris"] = df_modifications["final_iris"].astype("category")
- df_modifications["initial_commune"] = df_modifications["initial_commune"].astype("category")
- df_modifications["final_commune"] = df_modifications["final_commune"].astype("category")
+ df_modifications["initial_commune"] = df_modifications["initial_commune"].astype(
+ "category"
+ )
+ df_modifications["final_commune"] = df_modifications["final_commune"].astype(
+ "category"
+ )
return df_modifications
+
def validate(context):
if not os.path.exists("%s/%s" % (context.config("data_path"), SOURCE)):
raise RuntimeError("Spatial reference codes are not available")
return os.path.getsize("%s/%s" % (context.config("data_path"), SOURCE))
+
def update(df_changes, level, values):
initial_slot = "initial_%s" % level
final_slot = "final_%s" % level
df_source = df_changes[df_changes[initial_slot].isin(values.unique())]
- dictionary = { k: v for k, v in zip(df_source[initial_slot], df_source[final_slot]) }
+ dictionary = {k: v for k, v in zip(df_source[initial_slot], df_source[final_slot])}
if len(dictionary) > 0:
print("Updating %d deprecated zone identifiers ..." % len(dictionary))
-
+
return values.replace(dictionary)
diff --git a/data/spatial/codes.py b/data/spatial/codes.py
index 38200a14..c7049363 100644
--- a/data/spatial/codes.py
+++ b/data/spatial/codes.py
@@ -8,6 +8,7 @@
departement and région.
"""
+
def configure(context):
context.config("data_path")
@@ -16,19 +17,23 @@ def configure(context):
context.config("codes_path", "codes_2021/reference_IRIS_geo2021.zip")
context.config("codes_xlsx", "reference_IRIS_geo2021.xlsx")
+
def execute(context):
# Load IRIS registry
with zipfile.ZipFile(
- "{}/{}".format(context.config("data_path"), context.config("codes_path"))) as archive:
+ "{}/{}".format(context.config("data_path"), context.config("codes_path"))
+ ) as archive:
with archive.open(context.config("codes_xlsx")) as f:
- df_codes = pd.read_excel(f,
- skiprows = 5, sheet_name = "Emboitements_IRIS"
- )[["CODE_IRIS", "DEPCOM", "DEP", "REG"]].rename(columns = {
- "CODE_IRIS": "iris_id",
- "DEPCOM": "commune_id",
- "DEP": "departement_id",
- "REG": "region_id"
- })
+ df_codes = pd.read_excel(f, skiprows=5, sheet_name="Emboitements_IRIS")[
+ ["CODE_IRIS", "DEPCOM", "DEP", "REG"]
+ ].rename(
+ columns={
+ "CODE_IRIS": "iris_id",
+ "DEPCOM": "commune_id",
+ "DEP": "departement_id",
+ "REG": "region_id",
+ }
+ )
df_codes["iris_id"] = df_codes["iris_id"].astype("category")
df_codes["commune_id"] = df_codes["commune_id"].astype("category")
@@ -47,12 +52,19 @@ def execute(context):
df_codes["iris_id"] = df_codes["iris_id"].cat.remove_unused_categories()
df_codes["commune_id"] = df_codes["commune_id"].cat.remove_unused_categories()
- df_codes["departement_id"] = df_codes["departement_id"].cat.remove_unused_categories()
+ df_codes["departement_id"] = df_codes[
+ "departement_id"
+ ].cat.remove_unused_categories()
return df_codes
+
def validate(context):
- if not os.path.exists("%s/%s" % (context.config("data_path"), context.config("codes_path"))):
+ if not os.path.exists(
+ "%s/%s" % (context.config("data_path"), context.config("codes_path"))
+ ):
raise RuntimeError("Spatial reference codes are not available")
- return os.path.getsize("%s/%s" % (context.config("data_path"), context.config("codes_path")))
+ return os.path.getsize(
+ "%s/%s" % (context.config("data_path"), context.config("codes_path"))
+ )
diff --git a/data/spatial/departments.py b/data/spatial/departments.py
index c055e51e..15b669af 100644
--- a/data/spatial/departments.py
+++ b/data/spatial/departments.py
@@ -7,11 +7,17 @@
Provides the departement zoning system.
"""
+
def configure(context):
context.stage("data.spatial.municipalities")
+
def execute(context):
- df_departements = context.stage("data.spatial.municipalities").dissolve(
- by = "departement_id").drop(columns = ["commune_id", "has_iris"]).reset_index()
+ df_departements = (
+ context.stage("data.spatial.municipalities")
+ .dissolve(by="departement_id")
+ .drop(columns=["commune_id", "has_iris"])
+ .reset_index()
+ )
return df_departements
diff --git a/data/spatial/iris.py b/data/spatial/iris.py
index 8f10457a..56603084 100644
--- a/data/spatial/iris.py
+++ b/data/spatial/iris.py
@@ -8,35 +8,35 @@
Loads the IRIS zoning system.
"""
+
def configure(context):
context.config("data_path")
context.config("iris_path", "iris_2021")
context.stage("data.spatial.codes")
+
def execute(context):
df_codes = context.stage("data.spatial.codes")
- source_path = find_iris("{}/{}".format(context.config("data_path"), context.config("iris_path")))
+ source_path = find_iris(
+ "{}/{}".format(context.config("data_path"), context.config("iris_path"))
+ )
with py7zr.SevenZipFile(source_path) as archive:
- contour_paths = [
- path for path in archive.getnames()
- if "LAMB93" in path
- ]
+ contour_paths = [path for path in archive.getnames() if "LAMB93" in path]
archive.extract(context.path(), contour_paths)
-
+
shp_path = [path for path in contour_paths if path.endswith(".shp")]
if len(shp_path) != 1:
- raise RuntimeError("Cannot find IRIS shapes inside the archive, please report this as an error!")
+ raise RuntimeError(
+ "Cannot find IRIS shapes inside the archive, please report this as an error!"
+ )
- df_iris = gpd.read_file("{}/{}".format(context.path(), shp_path[0]))[[
- "CODE_IRIS", "INSEE_COM", "geometry"
- ]].rename(columns = {
- "CODE_IRIS": "iris_id",
- "INSEE_COM": "commune_id"
- })
+ df_iris = gpd.read_file("{}/{}".format(context.path(), shp_path[0]))[
+ ["CODE_IRIS", "INSEE_COM", "geometry"]
+ ].rename(columns={"CODE_IRIS": "iris_id", "INSEE_COM": "commune_id"})
df_iris.crs = "EPSG:2154"
@@ -44,28 +44,35 @@ def execute(context):
df_iris["commune_id"] = df_iris["commune_id"].astype("category")
# Merge with requested codes and verify integrity
- df_iris = pd.merge(df_iris, df_codes, on = ["iris_id", "commune_id"])
+ df_iris = pd.merge(df_iris, df_codes, on=["iris_id", "commune_id"])
requested_iris = set(df_codes["iris_id"].unique())
merged_iris = set(df_iris["iris_id"].unique())
if requested_iris != merged_iris:
- raise RuntimeError("Some IRIS are missing: %s" % (requested_iris - merged_iris,))
+ raise RuntimeError(
+ "Some IRIS are missing: %s" % (requested_iris - merged_iris,)
+ )
return df_iris
+
def find_iris(path):
candidates = sorted(list(glob.glob("{}/*.7z".format(path))))
if len(candidates) == 0:
raise RuntimeError("IRIS data is not available in {}".format(path))
-
+
if len(candidates) > 1:
- raise RuntimeError("Multiple candidates for IRIS are available in {}".format(path))
-
+ raise RuntimeError(
+ "Multiple candidates for IRIS are available in {}".format(path)
+ )
+
return candidates[0]
def validate(context):
- path = find_iris("{}/{}".format(context.config("data_path"), context.config("iris_path")))
+ path = find_iris(
+ "{}/{}".format(context.config("data_path"), context.config("iris_path"))
+ )
return os.path.getsize(path)
diff --git a/data/spatial/municipalities.py b/data/spatial/municipalities.py
index b46eb696..71c553cf 100644
--- a/data/spatial/municipalities.py
+++ b/data/spatial/municipalities.py
@@ -7,14 +7,20 @@
Provides the municipality zoning system.
"""
+
def configure(context):
context.stage("data.spatial.iris")
+
def execute(context):
df_iris = context.stage("data.spatial.iris")
df_iris["has_iris"] = ~df_iris["iris_id"].astype(str).str.endswith("0000")
- df_municipalities = context.stage("data.spatial.iris").dissolve(
- by = "commune_id").drop(columns = ["iris_id"]).reset_index()
+ df_municipalities = (
+ context.stage("data.spatial.iris")
+ .dissolve(by="commune_id")
+ .drop(columns=["iris_id"])
+ .reset_index()
+ )
return df_municipalities
diff --git a/data/spatial/population.py b/data/spatial/population.py
index 04ab94bb..624df8ce 100644
--- a/data/spatial/population.py
+++ b/data/spatial/population.py
@@ -6,6 +6,7 @@
Loads aggregate population data.
"""
+
def configure(context):
context.config("data_path")
context.stage("data.spatial.codes")
@@ -13,19 +14,28 @@ def configure(context):
context.config("population_xlsx", "base-ic-evol-struct-pop-2019.xlsx")
context.config("population_year", 19)
+
def execute(context):
year = str(context.config("population_year"))
with zipfile.ZipFile(
- "{}/{}".format(context.config("data_path"), context.config("population_path"))) as archive:
+ "{}/{}".format(context.config("data_path"), context.config("population_path"))
+ ) as archive:
with archive.open(context.config("population_xlsx")) as f:
df_population = pd.read_excel(
f,
- skiprows = 5, sheet_name = "IRIS", usecols = ["IRIS", "COM", "DEP", "REG", "P%s_POP" % year]
- ).rename(columns = {
- "IRIS": "iris_id", "COM": "commune_id", "DEP": "departement_id", "REG": "region_id",
- "P%s_POP" % year: "population"
- })
+ skiprows=5,
+ sheet_name="IRIS",
+ usecols=["IRIS", "COM", "DEP", "REG", "P%s_POP" % year],
+ ).rename(
+ columns={
+ "IRIS": "iris_id",
+ "COM": "commune_id",
+ "DEP": "departement_id",
+ "REG": "region_id",
+ "P%s_POP" % year: "population",
+ }
+ )
df_population["iris_id"] = df_population["iris_id"].astype("category")
df_population["commune_id"] = df_population["commune_id"].astype("category")
@@ -34,19 +44,31 @@ def execute(context):
# Merge into code data and verify integrity
df_codes = context.stage("data.spatial.codes")
- df_population = pd.merge(df_population, df_codes, on = ["iris_id", "commune_id", "departement_id", "region_id"])
+ df_population = pd.merge(
+ df_population,
+ df_codes,
+ on=["iris_id", "commune_id", "departement_id", "region_id"],
+ )
requested_iris = set(df_codes["iris_id"].unique())
merged_iris = set(df_population["iris_id"].unique())
if requested_iris != merged_iris:
- raise RuntimeError("Some IRIS are missing: %s" % (requested_iris - merged_iris,))
+ raise RuntimeError(
+ "Some IRIS are missing: %s" % (requested_iris - merged_iris,)
+ )
+
+ return df_population[
+ ["region_id", "departement_id", "commune_id", "iris_id", "population"]
+ ]
- return df_population[["region_id", "departement_id", "commune_id", "iris_id", "population"]]
def validate(context):
- if not os.path.exists("{}/{}".format(context.config("data_path"), context.config("population_path"))):
+ if not os.path.exists(
+ "{}/{}".format(context.config("data_path"), context.config("population_path"))
+ ):
raise RuntimeError("Aggregated census data is not available")
- return os.path.getsize("{}/{}".format(context.config("data_path"), context.config("population_path")))
-
\ No newline at end of file
+ return os.path.getsize(
+ "{}/{}".format(context.config("data_path"), context.config("population_path"))
+ )
diff --git a/data/spatial/urban_type.py b/data/spatial/urban_type.py
index 7e5c0c26..4f80a954 100644
--- a/data/spatial/urban_type.py
+++ b/data/spatial/urban_type.py
@@ -5,56 +5,73 @@
# START Monkey patching openpyxl to parse INSEE file
from openpyxl.styles.colors import WHITE, RGB
+
__old_rgb_set__ = RGB.__set__
+
def __rgb_set_fixed__(self, instance, value):
try:
__old_rgb_set__(self, instance, value)
except ValueError as e:
- if e.args[0] == 'Colors must be aRGB hex values':
+ if e.args[0] == "Colors must be aRGB hex values":
__old_rgb_set__(self, instance, WHITE)
+
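+# Replace openpyxl's RGB descriptor so that invalid colour values in the INSEE
+# workbook fall back to white instead of raising a ValueError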
RGB.__set__ = __rgb_set_fixed__
# END Monkey patching openpyxl
# Loads the input data for the urban type (unité urbaine)
+
def configure(context):
context.stage("data.spatial.municipalities")
context.config("data_path")
context.config("urban_type_path", "urban_type/UU2020_au_01-01-2023.zip")
+
def execute(context):
- with zipfile.ZipFile("{}/{}".format(
- context.config("data_path"), context.config("urban_type_path"))) as archive:
+ with zipfile.ZipFile(
+ "{}/{}".format(context.config("data_path"), context.config("urban_type_path"))
+ ) as archive:
assert len(archive.filelist) == 1
with archive.open(archive.filelist[0]) as f:
- df = pd.read_excel(f, sheet_name = "Composition_communale", skiprows = 5)
-
+ df = pd.read_excel(f, sheet_name="Composition_communale", skiprows=5)
+
df = df[["CODGEO", "STATUT_2017"]].copy()
- df = df.set_axis(["commune_id", "urban_type"], axis = "columns")
+ df = df.set_axis(["commune_id", "urban_type"], axis="columns")
    # Cities that have districts are not detailed in the UU file; only the whole city is mentioned.
    # However, the municipalities file details the districts with their respective INSEE codes.
- cities_with_districts = {"75056": [str(75101 + i) for i in (range(20))], # Paris
- "69123": [str(69001 + i) for i in range(9)], # Lyon
- "13055": [str(13201 + i) for i in range(15)]} # Marseilles
+ cities_with_districts = {
+ "75056": [str(75101 + i) for i in (range(20))], # Paris
+ "69123": [str(69001 + i) for i in range(9)], # Lyon
+ "13055": [str(13201 + i) for i in range(15)],
+ } # Marseilles
    # Replace each line of the UU file corresponding to a city with districts by multiple lines, one per district
for city_code in cities_with_districts:
base_type = df[df["commune_id"] == city_code].iloc[0]["urban_type"]
replacement_codes = cities_with_districts[city_code]
- df = pd.concat([df, pd.DataFrame({
- "commune_id": replacement_codes,
- "urban_type": [base_type] * len(replacement_codes)
- })])
-
+ df = pd.concat(
+ [
+ df,
+ pd.DataFrame(
+ {
+ "commune_id": replacement_codes,
+ "urban_type": [base_type] * len(replacement_codes),
+ }
+ ),
+ ]
+ )
+
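+    # Drop the aggregated city rows now that one row per district has been added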
df = df[~df["commune_id"].isin(cities_with_districts.keys())]
# Clean unités urbaines
- df["urban_type"] = df["urban_type"].replace({"B":"suburb","C":"central_city","I":"isolated_city","H":"none"})
+ df["urban_type"] = df["urban_type"].replace(
+ {"B": "suburb", "C": "central_city", "I": "isolated_city", "H": "none"}
+ )
assert np.all(~df["urban_type"].isna())
df["urban_type"] = df["urban_type"].astype("category")
@@ -66,8 +83,13 @@ def execute(context):
return df
+
def validate(context):
- if not os.path.exists("%s/%s" % (context.config("data_path"), context.config("urban_type_path"))):
+ if not os.path.exists(
+ "%s/%s" % (context.config("data_path"), context.config("urban_type_path"))
+ ):
raise RuntimeError("Urban type data is not available")
- return os.path.getsize("%s/%s" % (context.config("data_path"), context.config("urban_type_path")))
+ return os.path.getsize(
+ "%s/%s" % (context.config("data_path"), context.config("urban_type_path"))
+ )
diff --git a/data/spatial/utils.py b/data/spatial/utils.py
index 29b272d4..048116b4 100644
--- a/data/spatial/utils.py
+++ b/data/spatial/utils.py
@@ -3,20 +3,23 @@
import geopandas as gpd
import pandas as pd
-def to_gpd(context, df, x = "x", y = "y", crs = "EPSG:2154", column = "geometry"):
+
+def to_gpd(context, df, x="x", y="y", crs="EPSG:2154", column="geometry"):
df[column] = [
- geo.Point(*coord) for coord in context.progress(
- zip(df[x], df[y]), total = len(df),
- label = "Converting coordinates"
- )]
- df = gpd.GeoDataFrame(df, crs = "EPSG:2154", geometry = column)
+ geo.Point(*coord)
+ for coord in context.progress(
+ zip(df[x], df[y]), total=len(df), label="Converting coordinates"
+ )
+ ]
+    df = gpd.GeoDataFrame(df, crs=crs, geometry=column)  # use the provided CRS; converted to EPSG:2154 below if needed
if not df.crs == "EPSG:2154":
df = df.to_crs("EPSG:2154")
return df
-def sample_from_shape(shape, count, random, sample_size = None):
+
+def sample_from_shape(shape, count, random, sample_size=None):
points = []
if sample_size is None:
@@ -24,15 +27,16 @@ def sample_from_shape(shape, count, random, sample_size = None):
while len(points) < count:
minx, miny, maxx, maxy = shape.bounds
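+        # Rejection sampling: draw uniform candidates in the bounding box and
+        # keep only those that fall inside the shape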
- candidates = random.random_sample(size = (sample_size, 2))
- candidates[:,0] = minx + candidates[:,0] * (maxx - minx)
- candidates[:,1] = miny + candidates[:,1] * (maxy - miny)
+ candidates = random.random_sample(size=(sample_size, 2))
+ candidates[:, 0] = minx + candidates[:, 0] * (maxx - minx)
+ candidates[:, 1] = miny + candidates[:, 1] * (maxy - miny)
candidates = [geo.Point(*point) for point in candidates]
candidates = [point for point in candidates if shape.contains(point)]
points += candidates
return np.array([(point.x, point.y) for point in points[:count]])
+
def _sample_from_zones(context, args):
attribute_value, random_seed = args
@@ -46,9 +50,12 @@ def _sample_from_zones(context, args):
f = df[attribute] == attribute_value
coordinates = sample_from_shape(zone, np.count_nonzero(f), random)
- return pd.DataFrame(coordinates, columns = ["x", "y"], index = f[f].index)
+ return pd.DataFrame(coordinates, columns=["x", "y"], index=f[f].index)
+
-def sample_from_zones(context, df_zones, df, attribute, random, label = "Sampling coordinates ..."):
+def sample_from_zones(
+ context, df_zones, df, attribute, random, label="Sampling coordinates ..."
+):
assert attribute in df
assert attribute in df_zones
@@ -57,8 +64,14 @@ def sample_from_zones(context, df_zones, df, attribute, random, label = "Samplin
df_result = []
- with context.parallel(dict(df_zones = df_zones, df = df, attribute = attribute)) as parallel:
- for df_partial in context.progress(parallel.imap(_sample_from_zones, zip(unique_values, random_seeds)), label = label, total = len(unique_values)):
+ with context.parallel(
+ dict(df_zones=df_zones, df=df, attribute=attribute)
+ ) as parallel:
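+        # one parallel task per unique attribute value, each with its own random seed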
+ for df_partial in context.progress(
+ parallel.imap(_sample_from_zones, zip(unique_values, random_seeds)),
+ label=label,
+ total=len(unique_values),
+ ):
df_result.append(df_partial)
return pd.concat(df_result)
diff --git a/data/tiles/raw.py b/data/tiles/raw.py
index b42a5d33..7af35c73 100644
--- a/data/tiles/raw.py
+++ b/data/tiles/raw.py
@@ -9,6 +9,7 @@
This stage loads the raw data on French population income, poverty and living standards provided as tiled data.
"""
+
def configure(context):
context.stage("data.spatial.departments")
context.config("data_path")
@@ -62,4 +63,4 @@ def validate(context):
return os.path.getsize(
"{}/{}".format(context.config("data_path"), context.config("tiles_path"))
- )
\ No newline at end of file
+ )
diff --git a/data/vehicles/raw.py b/data/vehicles/raw.py
index 95a9fc31..b726ab63 100644
--- a/data/vehicles/raw.py
+++ b/data/vehicles/raw.py
@@ -9,31 +9,49 @@
https://www.statistiques.developpement-durable.gouv.fr/donnees-sur-le-parc-automobile-francais-au-1er-janvier-2021
"""
+
def configure(context):
context.config("data_path")
context.config("vehicles_path", "vehicles")
context.config("vehicles_year", 2021)
context.stage("data.spatial.codes")
+
def execute(context):
df_codes = context.stage("data.spatial.codes")
    # the downloaded Excel files' metadata actually have a badly formatted ISO datetime
- # https://foss.heptapod.net/openpyxl/openpyxl/-/issues/1659
- with mock.patch.object(excel.ExcelReader, 'read_properties', lambda self: None):
+ # https://foss.heptapod.net/openpyxl/openpyxl/-/issues/1659
+ with mock.patch.object(excel.ExcelReader, "read_properties", lambda self: None):
year = str(context.config("vehicles_year"))
-
- with zipfile.ZipFile("{}/{}/{}".format(context.config("data_path"), context.config("vehicles_path"), "parc_vp_communes.zip")) as archive:
+
+ with zipfile.ZipFile(
+ "{}/{}/{}".format(
+ context.config("data_path"),
+ context.config("vehicles_path"),
+ "parc_vp_communes.zip",
+ )
+ ) as archive:
with archive.open("Parc_VP_Communes_{}.xlsx".format(year)) as f:
df_municipalities = pd.read_excel(f)
- with zipfile.ZipFile("{}/{}/{}".format(context.config("data_path"), context.config("vehicles_path"), "parc_vp_regions.zip")) as archive:
+ with zipfile.ZipFile(
+ "{}/{}/{}".format(
+ context.config("data_path"),
+ context.config("vehicles_path"),
+ "parc_vp_regions.zip",
+ )
+ ) as archive:
with archive.open("Parc_VP_Regions_{}.xlsx".format(year)) as f:
df_regions = pd.read_excel(f)
-
+
df_municipalities["region_id"] = df_municipalities["Code région"].astype("category")
- df_municipalities["departement_id"] = df_municipalities["Code départment"].astype("category")
- df_municipalities["commune_id"] = df_municipalities["Code commune"].astype("category")
+ df_municipalities["departement_id"] = df_municipalities["Code départment"].astype(
+ "category"
+ )
+ df_municipalities["commune_id"] = df_municipalities["Code commune"].astype(
+ "category"
+ )
df_regions["region_id"] = df_regions["Code région"].astype("category")
@@ -41,14 +59,22 @@ def execute(context):
requested_regions = set(df_codes["region_id"].astype(str).unique())
if len(requested_departements) > 0:
- df_municipalities = df_municipalities[df_municipalities["departement_id"].isin(requested_departements)]
+ df_municipalities = df_municipalities[
+ df_municipalities["departement_id"].isin(requested_departements)
+ ]
if len(requested_regions) > 0:
df_regions = df_regions[df_regions["region_id"].isin(requested_regions)]
- df_municipalities["region_id"] = df_municipalities["region_id"].cat.remove_unused_categories()
- df_municipalities["departement_id"] = df_municipalities["departement_id"].cat.remove_unused_categories()
- df_municipalities["commune_id"] = df_municipalities["commune_id"].cat.remove_unused_categories()
+ df_municipalities["region_id"] = df_municipalities[
+ "region_id"
+ ].cat.remove_unused_categories()
+ df_municipalities["departement_id"] = df_municipalities[
+ "departement_id"
+ ].cat.remove_unused_categories()
+ df_municipalities["commune_id"] = df_municipalities[
+ "commune_id"
+ ].cat.remove_unused_categories()
df_regions["region_id"] = df_regions["region_id"].cat.remove_unused_categories()
@@ -65,19 +91,46 @@ def execute(context):
df_regions["fleet"] = df_regions[count_column_name]
df_regions["age"] = df_regions[age_column_name]
- df_vehicle_fleet_counts = df_municipalities.groupby(["region_id", "commune_id", "critair","technology"])["fleet"].sum().reset_index().dropna()
- df_vehicle_age_counts = df_regions.groupby(["region_id", "critair", "technology", "age"])["fleet"].sum().reset_index().dropna()
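+    # Fleet counts are aggregated per commune and vehicle class (critair, technology),
+    # while vehicle age counts are aggregated at the region level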
+ df_vehicle_fleet_counts = (
+ df_municipalities.groupby(["region_id", "commune_id", "critair", "technology"])[
+ "fleet"
+ ]
+ .sum()
+ .reset_index()
+ .dropna()
+ )
+ df_vehicle_age_counts = (
+ df_regions.groupby(["region_id", "critair", "technology", "age"])["fleet"]
+ .sum()
+ .reset_index()
+ .dropna()
+ )
return df_vehicle_fleet_counts, df_vehicle_age_counts
+
def validate(context):
- municipalities_path = "{}/{}/{}".format(context.config("data_path"), context.config("vehicles_path"), "parc_vp_communes.zip")
- regions_path = "{}/{}/{}".format(context.config("data_path"), context.config("vehicles_path"), "parc_vp_regions.zip")
+ municipalities_path = "{}/{}/{}".format(
+ context.config("data_path"),
+ context.config("vehicles_path"),
+ "parc_vp_communes.zip",
+ )
+ regions_path = "{}/{}/{}".format(
+ context.config("data_path"),
+ context.config("vehicles_path"),
+ "parc_vp_regions.zip",
+ )
if not os.path.exists(municipalities_path):
- raise RuntimeError("Municipalities vehicle data is not available at {}".format(municipalities_path))
-
+ raise RuntimeError(
+ "Municipalities vehicle data is not available at {}".format(
+ municipalities_path
+ )
+ )
+
if not os.path.exists(regions_path):
- raise RuntimeError("Regions vehicle data is not available at {}".format(regions_path))
+ raise RuntimeError(
+ "Regions vehicle data is not available at {}".format(regions_path)
+ )
return os.path.getsize(municipalities_path) + os.path.getsize(regions_path)
diff --git a/data/vehicles/types.py b/data/vehicles/types.py
index b10b8c65..f8ef6828 100644
--- a/data/vehicles/types.py
+++ b/data/vehicles/types.py
@@ -4,18 +4,28 @@
This stage creates the various types of vehicles needed for the simulation with HBEFA emissions
"""
-HBEFA_TECH = ['petrol', 'diesel']
-HBEFA_EURO = ['1', '2', '3', '4', '5', '6ab', '6c', '6d']
+HBEFA_TECH = ["petrol", "diesel"]
+HBEFA_EURO = ["1", "2", "3", "4", "5", "6ab", "6c", "6d"]
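+# One vehicle type is generated below for every (technology, Euro norm) combination,
+# in addition to a single "default_car" entry with average HBEFA attributes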
+
def configure(context):
pass
+
def execute(context):
vehicle_types = [
{
- 'type_id': 'default_car', 'nb_seats': 4, 'length': 5.0, 'width': 1.0, 'pce': 1.0, 'mode': "car",
- 'hbefa_cat': "PASSENGER_CAR", 'hbefa_tech': "average", 'hbefa_size': "average", 'hbefa_emission': "average",
+ "type_id": "default_car",
+ "nb_seats": 4,
+ "length": 5.0,
+ "width": 1.0,
+ "pce": 1.0,
+ "mode": "car",
+ "hbefa_cat": "PASSENGER_CAR",
+ "hbefa_tech": "average",
+ "hbefa_size": "average",
+ "hbefa_emission": "average",
}
]
@@ -25,7 +35,7 @@ def execute(context):
id = "car_%s_%s" % (technology, euro)
- if technology == "diesel" and euro in ['2', '3']:
+ if technology == "diesel" and euro in ["2", "3"]:
euro += " (DPF)"
size = ">=2L" if technology == "petrol" else "<1,4L"
@@ -35,10 +45,17 @@ def execute(context):
emission = "PC %s Euro-%s" % (tech, euro)
- vehicle_types.append({
- 'type_id': id, 'length': 7.5, 'width': 1.0,
- 'hbefa_cat': "PASSENGER_CAR", 'hbefa_tech': tech, 'hbefa_size': size, 'hbefa_emission': emission,
- })
+ vehicle_types.append(
+ {
+ "type_id": id,
+ "length": 7.5,
+ "width": 1.0,
+ "hbefa_cat": "PASSENGER_CAR",
+ "hbefa_tech": tech,
+ "hbefa_size": size,
+ "hbefa_emission": emission,
+ }
+ )
df_types = pd.DataFrame.from_records(vehicle_types)
- return df_types
\ No newline at end of file
+ return df_types
diff --git a/docs/verify_data.py b/docs/verify_data.py
index f657dbff..777a6482 100644
--- a/docs/verify_data.py
+++ b/docs/verify_data.py
@@ -12,8 +12,8 @@
"https://www.insee.fr/fr/statistiques/6544333",
"https://www.insee.fr/fr/statistiques/fichier/6544333/RP2019_INDCVIZA_csv.zip",
"https://www.insee.fr/fr/statistiques/fichier/6544333/RP2019_INDCVIZD_csv.zip",
- "https://www.insee.fr/fr/statistiques/fichier/6544333/RP2019_INDCVIZE_csv.zip"
- ]
+ "https://www.insee.fr/fr/statistiques/fichier/6544333/RP2019_INDCVIZE_csv.zip",
+ ],
},
{
"name": "OD Matrices 2019",
@@ -21,30 +21,30 @@
"https://www.insee.fr/fr/statistiques/6456056",
"https://www.insee.fr/fr/statistiques/6456052",
"https://www.insee.fr/fr/statistiques/fichier/6456056/RP2019_mobpro_csv.zip",
- "https://www.insee.fr/fr/statistiques/fichier/6456052/RP2019_mobsco_csv.zip"
- ]
+ "https://www.insee.fr/fr/statistiques/fichier/6456052/RP2019_mobsco_csv.zip",
+ ],
},
{
"name": "Population totals 2019",
"urls": [
"https://www.insee.fr/fr/statistiques/6543200",
- "https://www.insee.fr/fr/statistiques/fichier/6543200/base-ic-evol-struct-pop-2019_csv.zip"
- ]
+ "https://www.insee.fr/fr/statistiques/fichier/6543200/base-ic-evol-struct-pop-2019_csv.zip",
+ ],
},
{
"name": "Filosofi 2019",
"urls": [
"https://www.insee.fr/fr/statistiques/6036907",
"https://www.insee.fr/fr/statistiques/fichier/6036907/indic-struct-distrib-revenu-2019-COMMUNES_csv.zip",
- "https://www.insee.fr/fr/statistiques/fichier/6036907/indic-struct-distrib-revenu-2019-SUPRA_csv.zip"
- ]
+ "https://www.insee.fr/fr/statistiques/fichier/6036907/indic-struct-distrib-revenu-2019-SUPRA_csv.zip",
+ ],
},
{
"name": "BPE 2021",
"urls": [
"https://www.insee.fr/fr/statistiques/3568638",
- "https://www.insee.fr/fr/statistiques/fichier/3568638/bpe21_ensemble_xy_csv.zip"
- ]
+ "https://www.insee.fr/fr/statistiques/fichier/3568638/bpe21_ensemble_xy_csv.zip",
+ ],
},
{
"name": "ENTD 2008",
@@ -55,35 +55,32 @@
"https://www.statistiques.developpement-durable.gouv.fr/sites/default/files/2019-01/Q_menage.csv",
"https://www.statistiques.developpement-durable.gouv.fr/sites/default/files/2019-01/Q_individu.csv",
"https://www.statistiques.developpement-durable.gouv.fr/sites/default/files/2019-01/Q_ind_lieu_teg.csv",
- "https://www.statistiques.developpement-durable.gouv.fr/sites/default/files/2019-01/K_deploc.csv"
- ]
+ "https://www.statistiques.developpement-durable.gouv.fr/sites/default/files/2019-01/K_deploc.csv",
+ ],
},
{
"name": "IRIS 2021",
"urls": [
"https://geoservices.ign.fr/contoursiris",
- "https://wxs.ign.fr/1yhlj2ehpqf3q6dt6a2y7b64/telechargement/inspire/CONTOURS-IRIS-PACK_2021-01$CONTOURS-IRIS_2-1__SHP__FRA_2021-01-01/file/CONTOURS-IRIS_2-1__SHP__FRA_2021-01-01.7z"
- ]
+ "https://wxs.ign.fr/1yhlj2ehpqf3q6dt6a2y7b64/telechargement/inspire/CONTOURS-IRIS-PACK_2021-01$CONTOURS-IRIS_2-1__SHP__FRA_2021-01-01/file/CONTOURS-IRIS_2-1__SHP__FRA_2021-01-01.7z",
+ ],
},
{
"name": "Zoning 2021",
"urls": [
"https://www.insee.fr/fr/information/2017499",
- "https://www.insee.fr/fr/statistiques/fichier/2017499/reference_IRIS_geo2021.zip"
- ]
+ "https://www.insee.fr/fr/statistiques/fichier/2017499/reference_IRIS_geo2021.zip",
+ ],
},
{
"name": "SIRENE",
"urls": [
"https://www.data.gouv.fr/fr/datasets/base-sirene-des-entreprises-et-de-leurs-etablissements-siren-siret/"
- ]
+ ],
},
-
{
"name": "SIRET géolocalisé",
- "urls": [
- "https://adresse.data.gouv.fr/donnees-nationales"
- ]
+ "urls": ["https://adresse.data.gouv.fr/donnees-nationales"],
},
# {
# "name": "BD-TOPO",
@@ -101,8 +98,8 @@
"https://download.geofabrik.de/europe/france/ile-de-france-latest.osm.pbf",
"https://download.geofabrik.de/europe/france/rhone-alpes-latest.osm.pbf",
"https://download.geofabrik.de/europe/france/languedoc-roussillon-latest.osm.pbf",
- "https://download.geofabrik.de/europe/france/midi-pyrenees-latest.osm.pbf"
- ]
+ "https://download.geofabrik.de/europe/france/midi-pyrenees-latest.osm.pbf",
+ ],
},
{
"name": "GTFS",
@@ -115,9 +112,9 @@
"https://download.data.grandlyon.com/files/rdata/tcl_sytral.tcltheorique/GTFS_TCL.ZIP",
"https://eu.ftp.opendatasoft.com/sncf/gtfs/export-ter-gtfs-last.zip",
"https://eu.ftp.opendatasoft.com/sncf/gtfs/export-intercites-gtfs-last.zip",
- "https://ressources.data.sncf.com/explore/dataset/horaires-des-train-voyages-tgvinouiouigo/files/538b55483fac4c1dad455022a0257014/download/"
- ]
- }
+ "https://ressources.data.sncf.com/explore/dataset/horaires-des-train-voyages-tgvinouiouigo/files/538b55483fac4c1dad455022a0257014/download/",
+ ],
+ },
]
# Start testing process
@@ -125,7 +122,7 @@
from urllib.request import urlopen
any_errors = False
-sleep_time = 10 # s
+sleep_time = 10 # s
for test in tests:
print("Testing %s ..." % test["name"])
diff --git a/documentation/info/collect.py b/documentation/info/collect.py
index aca5c7f1..f0df1d72 100644
--- a/documentation/info/collect.py
+++ b/documentation/info/collect.py
@@ -1,6 +1,7 @@
import numpy as np
import json
+
def configure(context):
context.stage("data.hts.comparison")
context.stage("data.census.cleaned")
@@ -12,6 +13,7 @@ def configure(context):
context.stage("data.census.filtered")
context.stage("data.sirene.localized")
+
def execute(context):
info = {}
@@ -26,12 +28,30 @@ def execute(context):
info["census"] = {
"number_of_households": len(df_census["household_id"].unique()),
"number_of_persons": len(df_census),
- "weighted_number_of_households": df_census[["household_id", "weight"]].drop_duplicates("household_id")["weight"].sum(),
+ "weighted_number_of_households": df_census[["household_id", "weight"]]
+ .drop_duplicates("household_id")["weight"]
+ .sum(),
"weighted_number_of_persons": df_census["weight"].sum(),
- "share_of_households_without_iris": np.sum(df_households[~(df_households["iris_id"] != "undefined") & (df_households["commune_id"] != "undefined")]["weight"]) / np.sum(df_households["weight"]),
- "share_of_households_without_commune": np.sum(df_households[~(df_households["iris_id"] != "undefined") & ~(df_households["commune_id"] != "undefined")]["weight"]) / np.sum(df_households["weight"]),
- "filtered_households_share": context.get_info("data.census.filtered", "filtered_households_share"),
- "filtered_persons_share": context.get_info("data.census.filtered", "filtered_persons_share"),
+ "share_of_households_without_iris": np.sum(
+ df_households[
+ ~(df_households["iris_id"] != "undefined")
+ & (df_households["commune_id"] != "undefined")
+ ]["weight"]
+ )
+ / np.sum(df_households["weight"]),
+ "share_of_households_without_commune": np.sum(
+ df_households[
+ ~(df_households["iris_id"] != "undefined")
+ & ~(df_households["commune_id"] != "undefined")
+ ]["weight"]
+ )
+ / np.sum(df_households["weight"]),
+ "filtered_households_share": context.get_info(
+ "data.census.filtered", "filtered_households_share"
+ ),
+ "filtered_persons_share": context.get_info(
+ "data.census.filtered", "filtered_persons_share"
+ ),
}
# OD data
@@ -39,7 +59,7 @@ def execute(context):
info["od"] = {
"number_of_work_commutes": len(df_od_work),
- "number_of_education_commutes": len(df_od_education)
+ "number_of_education_commutes": len(df_od_education),
}
# BPE
@@ -48,8 +68,12 @@ def execute(context):
info["bpe"] = {
"number_of_enterprises": len(df_bpe),
"number_of_shop_enterprises": int(np.sum(df_bpe["activity_type"] == "shop")),
- "number_of_leisure_enterprises": int(np.sum(df_bpe["activity_type"] == "leisure")),
- "number_of_education_enterprises": int(np.sum(df_bpe["activity_type"] == "education")),
+ "number_of_leisure_enterprises": int(
+ np.sum(df_bpe["activity_type"] == "leisure")
+ ),
+ "number_of_education_enterprises": int(
+ np.sum(df_bpe["activity_type"] == "education")
+ ),
"number_of_other_enterprises": int(np.sum(df_bpe["activity_type"] == "other")),
}
@@ -58,28 +82,37 @@ def execute(context):
info["zones"] = {
"number_of_municipalities": len(df_codes["commune_id"].unique()),
- "number_of_iris": len(df_codes["iris_id"].unique())
+ "number_of_iris": len(df_codes["iris_id"].unique()),
}
with open("%s/zones.json" % context.cache_path, "w+") as f:
- json.dump(info, f, indent = True)
+ json.dump(info, f, indent=True)
# Income
df_income_municipality = context.stage("data.income.municipality")
- df_income_municipality = df_income_municipality[(df_income_municipality["attribute"] == "all") & (df_income_municipality["value"] == "all")]
+ df_income_municipality = df_income_municipality[
+ (df_income_municipality["attribute"] == "all")
+ & (df_income_municipality["value"] == "all")
+ ]
df_income_region = context.stage("data.income.region")
info["income"] = {
"minimum_median": int(df_income_municipality["q5"].min()),
"maximum_median": int(df_income_municipality["q5"].max()),
"median_region": int(df_income_region[4]),
- "number_of_incomplete_distributions": int(np.sum(~df_income_municipality["is_missing"] & df_income_municipality["is_imputed"])),
- "number_of_missing_distributions": int(np.sum(df_income_municipality["is_missing"]))
+ "number_of_incomplete_distributions": int(
+ np.sum(
+ ~df_income_municipality["is_missing"]
+ & df_income_municipality["is_imputed"]
+ )
+ ),
+ "number_of_missing_distributions": int(
+ np.sum(df_income_municipality["is_missing"])
+ ),
}
-
# Output
with open("%s/info.json" % context.cache_path, "w+") as f:
- json.dump(info, f, indent = True)
+ json.dump(info, f, indent=True)
return info
diff --git a/documentation/info/tex.py b/documentation/info/tex.py
index 1e42fc00..b0730bae 100644
--- a/documentation/info/tex.py
+++ b/documentation/info/tex.py
@@ -1,29 +1,54 @@
import numpy as np
+
def configure(context):
context.stage("documentation.info.collect")
+
def execute(context):
info = context.stage("documentation.info.collect")
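+    # each entry is a pre-formatted value that ends up in info.tex (see documentation.paper)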
variables = {
- "infoBpeNumberOfEnterprises": "{:,d}".format(info["bpe"]["number_of_enterprises"]),
- "infoBpeNumberOfEducationEnterprises": "{:,d}".format(info["bpe"]["number_of_education_enterprises"]),
- "infoBpeNumberOfShopEnterprises": "{:,d}".format(info["bpe"]["number_of_shop_enterprises"]),
- "infoBpeNumberOfLeisureEnterprises": "{:,d}".format(info["bpe"]["number_of_leisure_enterprises"]),
- "infoBpeNumberOfOtherEnterprises": "{:,d}".format(info["bpe"]["number_of_other_enterprises"]),
-
- "infoZonesNumberOfMunicipalities": "{:,d}".format(info["zones"]["number_of_municipalities"]),
+ "infoBpeNumberOfEnterprises": "{:,d}".format(
+ info["bpe"]["number_of_enterprises"]
+ ),
+ "infoBpeNumberOfEducationEnterprises": "{:,d}".format(
+ info["bpe"]["number_of_education_enterprises"]
+ ),
+ "infoBpeNumberOfShopEnterprises": "{:,d}".format(
+ info["bpe"]["number_of_shop_enterprises"]
+ ),
+ "infoBpeNumberOfLeisureEnterprises": "{:,d}".format(
+ info["bpe"]["number_of_leisure_enterprises"]
+ ),
+ "infoBpeNumberOfOtherEnterprises": "{:,d}".format(
+ info["bpe"]["number_of_other_enterprises"]
+ ),
+ "infoZonesNumberOfMunicipalities": "{:,d}".format(
+ info["zones"]["number_of_municipalities"]
+ ),
"infoZonesNumberOfIris": "{:,d}".format(info["zones"]["number_of_iris"]),
-
- "infoIncomeMinimumMedian": "{:,.0f}".format(1e3 * np.round(info["income"]["minimum_median"] * 1e-3)),
- "infoIncomeMaximumMedian": "{:,.0f}".format(1e3 * np.round(info["income"]["maximum_median"] * 1e-3)),
- "infoIncomeMedianRegion": "{:,.0f}".format(1e3 * np.round(info["income"]["median_region"] * 1e-3)),
- "infoIncomeNumberOfIncompleteDistributions": "{:,d}".format(info["income"]["number_of_incomplete_distributions"]),
- "infoIncomeNumberOfMissingDistributions": "{:,d}".format(info["income"]["number_of_missing_distributions"]),
-
- "infoCensusFilteredHouseholds": "{:.2f}\\%".format(1e2 * info["census"]["filtered_households_share"]),
- "infoCensusFilteredPersons": "{:.2f}\\%".format(1e2 * info["census"]["filtered_persons_share"])
+ "infoIncomeMinimumMedian": "{:,.0f}".format(
+ 1e3 * np.round(info["income"]["minimum_median"] * 1e-3)
+ ),
+ "infoIncomeMaximumMedian": "{:,.0f}".format(
+ 1e3 * np.round(info["income"]["maximum_median"] * 1e-3)
+ ),
+ "infoIncomeMedianRegion": "{:,.0f}".format(
+ 1e3 * np.round(info["income"]["median_region"] * 1e-3)
+ ),
+ "infoIncomeNumberOfIncompleteDistributions": "{:,d}".format(
+ info["income"]["number_of_incomplete_distributions"]
+ ),
+ "infoIncomeNumberOfMissingDistributions": "{:,d}".format(
+ info["income"]["number_of_missing_distributions"]
+ ),
+ "infoCensusFilteredHouseholds": "{:.2f}\\%".format(
+ 1e2 * info["census"]["filtered_households_share"]
+ ),
+ "infoCensusFilteredPersons": "{:.2f}\\%".format(
+ 1e2 * info["census"]["filtered_persons_share"]
+ ),
}
latex = []
diff --git a/documentation/meta_output.py b/documentation/meta_output.py
index e21bfbf6..2937e29c 100644
--- a/documentation/meta_output.py
+++ b/documentation/meta_output.py
@@ -1,6 +1,7 @@
import os, datetime, json
import subprocess as sp
+
def configure(context):
context.stage("matsim.runtime.git")
context.config("output_path")
@@ -9,6 +10,7 @@ def configure(context):
for option in ("sampling_rate", "hts", "random_seed"):
context.config(option)
+
def get_version():
version_path = os.path.dirname(os.path.realpath(__file__))
version_path = os.path.realpath("{}/../VERSION".format(version_path))
@@ -16,28 +18,39 @@ def get_version():
with open(version_path) as f:
return f.read().strip()
+
def get_commit():
root_path = os.path.dirname(os.path.realpath(__file__))
root_path = os.path.realpath("{}/..".format(root_path))
try:
- return sp.check_output(["git", "rev-parse", "HEAD"], cwd = root_path).strip().decode("utf-8")
+ return (
+ sp.check_output(["git", "rev-parse", "HEAD"], cwd=root_path)
+ .strip()
+ .decode("utf-8")
+ )
except sp.CalledProcessError:
return "unknown"
+
def execute(context):
# Write meta information
information = dict(
- sampling_rate = context.config("sampling_rate"),
- hts = context.config("hts"),
- random_seed = context.config("random_seed"),
- created = datetime.datetime.now(datetime.timezone.utc).isoformat(),
- version = get_version(),
- commit = get_commit()
+ sampling_rate=context.config("sampling_rate"),
+ hts=context.config("hts"),
+ random_seed=context.config("random_seed"),
+ created=datetime.datetime.now(datetime.timezone.utc).isoformat(),
+ version=get_version(),
+ commit=get_commit(),
)
- with open("%s/%smeta.json" % (context.config("output_path"), context.config("output_prefix")), "w+") as f:
- json.dump(information, f, indent = 4)
+ with open(
+ "%s/%smeta.json"
+ % (context.config("output_path"), context.config("output_prefix")),
+ "w+",
+ ) as f:
+ json.dump(information, f, indent=4)
+
def validate(context):
return get_version()
diff --git a/documentation/paper.py b/documentation/paper.py
index 1f0a783f..fb630c74 100644
--- a/documentation/paper.py
+++ b/documentation/paper.py
@@ -1,8 +1,9 @@
import shutil
+
def configure(context):
context.stage("documentation.plots.data.hts_comparison")
- #context.stage("documentation.plots.theory.sampling_error")
+ # context.stage("documentation.plots.theory.sampling_error")
context.stage("documentation.plots.monte_carlo")
context.stage("documentation.plots.income")
@@ -20,38 +21,73 @@ def configure(context):
context.config("paper_path")
+
def execute(context):
paper_path = context.config("paper_path")
# Copy plots and tables
mapping = {
- "hts_comparison_distance.pdf": ("documentation.plots.data.hts_comparison", "distance_distribution.pdf"),
- "hts_comparison_age.pdf": ("documentation.plots.data.hts_comparison", "age_distribution.pdf"),
-
- #"theory_sampling_error.pdf": ("documentation.plots.theory.sampling_error", "sampling_error.pdf"),
-
- #"sampling_sample_count.pdf": ("documentation.plots.sampling.sample_count", "sample_count.pdf"),
- #"sampling_error_probability.pdf": ("documentation.plots.sampling.error_probability", "error_probability.pdf"),
+ "hts_comparison_distance.pdf": (
+ "documentation.plots.data.hts_comparison",
+ "distance_distribution.pdf",
+ ),
+ "hts_comparison_age.pdf": (
+ "documentation.plots.data.hts_comparison",
+ "age_distribution.pdf",
+ ),
+ # "theory_sampling_error.pdf": ("documentation.plots.theory.sampling_error", "sampling_error.pdf"),
+ # "sampling_sample_count.pdf": ("documentation.plots.sampling.sample_count", "sample_count.pdf"),
+ # "sampling_error_probability.pdf": ("documentation.plots.sampling.error_probability", "error_probability.pdf"),
"monte_carlo.pdf": ("documentation.plots.monte_carlo", "monte_carlo.pdf"),
- "monte_carlo_table.tex": ("documentation.plots.monte_carlo", "monte_carlo_table.tex"),
-
- "income_distributions.pdf": ("documentation.plots.income", "income_distributions.pdf"),
-
- "socdem_comparison_persons.pdf": ("documentation.plots.sociodemographics.general", "person.pdf"),
- "socdem_comparison_households.pdf": ("documentation.plots.sociodemographics.general", "household.pdf"),
- "socdem_spatial_comparison.pdf": ("documentation.plots.sociodemographics.local", "comparison.pdf"),
- "activity_chain_comparison.pdf": ("documentation.plots.sociodemographics.chains", "activity_chains.pdf"),
-
- "commute_flow_bars.pdf": ("documentation.plots.commute_flow", "commute_flows.pdf"),
- "commute_flow_boxplot.pdf": ("documentation.plots.commute_flow", "commute_flow_boxplot.pdf"),
- "commute_distance_cdf.pdf": ("documentation.plots.commute_distance", "commute_distance_cdf.pdf"),
-
- "secloc_distributions.pdf": ("documentation.plots.secondary_locations", "input_distributions.pdf"),
- "secloc_output.pdf": ("documentation.plots.secondary_locations", "distance_distributions.pdf"),
-
+ "monte_carlo_table.tex": (
+ "documentation.plots.monte_carlo",
+ "monte_carlo_table.tex",
+ ),
+ "income_distributions.pdf": (
+ "documentation.plots.income",
+ "income_distributions.pdf",
+ ),
+ "socdem_comparison_persons.pdf": (
+ "documentation.plots.sociodemographics.general",
+ "person.pdf",
+ ),
+ "socdem_comparison_households.pdf": (
+ "documentation.plots.sociodemographics.general",
+ "household.pdf",
+ ),
+ "socdem_spatial_comparison.pdf": (
+ "documentation.plots.sociodemographics.local",
+ "comparison.pdf",
+ ),
+ "activity_chain_comparison.pdf": (
+ "documentation.plots.sociodemographics.chains",
+ "activity_chains.pdf",
+ ),
+ "commute_flow_bars.pdf": (
+ "documentation.plots.commute_flow",
+ "commute_flows.pdf",
+ ),
+ "commute_flow_boxplot.pdf": (
+ "documentation.plots.commute_flow",
+ "commute_flow_boxplot.pdf",
+ ),
+ "commute_distance_cdf.pdf": (
+ "documentation.plots.commute_distance",
+ "commute_distance_cdf.pdf",
+ ),
+ "secloc_distributions.pdf": (
+ "documentation.plots.secondary_locations",
+ "input_distributions.pdf",
+ ),
+ "secloc_output.pdf": (
+ "documentation.plots.secondary_locations",
+ "distance_distributions.pdf",
+ ),
"income.geojson": ("documentation.shapes", "income.geojson"),
"info.tex": ("documentation.info.tex", "info.tex"),
}
for target, (stage, path) in mapping.items():
- shutil.copy("%s/%s" % (context.path(stage), path), "%s/%s" % (paper_path, target))
+ shutil.copy(
+ "%s/%s" % (context.path(stage), path), "%s/%s" % (paper_path, target)
+ )
diff --git a/documentation/plots/commute_distance.py b/documentation/plots/commute_distance.py
index 7d24fb3b..a11086de 100644
--- a/documentation/plots/commute_distance.py
+++ b/documentation/plots/commute_distance.py
@@ -7,12 +7,18 @@
SAMPLING_RATE = 0.05
+
def configure(context):
- context.stage("analysis.reference.hts.commute_distance", alias = "hts")
- context.stage("analysis.synthesis.commute_distance", dict(sampling_rate = SAMPLING_RATE), alias = "data")
- context.stage("analysis.reference.od.commute_distance", alias = "census")
+ context.stage("analysis.reference.hts.commute_distance", alias="hts")
+ context.stage(
+ "analysis.synthesis.commute_distance",
+ dict(sampling_rate=SAMPLING_RATE),
+ alias="data",
+ )
+ context.stage("analysis.reference.od.commute_distance", alias="census")
context.config("hts")
+
def execute(context):
plotting.setup()
@@ -21,32 +27,57 @@ def execute(context):
census_data = context.stage("census")
hts_name = context.config("hts")
- plt.figure(figsize = plotting.SHORT_FIGSIZE)
+ plt.figure(figsize=plotting.SHORT_FIGSIZE)
parts = [
- { "slot": "work", "linestyle": "-", "title": "Work" },
- { "slot": "education", "linestyle": "--", "title": "Educ." }
+ {"slot": "work", "linestyle": "-", "title": "Work"},
+ {"slot": "education", "linestyle": "--", "title": "Educ."},
]
for part in parts:
slot = part["slot"]
- #plt.plot(census_data[slot]["centroid_distance"] * 1e-3, census_data[slot]["cdf"], color = plotting.COLORS["census"], linestyle = part["linestyle"], linewidth = 1.0)
-
- plt.plot(data[slot]["mean"], data[slot]["cdf"], color = "k", linestyle = part["linestyle"], linewidth = 1.0)
- plt.fill_betweenx(data[slot]["cdf"], data[slot]["min"], data[slot]["max"], color = "k", linewidth = 0.0, alpha = 0.25)
-
- plt.plot(hts_data[slot]["euclidean_distance"] * 1e-3, hts_data[slot]["cdf"], color = plotting.COLORS[hts_name], linestyle = part["linestyle"], linewidth = 1.0)
-
- plt.plot([np.nan], color = "k", linewidth = 1.0, linestyle = part["linestyle"], label = part["title"])
-
- plt.plot([np.nan], color = "k", linewidth = 1.0, label = "Synthetic")
- plt.plot([np.nan], color = plotting.COLORS[hts_name], linewidth = 1.0, label = "HTS")
+ # plt.plot(census_data[slot]["centroid_distance"] * 1e-3, census_data[slot]["cdf"], color = plotting.COLORS["census"], linestyle = part["linestyle"], linewidth = 1.0)
+
+ plt.plot(
+ data[slot]["mean"],
+ data[slot]["cdf"],
+ color="k",
+ linestyle=part["linestyle"],
+ linewidth=1.0,
+ )
+ plt.fill_betweenx(
+ data[slot]["cdf"],
+ data[slot]["min"],
+ data[slot]["max"],
+ color="k",
+ linewidth=0.0,
+ alpha=0.25,
+ )
+
+ plt.plot(
+ hts_data[slot]["euclidean_distance"] * 1e-3,
+ hts_data[slot]["cdf"],
+ color=plotting.COLORS[hts_name],
+ linestyle=part["linestyle"],
+ linewidth=1.0,
+ )
+
+ plt.plot(
+ [np.nan],
+ color="k",
+ linewidth=1.0,
+ linestyle=part["linestyle"],
+ label=part["title"],
+ )
+
+ plt.plot([np.nan], color="k", linewidth=1.0, label="Synthetic")
+ plt.plot([np.nan], color=plotting.COLORS[hts_name], linewidth=1.0, label="HTS")
plt.xlim([0, 40])
plt.ylim([0, 1])
- plt.legend(loc = "best", ncol = 2)
+ plt.legend(loc="best", ncol=2)
plt.grid()
plt.gca().set_axisbelow(True)
diff --git a/documentation/plots/commute_flow.py b/documentation/plots/commute_flow.py
index 2203b3f5..e3fe47f5 100644
--- a/documentation/plots/commute_flow.py
+++ b/documentation/plots/commute_flow.py
@@ -7,12 +7,18 @@
SAMPLING_RATE = 0.05
+
def configure(context):
context.config("hts")
- context.stage("analysis.reference.od.commute_flow", alias = "census")
- context.stage("analysis.reference.hts.commute_flow", alias = "hts")
- context.stage("analysis.synthesis.commute_flow", dict(sampling_rate = SAMPLING_RATE), alias = "data")
+ context.stage("analysis.reference.od.commute_flow", alias="census")
+ context.stage("analysis.reference.hts.commute_flow", alias="hts")
+ context.stage(
+ "analysis.synthesis.commute_flow",
+ dict(sampling_rate=SAMPLING_RATE),
+ alias="data",
+ )
+
def execute(context):
plotting.setup()
@@ -22,11 +28,11 @@ def execute(context):
df_hts, df_correction = context.stage("hts")
# PLOT: Work / education flows
- plt.figure(figsize = plotting.WIDE_FIGSIZE)
+ plt.figure(figsize=plotting.WIDE_FIGSIZE)
figures = [
- { "slot": "work", "title": "Work", "top": 12 },
- { "slot": "education", "title": "Education", "top": 12, "factor": 0.7 }
+ {"slot": "work", "title": "Work", "top": 12},
+ {"slot": "education", "title": "Education", "top": 12, "factor": 0.7},
]
for index, figure in enumerate(figures):
@@ -34,35 +40,77 @@ def execute(context):
slot = figure["slot"]
df = context.stage("data")[slot]
- df = pd.merge(df, df_census[slot].rename(columns = { "weight": "reference" }), on = ["home", slot])
- df = pd.merge(df, df_correction[slot], on = "home")
- df["scaled_reference"] = df["reference"] * (figure["factor"] if "factor" in figure else df["factor"])
+ df = pd.merge(
+ df,
+ df_census[slot].rename(columns={"weight": "reference"}),
+ on=["home", slot],
+ )
+ df = pd.merge(df, df_correction[slot], on="home")
+ df["scaled_reference"] = df["reference"] * (
+ figure["factor"] if "factor" in figure else df["factor"]
+ )
count = figure["top"]
- df = df.sort_values(by = "scaled_reference", ascending = False).head(count)
-
- plt.bar(np.arange(count), df["reference"], width = 0.4, align = "edge", linewidth = 0.5, edgecolor = "white", color = plotting.COLORS["census"], alpha = 0.25)
- plt.bar(np.arange(count), df["scaled_reference"], width = 0.4, label = "Census", align = "edge", linewidth = 0.5, edgecolor = "white", color = plotting.COLORS["census"])
- plt.bar(np.arange(count) + 0.4, df["mean"] / SAMPLING_RATE, width = 0.4, label = "Synthetic", align = "edge", linewidth = 0.5, edgecolor = "white", color = plotting.COLORS["synthetic"])
+ df = df.sort_values(by="scaled_reference", ascending=False).head(count)
+
+ plt.bar(
+ np.arange(count),
+ df["reference"],
+ width=0.4,
+ align="edge",
+ linewidth=0.5,
+ edgecolor="white",
+ color=plotting.COLORS["census"],
+ alpha=0.25,
+ )
+ plt.bar(
+ np.arange(count),
+ df["scaled_reference"],
+ width=0.4,
+ label="Census",
+ align="edge",
+ linewidth=0.5,
+ edgecolor="white",
+ color=plotting.COLORS["census"],
+ )
+ plt.bar(
+ np.arange(count) + 0.4,
+ df["mean"] / SAMPLING_RATE,
+ width=0.4,
+ label="Synthetic",
+ align="edge",
+ linewidth=0.5,
+ edgecolor="white",
+ color=plotting.COLORS["synthetic"],
+ )
for index, (min, max) in enumerate(zip(df["min"].values, df["max"].values)):
index += 0.4 + 0.2
- plt.plot([index, index], [min / SAMPLING_RATE, max / SAMPLING_RATE], color = 'k', linewidth = 1.0)
+ plt.plot(
+ [index, index],
+ [min / SAMPLING_RATE, max / SAMPLING_RATE],
+ color="k",
+ linewidth=1.0,
+ )
plt.grid()
plt.gca().set_axisbelow(True)
- plt.gca().xaxis.grid(alpha = 0.0)
+ plt.gca().xaxis.grid(alpha=0.0)
plt.gca().yaxis.set_major_locator(tck.FixedLocator(np.arange(100) * 1e5))
- plt.gca().yaxis.set_major_formatter(tck.FuncFormatter(lambda x,p: "%d" % (x * 1e-3,)))
+ plt.gca().yaxis.set_major_formatter(
+ tck.FuncFormatter(lambda x, p: "%d" % (x * 1e-3,))
+ )
origins, destinations = df["home"].values, df[figure["slot"]].values
plt.gca().xaxis.set_major_locator(tck.FixedLocator(np.arange(count) + 0.4))
- plt.gca().xaxis.set_major_formatter(tck.FixedFormatter(["%s\n%s" % item for item in zip(origins, destinations)]))
+ plt.gca().xaxis.set_major_formatter(
+ tck.FixedFormatter(["%s\n%s" % item for item in zip(origins, destinations)])
+ )
plt.ylabel("Commuters [x1000]")
- plt.legend(loc = "best")
+ plt.legend(loc="best")
plt.title(figure["title"])
plt.tight_layout()
@@ -70,11 +118,17 @@ def execute(context):
plt.close()
# PLOT: Scatter
- plt.figure(figsize = plotting.SHORT_FIGSIZE)
+ plt.figure(figsize=plotting.SHORT_FIGSIZE)
parts = [
- { "slot": "work", "title": "Work", "marker": ".", "color": "k" },
- { "slot": "education", "title": "Education", "factor": 0.7, "marker": ".", "color": plotting.COLORS[hts_name] }
+ {"slot": "work", "title": "Work", "marker": ".", "color": "k"},
+ {
+ "slot": "education",
+ "title": "Education",
+ "factor": 0.7,
+ "marker": ".",
+ "color": plotting.COLORS[hts_name],
+ },
]
minimum = np.inf
@@ -84,17 +138,33 @@ def execute(context):
slot = part["slot"]
df = context.stage("data")[slot]
- df = pd.merge(df, df_census[slot].rename(columns = { "weight": "reference" }), on = ["home", slot])
- df = pd.merge(df, df_correction[slot], on = "home")
- df["scaled_reference"] = df["reference"] * (part["factor"] if "factor" in part else df["factor"])
-
- plt.loglog(df["scaled_reference"], df["mean"] / SAMPLING_RATE, markersize = 2, marker = part["marker"], color = part["color"], linestyle = "none", label = part["title"])
+ df = pd.merge(
+ df,
+ df_census[slot].rename(columns={"weight": "reference"}),
+ on=["home", slot],
+ )
+ df = pd.merge(df, df_correction[slot], on="home")
+ df["scaled_reference"] = df["reference"] * (
+ part["factor"] if "factor" in part else df["factor"]
+ )
+
+ plt.loglog(
+ df["scaled_reference"],
+ df["mean"] / SAMPLING_RATE,
+ markersize=2,
+ marker=part["marker"],
+ color=part["color"],
+ linestyle="none",
+ label=part["title"],
+ )
minimum = np.minimum(minimum, df["scaled_reference"].min() * 0.9)
maximum = np.maximum(maximum, df["scaled_reference"].max() * 1.1)
x = np.linspace(minimum, maximum, 100)
- plt.fill_between(x, x * 0.8, x * 1.2, color = "k", alpha = 0.2, linewidth = 0.0, label = r"20% Error")
+ plt.fill_between(
+ x, x * 0.8, x * 1.2, color="k", alpha=0.2, linewidth=0.0, label=r"20% Error"
+ )
plt.xlim([minimum, maximum])
plt.ylim([minimum, maximum])
@@ -111,37 +181,60 @@ def execute(context):
plt.close()
# PLOT: Histogram
- plt.figure(figsize = plotting.SHORT_FIGSIZE)
+ plt.figure(figsize=plotting.SHORT_FIGSIZE)
parts = [
- { "slot": "work", "title": "Work" },
- { "slot": "education", "title": "Education", "factor": 0.7 }
+ {"slot": "work", "title": "Work"},
+ {"slot": "education", "title": "Education", "factor": 0.7},
]
for index, part in enumerate(parts):
slot = part["slot"]
df = context.stage("data")[slot]
- df = pd.merge(df, df_census[slot].rename(columns = { "weight": "reference" }), on = ["home", slot])
- df = pd.merge(df, df_correction[slot], on = "home")
- df["scaled_reference"] = df["reference"] * (part["factor"] if "factor" in part else df["factor"])
-
- df["difference"] = 100 * (df["mean"] / SAMPLING_RATE - df["scaled_reference"]) / df["scaled_reference"]
+ df = pd.merge(
+ df,
+ df_census[slot].rename(columns={"weight": "reference"}),
+ on=["home", slot],
+ )
+ df = pd.merge(df, df_correction[slot], on="home")
+ df["scaled_reference"] = df["reference"] * (
+ part["factor"] if "factor" in part else df["factor"]
+ )
+
+ df["difference"] = (
+ 100
+ * (df["mean"] / SAMPLING_RATE - df["scaled_reference"])
+ / df["scaled_reference"]
+ )
min = df["difference"].min()
max = df["difference"].max()
mean = df["difference"].mean()
values = df["difference"].values
- outliers = values # values[(values < min) | (values > max)]
-
- plt.plot([index - 0.2, index + 0.2], [min, min], color = "k", linewidth = 1.0)
- plt.plot([index - 0.2, index + 0.2], [max, max], color = "k", linewidth = 1.0)
- plt.plot([index - 0.2, index + 0.2], [mean, mean], color = "k", linewidth = 1.0, linestyle = ":")
- plt.plot([index - 0.2, index - 0.2], [min, max], color = "k", linewidth = 1.0)
- plt.plot([index + 0.2, index + 0.2], [min, max], color = "k", linewidth = 1.0)
-
- plt.plot([index] * len(outliers), outliers, color = "k", marker = ".", markersize = 2, linestyle = "none")
+ outliers = values # values[(values < min) | (values > max)]
+
+ plt.plot([index - 0.2, index + 0.2], [min, min], color="k", linewidth=1.0)
+ plt.plot([index - 0.2, index + 0.2], [max, max], color="k", linewidth=1.0)
+ plt.plot(
+ [index - 0.2, index + 0.2],
+ [mean, mean],
+ color="k",
+ linewidth=1.0,
+ linestyle=":",
+ )
+ plt.plot([index - 0.2, index - 0.2], [min, max], color="k", linewidth=1.0)
+ plt.plot([index + 0.2, index + 0.2], [min, max], color="k", linewidth=1.0)
+
+ plt.plot(
+ [index] * len(outliers),
+ outliers,
+ color="k",
+ marker=".",
+ markersize=2,
+ linestyle="none",
+ )
plt.gca().xaxis.set_major_locator(tck.FixedLocator([0, 1]))
plt.gca().xaxis.set_major_formatter(tck.FixedFormatter(["Work", "Education"]))
@@ -151,12 +244,14 @@ def execute(context):
plt.xlim([-0.5, 1.5])
plt.grid()
plt.gca().set_axisbelow(True)
- plt.gca().xaxis.grid(alpha = 0.0)
+ plt.gca().xaxis.grid(alpha=0.0)
- plt.bar([np.nan], [np.nan], color = "none", edgecolor = "k", linewidth = 1.0, label = "5% - 95%")
- plt.plot([np.nan], color = "k", linestyle = ":", label = "Mean")
+ plt.bar(
+ [np.nan], [np.nan], color="none", edgecolor="k", linewidth=1.0, label="5% - 95%"
+ )
+ plt.plot([np.nan], color="k", linestyle=":", label="Mean")
- plt.legend(loc = "best")
+ plt.legend(loc="best")
plt.tight_layout()
plt.savefig("%s/commute_flow_boxplot.pdf" % context.path())
diff --git a/documentation/plots/data/hts_chains.py b/documentation/plots/data/hts_chains.py
index 3da51ecf..1b47c36b 100644
--- a/documentation/plots/data/hts_chains.py
+++ b/documentation/plots/data/hts_chains.py
@@ -5,46 +5,74 @@
import matplotlib.ticker as tck
import documentation.plotting as plotting
+
def configure(context):
- context.stage("analysis.reference.hts.chains", { "hts": "egt" }, alias = "egt")
- context.stage("analysis.reference.hts.chains", { "hts": "entd" }, alias = "entd")
+ context.stage("analysis.reference.hts.chains", {"hts": "egt"}, alias="egt")
+ context.stage("analysis.reference.hts.chains", {"hts": "entd"}, alias="entd")
+
def execute(context):
plotting.setup()
marginal = ("age_range", "sex", "chain")
- df_egt = context.stage("egt")[marginal].rename(columns = { "weight": "egt" })
- df_entd = context.stage("entd")[marginal].rename(columns = { "weight": "entd" })
+ df_egt = context.stage("egt")[marginal].rename(columns={"weight": "egt"})
+ df_entd = context.stage("entd")[marginal].rename(columns={"weight": "entd"})
- df = pd.merge(df_egt, df_entd, on = ["age_range", "sex", "chain"])
+ df = pd.merge(df_egt, df_entd, on=["age_range", "sex", "chain"])
df = df[df["age_range"]]
- df_female = df[df["sex"] == "female"].sort_values(by = "egt", ascending = False).head(10)
- df_male = df[df["sex"] == "male"].sort_values(by = "egt", ascending = False).head(10)
+ df_female = (
+ df[df["sex"] == "female"].sort_values(by="egt", ascending=False).head(10)
+ )
+ df_male = df[df["sex"] == "male"].sort_values(by="egt", ascending=False).head(10)
- plt.figure(figsize = plotting.WIDE_FIGSIZE)
+ plt.figure(figsize=plotting.WIDE_FIGSIZE)
- for index, (df, title) in enumerate(zip([df_male, df_female], ["Male (18-40)", "Female (18-40)"])):
+ for index, (df, title) in enumerate(
+ zip([df_male, df_female], ["Male (18-40)", "Female (18-40)"])
+ ):
plt.subplot(1, 2, index + 1)
- plt.bar(np.arange(10), df["egt"], width = 0.4, label = "EGT", align = "edge", linewidth = 0.5, edgecolor = "white", color = plotting.COLORS["egt"])
- plt.bar(np.arange(10) + 0.4, df["entd"], width = 0.4, label = "ENTD", align = "edge", linewidth = 0.5, edgecolor = "white", color = plotting.COLORS["entd"])
+ plt.bar(
+ np.arange(10),
+ df["egt"],
+ width=0.4,
+ label="EGT",
+ align="edge",
+ linewidth=0.5,
+ edgecolor="white",
+ color=plotting.COLORS["egt"],
+ )
+ plt.bar(
+ np.arange(10) + 0.4,
+ df["entd"],
+ width=0.4,
+ label="ENTD",
+ align="edge",
+ linewidth=0.5,
+ edgecolor="white",
+ color=plotting.COLORS["entd"],
+ )
plt.grid()
plt.gca().set_axisbelow(True)
- plt.gca().xaxis.grid(alpha = 0.0)
+ plt.gca().xaxis.grid(alpha=0.0)
plt.gca().yaxis.set_major_locator(tck.FixedLocator(np.arange(100) * 1e5))
- plt.gca().yaxis.set_major_formatter(tck.FuncFormatter(lambda x,p: "%d" % (x * 1e-3,)))
+ plt.gca().yaxis.set_major_formatter(
+ tck.FuncFormatter(lambda x, p: "%d" % (x * 1e-3,))
+ )
plt.gca().xaxis.set_major_locator(tck.FixedLocator(np.arange(10) + 0.4))
- plt.gca().xaxis.set_major_formatter(tck.FuncFormatter(lambda x,p: "\n".join(df["chain"].values[p]).upper()))
+ plt.gca().xaxis.set_major_formatter(
+ tck.FuncFormatter(lambda x, p: "\n".join(df["chain"].values[p]).upper())
+ )
if index == 1:
plt.gca().yaxis.set_major_formatter(tck.FixedFormatter([""] * 1000))
plt.gca().yaxis.get_label().set_visible(False)
- plt.legend(loc = "best", title = title)
+ plt.legend(loc="best", title=title)
if index == 0:
plt.ylabel("Number of persons [x1000]")
diff --git a/documentation/plots/data/hts_comparison.py b/documentation/plots/data/hts_comparison.py
index 4cb2d35e..c98186d0 100644
--- a/documentation/plots/data/hts_comparison.py
+++ b/documentation/plots/data/hts_comparison.py
@@ -6,9 +6,11 @@
import documentation.plotting as plotting
+
def configure(context):
context.stage("data.hts.comparison")
+
def execute(context):
plotting.setup()
@@ -22,20 +24,37 @@ def execute(context):
plt.figure()
- plt.bar(df_distance[f_entd]["distance_class"].values, df_distance[f_entd]["trip_weight"].values / 1e6, width = 0.4, label = "ENTD (Routed)", align = "edge", color = plotting.COLORS["entd"], linewidth = 0.5, edgecolor = "white")
- plt.bar(df_distance[f_egt]["distance_class"].values + 0.4, df_distance[f_egt]["trip_weight"].values / 1e6, width = 0.4, label = "EGT (Euclidean)", align = "edge", color = plotting.COLORS["egt"], linewidth = 0.5, edgecolor = "white")
+ plt.bar(
+ df_distance[f_entd]["distance_class"].values,
+ df_distance[f_entd]["trip_weight"].values / 1e6,
+ width=0.4,
+ label="ENTD (Routed)",
+ align="edge",
+ color=plotting.COLORS["entd"],
+ linewidth=0.5,
+ edgecolor="white",
+ )
+ plt.bar(
+ df_distance[f_egt]["distance_class"].values + 0.4,
+ df_distance[f_egt]["trip_weight"].values / 1e6,
+ width=0.4,
+ label="EGT (Euclidean)",
+ align="edge",
+ color=plotting.COLORS["egt"],
+ linewidth=0.5,
+ edgecolor="white",
+ )
plt.gca().xaxis.set_major_locator(tck.FixedLocator(np.arange(0, 10, 2) + 0.4))
- plt.gca().xaxis.set_major_formatter(tck.FixedFormatter(["<%dkm" % d for d in np.arange(1, 10, 2)]))
-
- plt.gca().annotate(
- r"≥10 km",
- xy = (10.0, 8.0), xycoords = 'data', ha = "right"
+ plt.gca().xaxis.set_major_formatter(
+ tck.FixedFormatter(["<%dkm" % d for d in np.arange(1, 10, 2)])
)
+ plt.gca().annotate(r"≥10 km", xy=(10.0, 8.0), xycoords="data", ha="right")
+
plt.grid()
plt.gca().set_axisbelow(True)
- plt.gca().xaxis.grid(alpha = 0.0)
+ plt.gca().xaxis.grid(alpha=0.0)
plt.xlabel("Trip distance")
plt.ylabel("Number of trips [$10^6$]")
@@ -55,37 +74,70 @@ def execute(context):
plt.figure()
- plt.bar(df_age[f_census]["age_class"].values, df_age[f_census]["person_weight"].values / 1e6, width = 0.25, label = "Census", align = "edge", color = plotting.COLORS["census"], linewidth = 0.5, edgecolor = "white")
- plt.bar(df_age[f_entd]["age_class"].values + 0.25, df_age[f_entd]["person_weight"].values / 1e6, width = 0.25, label = "ENTD", align = "edge", color = plotting.COLORS["entd"], linewidth = 0.5, edgecolor = "white")
- plt.bar(df_age[f_egt]["age_class"].values + 0.5, df_age[f_egt]["person_weight"].values / 1e6, width = 0.25, label = "EGT", align = "edge", color = plotting.COLORS["egt"], linewidth = 0.5, edgecolor = "white")
+ plt.bar(
+ df_age[f_census]["age_class"].values,
+ df_age[f_census]["person_weight"].values / 1e6,
+ width=0.25,
+ label="Census",
+ align="edge",
+ color=plotting.COLORS["census"],
+ linewidth=0.5,
+ edgecolor="white",
+ )
+ plt.bar(
+ df_age[f_entd]["age_class"].values + 0.25,
+ df_age[f_entd]["person_weight"].values / 1e6,
+ width=0.25,
+ label="ENTD",
+ align="edge",
+ color=plotting.COLORS["entd"],
+ linewidth=0.5,
+ edgecolor="white",
+ )
+ plt.bar(
+ df_age[f_egt]["age_class"].values + 0.5,
+ df_age[f_egt]["person_weight"].values / 1e6,
+ width=0.25,
+ label="EGT",
+ align="edge",
+ color=plotting.COLORS["egt"],
+ linewidth=0.5,
+ edgecolor="white",
+ )
plt.gca().xaxis.set_major_locator(tck.FixedLocator(np.arange(1000) + 0.75 / 2))
- plt.gca().xaxis.set_major_formatter(tck.FixedFormatter(["%d0s" % d for d in np.arange(1, 10, 2)]))
+ plt.gca().xaxis.set_major_formatter(
+ tck.FixedFormatter(["%d0s" % d for d in np.arange(1, 10, 2)])
+ )
AGE_BOUNDS = ["<15", "15-29", "30-44", "45-59", "60-74", ">75"]
plt.gca().xaxis.set_major_formatter(tck.FixedFormatter(AGE_BOUNDS))
plt.gca().annotate(
"A",
- xy = (1.5 + 0.5 * 0.25, 2.0), xycoords='data',
- xytext = (1.5 + 0.5 * 0.25, 2.35), textcoords='data',
- arrowprops = { "arrowstyle": "-|>", "facecolor": "black", "linewidth": 0.5 },
- bbox = { "pad": 0.0, "linewidth": 0.0, "facecolor": (1.0, 0.0, 0.0, 0.0) },
- ha = 'center'
+ xy=(1.5 + 0.5 * 0.25, 2.0),
+ xycoords="data",
+ xytext=(1.5 + 0.5 * 0.25, 2.35),
+ textcoords="data",
+ arrowprops={"arrowstyle": "-|>", "facecolor": "black", "linewidth": 0.5},
+ bbox={"pad": 0.0, "linewidth": 0.0, "facecolor": (1.0, 0.0, 0.0, 0.0)},
+ ha="center",
)
plt.gca().annotate(
"B",
- xy = (4.25 + 0.5 * 0.25, 1.3), xycoords='data',
- xytext = (4.25 + 0.5 * 0.25, 1.65), textcoords='data',
- arrowprops = { "arrowstyle": "-|>", "facecolor": "black", "linewidth": 0.5 },
- bbox = { "pad": 0.0, "linewidth": 0.0, "facecolor": (1.0, 0.0, 0.0, 0.0) },
- ha = 'center'
+ xy=(4.25 + 0.5 * 0.25, 1.3),
+ xycoords="data",
+ xytext=(4.25 + 0.5 * 0.25, 1.65),
+ textcoords="data",
+ arrowprops={"arrowstyle": "-|>", "facecolor": "black", "linewidth": 0.5},
+ bbox={"pad": 0.0, "linewidth": 0.0, "facecolor": (1.0, 0.0, 0.0, 0.0)},
+ ha="center",
)
plt.grid()
plt.gca().set_axisbelow(True)
- plt.gca().xaxis.grid(alpha = 0.0)
+ plt.gca().xaxis.grid(alpha=0.0)
plt.xlabel("Age")
plt.ylabel("Number of persons [x$10^6$]")
diff --git a/documentation/plots/income.py b/documentation/plots/income.py
index f24ae137..b5029c5b 100644
--- a/documentation/plots/income.py
+++ b/documentation/plots/income.py
@@ -8,18 +8,24 @@
SAMPLING_RATE = 0.05
+
def configure(context):
context.stage("data.income.municipality")
- context.stage("analysis.synthesis.income", dict(sampling_rate = SAMPLING_RATE), alias = "data")
+ context.stage(
+ "analysis.synthesis.income", dict(sampling_rate=SAMPLING_RATE), alias="data"
+ )
context.stage("analysis.reference.income")
+
def execute(context):
plotting.setup()
# Income imputation
df_income = context.stage("data.income.municipality")
- df_income = df_income[(df_income["attribute"] == "all") & (df_income["value"] == "all")]
+ df_income = df_income[
+ (df_income["attribute"] == "all") & (df_income["value"] == "all")
+ ]
df_imputed = df_income[df_income["is_imputed"]]
plt.figure()
@@ -29,8 +35,21 @@ def execute(context):
plt.plot([minimum, maximum], [minimum, maximum], "k--")
f = ~df_imputed["is_missing"]
- plt.plot(df_imputed[f]["reference_median"] * 1e-3, df_imputed[f]["q5"] * 1e-3, '.', markersize = 3, color = plotting.COLORSET[0], label = "y")
- plt.plot(df_imputed[~f]["reference_median"] * 1e-3, df_imputed[~f]["q5"] * 1e-3, 'x', markersize = 3, color = plotting.COLORSET[1])
+ plt.plot(
+ df_imputed[f]["reference_median"] * 1e-3,
+ df_imputed[f]["q5"] * 1e-3,
+ ".",
+ markersize=3,
+ color=plotting.COLORSET[0],
+ label="y",
+ )
+ plt.plot(
+ df_imputed[~f]["reference_median"] * 1e-3,
+ df_imputed[~f]["q5"] * 1e-3,
+ "x",
+ markersize=3,
+ color=plotting.COLORSET[1],
+ )
plt.xlabel("Reference median income [1000 EUR]")
plt.ylabel("Imputed median income [1000 EUR]")
@@ -47,23 +66,57 @@ def execute(context):
df_reference = context.stage("analysis.reference.income")
f = df_reference["source"] == "entd"
- plt.plot(df_reference[f]["income"].values * 1e-3, df_reference[f]["cdf"].values, color = plotting.COLORS["entd"], label = "ENTD", linewidth = 1.0)
+ plt.plot(
+ df_reference[f]["income"].values * 1e-3,
+ df_reference[f]["cdf"].values,
+ color=plotting.COLORS["entd"],
+ label="ENTD",
+ linewidth=1.0,
+ )
f = df_reference["source"] == "egt"
- plt.plot(df_reference[f]["income"].values * 1e-3, df_reference[f]["cdf"].values, color = plotting.COLORS["egt"], label = "EGT", linewidth = 1.0)
+ plt.plot(
+ df_reference[f]["income"].values * 1e-3,
+ df_reference[f]["cdf"].values,
+ color=plotting.COLORS["egt"],
+ label="EGT",
+ linewidth=1.0,
+ )
f = df_reference["source"] == "filo"
- plt.plot(df_reference[f]["income"].values * 1e-3, df_reference[f]["cdf"].values, color = plotting.COLORS["census"], label = "Tax data", linewidth = 1.0, marker = ".", markersize = 3)
-
- plt.plot(df_data["mean"].values * 1e-3, df_data["cdf"].values, color = "k", label = "Synthetic", linewidth = 1.0, linestyle = ":")
- plt.fill_betweenx(df_data["cdf"].values, df_data["min"].values * 1e-3, df_data["max"].values * 1e-3, color = "k", linewidth = 0.0, alpha = 0.25)
+ plt.plot(
+ df_reference[f]["income"].values * 1e-3,
+ df_reference[f]["cdf"].values,
+ color=plotting.COLORS["census"],
+ label="Tax data",
+ linewidth=1.0,
+ marker=".",
+ markersize=3,
+ )
+
+ plt.plot(
+ df_data["mean"].values * 1e-3,
+ df_data["cdf"].values,
+ color="k",
+ label="Synthetic",
+ linewidth=1.0,
+ linestyle=":",
+ )
+ plt.fill_betweenx(
+ df_data["cdf"].values,
+ df_data["min"].values * 1e-3,
+ df_data["max"].values * 1e-3,
+ color="k",
+ linewidth=0.0,
+ alpha=0.25,
+ )
plt.xlim([0, 60])
plt.xlabel("Household income [1000 EUR]")
plt.ylabel("Cumulative density")
- plt.legend(loc = "lower right")
+ plt.legend(loc="lower right")
plt.grid()
plt.tight_layout()
diff --git a/documentation/plots/language.py b/documentation/plots/language.py
index 03131b97..3d1f9d43 100644
--- a/documentation/plots/language.py
+++ b/documentation/plots/language.py
@@ -1,5 +1,7 @@
-
def get_source(source):
- if source == "egt": return "EGT"
- if source == "entd": return "ENTD"
- if source == "census": return "Census"
+ if source == "egt":
+ return "EGT"
+ if source == "entd":
+ return "ENTD"
+ if source == "census":
+ return "Census"
diff --git a/documentation/plots/matching.py b/documentation/plots/matching.py
index d1777a8e..05ae8b00 100644
--- a/documentation/plots/matching.py
+++ b/documentation/plots/matching.py
@@ -7,19 +7,25 @@
SAMPLING_RATE = 0.05
POPULATION_SAMPLES = 200
+
def configure(context):
- context.stage("analysis.matching", {
- "sampling_rate": SAMPLING_RATE,
- "analysis_populations": POPULATION_SAMPLES,
- }, alias = "data")
+ context.stage(
+ "analysis.matching",
+ {
+ "sampling_rate": SAMPLING_RATE,
+ "analysis_populations": POPULATION_SAMPLES,
+ },
+ alias="data",
+ )
+
def execute(context):
data = context.stage("data")
variables = max(data.keys()) + 1
means = [np.mean(data[v] / data[0]) for v in range(variables)]
- #mins = [np.percentile(data[v] / data[0], 10) for v in range(variables)]
- #maxs = [np.percentile(data[v] / data[0], 90) for v in range(variables)]
+ # mins = [np.percentile(data[v] / data[0], 10) for v in range(variables)]
+ # maxs = [np.percentile(data[v] / data[0], 90) for v in range(variables)]
mins = [np.min(data[v] / data[0]) for v in range(variables)]
maxs = [np.max(data[v] / data[0]) for v in range(variables)]
@@ -28,16 +34,27 @@ def execute(context):
plotting.setup()
plt.figure()
- plt.bar(range(variables), means, color = plotting.COLORS["synthetic"])
+ plt.bar(range(variables), means, color=plotting.COLORS["synthetic"])
for v, min, max in zip(range(variables), mins, maxs):
- plt.plot([v, v,], [min, max], linewidth = 1, label = "90% Conf.", color = "k")
+ plt.plot(
+ [
+ v,
+ v,
+ ],
+ [min, max],
+ linewidth=1,
+ label="90% Conf.",
+ color="k",
+ )
plt.xlabel("Variables")
plt.ylabel("Matching rate")
plt.gca().yaxis.set_major_locator(tck.FixedLocator(np.arange(100) * 0.2))
- plt.gca().yaxis.set_major_formatter(tck.FuncFormatter(lambda x,p: "%d%%" % (100 * x,)))
+ plt.gca().yaxis.set_major_formatter(
+ tck.FuncFormatter(lambda x, p: "%d%%" % (100 * x,))
+ )
plt.tight_layout()
plt.savefig("%s/matching_rate.pdf" % context.path())
diff --git a/documentation/plots/monte_carlo.py b/documentation/plots/monte_carlo.py
index 122f11e4..cd71c018 100644
--- a/documentation/plots/monte_carlo.py
+++ b/documentation/plots/monte_carlo.py
@@ -9,10 +9,12 @@
from analysis.synthesis.statistics.monte_carlo import SAMPLING_RATES
from analysis.synthesis.statistics.monte_carlo import ACQUISITION_SAMPLE_SIZE
+
def configure(context):
context.stage("analysis.reference.census.sociodemographics")
context.stage("analysis.synthesis.statistics.monte_carlo")
+
SELECTED_MARGINAL = ("age_class", "employed")
SELECTED_VALUES = (3, True)
@@ -24,14 +26,13 @@ def configure(context):
"studies",
]
-ADDITIONAL_VALUES = [
- (3, True), (4, True), (5, True)
-]
+ADDITIONAL_VALUES = [(3, True), (4, True), (5, True)]
from analysis.marginals import AGE_CLASS_LABELS
ADDITIONAL_LABELS = AGE_CLASS_LABELS[3:6]
+
def select(reference, data, marginal, values):
df_marginal = data[marginal]
df_reference = reference[marginal]
@@ -44,6 +45,7 @@ def select(reference, data, marginal, values):
return df_marginal, reference_value
+
def execute(context):
data = context.stage("analysis.synthesis.statistics.monte_carlo")
@@ -55,13 +57,15 @@ def execute(context):
values = np.sort(df_marginal[(marginal,)].drop_duplicates().values)
for value in values:
- row = { "marginal": marginal, "value": value }
+ row = {"marginal": marginal, "value": value}
df_value = df_marginal[df_marginal[marginal] == value]
df_value = df_value[df_value["samples"] == ACQUISITION_SAMPLE_SIZE]
assert len(df_value) == len(SAMPLING_RATES)
- probabilities = df_value.sort_values(by = ["sampling_rate", "samples"])["error_probability"].values[:,0]
+ probabilities = df_value.sort_values(by=["sampling_rate", "samples"])[
+ "error_probability"
+ ].values[:, 0]
for sampling_rate, probability in zip(SAMPLING_RATES, probabilities):
row[sampling_rate] = probability
@@ -70,7 +74,7 @@ def execute(context):
df_table = pd.DataFrame.from_records(df_table)
df_table = create_table(df_table)
- df_table.to_latex("%s/monte_carlo_table.tex" % context.path(), escape = False)
+ df_table.to_latex("%s/monte_carlo_table.tex" % context.path(), escape=False)
# Prepare data for plotting
reference = context.stage("analysis.reference.census.sociodemographics")["person"]
@@ -78,52 +82,100 @@ def execute(context):
# Perform plotting
plotting.setup()
- plt.figure(figsize = plotting.WIDE_FIGSIZE)
+ plt.figure(figsize=plotting.WIDE_FIGSIZE)
# ... subplot on nominal stratum values
plt.subplot(1, 2, 1)
- plt.title("(a) Monte Carlo analysis", fontsize = plotting.FONT_SIZE)
+ plt.title("(a) Monte Carlo analysis", fontsize=plotting.FONT_SIZE)
- df_marginal, reference_value = select(reference, data, SELECTED_MARGINAL, SELECTED_VALUES)
+ df_marginal, reference_value = select(
+ reference, data, SELECTED_MARGINAL, SELECTED_VALUES
+ )
assert len(df_marginal) == ACQUISITION_SAMPLE_SIZE * len(SAMPLING_RATES)
display_sampling_rates = [0.001, 0.01, 0.05]
for index, sampling_rate in enumerate([0.001, 0.01, 0.05]):
df_rate = df_marginal[df_marginal["sampling_rate"] == sampling_rate]
- df_rate = df_rate.sort_values(by = "samples")
- plt.fill_between(df_rate["samples"], df_rate[("weight", "q5")], df_rate[("weight", "q95")], alpha = 0.25 + index * 0.2, color = plotting.COLORSET[0], linewidth = 0.0)
-
- plt.plot([1, ACQUISITION_SAMPLE_SIZE], [reference_value] * 2, 'k--', label = "Ref. $y$", linewidth = 1.0)
- plt.plot([1, ACQUISITION_SAMPLE_SIZE], [reference_value * 0.99] * 2, 'k:', label = "1% Err.", linewidth = 1.0)
- plt.plot([1, ACQUISITION_SAMPLE_SIZE], [reference_value * 1.01] * 2, 'k:', linewidth = 1.0)
+ df_rate = df_rate.sort_values(by="samples")
+ plt.fill_between(
+ df_rate["samples"],
+ df_rate[("weight", "q5")],
+ df_rate[("weight", "q95")],
+ alpha=0.25 + index * 0.2,
+ color=plotting.COLORSET[0],
+ linewidth=0.0,
+ )
+
+ plt.plot(
+ [1, ACQUISITION_SAMPLE_SIZE],
+ [reference_value] * 2,
+ "k--",
+ label="Ref. $y$",
+ linewidth=1.0,
+ )
+ plt.plot(
+ [1, ACQUISITION_SAMPLE_SIZE],
+ [reference_value * 0.99] * 2,
+ "k:",
+ label="1% Err.",
+ linewidth=1.0,
+ )
+ plt.plot(
+ [1, ACQUISITION_SAMPLE_SIZE], [reference_value * 1.01] * 2, "k:", linewidth=1.0
+ )
plt.xlabel("Sample size $N$")
plt.ylabel("Stratum weight")
- plt.gca().yaxis.set_major_formatter(tck.FuncFormatter(lambda x, p: "%.2fM" % (x * 1e-6,)))
+ plt.gca().yaxis.set_major_formatter(
+ tck.FuncFormatter(lambda x, p: "%.2fM" % (x * 1e-6,))
+ )
plt.grid()
plt.gca().set_axisbelow(True)
plt.xlim([1, ACQUISITION_SAMPLE_SIZE])
- plt.fill_between([np.nan], [np.nan], [np.nan], color = plotting.COLORSET[0], alpha = 0.25, label = "90% Conf.")
- plt.legend(loc = "lower center", ncol = 2)
+ plt.fill_between(
+ [np.nan],
+ [np.nan],
+ [np.nan],
+ color=plotting.COLORSET[0],
+ alpha=0.25,
+ label="90% Conf.",
+ )
+ plt.legend(loc="lower center", ncol=2)
# ... subplot on nominal stratum values
plt.subplot(1, 2, 2)
- plt.title("(b) Error probability", fontsize = plotting.FONT_SIZE)
+ plt.title("(b) Error probability", fontsize=plotting.FONT_SIZE)
for index, values in enumerate(ADDITIONAL_VALUES):
- df_marginal, reference_value = select(reference, data, SELECTED_MARGINAL, values)
+ df_marginal, reference_value = select(
+ reference, data, SELECTED_MARGINAL, values
+ )
assert len(df_marginal) == ACQUISITION_SAMPLE_SIZE * len(SAMPLING_RATES)
df_max = df_marginal[df_marginal["samples"] == ACQUISITION_SAMPLE_SIZE]
- df_max = df_max.sort_values(by = "sampling_rate")
-
- plt.plot(100 * np.array(SAMPLING_RATES), df_max[("error_probability", "mean")], color = plotting.COLORSET[index], label = "Age %s" % ADDITIONAL_LABELS[index], marker = ".", markersize = 3.0, linewidth = 1.0)
-
- plt.plot([0, 100 * max(SAMPLING_RATES)], [0.9] * 2, 'k:', label = "90% Prob.", linewidth = 1.0)
+ df_max = df_max.sort_values(by="sampling_rate")
+
+ plt.plot(
+ 100 * np.array(SAMPLING_RATES),
+ df_max[("error_probability", "mean")],
+ color=plotting.COLORSET[index],
+ label="Age %s" % ADDITIONAL_LABELS[index],
+ marker=".",
+ markersize=3.0,
+ linewidth=1.0,
+ )
+
+ plt.plot(
+ [0, 100 * max(SAMPLING_RATES)],
+ [0.9] * 2,
+ "k:",
+ label="90% Prob.",
+ linewidth=1.0,
+ )
plt.xlim([0, 100 * max(SAMPLING_RATES)])
plt.ylim([0, 1.0])
@@ -133,14 +185,16 @@ def execute(context):
plt.grid()
plt.gca().set_axisbelow(True)
- plt.legend(loc = "center", ncol = 1)
+ plt.legend(loc="center", ncol=1)
plt.tight_layout()
plt.savefig("%s/monte_carlo.pdf" % context.path())
plt.close()
+
import analysis.marginals
+
def label_row(row):
if row["marginal"] == "age_class":
return analysis.marginals.AGE_CLASS_LABELS[row["value"]]
@@ -157,28 +211,36 @@ def label_row(row):
elif row["marginal"] == "socioprofessional_class":
return analysis.marginals.SOCIOPROFESIONAL_CLASS_LABELS[row["value"]]
+
def bold_probability(x):
if x >= 0.9:
return "\\textbf{%.2f}" % x
else:
return "%.2f" % x
-def create_table(df_table):
- df_table["value"] = df_table.apply(label_row, axis = 1, raw = False)
- df_table["marginal"] = df_table["marginal"].map({
- "age_class": "Age",
- "sex": "Sex",
- "employed": "Employed",
- "studies": "Studies",
- "socioprofessional_class": "Socioprof. Cat."
- })
+def create_table(df_table):
+ df_table["value"] = df_table.apply(label_row, axis=1, raw=False)
+
+ df_table["marginal"] = df_table["marginal"].map(
+ {
+ "age_class": "Age",
+ "sex": "Sex",
+ "employed": "Employed",
+ "studies": "Studies",
+ "socioprofessional_class": "Socioprof. Cat.",
+ }
+ )
for sampling_rate in SAMPLING_RATES:
df_table[sampling_rate] = df_table[sampling_rate].apply(bold_probability)
- df_table.columns = ["Variable", "Stratum"] + ["%.1f%%" % (100 * s,) for s in SAMPLING_RATES]
+ df_table.columns = ["Variable", "Stratum"] + [
+ "%.1f%%" % (100 * s,) for s in SAMPLING_RATES
+ ]
df_table = df_table.set_index(["Variable", "Stratum"])
- df_table.columns = pd.MultiIndex.from_tuples([("Sampling rate $s$", str(s)) for s in SAMPLING_RATES])
+ df_table.columns = pd.MultiIndex.from_tuples(
+ [("Sampling rate $s$", str(s)) for s in SAMPLING_RATES]
+ )
return df_table
diff --git a/documentation/plots/secondary_locations.py b/documentation/plots/secondary_locations.py
index 296fb335..8b70da8d 100644
--- a/documentation/plots/secondary_locations.py
+++ b/documentation/plots/secondary_locations.py
@@ -3,6 +3,7 @@
import matplotlib.ticker as tck
import documentation.plotting as plotting
+
def configure(context):
context.stage("synthesis.population.spatial.secondary.distance_distributions")
@@ -11,17 +12,20 @@ def configure(context):
context.config("hts")
+
def execute(context):
plotting.setup()
hts_name = context.config("hts")
# PLOT: Input distributions
- distributions = context.stage("synthesis.population.spatial.secondary.distance_distributions")
+ distributions = context.stage(
+ "synthesis.population.spatial.secondary.distance_distributions"
+ )
plt.figure()
modes = list(context.stage("analysis.reference.hts.mode_distances").keys())
- #modes = ["car", "car_passenger", "pt", "bike", "walk"]
+ # modes = ["car", "car_passenger", "pt", "bike", "walk"]
for index, mode in enumerate(modes):
mode_distribution = distributions[mode]
@@ -36,21 +40,40 @@ def execute(context):
weights = distribution["weights"] / np.sum(distribution["weights"])
means.append(np.sum(weights * distribution["values"]))
- q10.append(distribution["values"][np.count_nonzero(distribution["cdf"] < 0.1)])
- q90.append(distribution["values"][np.count_nonzero(distribution["cdf"] < 0.9)])
+ q10.append(
+ distribution["values"][np.count_nonzero(distribution["cdf"] < 0.1)]
+ )
+ q90.append(
+ distribution["values"][np.count_nonzero(distribution["cdf"] < 0.9)]
+ )
if mode in ("car", "pt"):
- plt.fill_between([0.0] + list(bounds), q10, q90, color = plotting.COLORSET5[index], alpha = 0.25, linewidth = 0.0)
-
- plt.plot([0.0] + list(bounds), means, label = "%s (%d)" % (plotting.MODE_LABELS[mode], len(bounds)), linewidth = 1.0, marker = ".", markersize = 3, color = plotting.COLORSET5[index])
+ plt.fill_between(
+ [0.0] + list(bounds),
+ q10,
+ q90,
+ color=plotting.COLORSET5[index],
+ alpha=0.25,
+ linewidth=0.0,
+ )
+
+ plt.plot(
+ [0.0] + list(bounds),
+ means,
+ label="%s (%d)" % (plotting.MODE_LABELS[mode], len(bounds)),
+ linewidth=1.0,
+ marker=".",
+ markersize=3,
+ color=plotting.COLORSET5[index],
+ )
plt.gca().xaxis.set_major_locator(tck.FixedLocator(np.arange(100) * 60 * 20))
- plt.gca().xaxis.set_major_formatter(tck.FuncFormatter(lambda x,p: str(x // 60)))
+ plt.gca().xaxis.set_major_formatter(tck.FuncFormatter(lambda x, p: str(x // 60)))
plt.gca().yaxis.set_major_locator(tck.FixedLocator(np.arange(100) * 5 * 1000))
- plt.gca().yaxis.set_major_formatter(tck.FuncFormatter(lambda x,p: str(x // 1000)))
+ plt.gca().yaxis.set_major_formatter(tck.FuncFormatter(lambda x, p: str(x // 1000)))
- plt.legend(loc = "upper left")
+ plt.legend(loc="upper left")
plt.xlim([0, 90 * 60 if hts_name == "egt" else 50 * 60])
plt.ylim([0, 45 * 1000 if hts_name == "egt" else 25 * 1000])
@@ -67,29 +90,49 @@ def execute(context):
df_synthetic = context.stage("analysis.synthesis.mode_distances")
reference_data = context.stage("analysis.reference.hts.mode_distances")
- plt.figure(figsize = (6.0, 2.5), dpi = 100) # 2.5 * 2.5
+ plt.figure(figsize=(6.0, 2.5), dpi=100) # 2.5 * 2.5
limits = dict(
- car = 20 * 1e3, car_passenger = 20 * 1e3, pt = 20 * 1e3,
- bike = 6 * 1e3, walk = 1 * 1e3
+ car=20 * 1e3, car_passenger=20 * 1e3, pt=20 * 1e3, bike=6 * 1e3, walk=1 * 1e3
)
- modes = ["car", "bike" if "bike" in modes else "walk" ]
+ modes = ["car", "bike" if "bike" in modes else "walk"]
for index, mode in enumerate(modes):
plt.subplot(1, 2, index + 1)
mode_reference = reference_data[mode]
- plt.plot(mode_reference["values"] * 1e-3, mode_reference["cdf"], linestyle = '--', color = "k", linewidth = 1.0, label = "HTS")
+ plt.plot(
+ mode_reference["values"] * 1e-3,
+ mode_reference["cdf"],
+ linestyle="--",
+ color="k",
+ linewidth=1.0,
+ label="HTS",
+ )
df_mode = df_synthetic[df_synthetic["mode"] == mode]
- plt.fill_betweenx(df_mode["cdf"], df_mode["min"]* 1e-3, df_mode["max"] * 1e-3, linewidth = 0.0, color = plotting.COLORS[hts_name], alpha = 0.25, label = "Range")
- plt.plot(df_mode["mean"] * 1e-3, df_mode["cdf"], color = plotting.COLORS[hts_name], linewidth = 1.0, label = "Synthetic")
+ plt.fill_betweenx(
+ df_mode["cdf"],
+ df_mode["min"] * 1e-3,
+ df_mode["max"] * 1e-3,
+ linewidth=0.0,
+ color=plotting.COLORS[hts_name],
+ alpha=0.25,
+ label="Range",
+ )
+ plt.plot(
+ df_mode["mean"] * 1e-3,
+ df_mode["cdf"],
+ color=plotting.COLORS[hts_name],
+ linewidth=1.0,
+ label="Synthetic",
+ )
plt.xlim([0, limits[mode] * 1e-3])
plt.ylim([0, 1])
- plt.title(plotting.MODE_LABELS[mode], fontsize = plotting.FONT_SIZE)
+ plt.title(plotting.MODE_LABELS[mode], fontsize=plotting.FONT_SIZE)
plt.xlabel("Euclidean distance [km]")
plt.grid()
@@ -97,7 +140,7 @@ def execute(context):
plt.ylabel("Cumulative density")
if index % 2 == 1:
- plt.legend(loc = "best")
+ plt.legend(loc="best")
plt.tight_layout()
plt.savefig("%s/distance_distributions.pdf" % context.path())
diff --git a/documentation/plots/sociodemographics/chains.py b/documentation/plots/sociodemographics/chains.py
index 6632e6de..fbed851c 100644
--- a/documentation/plots/sociodemographics/chains.py
+++ b/documentation/plots/sociodemographics/chains.py
@@ -7,16 +7,19 @@
SAMPLING_RATE = 0.05
+
def configure(context):
context.stage("analysis.reference.hts.chains")
context.stage(
"analysis.synthesis.sociodemographics.chains",
- dict(sampling_rate = SAMPLING_RATE), alias = "data"
+ dict(sampling_rate=SAMPLING_RATE),
+ alias="data",
)
context.config("hts")
+
def execute(context):
plotting.setup()
@@ -26,41 +29,76 @@ def execute(context):
# PLOT: Activity chains by sex
marginal = ("age_range", "sex", "chain")
- df = pd.merge(data[marginal], reference[marginal].rename(columns = { "weight": "reference" }))
+ df = pd.merge(
+ data[marginal], reference[marginal].rename(columns={"weight": "reference"})
+ )
df = df[df["age_range"]]
- df_female = df[df["sex"] == "female"].sort_values(by = "reference", ascending = False).head(10)
- df_male = df[df["sex"] == "male"].sort_values(by = "reference", ascending = False).head(10)
+ df_female = (
+ df[df["sex"] == "female"].sort_values(by="reference", ascending=False).head(10)
+ )
+ df_male = (
+ df[df["sex"] == "male"].sort_values(by="reference", ascending=False).head(10)
+ )
- plt.figure(figsize = plotting.WIDE_FIGSIZE)
+ plt.figure(figsize=plotting.WIDE_FIGSIZE)
hts_name = context.config("hts")
- for index, (df, title) in enumerate(zip([df_male, df_female], ["Male (18-40)", "Female (18-40)"])):
+ for index, (df, title) in enumerate(
+ zip([df_male, df_female], ["Male (18-40)", "Female (18-40)"])
+ ):
plt.subplot(1, 2, index + 1)
- plt.bar(np.arange(10), df["reference"], width = 0.4, label = "HTS", align = "edge", linewidth = 0.5, edgecolor = "white", color = plotting.COLORS[hts_name])
- plt.bar(np.arange(10) + 0.4, df["mean"] / SAMPLING_RATE, width = 0.4, label = "Synthetic", align = "edge", linewidth = 0.5, edgecolor = "white", color = plotting.COLORS["synthetic"])
+ plt.bar(
+ np.arange(10),
+ df["reference"],
+ width=0.4,
+ label="HTS",
+ align="edge",
+ linewidth=0.5,
+ edgecolor="white",
+ color=plotting.COLORS[hts_name],
+ )
+ plt.bar(
+ np.arange(10) + 0.4,
+ df["mean"] / SAMPLING_RATE,
+ width=0.4,
+ label="Synthetic",
+ align="edge",
+ linewidth=0.5,
+ edgecolor="white",
+ color=plotting.COLORS["synthetic"],
+ )
for location, (min, max) in enumerate(zip(df["min"].values, df["max"].values)):
location += 0.4 + 0.2
- plt.plot([location, location], [min / SAMPLING_RATE, max / SAMPLING_RATE], "k", linewidth = 1)
+ plt.plot(
+ [location, location],
+ [min / SAMPLING_RATE, max / SAMPLING_RATE],
+ "k",
+ linewidth=1,
+ )
plt.grid()
plt.gca().set_axisbelow(True)
- plt.gca().xaxis.grid(alpha = 0.0)
+ plt.gca().xaxis.grid(alpha=0.0)
if hts_name == "egt":
plt.ylim([0, 3.5e5])
else:
plt.ylim([0, 5e5])
- plt.plot([np.nan], color = "k", linewidth = 1, label = "Range")
+ plt.plot([np.nan], color="k", linewidth=1, label="Range")
plt.gca().yaxis.set_major_locator(tck.FixedLocator(np.arange(100) * 1e5))
- plt.gca().yaxis.set_major_formatter(tck.FuncFormatter(lambda x,p: "%d" % (x * 1e-3,)))
+ plt.gca().yaxis.set_major_formatter(
+ tck.FuncFormatter(lambda x, p: "%d" % (x * 1e-3,))
+ )
plt.gca().xaxis.set_major_locator(tck.FixedLocator(np.arange(10) + 0.4))
- plt.gca().xaxis.set_major_formatter(tck.FuncFormatter(lambda x,p: "\n".join(df["chain"].values[p]).upper()))
+ plt.gca().xaxis.set_major_formatter(
+ tck.FuncFormatter(lambda x, p: "\n".join(df["chain"].values[p]).upper())
+ )
if index == 1:
plt.gca().yaxis.set_major_formatter(tck.FixedFormatter([""] * 1000))
@@ -69,7 +107,7 @@ def execute(context):
handles, labels = plt.gca().get_legend_handles_labels()
handles = [handles[-2], handles[-1], handles[-3]]
labels = [labels[-2], labels[-1], labels[-3]]
- plt.legend(handles = handles, labels = labels, loc = "best", title = title)
+ plt.legend(handles=handles, labels=labels, loc="best", title=title)
if index == 0:
plt.ylabel("Number of persons [x1000]")
diff --git a/documentation/plots/sociodemographics/general.py b/documentation/plots/sociodemographics/general.py
index 869f0604..005451ae 100644
--- a/documentation/plots/sociodemographics/general.py
+++ b/documentation/plots/sociodemographics/general.py
@@ -9,6 +9,7 @@
SAMPLING_RATE = 0.05
+
def configure(context):
context.config("hts")
@@ -17,15 +18,18 @@ def configure(context):
context.stage(
"analysis.synthesis.sociodemographics.general",
- dict(sampling_rate = SAMPLING_RATE), alias = "data"
+ dict(sampling_rate=SAMPLING_RATE),
+ alias="data",
)
+
def get_reference(level, marginal, census, hts):
if (marginal,) in census[level]:
return census[level][(marginal,)]
else:
return hts[level][(marginal,)]
+
def prepare_reference(hts_marginals, census_marginals, level, marginal):
if (marginal,) in census_marginals[level]:
df = census_marginals[level][(marginal,)]
@@ -34,26 +38,34 @@ def prepare_reference(hts_marginals, census_marginals, level, marginal):
df = hts_marginals[level][(marginal,)]
df["reference_source"] = "hts"
- df = df.copy().rename(columns = { marginal: "value", "weight": "reference" })
+ df = df.copy().rename(columns={marginal: "value", "weight": "reference"})
df = df[["value", "reference", "reference_source"]]
- df = df.sort_values(by = "value")
+ df = df.sort_values(by="value")
return df
-def prepare_marginal(data_marginals, hts_marginals, census_marginals, level, marginal, sampling_rate):
- df = data_marginals[level][(marginal,)].copy().rename(columns = { marginal: "value" })
+
+def prepare_marginal(
+ data_marginals, hts_marginals, census_marginals, level, marginal, sampling_rate
+):
+ df = data_marginals[level][(marginal,)].copy().rename(columns={marginal: "value"})
df["attribute"] = marginal
df = df[["attribute", "value", "mean", "min", "max"]]
- df = df.sort_values(by = "value")
+ df = df.sort_values(by="value")
df["mean"] /= sampling_rate
df["min"] /= sampling_rate
df["max"] /= sampling_rate
- df = pd.merge(df, prepare_reference(hts_marginals, census_marginals, level, marginal), on = "value")
+ df = pd.merge(
+ df,
+ prepare_reference(hts_marginals, census_marginals, level, marginal),
+ on="value",
+ )
return df
+
def label(row):
if row["attribute"] == "age_class":
return "Age %s" % analysis.marginals.AGE_CLASS_LABELS[row["value"]]
@@ -77,22 +89,43 @@ def label(row):
return "SC %s" % analysis.marginals.SOCIOPROFESIONAL_CLASS_LABELS[row["value"]]
elif row["attribute"] == "household_size_class":
- return "Household size %s" % analysis.marginals.HOUSEHOLD_SIZE_LABELS[row["value"]]
+ return (
+ "Household size %s" % analysis.marginals.HOUSEHOLD_SIZE_LABELS[row["value"]]
+ )
elif row["attribute"] == "number_of_vehicles_class":
- return "No. vehicles %s" % analysis.marginals.NUMBER_OF_VEHICLES_LABELS[row["value"]]
+ return (
+ "No. vehicles %s"
+ % analysis.marginals.NUMBER_OF_VEHICLES_LABELS[row["value"]]
+ )
elif row["attribute"] == "number_of_bikes_class":
- return "No. bicycles %s" % analysis.marginals.NUMBER_OF_BIKES_LABELS[row["value"]]
+ return (
+ "No. bicycles %s" % analysis.marginals.NUMBER_OF_BIKES_LABELS[row["value"]]
+ )
+
def add_labels(df_figure):
- df_figure["label"] = df_figure.apply(label, axis = 1, raw = False)
+ df_figure["label"] = df_figure.apply(label, axis=1, raw=False)
+
+
+def prepare_data(
+ data_marginals, hts_marginals, census_marginals, level, marginals, sampling_rate
+):
+ return pd.concat(
+ [
+ prepare_marginal(
+ data_marginals,
+ hts_marginals,
+ census_marginals,
+ level,
+ marginal,
+ sampling_rate,
+ )
+ for marginal in marginals
+ ]
+ )
-def prepare_data(data_marginals, hts_marginals, census_marginals, level, marginals, sampling_rate):
- return pd.concat([
- prepare_marginal(data_marginals, hts_marginals, census_marginals, level, marginal, sampling_rate)
- for marginal in marginals
- ])
def reweight_hts(df_figure, hts_marginals, census_marginals, level):
hts_total = hts_marginals[level][tuple()]["weight"].values[0]
@@ -101,6 +134,7 @@ def reweight_hts(df_figure, hts_marginals, census_marginals, level):
f = df_figure["reference_source"] == "hts"
df_figure.loc[f, "reference"] *= census_total / hts_total
+
def execute(context):
plotting.setup()
@@ -110,19 +144,37 @@ def execute(context):
figures = [
dict(
- level = "person", label = "Number of persons", size = (6.0, 5.0),
- marginals = ["age_class", "sex", "employed", "studies", "has_license", "has_pt_subscription", "socioprofessional_class"]
+ level="person",
+ label="Number of persons",
+ size=(6.0, 5.0),
+ marginals=[
+ "age_class",
+ "sex",
+ "employed",
+ "studies",
+ "has_license",
+ "has_pt_subscription",
+ "socioprofessional_class",
+ ],
),
dict(
- level = "household", label = "Number of households", size = plotting.WIDE_FIGSIZE,
- marginals = ["household_size_class", "number_of_vehicles_class", "number_of_bikes_class"]
- )
+ level="household",
+ label="Number of households",
+ size=plotting.WIDE_FIGSIZE,
+ marginals=[
+ "household_size_class",
+ "number_of_vehicles_class",
+ "number_of_bikes_class",
+ ],
+ ),
]
for figure in figures:
- plt.figure(figsize = figure["size"])
+ plt.figure(figsize=figure["size"])
- df_figure = prepare_data(data, hts, census, figure["level"], figure["marginals"], SAMPLING_RATE)
+ df_figure = prepare_data(
+ data, hts, census, figure["level"], figure["marginals"], SAMPLING_RATE
+ )
reweight_hts(df_figure, hts, census, figure["level"])
add_labels(df_figure)
@@ -130,32 +182,80 @@ def execute(context):
locations = np.arange(len(df_figure))
f = (df_figure["reference_source"] == "census").values
- plt.barh(locations[f], df_figure["reference"].values[f], height = 0.4, label = "Census", align = "edge", linewidth = 0.5, edgecolor = "white", color = plotting.COLORS["census"])
- plt.barh(locations[f] + 0.4, df_figure["mean"].values[f], height = 0.4, label = "Synthetic", align = "edge", linewidth = 0.5, edgecolor = "white", color = plotting.COLORS["synthetic"])
+ plt.barh(
+ locations[f],
+ df_figure["reference"].values[f],
+ height=0.4,
+ label="Census",
+ align="edge",
+ linewidth=0.5,
+ edgecolor="white",
+ color=plotting.COLORS["census"],
+ )
+ plt.barh(
+ locations[f] + 0.4,
+ df_figure["mean"].values[f],
+ height=0.4,
+ label="Synthetic",
+ align="edge",
+ linewidth=0.5,
+ edgecolor="white",
+ color=plotting.COLORS["synthetic"],
+ )
f = (df_figure["reference_source"] == "hts").values
hts_name = context.config("hts")
- plt.barh(locations[f], df_figure["reference"].values[f], height = 0.4, label = "HTS", align = "edge", linewidth = 0.5, edgecolor = "white", color = plotting.COLORS[hts_name])
- plt.barh(locations[f] + 0.4, df_figure["mean"].values[f], height = 0.4, label = None, align = "edge", linewidth = 0.5, edgecolor = "white", color = plotting.COLORS["synthetic"])
+ plt.barh(
+ locations[f],
+ df_figure["reference"].values[f],
+ height=0.4,
+ label="HTS",
+ align="edge",
+ linewidth=0.5,
+ edgecolor="white",
+ color=plotting.COLORS[hts_name],
+ )
+ plt.barh(
+ locations[f] + 0.4,
+ df_figure["mean"].values[f],
+ height=0.4,
+ label=None,
+ align="edge",
+ linewidth=0.5,
+ edgecolor="white",
+ color=plotting.COLORS["synthetic"],
+ )
- for index, (min, max) in enumerate(zip(df_figure["min"].values, df_figure["max"].values)):
+ for index, (min, max) in enumerate(
+ zip(df_figure["min"].values, df_figure["max"].values)
+ ):
location = index + 0.4 + 0.2
- plt.plot([min, max], [location, location], "k", linewidth = 1, label = "Range")
+ plt.plot([min, max], [location, location], "k", linewidth=1, label="Range")
plt.gca().yaxis.set_major_locator(tck.FixedLocator(locations + 0.4))
- plt.gca().yaxis.set_major_formatter(tck.FixedFormatter(df_figure["label"].values))
+ plt.gca().yaxis.set_major_formatter(
+ tck.FixedFormatter(df_figure["label"].values)
+ )
if figure["level"] == "person":
- plt.gca().xaxis.set_major_locator(tck.FixedLocator(np.arange(1, 100) * 1e6 * 2))
- plt.gca().xaxis.set_major_formatter(tck.FuncFormatter(lambda x,p: "%dM" % (x / 1e6,)))
+ plt.gca().xaxis.set_major_locator(
+ tck.FixedLocator(np.arange(1, 100) * 1e6 * 2)
+ )
+ plt.gca().xaxis.set_major_formatter(
+ tck.FuncFormatter(lambda x, p: "%dM" % (x / 1e6,))
+ )
if figure["level"] == "household":
- plt.gca().xaxis.set_major_locator(tck.FixedLocator(np.arange(1, 100) * 1e6 * 0.5))
- plt.gca().xaxis.set_major_formatter(tck.FuncFormatter(lambda x,p: "%.1fM" % (x / 1e6,)))
+ plt.gca().xaxis.set_major_locator(
+ tck.FixedLocator(np.arange(1, 100) * 1e6 * 0.5)
+ )
+ plt.gca().xaxis.set_major_formatter(
+ tck.FuncFormatter(lambda x, p: "%.1fM" % (x / 1e6,))
+ )
plt.grid()
plt.gca().set_axisbelow(True)
- plt.gca().yaxis.grid(alpha = 0.0)
+ plt.gca().yaxis.grid(alpha=0.0)
plt.gca().invert_yaxis()
plt.xlabel(figure["label"])
@@ -163,7 +263,7 @@ def execute(context):
handles, labels = plt.gca().get_legend_handles_labels()
handles = [handles[-2], handles[-1], handles[-3], handles[-4]]
labels = [labels[-2], labels[-1], labels[-3], labels[-4]]
- plt.legend(handles = handles, labels = labels, loc = "best")
+ plt.legend(handles=handles, labels=labels, loc="best")
plt.tight_layout()
plt.savefig("%s/%s.pdf" % (context.path(), figure["level"]))
diff --git a/documentation/plots/sociodemographics/local.py b/documentation/plots/sociodemographics/local.py
index 6d831c12..1e0e9584 100644
--- a/documentation/plots/sociodemographics/local.py
+++ b/documentation/plots/sociodemographics/local.py
@@ -9,15 +9,18 @@
SAMPLING_RATE = 0.05
+
def configure(context):
context.stage("analysis.reference.census.sociodemographics")
context.stage(
"analysis.synthesis.sociodemographics.spatial",
- dict(sampling_rate = SAMPLING_RATE), alias = "data"
+ dict(sampling_rate=SAMPLING_RATE),
+ alias="data",
)
-def filter_commune(marginals, commune_id, levels = ["person", "household"]):
+
+def filter_commune(marginals, commune_id, levels=["person", "household"]):
result = {}
for level in levels:
@@ -26,7 +29,7 @@ def filter_commune(marginals, commune_id, levels = ["person", "household"]):
for attributes, df_marginal in marginals[level].items():
if "commune_id" in attributes:
f = df_marginal["commune_id"] == str(commune_id)
- df_marginal = df_marginal[f].drop(columns = ["commune_id"])
+ df_marginal = df_marginal[f].drop(columns=["commune_id"])
attributes = list(attributes)
attributes.remove("commune_id")
@@ -37,6 +40,7 @@ def filter_commune(marginals, commune_id, levels = ["person", "household"]):
return result
+
def execute(context):
plotting.setup()
@@ -44,20 +48,36 @@ def execute(context):
data = context.stage("data")
cases = [
- dict(commune = 75113, title = "13th Arrondissement"),
- dict(commune = 94028, title = "Alfortville"),
+ dict(commune=75113, title="13th Arrondissement"),
+ dict(commune=94028, title="Alfortville"),
]
- plt.figure(figsize = plotting.WIDE_FIGSIZE)
+ plt.figure(figsize=plotting.WIDE_FIGSIZE)
for case_index, case in enumerate(cases):
case_census = filter_commune(census, case["commune"])
case_data = filter_commune(data, case["commune"])
- df_case = pd.concat([
- prepare_data(case_data, case_census, case_census, "household", ["household_size_class"], SAMPLING_RATE),
- prepare_data(case_data, case_census, case_census, "person", ["age_class"], SAMPLING_RATE),
- ])
+ df_case = pd.concat(
+ [
+ prepare_data(
+ case_data,
+ case_census,
+ case_census,
+ "household",
+ ["household_size_class"],
+ SAMPLING_RATE,
+ ),
+ prepare_data(
+ case_data,
+ case_census,
+ case_census,
+ "person",
+ ["age_class"],
+ SAMPLING_RATE,
+ ),
+ ]
+ )
add_labels(df_case)
@@ -67,36 +87,60 @@ def execute(context):
reference_values = df_case["reference"].values
mean_values = df_case["mean"].values
- plt.barh(locations, df_case["reference"].values, height = 0.4, label = "Census", align = "edge", linewidth = 0.5, edgecolor = "white", color = plotting.COLORS["census"])
- plt.barh(locations + 0.4, df_case["mean"].values, height = 0.4, label = "Synthetic", align = "edge", linewidth = 0.5, edgecolor = "white", color = plotting.COLORS["synthetic"])
-
- for index, (min, max) in enumerate(zip(df_case["min"].values, df_case["max"].values)):
+ plt.barh(
+ locations,
+ df_case["reference"].values,
+ height=0.4,
+ label="Census",
+ align="edge",
+ linewidth=0.5,
+ edgecolor="white",
+ color=plotting.COLORS["census"],
+ )
+ plt.barh(
+ locations + 0.4,
+ df_case["mean"].values,
+ height=0.4,
+ label="Synthetic",
+ align="edge",
+ linewidth=0.5,
+ edgecolor="white",
+ color=plotting.COLORS["synthetic"],
+ )
+
+ for index, (min, max) in enumerate(
+ zip(df_case["min"].values, df_case["max"].values)
+ ):
location = index + 0.4 + 0.2
- plt.plot([min, max], [location, location], "k", linewidth = 1, label = "Range")
+ plt.plot([min, max], [location, location], "k", linewidth=1, label="Range")
plt.gca().yaxis.set_major_locator(tck.FixedLocator(locations + 0.4))
if case_index == 0:
- plt.gca().yaxis.set_major_formatter(tck.FixedFormatter(df_case["label"].values))
+ plt.gca().yaxis.set_major_formatter(
+ tck.FixedFormatter(df_case["label"].values)
+ )
else:
plt.gca().yaxis.set_major_formatter(tck.FixedFormatter([""] * 100))
- plt.gca().xaxis.set_major_formatter(tck.FuncFormatter(lambda x,p: "%dk" % (x // 1000,)))
+ plt.gca().xaxis.set_major_formatter(
+ tck.FuncFormatter(lambda x, p: "%dk" % (x // 1000,))
+ )
plt.grid()
plt.gca().set_axisbelow(True)
- plt.gca().yaxis.grid(alpha = 0.0)
+ plt.gca().yaxis.grid(alpha=0.0)
plt.gca().invert_yaxis()
plt.xlabel("Number of persons / households")
plt.title(case["title"])
- #plt.ylim([len(locations) + 2.5, -0.5])
+ # plt.ylim([len(locations) + 2.5, -0.5])
if case_index == 1:
handles, labels = plt.gca().get_legend_handles_labels()
handles = [handles[-2], handles[-1], handles[-3]]
labels = [labels[-2], labels[-1], labels[-3]]
- plt.legend(handles = handles, labels = labels, loc = (0.05, 0.32), framealpha = 1.0)
+ plt.legend(handles=handles, labels=labels, loc=(0.05, 0.32), framealpha=1.0)
plt.tight_layout()
plt.savefig("%s/comparison.pdf" % (context.path(),))
diff --git a/documentation/plots/sociodemographics/utils.py b/documentation/plots/sociodemographics/utils.py
index 14bfcb45..849aacb6 100644
--- a/documentation/plots/sociodemographics/utils.py
+++ b/documentation/plots/sociodemographics/utils.py
@@ -1,15 +1,20 @@
def create_labels(df, marginals):
- labels = df.apply(lambda x: "%s %s" % (
- marginals[x["marginal"]]["category_label"],
- marginals[x["marginal"]]["label"]
- ), axis = 1)
+ labels = df.apply(
+ lambda x: "%s %s"
+ % (
+ marginals[x["marginal"]]["category_label"],
+ marginals[x["marginal"]]["label"],
+ ),
+ axis=1,
+ )
labels = labels.str.replace("Number of", "No.")
labels = labels.str.replace("Socioprof. Cat.", "SC")
return labels.values
-def filter_marginals(df, marginal_level, marginals, blacklist = set()):
+
+def filter_marginals(df, marginal_level, marginals, blacklist=set()):
df = df[df["marginal_level"] == marginal_level]
df = df[df["marginal"].isin(marginals.keys())]
df = df[~df["marginal"].isin(blacklist)]
diff --git a/documentation/plots/theory/sampling_error.py b/documentation/plots/theory/sampling_error.py
index 9dca3897..dc6ae222 100644
--- a/documentation/plots/theory/sampling_error.py
+++ b/documentation/plots/theory/sampling_error.py
@@ -4,6 +4,7 @@
import scipy.stats as stats
import documentation.plotting as plotting
+
def get_count_distribution(ns, w, s):
l, u = np.floor(w), np.ceil(w)
p = w - l
@@ -13,6 +14,7 @@ def get_count_distribution(ns, w, s):
return p * Fu + (1 - p) * Fl
+
def get_error_probability(ws, s, q):
probabilities = []
@@ -25,23 +27,25 @@ def get_error_probability(ws, s, q):
return probabilities
+
def configure(context):
pass
+
def execute(context):
plotting.setup()
q = 0.01
- plt.figure(figsize = plotting.WIDE_FIGSIZE)
+ plt.figure(figsize=plotting.WIDE_FIGSIZE)
for s, color in zip([0.01, 0.1, 0.25], ["#000000", "#777777", "#cccccc"]):
ws = np.linspace(0, 2000, 10000)
probs = get_error_probability(ws, s, q)
- plt.plot(ws, probs, ".", label = "s = %.2f" % s, color = color, markersize = 2)
+ plt.plot(ws, probs, ".", label="s = %.2f" % s, color=color, markersize=2)
- plt.legend(loc = "best")
+ plt.legend(loc="best")
plt.grid()
plt.xlabel("Reference weight")
plt.ylabel("Probability")
diff --git a/documentation/plotting.py b/documentation/plotting.py
index 7900d8de..a23b8102 100644
--- a/documentation/plotting.py
+++ b/documentation/plotting.py
@@ -10,27 +10,28 @@
DPI = 300
FONT_SIZE = 8
-COLORSET = palettable.colorbrewer.qualitative.Set2_4.mpl_colors
-COLORSET5 = palettable.colorbrewer.qualitative.Set2_5.mpl_colors
+COLORSET = palettable.colorbrewer.qualitative.Set2_4.mpl_colors
+COLORSET5 = palettable.colorbrewer.qualitative.Set2_5.mpl_colors
COLORS = {
"census": COLORSET[2],
"entd": COLORSET[0],
"egt": COLORSET[1],
- "synthetic": "#cccccc", #COLORSET[3]
+ "synthetic": "#cccccc", # COLORSET[3]
}
MODE_LABELS = dict(
- car = "Car driver",
- car_passenger = "Car passenger",
- pt = "Public transport",
- bike = "Bicycle",
- walk = "Walking"
+ car="Car driver",
+ car_passenger="Car passenger",
+ pt="Public transport",
+ bike="Bicycle",
+ walk="Walking",
)
+
def setup():
- plt.rc("font", family = "serif", size = FONT_SIZE)
- plt.rc("figure", dpi = DPI, figsize = SHORT_FIGSIZE)
- plt.rc("legend", fontsize = FONT_SIZE, loc = "best", fancybox = False)
- plt.rc("grid", linewidth = 0.5)
- plt.rc("patch", linewidth = 0.5)
- plt.rc("mathtext", fontset = "cm")
+ plt.rc("font", family="serif", size=FONT_SIZE)
+ plt.rc("figure", dpi=DPI, figsize=SHORT_FIGSIZE)
+ plt.rc("legend", fontsize=FONT_SIZE, loc="best", fancybox=False)
+ plt.rc("grid", linewidth=0.5)
+ plt.rc("patch", linewidth=0.5)
+ plt.rc("mathtext", fontset="cm")
diff --git a/documentation/shapes.py b/documentation/shapes.py
index dd92ff63..a57d8ec5 100644
--- a/documentation/shapes.py
+++ b/documentation/shapes.py
@@ -4,28 +4,34 @@
import matplotlib.ticker as tck
import palettable
+
def configure(context):
context.stage("data.income.municipality")
context.stage("data.spatial.municipalities")
context.stage("data.bpe.cleaned")
+
def execute(context):
df_communes = context.stage("data.spatial.municipalities")
# Spatial income distribution
df_income = context.stage("data.income.municipality")
- df_income = df_income[(df_income["attribute"] == "all") & (df_income["value"] == "all")]
- df_income = pd.merge(df_communes, df_income, how = "inner", on = "commune_id")
+ df_income = df_income[
+ (df_income["attribute"] == "all") & (df_income["value"] == "all")
+ ]
+ df_income = pd.merge(df_communes, df_income, how="inner", on="commune_id")
df_income["is_imputed"] = df_income["is_imputed"].astype(int)
df_income["commune_id"] = df_income["commune_id"].astype(str)
df_income["departement_id"] = df_income["departement_id"].astype(str)
- df_income.to_file("%s/income.geojson" % context.cache_path, driver = "GeoJSON")
+ df_income.to_file("%s/income.geojson" % context.cache_path, driver="GeoJSON")
# Enterprises
- df_bpe = context.stage("data.bpe.cleaned")[["enterprise_id", "geometry", "imputed", "commune_id"]].copy()
+ df_bpe = context.stage("data.bpe.cleaned")[
+ ["enterprise_id", "geometry", "imputed", "commune_id"]
+ ].copy()
df_bpe["imputed"] = df_bpe["imputed"].astype(int)
df_bpe["commune_id"] = df_bpe["commune_id"].astype(str)
- df_bpe = df_bpe.iloc[np.random.choice(len(df_bpe), size = 10000, replace = False)]
+ df_bpe = df_bpe.iloc[np.random.choice(len(df_bpe), size=10000, replace=False)]
df_bpe.to_file("%s/bpe.shp" % context.cache_path)
return context.cache_path
diff --git a/matsim/output.py b/matsim/output.py
index 2f616403..520a9a19 100644
--- a/matsim/output.py
+++ b/matsim/output.py
@@ -1,10 +1,11 @@
import shutil
+
def configure(context):
if context.config("run_matsim", True):
# allow disabling performing one run of the simulation
context.stage("matsim.simulation.run")
-
+
context.stage("matsim.simulation.prepare")
context.stage("matsim.runtime.eqasim")
@@ -14,14 +15,14 @@ def configure(context):
need_osm = context.config("export_detailed_network", False)
if need_osm:
context.stage("matsim.scenario.supply.osm")
-
context.stage("documentation.meta_output")
+
def execute(context):
config_path = "%s/%s" % (
context.path("matsim.simulation.prepare"),
- context.stage("matsim.simulation.prepare")
+ context.stage("matsim.simulation.prepare"),
)
file_names = [
@@ -32,23 +33,33 @@ def execute(context):
"%snetwork.xml.gz" % context.config("output_prefix"),
"%stransit_schedule.xml.gz" % context.config("output_prefix"),
"%stransit_vehicles.xml.gz" % context.config("output_prefix"),
- "%sconfig.xml" % context.config("output_prefix")
+ "%sconfig.xml" % context.config("output_prefix"),
]
for name in file_names:
shutil.copy(
"%s/%s" % (context.path("matsim.simulation.prepare"), name),
- "%s/%s" % (context.config("output_path"), name)
+ "%s/%s" % (context.config("output_path"), name),
)
if context.config("export_detailed_network"):
shutil.copy(
- "%s/%s" % (context.path("matsim.scenario.supply.osm"), "detailed_network.csv"),
- "%s/%s" % (context.config("output_path"), "%sdetailed_network.csv" % context.config("output_prefix"))
+ "%s/%s"
+ % (context.path("matsim.scenario.supply.osm"), "detailed_network.csv"),
+ "%s/%s"
+ % (
+ context.config("output_path"),
+ "%sdetailed_network.csv" % context.config("output_prefix"),
+ ),
)
-
+
if context.config("write_jar"):
shutil.copy(
- "%s/%s" % (context.path("matsim.runtime.eqasim"), context.stage("matsim.runtime.eqasim")),
- "%s/%srun.jar" % (context.config("output_path"), context.config("output_prefix"))
+ "%s/%s"
+ % (
+ context.path("matsim.runtime.eqasim"),
+ context.stage("matsim.runtime.eqasim"),
+ ),
+ "%s/%srun.jar"
+ % (context.config("output_path"), context.config("output_prefix")),
)
diff --git a/matsim/runtime/eqasim.py b/matsim/runtime/eqasim.py
index 72e4846e..6a315f67 100644
--- a/matsim/runtime/eqasim.py
+++ b/matsim/runtime/eqasim.py
@@ -9,6 +9,7 @@
DEFAULT_EQASIM_BRANCH = "develop"
DEFAULT_EQASIM_COMMIT = "ece4932"
+
def configure(context):
context.stage("matsim.runtime.git")
context.stage("matsim.runtime.java")
@@ -20,6 +21,7 @@ def configure(context):
context.config("eqasim_repository", "https://github.com/eqasim-org/eqasim-java.git")
context.config("eqasim_path", "")
+
def run(context, command, arguments):
version = context.config("eqasim_version")
@@ -27,10 +29,12 @@ def run(context, command, arguments):
context.stage("matsim.runtime.eqasim")
jar_path = "%s/eqasim-java/ile_de_france/target/ile_de_france-%s.jar" % (
- context.path("matsim.runtime.eqasim"), version
+ context.path("matsim.runtime.eqasim"),
+ version,
)
java.run(context, command, arguments, jar_path)
+
def execute(context):
version = context.config("eqasim_version")
@@ -39,33 +43,61 @@ def execute(context):
# Clone repository and checkout version
branch = context.config("eqasim_branch")
- git.run(context, [
- "clone", "--single-branch", "-b", branch,
- context.config("eqasim_repository"), "eqasim-java"
- ])
+ git.run(
+ context,
+ [
+ "clone",
+ "--single-branch",
+ "-b",
+ branch,
+ context.config("eqasim_repository"),
+ "eqasim-java",
+ ],
+ )
# Select the configured commit or tag
commit = context.config("eqasim_commit")
- git.run(context, [
- "checkout", commit
- ], cwd = "{}/eqasim-java".format(context.path()))
+ git.run(
+ context, ["checkout", commit], cwd="{}/eqasim-java".format(context.path())
+ )
# Build eqasim
- maven.run(context, ["-Pstandalone", "--projects", "ile_de_france", "--also-make", "package", "-DskipTests=true"], cwd = "%s/eqasim-java" % context.path())
-
- if not os.path.exists("{}/eqasim-java/ile_de_france/target/ile_de_france-{}.jar".format(context.path(), version)):
- raise RuntimeError("The JAR was not created correctly. Wrong eqasim_version specified?")
+ maven.run(
+ context,
+ [
+ "-Pstandalone",
+ "--projects",
+ "ile_de_france",
+ "--also-make",
+ "package",
+ "-DskipTests=true",
+ ],
+ cwd="%s/eqasim-java" % context.path(),
+ )
+
+ if not os.path.exists(
+ "{}/eqasim-java/ile_de_france/target/ile_de_france-{}.jar".format(
+ context.path(), version
+ )
+ ):
+ raise RuntimeError(
+ "The JAR was not created correctly. Wrong eqasim_version specified?"
+ )
# Special case: We provide the jar directly. This is mainly used for
# creating input to unit tests of the eqasim-java package.
else:
os.makedirs("%s/eqasim-java/ile_de_france/target" % context.path())
- shutil.copy(context.config("eqasim_path"),
- "%s/eqasim-java/ile_de_france/target/ile_de_france-%s.jar" % (context.path(), version))
+ shutil.copy(
+ context.config("eqasim_path"),
+ "%s/eqasim-java/ile_de_france/target/ile_de_france-%s.jar"
+ % (context.path(), version),
+ )
return "eqasim-java/ile_de_france/target/ile_de_france-%s.jar" % version
+
def validate(context):
path = context.config("eqasim_path")
@@ -74,12 +106,14 @@ def validate(context):
if not os.path.exists(path):
raise RuntimeError("Cannot find eqasim at: %s" % path)
-
+
if context.config("eqasim_tag") is None:
if context.config("eqasim_commit") is None:
raise RuntimeError("Either eqasim commit or tag must be defined")
-
- if (context.config("eqasim_tag") is None) == (context.config("eqasim_commit") is None):
+
+ if (context.config("eqasim_tag") is None) == (
+ context.config("eqasim_commit") is None
+ ):
raise RuntimeError("Eqasim commit and tag must not be defined at the same time")
return os.path.getmtime(path)
diff --git a/matsim/runtime/git.py b/matsim/runtime/git.py
index 08585b00..27e5551f 100644
--- a/matsim/runtime/git.py
+++ b/matsim/runtime/git.py
@@ -1,12 +1,14 @@
import subprocess as sp
import shutil
+
def configure(context):
context.config("git_binary", "git")
-def run(context, arguments = [], cwd = None, catch_output = False):
+
+def run(context, arguments=[], cwd=None, catch_output=False):
"""
- This function calls git.
+ This function calls git.
"""
# Make sure there is a dependency
context.stage("matsim.runtime.git")
@@ -14,28 +16,29 @@ def run(context, arguments = [], cwd = None, catch_output = False):
if cwd is None:
cwd = context.path()
- command_line = [
- shutil.which(context.config("git_binary"))
- ] + arguments
+ command_line = [shutil.which(context.config("git_binary"))] + arguments
if catch_output:
- return sp.check_output(command_line, cwd = cwd).decode("utf-8").strip()
+ return sp.check_output(command_line, cwd=cwd).decode("utf-8").strip()
else:
- return_code = sp.check_call(command_line, cwd = cwd)
+ return_code = sp.check_call(command_line, cwd=cwd)
if not return_code == 0:
raise RuntimeError("Git return code: %d" % return_code)
+
def validate(context):
if shutil.which(context.config("git_binary")) in ["", None]:
- raise RuntimeError("Cannot find git binary at: %s" % context.config("git_binary"))
+ raise RuntimeError(
+ "Cannot find git binary at: %s" % context.config("git_binary")
+ )
- if not b"2." in sp.check_output([
- shutil.which(context.config("git_binary")),
- "--version"
- ], stderr = sp.STDOUT):
+ if not b"2." in sp.check_output(
+ [shutil.which(context.config("git_binary")), "--version"], stderr=sp.STDOUT
+ ):
print("WARNING! Git of at least version 2.x.x is recommended!")
+
def execute(context):
pass
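
Note (not part of the patch): besides plain invocations, run() above can capture git's output when catch_output=True is passed. A hedged sketch of a call site, assuming a stage that already depends on matsim.runtime.git and an already cloned repository:

import matsim.runtime.git as git

def execute(context):
    # Read the commit that is currently checked out in the cloned repository
    commit = git.run(
        context,
        ["rev-parse", "HEAD"],
        cwd="%s/eqasim-java" % context.path(),
        catch_output=True,
    )
    print("Checked out commit:", commit)
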
diff --git a/matsim/runtime/java.py b/matsim/runtime/java.py
index 1cfe5702..94c0f1a5 100644
--- a/matsim/runtime/java.py
+++ b/matsim/runtime/java.py
@@ -1,16 +1,27 @@
import subprocess as sp
import os, shutil
+
def configure(context):
context.config("java_binary", "java")
context.config("java_memory", "50G")
-def run(context, entry_point, arguments = [], class_path = None, vm_arguments = [], cwd = None, memory = None, mode = "raise"):
+
+def run(
+ context,
+ entry_point,
+ arguments=[],
+ class_path=None,
+ vm_arguments=[],
+ cwd=None,
+ memory=None,
+ mode="raise",
+):
"""
- This function calls java code. There are three modes:
- - return_code: Returns the return code of the Java call
- - output: Returns the output of the Java call
- - raise (default): Raises an exception if the return code is not zero
+ This function calls java code. There are three modes:
+ - return_code: Returns the return code of the Java call
+ - output: Returns the output of the Java call
+ - raise (default): Raises an exception if the return code is not zero
"""
# Make sure there is a dependency
context.stage("matsim.runtime.java")
@@ -25,7 +36,7 @@ def run(context, entry_point, arguments = [], class_path = None, vm_arguments =
vm_arguments = [
"-Xmx" + memory,
"-Djava.io.tmpdir=%s" % temp_path,
- "-Dmatsim.useLocalDtds=true"
+ "-Dmatsim.useLocalDtds=true",
] + vm_arguments
# Prepare classpath
@@ -37,38 +48,43 @@ def run(context, entry_point, arguments = [], class_path = None, vm_arguments =
cwd = context.path()
# Prepare command line
- command_line = [
- shutil.which(context.config("java_binary")),
- "-cp", class_path
- ] + vm_arguments + [
- entry_point
- ] + arguments
+ command_line = (
+ [shutil.which(context.config("java_binary")), "-cp", class_path]
+ + vm_arguments
+ + [entry_point]
+ + arguments
+ )
command_line = list(map(str, command_line))
print("Executing java:", " ".join(command_line))
if mode == "raise" or mode == "return_code":
- return_code = sp.check_call(command_line, cwd = cwd)
+ return_code = sp.check_call(command_line, cwd=cwd)
if not return_code == 0:
raise RuntimeError("Java return code: %d" % return_code)
return return_code
elif mode == "output":
- return sp.check_output(command_line, cwd = cwd)
+ return sp.check_output(command_line, cwd=cwd)
else:
- raise RuntimeError("Mode is expected to be one of 'raise', 'return_code' or 'output'")
+ raise RuntimeError(
+ "Mode is expected to be one of 'raise', 'return_code' or 'output'"
+ )
+
def validate(context):
if shutil.which(context.config("java_binary")) in ["", None]:
- raise RuntimeError("Cannot find Java binary at: %s" % context.config("java_binary"))
+ raise RuntimeError(
+ "Cannot find Java binary at: %s" % context.config("java_binary")
+ )
- if not b"11" in sp.check_output([
- shutil.which(context.config("java_binary")),
- "-version"
- ], stderr = sp.STDOUT):
+ if not b"11" in sp.check_output(
+ [shutil.which(context.config("java_binary")), "-version"], stderr=sp.STDOUT
+ ):
print("WARNING! A Java JDK of at least version 11 is recommended.")
+
def execute(context):
pass
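
Note (not part of the patch): run() above supports the three modes described in its docstring. A hedged sketch of a call site, where org.example.Tool and tool.jar are placeholders rather than real pipeline artifacts:

import matsim.runtime.java as java

def execute(context):
    # mode="raise" (default): abort the stage on a non-zero exit code
    java.run(context, "org.example.Tool", ["--input", "data.csv"], class_path="tool.jar")

    # mode="return_code": hand the exit code back to the caller
    code = java.run(
        context, "org.example.Tool", ["--check"], class_path="tool.jar", mode="return_code"
    )

    # mode="output": capture the process output as bytes
    banner = java.run(
        context, "org.example.Tool", ["-version"], class_path="tool.jar", mode="output"
    )
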
diff --git a/matsim/runtime/maven.py b/matsim/runtime/maven.py
index a4832617..587dec63 100644
--- a/matsim/runtime/maven.py
+++ b/matsim/runtime/maven.py
@@ -1,13 +1,15 @@
import subprocess as sp
import os, shutil
+
def configure(context):
context.config("maven_binary", "mvn")
context.config("maven_skip_tests", False)
-def run(context, arguments = [], cwd = None):
+
+def run(context, arguments=[], cwd=None):
"""
- This function calls Maven.
+ This function calls Maven.
"""
# Make sure there is a dependency
context.stage("matsim.runtime.maven")
@@ -20,31 +22,32 @@ def run(context, arguments = [], cwd = None):
if not os.path.exists(temp_path):
os.mkdir(temp_path)
- vm_arguments = [
- "-Djava.io.tmpdir=%s" % temp_path
- ]
+ vm_arguments = ["-Djava.io.tmpdir=%s" % temp_path]
if context.config("maven_skip_tests"):
vm_arguments.append("-DskipTests=true")
- command_line = [
- shutil.which(context.config("maven_binary"))
- ] + vm_arguments + arguments
+ command_line = (
+ [shutil.which(context.config("maven_binary"))] + vm_arguments + arguments
+ )
- return_code = sp.check_call(command_line, cwd = cwd)
+ return_code = sp.check_call(command_line, cwd=cwd)
if not return_code == 0:
raise RuntimeError("Maven return code: %d" % return_code)
+
def validate(context):
if shutil.which(context.config("maven_binary")) in ["", None]:
- raise RuntimeError("Cannot find Maven binary at: %s" % context.config("maven_binary"))
+ raise RuntimeError(
+ "Cannot find Maven binary at: %s" % context.config("maven_binary")
+ )
- if not b"3." in sp.check_output([
- shutil.which(context.config("maven_binary")),
- "-version"
- ], stderr = sp.STDOUT):
+ if not b"3." in sp.check_output(
+ [shutil.which(context.config("maven_binary")), "-version"], stderr=sp.STDOUT
+ ):
print("WARNING! Maven of at least version 3.x.x is recommended!")
+
def execute(context):
pass
diff --git a/matsim/runtime/pt2matsim.py b/matsim/runtime/pt2matsim.py
index ef837fd6..62573a33 100644
--- a/matsim/runtime/pt2matsim.py
+++ b/matsim/runtime/pt2matsim.py
@@ -5,6 +5,7 @@
import matsim.runtime.java as java
import matsim.runtime.maven as maven
+
def configure(context):
context.stage("matsim.runtime.git")
context.stage("matsim.runtime.java")
@@ -13,6 +14,7 @@ def configure(context):
context.config("pt2matsim_version", "22.3")
context.config("pt2matsim_branch", "v22.3")
+
def run(context, command, arguments, vm_arguments=[]):
version = context.config("pt2matsim_version")
@@ -20,29 +22,43 @@ def run(context, command, arguments, vm_arguments=[]):
context.stage("matsim.runtime.pt2matsim")
jar_path = "%s/pt2matsim/target/pt2matsim-%s-shaded.jar" % (
- context.path("matsim.runtime.pt2matsim"), version
+ context.path("matsim.runtime.pt2matsim"),
+ version,
)
java.run(context, command, arguments, jar_path, vm_arguments)
+
def execute(context):
version = context.config("pt2matsim_version")
branch = context.config("pt2matsim_branch")
# Clone repository and checkout version
- git.run(context, [
- "clone", "https://github.com/matsim-org/pt2matsim.git",
- "--branch", branch,
- "--single-branch", "pt2matsim",
- "--depth", "1"
- ])
+ git.run(
+ context,
+ [
+ "clone",
+ "https://github.com/matsim-org/pt2matsim.git",
+ "--branch",
+ branch,
+ "--single-branch",
+ "pt2matsim",
+ "--depth",
+ "1",
+ ],
+ )
# Build pt2matsim
- maven.run(context, ["package", "-DskipTests=true"], cwd = "%s/pt2matsim" % context.path())
+ maven.run(
+ context, ["package", "-DskipTests=true"], cwd="%s/pt2matsim" % context.path()
+ )
jar_path = "%s/pt2matsim/target/pt2matsim-%s-shaded.jar" % (context.path(), version)
# Test pt2matsim
- java.run(context, "org.matsim.pt2matsim.run.CreateDefaultOsmConfig", [
- "test_config.xml"
- ], jar_path)
+ java.run(
+ context,
+ "org.matsim.pt2matsim.run.CreateDefaultOsmConfig",
+ ["test_config.xml"],
+ jar_path,
+ )
assert os.path.exists("%s/test_config.xml" % context.path())
diff --git a/matsim/scenario/facilities.py b/matsim/scenario/facilities.py
index 4bc223a9..f721fa8c 100644
--- a/matsim/scenario/facilities.py
+++ b/matsim/scenario/facilities.py
@@ -5,28 +5,31 @@
import matsim.writers as writers
+
def configure(context):
context.stage("synthesis.locations.secondary")
context.stage("synthesis.population.spatial.home.locations")
context.stage("synthesis.population.spatial.primary.locations")
-HOME_FIELDS = [
- "household_id", "geometry"
-]
-PRIMARY_FIELDS = [
- "location_id", "geometry", "is_work"
-]
+HOME_FIELDS = ["household_id", "geometry"]
+
+PRIMARY_FIELDS = ["location_id", "geometry", "is_work"]
SECONDARY_FIELDS = [
- "location_id", "geometry", "offers_leisure", "offers_shop", "offers_other"
+ "location_id",
+ "geometry",
+ "offers_leisure",
+ "offers_shop",
+ "offers_other",
]
+
def execute(context):
output_path = "%s/facilities.xml.gz" % context.path()
- with gzip.open(output_path, 'wb+') as writer:
- with io.BufferedWriter(writer, buffer_size = 2 * 1024**3) as writer:
+ with gzip.open(output_path, "wb+") as writer:
+ with io.BufferedWriter(writer, buffer_size=2 * 1024**3) as writer:
writer = writers.FacilitiesWriter(writer)
writer.start_facilities()
@@ -35,13 +38,16 @@ def execute(context):
df_homes = context.stage("synthesis.population.spatial.home.locations")
df_homes = df_homes[HOME_FIELDS]
- with context.progress(total = len(df_homes), label = "Writing home facilities ...") as progress:
- for item in df_homes.itertuples(index = False):
+ with context.progress(
+ total=len(df_homes), label="Writing home facilities ..."
+ ) as progress:
+ for item in df_homes.itertuples(index=False):
geometry = item[HOME_FIELDS.index("geometry")]
writer.start_facility(
"home_%s" % item[HOME_FIELDS.index("household_id")],
- geometry.x, geometry.y
+ geometry.x,
+ geometry.y,
)
writer.add_activity("home")
@@ -49,7 +55,9 @@ def execute(context):
# Write primary
- df_work, df_education = context.stage("synthesis.population.spatial.primary.locations")
+ df_work, df_education = context.stage(
+ "synthesis.population.spatial.primary.locations"
+ )
df_work = df_work.drop_duplicates("location_id").copy()
df_education = df_education.drop_duplicates("location_id").copy()
@@ -60,16 +68,21 @@ def execute(context):
df_locations = pd.concat([df_work, df_education])
df_locations = df_locations[PRIMARY_FIELDS]
- with context.progress(total = len(df_locations), label = "Writing primary facilities ...") as progress:
- for item in df_locations.itertuples(index = False):
+ with context.progress(
+ total=len(df_locations), label="Writing primary facilities ..."
+ ) as progress:
+ for item in df_locations.itertuples(index=False):
geometry = item[PRIMARY_FIELDS.index("geometry")]
writer.start_facility(
str(item[PRIMARY_FIELDS.index("location_id")]),
- geometry.x, geometry.y
+ geometry.x,
+ geometry.y,
)
- writer.add_activity("work" if item[PRIMARY_FIELDS.index("is_work")] else "education")
+ writer.add_activity(
+ "work" if item[PRIMARY_FIELDS.index("is_work")] else "education"
+ )
writer.end_facility()
# Write secondary
@@ -77,13 +90,16 @@ def execute(context):
df_locations = context.stage("synthesis.locations.secondary")
df_locations = df_locations[SECONDARY_FIELDS]
- with context.progress(total = len(df_locations), label = "Writing secondary facilities ...") as progress:
- for item in df_locations.itertuples(index = False):
+ with context.progress(
+ total=len(df_locations), label="Writing secondary facilities ..."
+ ) as progress:
+ for item in df_locations.itertuples(index=False):
geometry = item[SECONDARY_FIELDS.index("geometry")]
writer.start_facility(
item[SECONDARY_FIELDS.index("location_id")],
- geometry.x, geometry.y
+ geometry.x,
+ geometry.y,
)
for purpose in ("shop", "leisure", "other"):
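
Note (not part of the patch): the item[FIELDS.index(...)] lookups used throughout these writers rely on each dataframe being reduced to exactly the FIELDS columns beforehand, so the positional tuples yielded by itertuples(index=False) line up with the field list. A small illustration with made-up data:

import pandas as pd

HOME_FIELDS = ["household_id", "geometry"]

df = pd.DataFrame({
    "geometry": ["POINT (0 0)"],  # placeholder; the real column holds shapely geometries
    "household_id": [42],
    "unused": ["dropped below"],
})

df = df[HOME_FIELDS]  # column order now matches HOME_FIELDS
for item in df.itertuples(index=False):
    household_id = item[HOME_FIELDS.index("household_id")]  # 42
    geometry = item[HOME_FIELDS.index("geometry")]  # "POINT (0 0)"
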
diff --git a/matsim/scenario/households.py b/matsim/scenario/households.py
index 2f47cfee..0d33bd48 100644
--- a/matsim/scenario/households.py
+++ b/matsim/scenario/households.py
@@ -5,42 +5,69 @@
import matsim.writers as writers
+
def configure(context):
context.stage("synthesis.population.enriched")
-FIELDS = ["household_id", "person_id", "household_income", "car_availability", "bike_availability", "census_household_id"]
+
+FIELDS = [
+ "household_id",
+ "person_id",
+ "household_income",
+ "car_availability",
+ "bike_availability",
+ "census_household_id",
+]
+
def add_household(writer, household, member_ids):
writer.start_household(household[FIELDS.index("household_id")])
writer.add_members(member_ids)
writer.start_attributes()
- writer.add_attribute("carAvailability", "java.lang.String", household[FIELDS.index("car_availability")])
- writer.add_attribute("bikeAvailability", "java.lang.String", household[FIELDS.index("bike_availability")])
- writer.add_attribute("household_income", "java.lang.Double", household[FIELDS.index("household_income")])
- writer.add_attribute("censusId", "java.lang.Long", household[FIELDS.index("census_household_id")])
+ writer.add_attribute(
+ "carAvailability",
+ "java.lang.String",
+ household[FIELDS.index("car_availability")],
+ )
+ writer.add_attribute(
+ "bikeAvailability",
+ "java.lang.String",
+ household[FIELDS.index("bike_availability")],
+ )
+ writer.add_attribute(
+ "household_income",
+ "java.lang.Double",
+ household[FIELDS.index("household_income")],
+ )
+ writer.add_attribute(
+ "censusId", "java.lang.Long", household[FIELDS.index("census_household_id")]
+ )
writer.end_attributes()
writer.end_household()
+
def execute(context):
output_path = "%s/households.xml.gz" % context.path()
df_persons = context.stage("synthesis.population.enriched")
- df_persons = df_persons.sort_values(by = ["household_id", "person_id"])
+ df_persons = df_persons.sort_values(by=["household_id", "person_id"])
df_persons = df_persons[FIELDS]
current_members = []
current_household_id = None
current_household = None
- with gzip.open(output_path, 'wb+') as writer:
- with io.BufferedWriter(writer, buffer_size = 2 * 1024**3) as writer:
+ with gzip.open(output_path, "wb+") as writer:
+ with io.BufferedWriter(writer, buffer_size=2 * 1024**3) as writer:
writer = writers.HouseholdsWriter(writer)
writer.start_households()
- with context.progress(total = len(df_persons), label = "Writing households ...") as progress:
- for item in df_persons.itertuples(index = False):
+ with context.progress(
+ total=len(df_persons), label="Writing households ..."
+ ) as progress:
+ for item in df_persons.itertuples(index=False):
if current_household_id != item[FIELDS.index("household_id")]:
if not current_household_id is None:
add_household(writer, current_household, current_members)
diff --git a/matsim/scenario/population.py b/matsim/scenario/population.py
index 2fc0fa4d..2f7e52eb 100644
--- a/matsim/scenario/population.py
+++ b/matsim/scenario/population.py
@@ -7,6 +7,7 @@
import matsim.writers as writers
from matsim.writers import backlog_iterator
+
def configure(context):
context.stage("synthesis.population.enriched")
@@ -16,59 +17,125 @@ def configure(context):
context.stage("synthesis.population.trips")
context.stage("synthesis.vehicles.vehicles")
+
PERSON_FIELDS = [
- "person_id", "household_income", "car_availability", "bike_availability",
- "census_household_id", "census_person_id", "household_id",
- "has_license", "has_pt_subscription", "is_passenger",
- "hts_id", "hts_household_id",
- "age", "employed", "sex"
+ "person_id",
+ "household_income",
+ "car_availability",
+ "bike_availability",
+ "census_household_id",
+ "census_person_id",
+ "household_id",
+ "has_license",
+ "has_pt_subscription",
+ "is_passenger",
+ "hts_id",
+ "hts_household_id",
+ "age",
+ "employed",
+ "sex",
]
ACTIVITY_FIELDS = [
- "person_id", "start_time", "end_time", "purpose", "geometry", "location_id"
+ "person_id",
+ "start_time",
+ "end_time",
+ "purpose",
+ "geometry",
+ "location_id",
]
-TRIP_FIELDS = [
- "person_id", "mode", "departure_time", "travel_time"
-]
+TRIP_FIELDS = ["person_id", "mode", "departure_time", "travel_time"]
+
+VEHICLE_FIELDS = ["owner_id", "vehicle_id", "mode"]
-VEHICLE_FIELDS = [
- "owner_id", "vehicle_id", "mode"
-]
def add_person(writer, person, activities, trips, vehicles):
writer.start_person(person[PERSON_FIELDS.index("person_id")])
writer.start_attributes()
- writer.add_attribute("householdId", "java.lang.Integer", person[PERSON_FIELDS.index("household_id")])
- writer.add_attribute("householdIncome", "java.lang.Double", person[PERSON_FIELDS.index("household_income")])
-
- writer.add_attribute("carAvailability", "java.lang.String", person[PERSON_FIELDS.index("car_availability")])
- writer.add_attribute("bikeAvailability", "java.lang.String", person[PERSON_FIELDS.index("bike_availability")])
-
- writer.add_attribute("censusHouseholdId", "java.lang.Long", person[PERSON_FIELDS.index("census_household_id")])
- writer.add_attribute("censusPersonId", "java.lang.Long", person[PERSON_FIELDS.index("census_person_id")])
-
- writer.add_attribute("htsHouseholdId", "java.lang.Long", person[PERSON_FIELDS.index("hts_household_id")])
- writer.add_attribute("htsPersonId", "java.lang.Long", person[PERSON_FIELDS.index("hts_id")])
-
- writer.add_attribute("hasPtSubscription", "java.lang.Boolean", person[PERSON_FIELDS.index("has_pt_subscription")])
- writer.add_attribute("hasLicense", "java.lang.String", writer.yes_no(person[PERSON_FIELDS.index("has_license")]))
-
- writer.add_attribute("isPassenger", "java.lang.Boolean", person[PERSON_FIELDS.index("is_passenger")])
+ writer.add_attribute(
+ "householdId", "java.lang.Integer", person[PERSON_FIELDS.index("household_id")]
+ )
+ writer.add_attribute(
+ "householdIncome",
+ "java.lang.Double",
+ person[PERSON_FIELDS.index("household_income")],
+ )
+
+ writer.add_attribute(
+ "carAvailability",
+ "java.lang.String",
+ person[PERSON_FIELDS.index("car_availability")],
+ )
+ writer.add_attribute(
+ "bikeAvailability",
+ "java.lang.String",
+ person[PERSON_FIELDS.index("bike_availability")],
+ )
+
+ writer.add_attribute(
+ "censusHouseholdId",
+ "java.lang.Long",
+ person[PERSON_FIELDS.index("census_household_id")],
+ )
+ writer.add_attribute(
+ "censusPersonId",
+ "java.lang.Long",
+ person[PERSON_FIELDS.index("census_person_id")],
+ )
+
+ writer.add_attribute(
+ "htsHouseholdId",
+ "java.lang.Long",
+ person[PERSON_FIELDS.index("hts_household_id")],
+ )
+ writer.add_attribute(
+ "htsPersonId", "java.lang.Long", person[PERSON_FIELDS.index("hts_id")]
+ )
+
+ writer.add_attribute(
+ "hasPtSubscription",
+ "java.lang.Boolean",
+ person[PERSON_FIELDS.index("has_pt_subscription")],
+ )
+ writer.add_attribute(
+ "hasLicense",
+ "java.lang.String",
+ writer.yes_no(person[PERSON_FIELDS.index("has_license")]),
+ )
+
+ writer.add_attribute(
+ "isPassenger", "java.lang.Boolean", person[PERSON_FIELDS.index("is_passenger")]
+ )
writer.add_attribute("age", "java.lang.Integer", person[PERSON_FIELDS.index("age")])
- writer.add_attribute("employed", "java.lang.String", person[PERSON_FIELDS.index("employed")])
- writer.add_attribute("sex", "java.lang.String", person[PERSON_FIELDS.index("sex")][0])
-
- writer.add_attribute("vehicles", "org.matsim.vehicles.PersonVehicles", "{{{content}}}".format(content = ",".join([
- "\"{mode}\":\"{id}\"".format(mode = v[VEHICLE_FIELDS.index("mode")], id = v[VEHICLE_FIELDS.index("vehicle_id")])
- for v in vehicles
- ])))
+ writer.add_attribute(
+ "employed", "java.lang.String", person[PERSON_FIELDS.index("employed")]
+ )
+ writer.add_attribute(
+ "sex", "java.lang.String", person[PERSON_FIELDS.index("sex")][0]
+ )
+
+ writer.add_attribute(
+ "vehicles",
+ "org.matsim.vehicles.PersonVehicles",
+ "{{{content}}}".format(
+ content=",".join(
+ [
+ '"{mode}":"{id}"'.format(
+ mode=v[VEHICLE_FIELDS.index("mode")],
+ id=v[VEHICLE_FIELDS.index("vehicle_id")],
+ )
+ for v in vehicles
+ ]
+ )
+ ),
+ )
writer.end_attributes()
- writer.start_plan(selected = True)
+ writer.start_plan(selected=True)
for activity, trip in itertools.zip_longest(activities, trips):
start_time = activity[ACTIVITY_FIELDS.index("start_time")]
@@ -80,58 +147,71 @@ def add_person(writer, person, activities, trips, vehicles):
location_id = "home_%s" % person[PERSON_FIELDS.index("household_id")]
location = writer.location(
- geometry.x, geometry.y,
- None if location_id == -1 else location_id
+ geometry.x, geometry.y, None if location_id == -1 else location_id
)
writer.add_activity(
- type = activity[ACTIVITY_FIELDS.index("purpose")],
- location = location,
- start_time = None if np.isnan(start_time) else start_time,
- end_time = None if np.isnan(end_time) else end_time
+ type=activity[ACTIVITY_FIELDS.index("purpose")],
+ location=location,
+ start_time=None if np.isnan(start_time) else start_time,
+ end_time=None if np.isnan(end_time) else end_time,
)
if not trip is None:
writer.add_leg(
- mode = trip[TRIP_FIELDS.index("mode")],
- departure_time = trip[TRIP_FIELDS.index("departure_time")],
- travel_time = trip[TRIP_FIELDS.index("travel_time")]
+ mode=trip[TRIP_FIELDS.index("mode")],
+ departure_time=trip[TRIP_FIELDS.index("departure_time")],
+ travel_time=trip[TRIP_FIELDS.index("travel_time")],
)
writer.end_plan()
writer.end_person()
+
def execute(context):
output_path = "%s/population.xml.gz" % context.path()
df_persons = context.stage("synthesis.population.enriched")
- df_persons = df_persons.sort_values(by = ["household_id", "person_id"])
+ df_persons = df_persons.sort_values(by=["household_id", "person_id"])
df_persons = df_persons[PERSON_FIELDS]
- df_activities = context.stage("synthesis.population.activities").sort_values(by = ["person_id", "activity_index"])
- df_locations = context.stage("synthesis.population.spatial.locations")[[
- "person_id", "activity_index", "geometry", "location_id"]].sort_values(by = ["person_id", "activity_index"])
+ df_activities = context.stage("synthesis.population.activities").sort_values(
+ by=["person_id", "activity_index"]
+ )
+ df_locations = context.stage("synthesis.population.spatial.locations")[
+ ["person_id", "activity_index", "geometry", "location_id"]
+ ].sort_values(by=["person_id", "activity_index"])
- df_activities = pd.merge(df_activities, df_locations, how = "left", on = ["person_id", "activity_index"])
- #df_activities["location_id"] = df_activities["location_id"].fillna(-1).astype(int)
+ df_activities = pd.merge(
+ df_activities, df_locations, how="left", on=["person_id", "activity_index"]
+ )
+ # df_activities["location_id"] = df_activities["location_id"].fillna(-1).astype(int)
df_trips = context.stage("synthesis.population.trips")
df_trips["travel_time"] = df_trips["arrival_time"] - df_trips["departure_time"]
df_vehicles = context.stage("synthesis.vehicles.vehicles")[1]
- df_vehicles = df_vehicles.sort_values(by = ["owner_id"])
+ df_vehicles = df_vehicles.sort_values(by=["owner_id"])
- with gzip.open(output_path, 'wb+') as writer:
- with io.BufferedWriter(writer, buffer_size = 2 * 1024**3) as writer:
+ with gzip.open(output_path, "wb+") as writer:
+ with io.BufferedWriter(writer, buffer_size=2 * 1024**3) as writer:
writer = writers.PopulationWriter(writer)
writer.start_population()
- activity_iterator = backlog_iterator(iter(df_activities[ACTIVITY_FIELDS].itertuples(index = False)))
- trip_iterator = backlog_iterator(iter(df_trips[TRIP_FIELDS].itertuples(index = False)))
- vehicle_iterator = backlog_iterator(iter(df_vehicles[VEHICLE_FIELDS].itertuples(index = False)))
+ activity_iterator = backlog_iterator(
+ iter(df_activities[ACTIVITY_FIELDS].itertuples(index=False))
+ )
+ trip_iterator = backlog_iterator(
+ iter(df_trips[TRIP_FIELDS].itertuples(index=False))
+ )
+ vehicle_iterator = backlog_iterator(
+ iter(df_vehicles[VEHICLE_FIELDS].itertuples(index=False))
+ )
- with context.progress(total = len(df_persons), label = "Writing population ...") as progress:
- for person in df_persons.itertuples(index = False):
+ with context.progress(
+ total=len(df_persons), label="Writing population ..."
+ ) as progress:
+ for person in df_persons.itertuples(index=False):
person_id = person[PERSON_FIELDS.index("person_id")]
activities = []
@@ -142,7 +222,10 @@ def execute(context):
while activity_iterator.has_next():
activity = activity_iterator.next()
- if not activity[ACTIVITY_FIELDS.index("person_id")] == person_id:
+ if (
+ not activity[ACTIVITY_FIELDS.index("person_id")]
+ == person_id
+ ):
activity_iterator.previous()
break
else:
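
Note (not part of the patch): the loop above (cut off by the hunk) is a merge-join over tables that were sorted by person_id; backlog_iterator lets it read one activity too far and push the record back once it belongs to the next person. A schematic of the pattern with hypothetical records:

from matsim.writers import backlog_iterator

# (person_id, purpose) records, sorted by person_id like the real activity table
activities = backlog_iterator(iter([(1, "home"), (1, "work"), (2, "home")]))

for person_id in (1, 2):
    person_activities = []

    while activities.has_next():
        activity = activities.next()

        if not activity[0] == person_id:
            activities.previous()  # belongs to the next person, push it back
            break
        else:
            person_activities.append(activity)

    print(person_id, person_activities)
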
diff --git a/matsim/scenario/supply/gtfs.py b/matsim/scenario/supply/gtfs.py
index 0635cc0f..83991cdb 100644
--- a/matsim/scenario/supply/gtfs.py
+++ b/matsim/scenario/supply/gtfs.py
@@ -2,6 +2,7 @@
import matsim.runtime.pt2matsim as pt2matsim
+
def configure(context):
context.stage("matsim.runtime.java")
context.stage("matsim.runtime.pt2matsim")
@@ -10,21 +11,26 @@ def configure(context):
context.config("gtfs_date", "dayWithMostServices")
+
def execute(context):
gtfs_path = "%s/output" % context.path("data.gtfs.cleaned")
crs = context.stage("synthesis.population.spatial.home.locations").crs
- pt2matsim.run(context, "org.matsim.pt2matsim.run.Gtfs2TransitSchedule", [
- gtfs_path,
- context.config("gtfs_date"), crs,
- "%s/transit_schedule.xml.gz" % context.path(),
- "%s/transit_vehicles.xml.gz" % context.path()
- ])
+ pt2matsim.run(
+ context,
+ "org.matsim.pt2matsim.run.Gtfs2TransitSchedule",
+ [
+ gtfs_path,
+ context.config("gtfs_date"),
+ crs,
+ "%s/transit_schedule.xml.gz" % context.path(),
+ "%s/transit_vehicles.xml.gz" % context.path(),
+ ],
+ )
- assert(os.path.exists("%s/transit_schedule.xml.gz" % context.path()))
- assert(os.path.exists("%s/transit_vehicles.xml.gz" % context.path()))
+ assert os.path.exists("%s/transit_schedule.xml.gz" % context.path())
+ assert os.path.exists("%s/transit_vehicles.xml.gz" % context.path())
return dict(
- schedule_path = "transit_schedule.xml.gz",
- vehicles_path = "transit_vehicles.xml.gz"
+ schedule_path="transit_schedule.xml.gz", vehicles_path="transit_vehicles.xml.gz"
)
diff --git a/matsim/scenario/supply/osm.py b/matsim/scenario/supply/osm.py
index f723104e..f9ea0485 100644
--- a/matsim/scenario/supply/osm.py
+++ b/matsim/scenario/supply/osm.py
@@ -2,6 +2,7 @@
import matsim.runtime.pt2matsim as pt2matsim
+
def configure(context):
context.stage("matsim.runtime.java")
context.stage("matsim.runtime.pt2matsim")
@@ -10,12 +11,15 @@ def configure(context):
context.config("export_detailed_network", False)
+
def execute(context):
osm_path = "%s/output.osm.gz" % context.path("data.osm.cleaned")
crs = context.stage("data.spatial.iris").crs
- pt2matsim.run(context, "org.matsim.pt2matsim.run.CreateDefaultOsmConfig",
- arguments=["config_template.xml"]
+ pt2matsim.run(
+ context,
+ "org.matsim.pt2matsim.run.CreateDefaultOsmConfig",
+ arguments=["config_template.xml"],
)
with open("%s/config_template.xml" % context.path()) as f_read:
@@ -23,17 +27,17 @@ def execute(context):
content = content.replace(
'',
- '' % osm_path
+ '' % osm_path,
)
content = content.replace(
'',
- ''.format(crs)
+ ''.format(crs),
)
content = content.replace(
'',
- ''
+ '',
)
if context.config("export_detailed_network"):
@@ -43,22 +47,24 @@ def execute(context):
)
content = content.replace(
- '',
+ "",
"""
- """
+ """,
)
with open("%s/config.xml" % context.path(), "w+") as f_write:
f_write.write(content)
- pt2matsim.run(context, "org.matsim.pt2matsim.run.Osm2MultimodalNetwork",
- arguments=["config.xml"]
+ pt2matsim.run(
+ context,
+ "org.matsim.pt2matsim.run.Osm2MultimodalNetwork",
+ arguments=["config.xml"],
)
- assert(os.path.exists("%s/network.xml.gz" % context.path()))
+ assert os.path.exists("%s/network.xml.gz" % context.path())
return "network.xml.gz"
diff --git a/matsim/scenario/supply/processed.py b/matsim/scenario/supply/processed.py
index f75fc130..448df94f 100644
--- a/matsim/scenario/supply/processed.py
+++ b/matsim/scenario/supply/processed.py
@@ -2,6 +2,7 @@
import matsim.runtime.pt2matsim as pt2matsim
+
def configure(context):
context.stage("matsim.runtime.java")
context.stage("matsim.runtime.pt2matsim")
@@ -12,68 +13,71 @@ def configure(context):
context.config("data_path")
context.config("processes")
+
def execute(context):
# Prepare input paths
network_path = "%s/%s" % (
context.path("matsim.scenario.supply.osm"),
- context.stage("matsim.scenario.supply.osm")
+ context.stage("matsim.scenario.supply.osm"),
)
schedule_path = "%s/%s" % (
context.path("matsim.scenario.supply.gtfs"),
- context.stage("matsim.scenario.supply.gtfs")["schedule_path"]
+ context.stage("matsim.scenario.supply.gtfs")["schedule_path"],
)
# Create and modify config file
- pt2matsim.run(context, "org.matsim.pt2matsim.run.CreateDefaultPTMapperConfig", [
- "config_template.xml"
- ])
+ pt2matsim.run(
+ context,
+ "org.matsim.pt2matsim.run.CreateDefaultPTMapperConfig",
+ ["config_template.xml"],
+ )
with open("%s/config_template.xml" % context.path()) as f_read:
content = f_read.read()
content = content.replace(
'',
- '' % network_path
+ '' % network_path,
)
content = content.replace(
'',
- '' % schedule_path
+ '' % schedule_path,
)
content = content.replace(
'',
- '' % context.config("processes")
+ '' % context.config("processes"),
)
content = content.replace(
'',
- ''
+ '',
)
content = content.replace(
'',
- ''
+ '',
)
content = content.replace(
'',
- ''
+ '',
)
content = content.replace(
'',
- ''
+ '',
)
with open("%s/config.xml" % context.path(), "w+") as f_write:
f_write.write(content)
# Run mapping process
- pt2matsim.run(context, "org.matsim.pt2matsim.run.PublicTransitMapper", [
- "config.xml"
- ])
+ pt2matsim.run(
+ context, "org.matsim.pt2matsim.run.PublicTransitMapper", ["config.xml"]
+ )
- assert(os.path.exists("%s/network.xml.gz" % context.path()))
- assert(os.path.exists("%s/schedule.xml.gz" % context.path()))
+ assert os.path.exists("%s/network.xml.gz" % context.path())
+ assert os.path.exists("%s/schedule.xml.gz" % context.path())
return dict(
- network_path = "network.xml.gz",
- schedule_path = "schedule.xml.gz",
- #plausibility_path = "allPlausibilityWarnings.xml.gz"
+ network_path="network.xml.gz",
+ schedule_path="schedule.xml.gz",
+ # plausibility_path = "allPlausibilityWarnings.xml.gz"
)
diff --git a/matsim/scenario/vehicles.py b/matsim/scenario/vehicles.py
index 63205fc3..9530bbdc 100644
--- a/matsim/scenario/vehicles.py
+++ b/matsim/scenario/vehicles.py
@@ -5,52 +5,59 @@
import matsim.writers as writers
+
def configure(context):
context.stage("synthesis.vehicles.vehicles")
+
TYPE_FIELDS = ["type_id", "nb_seats", "length", "width", "pce", "mode"]
VEHICLE_FIELDS = ["vehicle_id", "type_id", "critair", "technology", "age", "euro"]
+
def execute(context):
output_path = "%s/vehicles.xml.gz" % context.path()
df_vehicle_types, df_vehicles = context.stage("synthesis.vehicles.vehicles")
- with gzip.open(output_path, 'wb+') as writer:
- with io.BufferedWriter(writer, buffer_size = 2 * 1024**3) as writer:
+ with gzip.open(output_path, "wb+") as writer:
+ with io.BufferedWriter(writer, buffer_size=2 * 1024**3) as writer:
writer = writers.VehiclesWriter(writer)
writer.start_vehicles()
- with context.progress(total = len(df_vehicle_types), label = "Writing vehicles types ...") as progress:
+ with context.progress(
+                total=len(df_vehicle_types), label="Writing vehicle types ..."
+ ) as progress:
for type in df_vehicle_types.to_dict(orient="records"):
writer.add_type(
type["type_id"],
length=type["length"],
width=type["width"],
- engine_attributes = {
+ engine_attributes={
"HbefaVehicleCategory": type["hbefa_cat"],
"HbefaTechnology": type["hbefa_tech"],
"HbefaSizeClass": type["hbefa_size"],
- "HbefaEmissionsConcept": type["hbefa_emission"]
- }
+ "HbefaEmissionsConcept": type["hbefa_emission"],
+ },
)
progress.update()
- with context.progress(total = len(df_vehicles), label = "Writing vehicles ...") as progress:
+ with context.progress(
+ total=len(df_vehicles), label="Writing vehicles ..."
+ ) as progress:
for vehicle in df_vehicles.to_dict(orient="records"):
writer.add_vehicle(
vehicle["vehicle_id"],
vehicle["type_id"],
- attributes = {
+ attributes={
"critair": vehicle["critair"],
"technology": vehicle["technology"],
"age": vehicle["age"],
- "euro": vehicle["euro"]
- }
+ "euro": vehicle["euro"],
+ },
)
progress.update()
writer.end_vehicles()
- return "vehicles.xml.gz"
\ No newline at end of file
+ return "vehicles.xml.gz"
diff --git a/matsim/simulation/prepare.py b/matsim/simulation/prepare.py
index 7a73e6d8..39176e31 100644
--- a/matsim/simulation/prepare.py
+++ b/matsim/simulation/prepare.py
@@ -3,9 +3,10 @@
import matsim.runtime.eqasim as eqasim
+
def configure(context):
context.config("mode_choice", False)
-
+
context.stage("matsim.scenario.population")
context.stage("matsim.scenario.households")
context.stage("matsim.scenario.vehicles")
@@ -26,148 +27,245 @@ def configure(context):
context.config("output_prefix", "ile_de_france_")
+
def execute(context):
# Prepare input files
facilities_path = "%s/%s" % (
context.path("matsim.scenario.facilities"),
- context.stage("matsim.scenario.facilities")
+ context.stage("matsim.scenario.facilities"),
)
population_path = "%s/%s" % (
context.path("matsim.scenario.population"),
- context.stage("matsim.scenario.population")
+ context.stage("matsim.scenario.population"),
)
network_path = "%s/%s" % (
context.path("matsim.scenario.supply.processed"),
- context.stage("matsim.scenario.supply.processed")["network_path"]
+ context.stage("matsim.scenario.supply.processed")["network_path"],
)
- eqasim.run(context, "org.eqasim.core.scenario.preparation.RunPreparation", [
- "--input-facilities-path", facilities_path,
- "--output-facilities-path", "%sfacilities.xml.gz" % context.config("output_prefix"),
- "--input-population-path", population_path,
- "--output-population-path", "prepared_population.xml.gz",
- "--input-network-path", network_path,
- "--output-network-path", "%snetwork.xml.gz" % context.config("output_prefix"),
- "--threads", context.config("processes")
- ])
+ eqasim.run(
+ context,
+ "org.eqasim.core.scenario.preparation.RunPreparation",
+ [
+ "--input-facilities-path",
+ facilities_path,
+ "--output-facilities-path",
+ "%sfacilities.xml.gz" % context.config("output_prefix"),
+ "--input-population-path",
+ population_path,
+ "--output-population-path",
+ "prepared_population.xml.gz",
+ "--input-network-path",
+ network_path,
+ "--output-network-path",
+ "%snetwork.xml.gz" % context.config("output_prefix"),
+ "--threads",
+ context.config("processes"),
+ ],
+ )
- assert os.path.exists("%s/%sfacilities.xml.gz" % (context.path(), context.config("output_prefix")))
+ assert os.path.exists(
+ "%s/%sfacilities.xml.gz" % (context.path(), context.config("output_prefix"))
+ )
assert os.path.exists("%s/prepared_population.xml.gz" % context.path())
- assert os.path.exists("%s/%snetwork.xml.gz" % (context.path(), context.config("output_prefix")))
+ assert os.path.exists(
+ "%s/%snetwork.xml.gz" % (context.path(), context.config("output_prefix"))
+ )
# Copy remaining input files
households_path = "%s/%s" % (
context.path("matsim.scenario.households"),
- context.stage("matsim.scenario.households")
+ context.stage("matsim.scenario.households"),
+ )
+ shutil.copy(
+ households_path,
+ "%s/%shouseholds.xml.gz"
+ % (context.cache_path, context.config("output_prefix")),
)
- shutil.copy(households_path, "%s/%shouseholds.xml.gz" % (context.cache_path, context.config("output_prefix")))
transit_schedule_path = "%s/%s" % (
context.path("matsim.scenario.supply.processed"),
- context.stage("matsim.scenario.supply.processed")["schedule_path"]
+ context.stage("matsim.scenario.supply.processed")["schedule_path"],
+ )
+ shutil.copy(
+ transit_schedule_path,
+ "%s/%stransit_schedule.xml.gz"
+ % (context.cache_path, context.config("output_prefix")),
)
- shutil.copy(transit_schedule_path, "%s/%stransit_schedule.xml.gz" % (context.cache_path, context.config("output_prefix")))
transit_vehicles_path = "%s/%s" % (
context.path("matsim.scenario.supply.gtfs"),
- context.stage("matsim.scenario.supply.gtfs")["vehicles_path"]
+ context.stage("matsim.scenario.supply.gtfs")["vehicles_path"],
+ )
+ shutil.copy(
+ transit_vehicles_path,
+ "%s/%stransit_vehicles.xml.gz"
+ % (context.cache_path, context.config("output_prefix")),
)
- shutil.copy(transit_vehicles_path, "%s/%stransit_vehicles.xml.gz" % (context.cache_path, context.config("output_prefix")))
vehicles_path = "%s/%s" % (
context.path("matsim.scenario.vehicles"),
- context.stage("matsim.scenario.vehicles")
+ context.stage("matsim.scenario.vehicles"),
+ )
+ shutil.copy(
+ vehicles_path,
+ "%s/%svehicles.xml.gz" % (context.cache_path, context.config("output_prefix")),
)
- shutil.copy(vehicles_path, "%s/%svehicles.xml.gz" % (context.cache_path, context.config("output_prefix")))
# Generate base configuration
- eqasim.run(context, "org.eqasim.core.scenario.config.RunGenerateConfig", [
- "--sample-size", context.config("sampling_rate"),
- "--threads", context.config("processes"),
- "--prefix", context.config("output_prefix"),
- "--random-seed", context.config("random_seed"),
- "--output-path", "generic_config.xml"
- ])
+ eqasim.run(
+ context,
+ "org.eqasim.core.scenario.config.RunGenerateConfig",
+ [
+ "--sample-size",
+ context.config("sampling_rate"),
+ "--threads",
+ context.config("processes"),
+ "--prefix",
+ context.config("output_prefix"),
+ "--random-seed",
+ context.config("random_seed"),
+ "--output-path",
+ "generic_config.xml",
+ ],
+ )
assert os.path.exists("%s/generic_config.xml" % context.path())
# Adapt config for Île-de-France
- eqasim.run(context, "org.eqasim.ile_de_france.scenario.RunAdaptConfig", [
- "--input-path", "generic_config.xml",
- "--output-path", "%sconfig.xml" % context.config("output_prefix"),
- "--prefix", context.config("output_prefix")
- ])
- assert os.path.exists("%s/%sconfig.xml" % (context.path(), context.config("output_prefix")))
+ eqasim.run(
+ context,
+ "org.eqasim.ile_de_france.scenario.RunAdaptConfig",
+ [
+ "--input-path",
+ "generic_config.xml",
+ "--output-path",
+ "%sconfig.xml" % context.config("output_prefix"),
+ "--prefix",
+ context.config("output_prefix"),
+ ],
+ )
+ assert os.path.exists(
+ "%s/%sconfig.xml" % (context.path(), context.config("output_prefix"))
+ )
# Add urban attributes to population and network
# (but only if Paris is included in the scenario!)
df_codes = context.stage("data.spatial.codes")
if "75" in df_codes["departement_id"].unique().astype(str):
- df_shape = context.stage("data.spatial.departments")[["departement_id", "geometry"]].rename(
- columns = dict(departement_id = "id")
- )
+ df_shape = context.stage("data.spatial.departments")[
+ ["departement_id", "geometry"]
+ ].rename(columns=dict(departement_id="id"))
df_shape["id"] = df_shape["id"].astype(str)
if "75" in df_shape["id"].unique():
df_shape.to_file("%s/departments.shp" % context.path())
- eqasim.run(context, "org.eqasim.core.scenario.spatial.RunImputeSpatialAttribute", [
- "--input-population-path", "prepared_population.xml.gz",
- "--output-population-path", "prepared_population.xml.gz",
- "--input-network-path", "%snetwork.xml.gz" % context.config("output_prefix"),
- "--output-network-path", "%snetwork.xml.gz" % context.config("output_prefix"),
- "--shape-path", "departments.shp",
- "--shape-attribute", "id",
- "--shape-value", "75",
- "--attribute", "isUrban"
- ])
-
- eqasim.run(context, "org.eqasim.core.scenario.spatial.RunAdjustCapacity", [
- "--input-path", "%snetwork.xml.gz" % context.config("output_prefix"),
- "--output-path", "%snetwork.xml.gz" % context.config("output_prefix"),
- "--shape-path", "departments.shp",
- "--shape-attribute", "id",
- "--shape-value", "75",
- "--factor", str(0.8)
- ])
-
-
+ eqasim.run(
+ context,
+ "org.eqasim.core.scenario.spatial.RunImputeSpatialAttribute",
+ [
+ "--input-population-path",
+ "prepared_population.xml.gz",
+ "--output-population-path",
+ "prepared_population.xml.gz",
+ "--input-network-path",
+ "%snetwork.xml.gz" % context.config("output_prefix"),
+ "--output-network-path",
+ "%snetwork.xml.gz" % context.config("output_prefix"),
+ "--shape-path",
+ "departments.shp",
+ "--shape-attribute",
+ "id",
+ "--shape-value",
+ "75",
+ "--attribute",
+ "isUrban",
+ ],
+ )
+
+ eqasim.run(
+ context,
+ "org.eqasim.core.scenario.spatial.RunAdjustCapacity",
+ [
+ "--input-path",
+ "%snetwork.xml.gz" % context.config("output_prefix"),
+ "--output-path",
+ "%snetwork.xml.gz" % context.config("output_prefix"),
+ "--shape-path",
+ "departments.shp",
+ "--shape-attribute",
+ "id",
+ "--shape-value",
+ "75",
+ "--factor",
+ str(0.8),
+ ],
+ )
+
# Optionally, perform mode choice
if context.config("mode_choice"):
- eqasim.run(context, "org.eqasim.core.standalone_mode_choice.RunStandaloneModeChoice", [
- "--config-path", "%sconfig.xml" % context.config("output_prefix"),
- "--config:standaloneModeChoice.outputDirectory", "mode_choice",
- "--config:global.numberOfThreads", context.config("processes"),
- "--write-output-csv-trips", "true",
- "--skip-scenario-check", "true",
- "--config:plans.inputPlansFile", "prepared_population.xml.gz",
- "--eqasim-configurator-class", "org.eqasim.ile_de_france.IDFConfigurator",
- "--mode-choice-configurator-class", "org.eqasim.ile_de_france.IDFStandaloneModeChoiceConfigurator"
- ])
+ eqasim.run(
+ context,
+ "org.eqasim.core.standalone_mode_choice.RunStandaloneModeChoice",
+ [
+ "--config-path",
+ "%sconfig.xml" % context.config("output_prefix"),
+ "--config:standaloneModeChoice.outputDirectory",
+ "mode_choice",
+ "--config:global.numberOfThreads",
+ context.config("processes"),
+ "--write-output-csv-trips",
+ "true",
+ "--skip-scenario-check",
+ "true",
+ "--config:plans.inputPlansFile",
+ "prepared_population.xml.gz",
+ "--eqasim-configurator-class",
+ "org.eqasim.ile_de_france.IDFConfigurator",
+ "--mode-choice-configurator-class",
+ "org.eqasim.ile_de_france.IDFStandaloneModeChoiceConfigurator",
+ ],
+ )
assert os.path.exists("%s/mode_choice/output_plans.xml.gz" % context.path())
assert os.path.exists("%s/mode_choice/output_trips.csv" % context.path())
assert os.path.exists("%s/mode_choice/output_pt_legs.csv" % context.path())
- shutil.copy("%s/mode_choice/output_plans.xml.gz" % context.path(),
- "%s/%spopulation.xml.gz" % (context.path(), context.config("output_prefix")))
+ shutil.copy(
+ "%s/mode_choice/output_plans.xml.gz" % context.path(),
+ "%s/%spopulation.xml.gz"
+ % (context.path(), context.config("output_prefix")),
+ )
else:
# Route population
- eqasim.run(context, "org.eqasim.core.scenario.routing.RunPopulationRouting", [
- "--config-path", "%sconfig.xml" % context.config("output_prefix"),
- "--output-path", "%spopulation.xml.gz" % context.config("output_prefix"),
- "--threads", context.config("processes"),
- "--config:plans.inputPlansFile", "prepared_population.xml.gz"
- ])
+ eqasim.run(
+ context,
+ "org.eqasim.core.scenario.routing.RunPopulationRouting",
+ [
+ "--config-path",
+ "%sconfig.xml" % context.config("output_prefix"),
+ "--output-path",
+ "%spopulation.xml.gz" % context.config("output_prefix"),
+ "--threads",
+ context.config("processes"),
+ "--config:plans.inputPlansFile",
+ "prepared_population.xml.gz",
+ ],
+ )
- assert os.path.exists("%s/%spopulation.xml.gz" % (context.path(), context.config("output_prefix")))
+ assert os.path.exists(
+ "%s/%spopulation.xml.gz" % (context.path(), context.config("output_prefix"))
+ )
# Validate scenario
- eqasim.run(context, "org.eqasim.core.scenario.validation.RunScenarioValidator", [
- "--config-path", "%sconfig.xml" % context.config("output_prefix")
- ])
+ eqasim.run(
+ context,
+ "org.eqasim.core.scenario.validation.RunScenarioValidator",
+ ["--config-path", "%sconfig.xml" % context.config("output_prefix")],
+ )
# Cleanup
os.remove("%s/prepared_population.xml.gz" % context.path())
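
Note (not part of the patch): with the default output_prefix of "ile_de_france_", the stage above assembles a prefixed MATSim scenario in its cache directory. A rough sketch of the file set implied by the calls above:

prefix = "ile_de_france_"  # context.config("output_prefix") default

expected_outputs = [prefix + name for name in (
    "config.xml",
    "population.xml.gz",
    "network.xml.gz",
    "facilities.xml.gz",
    "households.xml.gz",
    "vehicles.xml.gz",
    "transit_schedule.xml.gz",
    "transit_vehicles.xml.gz",
)]
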
diff --git a/matsim/simulation/run.py b/matsim/simulation/run.py
index fb6773b2..69da3376 100644
--- a/matsim/simulation/run.py
+++ b/matsim/simulation/run.py
@@ -3,23 +3,33 @@
import matsim.runtime.eqasim as eqasim
+
def configure(context):
context.stage("matsim.simulation.prepare")
context.stage("matsim.runtime.java")
context.stage("matsim.runtime.eqasim")
+
def execute(context):
config_path = "%s/%s" % (
context.path("matsim.simulation.prepare"),
- context.stage("matsim.simulation.prepare")
+ context.stage("matsim.simulation.prepare"),
)
# Run routing
- eqasim.run(context, "org.eqasim.ile_de_france.RunSimulation", [
- "--config-path", config_path,
- "--config:controler.lastIteration", str(1),
- "--config:controler.writeEventsInterval", str(1),
- "--config:controler.writePlansInterval", str(1),
- ])
+ eqasim.run(
+ context,
+ "org.eqasim.ile_de_france.RunSimulation",
+ [
+ "--config-path",
+ config_path,
+ "--config:controler.lastIteration",
+ str(1),
+ "--config:controler.writeEventsInterval",
+ str(1),
+ "--config:controler.writePlansInterval",
+ str(1),
+ ],
+ )
assert os.path.exists("%s/simulation_output/output_events.xml.gz" % context.path())
diff --git a/matsim/writers.py b/matsim/writers.py
index da99084f..94a5ced9 100644
--- a/matsim/writers.py
+++ b/matsim/writers.py
@@ -1,6 +1,7 @@
import numpy as np
from xml.sax.saxutils import escape
+
class XmlWriter:
def __init__(self, writer):
self.writer = writer
@@ -37,22 +38,26 @@ def time(self, time):
time = int(time)
hours = time // 3600
minutes = (time % 3600) // 60
- seconds = (time % 60)
+ seconds = time % 60
return "%02d:%02d:%02d" % (hours, minutes, seconds)
- def location(self, x, y, facility_id = None):
+ def location(self, x, y, facility_id=None):
return (x, y, None if facility_id is None else facility_id)
+
def _write_preface_attributes(writer, attributes):
if len(attributes) > 0:
- writer._write_line('')
+ writer._write_line("")
writer.indent += 1
for item in attributes.items():
- writer._write_line('%s' % item)
+ writer._write_line(
+ '%s' % item
+ )
writer.indent -= 1
- writer._write_line('')
+ writer._write_line("")
+
class PopulationWriter(XmlWriter):
POPULATION_SCOPE = 0
@@ -64,11 +69,13 @@ class PopulationWriter(XmlWriter):
def __init__(self, writer):
XmlWriter.__init__(self, writer)
- def start_population(self, attributes = {}):
+ def start_population(self, attributes={}):
self._require_scope(None)
self._write_line('')
- self._write_line('')
- self._write_line('')
+ self._write_line(
+ ''
+ )
+ self._write_line("")
self.scope = self.POPULATION_SCOPE
self.indent += 1
@@ -78,7 +85,7 @@ def start_population(self, attributes = {}):
def end_population(self):
self._require_scope(self.POPULATION_SCOPE)
self.indent -= 1
- self._write_line('')
+ self._write_line("")
self.scope = self.FINISHED_SCOPE
def start_person(self, person_id):
@@ -91,11 +98,11 @@ def end_person(self):
self._require_scope(self.PERSON_SCOPE)
self.indent -= 1
self.scope = self.POPULATION_SCOPE
- self._write_line('')
+ self._write_line("")
def start_attributes(self):
# We don't require any scope here because attributes can be almost anywhere
- self._write_line('')
+ self._write_line("")
self.indent += 1
# And we need to remember which scope we were in before starting the attributes
self._pre_attributes_scope = self.scope
@@ -106,13 +113,13 @@ def end_attributes(self):
self.indent -= 1
# Resetting the scope that we were in before starting the attributes
self.scope = self._pre_attributes_scope
- self._write_line('')
+ self._write_line("")
def add_attribute(self, name, type, value):
self._require_scope(self.ATTRIBUTES_SCOPE)
- self._write_line('%s' % (
- name, type, value
- ))
+ self._write_line(
+ '%s' % (name, type, value)
+ )
def start_plan(self, selected):
self._require_scope(self.PERSON_SCOPE)
@@ -124,33 +131,37 @@ def end_plan(self):
self._require_scope(self.PLAN_SCOPE)
self.indent -= 1
self.scope = self.PERSON_SCOPE
- self._write_line('')
+ self._write_line("")
- def add_activity(self, type, location, start_time = None, end_time = None):
+ def add_activity(self, type, location, start_time=None, end_time=None):
self._require_scope(self.PLAN_SCOPE)
self._write_indent()
- self._write('\n')
+ if location[2] is not None:
+ self._write('facility="%s" ' % str(location[2]))
+ if start_time is not None:
+ self._write('start_time="%s" ' % self.time(start_time))
+ if end_time is not None:
+ self._write('end_time="%s" ' % self.time(end_time))
+ self._write("/>\n")
def add_leg(self, mode, departure_time, travel_time):
self._require_scope(self.PLAN_SCOPE)
self._write_indent()
- self._write('\n')
+ self._write(">\n")
self.start_attributes()
- self.add_attribute('routingMode', 'java.lang.String', mode)
+ self.add_attribute("routingMode", "java.lang.String", mode)
self.end_attributes()
- self._write_line('')
+ self._write_line("")
+
class HouseholdsWriter(XmlWriter):
HOUSEHOLDS_SCOPE = 0
@@ -161,10 +172,12 @@ class HouseholdsWriter(XmlWriter):
def __init__(self, writer):
XmlWriter.__init__(self, writer)
- def start_households(self, attributes = {}):
+ def start_households(self, attributes={}):
self._require_scope(None)
self._write_line('')
- self._write_line('')
+ self._write_line(
+ ''
+ )
self.scope = self.HOUSEHOLDS_SCOPE
self.indent += 1
@@ -173,7 +186,7 @@ def start_households(self, attributes = {}):
def end_households(self):
self._require_scope(self.HOUSEHOLDS_SCOPE)
- self._write_line('')
+ self._write_line("")
self.scope = self.FINISHED_SCOPE
def start_household(self, household_id):
@@ -186,11 +199,11 @@ def end_household(self):
self._require_scope(self.HOUSEHOLD_SCOPE)
self.indent -= 1
self.scope = self.HOUSEHOLDS_SCOPE
- self._write_line('')
+ self._write_line("")
def start_attributes(self):
self._require_scope(self.HOUSEHOLD_SCOPE)
- self._write_line('')
+ self._write_line("")
self.indent += 1
self.scope = self.ATTRIBUTES_SCOPE
@@ -198,26 +211,28 @@ def end_attributes(self):
self._require_scope(self.ATTRIBUTES_SCOPE)
self.indent -= 1
self.scope = self.HOUSEHOLD_SCOPE
- self._write_line('')
+ self._write_line("")
def add_attribute(self, name, type, value):
self._require_scope(self.ATTRIBUTES_SCOPE)
- self._write_line('%s' % (
- name, type, value
- ))
+ self._write_line(
+ '%s' % (name, type, value)
+ )
def add_members(self, person_ids):
self._require_scope(self.HOUSEHOLD_SCOPE)
- self._write_line('')
+ self._write_line("")
self.indent += 1
- for person_id in person_ids: self._write_line('' % person_id)
+ for person_id in person_ids:
+ self._write_line('' % person_id)
self.indent -= 1
- self._write_line('')
+ self._write_line("")
def add_income(self, income):
self._require_scope(self.HOUSEHOLD_SCOPE)
self._write_line('%f' % income)
+
class FacilitiesWriter(XmlWriter):
FACILITIES_SCOPE = 0
FINISHED_SCOPE = 1
@@ -226,11 +241,13 @@ class FacilitiesWriter(XmlWriter):
def __init__(self, writer):
XmlWriter.__init__(self, writer)
- def start_facilities(self, attributes = {}):
+ def start_facilities(self, attributes={}):
self._require_scope(None)
self._write_line('')
- self._write_line('')
- self._write_line('')
+ self._write_line(
+ ''
+ )
+ self._write_line("")
self.scope = self.FACILITIES_SCOPE
self.indent += 1
@@ -240,14 +257,12 @@ def start_facilities(self, attributes = {}):
def end_facilities(self):
self._require_scope(self.FACILITIES_SCOPE)
self.indent -= 1
- self._write_line('')
+ self._write_line("")
self.scope = self.FINISHED_SCOPE
def start_facility(self, facility_id, x, y):
self._require_scope(self.FACILITIES_SCOPE)
- self._write_line('' % (
- str(facility_id), x, y
- ))
+ self._write_line('' % (str(facility_id), x, y))
self.indent += 1
self.scope = self.FACILITY_SCOPE
@@ -256,7 +271,7 @@ def end_facility(self):
self._require_scope(self.FACILITY_SCOPE)
self.indent -= 1
self.scope = self.FACILITIES_SCOPE
- self._write_line('')
+ self._write_line("")
def add_activity(self, purpose):
self._require_scope(self.FACILITY_SCOPE)
@@ -270,10 +285,12 @@ class VehiclesWriter(XmlWriter):
def __init__(self, writer):
XmlWriter.__init__(self, writer)
- def start_vehicles(self, attributes = {}):
+ def start_vehicles(self, attributes={}):
self._require_scope(None)
self._write_line('')
- self._write_line('')
+ self._write_line(
+ ''
+ )
self.scope = self.VEHICLES_SCOPE
self.indent += 1
@@ -283,40 +300,58 @@ def start_vehicles(self, attributes = {}):
def end_vehicles(self):
self._require_scope(self.VEHICLES_SCOPE)
self.indent -= 1
- self._write_line('')
+ self._write_line("")
self.scope = self.FINISHED_SCOPE
- def add_type(self, vehicle_type_id, nb_seats = 4, length = 5.0, width = 1.0, pce = 1.0, mode = "car", attributes = {}, engine_attributes = {}):
+ def add_type(
+ self,
+ vehicle_type_id,
+ nb_seats=4,
+ length=5.0,
+ width=1.0,
+ pce=1.0,
+ mode="car",
+ attributes={},
+ engine_attributes={},
+ ):
self._require_scope(self.VEHICLES_SCOPE)
self._write_line('' % str(vehicle_type_id))
self.indent += 1
if len(attributes) > 0:
- self._write_line('')
+ self._write_line("")
self.indent += 1
for key, item in attributes.items():
- self._write_line('%s' % (key, escape(item)))
+ self._write_line(
+ '%s'
+ % (key, escape(item))
+ )
self.indent -= 1
- self._write_line('')
+ self._write_line("")
if not np.isnan(nb_seats):
- self._write_line('' % nb_seats)
+ self._write_line(
+ '' % nb_seats
+ )
self._write_line('' % length)
self._write_line('' % width)
if len(engine_attributes) > 0:
- self._write_line('')
+ self._write_line("")
self.indent += 1
- self._write_line('')
+ self._write_line("")
self.indent += 1
for key, item in engine_attributes.items():
- self._write_line('%s' % (key, escape(item)))
+ self._write_line(
+ '%s'
+ % (key, escape(item))
+ )
self.indent -= 1
- self._write_line('')
+ self._write_line("")
self.indent -= 1
- self._write_line('')
+ self._write_line("")
if not np.isnan(pce):
self._write_line('' % pce)
@@ -324,29 +359,35 @@ def add_type(self, vehicle_type_id, nb_seats = 4, length = 5.0, width = 1.0, pce
self._write_line('' % mode)
self.indent -= 1
- self._write_line('')
-
+ self._write_line("")
- def add_vehicle(self, vehicle_id, type_id, attributes = {}):
+ def add_vehicle(self, vehicle_id, type_id, attributes={}):
self._require_scope(self.VEHICLES_SCOPE)
if len(attributes) > 0:
- self._write_line('' % (str(vehicle_id), str(type_id)))
+ self._write_line(
+ '' % (str(vehicle_id), str(type_id))
+ )
self.indent += 1
- self._write_line('')
+ self._write_line("")
self.indent += 1
for key, item in attributes.items():
- self._write_line('%s' % (str(key), str(item)))
+ self._write_line(
+ '%s'
+ % (str(key), str(item))
+ )
self.indent -= 1
- self._write_line('')
+ self._write_line("")
self.indent -= 1
- self._write_line('')
+ self._write_line("")
else:
- self._write_line('' % (str(vehicle_id), str(type_id)))
+ self._write_line(
+ '' % (str(vehicle_id), str(type_id))
+ )
class backlog_iterator:
- def __init__(self, iterable, backlog = 1):
+ def __init__(self, iterable, backlog=1):
self.iterable = iterable
self.forward_log = []
self.backward_log = [None] * (backlog + 1)
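
Note (not part of the patch): the writer classes above enforce their element nesting through the scope checks. A minimal usage sketch of PopulationWriter, mirroring how matsim/scenario/population.py drives it (the output path is a placeholder):

import gzip
import io

import matsim.writers as writers

with gzip.open("example_population.xml.gz", "wb+") as raw:
    with io.BufferedWriter(raw) as buffered:
        writer = writers.PopulationWriter(buffered)

        writer.start_population()
        writer.start_person("p1")

        writer.start_attributes()
        writer.add_attribute("age", "java.lang.Integer", 30)
        writer.end_attributes()

        writer.start_plan(selected=True)
        # a real plan alternates add_activity(...) and add_leg(...) calls here
        writer.end_plan()

        writer.end_person()
        writer.end_population()
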
diff --git a/scripts/verify_data.py b/scripts/verify_data.py
index 93b77d4f..55a37b2f 100644
--- a/scripts/verify_data.py
+++ b/scripts/verify_data.py
@@ -1,34 +1,43 @@
import requests
import time
-# The goal of this script is to verify the availability of the data
+# The goal of this script is to verify the availability of the data
# that is needed to set up the pipeline
-sleep_time = 5 # seconds
-timeout = 30 # seconds
+sleep_time = 5 # seconds
+timeout = 30 # seconds
retries = 3
+
class Report:
def __init__(self):
self.sources = []
def register(self, name, url):
- self.sources.append({ "name": name, "url": url })
+ self.sources.append({"name": name, "url": url})
def validate(self):
failed = []
with requests.Session() as session:
- session.headers.update({ "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:130.0) Gecko/20100101 Firefox/130.0" })
+ session.headers.update(
+ {
+ "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:130.0) Gecko/20100101 Firefox/130.0"
+ }
+ )
for index, source in enumerate(self.sources):
- print("[{}/{}] Checking {} ...".format(index + 1, len(self.sources), source["name"]))
-
+ print(
+ "[{}/{}] Checking {} ...".format(
+ index + 1, len(self.sources), source["name"]
+ )
+ )
+
retry = 0
success = False
while not success and retry < retries:
try:
- response = session.head(source["url"], timeout = timeout)
+ response = session.head(source["url"], timeout=timeout)
source["status"] = response.status_code
success = True
except TimeoutError:
@@ -38,54 +47,59 @@ def validate(self):
print(e)
retry += 1
- print(" Status {} (retry {}/{})".format(source["status"], retry, retries))
-
+ print(
+ " Status {} (retry {}/{})".format(
+ source["status"], retry, retries
+ )
+ )
+
time.sleep(sleep_time)
if source["status"] != 200:
failed.append(source["name"])
-
+
print("Done.")
print("Missing: ", len(failed))
print(failed)
return len(failed) == 0
+
report = Report()
report.register(
"Census data (RP 2019)",
- "https://www.insee.fr/fr/statistiques/fichier/6544333/RP2019_INDCVI_csv.zip"
+ "https://www.insee.fr/fr/statistiques/fichier/6544333/RP2019_INDCVI_csv.zip",
)
report.register(
"Population totals (RP 2019)",
- "https://www.insee.fr/fr/statistiques/fichier/6543200/base-ic-evol-struct-pop-2019.zip"
+ "https://www.insee.fr/fr/statistiques/fichier/6543200/base-ic-evol-struct-pop-2019.zip",
)
report.register(
"Origin-destination data (RP-MOBPRO 2019)",
- "https://www.insee.fr/fr/statistiques/fichier/6456056/RP2019_mobpro_csv.zip"
+ "https://www.insee.fr/fr/statistiques/fichier/6456056/RP2019_mobpro_csv.zip",
)
report.register(
"Origin-destination data (RP-MOBSCO 2019)",
- "https://www.insee.fr/fr/statistiques/fichier/6456052/RP2019_mobsco_csv.zip"
+ "https://www.insee.fr/fr/statistiques/fichier/6456052/RP2019_mobsco_csv.zip",
)
report.register(
"Income tax data (Filosofi 2019), municipalities",
- "https://www.insee.fr/fr/statistiques/fichier/6036907/indic-struct-distrib-revenu-2019-COMMUNES.zip"
+ "https://www.insee.fr/fr/statistiques/fichier/6036907/indic-struct-distrib-revenu-2019-COMMUNES.zip",
)
report.register(
"Income tax data (Filosofi 2019), administrative",
- "https://www.insee.fr/fr/statistiques/fichier/6036907/indic-struct-distrib-revenu-2019-SUPRA.zip"
+ "https://www.insee.fr/fr/statistiques/fichier/6036907/indic-struct-distrib-revenu-2019-SUPRA.zip",
)
report.register(
"Service and facility census (BPE 2021)",
- "https://www.insee.fr/fr/statistiques/fichier/3568638/bpe21_ensemble_xy_csv.zip"
+ "https://www.insee.fr/fr/statistiques/fichier/3568638/bpe21_ensemble_xy_csv.zip",
)
entd_sources = [
@@ -94,60 +108,66 @@ def validate(self):
(2556, "Q_menage"),
(2565, "Q_individu"),
(2566, "Q_ind_lieu_teg"),
- (2568, "K_deploc")
+ (2568, "K_deploc"),
]
for identifier, name in entd_sources:
report.register(
"National household travel survey (ENTD 2008), {}".format(name),
- "https://www.statistiques.developpement-durable.gouv.fr/media/{}/download?inline".format(identifier)
+ "https://www.statistiques.developpement-durable.gouv.fr/media/{}/download?inline".format(
+ identifier
+ ),
)
report.register(
"IRIS zoning system (2021)",
- "https://data.geopf.fr/telechargement/download/CONTOURS-IRIS/CONTOURS-IRIS_2-1__SHP__FRA_2021-01-01/CONTOURS-IRIS_2-1__SHP__FRA_2021-01-01.7z"
+ "https://data.geopf.fr/telechargement/download/CONTOURS-IRIS/CONTOURS-IRIS_2-1__SHP__FRA_2021-01-01/CONTOURS-IRIS_2-1__SHP__FRA_2021-01-01.7z",
)
report.register(
"Zoning registry (2021)",
- "https://www.insee.fr/fr/statistiques/fichier/7708995/reference_IRIS_geo2021.zip"
+ "https://www.insee.fr/fr/statistiques/fichier/7708995/reference_IRIS_geo2021.zip",
)
report.register(
"Enterprise census (SIRENE), Etablissement",
- "https://files.data.gouv.fr/insee-sirene/StockEtablissement_utf8.zip"
+ "https://files.data.gouv.fr/insee-sirene/StockEtablissement_utf8.zip",
)
report.register(
"Enterprise census (SIRENE), Unité Legale",
- "https://files.data.gouv.fr/insee-sirene/StockUniteLegale_utf8.zip"
+ "https://files.data.gouv.fr/insee-sirene/StockUniteLegale_utf8.zip",
)
report.register(
"Enterprise census (SIRENE), Géolocalisé",
- "https://files.data.gouv.fr/insee-sirene-geo/GeolocalisationEtablissement_Sirene_pour_etudes_statistiques_utf8.zip"
+ "https://files.data.gouv.fr/insee-sirene-geo/GeolocalisationEtablissement_Sirene_pour_etudes_statistiques_utf8.zip",
)
for department in (75, 77, 78, 91, 92, 93, 94, 95):
report.register(
"Buildings database (BD TOPO), {}".format(department),
- "https://data.geopf.fr/telechargement/download/BDTOPO/BDTOPO_3-0_TOUSTHEMES_GPKG_LAMB93_D0{}_2022-03-15/BDTOPO_3-0_TOUSTHEMES_GPKG_LAMB93_D0{}_2022-03-15.7z".format(department, department)
+ "https://data.geopf.fr/telechargement/download/BDTOPO/BDTOPO_3-0_TOUSTHEMES_GPKG_LAMB93_D0{}_2022-03-15/BDTOPO_3-0_TOUSTHEMES_GPKG_LAMB93_D0{}_2022-03-15.7z".format(
+ department, department
+ ),
)
for department in (75, 77, 78, 91, 92, 93, 94, 95):
report.register(
"Adresses database (BAN), {}".format(department),
- "https://adresse.data.gouv.fr/data/ban/adresses/latest/csv/adresses-{}.csv.gz".format(department)
+ "https://adresse.data.gouv.fr/data/ban/adresses/latest/csv/adresses-{}.csv.gz".format(
+ department
+ ),
)
report.register(
"Population projections",
- "https://www.insee.fr/fr/statistiques/fichier/5894093/00_central.xlsx"
+ "https://www.insee.fr/fr/statistiques/fichier/5894093/00_central.xlsx",
)
report.register(
"Urban type",
- "https://www.insee.fr/fr/statistiques/fichier/4802589/UU2020_au_01-01-2023.zip"
+ "https://www.insee.fr/fr/statistiques/fichier/4802589/UU2020_au_01-01-2023.zip",
)
exit(0 if report.validate() else 1)
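
# For reference, a minimal, self-contained sketch of the availability-check
# pattern used in scripts/verify_data.py above: open a session with a browser
# User-Agent, issue a HEAD request with a timeout, and retry a few times.
# The URL below and the choice to catch requests.exceptions.RequestException
# (the common base class of requests' timeout and connection errors) are
# illustrative assumptions, not part of the original script.
import requests

def check_url(url, timeout=30, retries=3):
    with requests.Session() as session:
        session.headers.update({"User-Agent": "Mozilla/5.0"})
        for attempt in range(retries):
            try:
                # A HEAD request is enough to learn the status code without
                # downloading the (potentially large) archive.
                return session.head(url, timeout=timeout).status_code
            except requests.exceptions.RequestException as error:
                print("Attempt {}/{} failed: {}".format(attempt + 1, retries, error))
    return None

if __name__ == "__main__":
    print(check_url("https://www.insee.fr"))
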
diff --git a/synthesis/locations/education.py b/synthesis/locations/education.py
index 45a32a70..3a0998b6 100644
--- a/synthesis/locations/education.py
+++ b/synthesis/locations/education.py
@@ -3,13 +3,15 @@
import pandas as pd
import geopandas as gpd
+
def configure(context):
context.stage("data.spatial.municipalities")
- if context.config("education_location_source","bpe") == "addresses":
- context.stage("data.external.education", alias = "location_source")
+ if context.config("education_location_source", "bpe") == "addresses":
+ context.stage("data.external.education", alias="location_source")
else:
- context.stage("data.bpe.cleaned", alias = "location_source")
+ context.stage("data.bpe.cleaned", alias="location_source")
+
EDUCATION_WEIGHT_MAP = [
("C101", 100), # Preschools
@@ -25,6 +27,7 @@ def configure(context):
("C501", 2000), # University
]
+
def fake_education(missing_communes, c, df_locations, df_zones):
# Fake education destinations as the centroid of zones that have no other destinations
print(
@@ -50,49 +53,80 @@ def fake_education(missing_communes, c, df_locations, df_zones):
return df_added
+
def execute(context):
df_locations = context.stage("location_source")
df_locations = df_locations[df_locations["activity_type"] == "education"]
- df_locations = df_locations[["education_type", "commune_id","weight", "geometry"]].copy()
+ df_locations = df_locations[
+ ["education_type", "commune_id", "weight", "geometry"]
+ ].copy()
df_locations["fake"] = False
# Add education destinations to the centroid of zones that have no other destinations
df_zones = context.stage("data.spatial.municipalities")
- required_communes = set(df_zones["commune_id"].unique())
-
- if context.config("education_location_source") != 'bpe': # either weighted or addresses
+ required_communes = set(df_zones["commune_id"].unique())
+
+ if (
+ context.config("education_location_source") != "bpe"
+ ): # either weighted or addresses
for prefix, weight in EDUCATION_WEIGHT_MAP:
- df_locations.loc[df_locations["education_type"]==prefix, "weight"] = (
+ df_locations.loc[df_locations["education_type"] == prefix, "weight"] = (
weight
- )
- if context.config("education_location_source") != 'bpe' :
+ )
+ if context.config("education_location_source") != "bpe":
-
# Add education destinations in function of level education
for c in ["C1", "C2", "C3"]:
- missing_communes = required_communes - set(df_locations[df_locations["education_type"].str.startswith(c)]["commune_id"].unique())
+ missing_communes = required_communes - set(
+ df_locations[df_locations["education_type"].str.startswith(c)][
+ "commune_id"
+ ].unique()
+ )
if len(missing_communes) > 0:
- df_locations = pd.concat([df_locations,fake_education(missing_communes, c, df_locations, df_zones)])
-
+ df_locations = pd.concat(
+ [
+ df_locations,
+ fake_education(missing_communes, c, df_locations, df_zones),
+ ]
+ )
+
# Add education destinations for last level education
- missing_communes = required_communes - set(df_locations[~(df_locations["education_type"].str.startswith(("C1", "C2", "C3")))]["commune_id"].unique())
+ missing_communes = required_communes - set(
+ df_locations[
+ ~(df_locations["education_type"].str.startswith(("C1", "C2", "C3")))
+ ]["commune_id"].unique()
+ )
if len(missing_communes) > 0:
- df_locations = pd.concat([df_locations,fake_education(missing_communes, "C4", df_locations, df_zones)])
- else :
+ df_locations = pd.concat(
+ [
+ df_locations,
+ fake_education(missing_communes, "C4", df_locations, df_zones),
+ ]
+ )
+ else:
missing_communes = required_communes - set(df_locations["commune_id"].unique())
if len(missing_communes) > 0:
- df_locations = pd.concat([df_locations,fake_education(missing_communes, "C0", df_locations, df_zones)])
- df_locations["education_type"] = df_locations["education_type"].str[:2].astype("category")
+ df_locations = pd.concat(
+ [
+ df_locations,
+ fake_education(missing_communes, "C0", df_locations, df_zones),
+ ]
+ )
+ df_locations["education_type"] = (
+ df_locations["education_type"].str[:2].astype("category")
+ )
# Define identifiers
- df_locations["location_id"]= np.arange(len(df_locations))
+ df_locations["location_id"] = np.arange(len(df_locations))
df_locations["location_id"] = "edu_" + df_locations["location_id"].astype(str)
-
- return df_locations[["location_id","education_type", "commune_id","weight","fake", "geometry"]]
+
+ return df_locations[
+ ["location_id", "education_type", "commune_id", "weight", "fake", "geometry"]
+ ]
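
# A minimal sketch of the centroid-fallback idea applied in
# synthesis/locations/education.py above: every required commune must end up
# with at least one education destination, so communes missing from the
# location table receive a "fake" row placed at the zone centroid.
# The toy zones, commune identifiers, and weights are illustrative assumptions.
import geopandas as gpd
import pandas as pd
from shapely.geometry import Point, Polygon

df_zones = gpd.GeoDataFrame(
    {"commune_id": ["75101", "75102"]},
    geometry=[
        Polygon([(0, 0), (1, 0), (1, 1), (0, 1)]),
        Polygon([(2, 0), (3, 0), (3, 1), (2, 1)]),
    ],
    crs="EPSG:2154",
)

df_locations = gpd.GeoDataFrame(
    {"commune_id": ["75101"], "weight": [100], "fake": [False]},
    geometry=[Point(0.5, 0.5)],
    crs="EPSG:2154",
)

# Communes without any destination get one location at their zone centroid
missing = set(df_zones["commune_id"]) - set(df_locations["commune_id"])
df_added = df_zones[df_zones["commune_id"].isin(missing)].copy()
df_added["geometry"] = df_added.geometry.centroid
df_added["weight"] = 1
df_added["fake"] = True

df_locations = pd.concat([df_locations, df_added], ignore_index=True)
print(df_locations)
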
diff --git a/synthesis/locations/home/addresses.py b/synthesis/locations/home/addresses.py
index 01410a13..34a29455 100644
--- a/synthesis/locations/home/addresses.py
+++ b/synthesis/locations/home/addresses.py
@@ -18,57 +18,65 @@
If no address matches a building, its centroid is taken as the unique address.
"""
+
def configure(context):
context.stage("data.bdtopo.raw")
-
+
context.config("home_address_buffer", 5.0)
context.config("home_location_weight", "housing")
if context.config("home_location_source", "addresses") == "addresses":
context.stage("data.ban.raw")
+
def execute(context):
# Load buildings
df_buildings = context.stage("data.bdtopo.raw")
- print("Number of buildings:", + len(df_buildings))
+ print("Number of buildings:", +len(df_buildings))
if context.config("home_location_source") == "buildings":
- df_addresses = pd.DataFrame({
- "building_id": [], "housing": [], "geometry": []
- })
+ df_addresses = pd.DataFrame({"building_id": [], "housing": [], "geometry": []})
- else: # addresses
+ else: # addresses
# Load addresses
df_addresses = context.stage("data.ban.raw")[["geometry"]].copy()
- print("Number of addresses:", + len(df_addresses))
+ print("Number of addresses:", +len(df_addresses))
# Buffer buildings to capture addresses in their vicinity
df_buffer = df_buildings[["building_id", "housing", "geometry"]].copy()
df_buffer["geometry"] = df_buffer.buffer(context.config("home_address_buffer"))
# Find close-by addresses
- df_addresses = gpd.sjoin(df_addresses, df_buffer, predicate = "within")[[
- "building_id", "housing", "geometry"]]
-
+ df_addresses = gpd.sjoin(df_addresses, df_buffer, predicate="within")[
+ ["building_id", "housing", "geometry"]
+ ]
+
# Create missing addresses by using centroids
- df_missing = df_buildings[~df_buildings["building_id"].isin(df_addresses["building_id"])].copy()
+ df_missing = df_buildings[
+ ~df_buildings["building_id"].isin(df_addresses["building_id"])
+ ].copy()
df_missing["geometry"] = df_missing["geometry"].centroid
df_missing = df_missing[["building_id", "housing", "geometry"]]
# Put together matched and missing addresses
df_addresses = pd.concat([df_addresses, df_missing])
- df_addresses = gpd.GeoDataFrame(df_addresses, crs = df_buildings.crs).rename(columns={"building_id":"home_location_id"})
+ df_addresses = gpd.GeoDataFrame(df_addresses, crs=df_buildings.crs).rename(
+ columns={"building_id": "home_location_id"}
+ )
# Obtain weights for all addresses
if context.config("home_location_weight") == "housing":
- df_count = df_addresses.groupby("home_location_id").size().reset_index(name = "count")
- df_addresses = pd.merge(df_addresses, df_count, on = "home_location_id")
+ df_count = (
+ df_addresses.groupby("home_location_id").size().reset_index(name="count")
+ )
+ df_addresses = pd.merge(df_addresses, df_count, on="home_location_id")
df_addresses["weight"] = df_addresses["housing"] / df_addresses["count"]
else:
df_addresses["weight"] = 1.0
-
+
return df_addresses[["home_location_id", "weight", "geometry"]]
+
def validate(context):
- assert context.config("home_location_source") in ("addresses", "buildings","tiles")
+ assert context.config("home_location_source") in ("addresses", "buildings", "tiles")
assert context.config("home_location_weight") in ("uniform", "housing")
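
# A minimal sketch of the buffer-and-join step from
# synthesis/locations/home/addresses.py above: buildings are buffered by a few
# meters and addresses falling inside a buffer are attached to that building.
# The toy geometries and the 5 m buffer value are illustrative assumptions.
import geopandas as gpd
from shapely.geometry import Point, Polygon

df_buildings = gpd.GeoDataFrame(
    {"building_id": [1], "housing": [4]},
    geometry=[Polygon([(0, 0), (10, 0), (10, 10), (0, 10)])],
    crs="EPSG:2154",
)

df_addresses = gpd.GeoDataFrame(
    geometry=[Point(5, 5), Point(30, 5)],  # second point lies outside the buffer
    crs="EPSG:2154",
)

df_buffer = df_buildings.copy()
df_buffer["geometry"] = df_buffer.buffer(5.0)

# "within" keeps only addresses that fall inside a buffered building footprint
df_matched = gpd.sjoin(df_addresses, df_buffer, predicate="within")
print(df_matched[["building_id", "housing", "geometry"]])
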
diff --git a/synthesis/locations/home/locations.py b/synthesis/locations/home/locations.py
index 391748ec..40b012d2 100644
--- a/synthesis/locations/home/locations.py
+++ b/synthesis/locations/home/locations.py
@@ -7,27 +7,30 @@
home activities.
"""
+
def configure(context):
context.stage("data.spatial.iris")
if context.config("home_location_source", "addresses") == "tiles":
- context.stage("data.tiles.raw", alias = "location_source")
+ context.stage("data.tiles.raw", alias="location_source")
else:
- context.stage("synthesis.locations.home.addresses", alias = "location_source")
+ context.stage("synthesis.locations.home.addresses", alias="location_source")
+
def execute(context):
# Find required IRIS
df_iris = context.stage("data.spatial.iris")
required_iris = set(df_iris["iris_id"].unique())
-
+
# Load all addresses and add IRIS information
df_addresses = context.stage("location_source")
print("Imputing IRIS into addresses ...")
-
- df_addresses = gpd.sjoin(df_addresses,
- df_iris[["iris_id", "commune_id", "geometry"]], predicate = "within")
+
+ df_addresses = gpd.sjoin(
+ df_addresses, df_iris[["iris_id", "commune_id", "geometry"]], predicate="within"
+ )
del df_addresses["index_right"]
-
+
df_addresses.loc[df_addresses["iris_id"].isna(), "iris_id"] = "unknown"
df_addresses["iris_id"] = df_addresses["iris_id"].astype("category")
@@ -37,21 +40,30 @@ def execute(context):
missing_iris = required_iris - set(df_addresses["iris_id"].unique())
if len(missing_iris) > 0:
- print("Adding homes at the centroid of %d/%d IRIS without BDTOPO observations" % (
- len(missing_iris), len(required_iris)))
+ print(
+ "Adding homes at the centroid of %d/%d IRIS without BDTOPO observations"
+ % (len(missing_iris), len(required_iris))
+ )
df_added = []
for iris_id in sorted(missing_iris):
- centroid = df_iris[df_iris["iris_id"] == iris_id]["geometry"].centroid.iloc[0]
+ centroid = df_iris[df_iris["iris_id"] == iris_id]["geometry"].centroid.iloc[
+ 0
+ ]
- df_added.append({
- "iris_id": iris_id, "geometry": centroid,
- "commune_id": iris_id[:5],
- "weight" : 1,
- "home_location_id": -1
- })
+ df_added.append(
+ {
+ "iris_id": iris_id,
+ "geometry": centroid,
+ "commune_id": iris_id[:5],
+ "weight": 1,
+ "home_location_id": -1,
+ }
+ )
- df_added = gpd.GeoDataFrame(pd.DataFrame.from_records(df_added), crs = df_addresses.crs)
+ df_added = gpd.GeoDataFrame(
+ pd.DataFrame.from_records(df_added), crs=df_addresses.crs
+ )
df_added["fake"] = True
df_addresses = pd.concat([df_addresses, df_added])
diff --git a/synthesis/locations/home/output.py b/synthesis/locations/home/output.py
index 54c33ec6..926170bd 100644
--- a/synthesis/locations/home/output.py
+++ b/synthesis/locations/home/output.py
@@ -1,5 +1,6 @@
import geopandas as gpd
+
def configure(context):
context.config("output_path")
context.config("output_prefix", "ile_de_france_")
@@ -7,19 +8,26 @@ def configure(context):
context.stage("data.bdtopo.raw")
context.stage("synthesis.locations.home.locations")
+
def execute(context):
# Load data
- df_buildings = context.stage("data.bdtopo.raw")[[
- "building_id", "housing", "geometry"]]
-
- df_locations = context.stage("synthesis.locations.home.locations")[[
- "location_id", "weight", "building_id", "geometry"]]
+ df_buildings = context.stage("data.bdtopo.raw")[
+ ["building_id", "housing", "geometry"]
+ ]
+
+ df_locations = context.stage("synthesis.locations.home.locations")[
+ ["location_id", "weight", "building_id", "geometry"]
+ ]
# Write into same file with multiple layers
- df_buildings.to_file("%s/%shousing.gpkg" % (
- context.config("output_path"), context.config("output_prefix")
- ), layer = "buildings")
+ df_buildings.to_file(
+ "%s/%shousing.gpkg"
+ % (context.config("output_path"), context.config("output_prefix")),
+ layer="buildings",
+ )
- df_locations.to_file("%s/%shousing.gpkg" % (
- context.config("output_path"), context.config("output_prefix")
- ), layer = "addresses")
+ df_locations.to_file(
+ "%s/%shousing.gpkg"
+ % (context.config("output_path"), context.config("output_prefix")),
+ layer="addresses",
+ )
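
# A minimal sketch of writing several layers into a single GeoPackage, as done
# in synthesis/locations/home/output.py above for "buildings" and "addresses".
# The file name and toy contents are illustrative assumptions.
import geopandas as gpd
from shapely.geometry import Point

df_buildings = gpd.GeoDataFrame({"building_id": [1]}, geometry=[Point(0, 0)], crs="EPSG:2154")
df_addresses = gpd.GeoDataFrame({"home_location_id": [1]}, geometry=[Point(0, 0)], crs="EPSG:2154")

# Both calls target the same .gpkg file; the layer argument keeps them separate
df_buildings.to_file("housing.gpkg", layer="buildings", driver="GPKG")
df_addresses.to_file("housing.gpkg", layer="addresses", driver="GPKG")
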
diff --git a/synthesis/locations/secondary.py b/synthesis/locations/secondary.py
index c5446359..40bef2c1 100644
--- a/synthesis/locations/secondary.py
+++ b/synthesis/locations/secondary.py
@@ -3,20 +3,24 @@
import pandas as pd
import geopandas as gpd
+
def configure(context):
context.stage("data.bpe.cleaned")
context.stage("data.spatial.municipalities")
+
def execute(context):
- df_locations = context.stage("data.bpe.cleaned")[[
- "enterprise_id", "activity_type", "commune_id", "geometry"
- ]].copy()
+ df_locations = context.stage("data.bpe.cleaned")[
+ ["enterprise_id", "activity_type", "commune_id", "geometry"]
+ ].copy()
df_locations["destination_id"] = np.arange(len(df_locations))
# Attach attributes for activity types
df_locations["offers_leisure"] = df_locations["activity_type"] == "leisure"
df_locations["offers_shop"] = df_locations["activity_type"] == "shop"
- df_locations["offers_other"] = ~(df_locations["offers_leisure"] | df_locations["offers_shop"])
+ df_locations["offers_other"] = ~(
+ df_locations["offers_leisure"] | df_locations["offers_shop"]
+ )
# Define new IDs
df_locations["location_id"] = np.arange(len(df_locations))
diff --git a/synthesis/locations/work.py b/synthesis/locations/work.py
index 0fc9bcee..c4178244 100644
--- a/synthesis/locations/work.py
+++ b/synthesis/locations/work.py
@@ -11,14 +11,16 @@
place at their centroid to be in line with INSEE OD data.
"""
+
def configure(context):
context.stage("data.sirene.localized")
context.stage("data.spatial.municipalities")
+
def execute(context):
- df_workplaces = context.stage("data.sirene.localized")[[
- "commune_id", "minimum_employees", "maximum_employees", "geometry"
- ]].copy()
+ df_workplaces = context.stage("data.sirene.localized")[
+ ["commune_id", "minimum_employees", "maximum_employees", "geometry"]
+ ].copy()
# Use minimum number of employees as weight
df_workplaces["employees"] = df_workplaces["minimum_employees"]
@@ -30,19 +32,29 @@ def execute(context):
missing_communes = required_communes - set(df_workplaces["commune_id"].unique())
if len(missing_communes) > 0:
- print("Adding work places at the centroid of %d/%d communes without SIRENE observations" % (
- len(missing_communes), len(required_communes)))
+ print(
+ "Adding work places at the centroid of %d/%d communes without SIRENE observations"
+ % (len(missing_communes), len(required_communes))
+ )
df_added = []
for commune_id in missing_communes:
- centroid = df_zones[df_zones["commune_id"] == commune_id]["geometry"].centroid.iloc[0]
+ centroid = df_zones[df_zones["commune_id"] == commune_id][
+ "geometry"
+ ].centroid.iloc[0]
- df_added.append({
- "commune_id": commune_id, "employees": 1.0, "geometry": centroid,
- })
+ df_added.append(
+ {
+ "commune_id": commune_id,
+ "employees": 1.0,
+ "geometry": centroid,
+ }
+ )
- df_added = gpd.GeoDataFrame(pd.DataFrame.from_records(df_added), crs = df_workplaces.crs)
+ df_added = gpd.GeoDataFrame(
+ pd.DataFrame.from_records(df_added), crs=df_workplaces.crs
+ )
df_added["fake"] = True
df_workplaces = pd.concat([df_workplaces, df_added])
diff --git a/synthesis/output.py b/synthesis/output.py
index 84c52a36..eeea93fc 100644
--- a/synthesis/output.py
+++ b/synthesis/output.py
@@ -7,6 +7,7 @@
import math
import numpy as np
+
def configure(context):
context.stage("synthesis.population.enriched")
@@ -22,7 +23,7 @@ def configure(context):
context.config("output_path")
context.config("output_prefix", "ile_de_france_")
context.config("output_formats", ["csv", "gpkg"])
-
+
if context.config("mode_choice", False):
context.stage("matsim.simulation.prepare")
@@ -33,8 +34,9 @@ def validate(context):
if not os.path.isdir(output_path):
raise RuntimeError("Output directory must exist: %s" % output_path)
+
def clean_gpkg(path):
- '''
+ """
Make GPKG files time and OS independent.
In GeoPackage metadata:
@@ -42,21 +44,28 @@ def clean_gpkg(path):
- round coordinates.
This allows for comparison of output digests between runs and across operating systems.
- '''
+ """
conn = sqlite3.connect(path)
cur = conn.cursor()
for table_name, min_x, min_y, max_x, max_y in cur.execute(
"SELECT table_name, min_x, min_y, max_x, max_y FROM gpkg_contents"
):
cur.execute(
- "UPDATE gpkg_contents " +
- "SET last_change='2000-01-01T00:00:00Z', min_x=?, min_y=?, max_x=?, max_y=? " +
- "WHERE table_name=?",
- (math.floor(min_x), math.floor(min_y), math.ceil(max_x), math.ceil(max_y), table_name)
+ "UPDATE gpkg_contents "
+ + "SET last_change='2000-01-01T00:00:00Z', min_x=?, min_y=?, max_x=?, max_y=? "
+ + "WHERE table_name=?",
+ (
+ math.floor(min_x),
+ math.floor(min_y),
+ math.ceil(max_x),
+ math.ceil(max_y),
+ table_name,
+ ),
)
conn.commit()
conn.close()
+
def execute(context):
output_path = context.config("output_path")
output_prefix = context.config("output_prefix")
@@ -64,121 +73,237 @@ def execute(context):
# Prepare persons
df_persons = context.stage("synthesis.population.enriched").rename(
- columns = { "has_license": "has_driving_license" }
+ columns={"has_license": "has_driving_license"}
)
- df_persons = df_persons[[
- "person_id", "household_id",
- "age", "employed", "sex", "socioprofessional_class",
- "has_driving_license", "has_pt_subscription",
- "census_person_id", "hts_id"
- ]]
+ df_persons = df_persons[
+ [
+ "person_id",
+ "household_id",
+ "age",
+ "employed",
+ "sex",
+ "socioprofessional_class",
+ "has_driving_license",
+ "has_pt_subscription",
+ "census_person_id",
+ "hts_id",
+ ]
+ ]
if "csv" in output_formats:
- df_persons.to_csv("%s/%spersons.csv" % (output_path, output_prefix), sep = ";", index = None, lineterminator = "\n")
+ df_persons.to_csv(
+ "%s/%spersons.csv" % (output_path, output_prefix),
+ sep=";",
+ index=None,
+ lineterminator="\n",
+ )
if "parquet" in output_formats:
df_persons.to_parquet("%s/%spersons.parquet" % (output_path, output_prefix))
# Prepare activities
df_activities = context.stage("synthesis.population.activities").rename(
- columns = { "trip_index": "following_trip_index" }
+ columns={"trip_index": "following_trip_index"}
)
df_activities = pd.merge(
- df_activities, df_persons[["person_id", "household_id"]], on = "person_id")
+ df_activities, df_persons[["person_id", "household_id"]], on="person_id"
+ )
- df_activities["preceding_trip_index"] = df_activities["following_trip_index"].shift(1)
+ df_activities["preceding_trip_index"] = df_activities["following_trip_index"].shift(
+ 1
+ )
df_activities.loc[df_activities["is_first"], "preceding_trip_index"] = -1
- df_activities["preceding_trip_index"] = df_activities["preceding_trip_index"].astype(int)
+ df_activities["preceding_trip_index"] = df_activities[
+ "preceding_trip_index"
+ ].astype(int)
# Prepare spatial data sets
- df_locations = context.stage("synthesis.population.spatial.locations")[[
- "person_id", "iris_id", "commune_id","departement_id","region_id","activity_index", "geometry"
- ]]
+ df_locations = context.stage("synthesis.population.spatial.locations")[
+ [
+ "person_id",
+ "iris_id",
+ "commune_id",
+ "departement_id",
+ "region_id",
+ "activity_index",
+ "geometry",
+ ]
+ ]
- df_activities = pd.merge(df_activities, df_locations[[
- "person_id", "iris_id", "commune_id","departement_id","region_id","activity_index", "geometry"
- ]], how = "left", on = ["person_id", "activity_index"])
+ df_activities = pd.merge(
+ df_activities,
+ df_locations[
+ [
+ "person_id",
+ "iris_id",
+ "commune_id",
+ "departement_id",
+ "region_id",
+ "activity_index",
+ "geometry",
+ ]
+ ],
+ how="left",
+ on=["person_id", "activity_index"],
+ )
# Prepare spatial activities
- df_spatial = gpd.GeoDataFrame(df_activities[[
- "person_id", "household_id", "activity_index",
- "iris_id", "commune_id","departement_id","region_id",
- "preceding_trip_index", "following_trip_index",
- "purpose", "start_time", "end_time",
- "is_first", "is_last", "geometry"
- ]], crs = df_locations.crs)
- df_spatial = df_spatial.astype({'purpose': 'str', "departement_id": 'str'})
+ df_spatial = gpd.GeoDataFrame(
+ df_activities[
+ [
+ "person_id",
+ "household_id",
+ "activity_index",
+ "iris_id",
+ "commune_id",
+ "departement_id",
+ "region_id",
+ "preceding_trip_index",
+ "following_trip_index",
+ "purpose",
+ "start_time",
+ "end_time",
+ "is_first",
+ "is_last",
+ "geometry",
+ ]
+ ],
+ crs=df_locations.crs,
+ )
+ df_spatial = df_spatial.astype({"purpose": "str", "departement_id": "str"})
# Write activities
- df_activities = df_activities[[
- "person_id", "household_id", "activity_index",
- "iris_id", "commune_id","departement_id","region_id",
- "preceding_trip_index", "following_trip_index",
- "purpose", "start_time", "end_time",
- "is_first", "is_last"
- ]]
+ df_activities = df_activities[
+ [
+ "person_id",
+ "household_id",
+ "activity_index",
+ "iris_id",
+ "commune_id",
+ "departement_id",
+ "region_id",
+ "preceding_trip_index",
+ "following_trip_index",
+ "purpose",
+ "start_time",
+ "end_time",
+ "is_first",
+ "is_last",
+ ]
+ ]
if "csv" in output_formats:
- df_activities.to_csv("%s/%sactivities.csv" % (output_path, output_prefix), sep = ";", index = None, lineterminator = "\n")
+ df_activities.to_csv(
+ "%s/%sactivities.csv" % (output_path, output_prefix),
+ sep=";",
+ index=None,
+ lineterminator="\n",
+ )
if "parquet" in output_formats:
- df_activities.to_parquet("%s/%sactivities.parquet" % (output_path, output_prefix))
+ df_activities.to_parquet(
+ "%s/%sactivities.parquet" % (output_path, output_prefix)
+ )
# Prepare households
- df_households = context.stage("synthesis.population.enriched").rename(
- columns = { "household_income": "income" }
- ).drop_duplicates("household_id")
-
- df_households = pd.merge(df_households,df_activities[df_activities["purpose"] == "home"][["household_id",
- "iris_id", "commune_id","departement_id","region_id"]].drop_duplicates("household_id"),how="left")
- df_households = df_households[[
- "household_id","iris_id", "commune_id", "departement_id","region_id",
- "car_availability", "bike_availability",
- "number_of_vehicles", "number_of_bikes",
- "income",
- "census_household_id"
- ]]
+ df_households = (
+ context.stage("synthesis.population.enriched")
+ .rename(columns={"household_income": "income"})
+ .drop_duplicates("household_id")
+ )
+
+ df_households = pd.merge(
+ df_households,
+ df_activities[df_activities["purpose"] == "home"][
+ ["household_id", "iris_id", "commune_id", "departement_id", "region_id"]
+ ].drop_duplicates("household_id"),
+ how="left",
+ )
+ df_households = df_households[
+ [
+ "household_id",
+ "iris_id",
+ "commune_id",
+ "departement_id",
+ "region_id",
+ "car_availability",
+ "bike_availability",
+ "number_of_vehicles",
+ "number_of_bikes",
+ "income",
+ "census_household_id",
+ ]
+ ]
if "csv" in output_formats:
- df_households.to_csv("%s/%shouseholds.csv" % (output_path, output_prefix), sep = ";", index = None, lineterminator = "\n")
+ df_households.to_csv(
+ "%s/%shouseholds.csv" % (output_path, output_prefix),
+ sep=";",
+ index=None,
+ lineterminator="\n",
+ )
if "parquet" in output_formats:
- df_households.to_parquet("%s/%shouseholds.parquet" % (output_path, output_prefix))
+ df_households.to_parquet(
+ "%s/%shouseholds.parquet" % (output_path, output_prefix)
+ )
# Prepare trips
df_trips = context.stage("synthesis.population.trips").rename(
- columns = {
- "is_first_trip": "is_first",
- "is_last_trip": "is_last"
- }
+ columns={"is_first_trip": "is_first", "is_last_trip": "is_last"}
)
df_trips["preceding_activity_index"] = df_trips["trip_index"]
df_trips["following_activity_index"] = df_trips["trip_index"] + 1
- df_trips = df_trips[[
- "person_id", "trip_index",
- "preceding_activity_index", "following_activity_index",
- "departure_time", "arrival_time",
- "preceding_purpose", "following_purpose",
- "is_first", "is_last"
- ]]
+ df_trips = df_trips[
+ [
+ "person_id",
+ "trip_index",
+ "preceding_activity_index",
+ "following_activity_index",
+ "departure_time",
+ "arrival_time",
+ "preceding_purpose",
+ "following_purpose",
+ "is_first",
+ "is_last",
+ ]
+ ]
if context.config("mode_choice"):
df_mode_choice = pd.read_csv(
- "{}/mode_choice/output_trips.csv".format(context.path("matsim.simulation.prepare"), output_prefix),
- delimiter = ";")
+ "{}/mode_choice/output_trips.csv".format(
+ context.path("matsim.simulation.prepare"), output_prefix
+ ),
+ delimiter=";",
+ )
df_mode_choice = df_mode_choice.rename(columns={"person_trip_id": "trip_index"})
columns_to_keep = ["person_id", "trip_index"]
- columns_to_keep.extend([c for c in df_trips.columns if c not in df_mode_choice.columns])
+ columns_to_keep.extend(
+ [c for c in df_trips.columns if c not in df_mode_choice.columns]
+ )
df_trips = df_trips[columns_to_keep]
- df_trips = pd.merge(df_trips, df_mode_choice, on = [
- "person_id", "trip_index"], how="left", validate = "one_to_one")
+ df_trips = pd.merge(
+ df_trips,
+ df_mode_choice,
+ on=["person_id", "trip_index"],
+ how="left",
+ validate="one_to_one",
+ )
- shutil.copy("%s/mode_choice/output_pt_legs.csv" % (context.path("matsim.simulation.prepare")),
- "%s/%spt_legs.csv" % (output_path, output_prefix))
+ shutil.copy(
+ "%s/mode_choice/output_pt_legs.csv"
+ % (context.path("matsim.simulation.prepare")),
+ "%s/%spt_legs.csv" % (output_path, output_prefix),
+ )
- assert not np.any(df_trips["mode"].isna())
+ assert not np.any(df_trips["mode"].isna())
if "csv" in output_formats:
- df_trips.to_csv("%s/%strips.csv" % (output_path, output_prefix), sep = ";", index = None, lineterminator = "\n")
+ df_trips.to_csv(
+ "%s/%strips.csv" % (output_path, output_prefix),
+ sep=";",
+ index=None,
+ lineterminator="\n",
+ )
if "parquet" in output_formats:
df_trips.to_parquet("%s/%strips.parquet" % (output_path, output_prefix))
@@ -186,30 +311,48 @@ def execute(context):
df_vehicle_types, df_vehicles = context.stage("synthesis.vehicles.vehicles")
if "csv" in output_formats:
- df_vehicle_types.to_csv("%s/%svehicle_types.csv" % (output_path, output_prefix), sep = ";", index = None, lineterminator = "\n")
- df_vehicles.to_csv("%s/%svehicles.csv" % (output_path, output_prefix), sep = ";", index = None, lineterminator = "\n")
+ df_vehicle_types.to_csv(
+ "%s/%svehicle_types.csv" % (output_path, output_prefix),
+ sep=";",
+ index=None,
+ lineterminator="\n",
+ )
+ df_vehicles.to_csv(
+ "%s/%svehicles.csv" % (output_path, output_prefix),
+ sep=";",
+ index=None,
+ lineterminator="\n",
+ )
if "parquet" in output_formats:
- df_vehicle_types.to_parquet("%s/%svehicle_types.parquet" % (output_path, output_prefix))
+ df_vehicle_types.to_parquet(
+ "%s/%svehicle_types.parquet" % (output_path, output_prefix)
+ )
df_vehicles.to_parquet("%s/%svehicles.parquet" % (output_path, output_prefix))
-
if "gpkg" in output_formats:
path = "%s/%sactivities.gpkg" % (output_path, output_prefix)
- df_spatial.to_file(path, driver = "GPKG")
+ df_spatial.to_file(path, driver="GPKG")
clean_gpkg(path)
if "geoparquet" in output_formats:
path = "%s/%sactivities.geoparquet" % (output_path, output_prefix)
df_spatial.to_parquet(path)
# Write spatial homes
- df_spatial_homes = df_spatial[
- df_spatial["purpose"] == "home"
- ].drop_duplicates("household_id")[[
- "household_id","iris_id", "commune_id","departement_id","region_id", "geometry"
- ]]
+ df_spatial_homes = df_spatial[df_spatial["purpose"] == "home"].drop_duplicates(
+ "household_id"
+ )[
+ [
+ "household_id",
+ "iris_id",
+ "commune_id",
+ "departement_id",
+ "region_id",
+ "geometry",
+ ]
+ ]
if "gpkg" in output_formats:
path = "%s/%shomes.gpkg" % (output_path, output_prefix)
- df_spatial_homes.to_file(path, driver = "GPKG")
+ df_spatial_homes.to_file(path, driver="GPKG")
clean_gpkg(path)
if "geoparquet" in output_formats:
path = "%s/%shomes.geoparquet" % (output_path, output_prefix)
@@ -217,8 +360,12 @@ def execute(context):
# Write spatial commutes
df_spatial = pd.merge(
- df_spatial[df_spatial["purpose"] == "home"].drop_duplicates("person_id")[["person_id", "geometry"]].rename(columns = { "geometry": "home_geometry" }),
- df_spatial[df_spatial["purpose"] == "work"].drop_duplicates("person_id")[["person_id", "geometry"]].rename(columns = { "geometry": "work_geometry" })
+ df_spatial[df_spatial["purpose"] == "home"]
+ .drop_duplicates("person_id")[["person_id", "geometry"]]
+ .rename(columns={"geometry": "home_geometry"}),
+ df_spatial[df_spatial["purpose"] == "work"]
+ .drop_duplicates("person_id")[["person_id", "geometry"]]
+ .rename(columns={"geometry": "work_geometry"}),
)
df_spatial["geometry"] = [
@@ -226,38 +373,50 @@ def execute(context):
for od in zip(df_spatial["home_geometry"], df_spatial["work_geometry"])
]
- df_spatial = df_spatial.drop(columns = ["home_geometry", "work_geometry"])
+ df_spatial = df_spatial.drop(columns=["home_geometry", "work_geometry"])
if "gpkg" in output_formats:
path = "%s/%scommutes.gpkg" % (output_path, output_prefix)
- df_spatial.to_file(path, driver = "GPKG")
+ df_spatial.to_file(path, driver="GPKG")
clean_gpkg(path)
if "geoparquet" in output_formats:
path = "%s/%scommutes.geoparquet" % (output_path, output_prefix)
df_spatial.to_parquet(path)
# Write spatial trips
- df_spatial = pd.merge(df_trips, df_locations[[
- "person_id", "activity_index", "geometry"
- ]].rename(columns = {
- "activity_index": "preceding_activity_index",
- "geometry": "preceding_geometry"
- }), how = "left", on = ["person_id", "preceding_activity_index"])
-
- df_spatial = pd.merge(df_spatial, df_locations[[
- "person_id", "activity_index", "geometry"
- ]].rename(columns = {
- "activity_index": "following_activity_index",
- "geometry": "following_geometry"
- }), how = "left", on = ["person_id", "following_activity_index"])
+ df_spatial = pd.merge(
+ df_trips,
+ df_locations[["person_id", "activity_index", "geometry"]].rename(
+ columns={
+ "activity_index": "preceding_activity_index",
+ "geometry": "preceding_geometry",
+ }
+ ),
+ how="left",
+ on=["person_id", "preceding_activity_index"],
+ )
+
+ df_spatial = pd.merge(
+ df_spatial,
+ df_locations[["person_id", "activity_index", "geometry"]].rename(
+ columns={
+ "activity_index": "following_activity_index",
+ "geometry": "following_geometry",
+ }
+ ),
+ how="left",
+ on=["person_id", "following_activity_index"],
+ )
df_spatial["geometry"] = [
geo.LineString(od)
- for od in zip(df_spatial["preceding_geometry"], df_spatial["following_geometry"])
+ for od in zip(
+ df_spatial["preceding_geometry"], df_spatial["following_geometry"]
+ )
]
- df_spatial = df_spatial.drop(columns = ["preceding_geometry", "following_geometry"])
+ df_spatial = df_spatial.drop(columns=["preceding_geometry", "following_geometry"])
- df_spatial = gpd.GeoDataFrame(df_spatial, crs = df_locations.crs)
+ df_spatial = gpd.GeoDataFrame(df_spatial, crs=df_locations.crs)
df_spatial["following_purpose"] = df_spatial["following_purpose"].astype(str)
df_spatial["preceding_purpose"] = df_spatial["preceding_purpose"].astype(str)
@@ -266,7 +425,7 @@ def execute(context):
if "gpkg" in output_formats:
path = "%s/%strips.gpkg" % (output_path, output_prefix)
- df_spatial.to_file(path, driver = "GPKG")
+ df_spatial.to_file(path, driver="GPKG")
clean_gpkg(path)
if "geoparquet" in output_formats:
path = "%s/%strips.geoparquet" % (output_path, output_prefix)
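
# A minimal sketch of how the commute and trip geometries above are built:
# each record becomes a LineString from an origin point to a destination
# point. The toy coordinates and person identifiers are illustrative assumptions.
import geopandas as gpd
import shapely.geometry as geo
from shapely.geometry import Point

origins = [Point(0, 0), Point(1, 1)]
destinations = [Point(2, 2), Point(3, 0)]

# zip() pairs each origin with its destination; LineString accepts the pair
lines = [geo.LineString(od) for od in zip(origins, destinations)]

df_spatial = gpd.GeoDataFrame({"person_id": [1, 2]}, geometry=lines, crs="EPSG:2154")
print(df_spatial)
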
diff --git a/synthesis/population/activities.py b/synthesis/population/activities.py
index 27d3367a..e0c9590f 100644
--- a/synthesis/population/activities.py
+++ b/synthesis/population/activities.py
@@ -7,15 +7,22 @@
Transforms the synthetic trip table into a synthetic activity table.
"""
+
def configure(context):
context.stage("synthesis.population.enriched")
context.stage("synthesis.population.trips")
+
def execute(context):
df_activities = context.stage("synthesis.population.trips")
# Add trip count
- counts = df_activities.groupby("person_id").size().reset_index(name = "trip_count")["trip_count"].values
+ counts = (
+ df_activities.groupby("person_id")
+ .size()
+ .reset_index(name="trip_count")["trip_count"]
+ .values
+ )
df_activities["trip_count"] = np.hstack([[count] * count for count in counts])
# Shift times and types of trips to arrive at activities
@@ -43,14 +50,40 @@ def execute(context):
df_last["activity_index"] = df_last["trip_count"]
df_last["trip_index"] = -1
- df_activities = pd.concat([
- df_activities[["person_id", "activity_index", "trip_index", "purpose", "start_time", "end_time", "is_first", "is_last"]],
- df_last[["person_id", "activity_index", "trip_index", "purpose", "start_time", "end_time", "is_first", "is_last"]]
- ]).sort_values(by = ["person_id", "activity_index"])
+ df_activities = pd.concat(
+ [
+ df_activities[
+ [
+ "person_id",
+ "activity_index",
+ "trip_index",
+ "purpose",
+ "start_time",
+ "end_time",
+ "is_first",
+ "is_last",
+ ]
+ ],
+ df_last[
+ [
+ "person_id",
+ "activity_index",
+ "trip_index",
+ "purpose",
+ "start_time",
+ "end_time",
+ "is_first",
+ "is_last",
+ ]
+ ],
+ ]
+ ).sort_values(by=["person_id", "activity_index"])
# Add activities for people without trips
df_missing = context.stage("synthesis.population.enriched")
- df_missing = df_missing[~df_missing["person_id"].isin(df_activities["person_id"])][["person_id"]]
+ df_missing = df_missing[~df_missing["person_id"].isin(df_activities["person_id"])][
+ ["person_id"]
+ ]
df_missing["activity_index"] = 0
df_missing["trip_index"] = -1
diff --git a/synthesis/population/enriched.py b/synthesis/population/enriched.py
index 15fc5649..22d83427 100644
--- a/synthesis/population/enriched.py
+++ b/synthesis/population/enriched.py
@@ -13,27 +13,38 @@
This stage fuses census data with HTS data.
"""
+
def configure(context):
context.stage("synthesis.population.matched")
context.stage("synthesis.population.sampled")
context.stage("synthesis.population.income.selected")
hts = context.config("hts")
- context.stage("data.hts.selected", alias = "hts")
+ context.stage("data.hts.selected", alias="hts")
+
def execute(context):
# Select population columns
- df_population = context.stage("synthesis.population.sampled")[[
- "person_id", "household_id",
- "census_person_id", "census_household_id",
- "age", "sex", "employed", "studies",
- "number_of_vehicles", "household_size", "consumption_units",
- "socioprofessional_class"
- ]]
+ df_population = context.stage("synthesis.population.sampled")[
+ [
+ "person_id",
+ "household_id",
+ "census_person_id",
+ "census_household_id",
+ "age",
+ "sex",
+ "employed",
+ "studies",
+ "number_of_vehicles",
+ "household_size",
+ "consumption_units",
+ "socioprofessional_class",
+ ]
+ ]
# Attach matching information
df_matching = context.stage("synthesis.population.matched")
- df_population = pd.merge(df_population, df_matching, on = "person_id")
+ df_population = pd.merge(df_population, df_matching, on="person_id")
initial_size = len(df_population)
initial_person_ids = len(df_population["person_id"].unique())
@@ -41,22 +52,40 @@ def execute(context):
# Attach person and household attributes from HTS
df_hts_households, df_hts_persons, _ = context.stage("hts")
- df_hts_persons = df_hts_persons.rename(columns = { "person_id": "hts_id", "household_id": "hts_household_id" })
- df_hts_households = df_hts_households.rename(columns = { "household_id": "hts_household_id" })
-
- df_population = pd.merge(df_population, df_hts_persons[[
- "hts_id", "hts_household_id", "has_license", "has_pt_subscription", "is_passenger"
- ]], on = "hts_id")
-
- df_population = pd.merge(df_population, df_hts_households[[
- "hts_household_id", "number_of_bikes"
- ]], on = "hts_household_id")
+ df_hts_persons = df_hts_persons.rename(
+ columns={"person_id": "hts_id", "household_id": "hts_household_id"}
+ )
+ df_hts_households = df_hts_households.rename(
+ columns={"household_id": "hts_household_id"}
+ )
+
+ df_population = pd.merge(
+ df_population,
+ df_hts_persons[
+ [
+ "hts_id",
+ "hts_household_id",
+ "has_license",
+ "has_pt_subscription",
+ "is_passenger",
+ ]
+ ],
+ on="hts_id",
+ )
+
+ df_population = pd.merge(
+ df_population,
+ df_hts_households[["hts_household_id", "number_of_bikes"]],
+ on="hts_household_id",
+ )
# Attach income
df_income = context.stage("synthesis.population.income.selected")
- df_population = pd.merge(df_population, df_income[[
- "household_id", "household_income"
- ]], on = "household_id")
+ df_population = pd.merge(
+ df_population,
+ df_income[["household_id", "household_income"]],
+ on="household_id",
+ )
# Check consistency
final_size = len(df_population)
@@ -68,28 +97,55 @@ def execute(context):
assert initial_household_ids == final_household_ids
# Add car availability
- df_number_of_cars = df_population[["household_id", "number_of_vehicles"]].drop_duplicates("household_id")
- df_number_of_licenses = df_population[["household_id", "has_license"]].groupby("household_id").sum().reset_index().rename(columns = { "has_license": "number_of_licenses" })
+ df_number_of_cars = df_population[
+ ["household_id", "number_of_vehicles"]
+ ].drop_duplicates("household_id")
+ df_number_of_licenses = (
+ df_population[["household_id", "has_license"]]
+ .groupby("household_id")
+ .sum()
+ .reset_index()
+ .rename(columns={"has_license": "number_of_licenses"})
+ )
df_car_availability = pd.merge(df_number_of_cars, df_number_of_licenses)
df_car_availability["car_availability"] = "all"
- df_car_availability.loc[df_car_availability["number_of_vehicles"] < df_car_availability["number_of_licenses"], "car_availability"] = "some"
- df_car_availability.loc[df_car_availability["number_of_vehicles"] == 0, "car_availability"] = "none"
- df_car_availability["car_availability"] = df_car_availability["car_availability"].astype("category")
-
- df_population = pd.merge(df_population, df_car_availability[["household_id", "car_availability"]])
+ df_car_availability.loc[
+ df_car_availability["number_of_vehicles"]
+ < df_car_availability["number_of_licenses"],
+ "car_availability",
+ ] = "some"
+ df_car_availability.loc[
+ df_car_availability["number_of_vehicles"] == 0, "car_availability"
+ ] = "none"
+ df_car_availability["car_availability"] = df_car_availability[
+ "car_availability"
+ ].astype("category")
+
+ df_population = pd.merge(
+ df_population, df_car_availability[["household_id", "car_availability"]]
+ )
# Add bike availability
df_population["bike_availability"] = "all"
- df_population.loc[df_population["number_of_bikes"] < df_population["household_size"], "bike_availability"] = "some"
- df_population.loc[df_population["number_of_bikes"] == 0, "bike_availability"] = "none"
- df_population["bike_availability"] = df_population["bike_availability"].astype("category")
-
+ df_population.loc[
+ df_population["number_of_bikes"] < df_population["household_size"],
+ "bike_availability",
+ ] = "some"
+ df_population.loc[df_population["number_of_bikes"] == 0, "bike_availability"] = (
+ "none"
+ )
+ df_population["bike_availability"] = df_population["bike_availability"].astype(
+ "category"
+ )
+
# Add age range for education
df_population["age_range"] = "higher_education"
- df_population.loc[df_population["age"]<=10,"age_range"] = "primary_school"
- df_population.loc[df_population["age"].between(11,14),"age_range"] = "middle_school"
- df_population.loc[df_population["age"].between(15,17),"age_range"] = "high_school"
+ df_population.loc[df_population["age"] <= 10, "age_range"] = "primary_school"
+ df_population.loc[df_population["age"].between(11, 14), "age_range"] = (
+ "middle_school"
+ )
+ df_population.loc[df_population["age"].between(15, 17), "age_range"] = "high_school"
df_population["age_range"] = df_population["age_range"].astype("category")
-
+
return df_population
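
# A minimal sketch of the car availability rule applied in
# synthesis/population/enriched.py above: "none" if the household owns no car,
# "some" if it owns fewer cars than it has licensed drivers, "all" otherwise.
# The toy households below are illustrative assumptions.
import pandas as pd

df = pd.DataFrame({
    "household_id": [1, 2, 3],
    "number_of_vehicles": [0, 1, 2],
    "number_of_licenses": [1, 2, 2],
})

df["car_availability"] = "all"
df.loc[df["number_of_vehicles"] < df["number_of_licenses"], "car_availability"] = "some"
df.loc[df["number_of_vehicles"] == 0, "car_availability"] = "none"
print(df)  # -> none, some, all
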
diff --git a/synthesis/population/income/bhepop2.py b/synthesis/population/income/bhepop2.py
index 6aa6b7fb..17f3ae28 100644
--- a/synthesis/population/income/bhepop2.py
+++ b/synthesis/population/income/bhepop2.py
@@ -1,6 +1,9 @@
import numpy as np
import pandas as pd
-from synthesis.population.income.utils import income_uniform_sample, MAXIMUM_INCOME_FACTOR
+from synthesis.population.income.utils import (
+ income_uniform_sample,
+ MAXIMUM_INCOME_FACTOR,
+)
from bhepop2.tools import add_household_size_attribute, add_household_type_attribute
from bhepop2.sources.marginal_distributions import QuantitativeMarginalDistributions
from bhepop2.enrichment.bhepop2 import Bhepop2Enrichment
@@ -55,15 +58,17 @@ def _sample_income(context, args):
"Filosofi",
attribute_selection=[
"size", # modalities: ["1_pers", "2_pers", "3_pers", "4_pers", "5_pers_or_more"]
- "family_comp" # modalities: ["Single_man", "Single_wom", "Couple_without_child", "Couple_with_child", "Single_parent", "complex_hh"]
+ "family_comp", # modalities: ["Single_man", "Single_wom", "Couple_without_child", "Couple_with_child", "Single_parent", "complex_hh"]
],
abs_minimum=0,
relative_maximum=MAXIMUM_INCOME_FACTOR,
- delta_min=1000
+ delta_min=1000,
)
# create enrichment class
- enrich_class = Bhepop2Enrichment(df_selected, source, feature_name=INCOME_COLUMN, seed=random_seed)
+ enrich_class = Bhepop2Enrichment(
+ df_selected, source, feature_name=INCOME_COLUMN, seed=random_seed
+ )
# evaluate feature values on the population
pop = enrich_class.assign_feature_values()
@@ -84,7 +89,12 @@ def _sample_income(context, args):
# get global distribution of the commune
distrib_all = distribs[distribs["modality"] == "all"]
assert len(distrib_all) == 1
- centiles = list(distrib_all[["D1", "D2", "D3", "D4", "D5", "D6", "D7", "D8", "D9"]].iloc[0].values / 12)
+ centiles = list(
+ distrib_all[["D1", "D2", "D3", "D4", "D5", "D6", "D7", "D8", "D9"]]
+ .iloc[0]
+ .values
+ / 12
+ )
incomes = income_uniform_sample(random, centiles, len(df_selected))
@@ -102,29 +112,39 @@ def execute(context):
df_population = add_household_size_attribute(df_population)
df_population = add_household_type_attribute(df_population)
- df_households = df_population[[
- "household_id", "consumption_units", "size", "family_comp"
- ]].drop_duplicates("household_id")
+ df_households = df_population[
+ ["household_id", "consumption_units", "size", "family_comp"]
+ ].drop_duplicates("household_id")
- df_homes = context.stage("synthesis.population.spatial.home.zones")[[
- "household_id", "commune_id"
- ]]
+ df_homes = context.stage("synthesis.population.spatial.home.zones")[
+ ["household_id", "commune_id"]
+ ]
df_households = pd.merge(df_households, df_homes)
commune_ids = df_households["commune_id"].unique()
- random_seeds = random.randint(10000, size = len(commune_ids))
+ random_seeds = random.randint(10000, size=len(commune_ids))
# Perform sampling per commune
- with context.progress(label = "Imputing income ...", total = len(commune_ids)) as progress:
- with context.parallel(dict(households = df_households, income = df_income)) as parallel:
-
- for f, incomes, method in parallel.imap(_sample_income, zip(commune_ids, random_seeds)):
- df_households.loc[f, "household_income"] = incomes * df_households.loc[f, "consumption_units"]
+ with context.progress(
+ label="Imputing income ...", total=len(commune_ids)
+ ) as progress:
+ with context.parallel(
+ dict(households=df_households, income=df_income)
+ ) as parallel:
+
+ for f, incomes, method in parallel.imap(
+ _sample_income, zip(commune_ids, random_seeds)
+ ):
+ df_households.loc[f, "household_income"] = (
+ incomes * df_households.loc[f, "consumption_units"]
+ )
df_households.loc[f, "method"] = method
# Cleanup
- df_households = df_households[["household_id", "household_income", "consumption_units"]]
+ df_households = df_households[
+ ["household_id", "household_income", "consumption_units"]
+ ]
assert len(df_households) == len(df_households["household_id"].unique())
return df_households
diff --git a/synthesis/population/income/selected.py b/synthesis/population/income/selected.py
index 24d9abc5..43395d57 100644
--- a/synthesis/population/income/selected.py
+++ b/synthesis/population/income/selected.py
@@ -1,14 +1,13 @@
-
def configure(context):
method = context.config("income_assignation_method", "uniform")
if method == "uniform":
- context.stage("synthesis.population.income.uniform", alias = "income")
+ context.stage("synthesis.population.income.uniform", alias="income")
elif method == "bhepop2":
- context.stage("synthesis.population.income.bhepop2", alias = "income")
+ context.stage("synthesis.population.income.bhepop2", alias="income")
else:
raise RuntimeError("Unknown income assignation method : %s" % method)
+
def execute(context):
return context.stage("income")
-
diff --git a/synthesis/population/income/uniform.py b/synthesis/population/income/uniform.py
index f3fdd758..918f2aaf 100644
--- a/synthesis/population/income/uniform.py
+++ b/synthesis/population/income/uniform.py
@@ -12,6 +12,7 @@
income distribution and a random income within the selected stratum is chosen.
"""
+
def configure(context):
context.stage("data.income.municipality")
context.stage("synthesis.population.sampled")
@@ -29,38 +30,56 @@ def _sample_income(context, args):
f = df_households["commune_id"] == commune_id
df_selected = df_households[f]
- centiles = list(df_income[df_income["commune_id"] == commune_id][["q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9"]].iloc[0].values / 12)
+ centiles = list(
+ df_income[df_income["commune_id"] == commune_id][
+ ["q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9"]
+ ]
+ .iloc[0]
+ .values
+ / 12
+ )
incomes = income_uniform_sample(random, centiles, len(df_selected))
return f, incomes
+
def execute(context):
random = np.random.RandomState(context.config("random_seed"))
# Load data
df_income = context.stage("data.income.municipality")
- df_income = df_income[(df_income["attribute"] == "all") & (df_income["value"] == "all")]
+ df_income = df_income[
+ (df_income["attribute"] == "all") & (df_income["value"] == "all")
+ ]
- df_households = context.stage("synthesis.population.sampled")[[
- "household_id", "consumption_units"
- ]].drop_duplicates("household_id")
+ df_households = context.stage("synthesis.population.sampled")[
+ ["household_id", "consumption_units"]
+ ].drop_duplicates("household_id")
- df_homes = context.stage("synthesis.population.spatial.home.zones")[[
- "household_id", "commune_id"
- ]]
+ df_homes = context.stage("synthesis.population.spatial.home.zones")[
+ ["household_id", "commune_id"]
+ ]
df_households = pd.merge(df_households, df_homes)
# Perform sampling per commune
- with context.parallel(dict(households = df_households, income = df_income)) as parallel:
+ with context.parallel(dict(households=df_households, income=df_income)) as parallel:
commune_ids = df_households["commune_id"].unique()
- random_seeds = random.randint(10000, size = len(commune_ids))
+ random_seeds = random.randint(10000, size=len(commune_ids))
- for f, incomes in context.progress(parallel.imap(_sample_income, zip(commune_ids, random_seeds)), label = "Imputing income ...", total = len(commune_ids)):
- df_households.loc[f, "household_income"] = incomes * df_households.loc[f, "consumption_units"]
+ for f, incomes in context.progress(
+ parallel.imap(_sample_income, zip(commune_ids, random_seeds)),
+ label="Imputing income ...",
+ total=len(commune_ids),
+ ):
+ df_households.loc[f, "household_income"] = (
+ incomes * df_households.loc[f, "consumption_units"]
+ )
# Cleanup
- df_households = df_households[["household_id", "household_income", "consumption_units"]]
+ df_households = df_households[
+ ["household_id", "household_income", "consumption_units"]
+ ]
assert len(df_households) == len(df_households["household_id"].unique())
return df_households
diff --git a/synthesis/population/income/utils.py b/synthesis/population/income/utils.py
index b937417b..22a0ea98 100644
--- a/synthesis/population/income/utils.py
+++ b/synthesis/population/income/utils.py
@@ -23,6 +23,8 @@ def income_uniform_sample(random_state, deciles, size):
indices = random_state.randint(10, size=size)
lower_bounds, upper_bounds = deciles[indices], deciles[indices + 1]
- incomes = lower_bounds + random_state.random_sample(size=size) * (upper_bounds - lower_bounds)
+ incomes = lower_bounds + random_state.random_sample(size=size) * (
+ upper_bounds - lower_bounds
+ )
return incomes
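
# A minimal sketch of the uniform-within-deciles sampling performed by
# income_uniform_sample above: pick one of ten income brackets per draw, then
# sample uniformly between that bracket's bounds. The 11-element `deciles`
# vector (lower bound, D1..D9, upper bound) and the toy values are assumptions.
import numpy as np

def sample_within_deciles(random_state, deciles, size):
    deciles = np.asarray(deciles, dtype=float)
    indices = random_state.randint(10, size=size)           # bracket index per draw
    lower, upper = deciles[indices], deciles[indices + 1]   # bracket bounds
    return lower + random_state.random_sample(size=size) * (upper - lower)

random_state = np.random.RandomState(0)
print(sample_within_deciles(random_state, np.linspace(500, 5500, 11), 5))
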
diff --git a/synthesis/population/matched.py b/synthesis/population/matched.py
index 5ab5bed0..09022a6a 100644
--- a/synthesis/population/matched.py
+++ b/synthesis/population/matched.py
@@ -20,10 +20,14 @@
}
DEFAULT_MATCHING_ATTRIBUTES = [
- "sex", "any_cars", "age_class", "socioprofessional_class",
- "departement_id"
+ "sex",
+ "any_cars",
+ "age_class",
+ "socioprofessional_class",
+ "departement_id",
]
+
def configure(context):
context.config("processes")
context.config("random_seed")
@@ -34,9 +38,10 @@ def configure(context):
context.stage("synthesis.population.income.selected")
hts = context.config("hts")
- context.stage("data.hts.selected", alias = "hts")
+ context.stage("data.hts.selected", alias="hts")
+
-@numba.jit(nopython = True) # Already parallelized parallel = True)
+@numba.jit(nopython=True) # Already parallelized parallel = True)
def sample_indices(uniform, cdf, selected_indices):
indices = np.arange(len(uniform))
@@ -45,7 +50,18 @@ def sample_indices(uniform, cdf, selected_indices):
return selected_indices[indices]
-def statistical_matching(progress, df_source, source_identifier, weight, df_target, target_identifier, columns, random_seed = 0, minimum_observations = 0):
+
+def statistical_matching(
+ progress,
+ df_source,
+ source_identifier,
+ weight,
+ df_target,
+ target_identifier,
+ columns,
+ random_seed=0,
+ minimum_observations=0,
+):
random = np.random.RandomState(random_seed)
# Reduce data frames
@@ -53,21 +69,27 @@ def statistical_matching(progress, df_source, source_identifier, weight, df_targ
df_target = df_target[[target_identifier] + columns].copy()
# Sort data frames
- df_source = df_source.sort_values(by = columns)
- df_target = df_target.sort_values(by = columns)
+ df_source = df_source.sort_values(by=columns)
+ df_target = df_target.sort_values(by=columns)
# Find unique values for all columns
unique_values = {}
for column in columns:
- unique_values[column] = list(sorted(set(df_source[column].unique()) | set(df_target[column].unique())))
+ unique_values[column] = list(
+ sorted(set(df_source[column].unique()) | set(df_target[column].unique()))
+ )
# Generate filters for all columns and values
source_filters, target_filters = {}, {}
for column, column_unique_values in unique_values.items():
- source_filters[column] = [df_source[column].values == value for value in column_unique_values]
- target_filters[column] = [df_target[column].values == value for value in column_unique_values]
+ source_filters[column] = [
+ df_source[column].values == value for value in column_unique_values
+ ]
+ target_filters[column] = [
+ df_target[column].values == value for value in column_unique_values
+ ]
# Define search order
source_filters = [source_filters[column] for column in columns]
@@ -75,10 +97,10 @@ def statistical_matching(progress, df_source, source_identifier, weight, df_targ
# Perform matching
weights = df_source[weight].values
- assigned_indices = np.ones((len(df_target),), dtype = int) * -1
- unassigned_mask = np.ones((len(df_target),), dtype = bool)
- assigned_levels = np.ones((len(df_target),), dtype = int) * -1
- uniform = random.random_sample(size = (len(df_target),))
+ assigned_indices = np.ones((len(df_target),), dtype=int) * -1
+ unassigned_mask = np.ones((len(df_target),), dtype=bool)
+ assigned_levels = np.ones((len(df_target),), dtype=int) * -1
+ uniform = random.random_sample(size=(len(df_target),))
column_indices = [np.arange(len(unique_values[column])) for column in columns]
@@ -87,8 +109,13 @@ def statistical_matching(progress, df_source, source_identifier, weight, df_targ
if np.count_nonzero(unassigned_mask) > 0:
for column_index in itertools.product(*level_column_indices):
- f_source = np.logical_and.reduce([source_filters[i][k] for i, k in enumerate(column_index)])
- f_target = np.logical_and.reduce([target_filters[i][k] for i, k in enumerate(column_index)] + [unassigned_mask])
+ f_source = np.logical_and.reduce(
+ [source_filters[i][k] for i, k in enumerate(column_index)]
+ )
+ f_target = np.logical_and.reduce(
+ [target_filters[i][k] for i, k in enumerate(column_index)]
+ + [unassigned_mask]
+ )
selected_indices = np.nonzero(f_source)[0]
requested_samples = np.count_nonzero(f_target)
@@ -103,7 +130,9 @@ def statistical_matching(progress, df_source, source_identifier, weight, df_targ
cdf = np.cumsum(selected_weights)
cdf /= cdf[-1]
- assigned_indices[f_target] = sample_indices(uniform[f_target], cdf, selected_indices)
+ assigned_indices[f_target] = sample_indices(
+ uniform[f_target], cdf, selected_indices
+ )
assigned_levels[f_target] = level
unassigned_mask[f_target] = False
@@ -113,13 +142,17 @@ def statistical_matching(progress, df_source, source_identifier, weight, df_targ
cdf = np.cumsum(weights)
cdf /= cdf[-1]
- assigned_indices[unassigned_mask] = sample_indices(uniform[unassigned_mask], cdf, np.arange(len(weights)))
+ assigned_indices[unassigned_mask] = sample_indices(
+ uniform[unassigned_mask], cdf, np.arange(len(weights))
+ )
assigned_levels[unassigned_mask] = 0
progress.update(np.count_nonzero(unassigned_mask))
if np.count_nonzero(unassigned_mask) > 0:
- raise RuntimeError("Some target observations could not be matched. Minimum observations configured too high?")
+ raise RuntimeError(
+ "Some target observations could not be matched. Minimum observations configured too high?"
+ )
assert np.count_nonzero(unassigned_mask) == 0
assert np.count_nonzero(assigned_indices == -1) == 0
@@ -130,6 +163,7 @@ def statistical_matching(progress, df_source, source_identifier, weight, df_targ
return df_target, assigned_levels
+
def _run_parallel_statistical_matching(context, args):
# Pass arguments
df_target, random_seed = args
@@ -142,28 +176,56 @@ def _run_parallel_statistical_matching(context, args):
columns = context.data("columns")
minimum_observations = context.data("minimum_observations")
- return statistical_matching(context.progress, df_source, source_identifier, weight, df_target, target_identifier, columns, random_seed, minimum_observations)
-
-def parallel_statistical_matching(context, df_source, source_identifier, weight, df_target, target_identifier, columns, minimum_observations = 0):
+ return statistical_matching(
+ context.progress,
+ df_source,
+ source_identifier,
+ weight,
+ df_target,
+ target_identifier,
+ columns,
+ random_seed,
+ minimum_observations,
+ )
+
+
+def parallel_statistical_matching(
+ context,
+ df_source,
+ source_identifier,
+ weight,
+ df_target,
+ target_identifier,
+ columns,
+ minimum_observations=0,
+):
random_seed = context.config("random_seed")
processes = context.config("processes")
random = np.random.RandomState(random_seed)
chunks = np.array_split(df_target, processes)
- with context.progress(label = "Statistical matching ...", total = len(df_target)):
- with context.parallel({
- "df_source": df_source, "source_identifier": source_identifier, "weight": weight,
- "target_identifier": target_identifier, "columns": columns,
- "minimum_observations": minimum_observations
- }) as parallel:
- random_seeds = random.randint(10000, size = len(chunks))
- results = parallel.map(_run_parallel_statistical_matching, zip(chunks, random_seeds))
+ with context.progress(label="Statistical matching ...", total=len(df_target)):
+ with context.parallel(
+ {
+ "df_source": df_source,
+ "source_identifier": source_identifier,
+ "weight": weight,
+ "target_identifier": target_identifier,
+ "columns": columns,
+ "minimum_observations": minimum_observations,
+ }
+ ) as parallel:
+ random_seeds = random.randint(10000, size=len(chunks))
+ results = parallel.map(
+ _run_parallel_statistical_matching, zip(chunks, random_seeds)
+ )
+
+ levels = np.hstack([r[1] for r in results])
+ df_target = pd.concat([r[0] for r in results])
- levels = np.hstack([r[1] for r in results])
- df_target = pd.concat([r[0] for r in results])
+ return df_target, levels
- return df_target, levels
def execute(context):
hts = context.config("hts")
@@ -178,18 +240,25 @@ def execute(context):
try:
default_index = columns.index("*default*")
- columns[default_index:default_index + 1] = DEFAULT_MATCHING_ATTRIBUTES
- except ValueError: pass
+ columns[default_index : default_index + 1] = DEFAULT_MATCHING_ATTRIBUTES
+ except ValueError:
+ pass
# Define matching attributes
AGE_BOUNDARIES = [14, 29, 44, 59, 74, 1000]
if "age_class" in columns:
- df_target["age_class"] = np.digitize(df_target["age"], AGE_BOUNDARIES, right = True)
- df_source["age_class"] = np.digitize(df_source["age"], AGE_BOUNDARIES, right = True)
+ df_target["age_class"] = np.digitize(
+ df_target["age"], AGE_BOUNDARIES, right=True
+ )
+ df_source["age_class"] = np.digitize(
+ df_source["age"], AGE_BOUNDARIES, right=True
+ )
if "income_class" in columns:
- df_income = context.stage("synthesis.population.income.selected")[["household_id", "household_income"]]
+ df_income = context.stage("synthesis.population.income.selected")[
+ ["household_id", "household_income"]
+ ]
df_target = pd.merge(df_target, df_income)
df_target["income_class"] = INCOME_CLASS[hts](df_target)
@@ -199,30 +268,47 @@ def execute(context):
df_source["any_cars"] = df_source["number_of_vehicles"] > 0
# Perform statistical matching
- df_source = df_source.rename(columns = { "person_id": "hts_id" })
+ df_source = df_source.rename(columns={"person_id": "hts_id"})
for column in columns:
if not column in df_source:
- raise RuntimeError("Attribute not available in source (HTS) for matching: {}".format(column))
+ raise RuntimeError(
+ "Attribute not available in source (HTS) for matching: {}".format(
+ column
+ )
+ )
if not column in df_target:
- raise RuntimeError("Attribute not available in target (census) for matching: {}".format(column))
+ raise RuntimeError(
+ "Attribute not available in target (census) for matching: {}".format(
+ column
+ )
+ )
df_assignment, levels = parallel_statistical_matching(
context,
- df_source, "hts_id", "person_weight",
- df_target, "person_id",
+ df_source,
+ "hts_id",
+ "person_weight",
+ df_target,
+ "person_id",
columns,
- minimum_observations = context.config("matching_minimum_observations"))
+ minimum_observations=context.config("matching_minimum_observations"),
+ )
- df_target = pd.merge(df_target, df_assignment, on = "person_id")
+ df_target = pd.merge(df_target, df_assignment, on="person_id")
assert len(df_target) == len(df_assignment)
- context.set_info("matched_counts", {
- count: np.count_nonzero(levels >= count) for count in range(len(columns) + 1)
- })
+ context.set_info(
+ "matched_counts",
+ {count: np.count_nonzero(levels >= count) for count in range(len(columns) + 1)},
+ )
for count in range(len(columns) + 1):
- print("%d matched levels:" % count, np.count_nonzero(levels >= count), "%.2f%%" % (100 * np.count_nonzero(levels >= count) / len(df_target),))
+ print(
+ "%d matched levels:" % count,
+ np.count_nonzero(levels >= count),
+ "%.2f%%" % (100 * np.count_nonzero(levels >= count) / len(df_target),),
+ )
return df_target[["person_id", "hts_id"]]
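# Illustrative sketch of the weighted draw used in statistical_matching() above:
# a donor is selected with probability proportional to its weight by inverting the
# normalized cumulative weight function. sample_indices() is assumed to perform an
# equivalent inverse-CDF lookup; the standalone helper below mirrors the
# np.count_nonzero(cdf < u) pattern used elsewhere in this code base.
import numpy as np

def draw_weighted_donors(weights, uniform_draws):
    # Normalized cumulative weights form a CDF over the donor indices.
    cdf = np.cumsum(np.asarray(weights, dtype=float))
    cdf /= cdf[-1]
    # np.searchsorted(cdf, u) equals np.count_nonzero(cdf < u) for each draw u.
    return np.searchsorted(cdf, uniform_draws)

# Example: the donor with weight 6.0 is drawn about three times as often as the
# donor with weight 2.0.
donor_indices = draw_weighted_donors([2.0, 1.0, 6.0, 1.0], np.random.random_sample(5))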
diff --git a/synthesis/population/projection/ipu.py b/synthesis/population/projection/ipu.py
index 580ce007..e19bc82a 100644
--- a/synthesis/population/projection/ipu.py
+++ b/synthesis/population/projection/ipu.py
@@ -5,10 +5,12 @@
This stage reweights the census data set according to the projection data for a different year.
"""
+
def configure(context):
context.stage("data.census.cleaned")
context.stage("data.census.projection")
+
def execute(context):
df_census = context.stage("data.census.cleaned")
projection = context.stage("data.census.projection")
@@ -17,7 +19,9 @@ def execute(context):
adjust_projection(projection)
# Prepare indexing
- df_households = df_census[["household_id", "household_size", "weight"]].drop_duplicates("household_id")
+ df_households = df_census[
+ ["household_id", "household_size", "weight"]
+ ].drop_duplicates("household_id")
df_households["household_index"] = np.arange(len(df_households))
df_census = pd.merge(df_census, df_households[["household_id", "household_index"]])
@@ -33,7 +37,11 @@ def execute(context):
# Processing age ...
df_marginal = projection["age"]
- for index, row in context.progress(df_marginal.iterrows(), label = "Processing attribute: age", total = len(df_marginal)):
+ for index, row in context.progress(
+ df_marginal.iterrows(),
+ label="Processing attribute: age",
+ total=len(df_marginal),
+ ):
f = df_census["age"] == row["age"]
assert np.count_nonzero(f) > 0
@@ -42,10 +50,14 @@ def execute(context):
attribute_membership.append(df_counts.index.values)
attribute_counts.append(df_counts.values)
attributes.append("age={}".format(row["age"]))
-
+
# Processing sex ...
df_marginal = projection["sex"]
- for index, row in context.progress(df_marginal.iterrows(), label = "Processing attribute: sex", total = len(df_marginal)):
+ for index, row in context.progress(
+ df_marginal.iterrows(),
+ label="Processing attribute: sex",
+ total=len(df_marginal),
+ ):
f = df_census["sex"] == row["sex"]
f &= (df_census["age"] > 0) & (df_census["age"] <= 104)
assert np.count_nonzero(f) > 0
@@ -58,7 +70,11 @@ def execute(context):
# Processing age x sex ...
df_marginal = projection["cross"]
- for index, row in context.progress(df_marginal.iterrows(), label = "Processing attributes: sex x age", total = len(df_marginal)):
+ for index, row in context.progress(
+ df_marginal.iterrows(),
+ label="Processing attributes: sex x age",
+ total=len(df_marginal),
+ ):
f = (df_census["sex"] == row["sex"]) & (df_census["age"] == row["age"])
assert np.count_nonzero(f) > 0
@@ -71,7 +87,7 @@ def execute(context):
# Processing total ...
f = (df_census["age"] > 0) & (df_census["age"] <= 104)
assert np.count_nonzero(f) > 0
-
+
df_counts = df_census.loc[f, "household_index"].value_counts()
attribute_targets.append(projection["total"]["projection"].values[0])
attribute_membership.append(df_counts.index.values)
@@ -86,37 +102,49 @@ def execute(context):
maximum_iterations = 100
for iteration in range(maximum_iterations):
- factors = []
+ factors = []
for k in np.arange(len(attributes)):
selection = attribute_membership[k]
-
+
target = attribute_targets[k]
- current = np.sum(update[selection] * household_weights[selection] * attribute_counts[k])
-
+ current = np.sum(
+ update[selection] * household_weights[selection] * attribute_counts[k]
+ )
+
factor = target / current
factors.append(factor)
-
+
update[selection] *= factor
- print("IPU it={} min={} max={}".format(iteration, np.min(factors), np.max(factors)))
+ print(
+ "IPU it={} min={} max={}".format(
+ iteration, np.min(factors), np.max(factors)
+ )
+ )
converged = np.abs(1 - np.max(factors)) < convergence_threshold
converged &= np.abs(1 - np.min(factors)) < convergence_threshold
- if converged: break
+ if converged:
+ break
# Check that the applied factors in the last iteration are sufficiently small
assert converged
- print("IPF updates min={} max={} mean={}".format(np.min(update), np.max(update), np.mean(update)))
+ print(
+ "IPF updates min={} max={} mean={}".format(
+ np.min(update), np.max(update), np.mean(update)
+ )
+ )
# Update the weights
df_households["weight"] *= update
-
+
return df_households[["household_id", "weight"]]
+
def adjust_projection(projection):
# The projection data contains information on zero-year old persons. However, there is a big difference between the
- # RP data and the projection, probably because RP is fixed to a certain reference date and not all of them are
+ # RP data and the projection, probably because RP is fixed to a certain reference date and not all of them are
# registered. We, in particular, see that there is a large jump between 0 years and 1 years.
# Therefore, we exclude the zero-year persons from the projection. This, however, means adapting all the marginals.
# Also, exclude everything that is 105+
@@ -131,18 +159,16 @@ def adjust_projection(projection):
if row["age"] == 0 or row["age"] == "105+":
f_sex = df_sex["sex"] == row["sex"]
- df_sex.loc[f_sex, "projection"] = df_sex.loc[f_sex, "projection"] - row["projection"]
+ df_sex.loc[f_sex, "projection"] = (
+ df_sex.loc[f_sex, "projection"] - row["projection"]
+ )
df_total["projection"] = df_total["projection"] - row["projection"]
-
+
projection["sex"] = df_sex
projection["total"] = df_total
# Remove zero old years from cross distribution
- projection["cross"] = df_cross[
- (df_cross["age"] != 0) & (df_cross["age"] != "105+")
- ]
+ projection["cross"] = df_cross[(df_cross["age"] != 0) & (df_cross["age"] != "105+")]
# Remove zero old years from age distribution
- projection["age"] = df_age[
- (df_age["age"] != 0) & (df_age["age"] != "105+")
- ]
+ projection["age"] = df_age[(df_age["age"] != 0) & (df_age["age"] != "105+")]
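# Illustrative toy example of the multiplicative IPU update performed by the loop
# in execute() above: each marginal rescales the weights of the households that
# contribute to it by target / current, and the iteration stops once all factors
# are close to one (here the base weight and the update factor are folded into a
# single weight vector). The numbers below are invented purely for illustration.
import numpy as np

weights = np.ones(3)                                      # three households
membership = [np.array([0, 1]), np.array([1, 2])]         # households per marginal
counts = [np.array([2.0, 1.0]), np.array([1.0, 3.0])]     # matching persons per household
targets = [10.0, 12.0]

for iteration in range(100):
    factors = []
    for k in range(len(targets)):
        selection = membership[k]
        current = np.sum(weights[selection] * counts[k])
        factor = targets[k] / current
        weights[selection] *= factor
        factors.append(factor)
    if np.abs(1 - np.max(factors)) < 1e-3 and np.abs(1 - np.min(factors)) < 1e-3:
        break

# After convergence the weighted counts reproduce both marginals (approximately):
# np.sum(weights[[0, 1]] * counts[0]) ~= 10 and np.sum(weights[[1, 2]] * counts[1]) ~= 12.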
diff --git a/synthesis/population/projection/reweighted.py b/synthesis/population/projection/reweighted.py
index 9863e6a3..5450c4e3 100644
--- a/synthesis/population/projection/reweighted.py
+++ b/synthesis/population/projection/reweighted.py
@@ -5,18 +5,20 @@
This stage reweights the census data set according to the projection data for a different year.
"""
+
def configure(context):
context.stage("data.census.filtered")
context.stage("synthesis.population.projection.ipu")
+
def execute(context):
df_census = context.stage("data.census.filtered")
df_weights = context.stage("synthesis.population.projection.ipu")
initial_size = len(df_census)
- df_census = df_census.drop(columns = "weight")
- df_census = pd.merge(df_census, df_weights, on = "household_id")
+ df_census = df_census.drop(columns="weight")
+ df_census = pd.merge(df_census, df_weights, on="household_id")
final_size = len(df_census)
assert initial_size == final_size
diff --git a/synthesis/population/sampled.py b/synthesis/population/sampled.py
index c4a33592..a2a7ae55 100644
--- a/synthesis/population/sampled.py
+++ b/synthesis/population/sampled.py
@@ -8,25 +8,31 @@
through the 'sampling_rate' configuration option.
"""
+
def configure(context):
if context.config("projection_year", None) is None:
- context.stage("data.census.filtered", alias = "source")
+ context.stage("data.census.filtered", alias="source")
else:
- context.stage("synthesis.population.projection.reweighted", alias = "source")
+ context.stage("synthesis.population.projection.reweighted", alias="source")
context.config("random_seed")
context.config("sampling_rate")
+
def execute(context):
- df_census = context.stage("source").sort_values(by = "household_id").copy()
+ df_census = context.stage("source").sort_values(by="household_id").copy()
sampling_rate = context.config("sampling_rate")
random = np.random.RandomState(context.config("random_seed"))
# Perform stochastic rounding for the population (and scale weights)
- df_rounding = df_census[["household_id", "weight", "household_size"]].drop_duplicates("household_id")
+ df_rounding = df_census[
+ ["household_id", "weight", "household_size"]
+ ].drop_duplicates("household_id")
df_rounding["multiplicator"] = np.floor(df_rounding["weight"])
- df_rounding["multiplicator"] += random.random_sample(len(df_rounding)) <= (df_rounding["weight"] - df_rounding["multiplicator"])
+ df_rounding["multiplicator"] += random.random_sample(len(df_rounding)) <= (
+ df_rounding["weight"] - df_rounding["multiplicator"]
+ )
df_rounding["multiplicator"] = df_rounding["multiplicator"].astype(int)
# Multiply households (use same multiplicator for all household members)
@@ -50,7 +56,9 @@ def execute(context):
household_sizes = np.repeat(household_sizes, household_multiplicators)
household_count = np.sum(household_multiplicators)
- df_census.loc[:, "household_id"] = np.repeat(np.arange(household_count), household_sizes)
+ df_census.loc[:, "household_id"] = np.repeat(
+ np.arange(household_count), household_sizes
+ )
# Select sample from 100% population
selector = random.random_sample(household_count) < sampling_rate
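# Sketch of the stochastic rounding used in this stage: a fractional household
# weight w becomes an integer replication count floor(w) + Bernoulli(w - floor(w)),
# so the expected number of copies equals the weight; the replicated households
# are then thinned with the configured sampling rate. The weights are invented.
import numpy as np

random = np.random.RandomState(0)
weights = np.array([2.3, 0.7, 1.0])

multiplicator = np.floor(weights)
multiplicator += random.random_sample(len(weights)) <= (weights - multiplicator)
multiplicator = multiplicator.astype(int)
# multiplicator == [2, 0, 1] for this seed; the expectation of each entry equals
# the corresponding weight.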
diff --git a/synthesis/population/spatial/commute_distance.py b/synthesis/population/spatial/commute_distance.py
index 49064d89..c9f0c2e6 100644
--- a/synthesis/population/spatial/commute_distance.py
+++ b/synthesis/population/spatial/commute_distance.py
@@ -1,28 +1,32 @@
import pandas as pd
+
def configure(context):
context.stage("synthesis.population.enriched")
context.stage("data.hts.commute_distance")
+
def execute(context):
df_matching = context.stage("synthesis.population.enriched")
df_commute_distance = context.stage("data.hts.commute_distance")
df_work = pd.merge(
df_matching[["person_id", "hts_id"]],
- df_commute_distance["work"][["person_id", "commute_distance"]].rename(columns = dict(person_id = "hts_id")),
- how = "left"
+ df_commute_distance["work"][["person_id", "commute_distance"]].rename(
+ columns=dict(person_id="hts_id")
+ ),
+ how="left",
)
df_education = pd.merge(
df_matching[["person_id", "hts_id"]],
- df_commute_distance["education"][["person_id", "commute_distance"]].rename(columns = dict(person_id = "hts_id")),
- how = "left"
+ df_commute_distance["education"][["person_id", "commute_distance"]].rename(
+ columns=dict(person_id="hts_id")
+ ),
+ how="left",
)
assert len(df_work) == len(df_matching)
assert len(df_education) == len(df_matching)
- return dict(
- work = df_work, education = df_education
- )
+ return dict(work=df_work, education=df_education)
diff --git a/synthesis/population/spatial/home/locations.py b/synthesis/population/spatial/home/locations.py
index 9347e5ec..0604941b 100644
--- a/synthesis/population/spatial/home/locations.py
+++ b/synthesis/population/spatial/home/locations.py
@@ -3,13 +3,15 @@
import pandas as pd
import geopandas as gpd
+
def configure(context):
context.stage("synthesis.population.spatial.home.zones")
context.stage("synthesis.locations.home.locations")
context.config("home_location_source", "addresses")
-
+
context.config("random_seed")
+
def _sample_locations(context, args):
# Extract data sets
df_locations = context.data("df_locations")
@@ -35,33 +37,39 @@ def _sample_locations(context, args):
cdf = np.cumsum(df_locations["weight"].values)
cdf /= cdf[-1]
- indices = np.array([np.count_nonzero(cdf < u)
- for u in random.random_sample(size = home_count)])
-
+ indices = np.array(
+ [np.count_nonzero(cdf < u) for u in random.random_sample(size=home_count)]
+ )
+
# Apply selection
df_homes["geometry"] = df_locations.iloc[indices]["geometry"].values
df_homes["home_location_id"] = df_locations.iloc[indices]["home_location_id"].values
-
+
# Update progress
context.progress.update()
- return gpd.GeoDataFrame(df_homes, crs = df_locations.crs)
+ return gpd.GeoDataFrame(df_homes, crs=df_locations.crs)
+
def execute(context):
random = np.random.RandomState(context.config("random_seed"))
df_homes = context.stage("synthesis.population.spatial.home.zones")
df_locations = context.stage("synthesis.locations.home.locations")
-
+
# Sample locations for home
unique_iris_ids = sorted(set(df_homes["iris_id"].unique()))
- with context.progress(label = "Sampling home locations ...", total = len(unique_iris_ids)):
- with context.parallel(dict(
- df_locations = df_locations, df_homes = df_homes
- )) as parallel:
- seeds = random.randint(10000, size = len(unique_iris_ids))
- df_homes = pd.concat(parallel.map(_sample_locations, zip(unique_iris_ids, seeds)))
+ with context.progress(
+ label="Sampling home locations ...", total=len(unique_iris_ids)
+ ):
+ with context.parallel(
+ dict(df_locations=df_locations, df_homes=df_homes)
+ ) as parallel:
+ seeds = random.randint(10000, size=len(unique_iris_ids))
+ df_homes = pd.concat(
+ parallel.map(_sample_locations, zip(unique_iris_ids, seeds))
+ )
out = ["household_id", "commune_id", "home_location_id", "geometry"]
-
+
return df_homes[out]
diff --git a/synthesis/population/spatial/home/zones.py b/synthesis/population/spatial/home/zones.py
index 2964fdc5..4ae6da82 100644
--- a/synthesis/population/spatial/home/zones.py
+++ b/synthesis/population/spatial/home/zones.py
@@ -12,6 +12,7 @@
has less than 200 inhabitants to the second case.
"""
+
def configure(context):
context.stage("synthesis.population.sampled")
@@ -21,66 +22,100 @@ def configure(context):
context.config("random_seed")
+
def execute(context):
random = np.random.RandomState(context.config("random_seed"))
- df_households = context.stage("synthesis.population.sampled").drop_duplicates("household_id")[[
- "household_id", "commune_id", "iris_id", "departement_id"
- ]].copy().set_index("household_id")
+ df_households = (
+ context.stage("synthesis.population.sampled")
+ .drop_duplicates("household_id")[
+ ["household_id", "commune_id", "iris_id", "departement_id"]
+ ]
+ .copy()
+ .set_index("household_id")
+ )
f_has_commune = df_households["commune_id"] != "undefined"
f_has_iris = df_households["iris_id"] != "undefined"
# Fix missing communes (we select from those without IRIS)
- df_municipalities = context.stage("data.spatial.municipalities").set_index("commune_id")
- df_municipalities["population"] = context.stage("data.spatial.population").groupby("commune_id")["population"].sum()
+ df_municipalities = context.stage("data.spatial.municipalities").set_index(
+ "commune_id"
+ )
+ df_municipalities["population"] = (
+ context.stage("data.spatial.population")
+ .groupby("commune_id")["population"]
+ .sum()
+ )
df_households["commune_id"] = df_households["commune_id"].cat.add_categories(
- sorted(set(df_municipalities.index.unique()) - set(df_households["commune_id"].cat.categories)))
+ sorted(
+ set(df_municipalities.index.unique())
+ - set(df_households["commune_id"].cat.categories)
+ )
+ )
departements = df_households[~f_has_commune]["departement_id"].unique()
- for departement_id in context.progress(departements, label = "Fixing missing communes ..."):
+ for departement_id in context.progress(
+ departements, label="Fixing missing communes ..."
+ ):
df_candidates = df_municipalities[
- ~df_municipalities["has_iris"] &
- (df_municipalities["departement_id"].astype(str) == departement_id)]
+ ~df_municipalities["has_iris"]
+ & (df_municipalities["departement_id"].astype(str) == departement_id)
+ ]
df_target = df_households[
- ~f_has_commune &
- (df_households["departement_id"] == departement_id)].copy()
+ ~f_has_commune & (df_households["departement_id"] == departement_id)
+ ].copy()
weights = df_candidates["population"].values.astype(float)
weights /= np.sum(weights)
- indices = np.repeat(np.arange(weights.shape[0]), random.multinomial(len(df_target), weights))
- df_target["commune_id"] = df_candidates.reset_index()["commune_id"].iloc[indices].values
+ indices = np.repeat(
+ np.arange(weights.shape[0]), random.multinomial(len(df_target), weights)
+ )
+ df_target["commune_id"] = (
+ df_candidates.reset_index()["commune_id"].iloc[indices].values
+ )
df_households.loc[df_target.index, "commune_id"] = df_target["commune_id"]
# Fix missing IRIS (we select from those with <200 inhabitants)
df_iris = context.stage("data.spatial.iris").set_index("iris_id")
- df_iris["population"] = context.stage("data.spatial.population").set_index("iris_id")["population"]
+ df_iris["population"] = context.stage("data.spatial.population").set_index(
+ "iris_id"
+ )["population"]
df_households["iris_id"] = df_households["iris_id"].cat.add_categories(
- sorted(set(df_iris.index.unique()) - set(df_households["iris_id"].cat.categories)))
+ sorted(
+ set(df_iris.index.unique()) - set(df_households["iris_id"].cat.categories)
+ )
+ )
communes = df_households[~f_has_iris & f_has_commune]["commune_id"].unique()
- for commune_id in context.progress(communes, label = "Fixing missing IRIS ..."):
+ for commune_id in context.progress(communes, label="Fixing missing IRIS ..."):
df_candidates = df_iris[
- (df_iris["population"] <= 200) &
- (df_iris["commune_id"].astype(str) == commune_id)]
+ (df_iris["population"] <= 200)
+ & (df_iris["commune_id"].astype(str) == commune_id)
+ ]
df_target = df_households[
- f_has_commune & ~f_has_iris &
- (df_households["commune_id"] == commune_id)].copy()
+ f_has_commune & ~f_has_iris & (df_households["commune_id"] == commune_id)
+ ].copy()
weights = df_candidates["population"].values.astype(float)
- if (weights == 0.0).all(): weights += 1.0
+ if (weights == 0.0).all():
+ weights += 1.0
weights /= np.sum(weights)
- indices = np.repeat(np.arange(weights.shape[0]), random.multinomial(len(df_target), weights))
- df_target["iris_id"] = df_candidates.reset_index()["iris_id"].iloc[indices].values
+ indices = np.repeat(
+ np.arange(weights.shape[0]), random.multinomial(len(df_target), weights)
+ )
+ df_target["iris_id"] = (
+ df_candidates.reset_index()["iris_id"].iloc[indices].values
+ )
df_households.loc[df_target.index, "iris_id"] = df_target["iris_id"]
@@ -90,14 +125,20 @@ def execute(context):
# Now there are some people left who don't have an IRIS, because the commune
# is not covered in IRIS. Hence, we derive the commune-based IRIS for them.
f = df_households["iris_id"] == "undefined"
- df_households.loc[f, "iris_id"] = df_households.loc[f, "commune_id"].astype(str) + "0000"
+ df_households.loc[f, "iris_id"] = (
+ df_households.loc[f, "commune_id"].astype(str) + "0000"
+ )
# Finally, make sure that we have no invalid codes
- invalid_communes = set(df_households["commune_id"].unique()) - set(df_municipalities.index.unique())
+ invalid_communes = set(df_households["commune_id"].unique()) - set(
+ df_municipalities.index.unique()
+ )
invalid_iris = set(df_households["iris_id"].unique()) - set(df_iris.index.unique())
assert len(invalid_communes) == 0
assert len(invalid_iris) == 0
assert np.count_nonzero(df_households["iris_id"] == "undefined") == 0
- return df_households.reset_index()[["household_id", "departement_id", "commune_id", "iris_id"]]
+ return df_households.reset_index()[
+ ["household_id", "departement_id", "commune_id", "iris_id"]
+ ]
diff --git a/synthesis/population/spatial/locations.py b/synthesis/population/spatial/locations.py
index 2397e095..adc9bb2d 100644
--- a/synthesis/population/spatial/locations.py
+++ b/synthesis/population/spatial/locations.py
@@ -2,6 +2,7 @@
import geopandas as gpd
import numpy as np
+
def configure(context):
context.stage("synthesis.population.spatial.home.locations")
context.stage("synthesis.population.spatial.primary.locations")
@@ -11,57 +12,94 @@ def configure(context):
context.stage("synthesis.population.sampled")
context.stage("data.spatial.iris")
+
def execute(context):
df_home = context.stage("synthesis.population.spatial.home.locations")
- df_work, df_education = context.stage("synthesis.population.spatial.primary.locations")
+ df_work, df_education = context.stage(
+ "synthesis.population.spatial.primary.locations"
+ )
df_secondary = context.stage("synthesis.population.spatial.secondary.locations")[0]
- df_persons = context.stage("synthesis.population.sampled")[["person_id", "household_id"]]
- df_locations = context.stage("synthesis.population.activities")[["person_id", "activity_index", "purpose"]]
+ df_persons = context.stage("synthesis.population.sampled")[
+ ["person_id", "household_id"]
+ ]
+ df_locations = context.stage("synthesis.population.activities")[
+ ["person_id", "activity_index", "purpose"]
+ ]
# Home locations
df_home_locations = df_locations[df_locations["purpose"] == "home"]
- df_home_locations = pd.merge(df_home_locations, df_persons, on = "person_id")
- df_home_locations = pd.merge(df_home_locations, df_home[["household_id", "geometry"]], on = "household_id")
+ df_home_locations = pd.merge(df_home_locations, df_persons, on="person_id")
+ df_home_locations = pd.merge(
+ df_home_locations, df_home[["household_id", "geometry"]], on="household_id"
+ )
df_home_locations["location_id"] = -1
- df_home_locations = df_home_locations[["person_id", "activity_index", "location_id", "geometry"]]
+ df_home_locations = df_home_locations[
+ ["person_id", "activity_index", "location_id", "geometry"]
+ ]
# Work locations
df_work_locations = df_locations[df_locations["purpose"] == "work"]
- df_work_locations = pd.merge(df_work_locations, df_work[["person_id", "location_id", "geometry"]], on = "person_id")
- df_work_locations = df_work_locations[["person_id", "activity_index", "location_id", "geometry"]]
+ df_work_locations = pd.merge(
+ df_work_locations,
+ df_work[["person_id", "location_id", "geometry"]],
+ on="person_id",
+ )
+ df_work_locations = df_work_locations[
+ ["person_id", "activity_index", "location_id", "geometry"]
+ ]
assert not df_work_locations["geometry"].isna().any()
# Education locations
df_education_locations = df_locations[df_locations["purpose"] == "education"]
- df_education_locations = pd.merge(df_education_locations, df_education[["person_id", "location_id", "geometry"]], on = "person_id")
- df_education_locations = df_education_locations[["person_id", "activity_index", "location_id", "geometry"]]
+ df_education_locations = pd.merge(
+ df_education_locations,
+ df_education[["person_id", "location_id", "geometry"]],
+ on="person_id",
+ )
+ df_education_locations = df_education_locations[
+ ["person_id", "activity_index", "location_id", "geometry"]
+ ]
assert not df_education_locations["geometry"].isna().any()
# Secondary locations
- df_secondary_locations = df_locations[~df_locations["purpose"].isin(("home", "work", "education"))].copy()
- df_secondary_locations = pd.merge(df_secondary_locations, df_secondary[[
- "person_id", "activity_index", "location_id", "geometry"
- ]], on = ["person_id", "activity_index"], how = "left")
- df_secondary_locations = df_secondary_locations[["person_id", "activity_index", "location_id", "geometry"]]
+ df_secondary_locations = df_locations[
+ ~df_locations["purpose"].isin(("home", "work", "education"))
+ ].copy()
+ df_secondary_locations = pd.merge(
+ df_secondary_locations,
+ df_secondary[["person_id", "activity_index", "location_id", "geometry"]],
+ on=["person_id", "activity_index"],
+ how="left",
+ )
+ df_secondary_locations = df_secondary_locations[
+ ["person_id", "activity_index", "location_id", "geometry"]
+ ]
assert not df_secondary_locations["geometry"].isna().any()
# Validation
initial_count = len(df_locations)
- df_locations = pd.concat([df_home_locations, df_work_locations, df_education_locations, df_secondary_locations])
+ df_locations = pd.concat(
+ [
+ df_home_locations,
+ df_work_locations,
+ df_education_locations,
+ df_secondary_locations,
+ ]
+ )
- df_locations = df_locations.sort_values(by = ["person_id", "activity_index"])
+ df_locations = df_locations.sort_values(by=["person_id", "activity_index"])
final_count = len(df_locations)
assert initial_count == final_count
assert not df_locations["geometry"].isna().any()
- df_locations = gpd.GeoDataFrame(df_locations, crs = df_home.crs)
+ df_locations = gpd.GeoDataFrame(df_locations, crs=df_home.crs)
# add municipalities
df_iris = context.stage("data.spatial.iris")
- df_iris = gpd.GeoDataFrame(df_iris, crs = df_home.crs)
+ df_iris = gpd.GeoDataFrame(df_iris, crs=df_home.crs)
- df_locations = gpd.sjoin(df_locations,df_iris,how="left")
+ df_locations = gpd.sjoin(df_locations, df_iris, how="left")
return df_locations
diff --git a/synthesis/population/spatial/primary/candidates.py b/synthesis/population/spatial/primary/candidates.py
index 7af9963c..811bb5be 100644
--- a/synthesis/population/spatial/primary/candidates.py
+++ b/synthesis/population/spatial/primary/candidates.py
@@ -1,6 +1,7 @@
import pandas as pd
import numpy as np
+
def configure(context):
context.stage("data.od.weighted")
@@ -15,11 +16,14 @@ def configure(context):
context.config("random_seed")
context.config("education_location_source", "bpe")
+
EDUCATION_MAPPING = {
"primary_school": ["C1"],
"middle_school": ["C2"],
"high_school": ["C3"],
- "higher_education": ["C4", "C5", "C6"]}
+ "higher_education": ["C4", "C5", "C6"],
+}
+
def sample_destination_municipalities(context, arguments):
# Load data
@@ -37,6 +41,7 @@ def sample_destination_municipalities(context, arguments):
context.progress.update()
return df_od[["origin_id", "destination_id", "count"]]
+
def sample_locations(context, arguments):
# Load data
destination_id, random_seed = arguments
@@ -45,7 +50,7 @@ def sample_locations(context, arguments):
# Prepare state
random = np.random.RandomState(random_seed)
df_locations = df_locations[df_locations["commune_id"] == destination_id]
-
+
# Determine demand
df_flow = df_flow[df_flow["destination_id"] == destination_id]
count = df_flow["count"].sum()
@@ -55,40 +60,45 @@ def sample_locations(context, arguments):
if "weight" in df_locations:
weight = df_locations["weight"].values / df_locations["weight"].sum()
-
+
location_counts = random.multinomial(count, weight)
location_ids = df_locations["location_id"].values
location_ids = np.repeat(location_ids, location_counts)
- # Shuffle, as otherwise it is likely that *all* copies
+ # Shuffle, as otherwise it is likely that *all* copies
# of the first location id go to the first origin, and so on
random.shuffle(location_ids)
# Construct a data set for all commutes to this zone
origin_id = np.repeat(df_flow["origin_id"].values, df_flow["count"].values)
- df_result = pd.DataFrame.from_records(dict(
- origin_id = origin_id,
- location_id = location_ids
- ))
+ df_result = pd.DataFrame.from_records(
+ dict(origin_id=origin_id, location_id=location_ids)
+ )
df_result["destination_id"] = destination_id
return df_result
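# Sketch of the allocation above, and of why the shuffle matters: location ids are
# repeated according to a multinomial draw over the location weights, and without
# shuffling the resulting block structure would hand all copies of the first
# location id to the first origins in the flow list. Ids and weights are invented.
import numpy as np

random = np.random.RandomState(0)
location_ids = np.array([101, 102, 103])                   # hypothetical location ids
location_counts = random.multinomial(6, [0.5, 0.3, 0.2])   # demand of six commuters
location_ids = np.repeat(location_ids, location_counts)    # e.g. [101 101 101 102 102 103]
random.shuffle(location_ids)                                # mix before pairing with origins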
-def process(context, purpose, random, df_persons, df_od, df_locations,step_name):
+
+def process(context, purpose, random, df_persons, df_od, df_locations, step_name):
df_persons = df_persons[df_persons["has_%s_trip" % purpose]]
# Sample commute flows based on population
- df_demand = df_persons.groupby("commune_id").size().reset_index(name = "count")
+ df_demand = df_persons.groupby("commune_id").size().reset_index(name="count")
df_demand["random_seed"] = random.randint(0, int(1e6), len(df_demand))
df_demand = df_demand[["commune_id", "count", "random_seed"]]
df_demand = df_demand[df_demand["count"] > 0]
df_flow = []
- with context.progress(label = "Sampling %s municipalities" % step_name, total = len(df_demand)) as progress:
- with context.parallel(dict(df_od = df_od)) as parallel:
- for df_partial in parallel.imap_unordered(sample_destination_municipalities, df_demand.itertuples(index = False, name = None)):
+ with context.progress(
+ label="Sampling %s municipalities" % step_name, total=len(df_demand)
+ ) as progress:
+ with context.parallel(dict(df_od=df_od)) as parallel:
+ for df_partial in parallel.imap_unordered(
+ sample_destination_municipalities,
+ df_demand.itertuples(index=False, name=None),
+ ):
df_flow.append(df_partial)
df_flow = pd.concat(df_flow).sort_values(["origin_id", "destination_id"])
@@ -99,30 +109,45 @@ def process(context, purpose, random, df_persons, df_od, df_locations,step_name)
df_result = []
- with context.progress(label = "Sampling %s destinations" % purpose, total = len(df_demand)) as progress:
- with context.parallel(dict(df_locations = df_locations, df_flow = df_flow)) as parallel:
- for df_partial in parallel.imap_unordered(sample_locations, zip(unique_ids, random_seeds)):
+ with context.progress(
+ label="Sampling %s destinations" % purpose, total=len(df_demand)
+ ) as progress:
+ with context.parallel(
+ dict(df_locations=df_locations, df_flow=df_flow)
+ ) as parallel:
+ for df_partial in parallel.imap_unordered(
+ sample_locations, zip(unique_ids, random_seeds)
+ ):
df_result.append(df_partial)
df_result = pd.concat(df_result).sort_values(["origin_id", "destination_id"])
return df_result[["origin_id", "destination_id", "location_id"]]
+
def execute(context):
# Prepare population data
- df_persons = context.stage("synthesis.population.enriched")[["person_id", "household_id", "age_range"]].copy()
+ df_persons = context.stage("synthesis.population.enriched")[
+ ["person_id", "household_id", "age_range"]
+ ].copy()
df_trips = context.stage("synthesis.population.trips")
- df_persons["has_work_trip"] = df_persons["person_id"].isin(df_trips[
- (df_trips["following_purpose"] == "work") | (df_trips["preceding_purpose"] == "work")
- ]["person_id"])
-
- df_persons["has_education_trip"] = df_persons["person_id"].isin(df_trips[
- (df_trips["following_purpose"] == "education") | (df_trips["preceding_purpose"] == "education")
- ]["person_id"])
+ df_persons["has_work_trip"] = df_persons["person_id"].isin(
+ df_trips[
+ (df_trips["following_purpose"] == "work")
+ | (df_trips["preceding_purpose"] == "work")
+ ]["person_id"]
+ )
+
+ df_persons["has_education_trip"] = df_persons["person_id"].isin(
+ df_trips[
+ (df_trips["following_purpose"] == "education")
+ | (df_trips["preceding_purpose"] == "education")
+ ]["person_id"]
+ )
df_homes = context.stage("synthesis.population.spatial.home.zones")
- df_persons = pd.merge(df_persons, df_homes, on = "household_id")
+ df_persons = pd.merge(df_persons, df_homes, on="household_id")
# Prepare spatial data
df_work_od, df_education_od = context.stage("data.od.weighted")
@@ -132,27 +157,50 @@ def execute(context):
df_locations = context.stage("synthesis.locations.work")
df_locations["weight"] = df_locations["employees"]
- df_work = process(context, "work", random, df_persons,
- df_work_od, df_locations, "work"
+ df_work = process(
+ context, "work", random, df_persons, df_work_od, df_locations, "work"
)
df_locations = context.stage("synthesis.locations.education")
- if context.config("education_location_source") == 'bpe':
- df_education = process(context, "education", random, df_persons, df_education_od, df_locations,"education")
- else :
+ if context.config("education_location_source") == "bpe":
+ df_education = process(
+ context,
+ "education",
+ random,
+ df_persons,
+ df_education_od,
+ df_locations,
+ "education",
+ )
+ else:
df_education = []
for prefix, education_type in EDUCATION_MAPPING.items():
df_education.append(
- process(context, "education", random,
- df_persons[df_persons["age_range"]==prefix],
- df_education_od[df_education_od["age_range"]==prefix],df_locations[df_locations["education_type"].isin(education_type)],prefix)
+ process(
+ context,
+ "education",
+ random,
+ df_persons[df_persons["age_range"] == prefix],
+ df_education_od[df_education_od["age_range"] == prefix],
+ df_locations[df_locations["education_type"].isin(education_type)],
+ prefix,
+ )
)
df_education = pd.concat(df_education)
return dict(
- work_candidates = df_work,
- education_candidates = df_education,
- persons = df_persons[df_persons["has_work_trip"] | df_persons["has_education_trip"]][[
- "person_id", "household_id", "age_range", "commune_id", "has_work_trip", "has_education_trip"
- ]]
+ work_candidates=df_work,
+ education_candidates=df_education,
+ persons=df_persons[
+ df_persons["has_work_trip"] | df_persons["has_education_trip"]
+ ][
+ [
+ "person_id",
+ "household_id",
+ "age_range",
+ "commune_id",
+ "has_work_trip",
+ "has_education_trip",
+ ]
+ ],
)
diff --git a/synthesis/population/spatial/primary/locations.py b/synthesis/population/spatial/primary/locations.py
index 136e18ac..92c1c5f1 100644
--- a/synthesis/population/spatial/primary/locations.py
+++ b/synthesis/population/spatial/primary/locations.py
@@ -3,6 +3,7 @@
import geopandas as gpd
from .candidates import EDUCATION_MAPPING
+
def configure(context):
context.stage("synthesis.population.spatial.primary.candidates")
context.stage("synthesis.population.spatial.commute_distance")
@@ -16,17 +17,20 @@ def configure(context):
def define_distance_ordering(df_persons, df_candidates, progress):
indices = []
- f_available = np.ones((len(df_candidates),), dtype = bool)
+ f_available = np.ones((len(df_candidates),), dtype=bool)
costs = np.ones((len(df_candidates),)) * np.inf
- commute_coordinates = np.vstack([
- df_candidates["geometry"].x.values,
- df_candidates["geometry"].y.values
- ]).T
+ commute_coordinates = np.vstack(
+ [df_candidates["geometry"].x.values, df_candidates["geometry"].y.values]
+ ).T
- for home_coordinate, commute_distance in zip(df_persons["home_location"], df_persons["commute_distance"]):
+ for home_coordinate, commute_distance in zip(
+ df_persons["home_location"], df_persons["commute_distance"]
+ ):
home_coordinate = np.array([home_coordinate.x, home_coordinate.y])
- distances = np.sqrt(np.sum((commute_coordinates[f_available] - home_coordinate)**2, axis = 1))
+ distances = np.sqrt(
+ np.sum((commute_coordinates[f_available] - home_coordinate) ** 2, axis=1)
+ )
costs[f_available] = np.abs(distances - commute_distance)
selected_index = np.argmin(costs)
@@ -40,20 +44,25 @@ def define_distance_ordering(df_persons, df_candidates, progress):
return indices
+
def define_random_ordering(df_persons, df_candidates, progress):
progress.update(len(df_candidates))
return np.arange(len(df_candidates))
+
define_ordering = define_distance_ordering
+
def process_municipality(context, origin_id):
# Load data
- df_candidates, df_persons = context.data("df_candidates"), context.data("df_persons")
+ df_candidates, df_persons = context.data("df_candidates"), context.data(
+ "df_persons"
+ )
# Find relevant records
- df_persons = df_persons[df_persons["commune_id"] == origin_id][[
- "person_id", "home_location", "commute_distance"
- ]].copy()
+ df_persons = df_persons[df_persons["commune_id"] == origin_id][
+ ["person_id", "home_location", "commute_distance"]
+ ].copy()
df_candidates = df_candidates[df_candidates["origin_id"] == origin_id]
# From previous step, this should be equal!
@@ -63,22 +72,28 @@ def process_municipality(context, origin_id):
df_candidates = df_candidates.iloc[indices]
df_candidates["person_id"] = df_persons["person_id"].values
- df_candidates = df_candidates.rename(columns = dict(destination_id = "commune_id"))
+ df_candidates = df_candidates.rename(columns=dict(destination_id="commune_id"))
return df_candidates[["person_id", "commune_id", "location_id", "geometry"]]
+
def process(context, purpose, df_persons, df_candidates):
unique_ids = df_candidates["origin_id"].unique()
df_result = []
- with context.progress(label = "Distributing %s destinations" % purpose, total = len(df_persons)) as progress:
- with context.parallel(dict(df_persons = df_persons, df_candidates = df_candidates)) as parallel:
+ with context.progress(
+ label="Distributing %s destinations" % purpose, total=len(df_persons)
+ ) as progress:
+ with context.parallel(
+ dict(df_persons=df_persons, df_candidates=df_candidates)
+ ) as parallel:
for df_partial in parallel.imap_unordered(process_municipality, unique_ids):
df_result.append(df_partial)
return pd.concat(df_result).sort_index()
+
def execute(context):
data = context.stage("synthesis.population.spatial.primary.candidates")
df_persons = data["persons"]
@@ -90,38 +105,69 @@ def execute(context):
# Attach home locations
df_home = context.stage("synthesis.population.spatial.home.locations")
- df_work = pd.merge(df_work, df_home[["household_id", "geometry"]].rename(columns = {
- "geometry": "home_location"
- }), how = "left", on = "household_id")
-
- df_education = pd.merge(df_education, df_home[["household_id", "geometry"]].rename(columns = {
- "geometry": "home_location"
- }), how = "left", on = "household_id")
+ df_work = pd.merge(
+ df_work,
+ df_home[["household_id", "geometry"]].rename(
+ columns={"geometry": "home_location"}
+ ),
+ how="left",
+ on="household_id",
+ )
+
+ df_education = pd.merge(
+ df_education,
+ df_home[["household_id", "geometry"]].rename(
+ columns={"geometry": "home_location"}
+ ),
+ how="left",
+ on="household_id",
+ )
# Attach commute distances
df_commute_distance = context.stage("synthesis.population.spatial.commute_distance")
- df_work = pd.merge(df_work, df_commute_distance["work"], how = "left", on = "person_id")
- df_education = pd.merge(df_education, df_commute_distance["education"], how = "left", on = "person_id")
+ df_work = pd.merge(df_work, df_commute_distance["work"], how="left", on="person_id")
+ df_education = pd.merge(
+ df_education, df_commute_distance["education"], how="left", on="person_id"
+ )
# Attach geometry
- df_locations = context.stage("synthesis.locations.work")[["location_id", "geometry"]]
+ df_locations = context.stage("synthesis.locations.work")[
+ ["location_id", "geometry"]
+ ]
df_work_candidates = data["work_candidates"]
- df_work_candidates = pd.merge(df_work_candidates, df_locations, how = "left", on = "location_id")
+ df_work_candidates = pd.merge(
+ df_work_candidates, df_locations, how="left", on="location_id"
+ )
df_work_candidates = gpd.GeoDataFrame(df_work_candidates)
- df_locations = context.stage("synthesis.locations.education")[["education_type", "location_id", "geometry"]]
+ df_locations = context.stage("synthesis.locations.education")[
+ ["education_type", "location_id", "geometry"]
+ ]
df_education_candidates = data["education_candidates"]
- df_education_candidates = pd.merge(df_education_candidates, df_locations, how = "left", on = "location_id")
+ df_education_candidates = pd.merge(
+ df_education_candidates, df_locations, how="left", on="location_id"
+ )
df_education_candidates = gpd.GeoDataFrame(df_education_candidates)
# Assign destinations
df_work = process(context, "work", df_work, df_work_candidates)
- if context.config("education_location_source") == 'bpe':
- df_education = process(context, "education", df_education, df_education_candidates)
- else :
+ if context.config("education_location_source") == "bpe":
+ df_education = process(
+ context, "education", df_education, df_education_candidates
+ )
+ else:
education = []
for prefix, education_type in EDUCATION_MAPPING.items():
- education.append(process(context, prefix,df_education[df_education["age_range"]==prefix],df_education_candidates[df_education_candidates["education_type"].isin(education_type)]))
+ education.append(
+ process(
+ context,
+ prefix,
+ df_education[df_education["age_range"] == prefix],
+ df_education_candidates[
+ df_education_candidates["education_type"].isin(education_type)
+ ],
+ )
+ )
df_education = pd.concat(education).sort_index()
return df_work, df_education
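# Condensed sketch of the greedy rule in define_distance_ordering() above: each
# person takes the still-unused candidate whose distance from home is closest to
# the person's observed commute distance; the selected candidate is assumed to be
# removed from the pool afterwards. Coordinates and target distances are invented.
import numpy as np

homes = np.array([[0.0, 0.0], [10.0, 0.0]])           # home coordinates per person
commute_distances = np.array([5.0, 2.0])              # target commute distances
candidates = np.array([[4.0, 0.0], [9.0, 0.0], [1.0, 0.0]])

available = np.ones(len(candidates), dtype=bool)
indices = []
for home, target in zip(homes, commute_distances):
    distances = np.sqrt(np.sum((candidates - home) ** 2, axis=1))
    costs = np.where(available, np.abs(distances - target), np.inf)
    selected = int(np.argmin(costs))
    available[selected] = False
    indices.append(selected)

# indices == [0, 1]: the first person gets the candidate 4.0 units away (target 5.0),
# the second the candidate 1.0 unit away (target 2.0).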
diff --git a/synthesis/population/spatial/secondary/components.py b/synthesis/population/spatial/secondary/components.py
index d16a8877..49627650 100644
--- a/synthesis/population/spatial/secondary/components.py
+++ b/synthesis/population/spatial/secondary/components.py
@@ -2,9 +2,12 @@
import sklearn.neighbors
import numpy as np
+
class CustomDistanceSampler(rda.FeasibleDistanceSampler):
- def __init__(self, random, distributions, maximum_iterations = 1000):
- rda.FeasibleDistanceSampler.__init__(self, random = random, maximum_iterations = maximum_iterations)
+ def __init__(self, random, distributions, maximum_iterations=1000):
+ rda.FeasibleDistanceSampler.__init__(
+ self, random=random, maximum_iterations=maximum_iterations
+ )
self.random = random
self.distributions = distributions
@@ -12,7 +15,9 @@ def __init__(self, random, distributions, maximum_iterations = 1000):
def sample_distances(self, problem):
distances = np.zeros((len(problem["modes"])))
- for index, (mode, travel_time) in enumerate(zip(problem["modes"], problem["travel_times"])):
+ for index, (mode, travel_time) in enumerate(
+ zip(problem["modes"], problem["travel_times"])
+ ):
mode_distribution = self.distributions[mode]
bound_index = np.count_nonzero(travel_time > mode_distribution["bounds"])
@@ -24,6 +29,7 @@ def sample_distances(self, problem):
return distances
+
class CandidateIndex:
def __init__(self, data):
self.data = data
@@ -34,7 +40,9 @@ def __init__(self, data):
self.indices[purpose] = sklearn.neighbors.KDTree(data["locations"])
def query(self, purpose, location):
- index = self.indices[purpose].query(location.reshape(1, -1), return_distance = False)[0][0]
+ index = self.indices[purpose].query(
+ location.reshape(1, -1), return_distance=False
+ )[0][0]
identifier = self.data[purpose]["identifiers"][index]
location = self.data[purpose]["locations"][index]
return identifier, location
@@ -45,6 +53,7 @@ def sample(self, purpose, random):
location = self.data[purpose]["locations"][index]
return identifier, location
+
class CustomDiscretizationSolver(rda.DiscretizationSolver):
def __init__(self, index):
self.index = index
@@ -62,9 +71,12 @@ def solve(self, problem, locations):
assert len(discretized_locations) == problem["size"]
return dict(
- valid = True, locations = np.vstack(discretized_locations), identifiers = discretized_identifiers
+ valid=True,
+ locations=np.vstack(discretized_locations),
+ identifiers=discretized_identifiers,
)
+
class CustomFreeChainSolver(rda.RelaxationSolver):
def __init__(self, random, index):
self.random = random
@@ -76,4 +88,4 @@ def solve(self, problem, distances):
locations = np.vstack((anchor, locations))
assert len(locations) == len(distances) + 1
- return dict(valid = True, locations = locations)
+ return dict(valid=True, locations=locations)
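# Standalone illustration of the nearest-candidate lookup wrapped by
# CandidateIndex.query() above: a KD-tree over the candidate coordinates returns
# the index of the candidate closest to a relaxed (continuous) location.
# Coordinates are invented.
import numpy as np
import sklearn.neighbors

candidate_locations = np.array([[0.0, 0.0], [5.0, 5.0], [9.0, 1.0]])
tree = sklearn.neighbors.KDTree(candidate_locations)

query_location = np.array([6.0, 4.0])
index = tree.query(query_location.reshape(1, -1), return_distance=False)[0][0]
# index == 1: the candidate at (5, 5) is the closest to (6, 4)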
diff --git a/synthesis/population/spatial/secondary/distance_distributions.py b/synthesis/population/spatial/secondary/distance_distributions.py
index 7fb7273b..fbf31424 100644
--- a/synthesis/population/spatial/secondary/distance_distributions.py
+++ b/synthesis/population/spatial/secondary/distance_distributions.py
@@ -1,8 +1,10 @@
import numpy as np
import pandas as pd
+
def configure(context):
- context.stage("data.hts.selected", alias = "hts")
+ context.stage("data.hts.selected", alias="hts")
+
def calculate_bounds(values, bin_size):
values = np.sort(values)
@@ -26,25 +28,44 @@ def calculate_bounds(values, bin_size):
bounds[-1] = np.inf
else:
bounds.append(np.inf)
-
+
return bounds
+
def execute(context):
# Prepare data
df_households, df_persons, df_trips = context.stage("hts")
- df_trips = pd.merge(df_trips, df_persons[["person_id", "person_weight"]].rename(columns = { "person_weight": "weight" }))
+ df_trips = pd.merge(
+ df_trips,
+ df_persons[["person_id", "person_weight"]].rename(
+ columns={"person_weight": "weight"}
+ ),
+ )
df_trips["travel_time"] = df_trips["arrival_time"] - df_trips["departure_time"]
- distance_column = "euclidean_distance" if "euclidean_distance" in df_trips else "routed_distance"
- df = df_trips[["mode", "travel_time", distance_column, "weight", "preceding_purpose", "following_purpose"]].rename(columns = { distance_column: "distance" })
+ distance_column = (
+ "euclidean_distance" if "euclidean_distance" in df_trips else "routed_distance"
+ )
+ df = df_trips[
+ [
+ "mode",
+ "travel_time",
+ distance_column,
+ "weight",
+ "preceding_purpose",
+ "following_purpose",
+ ]
+ ].rename(columns={distance_column: "distance"})
# Filtering
primary_activities = ["home", "work", "education"]
- df = df[~(
- df["preceding_purpose"].isin(primary_activities) &
- df["following_purpose"].isin(primary_activities)
- )]
+ df = df[
+ ~(
+ df["preceding_purpose"].isin(primary_activities)
+ & df["following_purpose"].isin(primary_activities)
+ )
+ ]
# Calculate distributions
modes = df["mode"].unique()
@@ -57,11 +78,13 @@ def execute(context):
f_mode = df["mode"] == mode
bounds = calculate_bounds(df[f_mode]["travel_time"].values, bin_size)
- distributions[mode] = dict(bounds = np.array(bounds), distributions = [])
+ distributions[mode] = dict(bounds=np.array(bounds), distributions=[])
# Second, calculate distribution per band
for lower_bound, upper_bound in zip([-np.inf] + bounds[:-1], bounds):
- f_bound = (df["travel_time"] > lower_bound) & (df["travel_time"] <= upper_bound)
+ f_bound = (df["travel_time"] > lower_bound) & (
+ df["travel_time"] <= upper_bound
+ )
# Set up distribution
values = df[f_mode & f_bound]["distance"].values
@@ -75,6 +98,8 @@ def execute(context):
cdf /= cdf[-1]
# Write distribution
- distributions[mode]["distributions"].append(dict(cdf = cdf, values = values, weights = weights))
+ distributions[mode]["distributions"].append(
+ dict(cdf=cdf, values=values, weights=weights)
+ )
return distributions
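# Simplified illustration of the binning idea behind calculate_bounds() above:
# sort the observed travel times, cut them into bands of roughly bin_size
# observations, and keep +inf as the last bound so that every travel time falls
# into some band. This is a paraphrase of the idea, not the exact implementation.
import numpy as np

def simple_bounds(values, bin_size):
    values = np.sort(np.asarray(values, dtype=float))
    bounds = [values[index] for index in range(bin_size - 1, len(values), bin_size)]
    if len(bounds) == 0:
        return [np.inf]
    bounds[-1] = np.inf
    return bounds

# Ten observations with bin_size = 4 yield two bands: (-inf, 7] and (7, inf).
print(simple_bounds([1, 3, 5, 7, 9, 11, 13, 15, 17, 19], bin_size=4))  # [7.0, inf]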
diff --git a/synthesis/population/spatial/secondary/locations.py b/synthesis/population/spatial/secondary/locations.py
index b36fb214..f6871e79 100644
--- a/synthesis/population/spatial/secondary/locations.py
+++ b/synthesis/population/spatial/secondary/locations.py
@@ -6,6 +6,7 @@
from synthesis.population.spatial.secondary.problems import find_assignment_problems
+
def configure(context):
context.stage("synthesis.population.trips")
@@ -21,72 +22,113 @@ def configure(context):
context.config("secloc_maximum_iterations", np.inf)
+
def prepare_locations(context):
# Load persons and their primary locations
df_home = context.stage("synthesis.population.spatial.home.locations")
- df_work, df_education = context.stage("synthesis.population.spatial.primary.locations")
+ df_work, df_education = context.stage(
+ "synthesis.population.spatial.primary.locations"
+ )
crs = df_home.crs
- df_home = df_home.rename(columns = { "geometry": "home" })
- df_work = df_work.rename(columns = { "geometry": "work" })
- df_education = df_education.rename(columns = { "geometry": "education" })
+ df_home = df_home.rename(columns={"geometry": "home"})
+ df_work = df_work.rename(columns={"geometry": "work"})
+ df_education = df_education.rename(columns={"geometry": "education"})
+
+ df_locations = context.stage("synthesis.population.sampled")[
+ ["person_id", "household_id"]
+ ]
+ df_locations = pd.merge(
+ df_locations, df_home[["household_id", "home"]], how="left", on="household_id"
+ )
+ df_locations = pd.merge(
+ df_locations, df_work[["person_id", "work"]], how="left", on="person_id"
+ )
+ df_locations = pd.merge(
+ df_locations,
+ df_education[["person_id", "education"]],
+ how="left",
+ on="person_id",
+ )
- df_locations = context.stage("synthesis.population.sampled")[["person_id", "household_id"]]
- df_locations = pd.merge(df_locations, df_home[["household_id", "home"]], how = "left", on = "household_id")
- df_locations = pd.merge(df_locations, df_work[["person_id", "work"]], how = "left", on = "person_id")
- df_locations = pd.merge(df_locations, df_education[["person_id", "education"]], how = "left", on = "person_id")
+ return (
+ df_locations[["person_id", "home", "work", "education"]].sort_values(
+ by="person_id"
+ ),
+ crs,
+ )
- return df_locations[["person_id", "home", "work", "education"]].sort_values(by = "person_id"), crs
def prepare_destinations(context):
df_locations = context.stage("synthesis.locations.secondary")
identifiers = df_locations["location_id"].values
- locations = np.vstack(df_locations["geometry"].apply(lambda x: np.array([x.x, x.y])).values)
+ locations = np.vstack(
+ df_locations["geometry"].apply(lambda x: np.array([x.x, x.y])).values
+ )
data = {}
for purpose in ("shop", "leisure", "other"):
f = df_locations["offers_%s" % purpose].values
- data[purpose] = dict(
- identifiers = identifiers[f],
- locations = locations[f]
- )
+ data[purpose] = dict(identifiers=identifiers[f], locations=locations[f])
return data
+
def resample_cdf(cdf, factor):
if factor >= 0.0:
cdf = cdf * (1.0 + factor * np.arange(1, len(cdf) + 1) / len(cdf))
else:
- cdf = cdf * (1.0 + abs(factor) - abs(factor) * np.arange(1, len(cdf) + 1) / len(cdf))
+ cdf = cdf * (
+ 1.0 + abs(factor) - abs(factor) * np.arange(1, len(cdf) + 1) / len(cdf)
+ )
cdf /= cdf[-1]
return cdf
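# Small numerical check of the calibration tilt above, using the resample_cdf()
# just defined: a positive factor lowers the CDF values below one, which shifts
# probability mass toward the larger (later) values; a negative factor does the
# opposite. The input CDF is invented.
import numpy as np

example_cdf = np.array([0.25, 0.5, 0.75, 1.0])
print(resample_cdf(example_cdf, 0.5))    # ~ [0.19, 0.42, 0.69, 1.0] -> more mass in the upper bins
print(resample_cdf(example_cdf, -0.5))   # ~ [0.34, 0.62, 0.84, 1.0] -> more mass in the lower bins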
+
def resample_distributions(distributions, factors):
for mode, mode_distributions in distributions.items():
for distribution in mode_distributions["distributions"]:
distribution["cdf"] = resample_cdf(distribution["cdf"], factors[mode])
-from synthesis.population.spatial.secondary.rda import AssignmentSolver, DiscretizationErrorObjective, GravityChainSolver, AngularTailSolver, GeneralRelaxationSolver
-from synthesis.population.spatial.secondary.components import CustomDistanceSampler, CustomDiscretizationSolver, CandidateIndex, CustomFreeChainSolver
+
+from synthesis.population.spatial.secondary.rda import (
+ AssignmentSolver,
+ DiscretizationErrorObjective,
+ GravityChainSolver,
+ AngularTailSolver,
+ GeneralRelaxationSolver,
+)
+from synthesis.population.spatial.secondary.components import (
+ CustomDistanceSampler,
+ CustomDiscretizationSolver,
+ CandidateIndex,
+ CustomFreeChainSolver,
+)
+
def execute(context):
# Load trips and primary locations
- df_trips = context.stage("synthesis.population.trips").sort_values(by = ["person_id", "trip_index"])
+ df_trips = context.stage("synthesis.population.trips").sort_values(
+ by=["person_id", "trip_index"]
+ )
df_trips["travel_time"] = df_trips["arrival_time"] - df_trips["departure_time"]
df_primary, crs = prepare_locations(context)
# Prepare data
- distance_distributions = context.stage("synthesis.population.spatial.secondary.distance_distributions")
+ distance_distributions = context.stage(
+ "synthesis.population.spatial.secondary.distance_distributions"
+ )
destinations = prepare_destinations(context)
# Resampling for calibration
- resample_distributions(distance_distributions, dict(
- car = 0.0, car_passenger = 0.1, pt = 0.5, bike = 0.0, walk = -0.5
- ))
+ resample_distributions(
+ distance_distributions,
+ dict(car=0.0, car_passenger=0.1, pt=0.5, bike=0.0, walk=-0.5),
+ )
# Segment into subsamples
processes = context.config("processes")
@@ -96,108 +138,133 @@ def execute(context):
unique_person_ids = np.array_split(unique_person_ids, processes)
random = np.random.RandomState(context.config("random_seed"))
- random_seeds = random.randint(10000, size = processes)
+ random_seeds = random.randint(10000, size=processes)
# Create batch problems for parallelization
batches = []
for index in range(processes):
- batches.append((
- df_trips[df_trips["person_id"].isin(unique_person_ids[index])],
- df_primary[df_primary["person_id"].isin(unique_person_ids[index])],
- random_seeds[index], crs
- ))
+ batches.append(
+ (
+ df_trips[df_trips["person_id"].isin(unique_person_ids[index])],
+ df_primary[df_primary["person_id"].isin(unique_person_ids[index])],
+ random_seeds[index],
+ crs,
+ )
+ )
# Run algorithm in parallel
- with context.progress(label = "Assigning secondary locations to persons", total = number_of_persons):
- with context.parallel(processes = processes, data = dict(
- distance_distributions = distance_distributions,
- destinations = destinations
- )) as parallel:
+ with context.progress(
+ label="Assigning secondary locations to persons", total=number_of_persons
+ ):
+ with context.parallel(
+ processes=processes,
+ data=dict(
+ distance_distributions=distance_distributions, destinations=destinations
+ ),
+ ) as parallel:
df_locations, df_convergence = [], []
- for df_locations_item, df_convergence_item in parallel.imap_unordered(process, batches):
+ for df_locations_item, df_convergence_item in parallel.imap_unordered(
+ process, batches
+ ):
df_locations.append(df_locations_item)
df_convergence.append(df_convergence_item)
- df_locations = pd.concat(df_locations).sort_values(by = ["person_id", "activity_index"])
+ df_locations = pd.concat(df_locations).sort_values(
+ by=["person_id", "activity_index"]
+ )
df_convergence = pd.concat(df_convergence)
print("Success rate:", df_convergence["valid"].mean())
return df_locations, df_convergence
+
def process(context, arguments):
- df_trips, df_primary, random_seed, crs = arguments
-
- # Set up RNG
- random = np.random.RandomState(random_seed)
- maximum_iterations = context.config("secloc_maximum_iterations")
-
- # Set up discretization solver
- destinations = context.data("destinations")
- candidate_index = CandidateIndex(destinations)
- discretization_solver = CustomDiscretizationSolver(candidate_index)
-
- # Set up distance sampler
- distance_distributions = context.data("distance_distributions")
- distance_sampler = CustomDistanceSampler(
- maximum_iterations = min(1000, maximum_iterations),
- random = random,
- distributions = distance_distributions)
-
- # Set up relaxation solver; currently, we do not consider tail problems.
- chain_solver = GravityChainSolver(
- random = random, eps = 10.0, lateral_deviation = 10.0, alpha = 0.1,
- maximum_iterations = min(1000, maximum_iterations)
+ df_trips, df_primary, random_seed, crs = arguments
+
+ # Set up RNG
+ random = np.random.RandomState(random_seed)
+ maximum_iterations = context.config("secloc_maximum_iterations")
+
+ # Set up discretization solver
+ destinations = context.data("destinations")
+ candidate_index = CandidateIndex(destinations)
+ discretization_solver = CustomDiscretizationSolver(candidate_index)
+
+ # Set up distance sampler
+ distance_distributions = context.data("distance_distributions")
+ distance_sampler = CustomDistanceSampler(
+ maximum_iterations=min(1000, maximum_iterations),
+ random=random,
+ distributions=distance_distributions,
)
- tail_solver = AngularTailSolver(random = random)
- free_solver = CustomFreeChainSolver(random, candidate_index)
+ # Set up relaxation solver; currently, we do not consider tail problems.
+ chain_solver = GravityChainSolver(
+ random=random,
+ eps=10.0,
+ lateral_deviation=10.0,
+ alpha=0.1,
+ maximum_iterations=min(1000, maximum_iterations),
+ )
+
+ tail_solver = AngularTailSolver(random=random)
+ free_solver = CustomFreeChainSolver(random, candidate_index)
- relaxation_solver = GeneralRelaxationSolver(chain_solver, tail_solver, free_solver)
+ relaxation_solver = GeneralRelaxationSolver(chain_solver, tail_solver, free_solver)
- # Set up assignment solver
- thresholds = dict(
- car = 200.0, car_passenger = 200.0, pt = 200.0,
- bike = 100.0, walk = 100.0
- )
+ # Set up assignment solver
+ thresholds = dict(car=200.0, car_passenger=200.0, pt=200.0, bike=100.0, walk=100.0)
- assignment_objective = DiscretizationErrorObjective(thresholds = thresholds)
- assignment_solver = AssignmentSolver(
- distance_sampler = distance_sampler,
- relaxation_solver = relaxation_solver,
- discretization_solver = discretization_solver,
- objective = assignment_objective,
- maximum_iterations = min(20, maximum_iterations)
- )
+ assignment_objective = DiscretizationErrorObjective(thresholds=thresholds)
+ assignment_solver = AssignmentSolver(
+ distance_sampler=distance_sampler,
+ relaxation_solver=relaxation_solver,
+ discretization_solver=discretization_solver,
+ objective=assignment_objective,
+ maximum_iterations=min(20, maximum_iterations),
+ )
- df_locations = []
- df_convergence = []
+ df_locations = []
+ df_convergence = []
- last_person_id = None
+ last_person_id = None
- for problem in find_assignment_problems(df_trips, df_primary):
- result = assignment_solver.solve(problem)
+ for problem in find_assignment_problems(df_trips, df_primary):
+ result = assignment_solver.solve(problem)
- starting_activity_index = problem["activity_index"]
+ starting_activity_index = problem["activity_index"]
- for index, (identifier, location) in enumerate(zip(result["discretization"]["identifiers"], result["discretization"]["locations"])):
- df_locations.append((
- problem["person_id"], starting_activity_index + index, identifier, geo.Point(location)
- ))
+ for index, (identifier, location) in enumerate(
+ zip(
+ result["discretization"]["identifiers"],
+ result["discretization"]["locations"],
+ )
+ ):
+ df_locations.append(
+ (
+ problem["person_id"],
+ starting_activity_index + index,
+ identifier,
+ geo.Point(location),
+ )
+ )
- df_convergence.append((
- result["valid"], problem["size"]
- ))
+ df_convergence.append((result["valid"], problem["size"]))
- if problem["person_id"] != last_person_id:
- last_person_id = problem["person_id"]
- context.progress.update()
+ if problem["person_id"] != last_person_id:
+ last_person_id = problem["person_id"]
+ context.progress.update()
- df_locations = pd.DataFrame.from_records(df_locations, columns = ["person_id", "activity_index", "location_id", "geometry"])
- df_locations = gpd.GeoDataFrame(df_locations, crs = crs)
- assert not df_locations["geometry"].isna().any()
+ df_locations = pd.DataFrame.from_records(
+ df_locations, columns=["person_id", "activity_index", "location_id", "geometry"]
+ )
+ df_locations = gpd.GeoDataFrame(df_locations, crs=crs)
+ assert not df_locations["geometry"].isna().any()
- df_convergence = pd.DataFrame.from_records(df_convergence, columns = ["valid", "size"])
- return df_locations, df_convergence
+ df_convergence = pd.DataFrame.from_records(
+ df_convergence, columns=["valid", "size"]
+ )
+ return df_locations, df_convergence
diff --git a/synthesis/population/spatial/secondary/problems.py b/synthesis/population/spatial/secondary/problems.py
index b4f7295e..ae60266c 100644
--- a/synthesis/population/spatial/secondary/problems.py
+++ b/synthesis/population/spatial/secondary/problems.py
@@ -1,14 +1,29 @@
import numpy as np
import pandas as pd
-FIELDS = ["person_id", "trip_index", "preceding_purpose", "following_purpose", "mode", "travel_time"]
+FIELDS = [
+ "person_id",
+ "trip_index",
+ "preceding_purpose",
+ "following_purpose",
+ "mode",
+ "travel_time",
+]
FIXED_PURPOSES = ["home", "work", "education"]
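+# Problems are classified by their fixed endpoints: a "chain" has both a fixed origin
+# and a fixed destination, a "tail" has only one fixed end, and a free chain has none.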
+
def find_bare_assignment_problems(df):
problem = None
- for row in df[FIELDS].itertuples(index = False):
- person_id, trip_index, preceding_purpose, following_purpose, mode, travel_time = row
+ for row in df[FIELDS].itertuples(index=False):
+ (
+ person_id,
+ trip_index,
+ preceding_purpose,
+ following_purpose,
+ mode,
+ travel_time,
+ ) = row
if not problem is None and person_id != problem["person_id"]:
# We switch person, but we're still tracking a problem. This is a tail!
@@ -18,8 +33,11 @@ def find_bare_assignment_problems(df):
if problem is None:
# Start a new problem
problem = dict(
- person_id = person_id, trip_index = trip_index, purposes = [preceding_purpose],
- modes = [], travel_times = []
+ person_id=person_id,
+ trip_index=trip_index,
+ purposes=[preceding_purpose],
+ modes=[],
+ travel_times=[],
)
problem["purposes"].append(following_purpose)
@@ -34,16 +52,18 @@ def find_bare_assignment_problems(df):
if not problem is None:
yield problem
+
LOCATION_FIELDS = ["person_id", "home", "work", "education"]
+
def find_assignment_problems(df, df_locations):
"""
- Enriches assignment problems with:
- - Locations of the fixed activities
- - Size of the problem
- - Reduces purposes to the variable ones
+    Enriches assignment problems with:
+    - the locations of the fixed activities
+    - the size of the problem
+    and reduces the purposes to the variable ones.
"""
- location_iterator = df_locations[LOCATION_FIELDS].itertuples(index = False)
+ location_iterator = df_locations[LOCATION_FIELDS].itertuples(index=False)
current_location = None
for problem in find_bare_assignment_problems(df):
@@ -61,13 +81,13 @@ def find_assignment_problems(df, df_locations):
problem["purposes"] = problem["purposes"][:-1]
else:
- pass # Neither chain nor tail
+ pass # Neither chain nor tail
# Define size
problem["size"] = len(problem["purposes"])
if problem["size"] == 0:
- continue # We can skip if there are no variable activities
+ continue # We can skip if there are no variable activities
# Advance location iterator until we arrive at the current problem's person
while current_location is None or current_location[0] != problem["person_id"]:
@@ -78,12 +98,18 @@ def find_assignment_problems(df, df_locations):
problem["destination"] = None
if origin_purpose in FIXED_PURPOSES:
- problem["origin"] = current_location[LOCATION_FIELDS.index(origin_purpose)] # Shapely POINT
+ problem["origin"] = current_location[
+ LOCATION_FIELDS.index(origin_purpose)
+ ] # Shapely POINT
problem["origin"] = np.array([[problem["origin"].x, problem["origin"].y]])
if destination_purpose in FIXED_PURPOSES:
- problem["destination"] = current_location[LOCATION_FIELDS.index(destination_purpose)] # Shapely POINT
- problem["destination"] = np.array([[problem["destination"].x, problem["destination"].y]])
+ problem["destination"] = current_location[
+ LOCATION_FIELDS.index(destination_purpose)
+ ] # Shapely POINT
+ problem["destination"] = np.array(
+ [[problem["destination"].x, problem["destination"].y]]
+ )
if problem["origin"] is None:
problem["activity_index"] = problem["trip_index"]
diff --git a/synthesis/population/spatial/secondary/rda.py b/synthesis/population/spatial/secondary/rda.py
index 232d1c86..3eaddc1c 100644
--- a/synthesis/population/spatial/secondary/rda.py
+++ b/synthesis/population/spatial/secondary/rda.py
@@ -1,10 +1,15 @@
import numpy as np
import numpy.linalg as la
-def check_feasibility(distances, direct_distance, consider_total_distance = True):
- return calculate_feasibility(distances, direct_distance, consider_total_distance) == 0.0
-def calculate_feasibility(distances, direct_distance, consider_total_distance = True):
+def check_feasibility(distances, direct_distance, consider_total_distance=True):
+ return (
+ calculate_feasibility(distances, direct_distance, consider_total_distance)
+ == 0.0
+ )
+
+
+def calculate_feasibility(distances, direct_distance, consider_total_distance=True):
total_distance = np.sum(distances)
delta_distance = 0.0
@@ -16,24 +21,38 @@ def calculate_feasibility(distances, direct_distance, consider_total_distance =
return float(max(delta, 0))
+
class DiscretizationSolver:
def solve(self, problem, locations):
raise NotImplementedError()
+
class RelaxationSolver:
def solve(self, problem, distances):
raise NotImplementedError()
+
class DistanceSampler:
def sample(self, problem):
raise NotImplementedError()
+
class AssignmentObjective:
- def evaluate(self, problem, distance_result, relaxation_result, discretization_result):
+ def evaluate(
+ self, problem, distance_result, relaxation_result, discretization_result
+ ):
raise NotImplementedError()
+
class AssignmentSolver:
- def __init__(self, distance_sampler, relaxation_solver, discretization_solver, objective, maximum_iterations = 1000):
+ def __init__(
+ self,
+ distance_sampler,
+ relaxation_solver,
+ discretization_solver,
+ objective,
+ maximum_iterations=1000,
+ ):
self.maximum_iterations = maximum_iterations
self.relaxation_solver = relaxation_solver
@@ -47,12 +66,21 @@ def solve(self, problem):
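+        # Repeatedly sample trip distances, relax the intermediate locations and
+        # discretize them, keeping the candidate with the lowest objective value.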
for assignment_iteration in range(self.maximum_iterations):
distance_result = self.distance_sampler.sample(problem)
- relaxation_result = self.relaxation_solver.solve(problem, distance_result["distances"])
- discretization_result = self.discretization_solver.solve(problem, relaxation_result["locations"])
+ relaxation_result = self.relaxation_solver.solve(
+ problem, distance_result["distances"]
+ )
+ discretization_result = self.discretization_solver.solve(
+ problem, relaxation_result["locations"]
+ )
- assignment_result = self.objective.evaluate(problem, distance_result, relaxation_result, discretization_result)
+ assignment_result = self.objective.evaluate(
+ problem, distance_result, relaxation_result, discretization_result
+ )
- if best_result is None or assignment_result["objective"] < best_result["objective"]:
+ if (
+ best_result is None
+ or assignment_result["objective"] < best_result["objective"]
+ ):
best_result = assignment_result
assignment_result["distance"] = distance_result
@@ -65,8 +93,9 @@ def solve(self, problem):
return best_result
+
class GeneralRelaxationSolver(RelaxationSolver):
- def __init__(self, chain_solver, tail_solver = None, free_solver = None):
+ def __init__(self, chain_solver, tail_solver=None, free_solver=None):
self.chain_solver = chain_solver
self.tail_solver = tail_solver
self.free_solver = free_solver
@@ -81,6 +110,7 @@ def solve(self, problem, distances):
else:
return self.chain_solver.solve(problem, distances)
+
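+# Sample tail locations as a random walk from the fixed anchor: each step covers the
+# corresponding sampled distance in a uniformly random direction.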
def sample_tail(random, anchor, distances):
angles = random.random_sample(len(distances)) * 2.0 * np.pi
offsets = np.vstack([np.cos(angles), np.sin(angles)]).T * distances[:, np.newaxis]
@@ -92,6 +122,7 @@ def sample_tail(random, anchor, distances):
return np.vstack(locations[1:])
+
class AngularTailSolver(RelaxationSolver):
def __init__(self, random):
self.random = random
@@ -111,26 +142,38 @@ def solve(self, problem, distances):
raise RuntimeError("Invalid chain for AngularTailSolver")
locations = sample_tail(self.random, anchor, distances)
- if reverse: locations = locations[::-1,:]
+ if reverse:
+ locations = locations[::-1, :]
assert len(locations) == len(distances)
- return dict(valid = True, locations = locations)
+ return dict(valid=True, locations=locations)
+
class GravityChainSolver:
- def __init__(self, random, alpha = 0.3, eps = 1.0, maximum_iterations = 1000, lateral_deviation = None):
+ def __init__(
+ self,
+ random,
+ alpha=0.3,
+ eps=1.0,
+ maximum_iterations=1000,
+ lateral_deviation=None,
+ ):
-        self.alpha = 0.3
-        self.eps = 1e-2
+        self.alpha = alpha
+        self.eps = eps
self.maximum_iterations = maximum_iterations
self.random = random
self.lateral_deviation = lateral_deviation
- def solve_two_points(self, problem, origin, destination, distances, direction, direct_distance):
+ def solve_two_points(
+ self, problem, origin, destination, distances, direction, direct_distance
+ ):
if direct_distance == 0.0:
location = origin + direction * distances[0]
return dict(
- valid = distances[0] == distances[1],
- locations = location.reshape(-1, 2), iterations = None
+ valid=distances[0] == distances[1],
+ locations=location.reshape(-1, 2),
+ iterations=None,
)
elif direct_distance > np.sum(distances):
@@ -141,9 +184,7 @@ def solve_two_points(self, problem, origin, destination, distances, direction, d
location = origin + direction * ratio * direct_distance
- return dict(
- valid = False, locations = location.reshape(-1, 2), iterations = None
- )
+ return dict(valid=False, locations=location.reshape(-1, 2), iterations=None)
elif direct_distance < np.abs(distances[0] - distances[1]):
ratio = 1.0
@@ -154,24 +195,24 @@ def solve_two_points(self, problem, origin, destination, distances, direction, d
maximum_distance = max(distances)
location = origin + direction * ratio * maximum_distance
- return dict(
- valid = False, locations = location.reshape(-1, 2), iterations = None
- )
+ return dict(valid=False, locations=location.reshape(-1, 2), iterations=None)
else:
- A = 0.5 * ( distances[0]**2 - distances[1]**2 + direct_distance**2 ) / direct_distance
- H = np.sqrt(max(0, distances[0]**2 - A**2))
+ A = (
+ 0.5
+ * (distances[0] ** 2 - distances[1] ** 2 + direct_distance**2)
+ / direct_distance
+ )
+ H = np.sqrt(max(0, distances[0] ** 2 - A**2))
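+            # A and H locate the intersection of the two distance circles around
+            # origin and destination; one of the two mirror points is chosen at random.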
r = self.random.random_sample()
center = origin + direction * A
offset = direction * H
- offset = np.array([offset[0,1], -offset[0,0]])
+ offset = np.array([offset[0, 1], -offset[0, 0]])
location = center + (1.0 if r < 0.5 else -1.0) * offset
- return dict(
- valid = True, locations = location.reshape(-1, 2), iterations = None
- )
+ return dict(valid=True, locations=location.reshape(-1, 2), iterations=None)
def solve(self, problem, distances):
origin, destination = problem["origin"], problem["destination"]
@@ -182,21 +223,23 @@ def solve(self, problem, distances):
# Prepare direction and normal direction
direct_distance = la.norm(destination - origin)
- if direct_distance < 1e-12: # We have a zero direct distance, choose a direction randomly
+        # We have a zero direct distance, choose a direction randomly
+        if direct_distance < 1e-12:
angle = self.random.random() * np.pi * 2.0
- direction = np.array([
- np.cos(angle), np.sin(angle)
- ]).reshape((1, 2))
+ direction = np.array([np.cos(angle), np.sin(angle)]).reshape((1, 2))
else:
direction = (destination - origin) / direct_distance
- normal = np.array([direction[0,1], -direction[0,0]])
+ normal = np.array([direction[0, 1], -direction[0, 0]])
# If we have only one variable point, take a short cut
if problem["size"] == 1:
- return self.solve_two_points(problem, origin, destination, distances, direction, direct_distance)
+ return self.solve_two_points(
+ problem, origin, destination, distances, direction, direct_distance
+ )
# Prepare initial locations
if np.sum(distances) < 1e-12:
@@ -208,52 +251,76 @@ def solve(self, problem, distances):
locations = np.vstack([origin, locations, destination])
if not check_feasibility(distances, direct_distance):
-            return dict( # We still return some locations although they may not be perfect
-                valid = False, locations = locations[1:-1], iterations = None
+            # We still return some locations although they may not be perfect
+            return dict(
+                valid=False, locations=locations[1:-1], iterations=None
)
     # Add lateral deviations
- lateral_deviation = self.lateral_deviation if not self.lateral_deviation is None else max(direct_distance, 1.0)
- locations[1:-1] += normal * 2.0 * (self.random.normal(size = len(distances) - 1)[:, np.newaxis] - 0.5) * lateral_deviation
+ lateral_deviation = (
+ self.lateral_deviation
+            if self.lateral_deviation is not None
+ else max(direct_distance, 1.0)
+ )
+ locations[1:-1] += (
+ normal
+ * 2.0
+ * (self.random.normal(size=len(distances) - 1)[:, np.newaxis] - 0.5)
+ * lateral_deviation
+ )
# Prepare gravity simulation
valid = False
origin_weights = np.ones((len(distances) - 1, 2))
- origin_weights[0,:] = 2.0
+ origin_weights[0, :] = 2.0
destination_weights = np.ones((len(distances) - 1, 2))
- destination_weights[-1,:] = 2.0
+ destination_weights[-1, :] = 2.0
# Run gravity simulation
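+    # Each iteration nudges the intermediate points so that consecutive segment
+    # lengths converge towards the sampled distances.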
for k in range(self.maximum_iterations):
directions = locations[:-1] - locations[1:]
- lengths = la.norm(directions, axis = 1)
+ lengths = la.norm(directions, axis=1)
offset = distances - lengths
lengths[lengths < 1.0] = 1.0
directions /= lengths[:, np.newaxis]
- if np.all(np.abs(offset) < self.eps): # Check if we have converged
+ if np.all(np.abs(offset) < self.eps): # Check if we have converged
valid = True
break
# Apply adjustment to locations
adjustment = np.zeros((len(distances) - 1, 2))
- adjustment -= 0.5 * self.alpha * offset[:-1, np.newaxis] * directions[:-1] * origin_weights
- adjustment += 0.5 * self.alpha * offset[1:, np.newaxis] * directions[1:] * destination_weights
+ adjustment -= (
+ 0.5
+ * self.alpha
+ * offset[:-1, np.newaxis]
+ * directions[:-1]
+ * origin_weights
+ )
+ adjustment += (
+ 0.5
+ * self.alpha
+ * offset[1:, np.newaxis]
+ * directions[1:]
+ * destination_weights
+ )
locations[1:-1] += adjustment
if np.isnan(locations).any() or np.isinf(locations).any():
- raise RuntimeError("NaN/Inf value encountered during gravity simulation")
+ raise RuntimeError(
+ "NaN/Inf value encountered during gravity simulation"
+ )
+
+ return dict(valid=valid, locations=locations[1:-1], iterations=k)
- return dict(
- valid = valid, locations = locations[1:-1], iterations = k
- )
class FeasibleDistanceSampler(DistanceSampler):
- def __init__(self, random, maximum_iterations = 1000):
+ def __init__(self, random, maximum_iterations=1000):
self.maximum_iterations = maximum_iterations
self.random = random
@@ -264,26 +331,26 @@ def sample_distances(self, problem):
def sample(self, problem):
origin, destination = problem["origin"], problem["destination"]
- if origin is None and destination is None: # This is a free chain
+ if origin is None and destination is None: # This is a free chain
distances = self.sample_distances(problem)
- return dict(valid = True, distances = distances, iterations = None)
+ return dict(valid=True, distances=distances, iterations=None)
- elif origin is None: # This is a left tail
+ elif origin is None: # This is a left tail
distances = self.sample_distances(problem)
- return dict(valid = True, distances = distances, iterations = None)
+ return dict(valid=True, distances=distances, iterations=None)
- elif destination is None: # This is a right tail
+ elif destination is None: # This is a right tail
distances = self.sample_distances(problem)
- return dict(valid = True, distances = distances, iterations = None)
+ return dict(valid=True, distances=distances, iterations=None)
- direct_distance = la.norm(destination - origin, axis = 1)
+ direct_distance = la.norm(destination - origin, axis=1)
# One point and two trips
if direct_distance < 1e-3 and problem["size"] == 1:
distances = self.sample_distances(problem)
distances = np.array([distances[0], distances[0]])
- return dict(valid = True, distances = distances, iterations = None)
+ return dict(valid=True, distances=distances, iterations=None)
# This is the general case
best_distances = None
@@ -300,32 +367,35 @@ def sample(self, problem):
if delta == 0.0:
break
- return dict(
- valid = best_delta == 0.0,
- distances = best_distances,
- iterations = k
- )
+ return dict(valid=best_delta == 0.0, distances=best_distances, iterations=k)
+
class DiscretizationErrorObjective(AssignmentObjective):
def __init__(self, thresholds):
self.thresholds = thresholds
- def evaluate(self, problem, distance_result, relaxation_result, discretization_result):
+ def evaluate(
+ self, problem, distance_result, relaxation_result, discretization_result
+ ):
sampled_distances = distance_result["distances"]
discretized_locations = []
- if not problem["origin"] is None: discretized_locations.append(problem["origin"])
+ if not problem["origin"] is None:
+ discretized_locations.append(problem["origin"])
discretized_locations.append(discretization_result["locations"])
- if not problem["destination"] is None: discretized_locations.append(problem["destination"])
+ if not problem["destination"] is None:
+ discretized_locations.append(problem["destination"])
discretized_locations = np.vstack(discretized_locations)
- discretized_distances = la.norm(discretized_locations[:-1] - discretized_locations[1:], axis = 1)
+ discretized_distances = la.norm(
+ discretized_locations[:-1] - discretized_locations[1:], axis=1
+ )
discretization_error = np.abs(sampled_distances - discretized_distances)
objective = 0.0
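+        # The objective is the largest error in excess of the mode-specific threshold;
+        # the result only counts as valid if no trip exceeds its threshold.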
for error, mode in zip(discretization_error, problem["modes"]):
target_error = self.thresholds[mode]
- excess_error = max(0.0, error - target_error )
+ excess_error = max(0.0, error - target_error)
objective = max(objective, excess_error)
valid = objective == 0.0
@@ -333,4 +403,4 @@ def evaluate(self, problem, distance_result, relaxation_result, discretization_r
valid &= relaxation_result["valid"]
valid &= discretization_result["valid"]
- return dict(valid = valid, objective = objective)
+ return dict(valid=valid, objective=objective)
diff --git a/synthesis/population/trips.py b/synthesis/population/trips.py
index 7a76af96..c4483743 100644
--- a/synthesis/population/trips.py
+++ b/synthesis/population/trips.py
@@ -7,12 +7,14 @@
This stage duplicates trips and attaches them to the synthetic population.
"""
+
def configure(context):
context.stage("synthesis.population.matched")
context.config("random_seed")
hts = context.config("hts")
- context.stage("data.hts.selected", alias = "hts")
+ context.stage("data.hts.selected", alias="hts")
+
def execute(context):
# Load data
@@ -20,23 +22,39 @@ def execute(context):
# Duplicate with synthetic persons
df_matching = context.stage("synthesis.population.matched")
- df_trips = df_trips.rename(columns = { "person_id": "hts_id" })
- df_trips = pd.merge(df_matching, df_trips, on = "hts_id")
- df_trips = df_trips.sort_values(by = ["person_id", "trip_id"])
+ df_trips = df_trips.rename(columns={"person_id": "hts_id"})
+ df_trips = pd.merge(df_matching, df_trips, on="hts_id")
+ df_trips = df_trips.sort_values(by=["person_id", "trip_id"])
# Define trip index
- df_count = df_trips.groupby("person_id").size().reset_index(name = "count")
- df_trips["trip_index"] = np.hstack([np.arange(count) for count in df_count["count"].values])
- df_trips = df_trips.sort_values(by = ["person_id", "trip_index"])
+ df_count = df_trips.groupby("person_id").size().reset_index(name="count")
+ df_trips["trip_index"] = np.hstack(
+ [np.arange(count) for count in df_count["count"].values]
+ )
+ df_trips = df_trips.sort_values(by=["person_id", "trip_index"])
# Diversify departure times
random = np.random.RandomState(context.config("random_seed"))
- counts = df_trips[["person_id"]].groupby("person_id").size().reset_index(name = "count")["count"].values
+ counts = (
+ df_trips[["person_id"]]
+ .groupby("person_id")
+ .size()
+ .reset_index(name="count")["count"]
+ .values
+ )
- interval = df_trips[["person_id", "departure_time"]].groupby("person_id").min().reset_index()["departure_time"].values
- interval = np.minimum(1800.0, interval) # If first departure time is just 5min after midnight, we only add a deviation of 5min
+ interval = (
+ df_trips[["person_id", "departure_time"]]
+ .groupby("person_id")
+ .min()
+ .reset_index()["departure_time"]
+ .values
+ )
+    # If the first departure is e.g. only 5 min after midnight, we shift departures
+    # by at most ±5 min so that they stay non-negative.
+    interval = np.minimum(1800.0, interval)
- offset = random.random_sample(size = (len(counts), )) * interval * 2.0 - interval
+ offset = random.random_sample(size=(len(counts),)) * interval * 2.0 - interval
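+    # One random offset per person, repeated below across all of that person's trips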
offset = np.repeat(offset, counts)
df_trips["departure_time"] += offset
@@ -47,11 +65,18 @@ def execute(context):
assert (df_trips["departure_time"] >= 0.0).all()
assert (df_trips["arrival_time"] >= 0.0).all()
- return df_trips[[
- "person_id", "trip_index",
- "departure_time", "arrival_time",
- "preceding_purpose", "following_purpose",
- "is_first_trip", "is_last_trip",
- "trip_duration", "activity_duration",
- "mode"
- ]]
+ return df_trips[
+ [
+ "person_id",
+ "trip_index",
+ "departure_time",
+ "arrival_time",
+ "preceding_purpose",
+ "following_purpose",
+ "is_first_trip",
+ "is_last_trip",
+ "trip_duration",
+ "activity_duration",
+ "mode",
+ ]
+ ]
diff --git a/synthesis/vehicles/cars/default.py b/synthesis/vehicles/cars/default.py
index 1bf32836..0ebf92e4 100644
--- a/synthesis/vehicles/cars/default.py
+++ b/synthesis/vehicles/cars/default.py
@@ -5,20 +5,34 @@
Creates a vehicle fleet based on a default vehicle type
"""
+
def configure(context):
context.stage("synthesis.population.enriched")
+
def execute(context):
df_persons = context.stage("synthesis.population.enriched")
- df_vehicle_types = pd.DataFrame.from_records([{
- "type_id": "default_car", "nb_seats": 4, "length": 5.0, "width": 1.0, "pce": 1.0, "mode": "car",
- "hbefa_cat": "PASSENGER_CAR", "hbefa_tech": "average", "hbefa_size": "average", "hbefa_emission": "average",
- }])
+ df_vehicle_types = pd.DataFrame.from_records(
+ [
+ {
+ "type_id": "default_car",
+ "nb_seats": 4,
+ "length": 5.0,
+ "width": 1.0,
+ "pce": 1.0,
+ "mode": "car",
+ "hbefa_cat": "PASSENGER_CAR",
+ "hbefa_tech": "average",
+ "hbefa_size": "average",
+ "hbefa_emission": "average",
+ }
+ ]
+ )
df_vehicles = df_persons[["person_id"]].copy()
- df_vehicles = df_vehicles.rename(columns = { "person_id": "owner_id" })
-
+ df_vehicles = df_vehicles.rename(columns={"person_id": "owner_id"})
+
df_vehicles["mode"] = "car"
df_vehicles["vehicle_id"] = df_vehicles["owner_id"].astype(str) + ":car"
@@ -28,4 +42,4 @@ def execute(context):
df_vehicles["age"] = 0
df_vehicles["euro"] = 6
- return df_vehicle_types, df_vehicles
\ No newline at end of file
+ return df_vehicle_types, df_vehicles
diff --git a/synthesis/vehicles/cars/fleet_sampling.py b/synthesis/vehicles/cars/fleet_sampling.py
index dcd20a5a..269a2404 100644
--- a/synthesis/vehicles/cars/fleet_sampling.py
+++ b/synthesis/vehicles/cars/fleet_sampling.py
@@ -7,6 +7,7 @@
Creates the synthetic vehicle fleet
"""
+
def configure(context):
context.stage("synthesis.population.enriched")
context.stage("synthesis.population.spatial.home.zones")
@@ -15,21 +16,30 @@ def configure(context):
context.config("vehicles_year", 2021)
+
def _sample_vehicle(context, args):
vehicle = args
year = context.config("vehicles_year")
- df_vehicle_fleet_counts, df_vehicle_age_counts = context.data("fleet"), context.data("age")
+    df_vehicle_fleet_counts = context.data("fleet")
+    df_vehicle_age_counts = context.data("age")
commune_id = vehicle["commune_id"]
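+    # Draw the Crit'air class and technology from the commune-level fleet counts and
+    # the age from the matching rows of the age table; if the commune is not covered
+    # by the fleet data, draw directly from the age table instead.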
- if commune_id in df_vehicle_fleet_counts["commune_id"].unique():
- fleet = df_vehicle_fleet_counts.loc[df_vehicle_fleet_counts["commune_id"] == commune_id]
+ if commune_id in df_vehicle_fleet_counts["commune_id"].unique():
+ fleet = df_vehicle_fleet_counts.loc[
+ df_vehicle_fleet_counts["commune_id"] == commune_id
+ ]
choice = fleet.sample(weights="fleet")
critair = choice["critair"].values[0]
technology = choice["technology"].values[0]
- age_mask = (df_vehicle_age_counts["critair"] == critair) & (df_vehicle_age_counts["technology"] == technology)
- age = df_vehicle_age_counts.loc[age_mask].sample(weights="fleet")["age"].values[0]
+ age_mask = (df_vehicle_age_counts["critair"] == critair) & (
+ df_vehicle_age_counts["technology"] == technology
+ )
+ age = (
+ df_vehicle_age_counts.loc[age_mask].sample(weights="fleet")["age"].values[0]
+ )
else:
choice = df_vehicle_age_counts.sample(weights="fleet")
critair = choice["critair"].values[0]
@@ -53,16 +63,24 @@ def _sample_vehicle(context, args):
context.progress.update()
return vehicle
+
def _get_euro_from_critair(vehicle, year):
- critair = vehicle["critair"] # Crit'air 1, Crit'air 2, ..., Crit'air 5, Crit'air E, Non classée
- technology = vehicle["technology"] # Gazole, Essence, Electrique et hydrogène, Essence hybride rechargeable, Gaz, Gazole hybride rechargeable
- age = vehicle["age"] # 0 ans, 1 ans, ..., 19 ans, >20 ans
+    # Crit'air 1, Crit'air 2, ..., Crit'air 5, Crit'air E, Non classée
+    critair = vehicle["critair"]
+    # Gazole, Essence, Electrique et hydrogène, Essence hybride rechargeable, Gaz, Gazole hybride rechargeable
+    technology = vehicle["technology"]
+ age = vehicle["age"] # 0 ans, 1 ans, ..., 19 ans, >20 ans
# we are using the following table : https://www.ecologie.gouv.fr/sites/default/files/Tableau_classification_des_vehicules.pdf
- age_num = re.findall(r'\d+', age)
+ age_num = re.findall(r"\d+", age)
if len(age_num) == 0:
- raise RuntimeError("Badly formatted 'age' variable found for vehicle (id: %s) : %s" % (age, vehicle["vehicle_id"]))
+ raise RuntimeError(
+ "Badly formatted 'age' variable found for vehicle (id: %s) : %s"
+ % (age, vehicle["vehicle_id"])
+ )
birthday = int(year) - int(age_num[0])
@@ -92,7 +110,7 @@ def _get_euro_from_critair(vehicle, year):
if critair == "Crit'air 2" and technology == "Gazole":
euro = max(euro, 5) # or 6 in table
if critair == "Crit'air 3" and technology == "Essence":
- euro = max(euro, 2) # or 3 in table
+ euro = max(euro, 2) # or 3 in table
if critair == "Crit'air 3" and technology == "Gazole":
euro = max(euro, 4)
if critair == "Crit'air 4" and technology == "Gazole":
@@ -103,14 +121,15 @@ def _get_euro_from_critair(vehicle, year):
euro = max(euro, 1)
euro = str(euro)
- if euro == '6':
+ if euro == "6":
if 2016 <= birthday < 2019:
- euro = '6ab'
+ euro = "6ab"
else:
- euro = '6c'
+ euro = "6c"
return euro
+
def execute(context):
df_vehicle_types = context.stage("data.vehicles.types")
@@ -118,11 +137,15 @@ def execute(context):
df_persons = context.stage("synthesis.population.enriched")
df_homes = context.stage("synthesis.population.spatial.home.zones")
- df_vehicles = pd.merge(df_persons[["household_id", "person_id"]], df_homes[["household_id", "commune_id"]], on = "household_id")
+ df_vehicles = pd.merge(
+ df_persons[["household_id", "person_id"]],
+ df_homes[["household_id", "commune_id"]],
+ on="household_id",
+ )
- df_vehicles = df_vehicles.rename(columns = { "person_id": "owner_id" })
+ df_vehicles = df_vehicles.rename(columns={"person_id": "owner_id"})
df_vehicles["vehicle_id"] = df_vehicles["owner_id"].astype(str) + ":car"
- df_vehicles = df_vehicles.drop_duplicates("vehicle_id") # is this needed?
+ df_vehicles = df_vehicles.drop_duplicates("vehicle_id") # is this needed?
df_vehicles["type_id"] = "default_car"
df_vehicles["mode"] = "car"
@@ -130,11 +153,17 @@ def execute(context):
res = []
- with context.progress(label = "Processing vehicles data ...", total = len(df_vehicles)) as progress:
- with context.parallel(dict(fleet = df_vehicle_fleet_counts, age = df_vehicle_age_counts)) as parallel:
- for df_partial in parallel.imap(_sample_vehicle, df_vehicles.to_dict(orient="records")):
+ with context.progress(
+ label="Processing vehicles data ...", total=len(df_vehicles)
+ ) as progress:
+ with context.parallel(
+ dict(fleet=df_vehicle_fleet_counts, age=df_vehicle_age_counts)
+ ) as parallel:
+ for df_partial in parallel.imap(
+ _sample_vehicle, df_vehicles.to_dict(orient="records")
+ ):
res.append(df_partial)
df_vehicles = pd.DataFrame.from_dict(res)
- return df_vehicle_types, df_vehicles
\ No newline at end of file
+ return df_vehicle_types, df_vehicles
diff --git a/synthesis/vehicles/passengers/default.py b/synthesis/vehicles/passengers/default.py
index 6916f5bb..4ed9249f 100644
--- a/synthesis/vehicles/passengers/default.py
+++ b/synthesis/vehicles/passengers/default.py
@@ -5,20 +5,34 @@
Creates a vehicle fleet based on a default vehicle type for the dummy passenger mode
"""
+
def configure(context):
context.stage("synthesis.population.enriched")
+
def execute(context):
df_persons = context.stage("synthesis.population.enriched")
- df_vehicle_types = pd.DataFrame.from_records([{
- "type_id": "default_car_passenger", "nb_seats": 4, "length": 5.0, "width": 1.0, "pce": 1.0, "mode": "car_passenger",
- "hbefa_cat": "PASSENGER_CAR", "hbefa_tech": "average", "hbefa_size": "average", "hbefa_emission": "average",
- }])
+ df_vehicle_types = pd.DataFrame.from_records(
+ [
+ {
+ "type_id": "default_car_passenger",
+ "nb_seats": 4,
+ "length": 5.0,
+ "width": 1.0,
+ "pce": 1.0,
+ "mode": "car_passenger",
+ "hbefa_cat": "PASSENGER_CAR",
+ "hbefa_tech": "average",
+ "hbefa_size": "average",
+ "hbefa_emission": "average",
+ }
+ ]
+ )
df_vehicles = df_persons[["person_id"]].copy()
- df_vehicles = df_vehicles.rename(columns = { "person_id": "owner_id" })
-
+ df_vehicles = df_vehicles.rename(columns={"person_id": "owner_id"})
+
df_vehicles["mode"] = "car_passenger"
df_vehicles["vehicle_id"] = df_vehicles["owner_id"].astype(str) + ":car_passenger"
@@ -28,4 +42,4 @@ def execute(context):
df_vehicles["age"] = 0
df_vehicles["euro"] = 6
- return df_vehicle_types, df_vehicles
\ No newline at end of file
+ return df_vehicle_types, df_vehicles
diff --git a/synthesis/vehicles/vehicles.py b/synthesis/vehicles/vehicles.py
index 922cd36c..14cf552b 100644
--- a/synthesis/vehicles/vehicles.py
+++ b/synthesis/vehicles/vehicles.py
@@ -1,20 +1,24 @@
import pandas as pd
+
def configure(context):
method = context.config("vehicles_method", "default")
if method == "default":
- context.stage("synthesis.vehicles.cars.default", alias = "cars")
+ context.stage("synthesis.vehicles.cars.default", alias="cars")
elif method == "fleet_sample":
- context.stage("synthesis.vehicles.cars.fleet_sampling", alias = "cars")
+ context.stage("synthesis.vehicles.cars.fleet_sampling", alias="cars")
else:
raise RuntimeError("Unknown vehicles generation method : %s" % method)
-
+
context.stage("synthesis.vehicles.passengers.default")
+
def execute(context):
df_car_types, df_cars = context.stage("cars")
- df_passenger_types, df_passengers = context.stage("synthesis.vehicles.passengers.default")
+ df_passenger_types, df_passengers = context.stage(
+ "synthesis.vehicles.passengers.default"
+ )
df_vehicles = pd.concat([df_cars, df_passengers])
df_types = pd.concat([df_car_types, df_passenger_types])
diff --git a/tests/test_determinism.py b/tests/test_determinism.py
index e2755d7a..233c7934 100644
--- a/tests/test_determinism.py
+++ b/tests/test_determinism.py
@@ -4,6 +4,7 @@
from . import testdata
import sqlite3
+
def hash_sqlite_db(path):
"""
Hash SQLite database file from its dump.
@@ -37,6 +38,7 @@ def hash_file(file):
f.close()
return hash.hexdigest()
+
def test_determinism(tmpdir):
data_path = str(tmpdir.mkdir("data"))
testdata.create(data_path)
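+    # The pipeline is run twice on the same fixtures; both runs must reproduce the
+    # reference hashes checked in _test_determinism.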
@@ -44,51 +46,62 @@ def test_determinism(tmpdir):
for index in range(2):
_test_determinism(index, data_path, tmpdir)
+
def _test_determinism(index, data_path, tmpdir):
print("Running index %d" % index)
cache_path = str(tmpdir.mkdir("cache_%d" % index))
output_path = str(tmpdir.mkdir("output_%d" % index))
config = dict(
- data_path = data_path, output_path = output_path,
- regions = [10, 11], sampling_rate = 1.0, hts = "entd",
- random_seed = 1000, processes = 1,
- secloc_maximum_iterations = 10,
- maven_skip_tests = True,
- matching_attributes = [
- "sex", "any_cars", "age_class", "socioprofessional_class",
- "income_class", "departement_id"
- ]
+ data_path=data_path,
+ output_path=output_path,
+ regions=[10, 11],
+ sampling_rate=1.0,
+ hts="entd",
+ random_seed=1000,
+ processes=1,
+ secloc_maximum_iterations=10,
+ maven_skip_tests=True,
+ matching_attributes=[
+ "sex",
+ "any_cars",
+ "age_class",
+ "socioprofessional_class",
+ "income_class",
+ "departement_id",
+ ],
)
stages = [
- dict(descriptor = "synthesis.output"),
+ dict(descriptor="synthesis.output"),
]
- synpp.run(stages, config, working_directory = cache_path)
+ synpp.run(stages, config, working_directory=cache_path)
REFERENCE_CSV_HASHES = {
- "ile_de_france_activities.csv": "53c44fb4026d2037729ee8ff1c8fb93f",
- "ile_de_france_households.csv": "ca2a29ef13467326f937638f1ff8be1a",
- "ile_de_france_persons.csv": "ddbe9b418c915b14e888b54efbdf9b1e",
- "ile_de_france_trips.csv": "6c5f3427e41e683da768eeb53796a806",
- "ile_de_france_vehicle_types.csv": "00bee1ea6d7bc9af43ae6c7101dd75da",
- "ile_de_france_vehicles.csv": "3567b0f29e51d521b13d91c82c77cecb",
+ "ile_de_france_activities.csv": "53c44fb4026d2037729ee8ff1c8fb93f",
+ "ile_de_france_households.csv": "ca2a29ef13467326f937638f1ff8be1a",
+ "ile_de_france_persons.csv": "ddbe9b418c915b14e888b54efbdf9b1e",
+ "ile_de_france_trips.csv": "6c5f3427e41e683da768eeb53796a806",
+ "ile_de_france_vehicle_types.csv": "00bee1ea6d7bc9af43ae6c7101dd75da",
+ "ile_de_france_vehicles.csv": "3567b0f29e51d521b13d91c82c77cecb",
}
REFERENCE_GPKG_HASHES = {
- "ile_de_france_activities.gpkg": "884eec1fd0c29904284eb4362ff89be1",
- "ile_de_france_commutes.gpkg": "5a4180390a69349cc655c07c5671e8d3",
- "ile_de_france_homes.gpkg": "a85e973f0e2f51031cd60170d351845e",
- "ile_de_france_trips.gpkg": "d0aec4033cfc184bf1b91ae13a537ef8",
+ "ile_de_france_activities.gpkg": "884eec1fd0c29904284eb4362ff89be1",
+ "ile_de_france_commutes.gpkg": "5a4180390a69349cc655c07c5671e8d3",
+ "ile_de_france_homes.gpkg": "a85e973f0e2f51031cd60170d351845e",
+ "ile_de_france_trips.gpkg": "d0aec4033cfc184bf1b91ae13a537ef8",
}
generated_csv_hashes = {
- file: hash_file("%s/%s" % (output_path, file)) for file in REFERENCE_CSV_HASHES.keys()
+ file: hash_file("%s/%s" % (output_path, file))
+ for file in REFERENCE_CSV_HASHES.keys()
}
generated_gpkg_hashes = {
- file: hash_sqlite_db("%s/%s" % (output_path, file)) for file in REFERENCE_GPKG_HASHES.keys()
+ file: hash_sqlite_db("%s/%s" % (output_path, file))
+ for file in REFERENCE_GPKG_HASHES.keys()
}
print("Generated CSV hashes: ", generated_csv_hashes)
@@ -100,6 +113,7 @@ def _test_determinism(index, data_path, tmpdir):
for file in REFERENCE_GPKG_HASHES.keys():
assert REFERENCE_GPKG_HASHES[file] == generated_gpkg_hashes[file]
+
def test_determinism_matsim(tmpdir):
data_path = str(tmpdir.mkdir("data"))
testdata.create(data_path)
@@ -107,36 +121,45 @@ def test_determinism_matsim(tmpdir):
for index in range(2):
_test_determinism_matsim(index, data_path, tmpdir)
+
def _test_determinism_matsim(index, data_path, tmpdir):
print("Running index %d" % index)
cache_path = str(tmpdir.mkdir("cache_%d" % index))
output_path = str(tmpdir.mkdir("output_%d" % index))
config = dict(
- data_path = data_path, output_path = output_path,
- regions = [10, 11], sampling_rate = 1.0, hts = "entd",
- random_seed = 1000, processes = 1,
- secloc_maximum_iterations = 10,
- maven_skip_tests = True,
- matching_attributes = [
- "sex", "any_cars", "age_class", "socioprofessional_class",
- "income_class", "departement_id"
- ]
+ data_path=data_path,
+ output_path=output_path,
+ regions=[10, 11],
+ sampling_rate=1.0,
+ hts="entd",
+ random_seed=1000,
+ processes=1,
+ secloc_maximum_iterations=10,
+ maven_skip_tests=True,
+ matching_attributes=[
+ "sex",
+ "any_cars",
+ "age_class",
+ "socioprofessional_class",
+ "income_class",
+ "departement_id",
+ ],
)
stages = [
- dict(descriptor = "matsim.output"),
+ dict(descriptor="matsim.output"),
]
- synpp.run(stages, config, working_directory = cache_path)
+ synpp.run(stages, config, working_directory=cache_path)
REFERENCE_HASHES = {
- #"ile_de_france_population.xml.gz": "e1407f918cb92166ebf46ad769d8d085",
- #"ile_de_france_network.xml.gz": "5f10ec295b49d2bb768451c812955794",
- "ile_de_france_households.xml.gz": "64a0c9fab72aad51bc6adb926a1c9d44",
- #"ile_de_france_facilities.xml.gz": "5ad41afff9ae5c470082510b943e6778",
- "ile_de_france_config.xml": "30871dfbbd2b5bf6922be1dfe20ffe73",
- "ile_de_france_vehicles.xml.gz": "d7c8d0dba531a21dc83355b2f82778c2"
+ # "ile_de_france_population.xml.gz": "e1407f918cb92166ebf46ad769d8d085",
+ # "ile_de_france_network.xml.gz": "5f10ec295b49d2bb768451c812955794",
+ "ile_de_france_households.xml.gz": "64a0c9fab72aad51bc6adb926a1c9d44",
+ # "ile_de_france_facilities.xml.gz": "5ad41afff9ae5c470082510b943e6778",
+ "ile_de_france_config.xml": "30871dfbbd2b5bf6922be1dfe20ffe73",
+ "ile_de_france_vehicles.xml.gz": "d7c8d0dba531a21dc83355b2f82778c2",
}
# activities.gpkg, trips.gpkg, meta.json,
@@ -147,7 +170,8 @@ def _test_determinism_matsim(index, data_path, tmpdir):
# detailed inspection of meta.json would make sense!
generated_hashes = {
- file: hash_file("%s/%s" % (output_path, file)) for file in REFERENCE_HASHES.keys()
+ file: hash_file("%s/%s" % (output_path, file))
+ for file in REFERENCE_HASHES.keys()
}
print("Generated hashes: ", generated_hashes)
diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py
index d9856f52..384f8242 100644
--- a/tests/test_pipeline.py
+++ b/tests/test_pipeline.py
@@ -4,6 +4,7 @@
from . import testdata
import pandas as pd
+
def test_data(tmpdir):
data_path = str(tmpdir.mkdir("data"))
testdata.create(data_path)
@@ -11,50 +12,55 @@ def test_data(tmpdir):
cache_path = str(tmpdir.mkdir("cache"))
output_path = str(tmpdir.mkdir("output"))
config = dict(
- data_path = data_path, output_path = output_path,
- regions = [10, 11], hts = "entd")
+ data_path=data_path, output_path=output_path, regions=[10, 11], hts="entd"
+ )
stages = [
- dict(descriptor = "data.spatial.iris"),
- dict(descriptor = "data.spatial.codes"),
- dict(descriptor = "data.spatial.population"),
- dict(descriptor = "data.bpe.cleaned"),
- dict(descriptor = "data.income.municipality"),
- dict(descriptor = "data.hts.entd.cleaned"),
- dict(descriptor = "data.hts.egt.cleaned"),
- dict(descriptor = "data.census.cleaned"),
- dict(descriptor = "data.od.cleaned"),
- dict(descriptor = "data.hts.output"),
- dict(descriptor = "data.sirene.output"),
+ dict(descriptor="data.spatial.iris"),
+ dict(descriptor="data.spatial.codes"),
+ dict(descriptor="data.spatial.population"),
+ dict(descriptor="data.bpe.cleaned"),
+ dict(descriptor="data.income.municipality"),
+ dict(descriptor="data.hts.entd.cleaned"),
+ dict(descriptor="data.hts.egt.cleaned"),
+ dict(descriptor="data.census.cleaned"),
+ dict(descriptor="data.od.cleaned"),
+ dict(descriptor="data.hts.output"),
+ dict(descriptor="data.sirene.output"),
]
- synpp.run(stages, config, working_directory = cache_path)
+ synpp.run(stages, config, working_directory=cache_path)
assert os.path.isfile("%s/ile_de_france_hts_households.csv" % output_path)
assert os.path.isfile("%s/ile_de_france_hts_persons.csv" % output_path)
assert os.path.isfile("%s/ile_de_france_hts_trips.csv" % output_path)
assert os.path.isfile("%s/ile_de_france_sirene.gpkg" % output_path)
-def run_population(tmpdir, hts, update = {}):
+
+def run_population(tmpdir, hts, update={}):
data_path = str(tmpdir.mkdir("data"))
testdata.create(data_path)
cache_path = str(tmpdir.mkdir("cache"))
output_path = str(tmpdir.mkdir("output"))
config = dict(
- data_path = data_path, output_path = output_path,
- regions = [10, 11], sampling_rate = 1.0, hts = hts,
- random_seed = 1000, processes = 1,
- secloc_maximum_iterations = 10,
- maven_skip_tests = True
+ data_path=data_path,
+ output_path=output_path,
+ regions=[10, 11],
+ sampling_rate=1.0,
+ hts=hts,
+ random_seed=1000,
+ processes=1,
+ secloc_maximum_iterations=10,
+ maven_skip_tests=True,
)
config.update(update)
stages = [
- dict(descriptor = "synthesis.output"),
+ dict(descriptor="synthesis.output"),
]
- synpp.run(stages, config, working_directory = cache_path)
+ synpp.run(stages, config, working_directory=cache_path)
assert os.path.isfile("%s/ile_de_france_activities.csv" % output_path)
assert os.path.isfile("%s/ile_de_france_persons.csv" % output_path)
@@ -63,50 +69,94 @@ def run_population(tmpdir, hts, update = {}):
assert os.path.isfile("%s/ile_de_france_trips.gpkg" % output_path)
assert os.path.isfile("%s/ile_de_france_meta.json" % output_path)
- assert 2235 == len(pd.read_csv("%s/ile_de_france_activities.csv" % output_path, usecols = ["household_id"], sep = ";"))
- assert 447 == len(pd.read_csv("%s/ile_de_france_persons.csv" % output_path, usecols = ["household_id"], sep = ";"))
- assert 149 == len(pd.read_csv("%s/ile_de_france_households.csv" % output_path, usecols = ["household_id"], sep = ";"))
-
- assert 447 * 2 == len(pd.read_csv("%s/ile_de_france_vehicles.csv" % output_path, usecols = ["vehicle_id"], sep = ";"))
+ assert 2235 == len(
+ pd.read_csv(
+ "%s/ile_de_france_activities.csv" % output_path,
+ usecols=["household_id"],
+ sep=";",
+ )
+ )
+ assert 447 == len(
+ pd.read_csv(
+ "%s/ile_de_france_persons.csv" % output_path,
+ usecols=["household_id"],
+ sep=";",
+ )
+ )
+ assert 149 == len(
+ pd.read_csv(
+ "%s/ile_de_france_households.csv" % output_path,
+ usecols=["household_id"],
+ sep=";",
+ )
+ )
+
+ assert 447 * 2 == len(
+ pd.read_csv(
+ "%s/ile_de_france_vehicles.csv" % output_path,
+ usecols=["vehicle_id"],
+ sep=";",
+ )
+ )
if "vehicles_method" in update and update["vehicles_method"] == "fleet_sample":
- assert 17 + 1 == len(pd.read_csv("%s/ile_de_france_vehicle_types.csv" % output_path, usecols = ["type_id"], sep = ";"))
+ assert 17 + 1 == len(
+ pd.read_csv(
+ "%s/ile_de_france_vehicle_types.csv" % output_path,
+ usecols=["type_id"],
+ sep=";",
+ )
+ )
else:
- assert 2 == len(pd.read_csv("%s/ile_de_france_vehicle_types.csv" % output_path, usecols = ["type_id"], sep = ";"))
+ assert 2 == len(
+ pd.read_csv(
+ "%s/ile_de_france_vehicle_types.csv" % output_path,
+ usecols=["type_id"],
+ sep=";",
+ )
+ )
+
def test_population_with_entd(tmpdir):
run_population(tmpdir, "entd")
+
def test_population_with_egt(tmpdir):
run_population(tmpdir, "egt")
+
def test_population_with_mode_choice(tmpdir):
- run_population(tmpdir, "entd", { "mode_choice": True })
+ run_population(tmpdir, "entd", {"mode_choice": True})
+
def test_population_with_fleet_sample(tmpdir):
- run_population(tmpdir, "entd", {
- "vehicles_method": "fleet_sample",
- "vehicles_year": 2021
- })
+ run_population(
+ tmpdir, "entd", {"vehicles_method": "fleet_sample", "vehicles_year": 2021}
+ )
+
def test_population_with_bhepop2_income(tmpdir):
- run_population(tmpdir, "egt", {
- "income_assignation_method": "bhepop2"
- })
+ run_population(tmpdir, "egt", {"income_assignation_method": "bhepop2"})
+
def test_population_with_urban_type(tmpdir):
- run_population(tmpdir, "entd", {
- "use_urban_type": True,
- "matching_attributes": [
- "urban_type", "*default*"
- ],
- "matching_minimum_observations": 5
- })
+ run_population(
+ tmpdir,
+ "entd",
+ {
+ "use_urban_type": True,
+ "matching_attributes": ["urban_type", "*default*"],
+ "matching_minimum_observations": 5,
+ },
+ )
+
def test_population_with_urban_type_and_egt(tmpdir):
- run_population(tmpdir, "egt", {
- "use_urban_type": True,
- "matching_attributes": [
- "urban_type", "*default*"
- ],
- "matching_minimum_observations": 5
- })
+ run_population(
+ tmpdir,
+ "egt",
+ {
+ "use_urban_type": True,
+ "matching_attributes": ["urban_type", "*default*"],
+ "matching_minimum_observations": 5,
+ },
+ )
diff --git a/tests/test_simulation.py b/tests/test_simulation.py
index e31d6be9..baf1a2bc 100644
--- a/tests/test_simulation.py
+++ b/tests/test_simulation.py
@@ -3,6 +3,7 @@
import hashlib
from . import testdata
+
def test_simulation(tmpdir):
data_path = str(tmpdir.mkdir("data"))
testdata.create(data_path)
@@ -11,18 +12,20 @@ def test_simulation(tmpdir):
output_path = str(tmpdir.mkdir("output"))
config = dict(
- data_path = data_path, output_path = output_path,
- regions = [10, 11], sampling_rate = 1.0, hts = "entd",
- random_seed = 1000, processes = 1,
- secloc_maximum_iterations = 10,
- maven_skip_tests = True
+ data_path=data_path,
+ output_path=output_path,
+ regions=[10, 11],
+ sampling_rate=1.0,
+ hts="entd",
+ random_seed=1000,
+ processes=1,
+ secloc_maximum_iterations=10,
+ maven_skip_tests=True,
)
- stages = [
- dict(descriptor = "matsim.output")
- ]
+ stages = [dict(descriptor="matsim.output")]
- synpp.run(stages, config, working_directory = cache_path)
+ synpp.run(stages, config, working_directory=cache_path)
assert os.path.isfile("%s/ile_de_france_population.xml.gz" % output_path)
assert os.path.isfile("%s/ile_de_france_network.xml.gz" % output_path)
diff --git a/tests/testdata.py b/tests/testdata.py
index 6e75f71d..1ef1fae5 100644
--- a/tests/testdata.py
+++ b/tests/testdata.py
@@ -7,6 +7,7 @@
import glob
import subprocess
+
def create(output_path):
"""
This script creates test fixtures for the Île-de-France / France pipeline.
@@ -76,23 +77,27 @@ def create(output_path):
print("Creating zoning system ...")
df = []
- WITH_IRIS = set([
- "1B013", "1B014", "1B018", "1B019",
- "2D007", "2D008", "2D012", "2D013"
- ])
+ WITH_IRIS = set(
+ ["1B013", "1B014", "1B018", "1B019", "2D007", "2D008", "2D012", "2D013"]
+ )
for region_column in np.arange(2):
region_prefix = region_column + 1
- region_number = region_prefix * 10 # TODO: This means we will have 10 and 20, but the unit tests define 10 and 11 (so only 10 is used -> TODO)
+        # TODO: This yields regions 10 and 20, but the unit tests request regions 10
+        # and 11, so only region 10 is actually used.
+        region_number = region_prefix * 10
region_x = anchor_x + region_column * REGION_LENGTH
region_y = anchor_y + 0
for department_row in np.arange(2):
for department_column in np.arange(2):
- department_letter = { (0, 0): "A", (0, 1): "B", (1, 0): "C", (1, 1): "D" }[(
- department_row, department_column
- )]
+ department_letter = {
+ (0, 0): "A",
+ (0, 1): "B",
+ (1, 0): "C",
+ (1, 1): "D",
+ }[(department_row, department_column)]
department_name = "%d%s" % (region_prefix, department_letter)
@@ -100,13 +105,20 @@ def create(output_path):
department_y = region_y - department_row * DEPARTMENT_LENGTH
for municipality_index in np.arange(25):
- municipality_name = "%s%03d" % (department_name, municipality_index + 1)
+ municipality_name = "%s%03d" % (
+ department_name,
+ municipality_index + 1,
+ )
municipality_row = municipality_index // 5
municipality_column = municipality_index % 5
- municipality_x = department_x + municipality_column * MUNICIPALITY_LENGTH
- municipality_y = department_y - municipality_row * MUNICIPALITY_LENGTH
+ municipality_x = (
+ department_x + municipality_column * MUNICIPALITY_LENGTH
+ )
+ municipality_y = (
+ department_y - municipality_row * MUNICIPALITY_LENGTH
+ )
if municipality_name in WITH_IRIS:
for iris_index in np.arange(100):
@@ -118,48 +130,61 @@ def create(output_path):
iris_x = municipality_x + iris_column * IRIS_LENGTH
iris_y = municipality_y - iris_row * IRIS_LENGTH
- iris_polygon = geo.Polygon([
- (iris_x, iris_y), (iris_x + IRIS_LENGTH, iris_y),
- (iris_x + IRIS_LENGTH, iris_y - IRIS_LENGTH),
- (iris_x, iris_y - IRIS_LENGTH)
- ])
-
- df.append(dict(
- region = region_number,
- department = department_name,
- municipality = municipality_name,
- iris = iris_name,
- geometry = iris_polygon
- ))
+ iris_polygon = geo.Polygon(
+ [
+ (iris_x, iris_y),
+ (iris_x + IRIS_LENGTH, iris_y),
+ (iris_x + IRIS_LENGTH, iris_y - IRIS_LENGTH),
+ (iris_x, iris_y - IRIS_LENGTH),
+ ]
+ )
+
+ df.append(
+ dict(
+ region=region_number,
+ department=department_name,
+ municipality=municipality_name,
+ iris=iris_name,
+ geometry=iris_polygon,
+ )
+ )
else:
- municipality_polygon = geo.Polygon([
- (municipality_x, municipality_y), (municipality_x + MUNICIPALITY_LENGTH, municipality_y),
- (municipality_x + MUNICIPALITY_LENGTH, municipality_y - MUNICIPALITY_LENGTH),
- (municipality_x, municipality_y - MUNICIPALITY_LENGTH)
- ])
+ municipality_polygon = geo.Polygon(
+ [
+ (municipality_x, municipality_y),
+ (municipality_x + MUNICIPALITY_LENGTH, municipality_y),
+ (
+ municipality_x + MUNICIPALITY_LENGTH,
+ municipality_y - MUNICIPALITY_LENGTH,
+ ),
+ (municipality_x, municipality_y - MUNICIPALITY_LENGTH),
+ ]
+ )
iris_name = "%s0000" % municipality_name
- df.append(dict(
- region = region_number,
- department = department_name,
- municipality = municipality_name,
- iris = iris_name,
- geometry = municipality_polygon
- ))
+ df.append(
+ dict(
+ region=region_number,
+ department=department_name,
+ municipality=municipality_name,
+ iris=iris_name,
+ geometry=municipality_polygon,
+ )
+ )
df = pd.DataFrame.from_records(df)
- df = gpd.GeoDataFrame(df, crs = "EPSG:2154")
-
+ df = gpd.GeoDataFrame(df, crs="EPSG:2154")
+
# Dataset: IRIS zones
# Required attributes: CODE_IRIS, INSEE_COM, geometry
print("Creating IRIS zones ...")
df_iris = df.copy()
- df_iris = df_iris[["iris", "municipality", "geometry"]].rename(columns = dict(
- iris = "CODE_IRIS", municipality = "INSEE_COM"
- ))
+ df_iris = df_iris[["iris", "municipality", "geometry"]].rename(
+ columns=dict(iris="CODE_IRIS", municipality="INSEE_COM")
+ )
os.mkdir("%s/iris_2021" % output_path)
df_iris.to_file("%s/iris_2021/CONTOURS-IRIS.shp" % output_path)
@@ -174,17 +199,20 @@ def create(output_path):
print("Creating codes ...")
df_codes = df.copy()
- df_codes = df_codes[["iris", "municipality", "department", "region"]].rename(columns = dict(
- iris = "CODE_IRIS", municipality = "DEPCOM", department = "DEP", region = "REG"
- ))
+ df_codes = df_codes[["iris", "municipality", "department", "region"]].rename(
+ columns=dict(
+ iris="CODE_IRIS", municipality="DEPCOM", department="DEP", region="REG"
+ )
+ )
os.mkdir("%s/codes_2021" % output_path)
- with zipfile.ZipFile("%s/codes_2021/reference_IRIS_geo2021.zip" % output_path, "w") as archive:
+ with zipfile.ZipFile(
+ "%s/codes_2021/reference_IRIS_geo2021.zip" % output_path, "w"
+ ) as archive:
with archive.open("reference_IRIS_geo2021.xlsx", "w") as f:
df_codes.to_excel(
- f, sheet_name = "Emboitements_IRIS",
- startrow = 5, index = False
+ f, sheet_name="Emboitements_IRIS", startrow=5, index=False
)
# Dataset: Aggregate census
@@ -192,20 +220,22 @@ def create(output_path):
print("Creating aggregate census ...")
df_population = df.copy()
- df_population = df_population[["iris", "municipality", "department", "region"]].rename(columns = dict(
- iris = "IRIS", municipality = "COM", department = "DEP", region = "REG"
- ))
+ df_population = df_population[
+ ["iris", "municipality", "department", "region"]
+ ].rename(
+ columns=dict(iris="IRIS", municipality="COM", department="DEP", region="REG")
+ )
# Set all population to fixed number
df_population["P19_POP"] = 120.0
os.mkdir("%s/rp_2019" % output_path)
- with zipfile.ZipFile("%s/rp_2019/base-ic-evol-struct-pop-2019.zip" % output_path, "w") as archive:
+ with zipfile.ZipFile(
+ "%s/rp_2019/base-ic-evol-struct-pop-2019.zip" % output_path, "w"
+ ) as archive:
with archive.open("base-ic-evol-struct-pop-2019.xlsx", "w") as f:
- df_population.to_excel(
- f, sheet_name = "IRIS", startrow = 5, index = False
- )
+ df_population.to_excel(f, sheet_name="IRIS", startrow=5, index=False)
# Dataset: BPE
# Required attributes: DCIRIS, LAMBERT_X, LAMBERT_Y, TYPEQU, DEPCOM, DEP
@@ -215,13 +245,15 @@ def create(output_path):
observations = BPE_OBSERVATIONS
categories = np.array(["A", "B", "C", "D", "E", "F", "G"])
- df_selection = df.iloc[random.randint(0, len(df), size = observations)].copy()
+ df_selection = df.iloc[random.randint(0, len(df), size=observations)].copy()
df_selection["DCIRIS"] = df_selection["iris"]
df_selection["DEPCOM"] = df_selection["municipality"]
df_selection["DEP"] = df_selection["department"]
df_selection["LAMBERT_X"] = df_selection["geometry"].centroid.x
df_selection["LAMBERT_Y"] = df_selection["geometry"].centroid.y
- df_selection["TYPEQU"] = categories[random.randint(0, len(categories), size = len(df_selection))]
+ df_selection["TYPEQU"] = categories[
+ random.randint(0, len(categories), size=len(df_selection))
+ ]
# Deliberately set coordinates for some to NaN
df_selection.iloc[-10:, df_selection.columns.get_loc("LAMBERT_X")] = np.nan
@@ -231,10 +263,11 @@ def create(output_path):
os.mkdir("%s/bpe_2021" % output_path)
- with zipfile.ZipFile("%s/bpe_2021/bpe21_ensemble_xy_csv.zip" % output_path, "w") as archive:
+ with zipfile.ZipFile(
+ "%s/bpe_2021/bpe21_ensemble_xy_csv.zip" % output_path, "w"
+ ) as archive:
with archive.open("bpe21_ensemble_xy.csv", "w") as f:
- df_selection[columns].to_csv(f,
- sep = ";", index = False)
+ df_selection[columns].to_csv(f, sep=";", index=False)
# Dataset: Tax data
# Required attributes: CODGEO, D115, ..., D915
@@ -244,23 +277,80 @@ def create(output_path):
filosofi_year = "19"
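+    # Each "data" list below holds the nine income deciles (D1 ... D9; D5 is the median) for one category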
income_data = {
"househod_size": [
- {"name": "1_pers", "sheet": "TAILLEM_1", "col_pattern": "TME1", "data": [9820,13380,15730,18140,20060,22050,24710,28120,34150]},
- {"name": "2_pers", "sheet": "TAILLEM_2", "col_pattern": "TME2", "data": [12950,16840,19920,22660,25390,28500,32080,37030,45910]},
- {"name": "3_pers", "sheet": "TAILLEM_3", "col_pattern": "TME3", "data": [11440,14850,18070,21040,23960,27190,30930,36130,45680]},
- {"name": "4_pers", "sheet": "TAILLEM_4", "col_pattern": "TME4", "data": [11920,15720,19130,22440,25540,28750,32400,37520,46870]},
- {"name": "5_pers_or_more", "sheet": "TAILLEM_5", "col_pattern": "TME5", "data": [9320,11510,13580,16180,19920,24570,29180,35460,46370]},
+ {
+ "name": "1_pers",
+ "sheet": "TAILLEM_1",
+ "col_pattern": "TME1",
+ "data": [9820, 13380, 15730, 18140, 20060, 22050, 24710, 28120, 34150],
+ },
+ {
+ "name": "2_pers",
+ "sheet": "TAILLEM_2",
+ "col_pattern": "TME2",
+ "data": [12950, 16840, 19920, 22660, 25390, 28500, 32080, 37030, 45910],
+ },
+ {
+ "name": "3_pers",
+ "sheet": "TAILLEM_3",
+ "col_pattern": "TME3",
+ "data": [11440, 14850, 18070, 21040, 23960, 27190, 30930, 36130, 45680],
+ },
+ {
+ "name": "4_pers",
+ "sheet": "TAILLEM_4",
+ "col_pattern": "TME4",
+ "data": [11920, 15720, 19130, 22440, 25540, 28750, 32400, 37520, 46870],
+ },
+ {
+ "name": "5_pers_or_more",
+ "sheet": "TAILLEM_5",
+ "col_pattern": "TME5",
+ "data": [9320, 11510, 13580, 16180, 19920, 24570, 29180, 35460, 46370],
+ },
],
"family_comp": [
- {"name": "Single_man", "sheet": "TYPMENR_1", "col_pattern": "TYM1", "data": [9180,12830,15100,17740,19800,21890,24780,28290,34850]},
- {"name": "Single_wom", "sheet": "TYPMENR_2", "col_pattern": "TYM2", "data": [10730,13730,16220,18420,20260,22160,24680,27990,33570]},
- {"name": "Couple_without_child", "sheet": "TYPMENR_3", "col_pattern": "TYM3", "data": [15360,19560,22600,25260,27990,30980,34710,39640,49110]},
- {"name": "Couple_with_child", "sheet": "TYPMENR_4", "col_pattern": "TYM4", "data": [11790,15540,19240,22670,25850,29180,33090,38570,48700]},
- {"name": "Single_parent", "sheet": "TYPMENR_5", "col_pattern": "TYM5", "data": [9350,11150,12830,14660,16640,18760,21230,24700,31170]},
- {"name": "complex_hh", "sheet": "TYPMENR_6", "col_pattern": "TYM6", "data": [9280,11850,14100,16740,19510,22480,26100,30640,38970]},
- ]
+ {
+ "name": "Single_man",
+ "sheet": "TYPMENR_1",
+ "col_pattern": "TYM1",
+ "data": [9180, 12830, 15100, 17740, 19800, 21890, 24780, 28290, 34850],
+ },
+ {
+ "name": "Single_wom",
+ "sheet": "TYPMENR_2",
+ "col_pattern": "TYM2",
+ "data": [10730, 13730, 16220, 18420, 20260, 22160, 24680, 27990, 33570],
+ },
+ {
+ "name": "Couple_without_child",
+ "sheet": "TYPMENR_3",
+ "col_pattern": "TYM3",
+ "data": [15360, 19560, 22600, 25260, 27990, 30980, 34710, 39640, 49110],
+ },
+ {
+ "name": "Couple_with_child",
+ "sheet": "TYPMENR_4",
+ "col_pattern": "TYM4",
+ "data": [11790, 15540, 19240, 22670, 25850, 29180, 33090, 38570, 48700],
+ },
+ {
+ "name": "Single_parent",
+ "sheet": "TYPMENR_5",
+ "col_pattern": "TYM5",
+ "data": [9350, 11150, 12830, 14660, 16640, 18760, 21230, 24700, 31170],
+ },
+ {
+ "name": "complex_hh",
+ "sheet": "TYPMENR_6",
+ "col_pattern": "TYM6",
+ "data": [9280, 11850, 14100, 16740, 19510, 22480, 26100, 30640, 38970],
+ },
+ ],
}
- df_income = df.drop_duplicates("municipality")[["municipality"]].rename(columns = dict(municipality = "CODGEO"))
+ df_income = df.drop_duplicates("municipality")[["municipality"]].rename(
+ columns=dict(municipality="CODGEO")
+ )
df_income_ensemble = df_income.copy()
@@ -276,9 +366,9 @@ def create(output_path):
df_income_ensemble["D919"] = 32303.0
# Deliberately remove some of them
- df_income_ensemble = df_income_ensemble[~df_income_ensemble["CODGEO"].isin([
- "1A015", "1A016"
- ])]
+ df_income_ensemble = df_income_ensemble[
+ ~df_income_ensemble["CODGEO"].isin(["1A015", "1A016"])
+ ]
# Deliberately only provide median for some
f = df_income_ensemble["CODGEO"].isin(["1D002", "1D005"])
@@ -288,17 +378,25 @@ def create(output_path):
value["df"] = df_income.copy()
col_pattern = value["col_pattern"]
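+        # Decile columns are "<col_pattern>D<q><year>", except the median (q == 5) which is "<col_pattern>Q2<year>"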
columns = [
- "%sD%d" % (col_pattern, q) + filosofi_year if q != 5 else col_pattern + "Q2" + filosofi_year
+ (
+ "%sD%d" % (col_pattern, q) + filosofi_year
+ if q != 5
+ else col_pattern + "Q2" + filosofi_year
+ )
for q in range(1, 10)
]
for i, column in enumerate(columns):
value["df"][column] = value["data"][i]
-
+
for value in income_data["family_comp"]:
value["df"] = df_income.copy()
col_pattern = value["col_pattern"]
columns = [
- "%sD%d" % (col_pattern, q) + filosofi_year if q != 5 else col_pattern + "Q2" + filosofi_year
+ (
+ "%sD%d" % (col_pattern, q) + filosofi_year
+ if q != 5
+ else col_pattern + "Q2" + filosofi_year
+ )
for q in range(1, 10)
]
for i, column in enumerate(columns):
@@ -306,30 +404,33 @@ def create(output_path):
os.mkdir("%s/filosofi_2019" % output_path)
- with zipfile.ZipFile("%s/filosofi_2019/indic-struct-distrib-revenu-2019-COMMUNES.zip" % output_path, "w") as archive:
+ with zipfile.ZipFile(
+ "%s/filosofi_2019/indic-struct-distrib-revenu-2019-COMMUNES.zip" % output_path,
+ "w",
+ ) as archive:
with archive.open("FILO2019_DISP_COM.xlsx", "w") as f:
- with pd.ExcelWriter(f) as writer:
+ with pd.ExcelWriter(f) as writer:
df_income_ensemble.to_excel(
- writer, sheet_name = "ENSEMBLE", startrow = 5, index = False
+ writer, sheet_name="ENSEMBLE", startrow=5, index=False
)
for value in income_data["househod_size"]:
value["df"].to_excel(
- writer, sheet_name = value["sheet"], startrow = 5, index = False
+ writer, sheet_name=value["sheet"], startrow=5, index=False
)
for value in income_data["family_comp"]:
value["df"].to_excel(
- writer, sheet_name = value["sheet"], startrow = 5, index = False
+ writer, sheet_name=value["sheet"], startrow=5, index=False
)
# Data set: ENTD
print("Creating ENTD ...")
data = dict(
- Q_MENAGE = [],
- Q_TCM_MENAGE = [],
- Q_INDIVIDU = [],
- Q_TCM_INDIVIDU = [],
- K_DEPLOC = [],
+ Q_MENAGE=[],
+ Q_TCM_MENAGE=[],
+ Q_INDIVIDU=[],
+ Q_TCM_INDIVIDU=[],
+ K_DEPLOC=[],
)
for household_index in range(HTS_HOUSEHOLDS):
@@ -338,107 +439,182 @@ def create(output_path):
region = random.choice([10, 20])
department = "%d%s" % (region // 10, random.choice(["A", "B", "C", "D"]))
- data["Q_MENAGE"].append(dict(
- DEP = department, idENT_MEN = household_id, PONDV1 = 1.0,
- RG = region, V1_JNBVELOADT = random.randint(4),
- V1_JNBVEH = random.randint(3), V1_JNBMOTO = random.randint(2),
- V1_JNBCYCLO = 0
- ))
-
- data["Q_TCM_MENAGE"].append(dict(
- NPERS = 3, PONDV1 = 1.0, DEP = department,
- idENT_MEN = household_id, RG = region,
- TrancheRevenuMensuel = random.choice([
- "Moins de 400", "De 400", "De 600", "De 800",
- "De 1 000", "De 1 200", "De 1 500", "De 1800",
- "De 2 000", "De 2 500", "De 3 000", "De 4 000",
- "De 6 000", "10 000"
- ]), numcom_UU2010 = ["B", "C", "I", "R"][household_index % 4]
- ))
+ data["Q_MENAGE"].append(
+ dict(
+ DEP=department,
+ idENT_MEN=household_id,
+ PONDV1=1.0,
+ RG=region,
+ V1_JNBVELOADT=random.randint(4),
+ V1_JNBVEH=random.randint(3),
+ V1_JNBMOTO=random.randint(2),
+ V1_JNBCYCLO=0,
+ )
+ )
+
+ data["Q_TCM_MENAGE"].append(
+ dict(
+ NPERS=3,
+ PONDV1=1.0,
+ DEP=department,
+ idENT_MEN=household_id,
+ RG=region,
+ TrancheRevenuMensuel=random.choice(
+ [
+ "Moins de 400",
+ "De 400",
+ "De 600",
+ "De 800",
+ "De 1 000",
+ "De 1 200",
+ "De 1 500",
+ "De 1800",
+ "De 2 000",
+ "De 2 500",
+ "De 3 000",
+ "De 4 000",
+ "De 6 000",
+ "10 000",
+ ]
+ ),
+ numcom_UU2010=["B", "C", "I", "R"][household_index % 4],
+ )
+ )
for person_index in range(HTS_HOUSEHOLD_MEMBERS):
person_id = household_id * 1000 + person_index
studies = random.random_sample() < 0.3
- data["Q_INDIVIDU"].append(dict(
- IDENT_IND = person_id, idENT_MEN = household_id,
- RG = region,
- V1_GPERMIS = random.choice([1, 2]), V1_GPERMIS2R = random.choice([1, 2]),
- V1_ICARTABON = random.choice([1, 2]),
- ))
-
- data["Q_TCM_INDIVIDU"].append(dict(
- AGE = random.randint(90), SEXE = random.choice([1, 2]),
- CS24 = random.randint(8) * 10, DEP = department,
- ETUDES = 1 if studies else 2, IDENT_IND = person_id,
- IDENT_MEN = household_id, PONDV1 = 1.0,
- SITUA = random.choice([1, 2])
- ))
-
- if person_index == 0: # Only one person per household has activity chain
+ data["Q_INDIVIDU"].append(
+ dict(
+ IDENT_IND=person_id,
+ idENT_MEN=household_id,
+ RG=region,
+ V1_GPERMIS=random.choice([1, 2]),
+ V1_GPERMIS2R=random.choice([1, 2]),
+ V1_ICARTABON=random.choice([1, 2]),
+ )
+ )
+
+ data["Q_TCM_INDIVIDU"].append(
+ dict(
+ AGE=random.randint(90),
+ SEXE=random.choice([1, 2]),
+ CS24=random.randint(8) * 10,
+ DEP=department,
+ ETUDES=1 if studies else 2,
+ IDENT_IND=person_id,
+ IDENT_MEN=household_id,
+ PONDV1=1.0,
+ SITUA=random.choice([1, 2]),
+ )
+ )
+
+ if person_index == 0: # Only one person per household has activity chain
home_department = department
work_department = random.choice(df["department"].unique())
purpose = "1.11" if studies else "9"
mode = random.choice(["1", "2", "2.20", "2.23", "4"])
- data["K_DEPLOC"].append(dict(
- IDENT_IND = person_id, V2_MMOTIFDES = purpose, V2_MMOTIFORI = 1,
- V2_TYPJOUR = 1, V2_MORIHDEP = "08:00:00", V2_MDESHARR = "09:00:00",
- V2_MDISTTOT = 3, # km
- IDENT_JOUR = 1, V2_MTP = mode,
- V2_MDESDEP = work_department,
- V2_MORIDEP = home_department,
- NDEP = 4, V2_MOBILREF = 1, PONDKI = 3.0
- ))
-
- data["K_DEPLOC"].append(dict(
- IDENT_IND = person_id, V2_MMOTIFDES = 2, V2_MMOTIFORI = purpose,
- V2_TYPJOUR = 1, V2_MORIHDEP = "17:00:00", V2_MDESHARR = "17:30:00",
- V2_MDISTTOT = 3, # km
- IDENT_JOUR = 1, V2_MTP = mode,
- V2_MDESDEP = home_department,
- V2_MORIDEP = work_department,
- NDEP = 4, V2_MOBILREF = 1, PONDKI = 3.0
- ))
-
- data["K_DEPLOC"].append(dict(
- IDENT_IND = person_id, V2_MMOTIFDES = 1, V2_MMOTIFORI = 2,
- V2_TYPJOUR = 1, V2_MORIHDEP = "18:00:00", V2_MDESHARR = "19:00:00",
- V2_MDISTTOT = 3, # km
- IDENT_JOUR = 1, V2_MTP = mode,
- V2_MDESDEP = home_department,
- V2_MORIDEP = home_department,
- NDEP = 4, V2_MOBILREF = 1, PONDKI = 3.0
- ))
+ data["K_DEPLOC"].append(
+ dict(
+ IDENT_IND=person_id,
+ V2_MMOTIFDES=purpose,
+ V2_MMOTIFORI=1,
+ V2_TYPJOUR=1,
+ V2_MORIHDEP="08:00:00",
+ V2_MDESHARR="09:00:00",
+ V2_MDISTTOT=3, # km
+ IDENT_JOUR=1,
+ V2_MTP=mode,
+ V2_MDESDEP=work_department,
+ V2_MORIDEP=home_department,
+ NDEP=4,
+ V2_MOBILREF=1,
+ PONDKI=3.0,
+ )
+ )
+
+ data["K_DEPLOC"].append(
+ dict(
+ IDENT_IND=person_id,
+ V2_MMOTIFDES=2,
+ V2_MMOTIFORI=purpose,
+ V2_TYPJOUR=1,
+ V2_MORIHDEP="17:00:00",
+ V2_MDESHARR="17:30:00",
+ V2_MDISTTOT=3, # km
+ IDENT_JOUR=1,
+ V2_MTP=mode,
+ V2_MDESDEP=home_department,
+ V2_MORIDEP=work_department,
+ NDEP=4,
+ V2_MOBILREF=1,
+ PONDKI=3.0,
+ )
+ )
+
+ data["K_DEPLOC"].append(
+ dict(
+ IDENT_IND=person_id,
+ V2_MMOTIFDES=1,
+ V2_MMOTIFORI=2,
+ V2_TYPJOUR=1,
+ V2_MORIHDEP="18:00:00",
+ V2_MDESHARR="19:00:00",
+ V2_MDISTTOT=3, # km
+ IDENT_JOUR=1,
+ V2_MTP=mode,
+ V2_MDESDEP=home_department,
+ V2_MORIDEP=home_department,
+ NDEP=4,
+ V2_MOBILREF=1,
+ PONDKI=3.0,
+ )
+ )
# Add a tail
- data["K_DEPLOC"].append(dict(
- IDENT_IND = person_id, V2_MMOTIFDES = 2, V2_MMOTIFORI = 1,
- V2_TYPJOUR = 1, V2_MORIHDEP = "21:00:00", V2_MDESHARR = "22:00:00",
- V2_MDISTTOT = 3, # km
- IDENT_JOUR = 1, V2_MTP = mode,
- V2_MDESDEP = home_department,
- V2_MORIDEP = home_department,
- NDEP = 4, V2_MOBILREF = 1, PONDKI = 3.0
- ))
+ data["K_DEPLOC"].append(
+ dict(
+ IDENT_IND=person_id,
+ V2_MMOTIFDES=2,
+ V2_MMOTIFORI=1,
+ V2_TYPJOUR=1,
+ V2_MORIHDEP="21:00:00",
+ V2_MDESHARR="22:00:00",
+ V2_MDISTTOT=3, # km
+ IDENT_JOUR=1,
+ V2_MTP=mode,
+ V2_MDESDEP=home_department,
+ V2_MORIDEP=home_department,
+ NDEP=4,
+ V2_MOBILREF=1,
+ PONDKI=3.0,
+ )
+ )
os.mkdir("%s/entd_2008" % output_path)
- pd.DataFrame.from_records(data["Q_MENAGE"]).to_csv("%s/entd_2008/Q_menage.csv" % output_path, index = False, sep = ";")
- pd.DataFrame.from_records(data["Q_TCM_MENAGE"]).to_csv("%s/entd_2008/Q_tcm_menage_0.csv" % output_path, index = False, sep = ";")
- pd.DataFrame.from_records(data["Q_INDIVIDU"]).to_csv("%s/entd_2008/Q_individu.csv" % output_path, index = False, sep = ";")
- pd.DataFrame.from_records(data["Q_TCM_INDIVIDU"]).to_csv("%s/entd_2008/Q_tcm_individu.csv" % output_path, index = False, sep = ";")
- pd.DataFrame.from_records(data["K_DEPLOC"]).to_csv("%s/entd_2008/K_deploc.csv" % output_path, index = False, sep = ";")
-
+ pd.DataFrame.from_records(data["Q_MENAGE"]).to_csv(
+ "%s/entd_2008/Q_menage.csv" % output_path, index=False, sep=";"
+ )
+ pd.DataFrame.from_records(data["Q_TCM_MENAGE"]).to_csv(
+ "%s/entd_2008/Q_tcm_menage_0.csv" % output_path, index=False, sep=";"
+ )
+ pd.DataFrame.from_records(data["Q_INDIVIDU"]).to_csv(
+ "%s/entd_2008/Q_individu.csv" % output_path, index=False, sep=";"
+ )
+ pd.DataFrame.from_records(data["Q_TCM_INDIVIDU"]).to_csv(
+ "%s/entd_2008/Q_tcm_individu.csv" % output_path, index=False, sep=";"
+ )
+ pd.DataFrame.from_records(data["K_DEPLOC"]).to_csv(
+ "%s/entd_2008/K_deploc.csv" % output_path, index=False, sep=";"
+ )
# Data set: EGT
print("Creating EGT ...")
- data = dict(
- households = [],
- persons = [],
- trips = []
- )
+ data = dict(households=[], persons=[], trips=[])
person_index = 0
for household_index in range(HTS_HOUSEHOLDS):
@@ -448,30 +624,50 @@ def create(output_path):
region = df[df["municipality"] == municipality]["region"].values[0]
department = df[df["municipality"] == municipality]["department"].values[0]
- data["households"].append(dict(
- RESDEP = department, NQUEST = household_id, POIDSM = 1.0,
- NB_VELO = random.randint(3), NB_VD = random.randint(2),
- RESCOMM = municipality, NB_2RM = 0,
- MNP = 3, REVENU = random.randint(12)
- ))
+ data["households"].append(
+ dict(
+ RESDEP=department,
+ NQUEST=household_id,
+ POIDSM=1.0,
+ NB_VELO=random.randint(3),
+ NB_VD=random.randint(2),
+ RESCOMM=municipality,
+ NB_2RM=0,
+ MNP=3,
+ REVENU=random.randint(12),
+ )
+ )
for person_id in range(1, HTS_HOUSEHOLD_MEMBERS + 1):
studies = random.random_sample() < 0.3
- data["persons"].append(dict(
- RESDEP = department, NP = person_id, POIDSP = 1.0,
- NQUEST = household_id, SEXE = random.choice([1, 2]),
- AGE = random.randint(90), PERMVP = random.choice([1, 2]),
- ABONTC = random.choice([1, 2]), OCCP = 3 if studies else 2,
- PERM2RM = random.choice([1, 2]), NBDEPL = 2, CS8 = random.randint(9)
- ))
+ data["persons"].append(
+ dict(
+ RESDEP=department,
+ NP=person_id,
+ POIDSP=1.0,
+ NQUEST=household_id,
+ SEXE=random.choice([1, 2]),
+ AGE=random.randint(90),
+ PERMVP=random.choice([1, 2]),
+ ABONTC=random.choice([1, 2]),
+ OCCP=3 if studies else 2,
+ PERM2RM=random.choice([1, 2]),
+ NBDEPL=2,
+ CS8=random.randint(9),
+ )
+ )
home_department = department
home_municipality = municipality
work_municipality = random.choice(df["municipality"].unique())
- work_region = df[df["municipality"] == work_municipality]["region"].values[0]
- work_department = df[df["municipality"] == work_municipality]["department"].values[0]
+ work_region = df[df["municipality"] == work_municipality]["region"].values[
+ 0
+ ]
+ work_department = df[df["municipality"] == work_municipality][
+ "department"
+ ].values[0]
purpose = 4 if studies else 2
mode = random.choice([1, 2, 3, 5, 7])
@@ -484,43 +680,97 @@ def create(output_path):
origin_hour = 0
origin_minute = 12
- data["trips"].append(dict(
- NQUEST = household_id, NP = person_id,
- ND = 1, ORDEP = home_department, DESTDEP = work_department,
- ORH = origin_hour, ORM = origin_minute, DESTH = 9, DESTM = 0, ORCOMM = home_municipality,
- DESTCOMM = work_municipality, DPORTEE = 3, MODP_H7 = 2,
- DESTMOT_H9 = purpose, ORMOT_H9 = 1
- ))
-
- data["trips"].append(dict(
- NQUEST = household_id, NP = person_id,
- ND = 2, ORDEP = work_department, DESTDEP = home_department,
- ORH = 8, ORM = 0, DESTH = 9, DESTM = 0, ORCOMM = work_municipality,
- DESTCOMM = home_municipality, DPORTEE = 3, MODP_H7 = 2,
- DESTMOT_H9 = 5, ORMOT_H9 = purpose
- ))
-
- data["trips"].append(dict(
- NQUEST = household_id, NP = person_id,
- ND = 3, ORDEP = home_department, DESTDEP = home_department,
- ORH = 17, ORM = 0, DESTH = 18, DESTM = 0, ORCOMM = home_municipality,
- DESTCOMM = home_municipality, DPORTEE = 3, MODP_H7 = 2,
- DESTMOT_H9 = 1, ORMOT_H9 = 5
- ))
+ data["trips"].append(
+ dict(
+ NQUEST=household_id,
+ NP=person_id,
+ ND=1,
+ ORDEP=home_department,
+ DESTDEP=work_department,
+ ORH=origin_hour,
+ ORM=origin_minute,
+ DESTH=9,
+ DESTM=0,
+ ORCOMM=home_municipality,
+ DESTCOMM=work_municipality,
+ DPORTEE=3,
+ MODP_H7=2,
+ DESTMOT_H9=purpose,
+ ORMOT_H9=1,
+ )
+ )
+
+ data["trips"].append(
+ dict(
+ NQUEST=household_id,
+ NP=person_id,
+ ND=2,
+ ORDEP=work_department,
+ DESTDEP=home_department,
+ ORH=8,
+ ORM=0,
+ DESTH=9,
+ DESTM=0,
+ ORCOMM=work_municipality,
+ DESTCOMM=home_municipality,
+ DPORTEE=3,
+ MODP_H7=2,
+ DESTMOT_H9=5,
+ ORMOT_H9=purpose,
+ )
+ )
+
+ data["trips"].append(
+ dict(
+ NQUEST=household_id,
+ NP=person_id,
+ ND=3,
+ ORDEP=home_department,
+ DESTDEP=home_department,
+ ORH=17,
+ ORM=0,
+ DESTH=18,
+ DESTM=0,
+ ORCOMM=home_municipality,
+ DESTCOMM=home_municipality,
+ DPORTEE=3,
+ MODP_H7=2,
+ DESTMOT_H9=1,
+ ORMOT_H9=5,
+ )
+ )
# Tail
- data["trips"].append(dict(
- NQUEST = household_id, NP = person_id,
- ND = 4, ORDEP = home_department, DESTDEP = home_department,
- ORH = 22, ORM = 0, DESTH = 21, DESTM = 0, ORCOMM = home_municipality,
- DESTCOMM = home_municipality, DPORTEE = 3, MODP_H7 = 2,
- DESTMOT_H9 = 5, ORMOT_H9 = 1
- ))
+ data["trips"].append(
+ dict(
+ NQUEST=household_id,
+ NP=person_id,
+ ND=4,
+ ORDEP=home_department,
+ DESTDEP=home_department,
+ ORH=22,
+ ORM=0,
+ DESTH=21,
+ DESTM=0,
+ ORCOMM=home_municipality,
+ DESTCOMM=home_municipality,
+ DPORTEE=3,
+ MODP_H7=2,
+ DESTMOT_H9=5,
+ ORMOT_H9=1,
+ )
+ )
os.mkdir("%s/egt_2010" % output_path)
- pd.DataFrame.from_records(data["households"]).to_csv("%s/egt_2010/Menages_semaine.csv" % output_path, index = False, sep = ",")
- pd.DataFrame.from_records(data["persons"]).to_csv("%s/egt_2010/Personnes_semaine.csv" % output_path, index = False, sep = ",")
- pd.DataFrame.from_records(data["trips"]).to_csv("%s/egt_2010/Deplacements_semaine.csv" % output_path, index = False, sep = ",")
+ pd.DataFrame.from_records(data["households"]).to_csv(
+ "%s/egt_2010/Menages_semaine.csv" % output_path, index=False, sep=","
+ )
+ pd.DataFrame.from_records(data["persons"]).to_csv(
+ "%s/egt_2010/Personnes_semaine.csv" % output_path, index=False, sep=","
+ )
+ pd.DataFrame.from_records(data["trips"]).to_csv(
+ "%s/egt_2010/Deplacements_semaine.csv" % output_path, index=False, sep=","
+ )
# Data set: Census
print("Creating census ...")
@@ -532,40 +782,68 @@ def create(output_path):
iris = df["iris"].iloc[random.randint(len(df))]
department = iris[:2]
- if iris.endswith("0000"): iris = iris[:-4] + "XXXX"
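+        # Communes that are not divided into IRIS are coded with the "XXXX" suffix in the census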
+ if iris.endswith("0000"):
+ iris = iris[:-4] + "XXXX"
- if random.random_sample() < 0.1: # For some, commune is not known
+ if random.random_sample() < 0.1: # For some, commune is not known
iris = "ZZZZZZZZZ"
destination_municipality = random.choice(df["municipality"].unique())
- destination_department = df[df["municipality"] == destination_municipality]["department"].values[0]
+ destination_department = df[df["municipality"] == destination_municipality][
+ "department"
+ ].values[0]
for person_index in range(CENSUS_HOUSEHOLD_MEMBERS):
- persons.append(dict(
- CANTVILLE = "ABCE", NUMMI = household_id,
- AGED = "%03d" % random.randint(90), COUPLE = random.choice([1, 2]),
- CS1 = random.randint(9),
- DEPT = department, IRIS = iris, REGION = region, ETUD = random.choice([1, 2]),
- ILETUD = 4 if department != destination_department else 0,
- ILT = 4 if department != destination_department else 0,
- IPONDI = float(1.0),
- SEXE = random.choice([1, 2]),
- TACT = random.choice([1, 2]),
- TRANS = 4, VOIT = random.randint(3), DEROU = random.randint(2)
- ))
+ persons.append(
+ dict(
+ CANTVILLE="ABCE",
+ NUMMI=household_id,
+ AGED="%03d" % random.randint(90),
+ COUPLE=random.choice([1, 2]),
+ CS1=random.randint(9),
+ DEPT=department,
+ IRIS=iris,
+ REGION=region,
+ ETUD=random.choice([1, 2]),
+ ILETUD=4 if department != destination_department else 0,
+ ILT=4 if department != destination_department else 0,
+ IPONDI=float(1.0),
+ SEXE=random.choice([1, 2]),
+ TACT=random.choice([1, 2]),
+ TRANS=4,
+ VOIT=random.randint(3),
+ DEROU=random.randint(2),
+ )
+ )
columns = [
- "CANTVILLE", "NUMMI", "AGED", "COUPLE", "CS1", "DEPT", "IRIS", "REGION",
- "ETUD", "ILETUD", "ILT", "IPONDI",
- "SEXE", "TACT", "TRANS", "VOIT", "DEROU"
+ "CANTVILLE",
+ "NUMMI",
+ "AGED",
+ "COUPLE",
+ "CS1",
+ "DEPT",
+ "IRIS",
+ "REGION",
+ "ETUD",
+ "ILETUD",
+ "ILT",
+ "IPONDI",
+ "SEXE",
+ "TACT",
+ "TRANS",
+ "VOIT",
+ "DEROU",
]
df_persons = pd.DataFrame.from_records(persons)[columns]
df_persons.columns = columns
- with zipfile.ZipFile("%s/rp_2019/RP2019_INDCVI_csv.zip" % output_path, "w") as archive:
+ with zipfile.ZipFile(
+ "%s/rp_2019/RP2019_INDCVI_csv.zip" % output_path, "w"
+ ) as archive:
with archive.open("FD_INDCVI_2019.csv", "w") as f:
- df_persons.to_csv(f, sep = ";")
+ df_persons.to_csv(f, sep=";")
# Data set: commute flows
print("Creating commute flows ...")
@@ -574,11 +852,15 @@ def create(output_path):
observations = COMMUTE_FLOW_OBSERVATIONS
# ... work
- df_work = pd.DataFrame(dict(
- COMMUNE = municipalities[random.randint(0, len(municipalities), observations)],
- DCLT = municipalities[random.randint(0, len(municipalities), observations)],
- TRANS = random.randint(1, 6, size = (observations,))
- ))
+ df_work = pd.DataFrame(
+ dict(
+ COMMUNE=municipalities[
+ random.randint(0, len(municipalities), observations)
+ ],
+ DCLT=municipalities[random.randint(0, len(municipalities), observations)],
+ TRANS=random.randint(1, 6, size=(observations,)),
+ )
+ )
df_work["ARM"] = "Z"
df_work["IPONDI"] = 1.0
@@ -586,25 +868,33 @@ def create(output_path):
columns = ["COMMUNE", "DCLT", "TRANS", "ARM", "IPONDI"]
df_work.columns = columns
- with zipfile.ZipFile("%s/rp_2019/RP2019_MOBPRO_csv.zip" % output_path, "w") as archive:
+ with zipfile.ZipFile(
+ "%s/rp_2019/RP2019_MOBPRO_csv.zip" % output_path, "w"
+ ) as archive:
with archive.open("FD_MOBPRO_2019.csv", "w") as f:
- df_work.to_csv(f, sep = ";")
+ df_work.to_csv(f, sep=";")
# ... education
- df_education = pd.DataFrame(dict(
- COMMUNE = municipalities[random.randint(0, len(municipalities), observations)],
- DCETUF = municipalities[random.randint(0, len(municipalities), observations)]
- ))
+ df_education = pd.DataFrame(
+ dict(
+ COMMUNE=municipalities[
+ random.randint(0, len(municipalities), observations)
+ ],
+ DCETUF=municipalities[random.randint(0, len(municipalities), observations)],
+ )
+ )
df_education["ARM"] = "Z"
df_education["IPONDI"] = 1.0
df_education["AGEREV10"] = 1
- columns = ["COMMUNE", "DCETUF", "ARM", "IPONDI","AGEREV10"]
+ columns = ["COMMUNE", "DCETUF", "ARM", "IPONDI", "AGEREV10"]
df_education.columns = columns
- with zipfile.ZipFile("%s/rp_2019/RP2019_MOBSCO_csv.zip" % output_path, "w") as archive:
+ with zipfile.ZipFile(
+ "%s/rp_2019/RP2019_MOBSCO_csv.zip" % output_path, "w"
+ ) as archive:
with archive.open("FD_MOBSCO_2019.csv", "w") as f:
- df_education.to_csv(f, sep = ";")
+ df_education.to_csv(f, sep=";")
# Data set: BD-TOPO
print("Creating BD-TOPO ...")
@@ -615,43 +905,56 @@ def create(output_path):
x = df_selection["geometry"].centroid.x.values
y = df_selection["geometry"].centroid.y.values
- z = random.randint(100, 400, observations) # Not used but keeping unit test hashes constant
+ z = random.randint(
+ 100, 400, observations
+ ) # Not used but keeping unit test hashes constant
ids = [
- "BATIMENT{:016d}".format(n) for n in random.randint(1000, 1000000, observations)
+ "BATIMENT{:016d}".format(n) for n in random.randint(1000, 1000000, observations)
]
-
-    ids[0] = ids[1] # setting multiple addresses for one building use case
-
- df_bdtopo = gpd.GeoDataFrame({
- "nombre_de_logements": random.randint(0, 10, observations),
- "cleabs": ids,
- "geometry": [
- geo.Point(x, y) for x, y in zip(x, y)
- ]
- }, crs = "EPSG:2154")
+
+    ids[0] = ids[1]  # setting multiple addresses for one building use case
+
+ df_bdtopo = gpd.GeoDataFrame(
+ {
+ "nombre_de_logements": random.randint(0, 10, observations),
+ "cleabs": ids,
+ "geometry": [geo.Point(x, y) for x, y in zip(x, y)],
+ },
+ crs="EPSG:2154",
+ )
# polygons as buildings from iris centroid points
- df_bdtopo.set_geometry(df_bdtopo.buffer(40),inplace=True,drop=True,crs="EPSG:2154")
+ df_bdtopo.set_geometry(
+ df_bdtopo.buffer(40), inplace=True, drop=True, crs="EPSG:2154"
+ )
os.mkdir("{}/bdtopo_idf".format(output_path))
- df_bdtopo.to_file("{}/bdtopo_idf/content.gpkg".format(output_path), layer = "batiment")
+ df_bdtopo.to_file(
+ "{}/bdtopo_idf/content.gpkg".format(output_path), layer="batiment"
+ )
bdtopo_date = "2022-03-15"
bdtopo_departments = ["1A", "1B", "1C", "1D", "2A", "2B", "2C", "2D"]
- with py7zr.SevenZipFile("{}/bdtopo_idf/bdtopo.7z".format(output_path), "w") as archive:
- archive.write("{}/bdtopo_idf/content.gpkg".format(output_path), "content/content.gpkg")
+ with py7zr.SevenZipFile(
+ "{}/bdtopo_idf/bdtopo.7z".format(output_path), "w"
+ ) as archive:
+ archive.write(
+ "{}/bdtopo_idf/content.gpkg".format(output_path), "content/content.gpkg"
+ )
os.remove("{}/bdtopo_idf/content.gpkg".format(output_path))
-
+
for department in bdtopo_departments:
shutil.copyfile(
- "{}/bdtopo_idf/bdtopo.7z".format(output_path),
+ "{}/bdtopo_idf/bdtopo.7z".format(output_path),
"{}/bdtopo_idf/BDTOPO_3-0_TOUSTHEMES_GPKG_LAMB93_D0{}_{}.7z".format(
- output_path, department, bdtopo_date))
-
+ output_path, department, bdtopo_date
+ ),
+ )
+
os.remove("{}/bdtopo_idf/bdtopo.7z".format(output_path))
-
+
# Data set: BAN
print("Creating BAN ...")
@@ -663,16 +966,26 @@ def create(output_path):
y = df_selection["geometry"].centroid.y.values
municipality = df["municipality"].unique()
- df_ban = pd.DataFrame({
- "code_insee": municipality[random.randint(0, len(municipality), observations)],
- "x": x,
- "y": y})
+ df_ban = pd.DataFrame(
+ {
+ "code_insee": municipality[
+ random.randint(0, len(municipality), observations)
+ ],
+ "x": x,
+ "y": y,
+ }
+ )
- df_ban = df_ban[:round(len(x)*.8)]
+ df_ban = df_ban[: round(len(x) * 0.8)]
os.mkdir("%s/ban_idf" % output_path)
for dep in df["department"].unique():
- df_ban.to_csv("%s/ban_idf/adresses-%s.csv.gz" % (output_path, dep), compression='gzip', sep=";", index=False)
+ df_ban.to_csv(
+ "%s/ban_idf/adresses-%s.csv.gz" % (output_path, dep),
+ compression="gzip",
+ sep=";",
+ index=False,
+ )
# Data set: SIRENE
print("Creating SIRENE ...")
@@ -681,25 +994,35 @@ def create(output_path):
identifiers = random.randint(0, 99999999, observations)
- df_sirene = pd.DataFrame({
- "siren": identifiers,
- "siret": identifiers,
- "codeCommuneEtablissement": municipalities[random.randint(0, len(municipalities), observations)],
- "etatAdministratifEtablissement": "A"
- })
+ df_sirene = pd.DataFrame(
+ {
+ "siren": identifiers,
+ "siret": identifiers,
+ "codeCommuneEtablissement": municipalities[
+ random.randint(0, len(municipalities), observations)
+ ],
+ "etatAdministratifEtablissement": "A",
+ }
+ )
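+    # Every establishment gets the same fixed activity code and workforce size bracket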
df_sirene["activitePrincipaleEtablissement"] = "52.1"
df_sirene["trancheEffectifsEtablissement"] = "03"
-
os.mkdir("%s/sirene" % output_path)
- df_sirene.to_csv(output_path + "/sirene/StockEtablissement_utf8.zip", index = False, compression={'method': 'zip', 'archive_name': 'StockEtablissement_utf8.csv'})
-
+ df_sirene.to_csv(
+ output_path + "/sirene/StockEtablissement_utf8.zip",
+ index=False,
+ compression={"method": "zip", "archive_name": "StockEtablissement_utf8.csv"},
+ )
df_sirene = df_sirene[["siren"]].copy()
df_sirene["categorieJuridiqueUniteLegale"] = "1000"
- df_sirene.to_csv(output_path + "/sirene/StockUniteLegale_utf8.zip", index = False, compression={'method': 'zip', 'archive_name': 'StockUniteLegale_utf8.csv'})
+ df_sirene.to_csv(
+ output_path + "/sirene/StockUniteLegale_utf8.zip",
+ index=False,
+ compression={"method": "zip", "archive_name": "StockUniteLegale_utf8.csv"},
+ )
# Data set: SIRENE GEOLOCATION
print("Creating SIRENE GEOLOCATION...")
@@ -708,32 +1031,53 @@ def create(output_path):
x = df_selection["geometry"].centroid.x.values
y = df_selection["geometry"].centroid.y.values
- codes_com = df_codes["DEPCOM"].iloc[random.randint(0, len(df_iris), observations)]
+ codes_com = df_codes["DEPCOM"].iloc[random.randint(0, len(df_iris), observations)]
+
+ df_sirene_geoloc = pd.DataFrame(
+ {
+ "siret": identifiers,
+ "x": x,
+ "y": y,
+ "plg_code_commune": codes_com,
+ }
+ )
- df_sirene_geoloc = pd.DataFrame({
- "siret": identifiers,
- "x": x,
- "y": y,
- "plg_code_commune":codes_com,
- })
-
- df_sirene_geoloc.to_csv("%s/sirene/GeolocalisationEtablissement_Sirene_pour_etudes_statistiques_utf8.zip" % output_path, index = False, sep=";", compression={'method': 'zip', 'archive_name': 'GeolocalisationEtablissement_Sirene_pour_etudes_statistiques_utf8.csv'})
+ df_sirene_geoloc.to_csv(
+ "%s/sirene/GeolocalisationEtablissement_Sirene_pour_etudes_statistiques_utf8.zip"
+ % output_path,
+ index=False,
+ sep=";",
+ compression={
+ "method": "zip",
+ "archive_name": "GeolocalisationEtablissement_Sirene_pour_etudes_statistiques_utf8.csv",
+ },
+ )
# Data set: Urban type
print("Creating urban type ...")
- df_urban_type = df_codes[["DEPCOM"]].copy().rename(columns = { "DEPCOM": "CODGEO" })
+ df_urban_type = df_codes[["DEPCOM"]].copy().rename(columns={"DEPCOM": "CODGEO"})
df_urban_type = df_urban_type.drop_duplicates()
- df_urban_type["STATUT_2017"] = [["B", "C", "I", "H"][k % 4] for k in range(len(df_urban_type))]
+ df_urban_type["STATUT_2017"] = [
+ ["B", "C", "I", "H"][k % 4] for k in range(len(df_urban_type))
+ ]
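+    # Also add Paris (75056), Lyon (69123) and Marseille (13055) explicitly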
- df_urban_type = pd.concat([df_urban_type, pd.DataFrame({
- "CODGEO": ["75056", "69123", "13055"],
- "STATUT_2017": ["C", "C", "C"]
- })])
+ df_urban_type = pd.concat(
+ [
+ df_urban_type,
+ pd.DataFrame(
+ {"CODGEO": ["75056", "69123", "13055"], "STATUT_2017": ["C", "C", "C"]}
+ ),
+ ]
+ )
os.mkdir("%s/urban_type" % output_path)
- with zipfile.ZipFile("%s/urban_type/UU2020_au_01-01-2023.zip" % output_path, "w") as archive:
+ with zipfile.ZipFile(
+ "%s/urban_type/UU2020_au_01-01-2023.zip" % output_path, "w"
+ ) as archive:
with archive.open("UU2020_au_01-01-2023.xlsx", "w") as f:
- df_urban_type.to_excel(f, startrow = 5, sheet_name = "Composition_communale", index = False)
+ df_urban_type.to_excel(
+ f, startrow=5, sheet_name="Composition_communale", index=False
+ )
# Data set: OSM
    # We add a road grid of 500m
@@ -754,10 +1098,14 @@ def create(output_path):
for i in range(lengthx):
for j in range(lengthy):
- df_nodes.append(dict(
- id = node_index,
- geometry = geo.Point(anchor_x + 500 * i + 250, anchor_y - 500 * j - 250)
- ))
+ df_nodes.append(
+ dict(
+ id=node_index,
+ geometry=geo.Point(
+ anchor_x + 500 * i + 250, anchor_y - 500 * j - 250
+ ),
+ )
+ )
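+            # Link the node to the next one in the same column (edges along the y axis)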
if j < lengthy - 1:
links.append([node_index, node_index + 1])
@@ -767,75 +1115,117 @@ def create(output_path):
node_index += 1
- df_nodes = gpd.GeoDataFrame(df_nodes, crs = "EPSG:2154")
+ df_nodes = gpd.GeoDataFrame(df_nodes, crs="EPSG:2154")
df_nodes = df_nodes.to_crs("EPSG:4326")
for row in df_nodes.itertuples():
-        osm.append('<node id="%d" lat="%f" lon="%f" version="1"/>' % (
-            row[1], row[2].y, row[2].x
-        ))
+        osm.append(
+            '<node id="%d" lat="%f" lon="%f" version="1"/>'
+            % (row[1], row[2].y, row[2].x)
+        )
    for index, link in enumerate(links):
-        osm.append('<way id="%d" version="1">' % (index + 1))
+        osm.append(
+            '<way id="%d" version="1">' % (index + 1)
+        )
        osm.append('<nd ref="%d"/>' % link[0])
        osm.append('<nd ref="%d"/>' % link[1])
        osm.append('<tag k="highway" v="primary"/>')
-        osm.append('</way>')
+        osm.append("</way>")
-    osm.append('</osm>')
+    osm.append("</osm>")
import gzip
+
os.mkdir("%s/osm_idf" % output_path)
with gzip.open("%s/osm_idf/ile-de-france-220101.osm.gz" % output_path, "wb+") as f:
f.write(bytes("\n".join(osm), "utf-8"))
-
import subprocess
- subprocess.check_call([
- shutil.which("osmosis"), "--read-xml", "%s/osm_idf/ile-de-france-220101.osm.gz" % output_path,
- "--write-pbf", "%s/osm_idf/ile-de-france-220101.osm.pbf" % output_path
- ])
-
+ subprocess.check_call(
+ [
+ shutil.which("osmosis"),
+ "--read-xml",
+ "%s/osm_idf/ile-de-france-220101.osm.gz" % output_path,
+ "--write-pbf",
+ "%s/osm_idf/ile-de-france-220101.osm.pbf" % output_path,
+ ]
+ )
# Data set: GTFS
print("Creating GTFS ...")
feed = {}
- feed["agency"] = pd.DataFrame.from_records([dict(
- agency_id = 1, agency_name = "eqasim", agency_timezone = "Europe/Paris",
- agency_url = "https://eqasim.org"
- )])
+ feed["agency"] = pd.DataFrame.from_records(
+ [
+ dict(
+ agency_id=1,
+ agency_name="eqasim",
+ agency_timezone="Europe/Paris",
+ agency_url="https://eqasim.org",
+ )
+ ]
+ )
- feed["calendar"] = pd.DataFrame.from_records([dict(
- service_id = 1, monday = 1, tuesday = 1, wednesday = 1,
- thursday = 1, friday = 1, saturday = 1, sunday = 1, start_date = "20100101",
- end_date = "20500101"
- )])
+ feed["calendar"] = pd.DataFrame.from_records(
+ [
+ dict(
+ service_id=1,
+ monday=1,
+ tuesday=1,
+ wednesday=1,
+ thursday=1,
+ friday=1,
+ saturday=1,
+ sunday=1,
+ start_date="20100101",
+ end_date="20500101",
+ )
+ ]
+ )
- feed["routes"] = pd.DataFrame.from_records([dict(
- route_id = 1, agency_id = 1, route_short_name = "EQ",
- route_long_name = "The eqasim train", route_desc = "",
- route_type = 2
- )])
+ feed["routes"] = pd.DataFrame.from_records(
+ [
+ dict(
+ route_id=1,
+ agency_id=1,
+ route_short_name="EQ",
+ route_long_name="The eqasim train",
+ route_desc="",
+ route_type=2,
+ )
+ ]
+ )
df_stops = df[df["municipality"].isin(["1B019", "2D007"])].copy()
df_stops = df_stops.to_crs("EPSG:4326")
- feed["stops"] = pd.DataFrame.from_records([dict(
- stop_id = "A", stop_code = "A", stop_name = "A",
- stop_desc = "",
- stop_lat = df_stops["geometry"].iloc[0].centroid.y,
- stop_lon = df_stops["geometry"].iloc[0].centroid.x,
- location_type = 1, parent_station = None
- ), dict(
- stop_id = "B", stop_code = "B", stop_name = "B",
- stop_desc = "",
- stop_lat = df_stops["geometry"].iloc[1].centroid.y,
- stop_lon = df_stops["geometry"].iloc[1].centroid.x,
- location_type = 1, parent_station = None
- )])
+ feed["stops"] = pd.DataFrame.from_records(
+ [
+ dict(
+ stop_id="A",
+ stop_code="A",
+ stop_name="A",
+ stop_desc="",
+ stop_lat=df_stops["geometry"].iloc[0].centroid.y,
+ stop_lon=df_stops["geometry"].iloc[0].centroid.x,
+ location_type=1,
+ parent_station=None,
+ ),
+ dict(
+ stop_id="B",
+ stop_code="B",
+ stop_name="B",
+ stop_desc="",
+ stop_lat=df_stops["geometry"].iloc[1].centroid.y,
+ stop_lon=df_stops["geometry"].iloc[1].centroid.x,
+ location_type=1,
+ parent_station=None,
+ ),
+ ]
+ )
trips = []
times = []
@@ -844,19 +1234,27 @@ def create(output_path):
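+    # One trip per hour all day in each direction between stops A and B, with one hour travel time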
for origin, destination in [("A", "B"), ("B", "A")]:
for hour in np.arange(1, 24):
- trips.append(dict(
- route_id = 1, service_id = 1, trip_id = trip_id
- ))
-
- times.append(dict(
- trip_id = trip_id, arrival_time = "%02d:00:00" % hour,
- departure_time = "%02d:00:00" % hour, stop_id = origin, stop_sequence = 1
- ))
+ trips.append(dict(route_id=1, service_id=1, trip_id=trip_id))
+
+ times.append(
+ dict(
+ trip_id=trip_id,
+ arrival_time="%02d:00:00" % hour,
+ departure_time="%02d:00:00" % hour,
+ stop_id=origin,
+ stop_sequence=1,
+ )
+ )
- times.append(dict(
- trip_id = trip_id, arrival_time = "%02d:00:00" % (hour + 1),
- departure_time = "%02d:00:00" % (hour + 1), stop_id = destination, stop_sequence = 2
- ))
+ times.append(
+ dict(
+ trip_id=trip_id,
+ arrival_time="%02d:00:00" % (hour + 1),
+ departure_time="%02d:00:00" % (hour + 1),
+ stop_id=destination,
+ stop_sequence=2,
+ )
+ )
trip_id += 1
@@ -864,28 +1262,39 @@ def create(output_path):
feed["stop_times"] = pd.DataFrame.from_records(times)
# Transfers
- feed["transfers"] = pd.DataFrame(dict(
- from_stop_id = [], to_stop_id = [], transfer_type = []
- ))
+ feed["transfers"] = pd.DataFrame(
+ dict(from_stop_id=[], to_stop_id=[], transfer_type=[])
+ )
os.mkdir("%s/gtfs_idf" % output_path)
import data.gtfs.utils
+
data.gtfs.utils.write_feed(feed, "%s/gtfs_idf/IDFM-gtfs.zip" % output_path)
# Dataset: Parc automobile
- df_vehicles_region = pd.DataFrame(index = pd.MultiIndex.from_product([
- df["region"].unique(),
- np.arange(20),
- ], names = [
- "Code région", "Age au 01/01/2021"
- ])).reset_index()
+ df_vehicles_region = pd.DataFrame(
+ index=pd.MultiIndex.from_product(
+ [
+ df["region"].unique(),
+ np.arange(20),
+ ],
+ names=["Code région", "Age au 01/01/2021"],
+ )
+ ).reset_index()
# to enforce string
- df_vehicles_region = pd.concat([df_vehicles_region, pd.DataFrame({
- "Code région": ["AB"],
- "Age au 01/01/2021": [0],
- })])
+ df_vehicles_region = pd.concat(
+ [
+ df_vehicles_region,
+ pd.DataFrame(
+ {
+ "Code région": ["AB"],
+ "Age au 01/01/2021": [0],
+ }
+ ),
+ ]
+ )
df_vehicles_region["Code région"] = df_vehicles_region["Code région"].astype(str)
@@ -893,44 +1302,58 @@ def create(output_path):
df_vehicles_region["Energie"] = "Gazole"
df_vehicles_region["Vignette crit'air"] = "Crit'air 1"
- df_vehicles_region["Age au 01/01/2021"] = df_vehicles_region["Age au 01/01/2021"].astype(str)
- df_vehicles_region["Age au 01/01/2021"] = df_vehicles_region["Age au 01/01/2021"].replace("20", ">20")
- df_vehicles_region["Age au 01/01/2021"] = df_vehicles_region["Age au 01/01/2021"] + " ans"
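+    # Ages become strings "0 ans" ... "19 ans"; the oldest class is relabelled ">20 ans"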
+ df_vehicles_region["Age au 01/01/2021"] = df_vehicles_region[
+ "Age au 01/01/2021"
+ ].astype(str)
+ df_vehicles_region["Age au 01/01/2021"] = df_vehicles_region[
+ "Age au 01/01/2021"
+ ].replace("20", ">20")
+ df_vehicles_region["Age au 01/01/2021"] = (
+ df_vehicles_region["Age au 01/01/2021"] + " ans"
+ )
- df_vehicles_commune = pd.DataFrame({
- "municipality": df["municipality"].unique()
- })
+ df_vehicles_commune = pd.DataFrame({"municipality": df["municipality"].unique()})
df_vehicles_commune["Parc au 01/01/2021"] = 100
df_vehicles_commune["Energie"] = "Gazole"
df_vehicles_commune["Vignette Crit'air"] = "Crit'air 1"
- df_vehicles_commune = pd.merge(df_vehicles_commune, df[[
- "municipality", "region", "department"
- ]], on = "municipality")
+ df_vehicles_commune = pd.merge(
+ df_vehicles_commune,
+ df[["municipality", "region", "department"]],
+ on="municipality",
+ )
- df_vehicles_commune = df_vehicles_commune.rename(columns = {
- "municipality": "Code commune",
- "department": "Code départment",
- "region": "Code région",
- })
+ df_vehicles_commune = df_vehicles_commune.rename(
+ columns={
+ "municipality": "Code commune",
+ "department": "Code départment",
+ "region": "Code région",
+ }
+ )
os.mkdir("%s/vehicles" % output_path)
-
- with zipfile.ZipFile("%s/vehicles/parc_vp_regions.zip" % output_path, "w") as archive:
+
+ with zipfile.ZipFile(
+ "%s/vehicles/parc_vp_regions.zip" % output_path, "w"
+ ) as archive:
with archive.open("Parc_VP_Regions_2021.xlsx", "w") as f:
df_vehicles_region.to_excel(f)
- with zipfile.ZipFile("%s/vehicles/parc_vp_communes.zip" % output_path, "w") as archive:
+ with zipfile.ZipFile(
+ "%s/vehicles/parc_vp_communes.zip" % output_path, "w"
+ ) as archive:
with archive.open("Parc_VP_Communes_2021.xlsx", "w") as f:
df_vehicles_commune.to_excel(f)
+
if __name__ == "__main__":
import shutil
import sys
import os
+
folder = sys.argv[1]
os.makedirs(folder, exist_ok=True)
for dir in os.listdir(folder):
- shutil.rmtree(os.path.join(folder,dir))
+ shutil.rmtree(os.path.join(folder, dir))
create(sys.argv[1])