diff --git a/.gitignore b/.gitignore index a5914526..d382c642 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ + ## .DS_Store @@ -47,4 +48,8 @@ vignettes/*.pdf energy/*.DS_Store # diagnostic figures (meas) -energy/figures-and-results/compare-diagnostic-scenarios.R \ No newline at end of file +energy/figures-and-results/compare-diagnostic-scenarios.R + +# adding data folder +data/ + diff --git a/energy/data-processing-prep/create_ccs_scens.R b/energy/data-processing-prep/create_ccs_scens.R index 84e31a4e..e76e66b9 100644 --- a/energy/data-processing-prep/create_ccs_scens.R +++ b/energy/data-processing-prep/create_ccs_scens.R @@ -1,12 +1,15 @@ ## Tracey Mangin ## October 22, 2021 ## add infinity price to ccs +# revised: feb 16 2024 by Haejin +# Updated: 2/18/24 by Maxwell library(tidyverse) library(data.table) # paths ----- -scen_path = '/Volumes/GoogleDrive/Shared drives/emlab/projects/current-projects/calepa-cn/project-materials/scenario-inputs' +scen_path = '/capstone/freshcair/meds-freshcair-capstone/data/inputs/scenarios' +file_path = '/capstone/freshcair/meds-freshcair-capstone/data/processed' # added file path, b/c read and store at the same place ## files ccs_ext_file = 'ccs_extraction_scenarios.csv' @@ -25,24 +28,24 @@ setorder(ccs_infin, "year", "ccs_scenario", "ccs_price", "units") ## bind ccs_ext_revised <- rbind(ccs_scens_ext, ccs_infin) -fwrite(ccs_ext_revised, file.path(scen_path, "ccs_extraction_scenarios_revised.csv")) +fwrite(ccs_ext_revised, file.path(file_path , "ccs_extraction_scenarios_revised.csv")) # revised file path ## refining - load ccs scenarios -ccs_scens_ref = fread(file.path(scen_path, ccs_ref_file), header = T) - - -## ccs infinity -ccs_infin_r <- unique(ccs_scens_ref[, .(year, units)]) -ccs_infin_r[, ccs_scenario := "no ccs"] -ccs_infin_r[, ccs_price := Inf] - -setorder(ccs_infin_r, "year", "ccs_scenario", "ccs_price", "units") - -## bind -ccs_ref_revised <- rbind(ccs_scens_ref, ccs_infin_r) - -fwrite(ccs_ref_revised, 
file.path(scen_path, "ccs_refining_scenarios_revised.csv")) +# ccs_scens_ref = fread(file.path(scen_path, ccs_ref_file), header = T) +# +# +# ## ccs infinity +# ccs_infin_r <- unique(ccs_scens_ref[, .(year, units)]) +# ccs_infin_r[, ccs_scenario := "no ccs"] +# ccs_infin_r[, ccs_price := Inf] +# +# setorder(ccs_infin_r, "year", "ccs_scenario", "ccs_price", "units") +# +# ## bind +# ccs_ref_revised <- rbind(ccs_scens_ref, ccs_infin_r) +# +# fwrite(ccs_ref_revised, file.path(file_path, "ccs_refining_scenarios_revised.csv")) # revised the file path diff --git a/energy/data-processing-prep/extraction/clean_doc_prod.R b/energy/data-processing-prep/extraction/clean_doc_prod.R index 5e6a482b..6a69ae1a 100644 --- a/energy/data-processing-prep/extraction/clean_doc_prod.R +++ b/energy/data-processing-prep/extraction/clean_doc_prod.R @@ -2,17 +2,25 @@ ## April 21, 2020 ## Data cleaning -- oil production and injection data ## Data from DOC +# updated: 02/09/2024 by Maxwell + +# add update: Feb 14 2024 by Haejin ## libraries library(tidyverse) library(readr) library(lubridate) library(rebus) -library(readtext) +#library(readtext) # update -haejin library(readxl) +library(here) +library(dplyr) # update -haejin ## set directory -data_directory <- "/Volumes/GoogleDrive/Shared\ drives/emlab/projects/current-projects/calepa-cn/data/stocks-flows/" +setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) +setwd('/capstone/freshcair/meds-freshcair-capstone') # Sets directory based on Taylor structure +getwd() + ## read in data # ------------------------------------- @@ -25,33 +33,37 @@ data_directory <- "/Volumes/GoogleDrive/Shared\ drives/emlab/projects/current-pr ## WellTypeCode -- The code for the Completion type. 
## OilorCondensateProduced -## all wells -all_wells <- read_xlsx(paste0(data_directory, "raw/All_wells_20200417.xlsx")) +# UPDATED - MP +all_wells <- read_xlsx("data/inputs/extraction/All_wells_20200417.xlsx") +# UPDATE - not all of the data is in the folders, must not have been loaded somehow +# But the paths will be correct when the data is inputted properly ## well production -prod_7785 <- read_csv(paste0(data_directory, "raw/hist_well/CSV_1977_1985/CaliforniaOilAndGasWellMonthlyProduction.csv")) -prod_8689 <- read_csv(paste0(data_directory, "raw/hist_well/CSV_1986_1989/CaliforniaOilAndGasWellMonthlyProduction.csv")) -prod_9094 <- read_csv(paste0(data_directory, "raw/hist_well/CSV_1990_1994/CaliforniaOilAndGasWellMonthlyProduction.csv")) -prod_9599 <- read_csv(paste0(data_directory, "raw/hist_well/CSV_1995_1999/CaliforniaOilAndGasWellMonthlyProduction.csv")) -prod_0004 <- read_csv(paste0(data_directory, "raw/hist_well/CSV_2000_2004/CaliforniaOilAndGasWellMonthlyProduction.csv")) -prod_0509 <- read_csv(paste0(data_directory, "raw/hist_well/CSV_2005_2009/CaliforniaOilAndGasWellMonthlyProduction.csv")) -prod_1514 <- read_csv(paste0(data_directory, "raw/hist_well/CSV_2010_2014/CaliforniaOilAndGasWellMonthlyProduction.csv")) -prod_15 <- read_csv(paste0(data_directory, "raw/hist_well/CSV_2015/CaliforniaOilAndGasWellMonthlyProduction.csv")) -prod_16 <- read_csv(paste0(data_directory, "raw/hist_well/CSV_2016/CaliforniaOilAndGasWellMonthlyProduction.csv")) -prod_17 <- read_csv(paste0(data_directory, "raw/hist_well/CSV_2017/CaliforniaOilAndGasWellMonthlyProduction.csv")) -prod_18 <- read_csv(paste0(data_directory, "raw/hist_well/CSV_2018/CaliforniaOilAndGasWellMonthlyProduction.csv")) -prod_19 <- read_csv(paste0(data_directory, "raw/hist_well/CSV_2019/CaliforniaOilAndGasWellMonthlyProduction.csv")) +prod_7785 <- read_csv("data/inputs/extraction/monthly-prod-inj-wells/CSV_1977_1985/CaliforniaOilAndGasWellMonthlyProduction.csv") +prod_8689 <- 
read_csv("data/inputs/extraction/monthly-prod-inj-wells/CSV_1986_1989/CaliforniaOilAndGasWellMonthlyProduction.csv") +prod_9094 <- read_csv("data/inputs/extraction/monthly-prod-inj-wells/CSV_1990_1994/CaliforniaOilAndGasWellMonthlyProduction.csv") +prod_9599 <- read_csv("data/inputs/extraction/monthly-prod-inj-wells/CSV_1995_1999/CaliforniaOilAndGasWellMonthlyProduction.csv") +prod_0004 <- read_csv("data/inputs/extraction/monthly-prod-inj-wells/CSV_2000_2004/CaliforniaOilAndGasWellMonthlyProduction.csv") +prod_0509 <- read_csv("data/inputs/extraction/monthly-prod-inj-wells/CSV_2005_2009/CaliforniaOilAndGasWellMonthlyProduction.csv") +prod_1514 <- read_csv("data/inputs/extraction/monthly-prod-inj-wells/CSV_2010_2014/CaliforniaOilAndGasWellMonthlyProduction.csv") +prod_15 <- read_csv("data/inputs/extraction/monthly-prod-inj-wells/CSV_2015/CaliforniaOilAndGasWellMonthlyProduction.csv") +prod_16 <- read_csv("data/inputs/extraction/monthly-prod-inj-wells/CSV_2016/CaliforniaOilAndGasWellMonthlyProduction.csv") +prod_17 <- read_csv("data/inputs/extraction/monthly-prod-inj-wells/CSV_2017/CaliforniaOilAndGasWellMonthlyProduction.csv") +prod_18 <- read_csv("data/inputs/extraction/monthly-prod-inj-wells/CSV_2018/CaliforniaOilAndGasWellMonthlyProduction.csv") +prod_19 <- read_csv("data/inputs/extraction/monthly-prod-inj-wells/CSV_2019/CaliforniaOilAndGasWellMonthlyProduction.csv") ## bind rows monthly_prod <- rbind(prod_7785, prod_8689, prod_9094, prod_9599, prod_0004, prod_0509, prod_1514, prod_15, prod_16, prod_17, prod_18, prod_19) -## county codes -ccodes <- read_csv(paste0(data_directory, "raw/prod/county_codes.csv")) %>% +## county codes - UPDATED - MP +ccodes <- read_csv("data/inputs/extraction/county_codes.csv") %>% rename(county_name = county, county = number) %>% - select(county_name, county) + as.data.frame() %>% # add this + dplyr::select(county_name, county) # add dplyr:: + ## well type code welltype_df <- tibble(WellTypeCode = c("AI", "DG", "GD", "GS", @@ -72,22 
+84,27 @@ all_prod <- monthly_prod %>% left_join(welltype_df) %>% mutate(well_type_name = ifelse(is.na(well_type_name), WellTypeCode, well_type_name)) -saveRDS(all_prod, file = paste0(data_directory, "processed/well_prod_m.rds")) + + + +# UPDATED - MP +saveRDS(all_prod, file = "data/processed/well_prod_m.rds") ## injection data +# UPDATE - should work once data is inputted into all the CSV folders -Done!(haejin) ## ------------------------------ -inj_7785 <- read_csv(paste0(data_directory, "raw/hist_well/CSV_1977_1985/CaliforniaOilAndGasWellMonthlyInjection.csv")) -inj_8689 <- read_csv(paste0(data_directory, "raw/hist_well/CSV_1986_1989/CaliforniaOilAndGasWellMonthlyInjection.csv")) -inj_9094 <- read_csv(paste0(data_directory, "raw/hist_well/CSV_1990_1994/CaliforniaOilAndGasWellMonthlyInjection.csv")) -inj_9599 <- read_csv(paste0(data_directory, "raw/hist_well/CSV_1995_1999/CaliforniaOilAndGasWellMonthlyInjection.csv")) -inj_0004 <- read_csv(paste0(data_directory, "raw/hist_well/CSV_2000_2004/CaliforniaOilAndGasWellMonthlyInjection.csv")) -inj_0509 <- read_csv(paste0(data_directory, "raw/hist_well/CSV_2005_2009/CaliforniaOilAndGasWellMonthlyInjection.csv")) -inj_1514 <- read_csv(paste0(data_directory, "raw/hist_well/CSV_2010_2014/CaliforniaOilAndGasWellMonthlyInjection.csv")) -inj_15 <- read_csv(paste0(data_directory, "raw/hist_well/CSV_2015/CaliforniaOilAndGasWellMonthlyInjection.csv")) -inj_16 <- read_csv(paste0(data_directory, "raw/hist_well/CSV_2016/CaliforniaOilAndGasWellMonthlyInjection.csv")) -inj_17 <- read_csv(paste0(data_directory, "raw/hist_well/CSV_2017/CaliforniaOilAndGasWellMonthlyInjection.csv")) -inj_18 <- read_csv(paste0(data_directory, "raw/hist_well/CSV_2018/CaliforniaOilAndGasWellMonthlyInjection.csv")) -inj_19 <- read_csv(paste0(data_directory, "raw/hist_well/CSV_2019/CaliforniaOilAndGasWellMonthlyInjection.csv")) +inj_7785 <- read_csv("data/inputs/extraction/monthly-prod-inj-wells/CSV_1977_1985/CaliforniaOilAndGasWellMonthlyInjection.csv") 
+inj_8689 <- read_csv("data/inputs/extraction/monthly-prod-inj-wells/CSV_1986_1989/CaliforniaOilAndGasWellMonthlyInjection.csv") +inj_9094 <- read_csv("data/inputs/extraction/monthly-prod-inj-wells/CSV_1990_1994/CaliforniaOilAndGasWellMonthlyInjection.csv") +inj_9599 <- read_csv("data/inputs/extraction/monthly-prod-inj-wells/CSV_1995_1999/CaliforniaOilAndGasWellMonthlyInjection.csv") +inj_0004 <- read_csv("data/inputs/extraction/monthly-prod-inj-wells/CSV_2000_2004/CaliforniaOilAndGasWellMonthlyInjection.csv") +inj_0509 <- read_csv("data/inputs/extraction/monthly-prod-inj-wells/CSV_2005_2009/CaliforniaOilAndGasWellMonthlyInjection.csv") +inj_1514 <- read_csv("data/inputs/extraction/monthly-prod-inj-wells/CSV_2010_2014/CaliforniaOilAndGasWellMonthlyInjection.csv") +inj_15 <- read_csv("data/inputs/extraction/monthly-prod-inj-wells/CSV_2015/CaliforniaOilAndGasWellMonthlyInjection.csv") +inj_16 <- read_csv("data/inputs/extraction/monthly-prod-inj-wells/CSV_2016/CaliforniaOilAndGasWellMonthlyInjection.csv") +inj_17 <- read_csv("data/inputs/extraction/monthly-prod-inj-wells/CSV_2017/CaliforniaOilAndGasWellMonthlyInjection.csv") +inj_18 <- read_csv("data/inputs/extraction/monthly-prod-inj-wells/CSV_2018/CaliforniaOilAndGasWellMonthlyInjection.csv") +inj_19 <- read_csv("data/inputs/extraction/monthly-prod-inj-wells/CSV_2019/CaliforniaOilAndGasWellMonthlyInjection.csv") ## bind rows monthly_inj <- rbind(inj_7785, inj_8689, inj_9094, inj_9599, inj_0004, inj_0509, inj_1514, inj_15, inj_16, inj_17, @@ -98,26 +115,31 @@ all_inject <- monthly_inj %>% mutate(county = as.numeric(str_sub(APINumber, 3, 5)), year = year(InjectionDate), month = month(InjectionDate)) %>% - left_join(ccodes) %>% + left_join(ccodes) %>% left_join(welltype_df) %>% mutate(well_type_name = ifelse(is.na(well_type_name), WellTypeCode, well_type_name)) -saveRDS(all_inject, file = paste0(data_directory, "processed/well_inject_m.rds")) +## missing data_directory -- Haejin +data_directory <- 
"/capstone/freshcair/meds-freshcair-capstone/data/" + + +saveRDS(all_inject, file = paste0(data_directory, "processed/well_inject_m.rds")) # have a error message : Error: Status code 401 returned by RStudio Server when executing 'console_input' ## well data +# UPDATE - should work once data is inputted into all the CSV folders ## ----------------------------- -wells_7785 <- read_csv(paste0(data_directory, "raw/hist_well/CSV_1977_1985/CaliforniaOilAndGasWells.csv")) -wells_8689 <- read_csv(paste0(data_directory, "raw/hist_well/CSV_1986_1989/CaliforniaOilAndGasWells.csv")) -wells_9094 <- read_csv(paste0(data_directory, "raw/hist_well/CSV_1990_1994/CaliforniaOilAndGasWells.csv")) -wells_9599 <- read_csv(paste0(data_directory, "raw/hist_well/CSV_1995_1999/CaliforniaOilAndGasWells.csv")) -wells_0004 <- read_csv(paste0(data_directory, "raw/hist_well/CSV_2000_2004/CaliforniaOilAndGasWells.csv")) -wells_0509 <- read_csv(paste0(data_directory, "raw/hist_well/CSV_2005_2009/CaliforniaOilAndGasWells.csv")) -wells_1014 <- read_csv(paste0(data_directory, "raw/hist_well/CSV_2010_2014/CaliforniaOilAndGasWells.csv")) -wells_15 <- read_csv(paste0(data_directory, "raw/hist_well/CSV_2015/CaliforniaOilAndGasWells.csv")) -wells_16 <- read_csv(paste0(data_directory, "raw/hist_well/CSV_2016/CaliforniaOilAndGasWells.csv")) -wells_17 <- read_csv(paste0(data_directory, "raw/hist_well/CSV_2017/CaliforniaOilAndGasWells.csv")) -wells_18 <- read_csv(paste0(data_directory, "raw/hist_well/CSV_2018/CaliforniaOilAndGasWells.csv")) -wells_19 <- read_csv(paste0(data_directory, "raw/hist_well/CSV_2019/CaliforniaOilAndGasWells.csv")) +wells_7785 <- read_csv("data/inputs/extraction/monthly-prod-inj-wells/CSV_1977_1985/CaliforniaOilAndGasWells.csv") +wells_8689 <- read_csv("data/inputs/extraction/monthly-prod-inj-wells/CSV_1986_1989/CaliforniaOilAndGasWells.csv") +wells_9094 <- read_csv("data/inputs/extraction/monthly-prod-inj-wells/CSV_1990_1994/CaliforniaOilAndGasWells.csv") +wells_9599 <- 
read_csv("data/inputs/extraction/monthly-prod-inj-wells/CSV_1995_1999/CaliforniaOilAndGasWells.csv") +wells_0004 <- read_csv("data/inputs/extraction/monthly-prod-inj-wells/CSV_2000_2004/CaliforniaOilAndGasWells.csv") +wells_0509 <- read_csv("data/inputs/extraction/monthly-prod-inj-wells/CSV_2005_2009/CaliforniaOilAndGasWells.csv") +wells_1014 <- read_csv("data/inputs/extraction/monthly-prod-inj-wells/CSV_2010_2014/CaliforniaOilAndGasWells.csv") +wells_15 <- read_csv("data/inputs/extraction/monthly-prod-inj-wells/CSV_2015/CaliforniaOilAndGasWells.csv") +wells_16 <- read_csv("data/inputs/extraction/monthly-prod-inj-wells/CSV_2016/CaliforniaOilAndGasWells.csv") +wells_17 <- read_csv("data/inputs/extraction/monthly-prod-inj-wells/CSV_2017/CaliforniaOilAndGasWells.csv") +wells_18 <- read_csv("data/inputs/extraction/monthly-prod-inj-wells/CSV_2018/CaliforniaOilAndGasWells.csv") +wells_19 <- read_csv("data/inputs/extraction/monthly-prod-inj-wells/CSV_2019/CaliforniaOilAndGasWells.csv") ## figure this ish out @@ -209,7 +231,7 @@ wells2 <- wells_19 %>% # test2 <- str_replace_all(test2, pattern = "Sterling, East " %R% OPEN_PAREN %R% "ABD" %R% CLOSE_PAREN, "Sterling East ABD") # test2 <- str_replace_all(test2, pattern = "Compton Landing, S., Gas " %R% OPEN_PAREN %R% "ABD" %R% CLOSE_PAREN, "Compton Landing S. Gas ABD") -fix_2019 <- readLines(paste0(data_directory, "raw/hist_well/CSV_2019/CaliforniaOilAndGasWells.csv")) +fix_2019 <- readLines(paste0(data_directory, "inputs/extraction/monthly-prod-inj-wells/CSV_2019/CaliforniaOilAndGasWells.csv")) # update by Haejin fix_20192<- str_replace_all(fix_2019, pattern = "8-9B INT, Sec. 32", "8-9B INT Sec. 32") @@ -220,8 +242,8 @@ fix_20192<- str_replace_all(fix_2019, pattern = "8-9B INT, Sec. 
32", "8-9B INT S #040212008000 #040112009400 -writeLines(fix_20192, paste0(data_directory, "processed/wells_19.csv")) +writeLines(fix_20192, paste0(data_directory, "processed/wells_19.csv")) # update by Haejin -wells_2019 <- read_csv(paste0(data_directory, "processed/parseprobs/wells_19.csv")) +wells_2019 <- read_csv(paste0(data_directory, "processed/wells_19.csv")) diff --git a/energy/data-processing-prep/extraction/income_data.R b/energy/data-processing-prep/extraction/income_data.R index 27ee4eec..3cb8ceed 100644 --- a/energy/data-processing-prep/extraction/income_data.R +++ b/energy/data-processing-prep/extraction/income_data.R @@ -1,20 +1,21 @@ ## Tracey Mangin ## October 19, 2021 ## Census data +## revised 02/13/2024 - Haejin library(censusapi) library(tidycensus) library(tidyverse) library(data.table) -main_path <- '/Volumes/GoogleDrive/Shared drives/emlab/projects/current-projects/calepa-cn/' +main_path <- '/capstone/freshcair/meds-freshcair-capstone/' mycensuskey <- "ae05491f7dfe185b0af5b9d56f1287b4c2c78eca" -# apis <- listCensusApis() +#apis <- listCensusApis() # View(apis) # -# availablevars <- listCensusMetadata(name="cps/asec/mar", vintage = 2021) +availablevars <- listCensusMetadata(name="cps/asec/mar", vintage = 2021) # View(availablevars) # # @@ -44,7 +45,7 @@ income <- get_acs(state = "CA", geography = "tract", income <- income %>% mutate(source = "2015-2019 5-year ACS, 2019 dollars") -fwrite(income, paste0(main_path, "data/Census/ca-median-house-income.csv")) +fwrite(income, paste0(main_path, "data/inputs/gis/census-tract/ca-median-house-income.csv")) ## repeat for county @@ -54,7 +55,7 @@ county_income <- get_acs(state = "CA", geography = "county", county_income <- county_income %>% mutate(county = str_remove(NAME, " County, California"), source = "2015-2019 5-year ACS, 2019 dollars") %>% - select(county, variable, estimate, moe, source) + dplyr::select(county, variable, estimate, moe, source) # add dplyr - haejin -fwrite(county_income, 
paste0(main_path, "data/Census/ca-median-house-income-county.csv")) +fwrite(county_income, paste0(main_path, "data/inputs/gis/census-tract/ca-median-house-income-county.csv")) diff --git a/energy/data-processing-prep/extraction/opgee-carb-results.R b/energy/data-processing-prep/extraction/opgee-carb-results.R index a343dd95..890d91a0 100644 --- a/energy/data-processing-prep/extraction/opgee-carb-results.R +++ b/energy/data-processing-prep/extraction/opgee-carb-results.R @@ -4,9 +4,9 @@ # ------------------------------------------- INPUTS ----------------------------------- - data_dir = '/Volumes/GoogleDrive/Shared drives/emlab/projects/current-projects/calepa-cn/data/OPGEE/' - opgee_fil = 'OPGEE_v2.0_with-CARB-inputs.xlsm' - names_fil = 'opgee_field_names.csv' + data_dir = '/capstone/freshcair/meds-freshcair-capstone/data/inputs/' + opgee_fil = 'OPGEE_v2.0_with-CARB-inputs.xlsm' ## no file here - haejin + names_fil = 'opgee_field_names.csv' ## no file here - haejin # ------------------------------------------- MAIN ----------------------------------- diff --git a/energy/data-processing-prep/extraction/process-monthly-inj.R b/energy/data-processing-prep/extraction/process-monthly-inj.R index c85d2c0d..d0fdc0b0 100644 --- a/energy/data-processing-prep/extraction/process-monthly-inj.R +++ b/energy/data-processing-prep/extraction/process-monthly-inj.R @@ -1,10 +1,11 @@ ## Tracey Mangin ## April 29, 2021 ## process well injection, save for later use through out +# revised: feb 14, 2024 by Haejin # ------------------------------------------- INPUTS ----------------------------------- -data_dir <- "/Volumes/GoogleDrive/Shared\ drives/emlab/projects/current-projects/calepa-cn/data/stocks-flows/processed/" +data_dir <- "/capstone/freshcair/meds-freshcair-capstone/data/processed/" minj_fil <- "well_inject_m.rds" wells_19_fil <- "wells_19.csv" diff --git a/energy/data-processing-prep/extraction/process-monthly-prod.R 
b/energy/data-processing-prep/extraction/process-monthly-prod.R index da834805..b4b92ae1 100644 --- a/energy/data-processing-prep/extraction/process-monthly-prod.R +++ b/energy/data-processing-prep/extraction/process-monthly-prod.R @@ -1,10 +1,11 @@ ## Tracey Mangin ## March 10, 2021 ## process well production, save for later use through out +## revised: feb 14, 2024 -haejin # ------------------------------------------- INPUTS ----------------------------------- -data_dir <- "/Volumes/GoogleDrive/Shared\ drives/emlab/projects/current-projects/calepa-cn/data/stocks-flows/processed/" +data_dir <- "/capstone/freshcair/meds-freshcair-capstone/data/processed/" mprod_fil <- "well_prod_m.rds" wells_19_fil <- "wells_19.csv" diff --git a/energy/data-processing-prep/extraction/rystad_processing.R b/energy/data-processing-prep/extraction/rystad_processing.R index c077eec4..f5a7ef9c 100644 --- a/energy/data-processing-prep/extraction/rystad_processing.R +++ b/energy/data-processing-prep/extraction/rystad_processing.R @@ -2,29 +2,32 @@ ## July 15, 2020 ## Script for cleaning Rystad data +# revised: Feb 14, 2024 - Haejin + ## load libraries library(tidyverse) library(rebus) library(stringi) library(data.table) +library(dplyr) ## file paths -rystad_path <- "/Volumes/GoogleDrive/Shared\ drives/emlab/projects/current-projects/calepa-cn/data/Rystad/data/" -data_directory <- "/Volumes/GoogleDrive/Shared\ drives/emlab/projects/current-projects/calepa-cn/data/stocks-flows/processed/" +rystad_path <- "/capstone/freshcair/meds-freshcair-capstone/data/" +data_directory <- "/capstone/freshcair/meds-freshcair-capstone/data/processed/" ## laod data -# economics_df <- read_csv(paste0(rystad_path, "raw/archive/ca_asset_opex_apex_govtt.csv")) -economics_df_update <- read_csv(paste0(rystad_path, "raw/asset_opex_capex_govtt.csv")) -production_df <- read_csv(paste0(rystad_path, "raw/ca_production.csv")) -econ_cats <- read_csv(paste0(rystad_path, "raw/asset_econ_categories.csv")) -err_df <- 
fread(paste0(rystad_path, "raw/resources_prod_myprod.csv"), skip = 1) -rystad_capex_recov_bbl <- read_csv(paste0(rystad_path, "raw/capex_per_recoverable_bbl.csv")) -api_asset <- read.csv(paste0(rystad_path, "raw/asset-wells.csv")) -rystad_capex_bbl_nom <- read_csv(paste0(rystad_path, "raw/capex_per_bbl_nom.csv")) -rystad_opex_bbl_nom <- read_csv(paste0(rystad_path, "raw/opex_per_bbl_nom.csv")) -asset_rename <- read_csv(paste0(rystad_path, "processed/rystad_asset_rename.csv")) -well_cost_eur <- read.csv(paste0(rystad_path, "raw/well_cost_per_eur.csv")) -field_asset <- read_csv(paste0(rystad_path, "raw/field_to_asset.csv")) +#economics_df <- read_csv(paste0(rystad_path, "proprietery-data/ca_asset_opex_apex_govtt.csv")) +economics_df_update <- read_csv(paste0(rystad_path, "proprietery-data/asset_opex_capex_govtt.csv")) +production_df <- read_csv(paste0(rystad_path, "proprietery-data/ca_production.csv")) +econ_cats <- read_csv(paste0(rystad_path, "proprietery-data/asset_econ_categories.csv")) +err_df <- fread(paste0(rystad_path, "proprietery-data/resources_prod_myprod.csv"), skip = 1) +rystad_capex_recov_bbl <- read_csv(paste0(rystad_path, "proprietery-data/capex_per_recoverable_bbl.csv")) +api_asset <- read.csv(paste0(rystad_path, "proprietery-data/asset-wells.csv")) +rystad_capex_bbl_nom <- read_csv(paste0(rystad_path, "proprietery-data/capex_per_bbl_nom.csv")) +rystad_opex_bbl_nom <- read_csv(paste0(rystad_path, "proprietery-data/opex_per_bbl_nom.csv")) +asset_rename <- read_csv(paste0(rystad_path, "proprietery-data/rystad_asset_rename.csv")) +well_cost_eur <- read.csv(paste0(rystad_path, "proprietery-data/well_cost_per_eur.csv")) +field_asset <- read_csv(paste0(rystad_path, "proprietery-data/field_to_asset.csv")) ## well prod @@ -34,24 +37,24 @@ well_prod <- fread(paste0(data_directory, prod_file), colClasses = c('api_ten_di 'doc_field_code' = 'character')) -## economics +## economics --- not using - by Haejin ##----------------------------------------- ## clean 
data -economics_df2 <- janitor::clean_names(economics_df) %>% - select(-data_values) %>% - rename(usd_nom = sum) %>% - mutate(usd_nom = usd_nom * 1e6, - string_end = str_detect(asset, pattern = ", US" %R% END), - ca_string = str_detect(asset, pattern = "_CA_"), - clean_asset = str_remove(asset, pattern = ", US" %R% END), - location = sub("_CA.*", "", clean_asset), - location = ifelse(ca_string == FALSE, clean_asset, location), - location = ifelse(str_detect(clean_asset, pattern = "_United States") == TRUE, "United States", location), - company = sub(".*_CA_", "", clean_asset), - company = ifelse(str_detect(clean_asset, pattern = "_United States") == TRUE, sub("_United States*.", "", clean_asset), company), - company = ifelse(company == location, NA, company)) %>% - select(original_asset_name = asset, location, company, economics_group, year, usd_nom) +#economics_df2 <- janitor::clean_names(economics_df) %>% +# dplyr::select(-data_values) %>% +# rename(usd_nom = sum) %>% +# mutate(usd_nom = usd_nom * 1e6, +# string_end = str_detect(asset, pattern = ", US" %R% END), +# ca_string = str_detect(asset, pattern = "_CA_"), +# clean_asset = str_remove(asset, pattern = ", US" %R% END), +# location = sub("_CA.*", "", clean_asset), +# location = ifelse(ca_string == FALSE, clean_asset, location), +# location = ifelse(str_detect(clean_asset, pattern = "_United States") == TRUE, "United States", location), +# company = sub(".*_CA_", "", clean_asset), +# company = ifelse(str_detect(clean_asset, pattern = "_United States") == TRUE, sub("_United States*.", "", clean_asset), company), +# company = ifelse(company == location, NA, company)) %>% +# dplyr::select(original_asset_name = asset, location, company, economics_group, year, usd_nom) # ## add field names # fieldnames2 <- fieldnames %>% @@ -64,7 +67,7 @@ economics_df2 <- janitor::clean_names(economics_df) %>% -write_csv(economics_df2, paste0(rystad_path, "processed/ca_asset_opex_capex_govtt_clean.csv")) +#write_csv(economics_df2, 
paste0(rystad_path, "processed/ca_asset_opex_capex_govtt_clean.csv")) ## updated economics variables (only oil and condensate operations included) @@ -84,7 +87,7 @@ economics_up_df2 <- janitor::clean_names(economics_df_update) %>% company = sub(".*_CA_", "", clean_asset), company = ifelse(str_detect(clean_asset, pattern = "_United States") == TRUE, sub("_United States*.", "", clean_asset), company), company = ifelse(company == location, NA, company)) %>% - select(original_asset_name = asset, location, company, economics_group, year, usd_nom) + dplyr::select(original_asset_name = asset, location, company, economics_group, year, usd_nom) write_csv(economics_up_df2, paste0(rystad_path, "processed/oil_asset_opex_capex_govtt_clean.csv")) @@ -93,7 +96,7 @@ write_csv(economics_up_df2, paste0(rystad_path, "processed/oil_asset_opex_capex_ ##----------------------------------------- prod_df2 <- janitor::clean_names(production_df) %>% - select(-data_values) %>% + dplyr::select(-data_values) %>% rename(bbls = sum) %>% mutate(bbls = bbls * 1e6, string_end = str_detect(asset, pattern = ", US" %R% END), @@ -105,7 +108,7 @@ prod_df2 <- janitor::clean_names(production_df) %>% company = sub(".*_CA_", "", clean_asset), company = ifelse(str_detect(clean_asset, pattern = "_United States") == TRUE, sub("_United States*.", "", clean_asset), company), company = ifelse(company == location, NA, company)) %>% - select(original_asset_name = asset, location, company, data_source, data_type, oil_and_gas_category, year, bbls) + dplyr::select(original_asset_name = asset, location, company, data_source, data_type, oil_and_gas_category, year, bbls) write_csv(prod_df2, paste0(rystad_path, "processed/ca_oil_production.csv")) @@ -123,15 +126,15 @@ prod_df2 %>% ## econ categories ## ---------------------------- econ_cats2 <- econ_cats %>% - filter(!is.na(X1)) %>% - filter(X1 != "Economics Group") %>% - rename(econ_group = X1, - econ_cat = X2, + dplyr::filter(!is.na(...1)) %>% # revise X1 to ...1 + 
dplyr::filter(...1 != "Economics Group") %>% # revise X1 to ...1 - by Haejin + rename(econ_group = ...1, # revise X1 to ...1 - by Haejin + econ_cat = ...2, # revise X2 to ...2 original_asset_name = Year) %>% pivot_longer(`1970`:`2050`, names_to = "year", values_to = "million_usd") %>% mutate(year = as.numeric(year), usd = as.numeric(million_usd) * 1e6) %>% - select(-million_usd) + dplyr::select(-million_usd) write_csv(econ_cats2, paste0(rystad_path, "processed/asset_economics_cats.csv")) @@ -157,9 +160,9 @@ fwrite(err_df2_wide, paste0(rystad_path, "processed/economically_recoverable_res ## ------------------------------------ capex_recov_bbl2 <- rystad_capex_recov_bbl %>% - filter(!is.na(X1)) %>% - filter(X1 != "Asset") %>% - rename(original_asset_name = X1, + dplyr::filter(!is.na(...1)) %>% # revise X1 to ...1 and add dplyr - by Haejin + dplyr::filter(...1 != "Asset") %>% # revise X1 to ...1 - by Haejin + rename(original_asset_name = ...1, # revise X1 to ...1 - by Haejin economics_group = Year) %>% pivot_longer(`1970`:`2050`, names_to = "year", values_to = "usd_per_bbl") %>% mutate(year = as.numeric(year)) %>% @@ -183,7 +186,7 @@ api_asset2 <- api_asset %>% ## how many have more than one asset? 
n_asset <- api_asset2 %>% - select(APINumber, asset_name) %>% + dplyr::select(APINumber, asset_name) %>% group_by(APINumber) %>% mutate(n = n()) %>% ungroup() @@ -196,7 +199,7 @@ diff_df <- api_asset2 %>% ## clean asset names api_asset3 <- api_asset2 %>% left_join(asset_rename) %>% - select(-asset_name) + dplyr::select(-asset_name) # write_csv(api_asset3, paste0(rystad_path, "processed/rystad_asset_apis.csv")) @@ -212,7 +215,7 @@ api_asset4 <- api_asset3 %>% api_adj = ifelse(api_n < 12, paste0(api_adj, "00000"), APINumber), api_adj2 = ifelse(nchar(api_adj) > 12 & state == "04", str_sub(api_adj, 1, 12), api_adj)) %>% rename(orig_api = APINumber) %>% - select(APINumber = api_adj2, original_asset_name) %>% + dplyr::select(APINumber = api_adj2, original_asset_name) %>% mutate(api_ten_digit = str_sub(APINumber, 1, 10)) ## unique api_ten_digit - field combos @@ -264,7 +267,7 @@ field_n_asset <- api_asset_match_rev %>% mutate(rel_field = n_wells_asset / n_wells_field, rel_prod = bbl_prod / field_prod) -anti_join(api_asset4 %>% select(original_asset_name) %>% unique(), field_n_asset %>% select(original_asset_name) %>% unique()) +anti_join(api_asset4 %>% dplyr::select(original_asset_name) %>% unique(), field_n_asset %>% dplyr::select(original_asset_name) %>% unique()) write_csv(field_n_asset, paste0(rystad_path, "processed/field_rystad_match_apis_revised.csv")) @@ -274,15 +277,15 @@ rystad_capex_bbl_nom2 <- rystad_capex_bbl_nom %>% pivot_longer(`1900`:`2099`, names_to = "year", values_to = "capex_per_bbl_nom") %>% mutate(year = as.numeric(year), capex_per_bbl_nom = as.numeric(capex_per_bbl_nom)) %>% - select(-`Economics Group`) + dplyr::select(-`Economics Group`) write_csv(rystad_capex_bbl_nom2, paste0(rystad_path, "processed/rystad_capex_bbl_nom_clean.csv")) ## opex per bbl nominal rystad_opex_bbl_nom2 <- rystad_opex_bbl_nom[3:nrow(rystad_opex_bbl_nom),] %>% - select(-Year) %>% - rename(original_asset_name = X1) %>% + dplyr::select(-Year) %>% + rename(original_asset_name = 
...1) %>% # revise X1 to ...1 - by Haejin pivot_longer(`1970`:`2050`, names_to = "year", values_to = "opex_per_bbl_nom") %>% mutate(year = as.numeric(year), opex_per_bbl_nom = as.numeric(opex_per_bbl_nom)) @@ -300,7 +303,7 @@ well_cost_eur2 <- well_cost_eur %>% mutate(APInumber = as.character(str_remove_all(APInumber, pattern = "-"))) %>% mutate(both_na = ifelse(is.na(X) & is.na(well_cost_eurusd_per_bbl), 1, 0)) %>% filter(both_na != 1) %>% - select(-X, -both_na) + dplyr::select(-X, -both_na) write_csv(well_cost_eur2, paste0(rystad_path, "processed/well_cost_per_eur_clean.csv")) @@ -312,7 +315,7 @@ write_csv(well_cost_eur2, paste0(rystad_path, "processed/well_cost_per_eur_clean field_asset2 <- field_asset %>% rename(days_prod = `Days on Production (Days)`) %>% filter(!is.na(Asset)) %>% - select(-days_prod) %>% + dplyr::select(-days_prod) %>% unique() write_csv(field_asset2, paste0(rystad_path, "processed/rystad_field_asset.csv")) diff --git a/energy/data-processing-prep/extraction/zero_prod.R b/energy/data-processing-prep/extraction/zero_prod.R index 0b13b61e..7e5fb2d8 100644 --- a/energy/data-processing-prep/extraction/zero_prod.R +++ b/energy/data-processing-prep/extraction/zero_prod.R @@ -1,6 +1,7 @@ ## Tracey Mangin ## April 22, 2021 ## zero production investigation +# revised: Feb 14, 2024 - Haejin library(tidyverse) library(data.table) @@ -9,15 +10,15 @@ library(zoo) library(readxl) library(openxlsx) -## set directory -proj_dir <- "/Volumes/GoogleDrive/Shared\ drives/emlab/projects/current-projects/calepa-cn/" -raw_dir <- "/Volumes/GoogleDrive/Shared\ drives/emlab/projects/current-projects/calepa-cn/data/stocks-flows/raw/" -data_directory <- "/Volumes/GoogleDrive/Shared\ drives/emlab/projects/current-projects/calepa-cn/data/stocks-flows/processed/" -output_dir <- "outputs/exit/" +## set directory +proj_dir <- "/capstone/freshcair/meds-freshcair-capstone/data/" # revised file directory -Haejin +raw_dir <- 
"/capstone/freshcair/meds-freshcair-capstone/data/inputs/" # revised file directory -Haejin +data_directory <- "/capstone/freshcair/meds-freshcair-capstone/data/processed/" # revised file directory -Haejin +output_dir <- "proprietery-data/" ## files prod_file <- "well_prod_m_processed.csv" -well_file <- "AllWells_table/AllWells_20210427.csv" +well_file <- "extraction/AllWells_20210427.csv" # revised file directory -Haejin ## read in files well_prod <- fread(paste0(data_directory, prod_file), colClasses = c('api_ten_digit' = 'character', @@ -32,7 +33,7 @@ all_wells <- fread(paste0(raw_dir, well_file)) ## wells status status <- all_wells %>% mutate(api_ten_digit = paste0("0", API)) %>% - select(api_ten_digit, well_status = WellStatus) + dplyr::select(api_ten_digit, well_status = WellStatus) #revise dplyr - haejin ## find wells that produce at some point over time period pos_well_api_prod <- well_prod %>% @@ -40,7 +41,7 @@ pos_well_api_prod <- well_prod %>% summarise(oil_total = sum(OilorCondensateProduced, na.rm = T)) %>% ungroup() %>% mutate(pos_pro = ifelse(oil_total > 0, 1, 0)) %>% - filter(pos_pro == 1) + dplyr::filter(pos_pro == 1) #revise dplyr - haejin pos_api_vec <- pos_well_api_prod$api_ten_digit @@ -51,7 +52,7 @@ pos_api_field_prod <- well_prod %>% summarise(oil_total = sum(OilorCondensateProduced, na.rm = T)) %>% ungroup() %>% mutate(pos_pro = ifelse(oil_total > 0, 1, 0)) %>% - filter(pos_pro == 1) + dplyr::filter(pos_pro == 1) #revise dplyr - haejin pos_api_field_vec <- pos_api_field_prod$api_field_code @@ -213,7 +214,7 @@ filt_zero_prod <- function(n_month_val) { ## make data table out_df <- tmp_zero_prod %>% - select(api_ten_digit, zero_prod_months, well_status) %>% + dplyr::select(api_ten_digit, zero_prod_months, well_status) %>% # add dplyr - haejin mutate(year_cut_off = n_month_val / 12) } diff --git a/energy/data-processing-prep/social_cost_carbon.R b/energy/data-processing-prep/social_cost_carbon.R index fbd1fe46..04ad42e7 100644 --- 
a/energy/data-processing-prep/social_cost_carbon.R +++ b/energy/data-processing-prep/social_cost_carbon.R @@ -2,10 +2,12 @@ ## November 16, 2021 ## create social cost of carbon df +## revised: feb 16 2024 by Haejin + ## libraries library(tidyverse) library(data.table) -library(blscrapeR) +#library(blscrapeR) # library(tabulizer) library(zoo) library(openxlsx) @@ -13,13 +15,13 @@ library(readxl) ## path -main_path <- '/Volumes/GoogleDrive/Shared drives/emlab/projects/current-projects/calepa-cn/' +main_path <- '/capstone/freshcair/meds-freshcair-capstone/' # revised file path ## CPI data carbon_px_file <- 'carbon_price_scenarios_revised.xlsx' ## read in carbon file -cpi_df <- setDT(read.xlsx(paste0(main_path, 'data/stocks-flows/processed/', carbon_px_file), sheet = 'BLS Data Series', startRow = 12)) +cpi_df <- setDT(read.xlsx(paste0(main_path, 'data/inputs/', carbon_px_file), sheet = 'BLS Data Series', startRow = 12)) cpi_df <- cpi_df[Year %in% c(2019, 2020), .(Year, Annual)] @@ -66,5 +68,5 @@ scc_df[, social_cost_co2_19 := social_cost_co2 / cpi2020 * cpi2019] scc_df[, scc_ref := 'https://www.whitehouse.gov/wp-content/uploads/2021/02/TechnicalSupportDocument_SocialCostofCarbonMethaneNitrousOxide.pdf'] -fwrite(scc_df, file.path(main_path, 'data/stocks-flows/processed/social_cost_carbon.csv'), row.names = F) +fwrite(scc_df, file.path(main_path, 'data/processed/social_cost_carbon.csv'), row.names = F) diff --git a/energy/data-processing-prep/stocks_flows.R b/energy/data-processing-prep/stocks_flows.R index 9845e7a9..68542c80 100644 --- a/energy/data-processing-prep/stocks_flows.R +++ b/energy/data-processing-prep/stocks_flows.R @@ -1,11 +1,13 @@ ## Tracey Mangin ## February 20, 2020 ## Data cleaning: Focus areas 1 and 2 +# Updated 2/19/24 - MP -## set directory -data_directory <- "/Volumes/GoogleDrive/Shared\ drives/emlab/projects/current-projects/calepa-cn/data/stocks-flows/" -save_directory <- "/Volumes/GoogleDrive/Shared\ 
drives/emlab/projects/current-projects/calepa-cn/project-materials/focus-areas-1-2/" +# +setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) +setwd('/capstone/freshcair/meds-freshcair-capstone') # Sets directory based on Taylor structure +getwd(); data_directory <- "data/" # restore data_directory: its definition was removed above but it is still referenced below (read_csv/write_csv calls) ## attach libraries @@ -15,8 +17,8 @@ library(rebus) library(readxl) library(countrycode) library(rJava) -library(tabulizer) -library(tabulizerjars) +# library(tabulizer) +# library(tabulizerjars) library(lubridate) library(scales) library(openxlsx) @@ -26,11 +28,11 @@ library(openxlsx) ## --------------------------------------------------------------------------------------- ## read in data -port_imports <- read_csv(paste0(data_directory, "raw/Imports_of_Heavy_Sour_to_Los_Angeles_CA.csv"), skip = 4) ## first four rows mess up data +port_imports <- read_csv(paste0(data_directory, "inputs/Imports_of_Heavy_Sour_to_Los_Angeles_CA.csv"), skip = 4) ## first four rows mess up data port_imports <- clean_names(port_imports) ## get info from raw data -port_imports_info <- read_csv(paste0(data_directory, "raw/Imports_of_Heavy_Sour_to_Los_Angeles_CA.csv")) +port_imports_info <- read_csv(paste0(data_directory, "inputs/Imports_of_Heavy_Sour_to_Los_Angeles_CA.csv")) port_imports_info <- port_imports_info[1:3, 1] colnames(port_imports_info) <- c("info") @@ -67,105 +69,150 @@ write_csv(port_imports_clean, path = paste0(data_directory, "processed/crude_imp ## WTI monthly prices of crude +## Crude imports to CA ports by export country -- NOT NEEDED IN UPDATED MODEL - MP ## --------------------------------------------------------------------------------------- +# +# ## read in data +# port_imports <- read_csv(paste0(data_directory, "raw/Imports_of_Heavy_Sour_to_Los_Angeles_CA.csv"), skip = 4) ## first four rows mess up data +# port_imports <- clean_names(port_imports) +# +# ## get info from raw data +# port_imports_info <- read_csv(paste0(data_directory, "raw/Imports_of_Heavy_Sour_to_Los_Angeles_CA.csv")) +# port_imports_info <- 
port_imports_info[1:3, 1] +# colnames(port_imports_info) <- c("info") +# +# ## break out information +# oiltype <- c("Heavy Sweet", "Light Sour", "Light Sweet", "Medium", "Heavy Sour") +# +# port_imports_clean <- port_imports %>% +# rename(cats = span_style_float_right_thousand_barrels_span) %>% +# mutate(export_region = countrycode(cats, 'country.name', 'country.name'), +# export_region = ifelse(cats == "World", "World", export_region), +# port = str_detect(cats, pattern = ", CA" %R% END), +# port = ifelse(port == TRUE, cats, NA), +# oil = ifelse(cats %in% oiltype, cats, NA)) %>% +# select(cats, export_region, port, oil, jan_2009:nov_2019) %>% +# fill(export_region, .direction = 'down') %>% +# fill(port, .direction = 'down') %>% +# select(-cats) %>% +# filter(!is.na(oil)) %>% +# pivot_longer(jan_2009:nov_2019, names_to = "date", values_to = "barrels_thous") %>% +# mutate(barrels_thous = as.numeric(ifelse(barrels_thous == "--", 0, barrels_thous)), +# month_orig = str_extract(date, pattern = ANY_CHAR %R% ANY_CHAR %R% ANY_CHAR), +# month2 = paste0(toupper(substr(month_orig, 1, 1)), substr(month_orig, 2, nchar(month_orig))), +# month = match(month2, month.abb), +# year = as.numeric(str_extract(date, pattern = DIGIT %R% DIGIT %R% DIGIT %R% DIGIT))) %>% +# select(-month_orig, -month2) %>% +# mutate(source = port_imports_info$info[3], +# link = port_imports_info$info[1], +# download_date = port_imports_info$info[2]) %>% +# mutate(region_type = ifelse(export_region == "World", "world", "country")) %>% +# select(export_region, region_type, port:download_date) +# +# ## save clean file +# write_csv(port_imports_clean, path = paste0(data_directory, "processed/crude_imports_port.csv")) -## read in data -spt_price_m <- read_xls(paste0(data_directory, "raw/PET_PRI_SPT_S1_M.xls"), sheet = 2, skip = 2) -colnames(spt_price_m) <- c("date", "cushing_ok_wti_FOB", "europe_brent_FOB") - -spt_price_m2 <- spt_price_m %>% - pivot_longer(cushing_ok_wti_FOB:europe_brent_FOB, names_to = 
"price", values_to = "value") %>% - mutate(unit = "dollars_per_barrel", - description = ifelse(price == "cushing_ok_wti_FOB", "Cushing, OK WTI Spot Price FOB", "Europe Brent Spot Price FOB"), - product = "crude_oil") %>% - select(date, description, product, price, value, unit) %>% - mutate(source = "EIA", - url = "https://www.eia.gov/dnav/pet/pet_pri_spt_s1_m.htm") - -## save clean file -write_csv(spt_price_m2, path = paste0(data_directory, "processed/spot_price_wti_m.csv")) - -## WTI annual prices of crude -## --------------------------------------------------------------------------------------- - -## read in data -spt_price_a <- read_xls(paste0(data_directory, "raw/PET_PRI_SPT_S1_A.xls"), sheet = 2, skip = 2) -colnames(spt_price_a) <- c("date", "cushing_ok_wti_FOB", "europe_brent_FOB") - -spt_price_a2 <- spt_price_a %>% - pivot_longer(cushing_ok_wti_FOB:europe_brent_FOB, names_to = "price", values_to = "value") %>% - mutate(unit = "dollars_per_barrel", - description = ifelse(price == "cushing_ok_wti_FOB", "Cushing, OK WTI Spot Price FOB", "Europe Brent Spot Price FOB"), - product = "crude_oil") %>% - select(date, description, product, price, value, unit) %>% - mutate(source = "EIA", - url = "https://www.eia.gov/dnav/pet/pet_pri_spt_s1_a.htm") - -## save clean file -write_csv(spt_price_a2, file = paste0(data_directory, "processed/eia_spot_price_a.csv")) - -## Domestic Crude Oil First Purchase Prices for Selected Crude Streams -## --------------------------------------------------------------------------------------- - -## read in data -firstp_p_streams <- read_xls(paste0(data_directory, "raw/PET_PRI_DFP2_K_M.xls"), sheet = 2, skip = 2) -colnames(firstp_p_streams) <- c("date", "ak_ns", "ca_kr", "ca_ms", "hls", "lls", "mb", "wti", "wts", "ws") - -firstp_p_streams2 <- firstp_p_streams %>% - pivot_longer(ak_ns:ws, names_to = "crude_stream", values_to = "price") %>% - mutate(unit = "dollars_per_barrel", - description = ifelse(crude_stream == "ak_ns", "Alaska North 
Slope First Purchase Price", - ifelse(crude_stream == "ca_kr", "California Kern River First Purchase Price", - ifelse(crude_stream == "ca_ms", "California Midway-Sunset First Purchase Price", - ifelse(crude_stream == "hls", "Heavy Louisiana Sweet First Purchase Price", - ifelse(crude_stream == "lls", "Light Louisiana Sweet First Purchase Price", - ifelse(crude_stream == "mb", "Mars Blend First Purchase Price", - ifelse(crude_stream == "wti", "West Texas Intermediate First Purchase Price", - ifelse(crude_stream == "wts", "West Texas Sour First Purchase Price", "Wyoming Sweet First Purchase Price")))))))), - product = "crude_oil") %>% - select(date, description, product, crude_stream, price, unit) %>% - mutate(source = "EIA", - url = "https://www.eia.gov/dnav/pet/pet_pri_dfp2_k_m.htm") - -## save clean file -write_csv(firstp_p_streams2, path = paste0(data_directory, "processed/domestic_crude_first_p_price_streams.csv")) - - -## Crude oil productoin in California -## --------------------------------------------------------------------------------------- -## read in data -crude_prod_ca <- read_xls(paste0(data_directory, "raw/MCRFPCA1m.xls"), sheet = 2, skip = 2) -colnames(crude_prod_ca) <- c("date", "crude_prod_ca_thous_b") +# ## WTI monthly prices of crude -- NOT NEEDED IN UPDATED MODEL - MP +# ## --------------------------------------------------------------------------------------- +# +# ## read in data +# spt_price_m <- read_xls(paste0(data_directory, "raw/PET_PRI_SPT_S1_M.xls"), sheet = 2, skip = 2) +# colnames(spt_price_m) <- c("date", "cushing_ok_wti_FOB", "europe_brent_FOB") +# +# spt_price_m2 <- spt_price_m %>% +# pivot_longer(cushing_ok_wti_FOB:europe_brent_FOB, names_to = "price", values_to = "value") %>% +# mutate(unit = "dollars_per_barrel", +# description = ifelse(price == "cushing_ok_wti_FOB", "Cushing, OK WTI Spot Price FOB", "Europe Brent Spot Price FOB"), +# product = "crude_oil") %>% +# select(date, description, product, price, value, unit) %>% +# 
mutate(source = "EIA", +# url = "https://www.eia.gov/dnav/pet/pet_pri_spt_s1_m.htm") +# +# ## save clean file +# write_csv(spt_price_m2, path = paste0(data_directory, "processed/spot_price_wti_m.csv")) -crude_prod_ca2 <- crude_prod_ca %>% - mutate(description = "California Field Production of Crude Oil", - source = "EIA", - url = "https://www.eia.gov/dnav/pet/hist/LeafHandler.ashx?n=pet&s=mcrfpca1&f=m") +# ## WTI annual prices of crude - NOT NEEDED FOR UPDATED MODEL - MP +# ## --------------------------------------------------------------------------------------- +# +# ## read in data +# spt_price_a <- read_xls(paste0(data_directory, "raw/PET_PRI_SPT_S1_A.xls"), sheet = 2, skip = 2) +# colnames(spt_price_a) <- c("date", "cushing_ok_wti_FOB", "europe_brent_FOB") +# +# spt_price_a2 <- spt_price_a %>% +# pivot_longer(cushing_ok_wti_FOB:europe_brent_FOB, names_to = "price", values_to = "value") %>% +# mutate(unit = "dollars_per_barrel", +# description = ifelse(price == "cushing_ok_wti_FOB", "Cushing, OK WTI Spot Price FOB", "Europe Brent Spot Price FOB"), +# product = "crude_oil") %>% +# select(date, description, product, price, value, unit) %>% +# mutate(source = "EIA", +# url = "https://www.eia.gov/dnav/pet/pet_pri_spt_s1_a.htm") +# +# ## save clean file +# write_csv(spt_price_a2, file = paste0(data_directory, "processed/eia_spot_price_a.csv")) -## save clean file -write_csv(crude_prod_ca2, path = paste0(data_directory, "processed/ca_crude_prod_m.csv")) +# ## Domestic Crude Oil First Purchase Prices for Selected Crude Streams - NOT NEEDED FOR UPDATED MODEL - MP +# ## --------------------------------------------------------------------------------------- +# +# ## read in data +# firstp_p_streams <- read_xls(paste0(data_directory, "raw/PET_PRI_DFP2_K_M.xls"), sheet = 2, skip = 2) +# colnames(firstp_p_streams) <- c("date", "ak_ns", "ca_kr", "ca_ms", "hls", "lls", "mb", "wti", "wts", "ws") +# +# firstp_p_streams2 <- firstp_p_streams %>% +# pivot_longer(ak_ns:ws, names_to 
= "crude_stream", values_to = "price") %>% +# mutate(unit = "dollars_per_barrel", +# description = ifelse(crude_stream == "ak_ns", "Alaska North Slope First Purchase Price", +# ifelse(crude_stream == "ca_kr", "California Kern River First Purchase Price", +# ifelse(crude_stream == "ca_ms", "California Midway-Sunset First Purchase Price", +# ifelse(crude_stream == "hls", "Heavy Louisiana Sweet First Purchase Price", +# ifelse(crude_stream == "lls", "Light Louisiana Sweet First Purchase Price", +# ifelse(crude_stream == "mb", "Mars Blend First Purchase Price", +# ifelse(crude_stream == "wti", "West Texas Intermediate First Purchase Price", +# ifelse(crude_stream == "wts", "West Texas Sour First Purchase Price", "Wyoming Sweet First Purchase Price")))))))), +# product = "crude_oil") %>% +# select(date, description, product, crude_stream, price, unit) %>% +# mutate(source = "EIA", +# url = "https://www.eia.gov/dnav/pet/pet_pri_dfp2_k_m.htm") +# +# ## save clean file +# write_csv(firstp_p_streams2, path = paste0(data_directory, "processed/domestic_crude_first_p_price_streams.csv")) -## Emissions by Facility -## --------------------------------------------------------------------------------------- -emissions_fac <- read_csv(paste0(data_directory, "raw/EmissionsByFacility.csv")) -emissions_fac <- clean_names(emissions_fac) -# ## how many refineries in 2017? 
-# refin_2017 <- emissions_fac %>% -# filter(year == 2017, -# naics_code == 324110) +# ## Crude oil production in California -- NOT NEEDED FOR FINAL MODEL +# ## --------------------------------------------------------------------------------------- +# +# ## read in data +# crude_prod_ca <- read_xls(paste0(data_directory, "raw/MCRFPCA1m.xls"), sheet = 2, skip = 2) +# colnames(crude_prod_ca) <- c("date", "crude_prod_ca_thous_b") # -# refin_2014 <- emissions_fac %>% -# filter(year == 2014, -# naics_code == 324110) +# crude_prod_ca2 <- crude_prod_ca %>% +# mutate(description = "California Field Production of Crude Oil", +# source = "EIA", +# url = "https://www.eia.gov/dnav/pet/hist/LeafHandler.ashx?n=pet&s=mcrfpca1&f=m") # -# refin_2013 <- emissions_fac %>% -# filter(year == 2013, -# naics_code == 324110) +# ## save clean file +# write_csv(crude_prod_ca2, path = paste0(data_directory, "processed/ca_crude_prod_m.csv")) -## save clean file -write_csv(emissions_fac, path = paste0(data_directory, "processed/emissions_by_facility.csv")) +# ## Emissions by Facility - NOT NEEDED FOR UPDATED MODEL +# ## --------------------------------------------------------------------------------------- +# emissions_fac <- read_csv(paste0(data_directory, "raw/EmissionsByFacility.csv")) +# emissions_fac <- clean_names(emissions_fac) +# +# # ## how many refineries in 2017? 
+# # refin_2017 <- emissions_fac %>% +# # filter(year == 2017, +# # naics_code == 324110) +# # +# # refin_2014 <- emissions_fac %>% +# # filter(year == 2014, +# # naics_code == 324110) +# # +# # refin_2013 <- emissions_fac %>% +# # filter(year == 2013, +# # naics_code == 324110) +# +# ## save clean file +# write_csv(emissions_fac, path = paste0(data_directory, "processed/emissions_by_facility.csv")) ## oil and gas production by county ## --------------------------------------------------------------------------------------- diff --git a/energy/extraction-segment/compile-outputs/compile_extraction_outputs_full.R b/energy/extraction-segment/compile-outputs/compile_extraction_outputs_full.R index c979401a..5c941d86 100644 --- a/energy/extraction-segment/compile-outputs/compile_extraction_outputs_full.R +++ b/energy/extraction-segment/compile-outputs/compile_extraction_outputs_full.R @@ -20,17 +20,17 @@ save_external <- 0 cur_date = Sys.Date() ## paths -main_path <- '/Volumes/GoogleDrive/Shared drives/emlab/projects/current-projects/calepa-cn/' -sp_data_path <- paste0(main_path, "data/GIS/raw/") +main_path <- '/capstone/freshcair/meds-freshcair-capstone/' +sp_data_path <- paste0(main_path, "data/input/gis/") ## UPDATE THESE WITH NEW RUNS!!!!! 
-extraction_folder_path <- 'outputs/predict-production/extraction_2022-11-15/' +extraction_folder_path <- 'outputs/processed' extraction_folder_name <- 'revision-setbacks/' -data_path <-'data/stocks-flows/processed/' +data_path <-'data/processed/' ## health code paths -source_path <- paste0(main_path, 'data/health/source_receptor_matrix/') -inmap_ex_path <- paste0(main_path, "data/health/source_receptor_matrix/inmap_processed_srm/extraction") +source_path <- paste0(main_path, 'data/inputs/health/source_receptor_matrix/') +inmap_ex_path <- paste0(main_path, "data/inputs/health/source_receptor_matrix/inmap_processed_srm/extraction") ## external paths main_path_external <- '/Volumes/calepa/' @@ -91,7 +91,7 @@ dir.create(health_county_save_path, showWarnings = FALSE) ## -------------------------------- ## DAC and CES -dac_ces <- read_xlsx(paste0(main_path, 'data/health/raw/ces3results.xlsx')) +dac_ces <- read_xlsx(paste0(main_path, 'data/inputs/health/ces3results.xlsx')) ces_county <- dac_ces %>% select(`Census Tract`, `California County`) %>% @@ -100,24 +100,24 @@ ces_county <- dac_ces %>% mutate(census_tract = paste0("0", census_tract, sep="")) ## income -- cencus tract -med_house_income <- fread(paste0(main_path, "data/Census/ca-median-house-income.csv"), stringsAsFactors = F) +med_house_income <- fread(paste0(main_path, "data/inputs/gis/census-tract/ca-median-house-income.csv"), stringsAsFactors = F) # not exist ------- med_house_income[, census_tract := paste0("0", GEOID)] med_house_income <- med_house_income[, .(census_tract, estimate)] setnames(med_house_income, "estimate", "median_hh_income") ## income -- county -county_income <- fread(paste0(main_path, "data/Census/ca-median-house-income-county.csv"), stringsAsFactors = F) +county_income <- fread(paste0(main_path, "data/inputs/gis/census-tract/ca-median-house-income-county.csv"), stringsAsFactors = F) # not exist ----- county_income <- county_income[, .(county, estimate)] setnames(county_income, "estimate", 
"median_hh_income") ## monthly well production -well_prod <- fread(paste0(main_path, "/data/stocks-flows/processed/", prod_file), colClasses = c('api_ten_digit' = 'character', +well_prod <- fread(paste0(main_path, "data/processed/", prod_file), colClasses = c('api_ten_digit' = 'character', 'doc_field_code' = 'character')) ## historical GHG emissions, 2019 ## -------------------------- -hist_ghg <- fread(paste0(main_path, 'data/stocks-flows/processed/', ghg_hist_file), header = T) +hist_ghg <- fread(paste0(main_path, 'data/processed/', ghg_hist_file), header = T) hist_ghg <- hist_ghg[segment %chin% c('Oil & Gas: Production & Processing') & year == 2019, .(segment, unit, year, value)] @@ -127,7 +127,7 @@ ghg_2019 <- as.numeric(hist_ghg[, value][1]) ## ghg factors -ghg_factors = fread(file.path(main_path, 'outputs/stocks-flows', ghg_file), header = T, colClasses = c('doc_field_code' = 'character')) +ghg_factors = fread(file.path(main_path, 'proprietery-data/stocks-flows', ghg_file), header = T, colClasses = c('doc_field_code' = 'character')) ghg_factors_2019 = ghg_factors[year == 2019, c('doc_field_code', 'year', 'upstream_kgCO2e_bbl')] ## oil prices diff --git a/energy/extraction-segment/model-prep/create_entry_econ_variables.R b/energy/extraction-segment/model-prep/create_entry_econ_variables.R index f659cd15..a64f0cd9 100644 --- a/energy/extraction-segment/model-prep/create_entry_econ_variables.R +++ b/energy/extraction-segment/model-prep/create_entry_econ_variables.R @@ -1,14 +1,15 @@ ## Tracey Mangin ## August 30, 2020 ## create +# revise Feb 14, 2024 - Haejin library(tidyverse) ## set directory -data_directory <- "/Volumes/GoogleDrive/Shared\ drives/emlab/projects/current-projects/calepa-cn/data/stocks-flows/processed/" -rystad_path <- "/Volumes/GoogleDrive/Shared\ drives/emlab/projects/current-projects/calepa-cn/data/Rystad/data/" -save_directory <- "/Volumes/GoogleDrive/Shared\ drives/emlab/projects/current-projects/calepa-cn/outputs/" +data_directory <- 
"/capstone/freshcair/meds-freshcair-capstone/data/processed/" +rystad_path <- "/capstone/freshcair/meds-freshcair-capstone/data/" +save_directory <- "/capstone/freshcair/meds-freshcair-capstone/data/outputs/" ## read in the data rystad_econ <- read_csv(paste0(rystad_path, "processed/oil_asset_opex_capex_govtt_clean.csv")) @@ -51,17 +52,17 @@ rystad_econ_adj <- rystad_econ %>% ifelse(location == "Venice Beach", "Venice Beach (ABD)", ifelse(location == "Ventura (Yuma Energy acquisition)", "Ventura", ifelse(location == "West Montalvo Offshore", "Montalvo West", location)))))))))))))))))) %>% - filter(economics_group %in% c("Capex", "Opex")) %>% - select(original_asset_name, adj_location, year, economics_group, usd_nom) %>% - pivot_wider(names_from = economics_group, values_from = usd_nom) %>% - rename(FieldName = adj_location, capex = Capex, opex = Opex) %>% - filter(year <= 2019) + dplyr::filter(economics_group %in% c("Capex", "Opex")) %>% # add dplyr - haejin + dplyr::select(original_asset_name, adj_location, year, economics_group, usd_nom) %>% # add dplyr - haejin + pivot_wider(names_from = economics_group, values_from = usd_nom) %>% + rename(FieldName = adj_location, capex = Capex, opex = Opex) %>% + dplyr::filter(year <= 2019) # add dplyr - haejin ## prep capex per bbl reserves rystad_capex_bbl2 <- rystad_capex_bbl %>% - select(original_asset_name, year, capex_per_bbl_reserves = usd_per_bbl) + dplyr::select(original_asset_name, year, capex_per_bbl_reserves = usd_per_bbl) # add dplyr - haejin ## join with production, calculate capex and opex per econ_per_bbl_df <- left_join(rystad_econ_adj, rystad_prod_adj) %>% @@ -70,14 +71,14 @@ econ_per_bbl_df <- left_join(rystad_econ_adj, rystad_prod_adj) %>% left_join(rystad_capex_bbl2) %>% left_join(rystad_capex_bbl_nom_df) %>% left_join(rystad_opex_bbl_nom_df) %>% - select(original_asset_name, FieldName, year, rystad_prod_bbl = total_prod_bbls, capex, capex_bbl_rp, capex_per_bbl_reserves, capex_per_bbl_nom, opex, 
opex_bbl_rp, opex_per_bbl_nom) + dplyr::select(original_asset_name, FieldName, year, rystad_prod_bbl = total_prod_bbls, capex, capex_bbl_rp, capex_per_bbl_reserves, capex_per_bbl_nom, opex, opex_bbl_rp, opex_per_bbl_nom) ## add sum my prod, two versions of depletion, max resource ## prep meas's df ratio_df2 <- ratio_df %>% - select(asset, year, cumsum_div_my_prod, cumsum_div_max_resources) %>% - filter(year <= 2019) %>% + dplyr::select(asset, year, cumsum_div_my_prod, cumsum_div_max_resources) %>% # add dplyr - haejin + dplyr::filter(year <= 2019) %>% # add dplyr - haejin rename(original_asset_name = asset) @@ -86,15 +87,15 @@ econ_per_bbl_df2 <- econ_per_bbl_df %>% rename(asset = original_asset_name) %>% left_join(asset_sum_my_production) %>% mutate(sum_err_my_production_bbl = sum_err_my_production * 1e6) %>% - select(-units) %>% + dplyr::select(-units) %>% # add dplyr - haejin left_join(asset_max_resources) %>% mutate(max_err_resources_bbl = max_err_resources * 1e6) %>% - select(-units) %>% + dplyr::select(-units) %>% left_join(asset_yr_cumsum_production) %>% mutate(cumsum_err_production_bbl = cumsum_err_production * 1e6) %>% - select(-production, - units) %>% + dplyr::select(-production, - units) %>% # add dplyr - haejin rename(original_asset_name = asset) %>% left_join(ratio_df2) # ## save to use later -write_csv(econ_per_bbl_df2, path = "/Volumes/GoogleDrive/Shared\ drives/emlab/projects/current-projects/calepa-cn/outputs/stocks-flows/rystad_entry_variables.csv") +write_csv(econ_per_bbl_df2, path = "/capstone/freshcair/meds-freshcair-capstone/data/proprietery-data/rystad_entry_variables.csv") diff --git a/energy/extraction-segment/model-prep/economically_recoverable_resources.R b/energy/extraction-segment/model-prep/economically_recoverable_resources.R index 8537acf8..331bd49f 100644 --- a/energy/extraction-segment/model-prep/economically_recoverable_resources.R +++ b/energy/extraction-segment/model-prep/economically_recoverable_resources.R @@ -2,14 +2,16 @@ 
## august 8, 2020 ## script for analyzing economically recoverable resources scenarios +# revise: Feb 14, 2024 - Haejin + # load libraries -------- library(data.table) # inputs and file paths ----- - rystad_path = '/Volumes/GoogleDrive/Shared\ drives/emlab/projects/current-projects/calepa-cn/data/Rystad/data/' - err_file = 'processed/economically_recoverable_resources_scenarios_wide.csv' + rystad_path = '/capstone/freshcair/meds-freshcair-capstone/data/' + err_file = '/processed/economically_recoverable_resources_scenarios_wide.csv' #--missing, need to run other file to generate this ------ # read in data ------ diff --git a/energy/extraction-segment/model-prep/gen_well_setback_status.R b/energy/extraction-segment/model-prep/gen_well_setback_status.R index 98aa2467..c08aea36 100644 --- a/energy/extraction-segment/model-prep/gen_well_setback_status.R +++ b/energy/extraction-segment/model-prep/gen_well_setback_status.R @@ -5,14 +5,15 @@ # written: 4/9/2021 | modified: 4/12/2021 # modified by Tracey Mangin ################################################################## +# revise : Feb 14, 2024 by Haejin # comment out and add your own machine's file path -home <- "/Volumes/GoogleDrive/Shared drives" -buffer_path <- "emlab/projects/current-projects/calepa-cn/data/GIS/processed/fracktracker-sr" -data_directory <- "/Volumes/GoogleDrive/Shared\ drives/emlab/projects/current-projects/calepa-cn/data/stocks-flows/processed/" +home <- "/capstone/freshcair/meds-freshcair-capstone" +buffer_path <- "data/processed" +data_directory <- "/capstone/freshcair/meds-freshcair-capstone/inputs/extraction" ## files -prod_file <- "well_prod_m_processed.csv" +prod_file <- "well_prod_m_processed.csv" # load packages library(sf) @@ -35,14 +36,14 @@ ca <- st_as_sf(map("state", plot = FALSE, fill = TRUE)) %>% ################################# READ DATA AND TRANSFORM -buff1000 <- sf::st_read(file.path(home, buffer_path, "buffer_1000ft.shp")) +buff1000 <- sf::st_read(file.path(home, 
buffer_path, "buffer_1000ft.shp")) # no file here -buff2500 <- sf::st_read(file.path(home, buffer_path, "buffer_2500ft.shp")) +buff2500 <- sf::st_read(file.path(home, buffer_path, "buffer_2500ft.shp")) # no file here -buff5280 <- sf::st_read(file.path(home, buffer_path, "buffer_5280ft.shp")) +buff5280 <- sf::st_read(file.path(home, buffer_path, "buffer_5280ft.shp")) # no file here # transform to NAD83(NSRS2007) / California Albers as well for wells and field boundaries -wells <- sf::st_read(file.path(home, "emlab/projects/current-projects/calepa-cn/data/GIS/raw/allwells_gis/Wells_All.shp")) %>% +wells <- sf::st_read(file.path(home, "emlab/projects/current-projects/calepa-cn/data/GIS/raw/allwells_gis/Wells_All.shp")) %>% # ---- missing file -------- st_transform(ca_crs) %>% dplyr::select(API, WellStatus) %>% unique() @@ -52,7 +53,7 @@ wells2 <- sf::st_read(file.path(home, "emlab/projects/current-projects/calepa-cn dplyr::select(API, WellStatus, FieldName) %>% unique() -boundaries <- st_read(file.path(home, "emlab/projects/current-projects/calepa-cn/data/GIS/raw/field-boundaries/DOGGR_Admin_Boundaries_Master.shp")) %>% st_transform(3488) +boundaries <- st_read(file.path(home, "/data/gis/field-boundaries/DOGGR_Admin_Boundaries_Master.shp")) %>% st_transform(3488) # update ## monthly well production well_prod <- fread(paste0(data_directory, prod_file), colClasses = c('api_ten_digit' = 'character', @@ -221,7 +222,7 @@ ggplot(data = buff1000) + # summarize(n_in = sum(in_setback_orig)) # save output -write_csv(wells_within_df_all, file.path(home, "emlab/projects/current-projects/calepa-cn/outputs/setback/model-inputs/wells_in_setbacks_revised.csv")) +write_csv(wells_within_df_all, file.path(home, "emlab/projects/current-projects/calepa-cn/outputs/setback/model-inputs/wells_in_setbacks_revised.csv")) # missing ------- ## make maps @@ -331,7 +332,7 @@ View(field_boundaries2 %>% ungroup()) ## add number of wells to each field -n_wells <- sf::st_read(file.path(home, 
"emlab/projects/current-projects/calepa-cn/data/GIS/raw/allwells_gis/Wells_All.shp")) %>% +n_wells <- sf::st_read(file.path(home, "emlab/projects/current-projects/calepa-cn/data/GIS/raw/allwells_gis/Wells_All.shp")) %>% # missing one st_drop_geometry() %>% filter(!WellStatus %in% c("Abeyance")) %>% group_by(FieldName) %>% @@ -358,7 +359,7 @@ field_boundaries3 <- field_boundaries2 %>% # save output -write_csv(field_boundaries3, file.path(home, "emlab/projects/current-projects/calepa-cn/outputs/setback/model-inputs/setback_coverage_R.csv")) +write_csv(field_boundaries3, file.path(home, "/data/processed/setback_coverage_R.csv")) ## save maps for examining @@ -374,7 +375,7 @@ coverage_map <- mapview(wells2, layer.name = "Wells", label = 'WellStatus', cex = 0.3, alpha = 0, legend = FALSE) # save output -mapshot(coverage_map, url = paste0(home, "/emlab/projects/current-projects/calepa-cn/outputs/setback/review/coverage_map.html"), selfcontained = F) +mapshot(coverage_map, url = paste0(home, "data/processed/coverage_map.html"), selfcontained = F) diff --git a/energy/extraction-segment/model-prep/match_fields_assets.R b/energy/extraction-segment/model-prep/match_fields_assets.R index e74385b4..433a2634 100644 --- a/energy/extraction-segment/model-prep/match_fields_assets.R +++ b/energy/extraction-segment/model-prep/match_fields_assets.R @@ -2,14 +2,18 @@ ## May 11, 2021 ## Match fields to assets +## revised: feb 16 2024 by haejin + # ------------------------------------------- INPUTS ----------------------------------- -data_directory <- "/Volumes/GoogleDrive/Shared\ drives/emlab/projects/current-projects/calepa-cn/data/stocks-flows/processed/" -rystad_path <- "/Volumes/GoogleDrive/Shared\ drives/emlab/projects/current-projects/calepa-cn/data/Rystad/data/" -sp_dir <- "/Volumes/GoogleDrive/Shared\ drives/emlab/projects/current-projects/calepa-cn/data/GIS/raw/field-boundaries/" -save_directory <- "/Volumes/GoogleDrive/Shared\ 
drives/emlab/projects/current-projects/calepa-cn/outputs/" -proj_dir <- "/Volumes/GoogleDrive/Shared\ drives/emlab/projects/current-projects/calepa-cn/" -output_dir <- "outputs/stocks-flows/" -match_out_path <- "stocks-flows/entry-model-input/final/" +data_directory <- "/capstone/freshcair/meds-freshcair-capstone/data/processed/" +rystad_path <- "/capstone/freshcair/meds-freshcair-capstone/data/processed/" +sp_dir <- "/capstone/freshcair/meds-freshcair-capstone/data/inputs/gis/" +save_directory <- "/capstone/freshcair/meds-freshcair-capstone/data/proprietery-data/" +proj_dir <- "/capstone/freshcair/meds-freshcair-capstone/" +output_dir <- "proprietery-data" +match_out_path <- "proprietery-data" + + ## files rystad_file <- "field_rystad_match_apis_revised.csv" @@ -27,7 +31,7 @@ library(data.table) library(mapview) library(sf) library(rgeos) #nearest poly to each point -library(nngeo) #nearest point to each poly +#library(nngeo) #nearest point to each poly ## step 0: fields matched to assets through wells @@ -39,7 +43,7 @@ well_prod <- fread(paste0(data_directory, prod_file), colClasses = c('api_ten_di ## asset to field match using well APIs -field_asset_match <- fread(paste0(rystad_path, "processed/", rystad_file), colClasses = c('doc_field_code' = 'character')) +field_asset_match <- fread(paste0(rystad_path, rystad_file), colClasses = c('doc_field_code' = 'character')) field_asset_match[, c("doc_fieldname", "bbl_prod", "n_wells_field", "field_prod", "rel_field", "rel_prod") := NULL] ## compute productive fields @@ -48,20 +52,20 @@ productive_fields <- productive_fields[total_prod > 0] ## field to asset match by well apis well_match_df <- productive_fields %>% - select(doc_field_code) %>% + dplyr::select(doc_field_code) %>% left_join(field_asset_match) fieldcodes <- well_prod %>% - select(doc_field_code, doc_fieldname) %>% - unique() %>% - filter(doc_field_code %in% productive_fields$doc_field_code) + dplyr::select(doc_field_code, doc_fieldname) %>% + 
unique() %>% + dplyr::filter(doc_field_code %in% productive_fields$doc_field_code) ## fields that get matched with assets field_asset_well_match <- well_match_df %>% - select(doc_field_code, original_asset_name) %>% - unique() %>% - filter(!is.na(original_asset_name)) + dplyr::select(doc_field_code, original_asset_name) %>% + unique() %>% + dplyr::filter(!is.na(original_asset_name)) ## save file # write_csv(field_asset_well_match, path = "/Volumes/GoogleDrive/Shared\ drives/emlab/projects/current-projects/calepa-cn/outputs/stocks-flows/entry-model-input/well_doc_asset_match.csv") diff --git a/energy/extraction-segment/model-prep/well_setback_sp_prep.R b/energy/extraction-segment/model-prep/well_setback_sp_prep.R index 27c07aaf..ab3ccea3 100644 --- a/energy/extraction-segment/model-prep/well_setback_sp_prep.R +++ b/energy/extraction-segment/model-prep/well_setback_sp_prep.R @@ -1,10 +1,11 @@ ## Tracey Mangin ## May 7, 2021 ## prep FrackTracker data for analysis +##revised : Feb 14, 2024 by Haejin # comment out and add your own machine's file path -home <- "/Volumes/GoogleDrive/Shared drives" -ft_path <- "emlab/projects/current-projects/calepa-cn/data/FracTracker/FracTrackerSetbackgdb-newest/FracTrackerSetbackgdb/FracTrackerSetbackdata.gdb" +home <- "/capstone/freshcair/meds-freshcair-capstone" +ft_path <- "/data/inputs/FracTracker/FracTrackerSetbackgdb-newest/FracTrackerSetbackgdb/FracTrackerSetbackdata.gdb" #--- missing file ---- save_path <- paste0(home, "/emlab/projects/current-projects/calepa-cn/data/GIS/processed/fracktracker-sr/") # load packages @@ -27,7 +28,7 @@ ca <- st_as_sf(map("state", plot = FALSE, fill = TRUE)) %>% ################################# READ DATA AND TRANSFORM # to get the names of layers in the shapefile -layers <- sf::st_layers(dsn = file.path(home, "emlab/projects/current-projects/calepa-cn/data/FracTracker/FracTrackerSetbackgdb-newest/FracTrackerSetbackgdb/FracTrackerSetbackdata.gdb")) +layers <- sf::st_layers(dsn = 
file.path(home, "emlab/projects/current-projects/calepa-cn/data/FracTracker/FracTrackerSetbackgdb-newest/FracTrackerSetbackgdb/FracTrackerSetbackdata.gdb")) #### ----- missing ## read in the SR layers layer_vec <- c("SetbackOutlines_SR_Dwellings_082220", "PlaygroundsinCities", "DayCareCenters", "reselderlyCare", diff --git a/health/scripts/appendix_maps.R b/health/scripts/appendix_maps.R index f7c8ad74..839719fb 100644 --- a/health/scripts/appendix_maps.R +++ b/health/scripts/appendix_maps.R @@ -1,5 +1,10 @@ #Danae Hernandez-Cortes hernandezcortes@ucsb.edu #INFRASTRUCTURE TO SEND TO THE MODELING TEAM + + +# UPDATED 2/13/2024 by Mariam Garcia + + #Libraries library(sf) library(tidyverse) @@ -18,6 +23,10 @@ library(dplyr) rm(list=ls()) +# Updated working directory +setwd('/capstone/freshcair/meds-freshcair-capstone') # Sets directory based on Taylor structure + + #DANAE'S MACHINE outputFiles <- "D:/Dropbox/UCSB-PhD/emLab/CALEPA/data/academic_output" sourceFiles <- "D:/Dropbox/UCSB-PhD/emLab/CALEPA/data/source_receptor_matrix" @@ -28,7 +37,9 @@ inmapReFiles <- "D:/Dropbox/UCSB-PhD/emLab/CALEPA/data/source_receptor_matrix/i CA<-st_read("H:/CALEPA/GIS/state/california2016.shp") -CA_counties<-st_read("D:/Dropbox/UCSB-PhD/emLab/CALEPA/data/CA_counties/CA_Counties/CA_Counties_TIGER2016.shp") + +# UPDATED - MG +CA_counties<-st_read("/capstone/freshcair/meds-freshcair-capstone/data/inputs/gis/CA_Counties/CA_Counties_TIGER2016.shp") extraction_clusters<-st_read("D:/Dropbox/UCSB-PhD/emLab/CALEPA/CALEPA_bren_computer/CALEPA/GIS/fields/academic_paper/extraction_fields_clusters_10km.shp") diff --git a/health/scripts/emission-factors.R b/health/scripts/emission-factors.R index 855c6e54..65e89c0b 100644 --- a/health/scripts/emission-factors.R +++ b/health/scripts/emission-factors.R @@ -3,6 +3,9 @@ # created: 08/11/2020 # updated: 06/26/2023 + +# Updated by Mariam Garcia 2/13/2024 + # set up environment rm(list=ls()) @@ -25,7 +28,9 @@ for (i in packages) { ## Directory #wd <- 
c("H:/EJ_wells") # Danae's WD -wd <- c("G:/Shared drives/emlab/projects/current-projects/calepa-cn/data/health/processed/DataFiles_EmissionsFactors") #Vincent's WD + +# UPDATED - MG +wd <- c('/capstone/freshcair/meds-freshcair-capstone') #freshCAir main directory setwd(wd) getwd() diff --git a/health/scripts/figures_map_totalpm25.R b/health/scripts/figures_map_totalpm25.R index 3e3c4ff0..28e2e133 100644 --- a/health/scripts/figures_map_totalpm25.R +++ b/health/scripts/figures_map_totalpm25.R @@ -1,6 +1,7 @@ #Danae Hernandez-Cortes hernandezcortes@ucsb.edu #vthiviergge@ucsb.edu #last change: 10/08/2021 +# modified : 02/13/2024 - Haejin #WEIGHTED POLLUTION EXPOSURE #Libraries @@ -20,7 +21,7 @@ library(RColorBrewer) #EXTRACTION rm(list=ls()) -sourceFiles <- "./calepa-cn/data/" +sourceFiles <- "./data/inputs/" ######################VINCENT: PLEASE ADD THE MODIFIED CODE THAT OBTAINS EXTRACTION @@ -40,14 +41,14 @@ extraction_BAU_2019<-extraction_BAU%>%dplyr::filter(year==2019) ###################POPULATION########################## #0. 
Bring disadvantaged communities data from CES -ces_data <- read_csv(paste0(sourceFiles,"health/raw/ces3results_part.csv",sep=""), stringsAsFactors = FALSE) %>% +ces_data <- read_csv(paste0(sourceFiles,"health/ces3results_part.csv",sep=""), stringsAsFactors = FALSE) %>% dplyr::select("census_tract","total_population") -CA<-st_read(paste0(sourceFiles,"GIS/raw/state/california2016.shp",sep="")) +#CA<-st_read(paste0(sourceFiles,"GIS/raw/state/california2016.shp",sep="")) # missing data ------ -CA_counties<-st_read(paste0(sourceFiles,"GIS/raw/CA_Counties/CA_Counties_TIGER2016.shp",sep="")) +CA_counties<-st_read(paste0(sourceFiles,"gis/CA_Counties/CA_Counties_TIGER2016.shp",sep="")) -CA_ct<-st_read(paste0(sourceFiles,"GIS/raw/census-tract/tl_2019_06_tract.shp",sep="")) +CA_ct<-st_read(paste0(sourceFiles,"gis/census-tract/tl_2019_06_tract.shp",sep="")) #Merge refining diff --git a/health/scripts/health_data.R b/health/scripts/health_data.R index e877c55b..20aab1a9 100644 --- a/health/scripts/health_data.R +++ b/health/scripts/health_data.R @@ -2,6 +2,7 @@ # vthivierge@ucsb.edu # created: 08/25/2021 # updated: 05/26/2022 +# updated: 02/12/2024 # set up environment ######################################## @@ -12,16 +13,15 @@ options(java.parameters = "-Xmx8000m") ## Packages packages=c("xlsx", "gdata", "dplyr","tidyr", "stringr", "fuzzyjoin", "stringr", - "ggplot2", "stargazer", "plm", "cowplot", "sf", "lwgeom","data.table") + "ggplot2", "stargazer", "plm", "cowplot", "sf", "lwgeom","data.table", "here") lapply(1:length(packages), function(x) ifelse((require(packages[x],character.only=TRUE)==FALSE),install.packages(packages[x]), require(packages[x],character.only=TRUE))) -#Set directory - -setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) -setwd('../../..') #Goes back to home project directory +# Set directory +setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) +setwd('/capstone/freshcair/meds-freshcair-capstone') # Sets directory based on Taylor 
structure getwd() # Load and process data ##################################### @@ -30,8 +30,10 @@ getwd() #census tract age-population -ct_raw <- read.csv("./calepa-cn/data/benmap/raw/nhgis0001_ts_geog2010_tract.csv", stringsAsFactors = FALSE); str(ct_raw) -ct_age_desc <- read.csv("./calepa-cn/data/benmap/raw/age_group_desc.csv", stringsAsFactors = FALSE); str(ct_age_desc) +# UPDATED - MP +ct_raw <- read.csv("data/inputs/health/nhgis0001_ts_geog2010_tract.csv", stringsAsFactors = FALSE); str(ct_raw) +# UPDATED - MP +ct_age_desc <- read.csv("data/inputs/health/age_group_desc.csv", stringsAsFactors = FALSE); str(ct_age_desc) ct_ca <- ct_raw %>% `colnames<-`(tolower(colnames(ct_raw)))%>% @@ -56,7 +58,8 @@ age_group_ct <- ct_ca %>% #CDOF demographic projections -cdof_raw <- fread("./calepa-cn/data/benmap/raw/CDOF_p2_Age_1yr_Nosup.csv", stringsAsFactors = FALSE, blank.lines.skip = TRUE)%>% +# UPDATED - MP +cdof_raw <- fread("data/inputs/health/CDOF_p2_Age_1yr_Nosup.csv", stringsAsFactors = FALSE, blank.lines.skip = TRUE)%>% select(-Column1:-Column16331)%>% gather(year,pop,'2010':'2060')%>% mutate(pop = as.numeric(str_replace(pop,",","")), @@ -89,8 +92,8 @@ cdof_pred <- cdof_raw %>% #County name to merged with BenMAP row/col county indicators -ct <- read_sf("./calepa-cn/data/benmap/raw/County_def.shp") - +# UPDATED - MP +ct <- read_sf("data/inputs/health/County_def.shp") county <- as.data.frame(cbind(ct$NAME, ct$STATE_NAME, ct$ROW, ct$COL), stringsAsFactors = F) colnames(county) <- c("county","state", "row", "col") @@ -99,7 +102,8 @@ county$col <- as.integer(county$col) # Mortality incidence data (2015 baseline) -incidence_ca <- read.csv("./calepa-cn/data/benmap/raw/Mortality Incidence (2015).csv", stringsAsFactors = F) %>% +# UPDATED - MP +incidence_ca <- read.csv("data/inputs/health/Mortality Incidence (2015).csv", stringsAsFactors = F) %>% filter(Endpoint == "Mortality, All Cause") %>% select(-Endpoint.Group,-Race:-Ethnicity, -Type)%>% left_join(county, by = 
c("Column"="col","Row"="row"))%>% @@ -148,9 +152,9 @@ ct_incidence_ca <- temp_ct_ca %>% select(-start.age.x,-start.age.y,-end.age.x, -end.age.y, -county.x,-county.y,-value.x,-value.y) - -write.csv(ct_incidence_ca,file = "./calepa-cn/data/benmap/processed/ct_incidence_ca.csv", row.names = FALSE) -ct_incidence_ca <- read.csv("./calepa-cn/data/benmap/processed/ct_incidence_ca.csv", stringsAsFactors = FALSE) +# UPDATED - MP +write.csv(ct_incidence_ca,file = "data/processed/ct_incidence_ca.csv", row.names = FALSE) +ct_incidence_ca <- read.csv("data/processed/ct_incidence_ca.csv", stringsAsFactors = FALSE) ##Projected population data and mortality incidence @@ -181,8 +185,9 @@ ct_inc_45 <- ct_inc_45_temp%>% ## Output final population and mortality incidence data -write.csv(ct_inc_45,file = "./calepa-cn/data/benmap/processed/ct_inc_45.csv", row.names = FALSE) -#ct_inc_45 <- fread("./calepa-cn/data/benmap/processed/ct_inc_45.csv", stringsAsFactors = FALSE) +# UPDATED - MP +write.csv(ct_inc_45,file = "data/processed/ct_inc_45.csv", row.names = FALSE) +ct_inc_45 <- fread("data/processed/ct_inc_45.csv", stringsAsFactors = FALSE) ## Census tract level for labor team @@ -191,4 +196,5 @@ ct_pop_45 <- ct_inc_45 %>% summarise(county = first(county), pop = sum(pop)) -write.csv(ct_pop_45,file = "./calepa-cn/data/benmap/processed/ct_pop_45.csv", row.names = FALSE) +# UPDATED - MP +write.csv(ct_pop_45,file = "data/processed/ct_pop_45.csv", row.names = FALSE) diff --git a/health/scripts/health_mortality.R b/health/scripts/health_mortality.R index 7b76504d..34c6a3d1 100644 --- a/health/scripts/health_mortality.R +++ b/health/scripts/health_mortality.R @@ -3,6 +3,9 @@ # created: 08/25/2021 # updated: 09/13/2021 +# UPDATED MG 2/13/2024 + + # set up environment ######################################## rm(list=ls()) @@ -10,7 +13,7 @@ rm(list=ls()) ## Packages packages=c("xlsx", "gdata", "dplyr","tidyr", "stringr", "fuzzyjoin", "stringr", "tictoc", - "ggplot2", "stargazer", "plm", "cowplot", 
"sf", "lwgeom","data.table") + "ggplot2", "stargazer", "plm", "cowplot", "sf", "lwgeom","data.table","tidyverse") lapply(1:length(packages), function(x) ifelse((require(packages[x],character.only=TRUE)==FALSE),install.packages(packages[x]), @@ -18,16 +21,23 @@ lapply(1:length(packages), function(x) #Set directory -setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) -setwd('../../..') #Goes back to home project directory +#setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) + +# UPDATED MG +setwd('/capstone/freshcair/meds-freshcair-capstone') # Sets directory based on Taylor structure + getwd() # Load and merge processed data ##################################### #1 Disadvantaged community definition -ces3 <- read.csv("./calepa-cn/data/health/processed/ces3_data.csv", stringsAsFactors = FALSE)%>% - select(census_tract,disadvantaged) +# UPDATED MG +ces3 <- read.csv("data/inputs/health/ces3_data.csv", stringsAsFactors = FALSE) + +# UPDATED MG +ces3 <- ces3 %>% + dplyr::select(census_tract,disadvantaged) #2 Merge pollution reduction scenarios @@ -52,7 +62,9 @@ toc() #3 Merge population and incidence -ct_inc_pop_45 <- read.csv("./calepa-cn/data/benmap/processed/ct_inc_45.csv", stringsAsFactors = FALSE)%>% + +# UPDATED -MG +ct_inc_pop_45 <- read.csv("./data/processed/ct_inc_45.csv", stringsAsFactors = FALSE)%>% mutate(ct_id = as.numeric(paste0(stringr::str_sub(gisjoin,3,3), stringr::str_sub(gisjoin,5,7), stringr::str_sub(gisjoin,9,14))))%>% diff --git a/health/scripts/inmap-to-csv.R b/health/scripts/inmap-to-csv.R index 84823ca3..4cf6c4f1 100644 --- a/health/scripts/inmap-to-csv.R +++ b/health/scripts/inmap-to-csv.R @@ -3,6 +3,8 @@ # created: 08/06/2020 # updated: 09/28/2020 +# updated: 02/14/2024 + # set up environment @@ -24,15 +26,19 @@ for (i in packages) { ## Directory -setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) #In scripts folder -setwd('../../..') #Goes back to home project directory 
+#setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) #In scripts folder + +# Updated working directory +setwd('/capstone/freshcair/meds-freshcair-capstone') getwd() ## Read census tract shp file -census_tract <- read_sf("./calepa-cn/data/inmap/census-tract/tl_2019_06_tract.shp")%>% - select(-STATEFP:-TRACTCE,-NAME:-INTPTLON)%>% +# UPDATED - MG + +census_tract <- read_sf("./data/inputs/gis/census-tract/tl_2019_06_tract.shp")%>% + dplyr::select(-STATEFP:-TRACTCE,-NAME:-INTPTLON)%>% st_transform(crs=3310) ## Load shape files names diff --git a/health/scripts/main_maps.R b/health/scripts/main_maps.R index 7ac32ddb..3a005855 100644 --- a/health/scripts/main_maps.R +++ b/health/scripts/main_maps.R @@ -1,4 +1,7 @@ #Danae Hernandez-Cortes hernandezcortes@ucsb.edu + +# Updated: 2/14/2024 by MG + #EXTRACTION MAP #Libraries library(sf) @@ -31,12 +34,17 @@ inmapReFiles <- "C:/Users/dhern125/Dropbox/UCSB-PhD/emLab/CALEPA/data/source_re #inmapExFiles <- "emlab/projects/current-projects/calepa-cn/data/health/source_receptor_matrix/inmap_processed_srm/extraction" #inmapReFiles <- "emlab/projects/current-projects/calepa-cn/data/health/source_receptor_matrix/inmap_processed_srm/refining" +# Updating working directory +setwd('/capstone/freshcair/meds-freshcair-capstone') + + ############################################# # PREPARE FILES FROM INMAP OUTPUT: EXTRACTION ############################################# +# UPDATED - MG #Census tracts -CA_ct<-st_read("C:/Users/dhern125/Dropbox/UCSB-PhD/emLab/CALEPA/data/census_tracts/census-tract/tl_2019_06_tract.shp") +CA_ct<-st_read("./data/inputs/gis/census-tract/tl_2019_06_tract.shp") #EXTRACTION MAP CLUSTER 1 sites_vector <- c(1) diff --git a/health/scripts/source_receptor_matrix.R b/health/scripts/source_receptor_matrix.R index 4ab5117b..ce620a14 100644 --- a/health/scripts/source_receptor_matrix.R +++ b/health/scripts/source_receptor_matrix.R @@ -1,12 +1,14 @@ ## Directory +# updated: 2/12/24 library(tidyverse) library(sf) -# 
setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) #In scripts folder -# setwd('../../..') #Goes back to home project directory -# getwd() +# setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) +setwd('/capstone/freshcair/meds-freshcair-capstone') # Sets directory based on Taylor structure +getwd() + ## paths main_path <- '/Volumes/GoogleDrive/Shared drives/emlab/projects/current-projects/calepa-cn/' @@ -14,13 +16,13 @@ sp_data_path <- paste0(main_path, "data/GIS/raw/") health_data_path <- paste0(main_path, "data/health/source_receptor_matrix/inmap_output_srm/") srm_save_path <- paste0(main_path, "data/health/source_receptor_matrix/inmap_processed_srm/") -## Read census tract shp file -census_tract <- read_sf(paste0(sp_data_path,"census-tract/tl_2019_06_tract.shp")) %>% +## Read census tract shp file - UPDATED - MP +census_tract <- read_sf('data/inputs/gis/census-tract/tl_2019_06_tract.shp') %>% select(-STATEFP:-TRACTCE,-NAME:-INTPTLON)%>% st_transform(crs=3310) -## counties, no islands -CA_counties <- st_read(paste0(sp_data_path, "CA_counties_noislands/CA_Counties_TIGER2016_noislands.shp")) %>% +## counties, no islands - UPDATED - MP +CA_counties <- st_read('data/inputs/gis/CA_counties_noislands/CA_Counties_TIGER2016_noislands.shp') %>% st_transform(crs=3310) %>% select(OBJECTID, GEOID) diff --git a/health/scripts/srm_extraction_population.R b/health/scripts/srm_extraction_population.R index 5c3b952d..c7ee4b33 100644 --- a/health/scripts/srm_extraction_population.R +++ b/health/scripts/srm_extraction_population.R @@ -1,5 +1,8 @@ #Danae Hernandez-Cortes hernandezcortes@ucsb.edu #INFRASTRUCTURE TO SEND TO THE MODELING TEAM + +# Updated 2/14/2024 by MG + #Libraries library(sf) library(tidyverse) @@ -18,10 +21,8 @@ library(dplyr) rm(list=ls()) -inmapExFiles <- "calepa-cn/data/health/source_receptor_matrix/inmap_processed_srm/extraction" -sourceFiles <- "calepa-cn/data/health/source_receptor_matrix/inmap_processed_srm" -mainFiles <- 
"calepa-cn/data/health/raw" - +# setting working directory +setwd('/capstone/freshcair/meds-freshcair-capstone') # (0) Load CES3.0 diff --git a/labor/processing/ica_multiplier_process.R b/labor/processing/ica_multiplier_process.R index 29ad224b..d3992766 100644 --- a/labor/processing/ica_multiplier_process.R +++ b/labor/processing/ica_multiplier_process.R @@ -2,6 +2,7 @@ # Chris Malloy (cmalloy@ucsb.edu) # created: 07/28/2021 # updated: 08/17/2021 +# updated : 02/19/2024 ############################################################################################ # Set up environment @@ -25,28 +26,21 @@ library("lubridate") library("writexl") library("tigris") library("sf") - +library(dplyr) #Set wd - -#Chris' macbook -ica_dollar <- '/Volumes/GoogleDrive/Shared drives/emlab/projects/current-projects/calepa-cn/data/labor/processed/implan-results/academic-paper-multipliers/ica' -impact_dollar <- '/Volumes/GoogleDrive/Shared drives/emlab/projects/current-projects/calepa-cn/data/labor/processed/implan-results/academic-paper-multipliers/impact' -statewide_processed <- '/Volumes/GoogleDrive/Shared drives/emlab/projects/current-projects/calepa-cn/data/labor/processed/implan-results/statewide/processed' -processed <- '/Volumes/GoogleDrive/Shared drives/emlab/projects/current-projects/calepa-cn/data/labor/processed/implan-results/academic-paper-multipliers/processed' -fte <- '/Volumes/GoogleDrive/Shared drives/emlab/projects/current-projects/calepa-cn/data/labor/processed/implan-results' - - +setwd('/capstone/freshcair/meds-freshcair-capstone') # Sets directory based on Taylor structure +getwd() ############################################################################################ #Read in spreadsheet for FTE conversion -setwd(fte) -fte_convert <- read_xlsx('fte-convert.xlsx')%>% - rename(fte_per_job = FTEperTotalEmp, ind_code = Implan546Index) %>% - select(ind_code,fte_per_job) +# UPDATED - MG 2/19/2024 +fte_convert <- 
read_xlsx('data/inputs/labor/fte-convert.xlsx')%>% + dplyr::rename(fte_per_job = FTEperTotalEmp, ind_code = Implan546Index) %>% + dplyr::select(ind_code,fte_per_job) #1. Import ICA files, clean up names, and add county & segment identifiers @@ -55,24 +49,28 @@ fte_convert <- read_xlsx('fte-convert.xlsx')%>% ### NOTE: all multipliers are for $1 million of output value in an industry -setwd(ica_dollar) +setwd('/capstone/freshcair/meds-freshcair-capstone/data/inputs/labor/ica') ### Kern #### extraction + +# UPDATED - MG - 2/19/2024 +# UPDATE - not sure where ICA data is located ica_emp_ext_kern <- read_csv('ica-emp-ext-kern.csv') %>% - filter(is.na(X1)==F) %>% + filter(is.na(...1)==F) %>% mutate(county = "Kern", segment = "extraction") %>% - rename(industry = Impact) %>% - select(-X1,-X6) + dplyr::rename(industry = Impact) %>% + dplyr::select(-...1,-...6) + ica_comp_ext_kern <- read_csv('ica-va-ext-kern.csv',skip = 1) %>% - filter(is.na(X1)==F) %>% + filter(is.na(...1)==F) %>% mutate(county = "Kern", segment = "extraction") %>% - rename(industry = `Industry Display`, direct_comp = `Employee Compensation`, - direct_proprietor_income = `Proprietor Income`, direct_other_property_income = `Other Property Income`, - direct_taxes_prod_imports = `Taxes on Production & Imports`, direct_va = `Value Added`, + rename(industry = `Industry Display`, direct_comp = `Employee Compensation...3`, + direct_proprietor_income = `Proprietor Income...4`, direct_other_property_income = `Other Property Income...5`, + direct_taxes_prod_imports = `Taxes on Production & Imports...6`, direct_va = `Value Added...7`, indirect_comp = `Employee Compensation_1`, indirect_proprietor_income = `Proprietor Income_1`, indirect_other_property_income = `Other Property Income_1`, indirect_taxes_prod_imports = `Taxes on Production & Imports_1`, indirect_va = `Value Added_1`, @@ -91,35 +89,36 @@ ica_emp_drill_kern <- read_csv('ica-emp-drill-kern.csv') %>% rename(industry = Impact) %>% select(-X1,-X6) - +# 
UPDATED - MG - 2/19/2024 ica_comp_drill_kern <- read_csv('ica-va-drill-kern.csv',skip = 1) %>% - filter(is.na(X1)==F) %>% + filter(is.na(...1)==F) %>% mutate(county = "Kern", segment = "drilling") %>% - rename(industry = `Industry Display`, direct_comp = `Employee Compensation`, - direct_proprietor_income = `Proprietor Income`, direct_other_property_income = `Other Property Income`, - direct_taxes_prod_imports = `Taxes on Production & Imports`, direct_va = `Value Added`, - indirect_comp = `Employee Compensation_1`, - indirect_proprietor_income = `Proprietor Income_1`, indirect_other_property_income = `Other Property Income_1`, - indirect_taxes_prod_imports = `Taxes on Production & Imports_1`, indirect_va = `Value Added_1`, - induced_comp = `Employee Compensation_2`, - induced_proprietor_income = `Proprietor Income_2`, induced_other_property_income = `Other Property Income_2`, - induced_taxes_prod_imports = `Taxes on Production & Imports_2`, induced_va = `Value Added_2`) %>% - dplyr::select(-`Employee Compensation_3`, -`Proprietor Income_3`, -`Other Property Income_3`, - -`Taxes on Production & Imports_3`, -`Value Added_3`,-X1) + dplyr::rename(industry = `Industry Display`, direct_comp = `Employee Compensation...3`, + direct_proprietor_income = `Proprietor Income...4`, direct_other_property_income = `Other Property Income...5`, + direct_taxes_prod_imports = `Taxes on Production & Imports...6`, direct_va = `Value Added...7`, + indirect_comp = `Employee Compensation...8`, + indirect_proprietor_income = `Proprietor Income...9`, indirect_other_property_income = `Other Property Income...10`, + indirect_taxes_prod_imports = `Taxes on Production & Imports...11`, indirect_va = `Value Added...12`, + induced_comp = `Employee Compensation...13`, + induced_proprietor_income = `Proprietor Income...14`, induced_other_property_income = `Other Property Income...15`, + induced_taxes_prod_imports = `Taxes on Production & Imports...16`, induced_va = `Value Added...17`) %>% + 
dplyr::select(-`Employee Compensation...18`, -`Proprietor Income...19`, -`Other Property Income...20`, + -`Taxes on Production & Imports...21`, -`Value Added...22`,-...1) +# UPDATED - MG - 2/19/2024 #### refining ica_emp_ref_kern <- read_csv('ica-emp-ref-kern.csv') %>% - filter(is.na(X1)==F) %>% + filter(is.na(...1)==F) %>% mutate(county = "Kern", segment = "refining") %>% - rename(industry = Impact) %>% - select(-X1,-X6) - + dplyr::rename(industry = Impact) %>% + dplyr::select(-...1,-...6) +# UPDATED - MG - 2/19/2024 ica_comp_ref_kern <- read_csv('ica-va-ref-kern.csv',skip = 1) %>% - filter(is.na(X1)==F) %>% + filter(is.na(...1)==F) %>% mutate(county = "Kern", segment = "refining") %>% rename(industry = `Industry Display`, direct_comp = `Employee Compensation`, direct_proprietor_income = `Proprietor Income`, direct_other_property_income = `Other Property Income`, diff --git a/pred-model/eda.R b/pred-model/eda.R new file mode 100644 index 00000000..d4493085 --- /dev/null +++ b/pred-model/eda.R @@ -0,0 +1,169 @@ +# Load necessary libraries +library(dplyr) +library(tidyverse) +library(lubridate) +library(zoo) + +# Read in well processed data +well_prod_m_processed <- read.csv("data/processed/well_prod_m_processed.csv") + +# Inspect the structure of the well_prod_m_processed dataset +str(well_prod_m_processed) + +# Calculate and print the number of unique wells +num_wells <- length(unique(well_prod_m_processed$api_ten_digit)) +paste0("Number of unique wells: ", num_wells) + +# Calculate and print the number of counties +num_counties <- length(unique(well_prod_m_processed$county)) +paste0("Number of counties: ", num_counties) + +# Calculate and print the number of unique field codes +num_fieldcodes <- length(unique(well_prod_m_processed$doc_field_code)) +paste0("Number of unique field codes: ", num_fieldcodes) + +# Extract and print all unique well production types +well_productions <- unique(well_prod_m_processed$well_type_name) +paste0(well_productions) + +# Filter 
wells with BTU of gas produced greater than 0, indicating production +producing_wells <- well_prod_m_processed %>% + filter(BTUofGasProduced > 0) + +# Calculate and print the percentage of each production status across all wells +percentage_production_status <- well_prod_m_processed %>% + count(ProductionStatus) %>% + mutate(Percentage = n / sum(n) * 100) + +# Calculate and print the total BTU of gas produced for each well +total_btu_per_well <- well_prod_m_processed %>% + group_by(api_ten_digit) %>% + summarise(TotalBTUofGasProduced = sum(BTUofGasProduced, na.rm = TRUE)) +print(total_btu_per_well) + +# Calculate the first production date for each well and ungroup +well_start_dates <- well_prod_m_processed %>% + filter(ProductionStatus == "Active") %>% + group_by(api_ten_digit) %>% + summarise(FirstProductionDate = min(ProductionReportDate, na.rm = TRUE)) %>% + ungroup() + +# Join the first production date back to the original dataset +well_prod_m_processed_updated <- well_prod_m_processed %>% + left_join(well_start_dates, by = "api_ten_digit") + +# Calculate the number of active years for each well +active_months_per_well <- well_prod_m_processed_updated %>% + filter(ProductionStatus == "Active") %>% + group_by(api_ten_digit) %>% + summarise(ActiveYears = n()/12) %>% + ungroup() + +# Join the active years data back to the dataset +processed <- well_prod_m_processed_updated %>% + left_join(active_months_per_well, by = "api_ten_digit") + +# Filter for wells with an active production status +processed_active <- processed %>% + filter(ProductionStatus == "Active") %>% + arrange(api_ten_digit, ProductionReportDate) %>% + group_by(api_ten_digit) %>% + mutate(MonthInProduction = row_number()) %>% # Assigns a sequential number to each active month + ungroup() + +# Calculate the last production date for each well +well_last_dates <- well_prod_m_processed %>% + filter(ProductionStatus == "Active") %>% + group_by(api_ten_digit) %>% + summarise(LastProductionDate = 
max(ProductionReportDate, na.rm = TRUE)) %>% + ungroup() + +# Join the last production date to the active wells dataset +processed_active <- processed_active %>% + left_join(well_last_dates, by = "api_ten_digit") + +# Group the processed_active data and calculate summary statistics +processed_active_grouped <- processed_active %>% + group_by(api_ten_digit) %>% + summarize( + TotalProduction = sum(BTUofGasProduced, na.rm = TRUE), + ActiveYears = ActiveYears, + FirstProductionDate = min(ProductionReportDate, na.rm = TRUE), # First production date + LastProductionDate = max(ProductionReportDate, na.rm = TRUE) # Last production date + ) %>% + distinct(api_ten_digit, .keep_all = TRUE) %>% + filter(TotalProduction > 0) + +# Plot histogram of well active years +active_years_plot <- ggplot(processed_active_grouped, aes(x = ActiveYears)) + + geom_histogram(binwidth = 1, fill = "blue", color = "black") + + labs(x = "Active Years", y = "Count of Wells", title = "Histogram of Well Active Years") + + theme_bw() + +# Calculate the number of active wells per month +active_wells_per_month <- processed_active %>% + group_by(month_year = floor_date(ProductionReportDate, "month")) %>% + summarise(ActiveWells = n_distinct(api_ten_digit)) %>% + ungroup() + +# Plot the number of active wells over time +ggplot(active_wells_per_month, aes(x = month_year, y = ActiveWells)) + + geom_line() + + geom_point() + + labs(x = "Month", y = "Number of Active Wells", title = "Active Wells Over Time") + + theme_minimal() + + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + +### BTU ---------------------------------------- + +# Calculate the total BTU produced per month +total_production_per_month <- processed_active %>% + group_by(month_year = floor_date(ProductionReportDate, "month")) %>% + summarise(TotalBTUProduced = sum(BTUofGasProduced, na.rm = TRUE)) %>% + ungroup() + +# Plot the total BTU of gas produced over time +ggplot(total_production_per_month, aes(x = month_year, y = 
TotalBTUProduced)) + + geom_line(color = "blue") + + geom_point(color = "red") + + labs(x = "Month", y = "Total BTU of Gas Produced", title = "Total Gas Production Over Time") + + theme_minimal() + + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + +### GAS ---------------------------------------- + +# Calculate and print the total Gas Produced for each well +total_gas_per_well <- well_prod_m_processed %>% + group_by(api_ten_digit) %>% + summarise(TotalGasProduced = sum(GasProduced, na.rm = TRUE)) +print(total_gas_per_well) + +# Calculate the total Gas Produced per month +total_gas_production_per_month <- processed_active %>% + group_by(month_year = floor_date(ProductionReportDate, "month")) %>% + summarise(TotalProduced = sum(OilorCondensateProduced + GasProduced + WaterProduced, na.rm = TRUE)) %>% + ungroup() + +# Make sure data is ordered +total_gas_production_per_year <- total_gas_production_per_month %>% + arrange(month_year) + +# Calculate the 1 year rolling average for TotalProduced +total_gas_production_per_year$TotalGasProducedRollingAvg <- rollmean(total_gas_production_per_year$TotalProduced, 12, fill = NA, align = 'right') + +# Plot the total Gas Produced over time with the 1 year rolling average +ggplot(total_gas_production_per_year, aes(x = month_year)) + + geom_line(aes(y = TotalGasProducedRollingAvg), color = "red", linetype = "solid") + + labs(x = "Month", y = "Total Gas Produced (units)", title = "Total Gas Production Over Time (One Year Rolling Average)") + + theme_bw() + + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + + + + + + + + + +