analyses.Rmd

---
title: "analyses"
date: "`r format(Sys.time(), '%d %B, %Y')`"
output: github_document
---

```{r setup, include=FALSE}
# Echo the source code of every chunk in the knitted output by default.
knitr::opts_chunk$set(echo = TRUE)
```

```{r}
library(tidyverse)
library(sp)
library(sf)
source("R/download-utils.R")
```

```{r}
# Root folder holding the SPSS extracts read below. The network-drive path
# stays the default; set the JTI_ITRAQI_DATA_DIR environment variable to point
# the notebook at a copy of the data stored somewhere else.
data_dir <- Sys.getenv(
  "JTI_ITRAQI_DATA_DIR",
  unset = "U:\\Research\\Projects\\health\\jti_research_data\\jti_itraqi"
)
```

```{r}
# Facility description table. The SPSS file is read once and reused for the
# coordinate lookup below instead of being parsed a second time.
df_facilities <-
  haven::read_spss(file.path(data_dir, "D_Table_FacilityDescription.sav"))

# Facility name with its latitude/longitude columns, renamed.
df_facility_coords <-
  df_facilities %>%
  select(
    facility = FACILITY_NAME_Clean,
    facility_latitude = Latitude,
    facility_longitude = Longitude
  )

# Print the names of the files available in the data folder.
list.files(data_dir)


# Long-format time-and-location records, restricted to rows with SELECT == 1.
df <-
  haven::read_spss(file.path(data_dir, "D_Table_TimeAndLocation_LongFile.sav")) %>%
  filter(SELECT == 1)

# NOTE(review): df2 is reassigned from a different file in a later chunk
# before it is used anywhere visible in this document.
df2 <-
  haven::read_spss(file.path(data_dir, "D_Table_TimeAndLocation_LongFileSELECT.sav"))

# Per pu_id, keep the rows up to and including the first D_DEPART_SCENE row and
# drop every pu_id that has no D_DEPART_SCENE row at all.
# which() ignores NA comparisons, so a missing TimeWayPoint can no longer make
# the condition NA (the previous `if (any(...))` form stops with an error in
# that case). seq_len(0) selects no rows, matching the previous `else 0`.
df_ambulance <-
  df %>%
  group_by(pu_id) %>%
  slice(seq_len(coalesce(which(TimeWayPoint == "D_DEPART_SCENE")[1], 0L))) %>%
  # carry the last non-missing coordinates forward within each pu_id
  fill(X_COORD, Y_COORD) %>%
  ungroup()

# Minutes between D_ON_SCENE and D_DEPART_SCENE, one row per
# pu_id / SourceRecordID after spreading the waypoints into columns.
scene_minutes <-
  df_ambulance %>%
  select(pu_id, SourceRecordID, TimeWayPoint, DateTimePoints) %>%
  pivot_wider(names_from = TimeWayPoint, values_from = DateTimePoints) %>%
  mutate(
    scene_duration_mins =
      as.numeric(difftime(D_DEPART_SCENE, D_ON_SCENE, units = "mins"))
  )

ggplot(scene_minutes, aes(scene_duration_mins)) +
  geom_histogram()


# Minutes between D_DEPART_SCENE and D_AT_DEST, for records that have a
# D_AT_DEST timestamp.
depart_to_dest_minutes <-
  df_ambulance %>%
  select(pu_id, SourceRecordID, TimeWayPoint, DateTimePoints) %>%
  pivot_wider(names_from = TimeWayPoint, values_from = DateTimePoints) %>%
  filter(!is.na(D_AT_DEST)) %>%
  mutate(
    scene_duration_dept_to_dest =
      as.numeric(difftime(D_AT_DEST, D_DEPART_SCENE, units = "mins"))
  )

ggplot(depart_to_dest_minutes, aes(scene_duration_dept_to_dest)) +
  geom_histogram()

# Minutes from D_DEPART_SCENE to D_AT_DEST for each pu_id / SourceRecordID,
# computed from the full (unsliced) long table.
# The result is ungrouped so that later verbs applied to df_amb_time do not
# silently run per group.
df_amb_time <-
  df %>%
  select(pu_id, SourceRecordID, TimeWayPoint, DateTimePoints) %>%
  group_by(pu_id, SourceRecordID) %>%
  filter(TimeWayPoint %in% c("D_AT_DEST", "D_DEPART_SCENE")) %>%
  pivot_wider(names_from = TimeWayPoint, values_from = DateTimePoints) %>%
  mutate(
    scene_duration_dept_to_dest =
      as.numeric(difftime(D_AT_DEST, D_DEPART_SCENE, units = "mins"))
  ) %>%
  ungroup()

# Distribution of the depart-scene to at-destination durations.
df_amb_time %>%
  ggplot(aes(scene_duration_dept_to_dest)) +
  geom_histogram()

```


```{r}
# Waypoints needed to measure the journey: the start (D_RECEIVED) and the
# candidate arrival timestamps, in the order they are coalesced below.
journey_waypoints <- c(
  "D_RECEIVED",
  "D_AT_DEST",
  "D_AT_DEST_CAD",
  "ED_START_DATETIME_Formatted",
  "Hosp_START_DATETIME_Formatted"
)

# Hours from D_RECEIVED to the first non-missing arrival timestamp; rows with
# any missing value are dropped before plotting.
journey_times <-
  df %>%
  select(pu_id, SourceRecordID, TimeWayPoint, DateTimePoints) %>%
  group_by(pu_id) %>%
  filter(TimeWayPoint %in% journey_waypoints) %>%
  pivot_wider(names_from = TimeWayPoint, values_from = DateTimePoints) %>%
  mutate(
    start_time = D_RECEIVED,
    end_time = coalesce(
      D_AT_DEST,
      D_AT_DEST_CAD,
      ED_START_DATETIME_Formatted,
      Hosp_START_DATETIME_Formatted
    ),
    journey_hours = as.numeric(difftime(end_time, start_time, units = "hours"))
  ) %>%
  select(pu_id, SourceRecordID, journey_hours) %>%
  na.omit()

ggplot(journey_times, aes(journey_hours)) +
  geom_histogram()

# TODO: calculate the time from D_ON_SCENE to D_DEPART_SCENE and subtract it
# from journey_hours, because scene time was not considered in iTRAQI.

# Number of rows per TimeWayPoint value, most frequent first.
waypoint_counts <- plyr::count(df, "TimeWayPoint")
arrange(waypoint_counts, desc(freq))


# D_RECEIVED and D_COMPLETED timestamps spread into columns.
received_completed <- filter(
  df,
  TimeWayPoint %in% c("D_RECEIVED", "D_COMPLETED")
)

received_completed %>%
  group_by(pu_id, SourceRecordID) %>%
  pivot_wider(names_from = TimeWayPoint, values_from = DateTimePoints)
```

```{r}
# Print the long-format records.
df

# Replace df2 with the per-location transport summary table.
summary_file <-
  "D_Table_HealthService_Time_UniqueLocation_Transport1stEnc_Summary.sav"
df2 <- haven::read_spss(file.path(data_dir, summary_file))
# df2 <- haven::read_spss(file.path(data_dir, "F_TableFlagsANDOutcomesSELECT.sav"))

```


```{r}
library(sf)

# Queensland outline and the fill colour used for every map in this document.
qld_fill <- "darkolivegreen3"
qld_boundary <- read_sf("input/qld_state_polygon_shp/QLD_STATE_POLYGON_shp.shp")

# pu_ids excluded from the maps because of their coordinates.
bad_coord_pu_ids <- c(1685722)

# D_RECEIVED rows with coordinates, joined to the depart-scene to
# at-destination duration; rows without a duration are dropped.
received_points <-
  df %>%
  filter(!is.na(X_COORD)) %>%
  filter(TimeWayPoint == "D_RECEIVED") %>%
  filter(!pu_id %in% bad_coord_pu_ids) %>%
  left_join(
    select(df_amb_time, pu_id, SourceRecordID, scene_duration_dept_to_dest),
    by = c("pu_id", "SourceRecordID")
  ) %>%
  filter(!is.na(scene_duration_dept_to_dest))

ggplot(received_points) +
  geom_sf(data = qld_boundary, fill = qld_fill, col = "transparent") +
  geom_point(aes(X_COORD, Y_COORD, col = scene_duration_dept_to_dest)) +
  # + geom_text(aes(x=X_COORD, y=Y_COORD, label = pu_id), hjust=1)
  labs(x = "", y = "")

```

D_RECEIVED: the time that ambulance got the call
D_CLEAR: the ambulance is available to be dispatched to other cases (no longer involved in that pu_id)
ED_START_DATETIME_Formatted: patient handed over to ED


check difference between 'Hosp_START_DATETIME_Formatted' and 'ED_START_DATETIME_Formatted'
  - which comes first
  - "arrive at hosp time" = coalesce(ED-start, hosp-start)

we are attempting to replicate iTRAQI by trying to capture first call of ambulance to arrival at hospital (total time for patient retrieval)

compare observed times (DEPART_SCENE to arrival - above) to estimated times using osrm/arcgis
  - does this match up with our idea of the ideal facility?
  - did they take longer than an hour and end up at a major trauma centre
    -> if so, was there a closer, secondary centre?

  
first pass:
  > d_received to 'ed and first facility'
  > d_received to 'ed at secondary facility (first tertiary care/major trauma care - PAH/RBWH/GC/TOWNSVILLE)'
  > use 'D_Table_HealthService_Time_UniqueLocation_Transport1stEnc_Summary.sav' - `DateTimePoints_first`


are those times within the range for iTRAQI for that SA1


```{r}
# Names of the facilities with NeuroSurgMajor == 1; these are treated as the
# tertiary care centres below.
tertiary_care_centres <- pull(
  filter(df_facilities, NeuroSurgMajor == 1),
  FACILITY_NAME_Clean
)


# Per pu_id: keep the locations up to and including the first tertiary care
# centre (every location when none is reached), drop an "AMBULANCE" record
# sitting in second position, then re-flag the second row as first_dest.
df_times_to_care <-
  df2 %>%
  mutate(tertiary_care = FACILITY_NAME_Clean %in% tertiary_care_centres) %>%
  group_by(pu_id) %>%
  arrange(LocationChangeSequence) %>%
  slice(
    if (any(tertiary_care)) {
      seq_len(which.max(tertiary_care))
    } else {
      seq_len(n())
    }
  ) %>%
  mutate(first_dest = row_number() == 2) %>%
  filter(FACILITY_NAME_Clean != "AMBULANCE" | !first_dest) %>%
  # positions shift after the filter, so the flag is computed again
  mutate(first_dest = row_number() == 2) %>%
  ungroup()

# Time to tertiary care, split by the number of locations visited (i.e. by
# whether a non-tertiary care centre came first).
tertiary_times <-
  df_times_to_care %>%
  group_by(pu_id) %>%
  filter(any(tertiary_care)) %>%
  summarize(
    time_to_tertiary_care = as.numeric(
      difftime(
        max(DateTimePoints_first),
        min(DateTimePoints_first),
        units = "hours"
      )
    ),
    n_locs = n()
  )

tertiary_times %>%
  filter(n_locs %in% c(2, 3)) %>%
  ggplot(aes(x = time_to_tertiary_care)) +
  geom_density() +
  facet_wrap(~n_locs) +
  scale_x_continuous(limits = c(0, 5))

# map time to first acute care centre (not necessarily tertiary)

# Coordinates taken from the first row of each pu_id.
first_location_coords <-
  df_times_to_care %>%
  group_by(pu_id) %>%
  slice(1) %>%
  ungroup() %>%
  select(pu_id, x = X_COORD_first, y = Y_COORD_first)

# Hours between the earliest and latest DateTimePoints_first, per pu_id, over
# the rows up to the first acute destination.
# The excluded pu_ids come from bad_coord_pu_ids (defined with the map settings
# in an earlier chunk) rather than a repeated literal id, so both maps apply
# the same exclusions.
df_acute_centre_times <-
  df_times_to_care %>%
  group_by(pu_id) %>%
  # keep up to first acute destination and remove those that don't have this
  # flag included in any records
  slice(
    if (any(!is.na(FirstAcuteDestinationFlag))) {
      seq_len(which.max(!is.na(FirstAcuteDestinationFlag)))
    } else {
      0
    }
  ) %>%
  # summarize(max(!is.na(FirstAcuteDestinationFlag)))
  summarize(
    time_to_acute_care = as.numeric(
      difftime(
        max(DateTimePoints_first),
        min(DateTimePoints_first),
        units = "hours"
      )
    )
  ) %>%
  left_join(first_location_coords, by = "pu_id") %>%
  filter(
    !pu_id %in% bad_coord_pu_ids, # coordinate is maybe in antarctica?
    time_to_acute_care < 5,
    !is.na(x),
    !is.na(y)
  )

# vis of time-to-acute-care points in qld
df_acute_centre_times %>%
  ggplot() +
  geom_sf(data = qld_boundary, fill = qld_fill, col = "transparent") +
  geom_point(aes(x, y, col = time_to_acute_care)) +
  labs(x = "", y = "") +
  theme_bw()


# iTRAQI polygons at SA level 1 with the acute-care range parsed out of the
# popup text.
# acute_range is the bracketed "[...]" part of popup_acute; the minimum is the
# run of digits right after "[" and the maximum the run of digits right before
# "]". Both use str_extract(), which returns one string per row;
# str_extract_all() would return a list column instead of a character vector.
itraqi_layer <-
  readRDS_github("https://raw.githubusercontent.com/RWParsons/iTRAQI_app/main/input/layers/stacked_SA1_and_SA2_polygons_year2016_simplified.rds") %>%
  filter(SA_level == 1) %>%
  mutate(
    acute_range = str_extract(popup_acute, "\\[[0-9].*[0-9]\\]"),
    acute_range_min = str_extract(acute_range, "(?<=\\[)[0-9]+"),
    acute_range_max = str_extract(acute_range, "[0-9]+(?=\\])")
  ) %>%
  select(-acute_range) %>%
  mutate(across(starts_with("acute_range"), as.numeric))


# Turn the x / y columns into an sf point geometry with CRS 4283 in one step,
# without the detour through an sp object.
df_acute_centre_times <- st_as_sf(
  df_acute_centre_times,
  coords = c("x", "y"),
  crs = 4283
)

# Spatially join the observed points onto the iTRAQI polygons, keep the
# polygons that matched a pu_id, and compare the observed minutes with the
# iTRAQI estimate and its range.
itraqi_sa1s <-
  itraqi_layer %>%
  st_join(df_acute_centre_times) %>%
  filter(!is.na(pu_id)) %>%
  select(
    pu_id,
    acute_time_est = value_acute,
    acute_range_min,
    acute_range_max,
    time_to_acute_care
  ) %>%
  mutate(
    time_to_acute_care_mins = time_to_acute_care * 60,
    in_range =
      time_to_acute_care_mins >= acute_range_min &
      time_to_acute_care_mins <= acute_range_max,
    mins_difference = acute_time_est - time_to_acute_care_mins
  )

# How many observed times fall inside / outside the iTRAQI range.
plyr::count(itraqi_sa1s, "in_range")

# Histogram of (iTRAQI estimate - observed minutes), with a reference line at
# zero and axis breaks every 60 minutes.
hour_breaks <- seq(-240, 300, by = 60)

ggplot(itraqi_sa1s, aes(mins_difference)) +
  geom_histogram() +
  geom_vline(xintercept = 0) +
  scale_x_continuous(breaks = hour_breaks) +
  labs(
    title = "mins_difference > 0 means itraqi estimate was greater than observed travel time",
    caption = "acute_time_est - time_to_acute_care_mins"
  ) +
  theme_bw()

# when mins_difference is positive, iTRAQI over estimated travel time.
# graph suggests that we overestimated travel time for rural retrievals but underestimated for metro
# makes sense since we incorporated air travel for acute care but our observed times are only using ambulance?

# The point coordinates (columns X and Y) are paired with pu_id in a tibble
# rather than through cbind(), and the join key is stated explicitly so the
# join does not rely on (or print a message about) guessed common columns.
df_acute_centre_times %>%
  st_coordinates() %>%
  as_tibble() %>%
  mutate(pu_id = df_acute_centre_times$pu_id) %>%
  inner_join(
    itraqi_sa1s %>%
      select(pu_id, mins_difference) %>%
      st_drop_geometry(),
    by = "pu_id"
  ) %>%
  ggplot() +
  geom_sf(data = qld_boundary, fill = qld_fill, col = "transparent") +
  geom_point(aes(X, Y, col = mins_difference), alpha = 0.5) +
  theme_bw() +
  labs(
    caption = "acute_time_est - time_to_acute_care_mins",
    title = "mins_difference > 0 means itraqi estimate was greater\nthan observed travel time\n(acute_time_est - time_to_acute_care_mins)",
    x = "",
    y = ""
  ) +
  scale_color_gradient2()
```

# questions for group
```{r, eval=FALSE}

# What does it mean when someone goes from ambulance to ambulance?
  #> adam reckons a non-transport case or a private facility
# Count the pu_ids whose second location (after trimming at the first tertiary
# care centre) is "AMBULANCE".
ambulance_second_ids <-
  df2 %>%
  mutate(tertiary_care = FACILITY_NAME_Clean %in% tertiary_care_centres) %>%
  group_by(pu_id) %>%
  arrange(LocationChangeSequence) %>%
  slice(
    if (any(tertiary_care)) {
      seq_len(which.max(tertiary_care))
    } else {
      seq_len(n())
    }
  ) %>%
  mutate(first_dest = row_number() == 2) %>%
  filter(first_dest & FACILITY_NAME_Clean == "AMBULANCE") %>%
  pull(pu_id)

length(unique(ambulance_second_ids))

# One example pu_id, in the summary table and in the long table.
filter(df2, pu_id == 294328)


filter(df, pu_id == 294328)


# events where non-ambulance destination was first
df_times_to_care %>%
  group_by(pu_id) %>%
  filter(FACILITY_NAME_Clean != "AMBULANCE") %>%
  slice(1) %>%
  filter(is.na(FirstAcuteDestinationFlag))

# records where they seem to have not had an AMBULANCE journey in df_times_to_care
no_ambulance_ids <- c(528132, 2032182)
filter(df_times_to_care, pu_id %in% no_ambulance_ids)

# in the larger record, it seems that the first hospital visit could have been a separate event from the reason that they are in this dataset (ambulance and then another centre afterwards)
filter(df, pu_id %in% no_ambulance_ids)


# check for multiple-step-uppers before getting to tertiary care centre
# pu_ids that reached tertiary care with exactly four locations kept; their
# rows in df2 are opened in the data viewer.
four_location_ids <-
  df_times_to_care %>%
  group_by(pu_id) %>%
  filter(any(tertiary_care)) %>%
  summarize(
    time_to_tertiary_care = as.numeric(
      difftime(
        max(DateTimePoints_first),
        min(DateTimePoints_first),
        units = "hours"
      )
    ),
    n_locs = n()
  ) %>%
  filter(n_locs == 4) %>%
  pull(pu_id)

df2 %>%
  filter(pu_id %in% four_location_ids) %>%
  View()
```
# CHATS WITH ADAM


variable coalescing
	> D_AT_DEST = Coalesce(D_AT_DEST, D_AT_DEST_CAD, D_TRIAGE, D_OFF_STRETCHER, ED_START_TIME)
	> D_DEPART_SCENE = COALESCE(D_DEPART_SCENE, D_NOTIFY, D_LOADED)
	
	> for all, if there are two records of same TimeWayPoint across different SourceRecordID's, keep the earliest time


times that we need
	> PRE_HOSP_TIME = RESPONSE_TIME + TRAVEL_TIME
		> RESPONSE_TIME = D_RECEIVED to D_ON_SCENE
		> SCENE_TIME = D_ON_SCENE to D_DEPART_SCENE
		> TRAVEL_TIME = D_DEPART_SCENE to D_AT_DEST
	
	> ___
		> FIRST_FACILITY_TIME = ED_START_DATETIME_Formatted - ED_End_DATETIME_Formatted
	
	
	> time at initial facility before step up
		> initial HOSP_START to HOSP_END that is followed by HOSP_START at a higher level facility
		> we want to know whether it was a true step up (if they spend a month at first facility then the step up was not likely due to acute care of the TBI)
		
	> final centre is the highest level of centre that they receive care at (rehab centre that follows is not included in acute time)


```{r}
# remove duplicate TimeWayPoint's (potentially from repetition across SourceRecord's)
#
# Within each pu_id, rows are ordered by DateTimePoints and a row is flagged
# (rep_row = 1) when its TimeWayPoint equals that of the row immediately before
# it; flagged rows are dropped. Only adjacent repeats are removed: a repeated
# TimeWayPoint separated by other waypoints is kept.
# The grouped lag() comparison replaces the per-group row loop and the
# do.call("rbind") reassembly. A missing TimeWayPoint is treated as "not a
# repeat" instead of stopping the loop with an error.
df_deduped <-
  df %>%
  # filter(pu_id %in% c(293, 264086)) %>%
  select(pu_id, SourceRecord, TimeWayPoint, DateTimePoints) %>%
  arrange(pu_id, DateTimePoints) %>%
  group_by(pu_id) %>%
  mutate(
    rep_row = as.numeric(
      coalesce(TimeWayPoint == dplyr::lag(TimeWayPoint), FALSE)
    )
  ) %>%
  ungroup() %>%
  filter(rep_row != 1)

# Wide view of one pu_id after de-duplication.
# find that there is a duplication on D_DISPATCH but the time is not adjacent to the first so it's not removed!
df_deduped %>%
  # select(-c(SourceRecord, rep_row)) %>%
  filter(pu_id == 2498) %>%
  pivot_wider(names_from = TimeWayPoint, values_from = DateTimePoints)
  # mutate(
  #   D_AT_DEST = Coalesce(D_AT_DEST, D_AT_DEST_CAD, D_TRIAGE, D_OFF_STRETCHER, ED_START_TIME),
  #   D_DEPART_SCENE = COALESCE(D_DEPART_SCENE, D_NOTIFY, D_LOADED)
  # )

# The four waypoint columns, shared by the inspections below.
df_waypoints <- select(df, pu_id, SourceRecord, TimeWayPoint, DateTimePoints)

# pu_ids whose time-ordered sequence of D_RECEIVED / ED start waypoints
# contains D_RECEIVED, then ED start, then D_RECEIVED again.
df_waypoints %>%
  filter(TimeWayPoint %in% c("D_RECEIVED", "ED_START_DATETIME_Formatted")) %>%
  group_by(pu_id) %>%
  arrange(DateTimePoints) %>%
  mutate(txt = paste0(TimeWayPoint, collapse = "")) %>%
  filter(str_detect(txt, "D_RECEIVEDED_START_DATETIME_FormattedD_RECEIVED"))

# All waypoints of pu_id 2498 in time order.
df_waypoints %>%
  filter(pu_id == 2498) %>%
  arrange(DateTimePoints)

# All waypoints of pu_id 1247302, opened in the data viewer.
df_waypoints %>%
  filter(pu_id == 1247302) %>%
  View()
```