# Purpose of the program:
# =======================
# This program creates an unbalanced longitudinal data file in long format (one row per person per wave)
# by stacking the combined files from each of the first waves (5 in this example; set by `wave` below).
# The new data file is saved as an R data file (.Rdata).
#
# Updated by: Mossamet Nesa
# Date: 06/11/2025 

# Install the required packages only when they are not already available,
# so the script does not re-download them on every run.
required_pkgs <- c("haven", "tidyverse")
missing_pkgs <- required_pkgs[
  !vapply(required_pkgs, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_pkgs) > 0) {
  if (.Platform$OS.type == "windows") {
    # "wininet" is a Windows-only download method; keep it there as before
    install.packages(missing_pkgs, method = "wininet", dependencies = TRUE)
  } else {
    install.packages(missing_pkgs, dependencies = TRUE)
  }
}


library(haven)
library(tidyverse)


# Run settings -- edit these to match the HILDA release being used.
wave <- 5      # How many wave data files to extract (5 is only an example)
maxwave <- 24  # Latest wave; update when a new wave is released
rls <- 240     # Latest release; update when a new release is issued

# Folder holding the original HILDA data files
origdatdir <- file.path(
  "H:/HILDA",
  paste0("Release ", rls),
  "files",
  paste0("STATA ", rls, "c")
)
# Folder where the new data files are written
newdatadir <- "H:/new-data"


# SECTION 1: Creating an unbalanced dataset (long-format)

# Read each wave's combined file, keep the chosen variables, and stack the
# waves into one long-format data set (`longfile`) with a `wave` column.

# Variables to extract (without the wave-letter prefix).
# Could adjust for personal needs.
var <- c("hwhmhl", "hhrhid", "hhrpid", "hhpxid", "hhresp", "hhstate", "hhsos", "hgint", "ancob", "losathl", "wsce",
         "wscei", "wscef", "wscme", "wscmei", "wscmef", "wscoe", "wscoei", "wscoef")

# Wave prefixes come from `letters`, so `wave` must be a whole number in 1..26.
if (!is.numeric(wave) || length(wave) != 1 || is.na(wave) ||
    wave < 1 || wave > length(letters) || wave != round(wave)) {
  stop("`wave` must be a single whole number between 1 and ",
       length(letters), call. = FALSE)
}

# One slot per wave; the pieces are bound together once after the loop
# instead of re-copying `longfile` on every iteration.
wave_data <- vector("list", wave)

for (i in seq_len(wave)) {
  # Use the full path rather than changing the working directory
  file_list <- file.path(origdatdir, paste0("Combined_", letters[i], rls, "c.dta"))
  if (!file.exists(file_list)) {
    stop("Cannot find HILDA data file: ", file_list, call. = FALSE)
  }
  temp <- read_dta(file_list)
  var_add <- paste0(letters[i], var) # Add wave letter onto the variable names
  # any_of() skips variables that are not included in a specific wave
  # (eg: "hwhmhl" not included in wave 1); bind_rows() below then fills
  # those columns with NA for that wave.
  temp <- temp %>% dplyr::select(xwaveid, any_of(var_add))
  # xwaveid is always the first column after select(); strip the wave letter
  # from every other column name.
  names(temp)[-1] <- substring(names(temp)[-1], 2)
  # temp <- temp %>% mutate(across(everything(), ~ unclass(.x))) # remove value labels to avoid conflicts
  if (!"hgint" %in% names(temp)) {
    stop("Variable '", letters[i], "hgint' was not found in ", file_list,
         "; it is needed to keep only interviewed persons.", call. = FALSE)
  }
  temp$wave <- i
  temp <- temp %>% filter(hgint != 0) # keep only interviewed person
  wave_data[[i]] <- temp
}

longfile <- bind_rows(wave_data) # Append the data files from all waves

# Save new data set
# Write straight into the output folder (creating it if it does not exist)
# instead of changing the working directory.
if (!dir.exists(newdatadir)) {
  dir.create(newdatadir, recursive = TRUE)
}
save(longfile, file = file.path(newdatadir, "long-file-unbalanced.Rdata"))
# write_dta(longfile, file.path(newdatadir, "long-file-unbalanced.dta")) # Could save as a stata data file

