# Purpose of the program:
# =======================
# This program creates an unbalanced and a balanced longitudinal wide data file, using the combined files from each of the first 5 waves.
# The new data files are saved as R data files (.Rdata) in wide format.
# 
# Created by: Mossamet Nesa
# Date: 06/11/2025

# Install the required packages only when they are not already available, so
# that re-running the script does not download and reinstall them every time.
# No download method is forced: "wininet" exists only on Windows, so leaving
# the choice to R's default keeps the script usable on every platform.
for (pkg in c("haven", "tidyverse")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg, dependencies = TRUE)
  }
}

library(haven)     # read_dta() for reading Stata files
library(tidyverse) # dplyr::select(), any_of() and the %>% pipe

# ---- User settings ----
rls <- 240     # Release number used in folder and file names. Update to the latest release.
maxwave <- 24  # Latest wave available. Update to the latest wave.
wave <- 5      # Number of wave data files to extract, counted from wave 1. 5 is an example.

# Location of the original HILDA data files for this release
release_dir <- paste0("H:/HILDA/Release ", rls)
origdatdir <- paste0(release_dir, "/files/STATA ", rls, "c")

# Location where the new data files are written
newdatadir <- "H:/new-data"


# SECTION 1: Creating an unbalanced dataset (wide-format)

# The wave files below (and the master file read afterwards) are opened
# relative to this folder.
setwd(origdatdir)

# Variables to extract, written without the leading wave letter
var <- c("hhrhid", "hhrpid", "hhpxid", "hhresp", "hhstate", "hhsos", "ancob", "losathl", "wsce",
         "wscei", "wscef", "wscme", "wscmei", "wscmef", "wscoe", "wscoei", "wscoef")

# The wave letter is taken from `letters`, so `wave` has to be a single whole
# number between 1 and 26. Checking it here avoids reading files for a
# nonsensical wave (e.g. 1:0 would iterate over c(1, 0)).
if (length(wave) != 1 || is.na(wave) || wave != round(wave) ||
    wave < 1 || wave > length(letters)) {
  stop("`wave` must be a single whole number between 1 and ", length(letters),
       call. = FALSE)
}

# Read each wave and return one data set per wave, holding xwaveid plus the
# requested variables renamed from e.g. "awsce" to "wsce_w1".
wave_data <- lapply(seq_len(wave), function(i) {
  prefix <- letters[i]
  var_add <- paste0(prefix, var) # Add wave letter onto the variable names

  # any_of() skips variables that do not exist in a given wave instead of
  # raising an error; such a variable simply has no column for that wave.
  # col_select asks haven to read only the requested columns rather than the
  # whole combined file. The select() that follows guarantees that xwaveid is
  # the first column, which the renaming below relies on.
  dat <- read_dta(
    paste0("Combined_", prefix, rls, "c.dta"),
    col_select = c(xwaveid, any_of(var_add))
  ) %>%
    dplyr::select(xwaveid, any_of(var_add))

  # Drop the wave letter and append the wave number (e.g. _w1), leaving
  # xwaveid untouched
  names(dat)[-1] <- paste0(substring(names(dat)[-1], 2), "_w", i)
  dat
})

# Full outer join of all waves on xwaveid: a person present in any of the
# waves gets a row, with NA for the waves in which they do not appear.
widefile <- Reduce(
  function(x, y) merge(x, y, by = "xwaveid", all = TRUE),
  wave_data
)

# Use the master file
# File name of the master file: "Master_", the letter of wave `maxwave`, the
# release number and "c.dta". It is opened relative to the current directory.
master_file <- paste0("Master_", letters[maxwave], rls, "c.dta")

# Read the master file and keep only the identifier and ivwptn
# (more variables can be kept by adding them here)
master <- read_dta(master_file) %>%
  dplyr::select(all_of(c("xwaveid", "ivwptn")))

# Left join on xwaveid: every row of the master file is kept, and the wide
# wave data is attached where available. "xwaveid" is the only variable the
# two data sets have in common, so it is the matching key.
final_data <- merge(
  x = master,
  y = widefile,
  by = "xwaveid",
  all.x = TRUE,
  suffixes = c("", "")
)

# Write the unbalanced data set into the output folder
setwd(newdatadir)
save(list = "final_data", file = "wide-file-unbalanced.Rdata")
# write_dta(final_data, "wide-file-unbalanced.dta") # Could save as a stata data file

# SECTION 2: Creating a balanced dataset (wide-format)
# We can use the variable ivwptn which contains the interview pattern for each person.

# load("wide-file-unbalanced.Rdata") # Import the dataset if needed. load() restores the object under its saved name (final_data) and returns only the object names, so do not assign its result. Be aware of current directory.

# Pattern of a person who has been interviewed in each of the first `wave` waves
intvw_pattern <- strrep("X", wave)

# Keep people that have been interviewed in each of the first `wave` waves.
# %in% gives FALSE (never NA) when ivwptn is missing, so such people are
# dropped; with == the NA would instead add a row of all-NA values.
balwide <- final_data[substr(final_data$ivwptn, 1, wave) %in% intvw_pattern, ]

setwd(newdatadir) # Set the directory to save new data file
save(balwide, file = "wide-file-balanced.Rdata")
# write.table(balwide, file = "wide-file-balanced.txt", sep = ",", row.names = FALSE) # Can also save as a txt file
