A common problem in health registry research is to collapse overlapping hospital stays to a single stay while retaining all information registered for the patient. Let’s start with looking at some example data:
pat_id <- c(1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4,4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7) hosp_stay_id <- 1:44 enter_date <- as.Date(c(19324, 19363, 19375, 19380, 19356, 19359, 19362,19368, 19369, 19373, 19375, 19376, 19382, 19423, 19423, 19425, 19429, 19373, 19395, 19403, 19437, 19321, 19422, 19437, 19438, 19443, 19444, 19445, 19454, 19454, 19458, 19459, 19460, 19464, 19467, 19468, 19510, 19510, 19511, 19511, 19360, 19397, 19432, 19439), origin="1970-01-01") exit_date <- as.Date(c(19380, 19363, 19375, 19380, 19359, 19382, 19362, 19368, 19369, 19373, 19375, 19376, 19382, 19423, 19429, 19425, 19507, 47117, 19395, 19403, 19437, 19445, 19422, 19437, 19438, 19443, 19444, 19445, 19454, 19468, 19458, 19459, NA, 19464, 19467, 19468, 19510, 19511, 19511, 19513, 19450, 19397, 19432, 19439), origin="1970-01-01") example_data <- data.frame(pat_id,hosp_stay_id, enter_date,exit_date)
library(data.table) setDT(example_data) # The code below will run but give strange results with missing data in exit date. Missing in exit date usually means patients are still hospitalized, and we could replace the missing date with the 31st December of the reporting year. Let's just exclude this entry for now: example_data <- example_data[!is.na(exit_date)] # Then order the datatable by patient id, enter date and exit date: setorder(example_data,pat_id,enter_date,exit_date) # We need a unique identifier per group of overlapping hospital stays. # Let the magic begin! example_data[, group_id:=cumsum( cummax(shift(as.integer(enter_date), fill=as.integer(exit_date)[1])) < as.integer(enter_date)) + 1, by=pat_id] # The group id is now unique per patient and group of overlapping stays # Let's turn it make it unique for each group of overlapping stays over the entire dataset: example_data[,group_id := ifelse(seq(.N)==1,1,0), by=.(pat_id,group_id) ][, group_id := cumsum(group_id)] # Let's make our example data a little prettier and easier to read by changing the column order: setcolorder(example_data, c("pat_id", "hosp_stay_id","group_id")) # Ready!Now we can conduct our analyses.