###############################################################################
# Clean and prepare the data for multilevel modeling
###############################################################################

# Read in unsorted data and (approximately) create building index variable

hvs02.unsorted <- read.csv ("hvs02_unsorted.csv")
attach.all (hvs02.unsorted)
n <- nrow (hvs02.unsorted)
sequenceno.unsorted <- sequenceno

x.bldg <- cbind (numunits, stories, cd, extwin4_2, extflr5_2, old, dilap)
bldg.unsorted <- rep (0,n)
bldg.unsorted[1] <- 1
for (i in 2:n){
  bldg.unsorted[i] <- bldg.unsorted[i-1] +
    ifelse (identical (x.bldg[i-1,], x.bldg[i,]) & sum(bldg.unsorted==bldg.unsorted[i-1])<4, 0, 1) 
}
n.bldg <- max(bldg.unsorted)

# How many buildings are there in the dataset with 1,2,3,4 apts?

table (table (bldg.unsorted))
#    1    2    3    4 
# 5405 1580  867 1182

# Read in full (sorted) data set

hvs02.sorted <- read.csv ("hvs02_sorted.csv")
attach.all (hvs02.sorted)
n <- nrow (hvs02.sorted)

# Define variables of interest

y <- rodent2
n <- length (y)
dist <- cd

# Line up the building ID's to the "sequenceno" variable

bldg <- rep (0,n)
for (i in 1:n){
  bldg[i] <- bldg.unsorted[sequenceno.unsorted==sequenceno[i]]
}

# Get district numbers for the buildings

n.bldg <- max(bldg)
bldg.dist <- rep (NA, n.bldg)
for (j in 1:n.bldg){
  dist.j <- unique(dist[bldg==j])
  if (length(dist.j)>1) stop ("not all apts in this bldg are in the same dist")
  bldg.dist[j] <- dist.j
}

###############################################################################
# Define an "apt/bldg defects score" and a "poverty score"
###############################################################################

# Impute missing values for the predictors:  for each, impute avg for other
# apts in the bldg (if available), otherwise impute the avg in the district

impute <- function (a){
  n <- length(a)
  missing <- is.na(a)
  a.imp <- a
  for (i in (1:n)[missing]){
    if (sum (bldg==bldg[i] & !missing) > 0)
      a.imp[i] <- mean (a[bldg==bldg[i]], na.rm=T)
    else
      a.imp[i] <- mean (a[dist==dist[i]], na.rm=T)
  }
  return (a.imp)
}

floor <- impute(unitflr2)
defects <- impute(extwin4_2) + impute(extflr5_2) + impute(intcrack2) +
  impute(inthole2) + impute(intleak2) + impute(intpeel_cat) + impute(dilap) +
  (1-impute(struct))
poor <- impute(poverty) + impute(povertyx2) + impute(board2) +
  impute(subsidy) + impute(ifelse (housing==3, 1, ifelse (housing==4, 2,
                         ifelse (housing==2, 3, ifelse (housing==1, 4, NA)))))

###############################################################################
# Define defects and poverty scores at the district level
###############################################################################

# Define district defects and poverty scores for each apt (no missingness here)

dist.defects.raw <- extwin4_2_Mean + extflr5_2_Mean + inthole2_Mean + intleak2_Mean + dilap_Mean + (1-struct_Mean)
dist.poor.raw <- poverty_Mean + povertyx2_Mean + pubhous_Mean + (1-ownhous_Mean) + board2_Mean

# To get district scores, compute averages over the apartments within each district

group.mean <- function (a, id){
  n.group <- max(id)
  output <- rep (NA, n.group)
  for (j in 1:n.group){
    output[j] <- mean (a[id==j])
  }
  return (output)
}
dist.defects <- group.mean (dist.defects.raw, dist)
dist.poor <- group.mean (dist.poor.raw, dist)

###############################################################################
# Divide the districts into strata
###############################################################################

n.dist <- max (dist)
n.strata <- 10
n.per.stratum <- n.dist/n.strata + .001
stratum <- floor (rank(dist.poor)/n.per.stratum) + 1
dist.renumbered <- rep (NA, n.dist)
for (j in 1:n.strata){
  dist.renumbered[stratum==j] <- 1:sum(stratum==j)
}
dist.id <- dist.renumbered[dist]
dist.stratum <- stratum[dist]

###############################################################################
# Save the full apt-level data into 1 file, "apt_all.dat"
###############################################################################

apt.all.data <- cbind (hvs02.sorted, bldg, dist, dist.stratum, dist.id)
write.table (apt.all.data, "apt_all.dat")
rodents.data <- cbind (hvs02.sorted, bldg)
write.table (rodents.data, "rodents.dat")

###############################################################################
# Save the data into 3 files, "apt.dat", "bldg.dat", and "dist.dat"
###############################################################################

apt.data <- cbind (y, defects, poor, race, floor, dist, bldg)
bldg.data <- cbind (bldg.dist)
dist.data <- cbind (dist.defects, dist.poor)
write.table (apt.data, "apt.dat")
write.table (bldg.data, "bldg.dat")
write.table (dist.data, "dist.dat")

###############################################################################
# To speed the computations (to get started), just work with a random sample
# of 1000 bldgs
###############################################################################

n.keep <- 1000
bldg.sample <- sort (sample (n.bldg, n.keep))
apt.include <- rep (FALSE,n)
for (i in 1:n.keep){
  apt.include[bldg==bldg.sample[i]] <- TRUE
}

write.table (rodents.data[apt.include,], "rodents.dat")

y.subset <- y[apt.include]
defects.subset <- defects[apt.include]
poor.subset <- poor[apt.include]
race.subset <- race[apt.include]
floor.subset <- floor[apt.include]
dist.subset <- dist[apt.include]

# Renumber the buildings (for convenience in later computation)

bldg.include <- bldg[apt.include]
bldg.subset <- rep (NA, sum(apt.include))
for (i in 1:length(bldg.sample)){
  bldg.subset[bldg.include==bldg.sample[i]] <- i
}

###############################################################################
# Save the subset data into 2 files, "apt.subset.dat" and "bldg.subset.dat".
# (No need for "dist.subset.dat" since the subset includes data from all dists
###############################################################################

apt.subset.data <- cbind (y.subset, defects.subset, poor.subset, race.subset, floor.subset, dist.subset, bldg.subset)
bldg.subset.data <- cbind (bldg.dist[bldg.include])
write.table (apt.subset.data, "apt.subset.dat")
write.table (bldg.subset.data, "bldg.subset.dat")

attach.overwrite <- function(a, overwrite=TRUE, name="attach.all"){
#
# Version of the "attach" function that overwrites variables if necessary.
# (The usual "attach" function in R only attaches the variables that are not
# yet defined.)
#
  if (overwrite){
    for (j in 1:length(a)){
      if (names(a)[j] %in% ls(.GlobalEnv))
        remove (list=names(a)[j], envir=.GlobalEnv)
    }
  }
  attach (a, name=name)
}