# GSS Tutorial Script (Analysis)
# Felipe A. Osorio
# Created June 18, 2012
# Updated August 09, 2012

# Consider:
# 1. How important is religion to you? /RELITEN /relig /pray
# 2. Correlation with party ID
# 3. for: Whites, Blacks, & Hispanics
# 4. Time series visualization
#    ...3 lines, 3 different colors tracking correlation over time
# 5. simple regression considering demographics vars
#    (e.g. age, urban, education, etc)

options(scipen = 5)

library(survey)
library(foreign)
library(car)

# Project directory; every file path below is built from it with file.path().
gss_dir <- "/Users/lucas/Dropbox/GSS Youtube"
setwd(gss_dir)

# Data ----

gssFullPanel <- read.dta(file.path(gss_dir, "gss2012full.dta"))
gssPanel <- subset(
  gssFullPanel,
  select = c(id, year, sex, age, educ, region, partyid, reliten,
             hispanic, race, ethnic, wtssnr)
)
# summary(gssPanel)

# Survey years in ascending order, independent of the row order of the file.
time <- sort(unique(gssPanel$year))

# Hispanic identification ----
# Two ethnicity indicators based on GSS questions.

# Indicator 1: the HISPANIC item (only avail 2000-2006); unanswered -> 0.
gssPanel$hispanico1 <- ifelse(gssPanel$hispanic != "not hispanic", 1, 0)
gssPanel$hispanico1[is.na(gssPanel$hispanico1)] <- 0

# Indicator 2: the ETHNIC item.
# mexico, puerto rico, spain, filipino, other spanish; (29 = other)
hindex <- c(17, 20, 22, 25, 38)
# NOTE(review): %in% compares a factor by its labels, so this only matches if
# `ethnic` was read as numeric codes rather than labelled levels -- confirm.
# %in% never returns NA, so no NA clean-up is needed for this indicator.
gssPanel$hispanico2 <- as.numeric(gssPanel$ethnic %in% hindex)

# NOTE(review): this relabels the existing levels of `race` positionally, so
# the first existing level becomes "hispanic". Presumably that first level is
# an unused placeholder (e.g. "iap") -- confirm against levels(gssPanel$race).
levels(gssPanel$race) <- c("hispanic", "white", "black", "other")
gssPanel$row.no <- seq_len(nrow(gssPanel))
all.hispanics <- gssPanel$row.no[
  (gssPanel$hispanico1 > 0) | (gssPanel$hispanico2 > 0)
]
gssPanel$race[all.hispanics] <- "hispanic"

# Numeric recodes ----

# quantifying religious importance
levels(gssPanel$reliten) <- c(0, 3, 1, 2, 0, 0, NA)
# unidimensional gradations of party ID
levels(gssPanel$partyid) <- c(-3, -2, -1, 0, 1, 2, 3, NA, 0, NA)

# Convert a factor whose labels are numbers into those numbers.
# as.numeric() on the factor itself would return the internal level codes
# (1, 2, 3, ...), not the labels assigned above.
factor_values <- function(f) {
  as.numeric(as.character(f))
}

dataMat <- data.frame(
  iReligion = factor_values(gssPanel$reliten),
  partyID = factor_values(gssPanel$partyid)
)

# Correlations ----

# RESULTS BY RACE, AGGREGATED OVER TIME
print(by(dataMat, gssPanel$race, cor, use = "complete.obs"))

# Correlation between the two recoded items for one set of row indices.
cell_cor <- function(rows) {
  cor(dataMat$iReligion[rows], dataMat$partyID[rows],
      use = "pairwise.complete.obs")
}

# Race x year matrix of correlations. Cells with no respondents are NA, so
# every race keeps exactly one value per survey year and the series stay
# aligned with the Year column.
corS <- tapply(
  seq_len(nrow(dataMat)),
  list(gssPanel$race, gssPanel$year),
  cell_cor
)

corData <- data.frame(
  Year = as.numeric(colnames(corS)),
  Hispanics = corS["hispanic", ],
  Whites = corS["white", ],
  Blacks = corS["black", ],
  Others = corS["other", ],
  row.names = NULL
)
corData <- subset(corData, Year >= 1974) # No Data for 1972-73 =(

# NOTE(review): load() restores objects written by save(); presumably this
# file holds a previously saved `corData`, which would replace the one built
# above -- confirm that this is intended.
load(file.path(gss_dir, "corData.R"))

# Plots ----
# consider overlaying plots meaningfully?

# Scatter of one group's yearly correlation with a lowess trend line.
#   cor_df: data frame with a `Year` column and one column per group
#   group:  name of the column to plot (also used as the y-axis label)
#   col:    colour of the trend line
plot_trend <- function(cor_df, group, col = par("col")) {
  year <- cor_df[["Year"]]
  r <- cor_df[[group]]
  plot(year, r, xlab = "Year", ylab = group)
  # Smooth only over the years that actually have a correlation.
  ok <- !is.na(year) & !is.na(r)
  if (any(ok)) {
    lines(lowess(year[ok], r[ok]), col = col)
  }
  invisible(NULL)
}

# Draw the four group panels in a fixed order.
plot_all_trends <- function(cor_df) {
  plot_trend(cor_df, "Hispanics", col = "darkgreen")
  plot_trend(cor_df, "Whites", col = "darkred")
  plot_trend(cor_df, "Blacks")
  plot_trend(cor_df, "Others", col = "pink")
  invisible(NULL)
}

# One plot per page ...
plot_all_trends(corData)

# ... then all four on a 2 x 2 grid; the previous layout is restored after.
old_par <- par(mfrow = c(2, 2))
plot_all_trends(corData)
par(old_par)

# Significance ----
# check statistical significance for each correlation coef plotted
# (confidence intervals here; cor.test() also returns p.values)
# note 1970s offer very few data points

plot_years <- time[time >= 1974]
for (year in plot_years) {
  for (race in levels(gssPanel$race)) {
    # Rows of this race/year cell that have both items answered.
    rows <- which(
      (gssPanel$year == year) &
        (gssPanel$race == race) &
        complete.cases(dataMat)
    )
    if (length(rows) < 4) {
      # cor.test() stops below 3 pairs and reports no interval with exactly
      # 3, so report missing bounds instead of aborting the whole loop.
      tempCor <- c(NA, NA)
    } else {
      tempCor <- cor.test(
        dataMat$iReligion[rows],
        dataMat$partyID[rows]
      )$conf.int
    }
    print(c(year, race, tempCor))
  }
}

# account for undersampling of minorities, weigh the covariance matrix
# cov.wt(corData, wt = gssPanel$wtssnr, method = "unbiased", cor = TRUE)