DPI R Bootcamp
Jared Knowles
In this lesson we hope to learn about:
load("data/Student_Attributes.rda")
head(stuatt[, 1:4], 7)
sid school_year male race_ethnicity
1 1 2004 1 B
2 1 2005 1 H
3 1 2006 1 H
4 1 2007 1 H
5 2 2006 0 W
6 2 2007 0 B
7 3 2006 1 H
[...] first_9th_school_year_reported variable
stuatt$first_9th_year_reported <- NULL
[...] NULL, another R quirk
length(unique(stuatt$sid))
[1] 21803
length(unique(stuatt$sid, stuatt$male))
[1] 21806
# testuniqueness: does every id carry exactly one value of `group`?
#
#   id    - vector of identifiers, one element per record
#   group - vector of the attribute to check, same length as `id`
#
# Returns TRUE when no id is paired with more than one distinct `group`
# value, FALSE otherwise. Stops if the two vectors differ in length (for
# example when a misspelled column name evaluates to NULL), because the
# comparison would be meaningless.
#
# Why not unique(id, group)? The second argument of unique() is
# `incomparables`, not a second key: every id value that also occurs in
# `group` is then never treated as a duplicate, so the count is inflated
# instead of counting id/group combinations.
testuniqueness <- function(id, group) {
  if (length(id) != length(group)) {
    stop("`id` and `group` must have the same length", call. = FALSE)
  }
  if (length(id) == 0L) {
    return(TRUE)
  }
  # One row per distinct (id, group) combination.
  pairs <- unique(data.frame(id = id, group = group, stringsAsFactors = FALSE))
  # An id that still appears twice has two different group values.
  anyDuplicated(pairs$id) == 0L
}
testuniqueness(stuatt$sid, stuatt$male)
[1] FALSE
testuniqueness(stuatt$sid, stuatt$race_ethnicity)
[1] FALSE
testuniqueness(stuatt$sid, stuatt$birth_date)
[1] FALSE
stuatt[17:21, 1:3]
sid school_year male
17 7 2004 1
18 7 2005 1
19 7 2006 1
20 7 2007 0
21 7 2008 1
[...] plyr strategy we learned in Tutorial 3
library(plyr)
sturow <- ddply(stuatt, .(sid), summarize, nvals_gender = length(unique(male)))
table(sturow$nvals_gender)
1 2
21799 4
# A function to find the most frequent value
library(eeptools)
sturow <- ddply(stuatt, .(sid), summarize, nvals_gender = length(unique(male)),
gender_mode = statamode(male), gender_recent = tail(male, 1))
head(sturow[7:10, ])
sid nvals_gender gender_mode gender_recent
7 7 2 1 1
8 8 1 1 1
9 9 1 1 1
10 10 1 1 1
[...] stuatt and sturow, and we need to replace some values from stuatt with some values from sturow
merge to the rescue!
merge our two data objects into a temporary data object called tempdf
tempdf <- merge(stuatt, sturow) # R finds the linking variable already
head(tempdf[17:21, c(1, 2, 3, 10, 11)])
sid school_year male nvals_gender gender_mode
17 7 2004 1 2 1
18 7 2005 1 2 1
19 7 2006 1 2 1
20 7 2007 0 2 1
21 7 2008 1 2 1
print(subset(tempdf[, c(1, 2, 3, 10, 11)], sid == 12506))
sid school_year male nvals_gender gender_mode
50064 12506 2004 1 2 .
50065 12506 2005 0 2 .
print(subset(tempdf[, c(1, 2, 3, 10, 11, 12)], sid == 12506))
sid school_year male nvals_gender gender_mode gender_recent
50064 12506 2004 1 2 . 0
50065 12506 2005 0 2 . 0
[...] gender_recent variable when there is not a value of gender_mode that is valid
recoding our variable!
[...] tempdf$male to be the same as tempdf$gender_mode
[...] tempdf$male is now a "." indicating no modal category exists, we assign the value of tempdf$gender_recent to tempdf$male
[...] testuniqueness(tempdf$sid, tempdf$male) to check if it worked
tempdf$male <- tempdf$gender_mode
# Rows whose modal gender is "." take the most recent reported value instead.
tempdf$male[tempdf$male == "."] <- tempdf$gender_recent[tempdf$male == "."]
# we have to put the filter on both sides of the assignment operator
# Check against the student key `sid`. The printed columns of tempdf show no
# `id` column, so `tempdf$id` would evaluate to NULL and the uniqueness check
# would compare nothing and pass vacuously.
testuniqueness(tempdf$sid, tempdf$male)
[1] TRUE
rm(sturow)
stuatt <- tempdf
stuatt$nvals_gender <- NULL
stuatt$gender_mode <- NULL
stuatt$gender_recent <- NULL
# or just run stuatt<-tempdf[,1:9]
rm(tempdf)
[...] race_ethnicity variable to numeric and add labels to it
[...] factor variable type like R can, and rely on numeric coding schemes
[...] male variable as a factor with values M and F
summary(stuatt$race_ethnicity)
A B H M/O W NA's
7303 25321 30444 2809 20528 1129
length(stuatt$race_ethnicity[is.na(stuatt$race_ethnicity)])
stuatt$race_ethnicity[is.na(stuatt$race_ethnicity)] <- "AI"
summary(stuatt$race_ethnicity)
length(stuatt$race_ethnicity[is.na(stuatt$race_ethnicity)])
[1] 1129
stuatt$race_ethnicity <- as.character(stuatt$race_ethnicity)
stuatt$race_ethnicity[is.na(stuatt$race_ethnicity)] <- "AI"
stuatt$race_ethnicity <- factor(stuatt$race_ethnicity)
summary(stuatt$race_ethnicity)
A AI B H M/O W
7303 1129 25321 30444 2809 20528
stuatt[7:9, c("sid", "school_year", "race_ethnicity")]
sid school_year race_ethnicity
7 3 2006 H
8 3 2006 B
9 3 2007 B
# Resolve conflicting race_ethnicity values reported within one student-year.
# First count, for every (sid, school_year) group, how many distinct
# race_ethnicity values were reported (nvals_race) and how many of the
# group's rows are coded "H" (tmphispanic).
nvals <- ddply(stuatt, .(sid, school_year), summarize, nvals_race = length(unique(race_ethnicity)),
tmphispanic = length(which(race_ethnicity == "H")))
# Attach the counts to every row. No `by` is given, so merge() joins on the
# column names the two data frames share.
tempdf <- merge(stuatt, nvals)
# Clean up the helper table now that its columns live in tempdf
rm(nvals)
# Recode race_ethnicity via a working copy (race2):
#   more than one value in the year and exactly one "H" row -> "H"
#   more than one value in the year otherwise                -> "M/O"
# NOTE(review): tmphispanic is a row count, not a flag, so a student-year with
# two or more "H" rows plus another value is recoded "M/O" by the second line.
# Confirm whether `>= 1` was intended.
tempdf$race2 <- tempdf$race_ethnicity
tempdf$race2[tempdf$nvals_race > 1 & tempdf$tmphispanic == 1] <- "H"
tempdf$race2[tempdf$nvals_race > 1 & tempdf$tmphispanic != 1] <- "M/O"
tempdf$race_ethnicity <- tempdf$race2
# Clean up by removing the working copy and the two count columns
tempdf$race2 <- NULL
tempdf$nvals_race <- NULL
tempdf$tmphispanic <- NULL
# Re-sort the result by student and year
tempdf <- tempdf[order(tempdf$sid, tempdf$school_year), ]
sid school_year race_ethnicity
56201 3 2006 H
56202 3 2006 H
81064 8552 2005 W
81065 8552 2006 M/O
81066 8552 2006 M/O
6162 11382 2005 H
6163 11382 2005 H
6164 11382 2006 H
sid school_year race_ethnicity
7 3 2006 H
8 3 2006 B
34290 8552 2005 W
34291 8552 2006 A
34292 8552 2006 W
45674 11382 2005 H
45675 11382 2005 M/O
45676 11382 2006 H
stuatt <- tempdf
rm(tempdf)
# Stupid hack workaround of ddply bug when running too many of these
# sequentially
# ddply_race: count the distinct race_ethnicity values in each (y, z) group.
#
#   x    - data frame containing a race_ethnicity column
#   y, z - grouping vectors handed to ddply; the call site passes the sid and
#          school_year columns of x
#
# Returns a data frame with the columns nvals_race, sid and school_year.
ddply_race <- function(x, y, z) {
  # Per-group worker: number of distinct values in the requested column.
  count_distinct <- function(piece, column) {
    c(nvals_race = length(unique(piece[, column])))
  }
  counts <- ddply(x, .(y, z), .fun = count_distinct, "race_ethnicity")
  # ddply labels the grouping columns y and z; republish them as sid and
  # school_year (appended after nvals_race) and drop the originals.
  counts$sid <- counts$y
  counts$school_year <- counts$z
  counts[c("y", "z")] <- NULL
  counts
}
nvals <- ddply_race(stuatt, stuatt$sid, stuatt$school_year)
tempdf <- merge(stuatt, nvals)
tempdf$temp_ishispanic <- NA
tempdf$temp_ishispanic[tempdf$race_ethnicity == "H" & tempdf$nvals_race > 1] <- 1
head(stuatt[, c("sid", "school_year", "race_ethnicity")])
sid school_year race_ethnicity
1 1 2004 B
2 1 2005 H
3 1 2006 H
4 1 2007 H
44618 2 2006 W
44619 2 2007 B
# Build a one-row-per-student summary of race_ethnicity:
#   var_temp         - statamode() of the student's values
#   nvals            - number of distinct values reported
#   most_recent_year - largest school_year on record for the student
#   most_recent_var  - value in the student's last row (tail), which is the
#                      most recent one only if rows are sorted by school_year
tempdf <- ddply(stuatt, .(sid), summarize, var_temp = statamode(race_ethnicity),
nvals = length(unique(race_ethnicity)), most_recent_year = max(school_year),
most_recent_var = tail(race_ethnicity, 1))
# Where statamode() produced something other than ".", use it as race2.
tempdf$race2[tempdf$var_temp != "."] <- tempdf$var_temp[tempdf$var_temp != "."]
# Where var_temp is ".", fall back to the last-row value; paste() converts it
# to character.
tempdf$race2[tempdf$var_temp == "."] <- paste(tempdf$most_recent_var[tempdf$var_temp ==
"."])
# Attach the per-student columns to every row of stuatt. No `by` is given, so
# merge() joins on the column names the two data frames share.
tempdf <- merge(stuatt, tempdf)
head(tempdf[, c(1, 2, 4, 14)], 7)
sid school_year race_ethnicity race2
1 1 2004 B H
2 1 2005 H H
3 1 2006 H H
4 1 2007 H H
5 2 2006 W B
6 2 2007 B B
7 3 2006 H H
# [Slide text fused onto this line during extraction; its beginning is
# missing:] ... summarize in the ddply call in this situation

# task1: attach one resolved value of a time-varying attribute to every row.
#
#   df   - a data frame, or (for backward compatibility) the name of one as a
#          single string
#   id   - student identifier: a column name of df, or that column itself
#   year - school year: a column name of df, or that column itself
#   var  - attribute to resolve: a column name of df, or that column itself
#
# For each id the following columns are computed and merged back onto df:
#   var_temp         - statamode() of var; "." marks "no modal category"
#   nvals            - number of distinct values of var
#   most_recent_year - max() of year
#   most_recent_var  - var in the id's last row (so df must be sorted by year)
#   var2             - var_temp, or most_recent_var when var_temp is "." or
#                      NA; always character
#
# Returns merge(df, summary, by = <id column>).
#
# The earlier version pasted its arguments into source text and ran it with
# eval(parse()). That only worked when every argument was a string, and broke
# when called with the objects themselves, e.g.
# task1(stuatt, stuatt$sid, stuatt$school_year, stuatt$race_ethnicity).
# statamode() must be available to the caller (the file attaches eeptools).
task1 <- function(df, id, year, var) {
  if (is.character(df) && length(df) == 1L) {
    df <- get(df, envir = parent.frame())
  }
  stopifnot(is.data.frame(df))

  # Map an argument to a column name of df: either it already is one, or it
  # is the column's contents and we look up which column it came from.
  column_name <- function(arg, label) {
    if (is.character(arg) && length(arg) == 1L && arg %in% names(df)) {
      return(arg)
    }
    hit <- names(df)[vapply(df, identical, logical(1), arg)]
    if (length(hit) == 0L) {
      stop("`", label, "` must be a column name of `df` or one of its columns",
           call. = FALSE)
    }
    hit[1]
  }
  id_col <- column_name(id, "id")
  year_col <- column_name(year, "year")
  var_col <- column_name(var, "var")

  # One summary row per id; stringsAsFactors = FALSE keeps text as text.
  per_id <- plyr::ddply(df, id_col, function(piece) {
    values <- piece[[var_col]]
    data.frame(
      var_temp = statamode(values),
      nvals = length(unique(values)),
      most_recent_year = max(piece[[year_col]]),
      most_recent_var = tail(values, 1),
      stringsAsFactors = FALSE
    )
  })

  # Prefer the mode; fall back to the most recent value when there is none.
  # as.character() on both sides avoids copying factor codes instead of labels.
  mode_label <- as.character(per_id$var_temp)
  latest_label <- as.character(per_id$most_recent_var)
  no_mode <- is.na(mode_label) | mode_label == "."
  per_id$var2 <- ifelse(no_mode, latest_label, mode_label)

  merge(df, per_id, by = id_col)
}
# Note data must be sorted
tempdf <- task1(stuatt, stuatt$sid, stuatt$school_year, stuatt$race_ethnicity)
It is good to include the session info, e.g. this document is produced with knitr version 0.9.6. Here is my session info:
print(sessionInfo(), locale = FALSE)
R version 2.15.2 (2012-10-26)
Platform: x86_64-w64-mingw32/x64 (64-bit)
attached base packages:
[1] grid stats graphics grDevices utils datasets methods
[8] base
other attached packages:
[1] sandwich_2.2-9 quantreg_4.94 SparseM_0.96 gridExtra_0.9.1
[5] mgcv_1.7-22 eeptools_0.1 mapproj_1.2-0 maps_2.3-0
[9] proto_0.3-10 plyr_1.8 stringr_0.6.2 ggplot2_0.9.3
[13] lmtest_0.9-30 zoo_1.7-9 knitr_0.9.6
loaded via a namespace (and not attached):
[1] codetools_0.2-8 colorspace_1.2-0 dichromat_1.2-4
[4] digest_0.6.0 evaluate_0.4.3 formatR_0.7
[7] gtable_0.1.2 labeling_0.1 lattice_0.20-10
[10] MASS_7.3-22 Matrix_1.0-10 munsell_0.4
[13] nlme_3.1-106 RColorBrewer_1.0-5 reshape2_1.2.2
[16] scales_0.2.3 tools_2.15.2
This work (R Tutorial for Education, by Jared E. Knowles), in service of the Wisconsin Department of Public Instruction, is free of known copyright restrictions.