DPI R Bootcamp
Jared Knowles
In this lesson we hope to learn about:
# Load the saved R data file; the statements below expect it to supply `stuatt`
load("data/Student_Attributes.rda")
# Show the first 7 rows of the first four columns
head(stuatt[, 1:4], 7)
## sid school_year male race_ethnicity
## 1 1 2004 1 B
## 2 1 2005 1 H
## 3 1 2006 1 H
## 4 1 2007 1 H
## 5 2 2006 0 W
## 6 2 2007 0 B
## 7 3 2006 1 H
Drop the first_9th_school_year_reported variable:
stuatt$first_9th_year_reported <- NULL
A column is removed by assigning it NULL, another R quirk.
length(unique(stuatt$sid))
## [1] 21803
# Count the distinct (sid, male) combinations. unique(x, y) does not combine
# two vectors: its second argument is `incomparables`, so every sid equal to a
# value of `male` was kept once per row. The pairs are taken from the two
# columns of the data frame instead.
# Expected count: 21803 students, 4 of whom have two recorded values.
nrow(unique(stuatt[, c("sid", "male")]))
## [1] 21807
# Test whether `group` is constant within every `id`.
#
# id:    vector of identifiers, one element per record.
# group: vector of the same length holding the attribute to check.
#
# Returns TRUE when every distinct id is paired with exactly one distinct
# value of group, and FALSE otherwise. Empty input is trivially consistent.
# Stops with an error when the two vectors differ in length.
#
# The (id, group) pairs are counted on a two-column data frame. Calling
# unique(id, group) is not equivalent: the second argument of unique() is
# `incomparables`, which made the result depend on whether id values happened
# to occur in group.
testuniqueness <- function(id, group) {
  stopifnot(length(id) == length(group))
  if (length(id) == 0L) {
    return(TRUE)
  }
  pairs <- data.frame(id = id, group = group, stringsAsFactors = FALSE)
  length(unique(id)) == sum(!duplicated(pairs))
}
# Apply the check to three attributes, each paired with the student id
testuniqueness(stuatt$sid, stuatt$male)
## [1] FALSE
testuniqueness(stuatt$sid, stuatt$race_ethnicity)
## [1] FALSE
testuniqueness(stuatt$sid, stuatt$birth_date)
## [1] FALSE
# Rows 17-21 (all sid 7 in the output below) include one year where male differs
stuatt[17:21, 1:3]
## sid school_year male
## 17 7 2004 1
## 18 7 2005 1
## 19 7 2006 1
## 20 7 2007 0
## 21 7 2008 1
Use the plyr strategy we learned in Tutorial 3.
library(plyr)
# Per student (sid), count how many distinct values of male were recorded
sturow <- ddply(stuatt, .(sid), summarize, nvals_gender = length(unique(male)))
# Distribution of that count across students
table(sturow$nvals_gender)
##
## 1 2
## 21799 4
# A function to find the most frequent value
library(eeptools)
# Rebuild sturow with two candidate replacement values per student:
# gender_mode from statamode(male), and gender_recent as the last value in
# the student's rows (the latest year only if the rows are in year order --
# TODO confirm the sort order of stuatt)
sturow <- ddply(stuatt, .(sid), summarize, nvals_gender = length(unique(male)),
gender_mode = statamode(male), gender_recent = tail(male, 1))
# head() of a four-row selection prints all four rows
head(sturow[7:10, ])
## sid nvals_gender gender_mode gender_recent
## 7 7 2 1 1
## 8 8 1 1 1
## 9 9 1 1 1
## 10 10 1 1 1
Now we have two data objects, stuatt and sturow, and we need to replace some values from stuatt with some values from sturow.
merge to the rescue!
Let's merge our two data objects into a temporary data object called tempdf.
tempdf <- merge(stuatt, sturow) # R finds the linking variable already
# Columns 10 and 11 are nvals_gender and gender_mode (see the header printed below)
head(tempdf[17:21, c(1, 2, 3, 10, 11)])
## sid school_year male nvals_gender gender_mode
## 17 7 2004 1 2 1
## 18 7 2005 1 2 1
## 19 7 2006 1 2 1
## 20 7 2007 0 2 1
## 21 7 2008 1 2 1
# Student 12506 has one record of each value, and gender_mode prints as "."
print(subset(tempdf[, c(1, 2, 3, 10, 11)], sid == 12506))
## sid school_year male nvals_gender gender_mode
## 50064 12506 2004 1 2 .
## 50065 12506 2005 0 2 .
# Same student with gender_recent (column 12) included
print(subset(tempdf[, c(1, 2, 3, 10, 11, 12)], sid == 12506))
## sid school_year male nvals_gender gender_mode gender_recent
## 50064 12506 2004 1 2 . 0
## 50065 12506 2005 0 2 . 0
# We use the gender_recent variable when there is no gender_mode that is valid.
# Recoding our variable:
#   1. Set tempdf$male to be the same as tempdf$gender_mode.
#   2. Where tempdf$male is now a "." (no modal category exists), assign
#      tempdf$gender_recent to tempdf$male.
#   3. Run testuniqueness(tempdf$sid, tempdf$male) to check if it worked.
tempdf$male <- tempdf$gender_mode
tempdf$male[tempdf$male == "."] <- tempdf$gender_recent[tempdf$male == "."]
# we have to put the filter on both sides of the assignment operator
# The key column is `sid`. `tempdf$id` does not exist and evaluates to NULL,
# so the check never looked at the student ids.
testuniqueness(tempdf$sid, tempdf$male)
## [1] TRUE
rm(sturow)
# Keep the recoded data and drop the helper columns added by the merge
stuatt <- tempdf
stuatt$nvals_gender <- NULL
stuatt$gender_mode <- NULL
stuatt$gender_recent <- NULL
# or just run stuatt<-tempdf[,1:9]
rm(tempdf)
Convert the race_ethnicity variable to numeric and add labels to it.
[...] factor variable type like R can, and rely on numeric coding schemes.
Recode the male variable as a factor with values M and F.
summary(stuatt$race_ethnicity)
## A B H M/O W NA's
## 7303 25321 30444 2809 20528 1129
# Number of records with a missing race_ethnicity
length(stuatt$race_ethnicity[is.na(stuatt$race_ethnicity)])
# First attempt: assign "AI" directly. The summary above lists no "AI"
# category, and the count printed below is still 1129 after this line.
stuatt$race_ethnicity[is.na(stuatt$race_ethnicity)] <- "AI"
summary(stuatt$race_ethnicity)
length(stuatt$race_ethnicity[is.na(stuatt$race_ethnicity)])
## [1] 1129
# Second attempt: convert to character, fill the missing values, then turn the
# column back into a factor so that "AI" becomes one of its categories
stuatt$race_ethnicity <- as.character(stuatt$race_ethnicity)
stuatt$race_ethnicity[is.na(stuatt$race_ethnicity)] <- "AI"
stuatt$race_ethnicity <- factor(stuatt$race_ethnicity)
summary(stuatt$race_ethnicity)
## A AI B H M/O W
## 7303 1129 25321 30444 2809 20528
# Student 3 has two different values recorded for 2006 (see output below)
stuatt[7:9, c("sid", "school_year", "race_ethnicity")]
## sid school_year race_ethnicity
## 7 3 2006 H
## 8 3 2006 B
## 9 3 2007 B
# For every student-year, count the distinct race values reported and the
# number of records coded Hispanic ("H").
nvals <- ddply(stuatt, .(sid, school_year), summarize,
  nvals_race = length(unique(race_ethnicity)),
  tmphispanic = length(which(race_ethnicity == "H"))
)
# merge() joins on the columns the two objects share (sid and school_year)
tempdf <- merge(stuatt, nvals)
# Clean up
rm(nvals)
# Recode race_ethnicity where a student-year has conflicting values: "H" if
# any record in that year is Hispanic, "M/O" otherwise.
# tmphispanic is a count, not a flag, so it is compared against zero. Testing
# `== 1` sent student-years with two or more "H" records to "M/O".
tempdf$race2 <- tempdf$race_ethnicity
tempdf$race2[tempdf$nvals_race > 1 & tempdf$tmphispanic > 0] <- "H"
tempdf$race2[tempdf$nvals_race > 1 & tempdf$tmphispanic == 0] <- "M/O"
tempdf$race_ethnicity <- tempdf$race2
# Clean up by removing old variables
tempdf$race2 <- NULL
tempdf$nvals_race <- NULL
tempdf$tmphispanic <- NULL
# Resort our result
tempdf <- tempdf[order(tempdf$sid, tempdf$school_year), ]
## sid school_year race_ethnicity
## 56201 3 2006 H
## 56202 3 2006 H
## 81064 8552 2005 W
## 81065 8552 2006 M/O
## 81066 8552 2006 M/O
## 6162 11382 2005 H
## 6163 11382 2005 H
## 6164 11382 2006 H
## sid school_year race_ethnicity
## 7 3 2006 H
## 8 3 2006 B
## 34290 8552 2005 W
## 34291 8552 2006 A
## 34292 8552 2006 W
## 45674 11382 2005 H
## 45675 11382 2005 M/O
## 45676 11382 2006 H
# Make the recoded copy the working data set and remove the temporary object
stuatt <- tempdf
rm(tempdf)
# Stupid hack workaround of ddply bug when running too many of these
# sequentially
# Count the distinct race_ethnicity values within each (y, z) group of x.
#
# x: data frame containing a race_ethnicity column.
# y: grouping vector; returned as column `sid`.
# z: grouping vector; returned as column `school_year`.
#
# Returns a data frame with the columns nvals_race, sid and school_year.
ddply_race <- function(x, y, z) {
  # Per-group worker: number of distinct values in the named column
  count_distinct <- function(piece, column) {
    c(nvals_race = length(unique(piece[, column])))
  }
  counts <- ddply(x, .(y, z), .fun = count_distinct, "race_ethnicity")
  # Append the grouping keys under their final names, then drop the originals
  counts$sid <- counts$y
  counts$school_year <- counts$z
  counts[c("y", "z")] <- NULL
  counts
}
# Distinct race values per student-year, merged back onto the records
nvals <- ddply_race(stuatt, stuatt$sid, stuatt$school_year)
tempdf <- merge(stuatt, nvals)
# Flag Hispanic records in student-years that still have more than one value
tempdf$temp_ishispanic <- NA
tempdf$temp_ishispanic[tempdf$race_ethnicity == "H" & tempdf$nvals_race > 1] <- 1
head(stuatt[, c("sid", "school_year", "race_ethnicity")])
## sid school_year race_ethnicity
## 1 1 2004 B
## 2 1 2005 H
## 3 1 2006 H
## 4 1 2007 H
## 44618 2 2006 W
## 44619 2 2007 B
# NOTE(review): tempdf is rebuilt from stuatt here, so the merged tempdf and
# its temp_ishispanic flag created above are discarded without being used.
# Per student: statamode() of race, number of distinct values, latest year,
# and the last recorded value (tail() takes the last row of the student's group)
tempdf <- ddply(stuatt, .(sid), summarize, var_temp = statamode(race_ethnicity),
nvals = length(unique(race_ethnicity)), most_recent_year = max(school_year),
most_recent_var = tail(race_ethnicity, 1))
# race2 takes var_temp where it is not ".", otherwise the last recorded value;
# paste() turns that value into text
tempdf$race2[tempdf$var_temp != "."] <- tempdf$var_temp[tempdf$var_temp != "."]
tempdf$race2[tempdf$var_temp == "."] <- paste(tempdf$most_recent_var[tempdf$var_temp ==
"."])
# Attach the per-student columns to every record; race2 is column 14
tempdf <- merge(stuatt, tempdf)
head(tempdf[, c(1, 2, 4, 14)], 7)
## sid school_year race_ethnicity race2
## 1 1 2004 B H
## 2 1 2005 H H
## 3 1 2006 H H
## 4 1 2007 H H
## 5 2 2006 W B
## 6 2 2007 B B
## 7 3 2006 H H
# (slide text, truncated in extraction) "... summarize in the ddply call in
# this situation"
#
# Resolve a time-varying attribute to one value per id and attach it to the
# data.
#
# df:   a data frame, or (for backward compatibility) the name of one as a
#       single string, looked up from the caller's environment.
# id:   name of the identifier column, as a string.
# year: name of the year column, as a string.
# var:  name of the attribute column to resolve, as a string.
#
# Returns df merged by `id` with the per-id columns var_temp (statamode() of
# var), nvals (number of distinct values), most_recent_year, most_recent_var
# and var2. var2 is var_temp unless var_temp is ".", in which case it is the
# most recent value as text.
# Stops with an error if df is not a data frame or a column name is not found.
task1 <- function(df, id, year, var) {
  if (is.character(df) && length(df) == 1L) {
    df <- get(df, envir = parent.frame())
  }
  stopifnot(is.data.frame(df))
  cols <- c(id, year, var)
  stopifnot(is.character(cols), length(cols) == 3L, all(cols %in% names(df)))

  # tail(., 1) is only the most recent value when rows are in year order, so
  # the summary is computed on a sorted copy instead of relying on the caller.
  sorted <- df[order(df[[id]], df[[year]]), , drop = FALSE]
  mdf <- plyr::ddply(sorted, id, function(piece) {
    data.frame(
      var_temp = eeptools::statamode(piece[[var]]),
      nvals = length(unique(piece[[var]])),
      most_recent_year = max(piece[[year]]),
      most_recent_var = utils::tail(piece[[var]], 1),
      stringsAsFactors = FALSE
    )
  })
  no_mode <- mdf$var_temp == "."
  mdf$var2 <- ifelse(
    no_mode,
    as.character(mdf$most_recent_var),
    as.character(mdf$var_temp)
  )
  # Join on the id only, so a column of df that happens to share a name with
  # a summary column cannot become part of the join key.
  merge(df, mdf, by = id)
}
# Column names are passed as strings. The data no longer has to be sorted
# beforehand, because task1() orders by id and year itself.
tempdf <- task1(stuatt, "sid", "school_year", "race_ethnicity")
Sort df on measerr and mathss. What are the highest 5 values of each?
Complete the next part of the SDP toolkit!
It is good to include the session info, e.g. this document is produced with knitr version 0.8. Here is my session info:
# Print the R version, platform and package versions, omitting locale details
print(sessionInfo(), locale = FALSE)
## R version 2.15.2 (2012-10-26)
## Platform: i386-w64-mingw32/i386 (32-bit)
##
## attached base packages:
## [1] grid stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] eeptools_0.1 mapproj_1.1-8.3 maps_2.2-6 proto_0.3-9.2
## [5] stringr_0.6.1 plyr_1.7.1 ggplot2_0.9.2.1 lmtest_0.9-30
## [9] zoo_1.7-9 knitr_0.8
##
## loaded via a namespace (and not attached):
## [1] codetools_0.2-8 colorspace_1.2-0 dichromat_1.2-4
## [4] digest_0.5.2 evaluate_0.4.2 formatR_0.6
## [7] gtable_0.1.1 labeling_0.1 lattice_0.20-10
## [10] MASS_7.3-22 memoise_0.1 munsell_0.4
## [13] RColorBrewer_1.0-5 reshape2_1.2.1 scales_0.2.2
## [16] tools_2.15.1
This work (R Tutorial for Education, by Jared E. Knowles), in service of the Wisconsin Department of Public Instruction, is free of known copyright restrictions.