% Tutorial 8: Advanced Topics % DPI R Bootcamp % Jared Knowles
In this lesson we hope to learn:
# Loop to calculate number of students per grade
nstudents <- rep(NA, 6)
for (i in unique(df$grade)) {
nstudents[[i - 2]] <- length(df$stuid[df$grade == i])
}
nstudents
## [1] 500 400 500 400 500 400
summary(factor(df$grade))
## 3 4 5 6 7 8
## 500 400 500 400 500 400
A = matrix(as.numeric(1:1e+05))
system.time({
Sum = 0
for (i in seq_along(A)) {
Sum = Sum + A[[i]]
}
Sum
})
## user system elapsed
## 0.33 0.00 0.40
system.time({
sum(A)
})
## user system elapsed
## 0 0 0
rm(A)
print(myfunction)print(mean) #bytecode, we can't see it
## function (x, ...)
## standardGeneric("mean")
## <environment: 0x000000001178e870>
## attr(,"generic")
## [1] "mean"
## attr(,"generic")attr(,"package")
## [1] "base"
## attr(,"package")
## [1] "base"
## attr(,"group")
## list()
## attr(,"valueClass")
## character(0)
## attr(,"signature")
## [1] "x"
## attr(,"default")
## Method Definition (Class "derivedDefaultMethod"):
##
## function (x, ...)
## UseMethod("mean")
## <bytecode: 0x000000000920dd98>
## <environment: namespace:base>
##
## Signatures:
## x
## target "ANY"
## defined "ANY"
## attr(,"skeleton")
## function (x, ...)
## UseMethod("mean")(x, ...)
## attr(,"class")
## [1] "standardGeneric"
## attr(,"class")attr(,"package")
## [1] "methods"
print(order)
## function (..., na.last = TRUE, decreasing = FALSE)
## {
## z <- list(...)
## if (any(unlist(lapply(z, is.object)))) {
## z <- lapply(z, function(x) if (is.object(x))
## xtfrm(x)
## else x)
## if (!is.na(na.last))
## return(do.call("order", c(z, na.last = na.last, decreasing = decreasing)))
## }
## else if (!is.na(na.last))
## return(.Internal(order(na.last, decreasing, ...)))
## if (any(diff(l.z <- vapply(z, length, 1L)) != 0L))
## stop("argument lengths differ")
## ans <- vapply(z, is.na, rep.int(NA, l.z[1L]))
## ok <- if (is.matrix(ans))
## !apply(ans, 1, any)
## else !any(ans)
## if (all(!ok))
## return(integer())
## z[[1L]][!ok] <- NA
## ans <- do.call("order", c(z, decreasing = decreasing))
## keep <- seq_along(ok)[ok]
## ans[ans %in% keep]
## }
## <bytecode: 0x000000000892ff20>
## <environment: namespace:base>
x<-as.character(x) is really obnoxiousdefac <- function(x) {
# assign function a name, and list its arguments
x <- as.character(x) # what does function do?
x # last line is output of function
}
a <- factor(letters)
summary(a)
summary(defac(a))
summary(as.character(a))
foo which has both a character and a missing value?extractN <- function(x) {
x <- suppressWarnings(as.numeric(x))
# ignore warnings because we don't care
x <- x[!is.na(x)]
x
}
foo <- c(1, 4, 3, NA, 5, 60, NA)
extractN(foo)
## [1] 1 4 3 5 60
A <- extractN(foo)
lme4 package to do this work, or in a Bayesian framework (fancy) we can use winBUGS or JAGSlme4 by Doug Bates (a retired UW-Madison Professor of Statistics)mymod_me<-lmer(readSS~factor(grade)+factor(race)+female+disab+ell+(1|dist)+(1|stuid),data=df)
(1|dist) denote a random effect. In this case we are measuring a random effect for the variable dist in our data.lm formula (by designlibrary(lme4)
mymod_me <- lmer(readSS ~ factor(grade) + factor(race) + female + disab + ell +
(1 | dist) + (1 | stuid), data = df)
print(mymod_me, correlation = FALSE)
## Linear mixed model fit by REML
## Formula: readSS ~ factor(grade) + factor(race) + female + disab + ell + (1 | dist) + (1 | stuid)
## Data: df
## AIC BIC logLik deviance REMLdev
## 30731 30826 -15350 30774 30699
## Random effects:
## Groups Name Variance Std.Dev.
## stuid (Intercept) 10626 103.1
## dist (Intercept) 487 22.1
## Residual 1564 39.5
## Number of obs: 2700, groups: stuid, 1200; dist, 3
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 378.24 31.03 12.2
## factor(grade)4 62.72 3.05 20.6
## factor(grade)5 124.67 3.28 38.0
## factor(grade)6 182.67 3.99 45.8
## factor(grade)7 236.64 4.20 56.4
## factor(grade)8 301.41 4.74 63.5
## factor(race)B -61.87 28.05 -2.2
## factor(race)H -34.29 27.54 -1.2
## factor(race)I -6.09 43.93 -0.1
## factor(race)W 10.37 28.00 0.4
## female 8.76 6.21 1.4
## disab -5.26 8.50 -0.6
## ell -18.81 13.10 -1.4
mymod <- lm(readSS ~ factor(grade) + factor(race) + female + disab + ell, data = df)
qplot(readSS, predict(mymod), data = df, alpha = I(0.3), color = I("blue")) +
geom_point(aes(x = df$readSS, y = fitted(mymod_me)), alpha = I(0.4), color = "dark green") +
theme_dpi() + xlab("Observed") + ylab("Predicted") + geom_text(aes(x = 370,
y = 700), label = "Green is Results of the \n Mixed Model")
caret provides best in class data mining tools to uscaret package looks likelibrary(caret)
# Set aside test set
testset <- sample(unique(df$stuid), 500)
df$case <- 0
df$case[df$stuid %in% testset] <- 1
# Draw a training set of data (random subset of students)
training <- subset(df, case == 0)
testing <- subset(df, case == 1)
training <- training[, c(3, 6:16, 21, 22, 28, 29, 30)] # subset vars
trainX <- training[, 1:15]
refac <- function(x) as.factor(as.character(x))
trainX$stuid <- refac(trainX$stuid)
trainX$dist <- refac(trainX$dist)
trainX$year <- refac(trainX$year)
# Parameters
ctrl <- trainControl(method = "repeatedcv", number = 7, repeats = 3, summaryFunction = defaultSummary)
# Search grid
grid <- expand.grid(.interaction.depth = seq(2, 6, by = 1), .n.trees = seq(200,
700, by = 100), .shrinkage = c(0.05, 0.1))
# Boosted tree search
gbmTune <- train(x = trainX, y = training$mathSS, method = "gbm", metric = "RMSE",
trControl = ctrl, tuneGrid = grid, verbose = FALSE)
# gbmPred<-predict(gbmTune,testing[,names(trainX)])
plot(gbmTune)
n <- 10000
rep <- 5
# tLoop <- replicate(rep, system.time( integLoop(func, xint, yint, n) ))
# summary(tLoop[3,])
tVec <- replicate(rep, system.time(integVec(func, xint, yint, n)))
summary(tVec[3, ])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 0.008 0.020 0.020
tApply <- replicate(rep, system.time(integApply(func, xint, yint, n)))
summary(tApply[3, ])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 22.3 23.8 24.3 24.6 25.8 26.8
# 2 Core Cluster
library(snow)
c1 <- makeCluster(c("localhost", "localhost"), type = "SOCK")
tSnow1 <- replicate(rep, system.time(integSnow(c1, func, xint, yint, n)))
summary(tSnow1[3])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 25.7 25.7 25.7 25.7 25.7 25.7
stopCluster(c1)
git and GitHub can helpIt is good to include the session info, e.g. this document is produced with knitr version 0.9.6. Here is my session info:
print(sessionInfo(), locale = FALSE)
## R version 2.15.2 (2012-10-26)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
##
## attached base packages:
## [1] splines grid stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] R2SWF_0.4 snow_0.3-10 gbm_1.6-3.2 reshape_0.8.4
## [5] caret_5.15-048 foreach_1.4.0 cluster_1.14.3 reshape2_1.2.2
## [9] lme4_0.999999-0 Matrix_1.0-10 lattice_0.20-10 xtable_1.7-0
## [13] vcd_1.2-13 colorspace_1.2-0 MASS_7.3-22 Hmisc_3.10-1
## [17] survival_2.37-2 sandwich_2.2-9 quantreg_4.94 SparseM_0.96
## [21] gridExtra_0.9.1 mgcv_1.7-22 eeptools_0.1 mapproj_1.2-0
## [25] maps_2.3-0 proto_0.3-10 plyr_1.8 stringr_0.6.2
## [29] ggplot2_0.9.3 lmtest_0.9-30 zoo_1.7-9 knitr_0.9.6
##
## loaded via a namespace (and not attached):
## [1] codetools_0.2-8 compiler_2.15.2 dichromat_1.2-4
## [4] digest_0.6.0 evaluate_0.4.3 formatR_0.7
## [7] gtable_0.1.2 iterators_1.0.6 labeling_0.1
## [10] munsell_0.4 nlme_3.1-106 RColorBrewer_1.0-5
## [13] scales_0.2.3 stats4_2.15.2 tools_2.15.2
This work (R Tutorial for Education, by Jared E. Knowles), in service of the Wisconsin Department of Public Instruction, is free of known copyright restrictions.