Jared Knowles and Justin Meyer
We hope to move through the following topics in the next two hours.
If you don't have it already, now might be a good time to install R and RStudio to get started. Sector 67 computers already have these installed.
First, watch this tutorial
# Install a package from CRAN only when it is not already installed.
#
# Arguments:
#   mypkg  Package name as a length-one character string.
#   repos  CRAN mirror URL handed to install.packages().
#
# Prints a status message. When the package is missing, waits five seconds
# (so the user can abort) and then installs it from `repos`.
install_new <- function(mypkg, repos = "http://cran.wustl.edu/") {
  # Compare against package names only (the row names of the matrix).
  # Matching against the whole matrix would also hit version, dependency
  # and library-path cells.
  if (mypkg %in% rownames(installed.packages())) {
    cat("Package already installed\n")
  } else {
    cat("Package not found, so installing with dependencies...\nPress CTRL C to abort.\n")
    Sys.sleep(5)
    install.packages(mypkg, repos = repos)
  }
}
install_new('plyr')
install_new('lmtest')
install_new('ggplot2')
install_new('gridExtra')
install_new('stringr')
install_new('knitr')
install_new('quantreg')
install_new('zoo')
install_new('xtable')
install_new('lme4')
install_new('caret')
?summary??regression2 + 2 # add numbers
[1] 4
2 * pi #multiply by a constant
[1] 6.283
7 + runif(1, min = 0, max = 1) #add a random variable
[1] 7.457
4^4 # powers
[1] 256
sqrt(4^4) # functions
[1] 16
+ - = / * and exponential ^, there is also integer division %/% and remainder in integer division (known as modulo arithmetic) %%2 + 2
[1] 4
2/2
[1] 1
2 * 2
[1] 4
2^2
[1] 4
2 == 2
[1] TRUE
23%/%2
[1] 11
23%%2
[1] 1
<- is the assignment operator, it declares something is something elsefoo <- 3
foo
[1] 3
: is the sequence operator1:10
[1] 1 2 3 4 5 6 7 8 9 10
# it increments by one
a <- 100:120
a
[1] 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116
[18] 117 118 119 120
# Something I want to keep from R
# Like my secret from the R engine
# Maybe intended for a human and not the computer
# Like: Look at this cool plot!
myplot(readSS,mathSS,data=df)
Easiest of all, R can generate distributions of data very easily
e.g. rnorm(100) or rbinom(100, size = 1, prob = 0.5)
Go ahead, try typing hist(rnorm(10000)) into RStudio
x <- 5 #store a variable with <-
x #print the variable
[1] 5
z <- 3
ls() #list all variables
[1] "a" "foo" "x" "z"
ls.str() #list and describe variables
a : int [1:21] 100 101 102 103 104 105 106 107 108 109 ...
foo : num 3
x : num 5
z : num 3
rm(x) # delete a variable
ls()
[1] "a" "foo" "z"
a <- 3
A <- 4
print(c(a, A))
[1] 3 4
c is our friendA <- c(3, 4)
print(A)
[1] 3 4
c stands for concatenate and allows vectors to have multiple elementsc, which is one of the most used functions you will ever usec is important to put any vector together, but remember that objects within a vector must all be of the same typea <- runif(100) # Generate 100 random numbers
b <- runif(100) # 100 more
c <- NULL # Setup for loop (declare variables)
for (i in 1:100) {
# Loop just like in Java or C
c[i] <- a[i] * b[i]
}
d <- a * b
identical(c, d) # Test equality
[1] TRUE
camelCase; others are.dot.separated; others use_underscoresbase, grid, lattice, and ggplot2)summary function to a variety of object types and seeing how it adaptssummary(df[, 28:31]) #summary look at df object
schoollow readSS mathSS proflvl
Min. :0.000 Min. :252 Min. :210 advanced : 788
1st Qu.:0.000 1st Qu.:430 1st Qu.:418 basic : 523
Median :0.000 Median :495 Median :480 below basic: 210
Mean :0.242 Mean :496 Mean :483 proficient :1179
3rd Qu.:0.000 3rd Qu.:562 3rd Qu.:543
Max. :1.000 Max. :833 Max. :828
summary(df$readSS) #summary of a single column
Min. 1st Qu. Median Mean 3rd Qu. Max.
252 430 495 496 562 833
-The $ says to look for object readSS in object df
library(ggplot2) # Load graphics Package
library(eeptools)
# Scatter plot of reading vs. math scale scores with a smoother overlaid.
# ggtitle() replaces opts(title = ...), which is no longer part of ggplot2.
qplot(readSS, mathSS, data = df, geom = 'point', alpha = I(0.3)) + theme_dpi() +
  ggtitle('Test Score Relationship') +
  geom_smooth()
Student Test Scores
length(unique(df$school))
[1] 173
length(unique(df$stuid))
[1] 1200
uniqstu <- length(unique(df$stuid))
uniqstu
[1] 1200
<, >, <=, >=, ==, and != are used to compare values across vectorsbig <- c(9, 12, 15, 25)
small <- c(9, 3, 4, 2)
# Give us a nice vector of logical values
big > small
[1] FALSE TRUE TRUE TRUE
big = small
# Oops--don't do this, reassigns big to small
print(big)
[1] 9 3 4 2
print(small)
[1] 9 3 4 2
= or == to assign anything, always use <-[] to avoid confusionbig <- c(9, 12, 15, 25)
big[big == small]
[1] 9
# Returns values where the logical vector is true
big[big > small]
[1] 12 15 25
big[big < small] # Returns an empty set
numeric(0)
%in% operator determines whether each value in the left operand can be matched with one of the values in the right operand.big <- c(9, 12, 15, 25)
small <- c(9, 12, 15, 25, 9, 1, 3)
big[small %in% big]
[1] 9 12 15 25 NA
big, but small also has objects that do not appear in big and so an NA is returnedbig[big %in% small]
[1] 9 12 15 25
NA
|| (or) and && (and) can be used to combine two logical values and produce another logical value as the result. The operator ! (not) negates a logical value. These operators allow complex conditions to be constructed.
foo <- c("a", NA, 4, 9, 8.7)
!is.na(foo) # Returns TRUE for non-NA
[1] TRUE FALSE TRUE TRUE TRUE
class(foo)
[1] "character"
a <- foo[!is.na(foo)]
a
[1] "a" "4" "9" "8.7"
class(a)
[1] "character"
| and & are similar, but they combine two logical vectors. The comparison is performed element by element, so the result is also a logical vector.
zap <- c(1, 4, 8, 2, 9, 11)
zap[zap > 2 | zap < 8]
[1] 1 4 8 2 9 11
zap[zap > 2 & zap < 8]
[1] 4
is.numeric(A)
[1] TRUE
class(A)
[1] "numeric"
print(A)
[1] 3 4
b <- c("one", "two", "three")
print(b)
[1] "one" "two" "three"
is.numeric(b)
[1] FALSE
c <- c(TRUE, TRUE, TRUE, FALSE, FALSE, TRUE)
is.numeric(c)
[1] FALSE
is.character(c)
[1] FALSE
is.logical(c) # Results in a logical value
[1] TRUE
class functionclass(A)
[1] "numeric"
class(b)
[1] "character"
class(c)
[1] "logical"
myfac <- factor(c("basic", "proficient", "advanced", "minimal"))
class(myfac)
[1] "factor"
myfac # What order are the factors in?
[1] basic proficient advanced minimal
Levels: advanced basic minimal proficient
myfac_o <- ordered(myfac, levels = c("minimal", "basic", "proficient", "advanced"))
myfac_o
[1] basic proficient advanced minimal
Levels: minimal < basic < proficient < advanced
summary(myfac_o)
minimal basic proficient advanced
1 1 1 1
class(myfac_o)
[1] "ordered" "factor"
unclass(myfac_o)
[1] 2 3 4 1
attr(,"levels")
[1] "minimal" "basic" "proficient" "advanced"
defac <- unclass(myfac_o)
defac
[1] 2 3 4 1
attr(,"levels")
[1] "minimal" "basic" "proficient" "advanced"
minimal be 2 and basic be 3?# From the eeptools package
# Turn a factor into a plain character vector of its labels.
defac <- function(x) {
  as.character(x)
}
defac(myfac_o)
[1] "basic" "proficient" "advanced" "minimal"
defac <- defac(myfac_o)
defac
[1] "basic" "proficient" "advanced" "minimal"
myfac_o
[1] basic proficient advanced minimal
Levels: minimal < basic < proficient < advanced
as.numeric(myfac_o)
[1] 2 3 4 1
myfac
[1] basic proficient advanced minimal
Levels: advanced basic minimal proficient
as.numeric(myfac)
[1] 2 4 1 3
lubridate package for more advanced functionality including mathematical operations on datesmydate <- as.Date("7/20/2012", format = "%m/%d/%Y")
# Input is a character string and a parser
class(mydate) # this is date
[1] "Date"
weekdays(mydate) # what day of the week is it?
[1] "Friday"
mydate + 30 # Operate on dates
[1] "2012-08-19"
# We can parse other formats of dates
mydate2 <- as.Date("8-5-1988", format = "%d-%m-%Y")
mydate2
[1] "1988-05-08"
mydate - mydate2
Time difference of 8839 days
# Can add and subtract two date objects
as.numeric(mydate) # days since 1-1-1970
[1] 15541
as.Date(56, origin = "2013-4-29") # we can set our own origin
[1] "2013-06-24"
print(1)
[1] 1
# The [1] in brackets is the index of the first element printed on that line; here the result is a vector of length 1
print("This tutorial is awesome")
[1] "This tutorial is awesome"
# This is a vector of length 1 consisting of a single 'string of
# characters'
print(LETTERS)
[1] "A" "B" "C" "D" "E" "F" "G" "H" "I" "J" "K" "L" "M" "N" "O" "P" "Q"
[18] "R" "S" "T" "U" "V" "W" "X" "Y" "Z"
# This vector has 26 character elements
print(LETTERS[6])
[1] "F"
# The sixth element of this vector has length 1
length(LETTERS[6])
[1] 1
# The length of that element is a number with length 1
mymat <- matrix(1:36, nrow = 6, ncol = 6)
rownames(mymat) <- LETTERS[1:6]
colnames(mymat) <- LETTERS[7:12]
class(mymat)
[1] "matrix"
rownames(mymat)
[1] "A" "B" "C" "D" "E" "F"
colnames(mymat)
[1] "G" "H" "I" "J" "K" "L"
mymat
G H I J K L
A 1 7 13 19 25 31
B 2 8 14 20 26 32
C 3 9 15 21 27 33
D 4 10 16 22 28 34
E 5 11 17 23 29 35
F 6 12 18 24 30 36
dim and classmyarray <- array(1:42, dim = c(7, 3, 2), dimnames = list(c("tiny", "small",
"medium", "medium-ish", "large", "big", "huge"), c("slow", "moderate", "fast"),
c("boring", "fun")))
class(myarray)
[1] "array"
dim(myarray)
[1] 7 3 2
dimnames(myarray)
[[1]]
[1] "tiny" "small" "medium" "medium-ish" "large"
[6] "big" "huge"
[[2]]
[1] "slow" "moderate" "fast"
[[3]]
[1] "boring" "fun"
myarray
, , boring
slow moderate fast
tiny 1 8 15
small 2 9 16
medium 3 10 17
medium-ish 4 11 18
large 5 12 19
big 6 13 20
huge 7 14 21
, , fun
slow moderate fast
tiny 22 29 36
small 23 30 37
medium 24 31 38
medium-ish 25 32 39
large 26 33 40
big 27 34 41
huge 28 35 42
myvec <- c(1, 2, 4, 5, 9)
mylist <- list(vec = myvec, mat = mymat, arr = myarray, date = mydate)
class(mylist)
[1] "list"
length(mylist)
[1] 4
names(mylist)
[1] "vec" "mat" "arr" "date"
str(mylist)
List of 4
$ vec : num [1:5] 1 2 4 5 9
$ mat : int [1:6, 1:6] 1 2 3 4 5 6 7 8 9 10 ...
..- attr(*, "dimnames")=List of 2
.. ..$ : chr [1:6] "A" "B" "C" "D" ...
.. ..$ : chr [1:6] "G" "H" "I" "J" ...
$ arr : int [1:7, 1:3, 1:2] 1 2 3 4 5 6 7 8 9 10 ...
..- attr(*, "dimnames")=List of 3
.. ..$ : chr [1:7] "tiny" "small" "medium" "medium-ish" ...
.. ..$ : chr [1:3] "slow" "moderate" "fast"
.. ..$ : chr [1:2] "boring" "fun"
$ date: Date[1:1], format: "2012-07-20"
$ or [[]] to extract elements@ to extract elementsmylist$vec
[1] 1 2 4 5 9
mylist[[2]][1, 3]
[1] 13
attributes function to learn about the objectattributes(mylist)
$names
[1] "vec" "mat" "arr" "date"
attributes(myarray)[1:2][2]
$dimnames
$dimnames[[1]]
[1] "tiny" "small" "medium" "medium-ish" "large"
[6] "big" "huge"
$dimnames[[2]]
[1] "slow" "moderate" "fast"
$dimnames[[3]]
[1] "boring" "fun"
str(df[, 25:32])
'data.frame': 2700 obs. of 8 variables:
$ district : int 3 3 3 3 3 3 3 3 3 3 ...
$ schoolhigh: int 0 0 0 0 0 0 0 0 0 0 ...
$ schoolavg : int 1 1 1 1 1 1 1 1 1 1 ...
$ schoollow : int 0 0 0 0 0 0 0 0 0 0 ...
$ readSS : num 357 264 370 347 373 ...
$ mathSS : num 387 303 365 344 441 ...
$ proflvl : Factor w/ 4 levels "advanced","basic",..: 2 3 2 2 2 4 4 4 3 2 ...
$ race : Factor w/ 5 levels "A","B","H","I",..: 2 2 2 2 2 2 2 2 2 2 ...
as.whatIwant as in as.factor or as.table or as.data.framepackagespackages are essentially free and open source add-ons for Rggplot2 packagelme4 package for mixed effects modelsscatterplot3d package (also webGL)sptmcaretRservefun package# You can find and install packages within R
install.packages("foo") # Name must be in quotes
install.packages(c("foo", "foo1", "foo2"))
# Packages get updated FREQUENTLY
update.packages() # Gonna update them all
library(ggplot2) and you're done!wd is both your friend and enemygetwd()C:/Users/My Documents/My Project/ which is then set as the working directorysetwd() command: setwd("PATH/TO/MY PROJECT/")NA or NaN or NULL depending on the context.a <- c(1, 2, 3) # a is a vector with three elements
# Ask R for element 4
print(a[4])
[1] NA
NA and NULL?a <- c(a, NULL) # Append NULL onto a
print(a)
[1] 1 2 3
# Notice no change
a <- c(a, NA)
print(a)
[1] 1 2 3 NA
NA can hold a place, NULL cannot
NaN is even more special: it marks the result of an undefined numeric operation, such as the square root of a negative number
NaN stands for "Not a Number"
b <- 1
b <- sqrt(-b)
Warning: NaNs produced
print(b)
[1] NaN
pi/0
[1] Inf
sin(Inf) = NaN.csv .dta .sas .spss .dat and even .xls and .xlsx with some care# Set working directory to the tutorial directory In RStudio can do this
# in 'Tools' tab
setwd("~/GitHub/r_tutorial_ed")
# Load some data
df <- read.csv("data/smalldata.csv")
# Note if we don't assign data to 'df' R just prints contents of table
'data.frame': 2700 obs. of 6 variables:
$ schoolavg: int 1 1 1 1 1 1 1 1 1 1 ...
$ schoollow: int 0 0 0 0 0 0 0 0 0 0 ...
$ readSS : num 357 264 370 347 373 ...
$ mathSS : num 387 303 365 344 441 ...
$ proflvl : Factor w/ 4 levels "advanced","basic",..: 2 3 2 2 2 4 4 4 3 2 ...
$ race : Factor w/ 5 levels "A","B","H","I",..: 2 2 2 2 2 2 2 2 2 2 ...
dim(df)
[1] 2700 32
summarysummary(df[, 1:5])
X school stuid grade
Min. : 44 Min. : 1 Min. : 205 Min. :3.00
1st Qu.: 108677 1st Qu.: 195 1st Qu.: 44205 1st Qu.:4.00
Median : 458596 Median : 436 Median : 88205 Median :5.00
Mean : 557918 Mean : 460 Mean : 99229 Mean :5.44
3rd Qu.: 972291 3rd Qu.: 717 3rd Qu.:132205 3rd Qu.:7.00
Max. :1499992 Max. :1000 Max. :324953 Max. :8.00
schid
Min. : 6.0
1st Qu.: 15.0
Median : 55.5
Mean : 52.0
3rd Qu.: 75.0
Max. :105.0
namesnames(df)
[1] "X" "school" "stuid" "grade" "schid"
[6] "dist" "white" "black" "hisp" "indian"
[11] "asian" "econ" "female" "ell" "disab"
[16] "sch_fay" "dist_fay" "luck" "ability" "measerr"
[21] "teachq" "year" "attday" "schoolscore" "district"
[26] "schoolhigh" "schoolavg" "schoollow" "readSS" "mathSS"
[31] "proflvl" "race"
attributes and classnames(attributes(df))
[1] "names" "row.names" "class"
class(df)
[1] "data.frame"
str which lists all data elements in an object and their typeIn this lesson we hope to learn:
# Set working directory to the tutorial directory In RStudio can do
# this in 'Tools' tab
setwd("~/GitHub/r_tutorial_ed")
# Load some data
load("data/smalldata.rda")
# Note if we don't assign data to 'df' R just prints contents of
# table
table function is our friendtable(df$grade, df$year)
2000 2001 2002
3 200 100 200
4 100 200 100
5 200 100 200
6 100 200 100
7 200 100 200
8 100 200 100
table(df$year, df$race)
A B H I W
2000 16 370 93 7 414
2001 16 370 93 7 414
2002 16 370 93 7 414
with(df[df$grade == 3, ], {
table(year, race)
})
race
year A B H I W
2000 4 78 22 4 92
2001 1 44 8 2 45
2002 0 74 20 1 105
with specifies a data object to work on, in this case all elements of df where grade==3table is the same command as above, but since we specified the data object in the with statement, we don't need the df$ in front of the variables of interestdf2 <- subset(df, grade == 3)
table(df2$year, df2$race)
A B H I W
2000 4 78 22 4 92
2001 1 44 8 2 45
2002 0 74 20 1 105
rm(df2)
table(df$year, df$proflvl)
advanced basic below basic proficient
2000 56 313 143 388
2001 229 183 64 424
2002 503 27 3 367
table(df$race, df$proflvl)
advanced basic below basic proficient
A 19 7 3 19
B 160 302 162 486
H 54 76 33 116
I 7 4 1 9
W 548 134 11 549
aggregate function that can be used and helps us avoid the clustering problems aboveformula (think I want variable X by grouping factor Y) and the statistic we want to compute# Reading Scores by Race
aggregate(readSS ~ race, FUN = mean, data = df)
race readSS
1 A 508.7
2 B 460.2
3 H 473.2
4 I 485.2
5 W 533.2
aggregate is cool, but it isn't very flexibleplyr packageplyr is a set of routines/logical structure for transforming, summarizing, reshaping, and reorganizing data objects of one type in R into another type (or the same type)plyr package has a number of utilities to help us split-apply-combine across data types for both input and outputfor loops to iterate over groups of students, because in R for loops are slow, inefficient, and impracticalplyr to the rescue, while not as fast as a compiled language, it is pretty dang good!
ddply has before it combines it back for us when we do the call ddply(df,.(sex,age),"nrow")
plyr has a straightforward syntaxplyr functions are in the format XXply. The two X's specify what the input file we are applying a function to is, and then what way we would like it outputted.plyr d = dataframe, l= list, m=matrix, and a=array. By far the most common usage is ddplyplyr in Tutorial 4 as well library(plyr)
# Summarize reading and math scores for every dist-by-grade group of df.
# Means and standard deviations drop missing scores; the counts are plain
# vector lengths, so they include any missing values.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
myag <- ddply(df, .(dist, grade), summarize,
              mean_read = mean(readSS, na.rm = TRUE),
              mean_math = mean(mathSS, na.rm = TRUE),
              sd_read = sd(readSS, na.rm = TRUE),
              sd_math = sd(mathSS, na.rm = TRUE),
              count_read = length(readSS),
              count_math = length(mathSS))
summarize tells ddply what we are doing to the data framehead(myag)
dist grade mean_read mean_math sd_read sd_math count_read
1 205 3 451.7 406.1 93.52 72.45 200
2 205 4 438.9 459.9 77.76 79.10 100
3 205 5 487.9 462.6 85.30 75.10 200
4 205 6 514.7 526.8 76.83 66.04 100
5 205 7 530.0 521.5 84.82 74.85 200
6 205 8 575.5 581.2 79.58 83.45 100
count_math
1 200
2 100
3 200
4 100
5 200
6 100
order function to sort datadf.badsort <- order(df$readSS, df$mathSS)
head(df.badsort)
[1] 106 1026 2 56 122 118
df.sort <- df[order(df$readSS, df$mathSS, df$attday), ]
head(df[, c(3, 23, 29, 30)])
stuid attday readSS mathSS
1 149995 180 357.3 387.3
2 13495 180 263.9 302.6
3 106495 160 369.7 365.5
4 45205 168 346.6 344.5
5 142705 156 373.1 441.2
6 14995 157 436.8 463.4
head(df.sort[, c(3, 23, 29, 30)])
stuid attday readSS mathSS
106 106705 160 251.5 277.0
1026 80995 176 263.2 377.8
2 13495 180 263.9 302.6
56 122402 180 264.3 271.7
122 79705 168 266.4 318.7
118 40495 173 266.9 275.0
head(df[with(df, order(-readSS, -attday)), c(3, 23, 29, 30)])
stuid attday readSS mathSS
1631 145205 137 833.2 828.4
1462 107705 180 773.3 746.6
2252 122902 180 744.0 621.6
2341 44902 175 741.7 676.3
1482 134705 180 739.2 705.4
1630 14495 162 738.9 758.2
- denotes we want descending order, R's default is ascending orderM <- matrix(c(1, 2, 2, 2, 3, 6, 4, 5), 4, 2, byrow = FALSE, dimnames = list(NULL,
c("a", "b")))
M[order(M[, "a"], -M[, "b"]), ]
a b
[1,] 1 3
[2,] 2 6
[3,] 2 5
[4,] 2 4
mytab <- table(df$grade, df$year)
mytab[order(mytab[, 1]), ]
2000 2001 2002
4 100 200 100
6 100 200 100
8 100 200 100
3 200 100 200
5 200 100 200
7 200 100 200
mytab[order(mytab[, 2]), ]
2000 2001 2002
3 200 100 200
5 200 100 200
7 200 100 200
4 100 200 100
6 100 200 100
8 100 200 100
# Gives all rows that meet this requirement
df[df$readSS > 800, ]
X school stuid grade schid dist white black hisp indian
1631 1281061 852 145205 8 15 205 1 0 0 0
asian econ female ell disab sch_fay dist_fay luck ability
1631 0 0 1 0 0 0 0 0 108.3
measerr teachq year attday schoolscore district schoolhigh
1631 6.325 155.7 2001 137 227.7 19 0
schoolavg schoollow readSS mathSS proflvl race
1631 1 0 833.2 828.4 advanced W
df$grade[df$mathSS > 800]
[1] 8
# Gives all values of grade that meet this requirement
df$grade[df$black == 1 & df$readSS > 650]
[1] 8 7 8 6 6 7 8 7 8 8 8 4
df$black=1 or black==1?spread indicating whether a district has high or low spread among its student scoresmyag$spread <- NA # create variable
# Label each row by the spread of its reading scores: below 75 is "low",
# 75 and above is "high". Using >= closes the gap at exactly 75, which the
# strict > comparison left as NA.
myag$spread[myag$sd_read < 75] <- "low"
myag$spread[myag$sd_read >= 75] <- "high"
# Store the labels as a factor so summary() reports counts per level.
myag$spread <- as.factor(myag$spread)
summary(myag$spread)
high low
15 3
df which has multiple rows per student and myag which has multiple rows per schoolnames(myag)
[1] "dist" "grade" "mean_read" "mean_math" "sd_read"
[6] "sd_math" "count_read" "count_math" "spread"
names(df[, c(2, 3, 4, 6)])
[1] "school" "stuid" "grade" "dist"
dist and grade are in common. Is this ok?year as well?merge we want to consider with ?mergemerge automagically combine the datasimple_merge <- merge(df, myag)
names(simple_merge)
[1] "grade" "dist" "X" "school"
[5] "stuid" "schid" "white" "black"
[9] "hisp" "indian" "asian" "econ"
[13] "female" "ell" "disab" "sch_fay"
[17] "dist_fay" "luck" "ability" "measerr"
[21] "teachq" "year" "attday" "schoolscore"
[25] "district" "schoolhigh" "schoolavg" "schoollow"
[29] "readSS" "mathSS" "proflvl" "race"
[33] "mean_read" "mean_math" "sd_read" "sd_math"
[37] "count_read" "count_math" "spread"
simple_merge(df1,df2,by=c("id1","id2"))simple_merge(df1,df2,by.x=c("id1","id2"),by.y=c("id1_a","id2_a"))notsosimple_merge(df1,df2,all.x=TRUE,all.y=TRUE)x observations (df1), all the y observations (df2) or neither, or bothhead(df[, 1:10], 3)
X school stuid grade schid dist white black hisp indian
1 44 1 149995 3 105 495 0 1 0 0
2 53 1 13495 3 45 495 0 1 0 0
3 116 1 106495 3 45 495 0 1 0 0
head(widedf[, c(1, 28:40)], 3)
stuid readSS.2000 mathSS.2000 proflvl.2000 race.2000 X.2001
1 149995 357.3 387.3 basic B 441000
2 13495 263.9 302.6 below basic B 531000
3 106495 369.7 365.5 basic B 1161000
school.2001 grade.2001 schid.2001 dist.2001 white.2001 black.2001
1 1 4 105 495 0 1
2 1 4 45 495 0 1
3 1 4 45 495 0 1
hisp.2001 indian.2001
1 0 0
2 0 0
3 0 0
reshape is the way to move from wide to longwidedf <- reshape(df, timevar = "year", idvar = "stuid", direction = "wide")
idvar represents the unit we want to represent a single row, in this case each unique student gets a single row
timevar is the variable that differentiates between two rows with the same student ID
timevar needn't always represent time!
direction tells R we are going to move to wide data
varying argument we can tell R explicitly which items we want to move wide
reshape function works well in both directions
longdf <- reshape(widedf, idvar = "stuid", timevar = "year", varying = names(widedf[,
2:91]), direction = "long", sep = ".")
subset function to get only 4th grade scoresg4 <- subset(df, grade == 4)
dim(g4)
[1] 400 32
g4_b <- df[df$grade == 4, ]
identical(g4, g4_b)
[1] TRUE
In this lesson we hope to learn:
In this tutorial we will use a number of datasets of different types:
stulong: student-level assessment and demographics data (simulated and research ready)midwest_schools.csv: aggregate school level test score averages from a large Midwest stateload("data/midwest_schools.rda")
head(midsch[, 1:12])
district_id school_id subject grade n1 ss1 n2 ss2 predicted
1 14 130 math 4 44 433.1 40 463.0 468.7
2 70 20 math 4 18 443.0 20 477.2 476.5
3 112 80 math 4 86 445.4 94 472.6 478.4
4 119 50 math 4 95 427.1 94 460.7 464.1
5 147 60 math 4 27 424.2 27 458.7 461.8
6 147 125 math 4 17 423.5 26 463.1 461.2
residuals resid_z resid_t
1 -5.7446 -0.59190 -0.59171
2 0.7235 0.07456 0.07452
3 -5.7509 -0.59267 -0.59248
4 -3.3586 -0.34606 -0.34591
5 -3.0937 -0.31877 -0.31863
6 1.8530 0.19094 0.19085
table(midsch$test_year, midsch$grade)
4 5 6 7 8
2007 1150 1094 472 638 734
2008 1204 1146 462 588 692
2009 1173 1092 434 592 668
2010 1120 1090 428 610 686
2011 1126 1060 420 618 688
length(unique(midsch$district_id))
[1] 357
length(unique(midsch$school_id))
[1] 247
table(midsch$subject, midsch$grade)
4 5 6 7 8
math 2886 2741 1108 1523 1734
read 2887 2741 1108 1523 1734
table(midsch$district_id,midsch$grade)library(ggplot2)
qplot(ss1, ss2, data = midsch, alpha = I(0.07)) + theme_dpi() + geom_smooth() +
geom_smooth(method = "lm", se = FALSE, color = "purple")
plot of chunk diag1
data(mtcars) # load the data from R
head(mtcars)
mpg cyl disp hp drat wt qsec vs am gear carb
Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
mpg variable thenmean(mtcars$mpg)
[1] 20.09
t.test(mtcars$mpg, mu = 18, alternative = "greater")
One Sample t-test
data: mtcars$mpg
t = 1.962, df = 31, p-value = 0.02938
alternative hypothesis: true mean is greater than 18
95 percent confidence interval:
18.28 Inf
sample estimates:
mean of x
20.09
t.test(mtcars$mpg, mu = 22, alternative = "less")
One Sample t-test
data: mtcars$mpg
t = -1.792, df = 31, p-value = 0.04144
alternative hypothesis: true mean is less than 22
95 percent confidence interval:
-Inf 21.9
sample estimates:
mean of x
20.09
t.test(mtcars$mpg,mu=18) test?In this lesson we hope to learn:
ggplot2 is pretty much the new standard in Rlibrary(ggplot2)
qplot(readSS, mathSS, data = df)
qplot(readSS, mathSS, data = df, alpha = I(0.3)) + theme_dpi()
ggplot2 has an extended syntax that makes this obviousggplot(df, aes(x = readSS, y = mathSS)) + geom_point()
# Identical to: qplot(readSS,mathSS,data=df)
aes says we are specifying aesthetics, here we specified x and y to make a two dimensional graphicdata(mpg)
qplot(displ, cty, data = mpg) + theme_dpi()
qplot(displ, cty, data = mpg, size = cyl) + theme_dpi()
qplot(displ, cty, data = mpg, shape = drv, size = I(3)) + theme_dpi()
qplot(displ, cty, data = mpg, color = class) + theme_dpi()
| Aesthetic | Discrete | Continuous |
|---|---|---|
| Color | Disparate colors | Sequential or divergent colors |
| Size | Unique size for each value | mapping to radius of value |
| Shape | A shape for each value | does not make sense |
| Aesthetic | Ordered | Unordered |
|---|---|---|
| Color | Sequential or divergent colors | Rainbow |
| Size | Increasing or decreasing radius | *does not make sense* |
| Shape | does not make sense | A shape for each value |
qplot(readSS, mathSS, data = df) + facet_wrap(~grade) + theme_dpi(base_size = 12) +
geom_smooth(method = "lm", se = FALSE, size = I(1.2))
qplot(readSS, mathSS, data = df) + facet_grid(ell ~ grade) + theme_dpi(base_size = 12) +
geom_smooth(method = "lm", se = FALSE, size = I(1.2))
library(vcd)
df$proflvl <- factor(df$proflvl, levels = c("advanced", "proficient", "basic",
"below basic"))
a <- structable(proflvl ~ race, data = df)
mosaic(a, shade = TRUE)
library(vcd)
df$proflvl <- factor(df$proflvl, levels = c("advanced", "proficient", "basic",
"below basic"))
a <- structable(female ~ race, data = df)
mosaic(a, shade = TRUE)
library(grid)
# Main plot: density of reading scores filled by race, stacked to fill.
p1 <- qplot(readSS, ..density.., data = df, fill = race,
            position = 'fill', geom = 'density') +
  scale_fill_brewer(type = 'qual', palette = 2)

# Second plot with legend, axis text and ticks stripped so it can be drawn
# small on top of p1.
# theme()/element_blank()/panel.spacing replace the older opts()/
# theme_blank()/panel.margin, which current ggplot2 no longer provides.
# NOTE(review): `..fill..` is kept exactly as written; confirm it is the
# intended computed variable for the y aesthetic.
p2 <- qplot(readSS, ..fill.., data = df, fill = race,
            position = 'fill', geom = 'density') +
  scale_fill_brewer(type = 'qual', palette = 2) +
  ylim(c(0, 1)) +
  theme_bw() +
  theme(legend.position = 'none',
        axis.text.x = element_blank(),
        axis.text.y = element_blank(),
        axis.ticks = element_blank(),
        panel.spacing = unit(0, "lines")) +
  ylab('') +
  xlab('')

# Viewport centred at (0.65, 0.73) of the page, 20% of its width and height.
vp <- viewport(x = unit(.65, "npc"), y = unit(.73, "npc"),
               width = unit(.2, "npc"), height = unit(.2, "npc"))

# Draw the main plot, then draw p2 inside the viewport.
print(p1)
print(p2, vp = vp)
In this lesson we hope to learn:
foreign library, save, write.csv, and write.dtawrite.csv(df, file = "PATH/TO/MY.csv")
write.dta(df, file = "PATH/TO/MY.dta")
# save in the R file
save(df, file = "PATH/TO/MY.rda", compress = "xz")