Introduction to R Programming

Jared Knowles and Justin Meyer

Wi-Fi

Outline

We hope to move through the following topics in the next two hours.

Materials

If you don't have it already, now might be a good time to install R and RStudio to get started. Sector 67 computers already have these installed.

Installing R on Windows

Installation Tips

Next Install packages

First, watch this tutorial

Install Packages

Install Packages (2)

# Install a package from CRAN only if it is not already installed.
#
# Args:
#   mypkg: Name of the package, as a single character string.
#
# Prints a status message. When the package is missing, pauses for five
# seconds (so the user can abort) and then calls install.packages().
install_new <- function(mypkg) {
  # Compare against package names only (the row names of the matrix).
  # Matching against the whole installed.packages() matrix would also hit
  # unrelated cells, e.g. a Priority value such as "recommended".
  if (mypkg %in% rownames(installed.packages())) {
    cat("Package already installed\n")
  } else {
    cat("Package not found, so installing with dependencies...\n",
        "Press CTRL C to abort.\n", sep = "")
    Sys.sleep(5)
    install.packages(mypkg, repos = "http://cran.wustl.edu/")
  }
}

# Install each package used in this tutorial, in order, via install_new().
# invisible() suppresses the list of NULLs that lapply() would print.
invisible(lapply(
  c(
    "plyr", "lmtest", "ggplot2", "gridExtra", "stringr", "knitr",
    "quantreg", "zoo", "xtable", "lme4", "caret"
  ),
  install_new
))

References and Resources for the Previous Section

Overview

R

Why Use R

R Advantages Continued

R Can Complement Other Tools

R's Drawbacks

R Vocabulary

Components of an R Setup

Self-help

Let's Look at RStudio

R As A Calculator

2 + 2  # add numbers
[1] 4
2 * pi  #multiply by a constant
[1] 6.283
7 + runif(1, min = 0, max = 1)  #add a random variable
[1] 7.457
4^4  # powers
[1] 256
sqrt(4^4)  # functions
[1] 16

Arithmetic Operators

2 + 2
[1] 4
2/2
[1] 1
2 * 2
[1] 4
2^2
[1] 4
2 == 2
[1] TRUE
23%/%2
[1] 11
23%%2
[1] 1

Other Key Symbols

foo <- 3
foo
[1] 3
1:10
 [1]  1  2  3  4  5  6  7  8  9 10
# it increments by one
a <- 100:120
a
 [1] 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116
[18] 117 118 119 120

Comments in R

# Something I want to keep from R
# Like my secret from the R engine
# Maybe intended for a human and not the computer
# Like: Look at this cool plot!

myplot(readSS,mathSS,data=df)

R Advanced Math

Using the Workspace

Using the Workspace (2)

x <- 5  #store a variable with <-
x  #print the variable
[1] 5
z <- 3
ls()  #list all variables
[1] "a"   "foo" "x"   "z"  
ls.str()  #list and describe variables
a :  int [1:21] 100 101 102 103 104 105 106 107 108 109 ...
foo :  num 3
x :  num 5
z :  num 3
rm(x)  # delete a variable
ls()
[1] "a"   "foo" "z"  

R as a Language

  1. Case sensitivity matters
a <- 3
A <- 4
print(c(a, A))
[1] 3 4
  2. What happens if I type print(a,A)?

c is our friend

A <- c(3, 4)
print(A)
[1] 3 4

Language

a <- runif(100)  # Generate 100 random numbers
b <- runif(100)  # 100 more
c <- NULL  # Setup for loop (declare variables)
for (i in 1:100) {
    # Loop just like in Java or C
    c[i] <- a[i] * b[i]
}
d <- a * b
identical(c, d)  # Test equality
[1] TRUE

More Language Bugs Features

Objects

summary(df[, 28:31])  #summary look at df object
   schoollow         readSS        mathSS           proflvl    
 Min.   :0.000   Min.   :252   Min.   :210   advanced   : 788  
 1st Qu.:0.000   1st Qu.:430   1st Qu.:418   basic      : 523  
 Median :0.000   Median :495   Median :480   below basic: 210  
 Mean   :0.242   Mean   :496   Mean   :483   proficient :1179  
 3rd Qu.:0.000   3rd Qu.:562   3rd Qu.:543                     
 Max.   :1.000   Max.   :833   Max.   :828                     
summary(df$readSS)  #summary of a single column
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    252     430     495     496     562     833 

- The $ operator tells R to look for the object readSS inside the object df

Graphics too

library(ggplot2) # Load graphics Package
library(eeptools)
qplot(readSS,mathSS,data=df,geom='point',alpha=I(0.3))+theme_dpi()+
  opts(title='Test Score Relationship')+
  geom_smooth()
Student Test Scores

Student Test Scores

Handling Data in R

length(unique(df$school))
[1] 173
length(unique(df$stuid))
[1] 1200
uniqstu <- length(unique(df$stuid))
uniqstu
[1] 1200

Special Operators

big <- c(9, 12, 15, 25)
small <- c(9, 3, 4, 2)
# Give us a nice vector of logical values
big > small
[1] FALSE  TRUE  TRUE  TRUE
big = small
# Oops--don't do this, reassigns big to small
print(big)
[1] 9 3 4 2
print(small)
[1] 9 3 4 2

Special Operators (II)

big <- c(9, 12, 15, 25)
big[big == small]
[1] 9
# Returns values where the logical vector is true
big[big > small]
[1] 12 15 25
big[big < small]  # Returns an empty set
numeric(0)

Special operators (III)

big <- c(9, 12, 15, 25)
small <- c(9, 12, 15, 25, 9, 1, 3)
big[small %in% big]
[1]  9 12 15 25 NA
big[big %in% small]
[1]  9 12 15 25

Special operators (IV)

foo <- c("a", NA, 4, 9, 8.7)
!is.na(foo)  # Returns TRUE for non-NA
[1]  TRUE FALSE  TRUE  TRUE  TRUE
class(foo)
[1] "character"
a <- foo[!is.na(foo)]
a
[1] "a"   "4"   "9"   "8.7"
class(a)
[1] "character"

Special operators (V)

zap <- c(1, 4, 8, 2, 9, 11)
zap[zap > 2 | zap < 8]
[1]  1  4  8  2  9 11
zap[zap > 2 & zap < 8]
[1] 4

Regular Expressions

R Data Modes

Data Modes in R (numeric)

is.numeric(A)
[1] TRUE
class(A)
[1] "numeric"
print(A)
[1] 3 4

Data Modes (Character)

b <- c("one", "two", "three")
print(b)
[1] "one"   "two"   "three"
is.numeric(b)
[1] FALSE

Data Modes (Logical)

c <- c(TRUE, TRUE, TRUE, FALSE, FALSE, TRUE)
is.numeric(c)
[1] FALSE
is.character(c)
[1] FALSE
is.logical(c)  # Results in a logical value
[1] TRUE

Easier way

class(A)
[1] "numeric"
class(b)
[1] "character"
class(c)
[1] "logical"

A Note on Vectors

Factor

myfac <- factor(c("basic", "proficient", "advanced", "minimal"))
class(myfac)
[1] "factor"
myfac  # What order are the factors in?
[1] basic      proficient advanced   minimal   
Levels: advanced basic minimal proficient

Ordering the Factor

myfac_o <- ordered(myfac, levels = c("minimal", "basic", "proficient", "advanced"))
myfac_o
[1] basic      proficient advanced   minimal   
Levels: minimal < basic < proficient < advanced
summary(myfac_o)
   minimal      basic proficient   advanced 
         1          1          1          1 

Reclassifying Factors

class(myfac_o)
[1] "ordered" "factor" 
unclass(myfac_o)
[1] 2 3 4 1
attr(,"levels")
[1] "minimal"    "basic"      "proficient" "advanced"  
defac <- unclass(myfac_o)
defac
[1] 2 3 4 1
attr(,"levels")
[1] "minimal"    "basic"      "proficient" "advanced"  

Defactor

# From the eeptools package
defac <- function(x) {
  # Return the character representation of x (for a factor, its labels).
  as.character(x)
}
defac(myfac_o)
[1] "basic"      "proficient" "advanced"   "minimal"   
defac <- defac(myfac_o)
defac
[1] "basic"      "proficient" "advanced"   "minimal"   

Convert to Numeric?

myfac_o
[1] basic      proficient advanced   minimal   
Levels: minimal < basic < proficient < advanced
as.numeric(myfac_o)
[1] 2 3 4 1
myfac
[1] basic      proficient advanced   minimal   
Levels: advanced basic minimal proficient
as.numeric(myfac)
[1] 2 4 1 3

Dates

mydate <- as.Date("7/20/2012", format = "%m/%d/%Y")
# Input is a character string and a parser
class(mydate)  # this is date
[1] "Date"
weekdays(mydate)  # what day of the week is it?
[1] "Friday"
mydate + 30  # Operate on dates
[1] "2012-08-19"

More Dates

# We can parse other formats of dates
mydate2 <- as.Date("8-5-1988", format = "%d-%m-%Y")
mydate2
[1] "1988-05-08"

mydate - mydate2
Time difference of 8839 days
# Can add and subtract two date objects

A few notes on dates

as.numeric(mydate)  # days since 1-1-1970
[1] 15541
as.Date(56, origin = "2013-4-29")  # we can set our own origin
[1] "2013-06-24"

Why care so much about classes?

Data Structures in R

Vectors

print(1)
[1] 1
# The [1] in brackets is the index of the first element printed on that
# line; the value itself is a vector of length 1
print("This tutorial is awesome")
[1] "This tutorial is awesome"
# This is a vector of length 1 consisting of a single 'string of
# characters'

Vectors 2

print(LETTERS)
 [1] "A" "B" "C" "D" "E" "F" "G" "H" "I" "J" "K" "L" "M" "N" "O" "P" "Q"
[18] "R" "S" "T" "U" "V" "W" "X" "Y" "Z"
# This vector has 26 character elements
print(LETTERS[6])
[1] "F"
# The sixth element of this vector has length 1
length(LETTERS[6])
[1] 1
# The length of that element is a number with length 1

Matrices

mymat <- matrix(1:36, nrow = 6, ncol = 6)
rownames(mymat) <- LETTERS[1:6]
colnames(mymat) <- LETTERS[7:12]
class(mymat)
[1] "matrix"

Matrices II

rownames(mymat)
[1] "A" "B" "C" "D" "E" "F"
colnames(mymat)
[1] "G" "H" "I" "J" "K" "L"
mymat
  G  H  I  J  K  L
A 1  7 13 19 25 31
B 2  8 14 20 26 32
C 3  9 15 21 27 33
D 4 10 16 22 28 34
E 5 11 17 23 29 35
F 6 12 18 24 30 36

Arrays

myarray <- array(1:42, dim = c(7, 3, 2), dimnames = list(c("tiny", "small", 
    "medium", "medium-ish", "large", "big", "huge"), c("slow", "moderate", "fast"), 
    c("boring", "fun")))
class(myarray)
[1] "array"
dim(myarray)
[1] 7 3 2

Arrays II

dimnames(myarray)
[[1]]
[1] "tiny"       "small"      "medium"     "medium-ish" "large"     
[6] "big"        "huge"      

[[2]]
[1] "slow"     "moderate" "fast"    

[[3]]
[1] "boring" "fun"   
myarray
, , boring

           slow moderate fast
tiny          1        8   15
small         2        9   16
medium        3       10   17
medium-ish    4       11   18
large         5       12   19
big           6       13   20
huge          7       14   21

, , fun

           slow moderate fast
tiny         22       29   36
small        23       30   37
medium       24       31   38
medium-ish   25       32   39
large        26       33   40
big          27       34   41
huge         28       35   42

Lists

myvec <- c(1, 2, 4, 5, 9)
mylist <- list(vec = myvec, mat = mymat, arr = myarray, date = mydate)
class(mylist)
[1] "list"
length(mylist)
[1] 4
names(mylist)
[1] "vec"  "mat"  "arr"  "date"

Lists (II)

mylist$vec
[1] 1 2 4 5 9
mylist[[2]][1, 3]
[1] 13

So what?

attributes(mylist)
$names
[1] "vec"  "mat"  "arr"  "date"
attributes(myarray)[1:2][2]
$dimnames
$dimnames[[1]]
[1] "tiny"       "small"      "medium"     "medium-ish" "large"     
[6] "big"        "huge"      

$dimnames[[2]]
[1] "slow"     "moderate" "fast"    

$dimnames[[3]]
[1] "boring" "fun"   

Dataframes

str(df[, 25:32])
'data.frame':   2700 obs. of  8 variables:
 $ district  : int  3 3 3 3 3 3 3 3 3 3 ...
 $ schoolhigh: int  0 0 0 0 0 0 0 0 0 0 ...
 $ schoolavg : int  1 1 1 1 1 1 1 1 1 1 ...
 $ schoollow : int  0 0 0 0 0 0 0 0 0 0 ...
 $ readSS    : num  357 264 370 347 373 ...
 $ mathSS    : num  387 303 365 344 441 ...
 $ proflvl   : Factor w/ 4 levels "advanced","basic",..: 2 3 2 2 2 4 4 4 3 2 ...
 $ race      : Factor w/ 5 levels "A","B","H","I",..: 2 2 2 2 2 2 2 2 2 2 ...

Converting Between Types

Summing it Up

Other References for the Previous Section

Books

Overview

A quick note on R packages

I can haz packages?

# You can find and install packages within R
install.packages("foo")  # Name must be in quotes
install.packages(c("foo", "foo1", "foo2"))
# Packages get updated FREQUENTLY
update.packages()  # Gonna update them all

Finding Packages

The Working Directory

Ground Rules

Missing Data Symbols

a <- c(1, 2, 3)  # a is a vector with three elements
# Ask R for element 4
print(a[4])
[1] NA
a <- c(a, NULL)  # Append NULL onto a
print(a)
[1] 1 2 3
# Notice no change
a <- c(a, NA)
print(a)
[1]  1  2  3 NA

What the heck is Not a Number?

b <- 1
b <- sqrt(-b)
Warning: NaNs produced
print(b)
[1] NaN
pi/0
[1] Inf

Read in Data

CSV is Our Friend

# Set the working directory to the tutorial directory. In RStudio you can
# do this from the 'Tools' tab
setwd("~/GitHub/r_tutorial_ed")
# Load some data
df <- read.csv("data/smalldata.csv")
# Note if we don't assign data to 'df' R just prints contents of table

Let's Check What We Got

'data.frame':   2700 obs. of  6 variables:
 $ schoolavg: int  1 1 1 1 1 1 1 1 1 1 ...
 $ schoollow: int  0 0 0 0 0 0 0 0 0 0 ...
 $ readSS   : num  357 264 370 347 373 ...
 $ mathSS   : num  387 303 365 344 441 ...
 $ proflvl  : Factor w/ 4 levels "advanced","basic",..: 2 3 2 2 2 4 4 4 3 2 ...
 $ race     : Factor w/ 5 levels "A","B","H","I",..: 2 2 2 2 2 2 2 2 2 2 ...

Always Check Your Data

dim(df)
[1] 2700   32
summary(df[, 1:5])
       X               school         stuid            grade     
 Min.   :     44   Min.   :   1   Min.   :   205   Min.   :3.00  
 1st Qu.: 108677   1st Qu.: 195   1st Qu.: 44205   1st Qu.:4.00  
 Median : 458596   Median : 436   Median : 88205   Median :5.00  
 Mean   : 557918   Mean   : 460   Mean   : 99229   Mean   :5.44  
 3rd Qu.: 972291   3rd Qu.: 717   3rd Qu.:132205   3rd Qu.:7.00  
 Max.   :1499992   Max.   :1000   Max.   :324953   Max.   :8.00  
     schid      
 Min.   :  6.0  
 1st Qu.: 15.0  
 Median : 55.5  
 Mean   : 52.0  
 3rd Qu.: 75.0  
 Max.   :105.0  

Checking your data II

names(df)
 [1] "X"           "school"      "stuid"       "grade"       "schid"      
 [6] "dist"        "white"       "black"       "hisp"        "indian"     
[11] "asian"       "econ"        "female"      "ell"         "disab"      
[16] "sch_fay"     "dist_fay"    "luck"        "ability"     "measerr"    
[21] "teachq"      "year"        "attday"      "schoolscore" "district"   
[26] "schoolhigh"  "schoolavg"   "schoollow"   "readSS"      "mathSS"     
[31] "proflvl"     "race"       
names(attributes(df))
[1] "names"     "row.names" "class"    
class(df)
[1] "data.frame"

Other References for the Previous Section

Overview

In this lesson we hope to learn:

Again, read in our dataset

# Set the working directory to the tutorial directory. In RStudio you can
# do this from the 'Tools' tab
setwd("~/GitHub/r_tutorial_ed")
# Load some data
load("data/smalldata.rda")
# Note: unlike read.csv(), load() restores the saved objects into the
# workspace under their original names (here presumably 'df'), so no
# assignment is needed

Aggregation

table(df$grade, df$year)
   
    2000 2001 2002
  3  200  100  200
  4  100  200  100
  5  200  100  200
  6  100  200  100
  7  200  100  200
  8  100  200  100

Aggregation can be more complex

table(df$year, df$race)
      
         A   B   H   I   W
  2000  16 370  93   7 414
  2001  16 370  93   7 414
  2002  16 370  93   7 414

More complicated still

with(df[df$grade == 3, ], {
    table(year, race)
})
      race
year     A   B   H   I   W
  2000   4  78  22   4  92
  2001   1  44   8   2  45
  2002   0  74  20   1 105
df2 <- subset(df, grade == 3)
table(df2$year, df2$race)
      
         A   B   H   I   W
  2000   4  78  22   4  92
  2001   1  44   8   2  45
  2002   0  74  20   1 105
rm(df2)

Tables cont.

table(df$year, df$proflvl)
      
       advanced basic below basic proficient
  2000       56   313         143        388
  2001      229   183          64        424
  2002      503    27           3        367
table(df$race, df$proflvl)
   
    advanced basic below basic proficient
  A       19     7           3         19
  B      160   302         162        486
  H       54    76          33        116
  I        7     4           1          9
  W      548   134          11        549

Checking Understanding

Aggregating Data

# Reading Scores by Race
aggregate(readSS ~ race, FUN = mean, data = df)
  race readSS
1    A  508.7
2    B  460.2
3    H  473.2
4    I  485.2
5    W  533.2

Aggregate Isn't Enough

The Logic of plyr

An Aside about Split-Apply-Combine

The logic of plyr

How plyr works on dataframes

Using plyr

plyr in Action

  library(plyr)
# Summarize reading and math scale scores for each dist/grade combination:
# mean, standard deviation, and number of rows per group.
myag <- ddply(df, .(dist, grade), summarize,
              # TRUE rather than T: T is an ordinary variable that can be
              # reassigned, TRUE is a reserved constant
              mean_read = mean(readSS, na.rm = TRUE),
              mean_math = mean(mathSS, na.rm = TRUE),
              sd_read = sd(readSS, na.rm = TRUE),
              sd_math = sd(mathSS, na.rm = TRUE),
              # length() counts every row in the group, including NA scores
              count_read = length(readSS),
              count_math = length(mathSS))

Results

head(myag)
  dist grade mean_read mean_math sd_read sd_math count_read
1  205     3     451.7     406.1   93.52   72.45        200
2  205     4     438.9     459.9   77.76   79.10        100
3  205     5     487.9     462.6   85.30   75.10        200
4  205     6     514.7     526.8   76.83   66.04        100
5  205     7     530.0     521.5   84.82   74.85        200
6  205     8     575.5     581.2   79.58   83.45        100
  count_math
1        200
2        100
3        200
4        100
5        200
6        100

Sorting

df.badsort <- order(df$readSS, df$mathSS)
head(df.badsort)
[1]  106 1026    2   56  122  118

Correct Example

df.sort <- df[order(df$readSS, df$mathSS, df$attday), ]
head(df[, c(3, 23, 29, 30)])
   stuid attday readSS mathSS
1 149995    180  357.3  387.3
2  13495    180  263.9  302.6
3 106495    160  369.7  365.5
4  45205    168  346.6  344.5
5 142705    156  373.1  441.2
6  14995    157  436.8  463.4
head(df.sort[, c(3, 23, 29, 30)])
      stuid attday readSS mathSS
106  106705    160  251.5  277.0
1026  80995    176  263.2  377.8
2     13495    180  263.9  302.6
56   122402    180  264.3  271.7
122   79705    168  266.4  318.7
118   40495    173  266.9  275.0

Let's clean it up a bit more

head(df[with(df, order(-readSS, -attday)), c(3, 23, 29, 30)])
      stuid attday readSS mathSS
1631 145205    137  833.2  828.4
1462 107705    180  773.3  746.6
2252 122902    180  744.0  621.6
2341  44902    175  741.7  676.3
1482 134705    180  739.2  705.4
1630  14495    162  738.9  758.2

About sorting

M <- matrix(c(1, 2, 2, 2, 3, 6, 4, 5), 4, 2, byrow = FALSE, dimnames = list(NULL, 
    c("a", "b")))
M[order(M[, "a"], -M[, "b"]), ]
     a b
[1,] 1 3
[2,] 2 6
[3,] 2 5
[4,] 2 4

About Sorting

mytab <- table(df$grade, df$year)
mytab[order(mytab[, 1]), ]
   
    2000 2001 2002
  4  100  200  100
  6  100  200  100
  8  100  200  100
  3  200  100  200
  5  200  100  200
  7  200  100  200
mytab[order(mytab[, 2]), ]
   
    2000 2001 2002
  3  200  100  200
  5  200  100  200
  7  200  100  200
  4  100  200  100
  6  100  200  100
  8  100  200  100

Filtering Data

Basic Filtering a Column

# Gives all rows that meet this requirement
df[df$readSS > 800, ]
           X school  stuid grade schid dist white black hisp indian
1631 1281061    852 145205     8    15  205     1     0    0      0
     asian econ female ell disab sch_fay dist_fay luck ability
1631     0    0      1   0     0       0        0    0   108.3
     measerr teachq year attday schoolscore district schoolhigh
1631   6.325  155.7 2001    137       227.7       19          0
     schoolavg schoollow readSS mathSS  proflvl race
1631         1         0  833.2  828.4 advanced    W
df$grade[df$mathSS > 800]
[1] 8
# Gives all values of grade that meet this requirement

Multiple filters

df$grade[df$black == 1 & df$readSS > 650]
 [1] 8 7 8 6 6 7 8 7 8 8 8 4

Using filters to assign values

myag$spread <- NA  # create variable
myag$spread[myag$sd_read < 75] <- "low"
myag$spread[myag$sd_read > 75] <- "high"
myag$spread <- as.factor(myag$spread)
summary(myag$spread)
high  low 
  15    3 

Merging Data

Merging Data II

names(myag)
[1] "dist"       "grade"      "mean_read"  "mean_math"  "sd_read"   
[6] "sd_math"    "count_read" "count_math" "spread"    
names(df[, c(2, 3, 4, 6)])
[1] "school" "stuid"  "grade"  "dist"  

Merge Options

simple_merge <- merge(df, myag)
names(simple_merge)
 [1] "grade"       "dist"        "X"           "school"     
 [5] "stuid"       "schid"       "white"       "black"      
 [9] "hisp"        "indian"      "asian"       "econ"       
[13] "female"      "ell"         "disab"       "sch_fay"    
[17] "dist_fay"    "luck"        "ability"     "measerr"    
[21] "teachq"      "year"        "attday"      "schoolscore"
[25] "district"    "schoolhigh"  "schoolavg"   "schoollow"  
[29] "readSS"      "mathSS"      "proflvl"     "race"       
[33] "mean_read"   "mean_math"   "sd_read"     "sd_math"    
[37] "count_read"  "count_math"  "spread"     

Merge Options

Reshaping Data

head(df[, 1:10], 3)
    X school  stuid grade schid dist white black hisp indian
1  44      1 149995     3   105  495     0     1    0      0
2  53      1  13495     3    45  495     0     1    0      0
3 116      1 106495     3    45  495     0     1    0      0
head(widedf[, c(1, 28:40)], 3)
   stuid readSS.2000 mathSS.2000 proflvl.2000 race.2000  X.2001
1 149995       357.3       387.3        basic         B  441000
2  13495       263.9       302.6  below basic         B  531000
3 106495       369.7       365.5        basic         B 1161000
  school.2001 grade.2001 schid.2001 dist.2001 white.2001 black.2001
1           1          4        105       495          0          1
2           1          4         45       495          0          1
3           1          4         45       495          0          1
  hisp.2001 indian.2001
1         0           0
2         0           0
3         0           0

Wide Data v. Long Data

The reshape Function

Deconstructing reshape

widedf <- reshape(df, timevar = "year", idvar = "stuid", direction = "wide")

What about Wide to Long?

longdf <- reshape(widedf, idvar = "stuid", timevar = "year", varying = names(widedf[, 
    2:91]), direction = "long", sep = ".")

Subsetting Data

g4 <- subset(df, grade == 4)
dim(g4)
[1] 400  32
g4_b <- df[df$grade == 4, ]
identical(g4, g4_b)
[1] TRUE

Other References for the Previous Section

Overview

In this lesson we hope to learn:

Datasets

In this tutorial we will use a number of datasets of different types:

Reading Data In

load("data/midwest_schools.rda")
head(midsch[, 1:12])
  district_id school_id subject grade n1   ss1 n2   ss2 predicted
1          14       130    math     4 44 433.1 40 463.0     468.7
2          70        20    math     4 18 443.0 20 477.2     476.5
3         112        80    math     4 86 445.4 94 472.6     478.4
4         119        50    math     4 95 427.1 94 460.7     464.1
5         147        60    math     4 27 424.2 27 458.7     461.8
6         147       125    math     4 17 423.5 26 463.1     461.2
  residuals  resid_z  resid_t
1   -5.7446 -0.59190 -0.59171
2    0.7235  0.07456  0.07452
3   -5.7509 -0.59267 -0.59248
4   -3.3586 -0.34606 -0.34591
5   -3.0937 -0.31877 -0.31863
6    1.8530  0.19094  0.19085

What do we have then?

table(midsch$test_year, midsch$grade)
      
          4    5    6    7    8
  2007 1150 1094  472  638  734
  2008 1204 1146  462  588  692
  2009 1173 1092  434  592  668
  2010 1120 1090  428  610  686
  2011 1126 1060  420  618  688
length(unique(midsch$district_id))
[1] 357
length(unique(midsch$school_id))
[1] 247

Explore Data Structure (II)

table(midsch$subject, midsch$grade)
      
          4    5    6    7    8
  math 2886 2741 1108 1523 1734
  read 2887 2741 1108 1523 1734

Diagnostic Plots Perhaps

library(ggplot2)
qplot(ss1, ss2, data = midsch, alpha = I(0.07)) + theme_dpi() + geom_smooth() + 
    geom_smooth(method = "lm", se = FALSE, color = "purple")
plot of chunk diag1

plot of chunk diag1

Frequencies, Crosstabs, and t-tests

Let's take a simple example of cars

data(mtcars)  # load the data from R
head(mtcars)
                   mpg cyl disp  hp drat    wt  qsec vs am gear carb
Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1

T-test

mean(mtcars$mpg)
[1] 20.09
t.test(mtcars$mpg, mu = 18, alternative = "greater")

    One Sample t-test

data:  mtcars$mpg 
t = 1.962, df = 31, p-value = 0.02938
alternative hypothesis: true mean is greater than 18 
95 percent confidence interval:
 18.28   Inf 
sample estimates:
mean of x 
    20.09 
t.test(mtcars$mpg, mu = 22, alternative = "less")

    One Sample t-test

data:  mtcars$mpg 
t = -1.792, df = 31, p-value = 0.04144
alternative hypothesis: true mean is less than 22 
95 percent confidence interval:
 -Inf 21.9 
sample estimates:
mean of x 
    20.09 

Other References for the Previous Section

Overview

In this lesson we hope to learn:

Basic Plot

library(ggplot2)
qplot(readSS, mathSS, data = df)
plot of chunk plot1

Understanding Grammar of Graphics through A Scatterplot

qplot(readSS, mathSS, data = df, alpha = I(0.3)) + theme_dpi()
plot of chunk smallscatter

Geoms

Aesthetics

ggplot(df, aes(x = readSS, y = mathSS)) + geom_point()
plot of chunk extended
# Identical to: qplot(readSS,mathSS,data=df)

Examples of Aesthetics

data(mpg)
qplot(displ, cty, data = mpg) + theme_dpi()
plot of chunk plot2
qplot(displ, cty, data = mpg, size = cyl) + theme_dpi()
plot of chunk plot2
qplot(displ, cty, data = mpg, shape = drv, size = I(3)) + theme_dpi()
plot of chunk plot2
qplot(displ, cty, data = mpg, color = class) + theme_dpi()
plot of chunk plot2

Thinking about Aesthetics

Aesthetic Discrete Continuous
Color Disparate colors Sequential or divergent colors
Size Unique size for each value mapping to radius of value
Shape A shape for each value does not make sense

Another is ordered v. unordered

Aesthetic Ordered Unordered
Color Sequential or divergent colors Rainbow
Size Increasing or decreasing radius *does not make sense*
Shape does not make sense A shape for each value

Layers

qplot(readSS, mathSS, data = df) + facet_wrap(~grade) + theme_dpi(base_size = 12) + 
    geom_smooth(method = "lm", se = FALSE, size = I(1.2))
plot of chunk smallfacets

We can also facet across more attributes

qplot(readSS, mathSS, data = df) + facet_grid(ell ~ grade) + theme_dpi(base_size = 12) + 
    geom_smooth(method = "lm", se = FALSE, size = I(1.2))
plot of chunk smallfacets2

Visualizing Categorical Data

Structural Plots

library(vcd)
df$proflvl <- factor(df$proflvl, levels = c("advanced", "proficient", "basic", 
    "below basic"))
a <- structable(proflvl ~ race, data = df)
mosaic(a, shade = TRUE)

Another example

library(vcd)
df$proflvl <- factor(df$proflvl, levels = c("advanced", "proficient", "basic", 
    "below basic"))
a <- structable(female ~ race, data = df)
mosaic(a, shade = TRUE)

What are the basic plot types?

plot of chunk ggplot2plottypes

What are some advanced plot types?

plot of chunk ggplot2plottypesadv

Above and Beyond

plot of chunk premier

Scary R Code

library(grid)
p1<-qplot(readSS,..density..,data=df,fill=race,
      position='fill',geom='density')+scale_fill_brewer(
        type='qual',palette=2)

p2<-qplot(readSS,..fill..,data=df,fill=race,
      position='fill',geom='density')+scale_fill_brewer(
        type='qual',palette=2)+ylim(c(0,1))+theme_bw()+
          opts(legend.position='none',
               axis.text.x=theme_blank(),
               axis.text.y=theme_blank(),
               axis.ticks=theme_blank(),
               panel.margin=unit(0,"lines"))+ylab('')+
                 xlab('')

vp<-viewport(x=unit(.65,"npc"),y=unit(.73,"npc"),
             width=unit(.2,"npc"),height=unit(.2,"npc"))
print(p1)
print(p2,vp=vp)

References for the Previous Section

  1. Hadley Wickham's JSM 2012 Presentation
  2. Hadley Wickham's ggplot2 Intro Presentation
  3. The ggplot2 Homepage
  4. ggplot2 Documentation
  5. Quick R: Basic Graphs
  6. Quick R: Advanced Graphs

Overview

In this lesson we hope to learn:

Exporting data

Here's an Example

write.csv(df, file = "PATH/TO/MY.csv")
write.dta(df, file = "PATH/TO/MY.dta")
# save in R's native .rda file format
save(df, file = "PATH/TO/MY.rda", compress = "xz")

References

  1. How to Use knitr
  2. CRAN Taskview: Reproducible Research
  3. A Sweave Demo
  4. Donald Knuth on Literate Programming

Questions?

Jared:

Justin: