# MATH 2565 W 2007 Section M Week 4 R Script for Numerical Measures

### From MathWiki

# # MATH 2565 Week 4 # Using R for Numerical Measures # Chapter 3 # # download functions in 'fun.R' help.start() source("http://www.math.yorku.ca/~georges/R/fun.R") baseball <- read.csv("http://wiki.math.yorku.ca/images/7/73/MATH_2565_Baseball_Data.csv") source("/R/fun.R") names(baseball) summary(baseball) attach(baseball) mean(sal87) # oops, mean doesn't know what to do with NAs sal87 mean(sal87, na.rm = T) # many functions have this option to ignore NAs median ( sal87, na.rm = T ) # middle value # how to do something to every variable in a data frame sapply( baseball, mean, na.rm = T) # sapply(data.frame, function, extra arguments) ?sapply ## ## My personal favorite way of seeing a data frame at a glance ## xqplot (in fun.R ) td(new = T) # (fun.R) to make sure 'history' is turned on xqplot(baseball) # (fun.R) # quantile plots = 'class photo lined up short to tall' # i.e. data lined up from shortest to tallest # (flipped cumulative distribution functions) # for numerical data # or boxplots for factors

### ### Density plots: smooth histogram to see shape of distribution ### plot ( density ( sal87 ) ) # oops, those NAs again plot( density (sal87, na.rm = T)) par(mfrow=c(1,1)) # reset one plot per page plot( density (sal87, na.rm = T)) # skewed to right par(mfrow=c(3,2)) # multiple plots per page (3 rows 2 cols) # also starts new page plot( density (sal87, na.rm = T)) plot( density ( runs86 ) ) plot( density ( runs86 ) , main = "runs86") plot( density ( hits86 / atbat86 ), main= "Batting average in 1986", xlab = paste("N =",length(hits86))) # note symmetric par ( mfrow = c(3,2) ) # start new page plot( density ( hits86 / atbat86 ), main= "Batting average in 1986", xlab = paste("N =",length(hits86))) # note symmetric ## ## Percentile (p. 86 +) ## uniform quantile plots = class photo in library(car) ## = percentile plots quantile ( hits86/atbat86, .50, na.rm = T) quantile ( hits86/atbat86, .25, na.rm = T) # 1st quartile quantile ( hits86/atbat86, c(.25, .5, .75 ), na.rm = T ) # many at once # plotting quantiles: the uniform quantile plot library(car) qq.plot( hits86 / atbat86, dist = 'unif', line = 'none') plot( density ( sal87 , na.rm = T), main= "Salary in 1987", xlab = paste("N =",sum(!is.na(sal87)))) # skewed to right qq.plot( sal87, dist = 'unif', line = 'none', main="Salary in 1987") # silently drops NAs (some would consider this bad)

## ## Density plots , means and medians ## par(mfrow = c(1,1)) # one plot per page plot( density( sal87, na.rm = T), main = "Baseball salaries in 1987", xlab = "Salary in $1000's") # add vertical line at mean abline( v = mean( sal87, na.rm = T ), col = 'blue', lwd = 2) # add vertical line median (where will it go) abline( v = median( sal87, na.rm = T ), col = 'red', lwd = 2) legend( locator(1), legend = c("mean","median"), lty = c(1,1) , # line type, 1 is regular lwd = 2, col = c('blue','red')) ## EXERCISE: Repeat above ( density and two lines ) with batting average

## ## Might skip in class: ## How to create a variable with missing values removed ## -- an example of logical selection ## sal87 is.na(sal87) # logical vector !is.na(sal87) # logical negation, i.e. non-missing values sal87.c <- sal87[ !is.na( sal87 ) ] # selection with logical vector sal87.c # no NAs # if you have a lot of this to do na.rm na.rm <- function(x) x[!is.na(x)] # could now use: sal87.c <- na.rm( sal87 ) ## ## Weighted averages ## class.size <- c( 10, 15, 10, 20, 200 ) class.size mean( class.size ) # this is mean if you ask instructors weighted.mean ( class.size, c(2,3,2,1,1)) median ( class.size ) # what if you asked students? # you would get 10 students who say 10 from 1st class # 15 who say 15 # another 10 who say 10 # 20 who say 20 # 200 who say 200 !! weighted.mean ( class.size, class.size) # a very different perspective

## ## Box and whisker plot ## boxplot( sal87 ) boxplot( hits86 / atbat86 ) boxplot( runs86 ) ## The really nice use: side by side boxplot split( sal87, league87 ) boxplot ( split( sal87, league87 )) boxplot( split( sal87, posit86 )) # some data manipulation in R. Let's get rid of posit with small n z <- split(sal87, posit86) z z.size <- sapply( z, length ) z.size # Let's keep positions with more than 5. Note use of logical selection zbig <- z [ z.size > 5 ] zbig boxplot( zbig ) # Let's order then by median zbig.med <- sapply( zbig, median , na.rm = T) zbig.med order( zbig.med ) # indices smallest to largest rev( order ( zbig.med ) ) # indices largest to smalles zbig.o <- zbig [rev( order ( zbig.med ) )] boxplot( zbig.o ) boxplot( zbig.o , xlab = "position", ylab = "Salary in $1,000s")

### ### ### Measures of variation or 'spread' ### ### range( sal87 ) range( sal87 , na.rm = T) # not quite 'range' in the book diff( range( sal87, na.rm = T)) # 'range' as in the book IQR(sal87, na.rm = T) diff( quantile( sal87, c(.25,.75), na.rm = T) ) # 1st quartile # in fact : IQR var( sal87 ) # sample variance only var( sal87, na.rm = T) # in $1000s squared sd( sal87 , na.rm = T) # in $1000s

## ## ## Standardizing ## ## z.sal87 <- scale( sal87 ) # silently drops NAs par( mfrow = c(2,2) ) plot( density ( sal87, na.rm = T)) plot( density ( z.sal87, na.rm = T)) abline( v = mean( z.sal87, na.rm = T), col = 'red') # What is the mean of the z-scores?