# MATH 2565 W 2007 Section M Week 4 R Script for Numerical Measures

```  #
#  MATH 2565 Week 4
#  Using R for Numerical Measures
#  Chapter 3
#

help.start()
source("http://www.math.yorku.ca/~georges/R/fun.R")

source("/R/fun.R")

names(baseball)

summary(baseball)

attach(baseball)

mean(sal87)             # oops, mean doesn't know what to do with NAs

sal87

mean(sal87, na.rm = T)  # many functions have this option to ignore NAs

median ( sal87, na.rm = T ) # middle value

# how to do something to every variable in a data frame

sapply( baseball, mean, na.rm = T)   # sapply(data.frame, function, extra arguments)

?sapply

##
##  My personal favorite way of seeing a data frame at a glance
##  xqplot (in fun.R )

td(new = T)       # (fun.R) to make sure 'history' is turned on
xqplot(baseball)  # (fun.R)
# quantile plots = 'class photo lined up short to tall'
# i.e. data lined up from shortest to tallest
# (flipped cumulative distribution functions)
# for numerical data
# or boxplots for factors

```
```  ###
###  Density plots: smooth histogram to see shape of distribution
###

plot ( density ( sal87 ) )  # oops, those NAs again
plot( density (sal87, na.rm = T))

par(mfrow=c(1,1)) # reset one plot per page

plot( density (sal87, na.rm = T))    # skewed to right

par(mfrow=c(3,2))      # multiple plots per page (3 rows 2 cols)
# also starts new page

plot( density (sal87, na.rm = T))

plot( density ( runs86 ) )
plot( density ( runs86 ) , main = "runs86")

plot( density ( hits86 / atbat86 ), main= "Batting average in 1986",
xlab = paste("N =",length(hits86)))   # note symmetric

par ( mfrow = c(3,2) )  # start new page

plot( density ( hits86 / atbat86 ), main= "Batting average in 1986",
xlab = paste("N =",length(hits86)))   # note symmetric

##
##  Percentile (p. 86 +)
##  uniform quantile plots  = class photo in library(car)
##     = percentile plots

quantile ( hits86/atbat86, .50, na.rm = T)
quantile ( hits86/atbat86, .25, na.rm = T)   # 1st quartile
quantile ( hits86/atbat86, c(.25, .5, .75 ), na.rm = T )   # many at once

# plotting quantiles: the uniform quantile plot

library(car)

qq.plot( hits86 / atbat86, dist = 'unif', line = 'none')

plot( density ( sal87 , na.rm = T), main= "Salary in 1987",
xlab = paste("N =",sum(!is.na(sal87))))   # skewed to right

qq.plot( sal87, dist = 'unif', line = 'none', main="Salary in 1987")
# silently drops NAs (some would consider this bad)
```
```  ##
##  Density plots , means and medians
##

par(mfrow = c(1,1))    # one plot per page

plot( density( sal87, na.rm = T), main = "Baseball salaries in 1987",
xlab = "Salary in \$1000's")

# add vertical line at mean

abline( v = mean( sal87, na.rm = T ), col = 'blue', lwd = 2)

# add vertical line median (where will it go)

abline( v = median( sal87, na.rm = T ), col = 'red', lwd = 2)

legend( locator(1), legend = c("mean","median"),
lty = c(1,1) ,    # line type, 1 is regular
lwd = 2, col = c('blue','red'))

## EXERCISE: Repeat above ( density and two lines ) with batting average

```
```  ##
##  Might skip in class:
##  How to create a variable with missing values removed
##  -- an example of logical selection
##

sal87
is.na(sal87)       # logical vector
!is.na(sal87)      # logical negation, i.e. non-missing values
sal87.c <- sal87[ !is.na( sal87 ) ]    # selection with logical vector

sal87.c    # no NAs

# if you have a lot of this to do
na.rm
na.rm <- function(x) x[!is.na(x)]

# could now use:

sal87.c <- na.rm( sal87 )

##
##  Weighted averages
##

class.size <- c( 10, 15, 10, 20, 200 )
class.size

mean( class.size )      # this is mean if you ask instructors
weighted.mean ( class.size, c(2,3,2,1,1))
median ( class.size )

# what if you asked students?
# you would get 10 students who say 10 from 1st class
#      15 who say 15
#     another 10 who say 10
#     20 who say 20
#     200 who say 200 !!

weighted.mean ( class.size, class.size)     # a very different perspective

```
```  ##
##  Box and whisker plot
##

boxplot( sal87 )
boxplot(  hits86 / atbat86 )

boxplot( runs86 )

## The really  nice use: side by side boxplot

split( sal87, league87 )

boxplot ( split( sal87, league87 ))

boxplot( split( sal87, posit86 ))

# some data manipulation in R. Let's get rid of posit with small n

z <- split(sal87, posit86)
z

z.size <- sapply( z, length )
z.size

# Let's keep positions with more than 5. Note use of logical selection

zbig <-  z [ z.size >  5 ]
zbig

boxplot( zbig )

# Let's order then by median

zbig.med <- sapply( zbig, median , na.rm = T)
zbig.med
order( zbig.med )  # indices smallest to largest
rev( order ( zbig.med ) )   # indices largest to smalles

zbig.o <- zbig [rev( order ( zbig.med ) )]
boxplot( zbig.o )
boxplot( zbig.o , xlab = "position", ylab = "Salary in \$1,000s")

```
```  ###
###
###   Measures of variation or 'spread'
###
###

range( sal87 )
range( sal87 , na.rm = T)    # not quite 'range' in the book
diff( range( sal87, na.rm = T))  # 'range' as in the book

IQR(sal87, na.rm = T)

diff( quantile( sal87, c(.25,.75), na.rm = T) )    # 1st quartile

# in fact :
IQR

var( sal87 )               # sample variance only
var( sal87, na.rm = T)     # in \$1000s squared
sd( sal87 , na.rm = T)     # in \$1000s
```

```  ##
##
##  Standardizing
##
##

z.sal87 <- scale( sal87 )    # silently drops NAs

par( mfrow = c(2,2) )

plot( density ( sal87, na.rm = T))
plot( density ( z.sal87, na.rm = T))
abline( v = mean( z.sal87, na.rm = T), col = 'red')

# What is the mean of the z-scores?
```