MATH 2565 W 2007 Section M Week 4 R Script for Numerical Measures

From MathWiki

  #
  #  MATH 2565 Week 4
  #  Using R for Numerical Measures
  #  Chapter 3
  #
 
  # download functions in 'fun.R'
  
  help.start()
  source("http://www.math.yorku.ca/~georges/R/fun.R")
 
  baseball <-  read.csv("http://wiki.math.yorku.ca/images/7/73/MATH_2565_Baseball_Data.csv")
 
  source("/R/fun.R")
 
  names(baseball)
  
  summary(baseball)
  
  attach(baseball)
 
  mean(sal87)             # oops, mean doesn't know what to do with NAs
 
  sal87
  
  mean(sal87, na.rm = T)  # many functions have this option to ignore NAs
  
  median ( sal87, na.rm = T ) # middle value
  
  # how to do something to every variable in a data frame
  
  sapply( baseball, mean, na.rm = T)   # sapply(data.frame, function, extra arguments)
 
  ?sapply
 
  ##
  ##  My personal favorite way of seeing a data frame at a glance
  ##  xqplot (in fun.R )
 
 
  td(new = T)       # (fun.R) to make sure 'history' is turned on
  xqplot(baseball)  # (fun.R)
                    # quantile plots = 'class photo lined up short to tall'
                    # i.e. data lined up from shortest to tallest
                    # (flipped cumulative distribution functions)
                    # for numerical data
                    # or boxplots for factors
 
                    
  ###
  ###  Density plots: smooth histogram to see shape of distribution
  ###
 
  plot ( density ( sal87 ) )  # oops, those NAs again
  plot( density (sal87, na.rm = T))
  
  par(mfrow=c(1,1)) # reset one plot per page
 
  plot( density (sal87, na.rm = T))    # skewed to right
  
  par(mfrow=c(3,2))      # multiple plots per page (3 rows 2 cols)
                         # also starts new page
  
 
  plot( density (sal87, na.rm = T))
 
  plot( density ( runs86 ) )
  plot( density ( runs86 ) , main = "runs86")
  
  plot( density ( hits86 / atbat86 ), main= "Batting average in 1986",
               xlab = paste("N =",length(hits86)))   # note symmetric
 
  par ( mfrow = c(3,2) )  # start new page
  
  plot( density ( hits86 / atbat86 ), main= "Batting average in 1986",
               xlab = paste("N =",length(hits86)))   # note symmetric
  
  ##
  ##  Percentile (p. 86 +)
  ##  uniform quantile plots  = class photo in library(car)
  ##     = percentile plots
  
  
  quantile ( hits86/atbat86, .50, na.rm = T)
  quantile ( hits86/atbat86, .25, na.rm = T)   # 1st quartile
  quantile ( hits86/atbat86, c(.25, .5, .75 ), na.rm = T )   # many at once
  
  # plotting quantiles: the uniform quantile plot
  
  library(car)
  
  qq.plot( hits86 / atbat86, dist = 'unif', line = 'none')
 
  plot( density ( sal87 , na.rm = T), main= "Salary in 1987",
               xlab = paste("N =",sum(!is.na(sal87))))   # skewed to right
 
  qq.plot( sal87, dist = 'unif', line = 'none', main="Salary in 1987")
                   # silently drops NAs (some would consider this bad)
  ##
  ##  Density plots , means and medians
  ##
  
  par(mfrow = c(1,1))    # one plot per page
  
  plot( density( sal87, na.rm = T), main = "Baseball salaries in 1987",
       xlab = "Salary in $1000's")
 
  # add vertical line at mean
  
  abline( v = mean( sal87, na.rm = T ), col = 'blue', lwd = 2)
  
  # add vertical line median (where will it go)
 
  abline( v = median( sal87, na.rm = T ), col = 'red', lwd = 2)
  
  legend( locator(1), legend = c("mean","median"),
           lty = c(1,1) ,    # line type, 1 is regular
           lwd = 2, col = c('blue','red'))
 
  ## EXERCISE: Repeat above ( density and two lines ) with batting average
 
  ##
  ##  Might skip in class:
  ##  How to create a variable with missing values removed
  ##  -- an example of logical selection
  ##
  
  sal87
  is.na(sal87)       # logical vector
  !is.na(sal87)      # logical negation, i.e. non-missing values
  sal87.c <- sal87[ !is.na( sal87 ) ]    # selection with logical vector
  
  sal87.c    # no NAs
  
  # if you have a lot of this to do
  na.rm
  na.rm <- function(x) x[!is.na(x)]
  
  # could now use:
  
  sal87.c <- na.rm( sal87 )
  
  ##
  ##  Weighted averages
  ##
  
  class.size <- c( 10, 15, 10, 20, 200 )
  class.size
  
  mean( class.size )      # this is mean if you ask instructors
  weighted.mean ( class.size, c(2,3,2,1,1))
  median ( class.size )
  
  # what if you asked students?
  # you would get 10 students who say 10 from 1st class
  #      15 who say 15
  #     another 10 who say 10
  #     20 who say 20
  #     200 who say 200 !!
  
  weighted.mean ( class.size, class.size)     # a very different perspective
  
  
  ##
  ##  Box and whisker plot
  ##
 
  boxplot( sal87 )
  boxplot(  hits86 / atbat86 )
  
  boxplot( runs86 )
  
  ## The really  nice use: side by side boxplot
  
  split( sal87, league87 )
  
  boxplot ( split( sal87, league87 ))
  
  boxplot( split( sal87, posit86 ))
  
  # some data manipulation in R. Let's get rid of posit with small n
  
  z <- split(sal87, posit86)
  z
  
  z.size <- sapply( z, length )
  z.size
 
  # Let's keep positions with more than 5. Note use of logical selection
  
  zbig <-  z [ z.size >  5 ]
  zbig
 
  boxplot( zbig )
 
  # Let's order then by median
  
  zbig.med <- sapply( zbig, median , na.rm = T)
  zbig.med
  order( zbig.med )  # indices smallest to largest
  rev( order ( zbig.med ) )   # indices largest to smalles
 
  zbig.o <- zbig [rev( order ( zbig.med ) )]
  boxplot( zbig.o )
  boxplot( zbig.o , xlab = "position", ylab = "Salary in $1,000s")
  
  ###
  ###
  ###   Measures of variation or 'spread'
  ###
  ###
  
  range( sal87 )
  range( sal87 , na.rm = T)    # not quite 'range' in the book
  diff( range( sal87, na.rm = T))  # 'range' as in the book
 
  IQR(sal87, na.rm = T)
  
  diff( quantile( sal87, c(.25,.75), na.rm = T) )    # 1st quartile
 
  # in fact :
  IQR
  
  var( sal87 )               # sample variance only
  var( sal87, na.rm = T)     # in $1000s squared
  sd( sal87 , na.rm = T)     # in $1000s


  ##
  ##
  ##  Standardizing
  ##
  ##
  
  z.sal87 <- scale( sal87 )    # silently drops NAs
 
  par( mfrow = c(2,2) )
  
  plot( density ( sal87, na.rm = T))
  plot( density ( z.sal87, na.rm = T))
  abline( v = mean( z.sal87, na.rm = T), col = 'red')
  
  # What is the mean of the z-scores?