# MATH 2565 W 2007 Section M Week 9 R Script for Confidence Intervals and Tests
#
# From MathWiki

  #
  #  MATH 2565 Week 9
  #  Inference about means and proportions
  #  Chapters 7,8
  #
  
  # Chapter 7:  Confidence interval for one mean, one proportion
  # Chapter 8:  Hypothesis testing for one mean, one proportion
  # Chapter 9:  Confidence intervals and hypothesis testing for two populations
  
  # Estimating one mean
  
  # Load course helper functions from the instructor's site.
  # (Contents not visible here -- nothing below appears to depend on them.)
  source("http://www.math.yorku.ca/~georges/R/fun.R")


  # Create a population  same as in Chapter 6
  
  # Read the baseball data set from the course wiki.
  baseball <-  read.csv("http://wiki.math.yorku.ca/images/7/73/MATH_2565_Baseball_Data.csv")
  dim(baseball)
  # Replicate every row 100 times to build a large artificial 'population'.
  pop <- baseball[ rep( 1: nrow(baseball), 100), ]
  pop <- pop[ !is.na( pop$sal87 ), ]   # keep only non-missing salaries - to keep it the same as in Chap. 6
  # 1986 batting average = hits / at-bats, rounded to 4 decimal places.
  pop$ba86 <- round(pop$hits86 / pop$atbat86 , 4)
  dim(pop)
  head(pop)
  #
  # We need to pretend that we don't have data on the 'population' but we
  # can sample individuals and get information from them.
  #
  # Suppose we want to estimate ba86 and we can get a sample of 10
  
  # Draw a simple random sample of n = 10 batting averages.
  n <- 10
  sam <- sample( pop$ba86, n)
  sam
  
  # what are we going to do with it to estimate mu = pop. mean of ba86
  
  # Answer: use the sample mean as the point estimate of mu
  
  mean(sam)
  # How accurate an estimate is this??
  
  # Suppose we know (or think we know) that the population SD of Y is 0.03
  
  # SE of the sample mean: sigma / sqrt(n)
  
  se.xbar <- 0.03 / sqrt(n)
  se.xbar
  # point estimate:
  mean(sam)
  
  # point estimate with SE
  # FIX: the transcript printed mean(pop$ba86) here, but the point estimate
  # must come from the sample -- we are pretending the population is unknown.
  mean(sam)
  se.xbar
  
  # More 'sophisticated' " 95% Confidence Interval"
  # FIX: the interval is centered at the SAMPLE mean, not the population mean.
  
  mean(sam) + c( - 1.96, 1.96) * se.xbar


  # What about a 90% CI?
  
  # Where does 1.96 come from?  Check that it captures 95% of the
  # standard normal distribution:
  pnorm(1.96) - pnorm(-1.96)
  pnorm(1.96)
  
  # For a 95% interval, alpha = 1 - confidence level:
  alpha <- 1 - .95
  alpha
  
  # The critical value cuts off alpha/2 in each tail:
  qnorm(1 - alpha / 2)
  
  # Repeat the calculation for a 90% interval:
  alpha <- 1 - .9
  alpha
  crit.value <- qnorm(1 - alpha / 2)
  crit.value    # approximately 1.645
  
  # General recipe for a CI for a mean (normal approximation: population
  # normal or n large) when the population sigma is known:
  #
  #   xbar +/- crit.value * SE(xbar),   where SE(xbar) = sigma / sqrt(n)
  #
  
  
  ci.1 <- function( x, sigma,  conf = .95 ) {
       # Confidence interval for a single mean when the population SD
       # (sigma) is KNOWN, using the standard normal critical value.
       #   x     -- numeric sample
       #   sigma -- known population standard deviation
       #   conf  -- confidence level (default 95%)
       # Returns a list: the interval and the confidence level used.
       half.width <- qnorm(1 - (1 - conf) / 2) * sigma / sqrt(length(x))
       list( "Confidence Interval for mean, sigma known" = mean(x) + c(-half.width, half.width),
           "Confidence Level"= conf)
  }
  
  ci.1 ( sam, .03 )
  ci.1 ( sam, .03, .9)
  ci.1 ( sam, .03, .99)
  
  
  # Thought experiment: what happens if we take 100 samples
  
  # One row per sample: lower and upper limits of its 95% interval.
  cis <- matrix(NA, ncol = 2, nrow = 100)
  for ( ii in 1:100 ) {
       sam <- sample(pop$ba86, 10)
       # FIX: the wiki transcript read 'ci.1( sam, .03)1', a syntax error;
       # [[1]] extracts the interval (first list component) from the result.
       cis[ ii, ] <- ci.1( sam, .03)[[1]]
  }
  cis
  
  # Empty frame: x spans plausible batting averages, y indexes the samples.
  plot( x = c(.2,.35), y = c(1,100), type = 'n')
  
  # Draw each interval as a horizontal segment.
  for( ii in 1:100 ) {
      lines( x = cis[ii,], y =c( ii, ii), lwd = .5)
  }
  
  # Now we get to do something we can only do when we're simulating
  
  # Mark the true mean; about 95% of the segments should cross this line.
  abline( v = mean( pop$ba86), col = 'red')
  
  
  # What makes a 95% CI a 95% CI?
  # The procedure covers the true value 95% of the time.
  # With a single sample we have no idea whether we hit or not
  # We know that we would miss 5% = 1 - 95% of the time.
  # Maybe this is one of them, maybe it's not!!
  
  # Note: we got close to 5% misses BECAUSE our guess of sigma = 0.03 was pretty close
  # Note: the number of misses has a binomial distribution or approx Poisson
  #
  
  # What would happen if we had a bigger N
  n <- 1000
  cis <- matrix(NA, ncol = 2, nrow = 100)
  for ( ii in 1:100 ) {
       sam <- sample(pop$ba86, n)
       # FIX: transcript read 'ci.1( sam, .03)1' (syntax error); [[1]] pulls
       # out the interval.  Also use n instead of repeating the literal 1000.
       cis[ ii, ] <- ci.1( sam, .03)[[1]]
  }
  cis
  plot( x = c(.2,.35), y = c(1,100), type = 'n')
  for( ii in 1:100 ) {
      lines( x = cis[ii,], y =c( ii, ii), lwd = .5)
  }
  # Now we get to do something we can only do when we're simulating
  # Intervals are much narrower now, but coverage stays near 95%.
  abline( v = mean( pop$ba86), col = 'red')
  
  ###
  ###
  ###   More realistic: sigma not known
  ###   It would be unusual to know sigma and NOT mu
  ###
  
  # Idea:
  #     1) Use sd(sam) instead of sigma
  #     2) Use the 't' distribution instead of z to allow for added error
  #        due to uncertainty in sd
  n <- 10
  sam <- sample( pop$ba86 , 10)
  sam
  xbar <- mean(sam)
  xbar
  # The sample SD stands in for the unknown population sigma.
  est.sd <- sd(sam)
  se.xbar <- est.sd / sqrt(n)
  # t critical value with n - 1 degrees of freedom for a 95% interval.
  crit.value <- qt( 1 - .05/2, df = n - 1)
  crit.value    # note larger
  
  # confidence interval : xbar +/- crit.value x se.xbar
  
  ci.2 <- function( x,  conf = .95 ) {
       # t-based confidence interval for one mean when sigma must be
       # estimated from the sample itself.
       #   x    -- numeric sample
       #   conf -- confidence level (default 95%)
       # Returns a list: the interval and the confidence level used.
       n.obs <- length(x)
       est.se <- sd(x) / sqrt(n.obs)
       t.crit <- qt(1 - (1 - conf) / 2, df = n.obs - 1)
       list( "Confidence Interval for mean, sigma unknown" = mean(x) + c(-1, 1) * t.crit * est.se,
           "Confidence Level"= conf)
  }
  
  
  
  
  # Interval from the sample drawn above.
  ci.2(sam)
  
  
  
  ###
  ###  Confidence interval for a proportion
  ###
  
  head(pop)
  # Sample n = 50 league indicators (TRUE = American league).
  n <- 50
  sam <- sample( pop$league86 == "A", 50)
  sam
  p <- mean(sam)      # proportion in American league
  p
  
  # Use normal appxn
  
  # Estimated SE of a sample proportion: sqrt(p(1 - p)/n).
  se.p <- sqrt( p * (1-p) / n)
  se.p
  
  conf <- .95
  alpha <- 1 - conf
  alpha
  crit.value <- qnorm( 1 - alpha / 2)
  crit.value
  # CI: p +/- crit.value * SE(p)
  ci <- p + c(-1,1) * crit.value * se.p
  ci
  # true value
  # Population proportion (known only because we built the population).
  mean( pop$league86 == "A")
  ci.3 <- function( x , conf = .95 ){
     # Normal-approximation confidence interval for one proportion.
     #   x    -- vector of 0/1 or FALSE/TRUE indicators
     #   conf -- confidence level (default 95%)
     # Returns a list: the interval and the confidence level used.
     p <- mean(x)
     n <- length(x)
     # FIX: the SE of a sample proportion is sqrt(p(1-p)/n); the transcript
     # divided by sqrt(n) inside the sqrt (compare the correct interactive
     # version a few lines above), which overstates the SE.
     se.p <- sqrt( p * (1-p) / n )
     alpha <- 1 - conf
     crit.value <- qnorm( 1 - alpha/2)
     # Rule of thumb: need at least 5 expected successes and failures.
     if ( n * p < 5 || n * ( 1 - p ) < 5 ) warning("Normal approximation may be invalid")
       list( "Confidence Interval for proportion using normal appx" = p + c(-1,1) * crit.value * se.p,
           "Confidence Level"= conf)
  }


   ## Chapter 8: Hypothesis testing
   ## Answering a question: Do we have evidence to 'prove' that mean(ba) != 0.300
   ##
   ## Mean, sigma unknown, using normal appxn (n large or if n small, pop near normal)
   ##
   # Example: Test whether mean ba == .3 or is it < .3
   # H0: mu = 0.3   vs   HA: mu < 0.3 (one-sided, left-tailed)
   mu.0 <- .3
   alpha <- .05
   n <- 50
   sam <- sample ( pop$ba86, 50)
   sam
   xbar <- mean(sam)
   xbar
   # Estimated standard error of the sample mean.
   se.xbar <- sd(sam) / sqrt(n)
   se.xbar
   # t statistic: how many SEs the sample mean falls from mu.0.
   test.stat <- ( xbar - mu.0 )/ se.xbar
   test.stat
   
   # Alternative is left-sided
   # Reject H0 when test.stat < crit.val (a negative lower-tail cutoff).
   crit.val <- qt( alpha, df = n - 1)
   crit.val
   
   # SO???
   
   # Reject H0 and accept HA that mu < .3
   # OR report observed p-value
   # Left-tail p-value from the t distribution with n - 1 df.
   pt( test.stat, df = n - 1 )
   
   
   ###
   ###  Hypothesis test for a proportion
   ###  e.g. proportion in American league
   ###  HO: popn p = .5 vs !=  (two-tail test)
   
   p.0 <- 0.5
   alpha <- .05
   
   n <- 50
   # Indicator sample: TRUE if the player is in the American league.
   sam <- sample ( pop$league86 == "A" , n)
   
   # Sample proportion (mean of the 0/1 indicators).
   p <- mean(sam)
   p
   
   # Under H0 the SE is computed from p.0, not from the observed p.
   se.p.0 <- sqrt( p.0 * ( 1 - p.0) / n)
   se.p.0
   
   # z statistic for the sample proportion.
   test.stat <- ( p - p.0) / se.p.0
   test.stat
   
   #
   # Two-sided critical value from the standard normal.
   crit.val <- qnorm( 1 - alpha/2 )
   crit.val
   # Reject if test.stat < - crit.val OR test.stat > crit.val