MATH 2565 W 2007 Section M Week 8 R Script for Sampling Distributions

From MathWiki

   ###
   ###  MATH 2565
   ###  Chapter 6
   ###
   ###


   # Build a large artificial population of salaries from the baseball data


       help.start()
       source("http://www.math.yorku.ca/~georges/R/fun.R")
       baseball <- read.csv(
           "http://wiki.math.yorku.ca/images/7/73/MATH_2565_Baseball_Data.csv"
       )
       dim(baseball)
   # Stack 100 copies of every row and pretend the result is a really big
   # population
       pop <- baseball[rep(1:nrow(baseball), times = 100), ]
       dim(pop)
       head(pop)
   # Keep only the rows whose sal87 value is not missing - for illustration
       pop <- pop[!is.na(pop$sal87), ]
   ## Population parameters (usually unknown, which is why a sample is needed)


   # Salary:
       with(pop, mean(sal87))   # mu
       with(pop, var(sal87))    # sigma^2
       with(pop, sd(sal87))     # sigma
   # Proportion in National League (mean/var/sd of the TRUE/FALSE indicator)
       with(pop, mean(league86 == "N"))
       with(pop, var(league86 == "N"))
       with(pop, sd(league86 == "N"))
   # Proportion of utility players (mean/var/sd of the TRUE/FALSE indicator)
       with(pop, mean(posit86 == "UT"))
       with(pop, var(posit86 == "UT"))
       with(pop, sd(posit86 == "UT"))
   ##
   ##  Single samples of salaries drawn without replacement from pop
   ##
   # Small sample, n = 10: show the values, their histogram and summaries
       sal.10 <- sample(pop$sal87, size = 10, replace = FALSE)
       sal.10
       hist(sal.10)
       mean(sal.10)
       var(sal.10)
       sd(sal.10)
   # Large sample, n = 1000: same displays for comparison
       sal.1000 <- sample(pop$sal87, size = 1000, replace = FALSE)
       sal.1000
       hist(sal.1000)
       mean(sal.1000)
       var(sal.1000)
       sd(sal.1000)
   # What makes a large sample better than a small sample?
   # Draw the population density and mark the population mean on it.
       plot(density(pop$sal87), ylim = c(0, 0.005))
       abline(v = mean(pop$sal87))   # mu
   # B = 100 small samples with n = 10: one sample mean per replication
       means.10 <- vapply(
           seq_len(100),
           function(b) mean(sample(pop$sal87, size = 10)),
           numeric(1)
       )
       # short red ticks at each sample mean, plus their density curve
       segments(means.10, 0, means.10, 0.0001, col = "red", lwd = 0.5)
       lines(density(means.10), col = "red")


   # B = 100 large samples with n = 1000: one sample mean per replication
       means.1000 <- vapply(
           seq_len(100),
           function(b) mean(sample(pop$sal87, size = 1000)),
           numeric(1)
       )
       # blue ticks at each sample mean, plus their density curve
       segments(means.1000, 0, means.1000, 0.0003, col = "blue", lwd = 0.5)
       lines(density(means.1000), col = "blue")


   # Fresh plot showing only the n = 1000 sample means, with mu marked
       plot(density(means.1000), col = "blue")
       abline(v = mean(pop$sal87))
       segments(means.1000, 0, means.1000, 0.003, col = "blue", lwd = 0.5)


   ##
   ##  Sample of proportions of UT players
   ##
   # B = 100 small samples with n = 10
   # sam.10 is a list with one element per sample; each element is a logical
   # vector of length 10 (TRUE where posit86 is "UT").
       sam.10 <- vector("list", 100)
       for (i in seq_along(sam.10)) {
           # store each sample in its own list slot so that all 100 are kept
           sam.10[[i]] <- sample(pop$posit86 == "UT", 10)
       }
       sam.10
       plot(0:1, 0:1, type = "n")
       abline(v = mean(pop$posit86 == "UT"))   # population proportion
       # the mean of a logical sample is the proportion of TRUE values in it
       prop.10 <- vapply(sam.10, mean, numeric(1))
       prop.10
       table(prop.10)
       # distinct observed proportions and their relative frequencies
       vals <- as.numeric(names(table(prop.10)))
       rel.freq <- table(prop.10) / length(prop.10)
       segments(vals, 0, vals, rel.freq, col = "red", lwd = 2)
       # get a better view:
       plot(c(0, .3), 0:1, type = "n")
       abline(v = mean(pop$posit86 == "UT"))
       segments(vals, 0, vals, rel.freq, col = "red", lwd = 2)


   # B = 100 medium samples with n = 50
   # sam.50 is a list with one element per sample; each element is a logical
   # vector of length 50 (TRUE where posit86 is "UT").
       sam.50 <- vector("list", 100)
       for (i in seq_along(sam.50)) {
           # store each sample in its own list slot so that all 100 are kept
           sam.50[[i]] <- sample(pop$posit86 == "UT", 50)
       }
       sam.50
       # plot( 0:1, 0:1, type = 'n')
       # abline( v = mean( pop$posit86 == "UT"))
       # the mean of a logical sample is the proportion of TRUE values in it
       prop.50 <- vapply(sam.50, mean, numeric(1))
       prop.50
       table(prop.50)
       # distinct observed proportions and their relative frequencies,
       # drawn as green spikes on the plot that is currently open
       vals <- as.numeric(names(table(prop.50)))
       rel.freq <- table(prop.50) / length(prop.50)
       segments(vals, 0, vals, rel.freq, col = "green", lwd = .5)


   # B = 100 large samples with n = 1000
   # sam.1000 is a list with one element per sample; each element is a logical
   # vector of length 1000 (TRUE where posit86 is "UT").
       sam.1000 <- vector("list", 100)
       for (i in seq_along(sam.1000)) {
           # store each sample in its own list slot so that all 100 are kept
           sam.1000[[i]] <- sample(pop$posit86 == "UT", 1000)
       }
       # sam.1000


       # the mean of a logical sample is the proportion of TRUE values in it
       prop.1000 <- vapply(sam.1000, mean, numeric(1))
       prop.1000
       table(prop.1000)
       # distinct observed proportions and their relative frequencies,
       # drawn as blue spikes on the plot that is currently open
       vals <- as.numeric(names(table(prop.1000)))
       rel.freq <- table(prop.1000) / length(prop.1000)
       segments(vals, 0, vals, rel.freq, col = "blue", lwd = 4)