# MATH 2565 W 2007 Section M Week 8 R Script for Sampling Distributions

```r
###
###  MATH 2565
###  Chapter 6
###
###
```

```r
# Generate a big population of salaries

# Open R's HTML help in a browser (interactive convenience only).
help.start()

# NOTE(review): this runs code fetched over plain HTTP from a remote server;
# confirm the source is trusted before executing.
source("http://www.math.yorku.ca/~georges/R/fun.R")

baseball <- read.csv("http://wiki.math.yorku.ca/images/7/73/MATH_2565_Baseball_Data.csv")
dim(baseball)

# Pretend this is a really big population: stack 100 copies of every row.
# seq_len() stays empty for a zero-row data frame, unlike 1:nrow().
pop <- baseball[rep(seq_len(nrow(baseball)), 100), ]
dim(pop)

# keep only non-missing salaries - for illustration
pop <- pop[!is.na(pop$sal87), ]
```
```r
## population parameters: (we usually don't know these, that's why we need a sample)
# Note: var() and sd() divide by n - 1; with a population this large the
# difference from dividing by n is negligible.

# Salary:
mean(pop$sal87)   # mu
var(pop$sal87)    # sigma^2
sd(pop$sal87)     # sigma

# Proportion in National League
# (the mean of a TRUE/FALSE indicator is the proportion of TRUEs)
mean(pop$league86 == "N")
var(pop$league86 == "N")
sd(pop$league86 == "N")

# Proportion of utility players
mean(pop$posit86 == "UT")
var(pop$posit86 == "UT")
sd(pop$posit86 == "UT")
```
```r
##
##  Sample of salaries
##

# Small sample, n = 10
# sample() draws without replacement by default.
sal.10 <- sample(pop$sal87, 10)
sal.10
hist(sal.10)
mean(sal.10)
var(sal.10)
sd(sal.10)

# Large sample, n = 1000
sal.1000 <- sample(pop$sal87, 1000)
sal.1000
hist(sal.1000)
mean(sal.1000)
var(sal.1000)
sd(sal.1000)
```
```r
# What makes a large sample better than a small sample

# Density of the population salaries, with the population mean marked.
plot(density(pop$sal87), ylim = c(0, .005))
abline(v = mean(pop$sal87))   # mu

# B = 100 small samples with n = 10
# Preallocate a numeric vector, then fill it with one sample mean per draw.
means.10 <- rep(NA_real_, 100)
for (i in seq_along(means.10)) {
  means.10[i] <- mean(sample(pop$sal87, 10))
}
# One short tick per sample mean, then the density of those means.
segments(means.10, 0, means.10, 0.0001, col = 'red', lwd = .5)
lines(density(means.10), col = 'red')

# B = 100 large samples with n = 1000
means.1000 <- rep(NA_real_, 100)
for (i in seq_along(means.1000)) {
  means.1000[i] <- mean(sample(pop$sal87, 1000))
}
segments(means.1000, 0, means.1000, 0.0003, col = 'blue', lwd = .5)
lines(density(means.1000), col = 'blue')

# Fresh plot of the n = 1000 sample means on their own scale.
plot(density(means.1000), col = 'blue')
abline(v = mean(pop$sal87))
segments(means.1000, 0, means.1000, 0.003, col = 'blue', lwd = .5)
```

```r
##
##  Sample of proportions of UT players
##

# B = 100 small samples with n = 10
# Each list element holds one sample of 10 TRUE/FALSE "is a UT player" flags.
# The [[i]] index is essential: without it the list is never filled.
sam.10 <- vector("list", 100)
for (i in seq_along(sam.10)) {
  sam.10[[i]] <- sample(pop$posit86 == "UT", 10)
}
sam.10

# Empty unit-square canvas with the population proportion marked.
plot(0:1, 0:1, type = 'n')
abline(v = mean(pop$posit86 == "UT"))

# Sample proportion for each of the 100 samples (mean of a logical vector).
prop.10 <- vapply(sam.10, mean, numeric(1))
prop.10
table(prop.10)
# Distinct observed proportions and their relative frequencies.
vals <- as.numeric(names(table(prop.10)))
rel.freq <- table(prop.10) / length(prop.10)
segments(vals, 0, vals, rel.freq, col = 'red', lwd = 2)

# get a better view:
plot(c(0, .3), 0:1, type = 'n')
abline(v = mean(pop$posit86 == "UT"))
segments(vals, 0, vals, rel.freq, col = 'red', lwd = 2)
```

```r
# B = 100 medium samples with n = 50
# Each list element holds one sample of 50 TRUE/FALSE "is a UT player" flags.
# The [[i]] index is essential: without it the list is never filled.
sam.50 <- vector("list", 100)
for (i in seq_along(sam.50)) {
  sam.50[[i]] <- sample(pop$posit86 == "UT", 50)
}
sam.50

# No new plot is opened here (the calls below are left commented out), so the
# segments are drawn onto whatever plot is currently open.
# plot( 0:1, 0:1, type = 'n')
# abline( v = mean( pop$posit86 == "UT"))

# Sample proportion for each of the 100 samples.
prop.50 <- vapply(sam.50, mean, numeric(1))
prop.50
table(prop.50)
# Distinct observed proportions and their relative frequencies.
vals <- as.numeric(names(table(prop.50)))
rel.freq <- table(prop.50) / length(prop.50)
segments(vals, 0, vals, rel.freq, col = 'green', lwd = .5)
```

```r
# B = 100 large samples with n = 1000
# Each list element holds one sample of 1000 TRUE/FALSE "is a UT player" flags.
# The [[i]] index is essential: without it the list is never filled.
sam.1000 <- vector("list", 100)
for (i in seq_along(sam.1000)) {
  sam.1000[[i]] <- sample(pop$posit86 == "UT", 1000)
}
# sam.1000

# Sample proportion for each of the 100 samples.
prop.1000 <- vapply(sam.1000, mean, numeric(1))
prop.1000
table(prop.1000)
# Distinct observed proportions and their relative frequencies,
# drawn onto whatever plot is currently open.
vals <- as.numeric(names(table(prop.1000)))
rel.freq <- table(prop.1000) / length(prop.1000)
segments(vals, 0, vals, rel.freq, col = 'blue', lwd = 4)
```