# MATH 2565 W 2007 Section M Week 3 R Script for Charts and Histograms

```  #
#  MATH 2565 Week 3
#  Using R for Graphs, Charts and Tables
#  Chapter 2
#

# Load the course helper functions (this script later uses atotal, acond
# and td, each marked "from fun.R") straight from the course web site.
# NOTE(review): this runs code fetched over plain http -- confirm the
# source is trusted before running.
source("http://www.math.yorku.ca/~georges/R/fun.R")

# Note: if you want to use these functions when you are not connected to
# the internet, first save a local copy of fun.R
# (NOTE(review): the line that saved the copy appears to be missing from
# this script -- TODO restore it);
# then you can load the functions with

source("/fun.R")

# Note that this installs the file in your 'root' directory. You might
# want to put it somewhere else (and change the path above to match)

# First look at the 'baseball' data frame: its dimensions, its variable
# names and its first few rows.
dim(baseball)
names(baseball)
head( baseball )   # or baseball[ 1:6, ]

# The names are stored in two columns. '$' extracts a column from a data
# frame; it is written without a backslash ('\$' is not valid R).
baseball$name1
baseball$name2
# TRUE if any combined name1/name2 string occurs more than once
any(duplicated(paste( baseball$name1, baseball$name2 )))
```
```  rownames(baseball) <- paste( baseball$name1, baseball$name2 )   # use the combined names as row labels
```
```  # Selecting rows

baseball[ c(1,3,100), ]                              # select rows by position
baseball[c('Al Newman','Eddie Murray'),]             # select rows by row name
rownames(baseball)
baseball[ grep("^Al", rownames(baseball)) ,]         # row names that begin with "Al"
baseball[ grep(" Mc| Mac", rownames(baseball)) ,]    # row names containing " Mc" or " Mac"

##
##  looking at team86
##

# '$' must be written without a backslash: '\$' is not valid R.
table( baseball$team86 )        # refer to variable as 'data.frame'$'variable name'

# The rest of this script refers to the variables by bare name, so it
# relies on this attach(). attach() works on a copy: later changes to
# 'baseball' itself are not seen through the attached names.
attach(baseball)                # make variables directly available
table(team86)                   # categorical variable (or 'factor' in R)
table(posit86)
```
```  table(atbat86)                  # numerical variable with many values
table(homer86)                  # numerical variable with fewer values
cbind(table(homer86))           # form into a column

# atotal() is applied here to a one-way table of counts
atotal( table(team86) )         # add totals ('atotal' from 'fun.R')
```
```  table( sal87 )                  # ridiculous with 'continuous' variable

table( team86, league86 )              # 2-way table
atotal( table( team86 , league86 ))    # the same 2-way table passed through atotal()

table( league86 , league87 )           # league86 cross-tabulated against league87

table( league86 , league87 , div86)    # 3-way table
atotal ( table( league86 , league87 , div86)   )

# ftable() prints a multi-way table as a single flat layout
ftable(    table( league86 , league87 , div86) )          # 'flat' 3-way table
ftable(   atotal( table( league86 , league87 , div86)) )

##
##  Frequency distributions and histograms
##
```
```  table( homer86 )                         # Frequency
sum( table (homer86 ) )                  # Total N
table( homer86 ) / sum(table(homer86))    # Relative frequency
acond( table(homer86))                    # same [acond from fun.R]
100 * acond( table(homer86))              # Percentage

# creating a table out of columns
```
```  z <- cbind( atotal(table( homer86 )), acond(table(homer86)),
100*acond(table(homer86)))
# (z is only assigned here, not printed, and its columns are not named)
```
```  # better way

# store the table once, then give cbind() named arguments so that the
# columns get readable headings
zz <- table(homer86)
zz
z <- cbind( Frequency = atotal(zz), "Relative Frequency" = acond(zz),
Percentage=100*acond(zz))
z
```

```  ## More challenging:
# If you're going to do this often, write a function

# mytab: frequency table for a single variable.
#   x : the values to tabulate (handed to table()).
# Returns the cbind() of three columns built from table(x):
#   "Frequency"  : atotal() of the counts
#   "Rel. Freq." : acond() of the counts
#   "Pct"        : 100 times acond() of the counts
# atotal() and acond() are helpers from fun.R.
mytab <- function(x) {
  counts <- table(x)
  freq_col <- atotal(counts)
  rel_col <- acond(counts)
  pct_col <- 100 * acond(counts)
  cbind(Frequency = freq_col, "Rel. Freq." = rel_col, Pct = pct_col)
}
mytab                  # a function's name without parentheses prints its definition

mytab ( homer86 )
mytab ( rbi86)
mytab ( name1 )

zz <- mytab( name1 )

# reorder the rows from largest to smallest value in the "Frequency" column
zz[ rev( order (zz [ ,"Frequency"])), ]

round( mytab( homer86 ), 2)      # round to 2 decimal places
signif( mytab( homer86 ), 1)     # keep 1 significant digit

?signif
apply( mytab( homer86 ), 2, signif, 3)   # apply signif to each column with additional argument '3'

##
##  working with continuous data
##

atotal ( table ( sal87 ) )                # something's missing
atotal ( table( sal87, exclude = NULL) )  # include NAs in table
```

```  # creating categories with 'cut'

# class boundaries every 100 from 0 to 2500
cut ( sal87, breaks = seq( 0  , 2500, 100))
sal87.cat <- cut( sal87, breaks = seq( 0  , 2500, 100))
sal87.cat
atotal( table( sal87.cat ))

# let cut split data into 6 intervals of equal width

sal87.cat1 <- cut (sal87, breaks = 6 )
sal87.cat1
atotal( table( sal87.cat1 ))

# classes of unequal width, boundaries listed explicitly;
# dig.lab sets how many digits are used when formatting the boundaries
# in the automatically generated labels
sal87.cat2 <- cut( sal87,
breaks = c(0,100,200,300,600,1000, 1500, 2000, 2500),
dig.lab = 5)
sal87.cat2
atotal( table( sal87.cat2 ))

# -Inf and Inf make the first and last classes open-ended;
# 'labels' supplies one hand-written name for each of the 6 intervals
sal87.cat3 <- cut( sal87,
breaks = c(-Inf,100,200,300,600,1000,Inf),
labels = c("under 100","100-200","200-300",
"300-600","600-1,000","over 1,000")
)
sal87.cat3
atotal( table( sal87.cat3 ) )
```
```  # to include NA as a category

# Same breaks and labels as sal87.cat3, with the result passed through
# na.include().
# NOTE(review): na.include() is not defined in this script; presumably it
# comes from fun.R or from a package -- confirm.
sal87.cat4 <- na.include(cut( sal87,
breaks = c(-Inf,100,200,300,600,1000,Inf),
labels = c("under 100","100-200","200-300",
"300-600","600-1,000","over 1,000")
))
sal87.cat4
atotal( table( sal87.cat4 ) )
```
```  #
#  Grouping data by classes:
#  Principles

#  1. Mutually exclusive classes: each data value in only one class
#         default for 'cut' is to exclude left end, include right end
#  2. All-inclusive classes: all data in a class ... but what about NAs?
#  3. Width of classes: equal width best unless there are reasons to do otherwise
#  4. Number of classes:
#           Rule of thumb: 5 to 20
#           Sturges rule: 1 + 3.322 * log10(n)
#  5. Class width: (range = max - min) / (number of classes)
#  6. Class boundaries  (decide re right end, left end)
#  7. Count number in each class
#

#
#  NOTE: Best is to have boundaries that are meaningful. 95% of the purpose of
#        data analysis is to communicate. Messy boundaries and class names
#        get in the way of that.
#

length( sal87 )  # number of observations
```
```  length( unique( sal87 ))   # number of data values

# NOTE(review): n = 151 is hard-coded in the next two statements;
# presumably it was read off one of the length() results above --
# confirm which one is intended.
1 + 3.322 * log10( 151 )    # Sturges rule
```
```  sal87.cats <- cut( sal87, breaks = round( 1 + 3.322 * log10( 151 )),dig.lab=5)
atotal ( table ( sal87.cats ) )

##
##  Cumulative frequency
##

# small example: cumsum() returns the running total of a vector
x <- c( 2, 3, 6, 2, 10, 1)
x
cumsum(x)   # cumulative sum

homer86
table( homer86 )
cumsum (table( homer86 ))     # running total of the frequencies
```
```  z <- cbind( Freq = table(homer86),
Cum.Freq = cumsum( table(homer86)))
z

# frequency, relative frequency and their cumulative versions side by side
# (this z is assigned but not printed)
z <- cbind(
Freq = table( homer86 ),
Rel.Freq = table( homer86 ) / sum( table( homer86 ) ),
Cum.Freq = cumsum( table(homer86) ),
Rel.Cum.Freq = cumsum ( table(homer86) ) / sum( table ( homer86))
)

## write a function

# mytab2: frequency table with cumulative columns for a single variable.
#   x : the values to tabulate (handed to table()).
# Returns the cbind() of four columns, one row per value counted by
# table(x):
#   Freq         : count of each value
#   Rel.Freq     : count divided by the total count
#   Cum.Freq     : running total of the counts
#   Rel.Cum.Freq : running total divided by the total count
mytab2 <- function(x) {
  counts <- table(x)
  n_total <- sum(counts)
  running <- cumsum(counts)
  cbind(
    Freq = counts,
    Rel.Freq = counts / n_total,
    Cum.Freq = running,
    Rel.Cum.Freq = running / n_total
  )
}

mytab2( homer86 )

mytab2( hits86 )

# group homer86 into 6 intervals with cut() first, then tabulate the groups
mytab2( cut( homer86, breaks = 6 ))

###
###  Histograms
###

help.search("histogram")      # search the help pages of installed packages
RSiteSearch("histogram")      # search on-line (needs an internet connection)
```
```  hist( sal87 )     # basic histogram
?hist

##
##  Histograms easy if intervals (bins) of equal width
##       The vertical axis can show
##            frequency
##         OR density = relative frequency / width
##       The only difference is the labelling of the y-axis
##
##  If 'bins' are NOT of equal width, it's important to use density

x <- c( 0,0,0,0,0,1,1,1,1,1,1,2,2,2,3,3,4,5,5,6)
table( x)
hist(x)
# one bin of width 1 around each integer value from 0 to 6
hist(x, breaks = seq(-.5,6.5,by=1))
# Frequencies easy to understand and ok if equal intervals
# same bins, except that the last one (3.5 to 6.5) is three times as wide
hist(x, breaks = c( -.5, .5, 1.5, 2.5, 3.5, 6.5))
# What is hist doing that's really smart ?

## Manual exercise for quiz
## Do histogram as above with unequal width bins

hist( sal87 )

# Exercise: find better breaks for sal87

## Fancier graphics: use 'lattice' package

library( lattice )
histogram( ~sal87, baseball )     # uses formula and data.frame - no need to attach
histogram( ~sal87 | league87, baseball )              # conditioned on league87
histogram( ~sal87 | league86 + div86, baseball )      # conditioned on league86 and div86
histogram( ~sal87 | league86 , baseball, groups = div86 )

# the same conditioning, now with explicit breaks of unequal width
histogram( ~sal87 | league86 + div86, baseball , breaks = c(0,100,200,500,1000,10000))
```
```  ##  BAD BAD BAD: Shows why you need density, (adjusted for width) NOT Percentage
##               unless widths are equal
##  SO: don't specify 'breaks' for 'histogram'

```
```  ?histogram
```
```  names(baseball)
help.search('histogram')     # search the help pages of installed packages
help.start()                 # open the HTML help system
```
```  library( Hmisc )

# Back-to-back histograms of sal87, split by league87.
# TRUE is spelled out: T is an ordinary variable that can be reassigned.
histbackback( split( sal87, league87 ), prob = TRUE)

```
```  graphics.off()      # close all open graphics devices
td(new=TRUE)          # from fun.R: this sets graphics with a white background and history

##
## Cumulative relative frequency ogive
##    = 'empirical distribution function'
##
##

x <- c( 2, 1.1, 1, 3, 3.2, 3.4, 4, 5)
# cumulative relative frequency: 1/n, 2/n, ..., n/n
crf <-  (1:length(x) ) / length (x)

# plot the sorted values against their cumulative relative frequency
plot(sort(x), crf)
```
```  plot( sort(x), crf, type = 'b', ylim = c(0,1))

# NOTE(review): xlab = x passes the data vector itself as the axis label;
# presumably a character label such as "x" was intended -- confirm.
plot( sort(x), crf, type = 'b', ylim = c(0,1), xlab = x)
```
```  ogive <- function( x , xlab = deparse(substitute(x)), ...) {
  # Plot the cumulative relative frequency ogive of x.
  #   x    : values to plot; missing values are removed first.
  #   xlab : x-axis label; defaults to the expression supplied for x.
  #   ...  : further arguments passed on to plot().
  # Draws a line through the sorted values with a solid point on each one.
  # Stops with an error when x has no non-missing values.
  xs <- sort(x[!is.na(x)])     # sorted non-missing values
  if (length(xs) == 0L) {
    stop("ogive: 'x' has no non-missing values to plot", call. = FALSE)
  }
  # seq_along() instead of 1:length(xs), which counts 1, 0 for empty input
  crf <- seq_along(xs) / length(xs)
  plot(xs, crf, ylim = c(0, 1), xlab = xlab,
       ylab = "Cumulative Relative Frequency", type = 'l', ...)
  points(xs, crf, pch = 16)
}

ogive( sal87 )     # the x-axis label defaults to "sal87"

ogive( hits86 )    # the x-axis label defaults to "hits86"

#
#  Personally, I prefer the 'quantile plot' which is a 'flipped' ogive
#  I think of it as 'classroom photo' with students lined up shortest to tallest
#

# myqplot: quantile plot of x (the ogive with its axes exchanged).
#   x    : values to plot; missing values are removed first.
#   ylab : y-axis label; defaults to the expression supplied for x.
#   ...  : further arguments passed on to plot().
# Plots the sorted values (vertical axis) against their cumulative
# relative frequency (horizontal axis) as points.
# Stops with an error when x has no non-missing values.
myqplot <- function( x , ylab = deparse(substitute(x)), ...) {
  xs <- sort(x[!is.na(x)])     # sorted non-missing values
  if (length(xs) == 0L) {
    stop("myqplot: 'x' has no non-missing values to plot", call. = FALSE)
  }
  # seq_along() instead of 1:length(xs), which counts 1, 0 for empty input
  crf <- seq_along(xs) / length(xs)
  plot(crf, xs, xlim = c(0, 1), ylab = ylab,
       xlab = "Cumulative Relative Frequency", type = 'p', ...)
}

myqplot( sal87 )   # the y-axis label defaults to "sal87"

##
##  Bar chart (a histogram for categories)
##
```
```  help.search('barplot')   # original graphics
?barplot

barplot( table( team86 ))
barplot( table( team86 ), col = c('red','blue'))
rainbow(length( table(team86)))        # one colour for each bar
barplot( table( team86 ), col = rainbow(length( table(team86))))

help.search('barchart')   # lattice graphics

library(lattice)
?barchart

# TRUE is spelled out below: T is an ordinary variable that can be reassigned.
barchart( table(team86))
barchart( table(team86))
barchart( table(team86, div86))
barchart( table(team86, div86), auto.key = TRUE)
barchart( table(team86, div86, league86), auto.key = TRUE)

data (Titanic)
Titanic
ftable(atotal( Titanic ))
barchart( Titanic, auto.key = TRUE )

##
##  Pie chart
##
```
```  ?pie
pie(table(team86))     # slices drawn from the counts in table(team86)

##
##  scatterplots and line charts
##

# '$' must be written without a backslash: '\$' is not valid R.
plot( baseball$hits86, baseball$sal87)       # original graphics
```
```  identify( baseball$hits86, baseball$sal87, labels = baseball$name2)   # click on points to label them with name2
```

```  # lattice graphics

xyplot( sal87 ~ hits86, baseball )

# one plotting symbol per posit86 value, with an automatic legend
# (TRUE is spelled out: T is an ordinary variable that can be reassigned)
xyplot( sal87 ~ hits86, baseball, groups = posit86, auto.key = TRUE)

```
```  xyplot( sal87 ~ hits86, baseball, groups = posit86, auto.key = list(columns = 5))
```

```  xyplot( sal87 ~ hits86 | posit86, baseball)
```

```  ##
##
##
```