# MATH 2565 W 2007 Section M Week 3 R Script for Charts and Histograms

# From MathWiki

  #
  #  MATH 2565 Week 3
  #  Using R for Graphs, Charts and Tables
  #  Chapter 2
  #
  
  # Load the course helper functions ('atotal', 'acond', 'td', ...) that the
  # rest of this script uses, straight from the course web site.
  # NOTE(review): source() on a URL runs whatever code the server returns;
  # only do this with a site you trust.
  fun.url <- "http://www.math.yorku.ca/~georges/R/fun.R"
  source(fun.url)
  
  # If you want to use these functions when you are not connected to the
  # internet, download the file once to a local copy.
  #
  # The copy is saved in your home directory rather than in the file-system
  # root ("/fun.R"), which ordinary users usually cannot write to.  Change
  # 'fun.file' if you would rather keep the file somewhere else.
  fun.file <- file.path(path.expand("~"), "fun.R")
  download.file(fun.url, fun.file)
  
  # then you can load the functions with
  
  source(fun.file)
  
  # See the link to 'fun.R' on the wiki for more information
  
  # Read the course baseball data set (a CSV file on the course wiki) into a
  # data frame.  This needs a network connection.
  baseball <-  read.csv("http://wiki.math.yorku.ca/images/7/73/MATH_2565_Baseball_Data.csv")
  
  # First look at the data: dimensions, column names and the first rows.
  dim(baseball)
  names(baseball)
  head( baseball )   # or baseball[ 1:6, ]
  
  # Label each row with the two name columns pasted together (e.g.
  # 'Al Newman').  Row names must be unique, so first check that no pasted
  # name occurs twice: any(duplicated(...)) should print FALSE.
  baseball$name1
   baseball$name2
   any (duplicated(paste( baseball$name1, baseball$name2 )))
  rownames(baseball) <- paste( baseball$name1, baseball$name2 )
  # Selecting rows
  
  # by position
  baseball[ c(1,3,100), ]
  # by row name
  baseball[c('Al Newman','Eddie Murray'),]
  rownames(baseball)
  # by pattern: row names that start with "Al" ...
  baseball[ grep("^Al", rownames(baseball)) ,]
  # ... and row names that contain " Mc" or " Mac"
  baseball[ grep(" Mc| Mac", rownames(baseball)) ,]
  
  ##
  ##  looking at team86
  ##
  
  # One-way frequency table of a single column of the data frame.
  table( baseball$team86 )        # refer to variable as 'data.frame'$'variable name'
  
  # attach() puts the columns of 'baseball' on the search path so that they
  # can be used by bare name.  Everything below that refers to team86,
  # homer86, sal87, ... without 'baseball$' relies on this call.
  attach(baseball)                # make variables directly available
  table(team86)                   # categorical variable (or 'factor' in R)
  table(posit86)
  table(atbat86)                  # numerical variable with many values
  table(homer86)                  # numerical variable with fewer values
  cbind(table(homer86))           # form into a column
  
  atotal( table(team86) )         # add totals ('atotal' from 'fun.R')
  table( sal87 )                  # ridiculous with 'continuous' variable
  
  
  # Cross-classify two variables: rows are the first argument, columns the
  # second.
  table( team86, league86 )              # 2-way table
  atotal( table( team86 , league86 ))
  
  table( league86 , league87 )
  
  # With three variables the result is printed as one 2-way table per value
  # of the third variable.
  table( league86 , league87 , div86)    # 3-way table
  atotal ( table( league86 , league87 , div86)   )
  
  
  # ftable() prints the same counts as a single flat table.
  ftable(    table( league86 , league87 , div86) )          # 'flat' 3-way table
  ftable(   atotal( table( league86 , league87 , div86)) )
  
  ##
  ##  Frequency distributions and histograms
  ##
  # Build the pieces of a frequency distribution for homer86 one at a time.
  table( homer86 )                         # Frequency
  sum( table (homer86 ) )                  # Total N
  table( homer86 ) / sum(table(homer86))    # Relative frequency
  acond( table(homer86))                    # same [acond from fun.R]
  100 * acond( table(homer86))              # Percentage
  
  # creating a table out of columns
  # (this first version has no column names and is never printed; it is
  #  overwritten by the named version below)
  z <- cbind( atotal(table( homer86 )), acond(table(homer86)),
           100*acond(table(homer86)))
  # better way
  # (tabulate once, keep the result in 'zz', and give each column a name)
  
  zz <- table(homer86)
 zz
  z <- cbind( Frequency = atotal(zz), "Relative Frequency" = acond(zz),
           Percentage=100*acond(zz))
  z


  ## More challenging:
  # If you're going to do this often, write a function
  
  # mytab: frequency summary of the values in 'x'.
  #   x : the values to tabulate (passed to table())
  # Returns the cbind() of three named columns:
  #   "Frequency"  - the counts, passed through atotal()
  #   "Rel. Freq." - acond() of the counts
  #   "Pct"        - 100 times the "Rel. Freq." column
  # 'atotal' and 'acond' are not defined in this script; they come from fun.R.
  mytab <- function( x ) {
    counts   <- table(x)
    freq     <- atotal(counts)
    rel.freq <- acond(counts)
    cbind(Frequency = freq, "Rel. Freq." = rel.freq, Pct = 100 * rel.freq)
  }
  # Typing a function's name without parentheses prints its definition.
  mytab
  
  # Try the function on a few columns.
  mytab ( homer86 )
  mytab ( rbi86)
  mytab ( name1 )
  
  zz <- mytab( name1 )
  
  # Reorder the rows of the summary by the "Frequency" column, largest first
  # (order() gives increasing order, rev() reverses it).
  zz[ rev( order (zz [ ,"Frequency"])), ]
  
  # Tidy the printed numbers: round() keeps 2 decimal places, signif() keeps
  # 1 significant digit.
  round( mytab( homer86 ), 2)
  signif( mytab( homer86 ), 1)
  
  ?signif
  apply( mytab( homer86 ), 2, signif, 3)   # apply signif to each column with additional argument '3'
  
  ##
  ##  working with continuous data
  ##
  
  # table() leaves missing values out unless asked to keep them, so the first
  # total is smaller than the number of rows.
  atotal ( table ( sal87 ) )                # something's missing
  atotal ( table( sal87, exclude = NULL) )  # include NAs in table


  # creating categories with 'cut'
  # (here the break points run from 0 to 2500 in steps of 100)
  
  cut ( sal87, breaks = seq( 0  , 2500, 100))
  sal87.cat <- cut( sal87, breaks = seq( 0  , 2500, 100))
  sal87.cat
  atotal( table( sal87.cat ))
  
  # let cut split data into 6 intervals of equal width
  
  sal87.cat1 <- cut (sal87, breaks = 6 )
  sal87.cat1
  atotal( table( sal87.cat1 ))
  
  
  # Hand-picked break points of unequal width; dig.lab = 5 is passed so the
  # interval labels can show up to 5 digits.
  sal87.cat2 <- cut( sal87,
           breaks = c(0,100,200,300,600,1000, 1500, 2000, 2500),
           dig.lab = 5)
  sal87.cat2
  atotal( table( sal87.cat2 ))
  
  # specifying your own labels
  # (one label per interval: 7 break points give 6 intervals; -Inf and Inf
  #  make the first and last intervals open-ended)
  
  sal87.cat3 <- cut( sal87,
               breaks = c(-Inf,100,200,300,600,1000,Inf),
               labels = c("under 100","100-200","200-300",
                   "300-600","600-1,000","over 1,000")
               )
  sal87.cat3
  atotal( table( sal87.cat3 ) )
  # to include NA as a category
  # NOTE(review): 'na.include' is not defined in this script.  It presumably
  # comes from fun.R; Hmisc is only loaded further down -- confirm where it
  # is expected to come from.
  
  sal87.cat4 <- na.include(cut( sal87,
               breaks = c(-Inf,100,200,300,600,1000,Inf),
               labels = c("under 100","100-200","200-300",
                   "300-600","600-1,000","over 1,000")
               ))
  sal87.cat4
  atotal( table( sal87.cat4 ) )
  #
  #  Grouping data by classes:
  #  Principles
  
  #  1. Mutually exclusive classes: each data value in only one class
  #         default for 'cut' is to exclude left end, include right end
  #  2. All-inclusive classes: all data in a class ... but what about NAs?
  #  3. Width of classes: equal width is best unless there are reasons to do otherwise
  #  4. Number of classes:
  #           Rule of thumb: 5 to 20
  #           Sturges rule: 1 + 3.322 * log10(n)
  #  5. Class width: (range = max - min) / (number of classes)
  #  6. Class boundaries  (decide re right end, left end)
  #  7. Count number in each class
  #
  
  #
  #  NOTE: Best is to have boundaries that are meaningful. 95% of the purpose of
  #        data analysis is to communicate. Messy boundaries and class names
  #        confuse and repel readers.
  #
  
  # Both counts below include NA if sal87 has missing values: length() counts
  # every element and unique() keeps NA as one of the values.
  length( sal87 )  # number of observations
  length( unique( sal87 ))   # number of data values
  
  # NOTE(review): 151 is typed in by hand here and again on the next line.
  # It was presumably read off one of the two counts printed above -- confirm
  # which one, and that it still matches the data.
  1 + 3.322 * log10( 151 )    # Sturges rule
  # Use the rounded Sturges value as the number of equal-width intervals.
  sal87.cats <- cut( sal87, breaks = round( 1 + 3.322 * log10( 151 )),dig.lab=5)
  atotal ( table ( sal87.cats ) )
  
  
  ##
  ##  Cumulative frequency
  ##
  
  # cumsum() returns running totals: element i is the sum of the first i
  # values.
  x <- c( 2, 3, 6, 2, 10, 1)
  x
  cumsum(x)   # cumulative sum
  
  # Applied to a frequency table, the running totals are cumulative
  # frequencies.
  homer86
  table( homer86 )
  cumsum (table( homer86 ))
  z <- cbind( Freq = table(homer86),
               Cum.Freq = cumsum( table(homer86)))
  z
  
  # Four-column version: counts, relative frequencies, cumulative counts and
  # cumulative relative frequencies.  The result is stored in 'z' but not
  # printed here.
  z <- cbind(
            Freq = table( homer86 ),
            Rel.Freq = table( homer86 ) / sum( table( homer86 ) ),
            Cum.Freq = cumsum( table(homer86) ),
            Rel.Cum.Freq = cumsum ( table(homer86) ) / sum( table ( homer86))
  )
  
  ## write a function
  
  # mytab2: frequency table with relative and cumulative columns.
  #   x : the values to tabulate (passed to table())
  # Returns a matrix with one row per distinct value of 'x' and the columns
  #   Freq         - count of each value
  #   Rel.Freq     - count divided by the total count
  #   Cum.Freq     - running total of the counts
  #   Rel.Cum.Freq - running total divided by the total count
  mytab2 <- function( x ) {
    counts  <- table( x )
    total   <- sum( counts )
    running <- cumsum( counts )
    cbind(
      Freq         = counts,
      Rel.Freq     = counts / total,
      Cum.Freq     = running,
      Rel.Cum.Freq = running / total
    )
  }
  
  # Try the function on two count variables ...
  mytab2( homer86 )
  
  mytab2( hits86 )
  
  # ... and on homer86 grouped into 6 equal-width intervals by cut().
  mytab2( cut( homer86, breaks = 6 ))
  
  
  ###
  ###  Histograms
  ###
  
  # Ways of finding help: search the installed documentation, search the R
  # web site, or open the help page for a known function.
  help.search("histogram")
  RSiteSearch("histogram")
  hist( sal87 )     # basic histogram
  ?hist
  
  ##
  ##  Histograms easy if intervals (bins) of equal width
  ##       The vertical axis can show
  ##            frequency
  ##         OR density = frequency / width
  ##       The only difference is the labelling of the y-axis
  ##
  ##  If 'bins' are NOT of equal width, it's important to use density
  
  
  # Small made-up data set of whole numbers from 0 to 6.
  x <- c( 0,0,0,0,0,1,1,1,1,1,1,2,2,2,3,3,4,5,5,6)
  table( x)
  hist(x)
  # Breaks at the half-integers put each whole number in its own bin.
  hist(x, breaks = seq(-.5,6.5,by=1))
       # Frequencies easy to understand and ok if equal intervals
  # Same data, but the last bin (3.5 to 6.5) is three times as wide as the
  # others.  Compare the y-axis with the plot above, and see the 'freq'
  # argument in ?hist.
  hist(x, breaks = c( -.5, .5, 1.5, 2.5, 3.5, 6.5))
       # What is hist doing that's really smart ?
       
       
  ## Manual exercise for quiz
  ## Do histogram as above with unequal width bins
  
  hist( sal87 )
  
  # Exercise: find better breaks for sal87
  
  ## Fancier graphics: use 'lattice' package
  
  # lattice functions take a formula plus the data frame that holds the
  # variables.  Variables after '|' in the formula are conditioning
  # variables.
  library( lattice )
  histogram( ~sal87, baseball )     # uses formula and data.frame - no need to attach
  histogram( ~sal87 | league87, baseball )
  histogram( ~sal87 | league86 + div86, baseball )
  histogram( ~sal87 | league86 , baseball, groups = div86 )
  
  # Deliberately bad example (see the comment that follows): bins of unequal
  # width.
  # NOTE(review): the warning below describes the lattice version in use when
  # this script was written; newer versions may handle unequal breaks
  # differently -- check ?histogram ('type' argument).
  histogram( ~sal87 | league86 + div86, baseball , breaks = c(0,100,200,500,1000,10000))
  ##  BAD BAD BAD: Shows why you need density, (adjusted for width) NOT Percentage
  ##               unless widths are equal
  ##  SO: don't specify 'breaks' for 'histogram'
  
  
  ?histogram
  names(baseball)
  help.search('histogram')
  help.start()
  library( Hmisc )
  
  histbackback( split( sal87, league87 ), prob = T)
  
  # when graphics go bad:
  graphics.off()
  td(new=T)          # from fun.R: this sets graphics with a white background and history
  
  
  ##
  ## Cumulative relative frequency ogive
  ##    = 'empirical distribution function'
  ##
  ##
  
  # Build an ogive by hand: the i-th smallest of the n values is plotted at
  # height i / n.
  x <- c( 2, 1.1, 1, 3, 3.2, 3.4, 4, 5)
  crf <- seq_along(x) / length(x)
  
  plot(sort(x), crf)
  plot( sort(x), crf, type = 'b', ylim = c(0,1))
  
  # Label the x-axis with the variable's name.  The label must be a string:
  # passing the data vector itself (xlab = x) would use the numbers as the
  # label text.
  plot( sort(x), crf, type = 'b', ylim = c(0,1), xlab = "x")
  # ogive: plot the cumulative relative frequency curve of 'x'.
  #   x    : numeric vector; missing values are dropped before plotting
  #   xlab : x-axis label; defaults to the expression supplied for 'x'
  #   ...  : further arguments passed on to plot()
  # Draws the sorted values against i / n (i = 1, ..., n) as a line with a
  # solid point at every observation.  Stops with an error if 'x' has no
  # non-missing values.
  ogive <- function( x , xlab = deparse(substitute(x)), ...) {
    # 'x' itself is never reassigned, so the default 'xlab' still sees the
    # expression the caller wrote.
    xs <- sort( x[ !is.na(x) ] )     # drop missing values, then order
    n <- length(xs)
    if (n == 0) {
      stop("ogive: 'x' has no non-missing values to plot", call. = FALSE)
    }
    crf <- seq_len(n) / n
    plot( xs, crf, ylim = c(0,1), xlab = xlab , ylab = "Cumulative Relative Frequency",
         type = 'l', ...)
    points( xs, crf, pch = 16 )
  }
  
  # Ogives for two of the attached baseball variables; the x-axis label
  # defaults to the variable name.
  ogive( sal87 )
  
  ogive( hits86 )
  
  #
  #  Personally, I prefer the 'quantile plot' which is a 'flipped' ogive
  #  I think of it as 'classroom photo' with students lined up shortest to tallest
  #
  
  # myqplot: quantile plot of 'x' (an ogive with the axes swapped).
  #   x    : numeric vector; missing values are dropped before plotting
  #   ylab : y-axis label; defaults to the expression supplied for 'x'
  #   ...  : further arguments passed on to plot()
  # Plots i / n (i = 1, ..., n) on the x-axis against the sorted values on
  # the y-axis, as points.  Stops with an error if 'x' has no non-missing
  # values.
  myqplot <- function( x , ylab = deparse(substitute(x)), ...) {
    # 'x' itself is never reassigned, so the default 'ylab' still sees the
    # expression the caller wrote.
    xs <- sort( x[ !is.na(x) ] )     # drop missing values, then order
    n <- length(xs)
    if (n == 0) {
      stop("myqplot: 'x' has no non-missing values to plot", call. = FALSE)
    }
    crf <- seq_len(n) / n
    plot( crf, xs, xlim = c(0,1), ylab = ylab , xlab = "Cumulative Relative Frequency",
         type = 'p', ...)
  }
  
  # Quantile plot of sal87.
  myqplot( sal87 )
  
  
  
  ##
  ##  Bar chart (a histogram for categories)
  ##
  help.search('barplot')   # original graphics
  ?barplot
  
  # One bar per team, with bar height equal to the count in table().
  barplot( table( team86 ))
  # Only two colours are given, so they are reused in turn across the bars.
  barplot( table( team86 ), col = c('red','blue'))
  # rainbow(n) returns n colours; here n is the number of bars, so every bar
  # gets its own colour.
  rainbow(length( table(team86)))
  barplot( table( team86 ), col = rainbow(length( table(team86))))
  
  
  help.search('barchart')   # lattice graphics
  
  library(lattice)
  ?barchart
  
  # barchart() accepts a table directly: first a one-way table, then 2-way
  # and 3-way tables.  auto.key = TRUE adds a legend.
  # (TRUE is spelled out: 'T' is an ordinary variable that can be reassigned.)
  barchart( table(team86))
  barchart( table(team86, div86))
  barchart( table(team86, div86), auto.key = TRUE)
  barchart( table(team86, div86, league86), auto.key = TRUE)
  
  # The same tools applied to the built-in 4-way 'Titanic' table.
  data (Titanic)
  Titanic
  ftable(atotal( Titanic ))
  barchart( Titanic, auto.key = TRUE )
  
  
  ##
  ##  Pie chart
  ##
  ?pie
  # One slice per team, sized by the count in table().
  pie(table(team86))
  
  ##
  ##  scatterplots and line charts
  ##
  
  plot( baseball$hits86, baseball$sal87)       # original graphics
  # Interactive: click on points in the plot to label them with 'name2'.
  identify( baseball$hits86, baseball$sal87, labels = baseball$name2)


  # lattice graphics
  
  xyplot( sal87 ~ hits86, baseball )
  
  # One plotting symbol per value of posit86, with a legend.
  # (TRUE is spelled out: 'T' is an ordinary variable that can be reassigned.)
  xyplot( sal87 ~ hits86, baseball, groups = posit86, auto.key = TRUE)
  
  # Same plot, with the legend laid out in 5 columns.
  xyplot( sal87 ~ hits86, baseball, groups = posit86, auto.key = list(columns = 5))


  # One panel per value of posit86 instead of one symbol per value.
  xyplot( sal87 ~ hits86 | posit86, baseball)


  ##
  ##
  ##