Statistics: Sources of data

From MathWiki

Please contribute links to repositories and other sources of data for statistical analysis.

Combining Gapminder data sets in R

###
### Extracting data from Gapminder files:
### from http://www.gapminder.org/data/
### First download Gapminder variables as Excel files into
### a sub-directory called 'Gapminderdata'. Then:

library(gdata)

# Collect only Excel workbooks: anything else that happens to live in the
# download directory (notes, CSV exports, ...) would make read.xls() fail.
gmdxls <- list.files("Gapminderdata", pattern = "\\.xlsx?$",
                     full.names = TRUE, ignore.case = TRUE)

# Fail early with a clear message instead of an obscure subscript error
# further down when the directory is missing or empty.
if (length(gmdxls) == 0) {
  stop("No Excel files found in 'Gapminderdata'", call. = FALSE)
}

# One data frame per downloaded Gapminder variable
ddins <- lapply(gmdxls, read.xls)

# Flip each data set
head(ddins[[1]])

rshape <- function(x) {
  # Turn one wide Gapminder table into long (longitudinal) form.
  #
  # x: data frame whose first column identifies the country and whose
  #    remaining columns each hold one year; the name of the first column
  #    is used as the name of the variable being measured.
  # Returns a data frame with columns Country, Year and one value column
  # named after the first column of x, one row per country-year pair.
  var_name <- names(x)[1]
  year_labels <- names(x)[-1]
  n_years <- ncol(x) - 1

  # Transposing puts countries in columns, so flattening (column-major)
  # yields every year of the first country, then the second, and so on --
  # the same ordering as the Country and Year vectors built below.
  values <- c(t(x[, -1]))

  long <- data.frame(
    Country = rep(x[[1]], each = n_years),
    # Strip the leading "X" from the column names before converting
    Year = as.numeric(sub("^X", "", rep(year_labels, nrow(x)))),
    Var = values
  )
  names(long)[3] <- var_name
  long
}

# Reshape every wide table to long form with rshape()
dds <- lapply(ddins, rshape)
lapply(dds, head)

# Merge into a single longitudinal file

# Join explicitly on the Country/Year keys. Without `by`, merge() joins on
# every column name the two tables share, so two files whose variable column
# happened to have the same name would also be matched on the values.
# all = TRUE (spelled out, since T can be reassigned) keeps country-years
# that appear in only some of the files.
dm <- Reduce(
  function(acc, nxt) merge(acc, nxt, by = c("Country", "Year"), all = TRUE),
  dds
)
dim(dm)
head(dm)

# Get rid of rows in which all variables (except Country and Year) are missing

# Columns holding measured variables: everything except the keys and the
# helper column itself (excluded so this section can be re-run safely).
value_cols <- setdiff(names(dm), c("Country", "Year", "nmis"))

# Number of missing variables per row. rowSums(is.na()) works column-wise on
# the data frame and avoids apply() coercing every column to character.
dm$nmis <- rowSums(is.na(dm[value_cols]))

# Keep rows with at least one observed variable. The threshold is the number
# of variable columns; comparing against ncol(dm) - 2 after nmis has been
# added is one too large and lets all-missing rows through.
dm <- dm[dm$nmis < length(value_cols), , drop = FALSE]
dm <- droplevels(dm)
dim(dm)
head(dm)

# Look at missing value structure

# NOTE(review): tablemissing() is presumably provided by the vmv package
# attached here -- confirm the package is installed and still available.
library(vmv)
# dm still carries the helper column nmis added above, so that column is
# passed to tablemissing() along with the data columns.
tablemissing(dm)