Datasets from Graham's missing data book (2012).

data(data.graham.ex3)
data(data.graham.ex6)
data(data.graham.ex8a)
data(data.graham.ex8b)
data(data.graham.ex8c)

Format

  • Dataset data.graham.ex3:

    'data.frame': 2756 obs. of 20 variables:
    $ school : int 1 1 1 1 1 1 1 1 1 1 ...
    $ alc7 : int 1 1 1 7 3 6 1 5 4 3 ...
    $ rskreb71: int 1 3 1 2 1 NA 1 2 1 2 ...
    $ rskreb72: int NA NA NA NA NA NA NA 3 2 3 ...
    $ rskreb73: int NA NA NA NA NA NA NA 2 1 2 ...
    $ rskreb74: int NA NA NA NA NA NA NA 3 2 4 ...
    $ likepa71: int 4 2 3 3 2 NA 1 4 3 3 ...
    $ likepa72: int 5 2 4 2 2 NA 5 3 3 2 ...
    $ likepa73: int 4 1 3 3 2 NA 1 3 2 3 ...
    $ likepa74: int 5 3 1 5 4 4 3 4 3 2 ...
    $ likepa75: int 4 4 4 4 3 3 4 4 3 3 ...
    $ posatt71: int 1 1 1 1 1 2 1 NA NA NA ...
    $ posatt72: int 1 2 1 1 1 2 4 NA NA NA ...
    $ posatt73: int 1 1 1 1 1 2 1 NA NA NA ...
    $ alc8 : int 1 8 4 8 5 7 1 3 5 3 ...
    $ rskreb81: int 1 4 1 2 2 3 2 3 1 4 ...
    $ rskreb82: int NA NA NA NA NA NA NA 3 1 4 ...
    $ rskreb83: int NA NA NA NA NA NA NA 2 1 2 ...
    $ rskreb84: int NA NA NA NA NA NA NA 3 2 4 ...
    $ alc9 : int 3 NA 7 NA 5 7 NA 6 6 7 ...

  • Dataset data.graham.ex6:

    'data.frame': 2756 obs. of 9 variables:
    $ school : int 1 1 1 1 1 1 1 1 1 1 ...
    $ program : int 0 0 0 0 0 0 0 0 0 0 ...
    $ alc7 : int 1 1 1 7 3 6 1 5 4 3 ...
    $ riskreb7: int 1 3 1 2 1 NA 1 2 1 2 ...
    $ likepar7: int 4 2 3 3 2 NA 1 4 3 3 ...
    $ posatt7 : int 1 1 1 1 1 2 1 NA NA NA ...
    $ alc8 : int 1 8 4 8 5 7 1 3 5 3 ...
    $ riskreb8: int 1 4 1 2 2 3 2 3 1 4 ...
    $ alc9 : int 3 NA 7 NA 5 7 NA 6 6 7 ...

  • Dataset data.graham.ex8a:

    'data.frame': 1023 obs. of 20 variables:
    $ skill1 : int 28 29 27 29 29 NA NA NA 29 NA ...
    $ skill2 : int NA NA 29 29 NA NA NA NA NA 21 ...
    $ skill3 : int NA NA 29 29 29 NA 28 10 29 25 ...
    $ skill4 : int NA 29 25 29 29 28 29 NA NA NA ...
    $ skill5 : int 29 29 28 28 29 NA 29 10 NA 25 ...
    $ iplanV1: int 14 18 15 17 16 NA NA NA 18 NA ...
    $ iplanV2: int NA NA 17 16 NA NA NA NA NA 16 ...
    $ iplanV3: int NA NA 16 18 18 NA 17 1 18 16 ...
    $ iplanV4: int NA 18 14 18 14 6 18 NA NA NA ...
    $ iplanV5: int 13 18 12 18 18 NA 18 3 NA 5 ...
    $ planA1 : int 1 0 2 8 3 NA NA NA 7 NA ...
    $ planA2 : int NA NA 0 4 NA NA NA NA NA 6 ...
    $ planA3 : int NA NA 1 4 7 NA 2 0 1 7 ...
    $ planA4 : int NA 8 0 4 6 0 0 NA NA NA ...
    $ planA5 : int 0 7 1 5 7 NA 2 0 NA 6 ...
    $ planV1 : int NA NA NA NA NA NA NA NA NA NA ...
    $ planV2 : int NA NA NA NA NA NA NA NA NA 1 ...
    $ planV3 : int NA NA 1 NA NA NA NA 0 NA 1 ...
    $ planV4 : int NA NA NA NA 2 NA NA NA NA NA ...
    $ planV5 : int 2 NA 2 NA NA NA NA 0 NA NA ...

  • Dataset data.graham.ex8b:

    'data.frame': 2570 obs. of 6 variables:
    $ rskreb71: int 1 3 1 2 1 NA 1 2 1 2 ...
    $ rskreb72: int NA NA NA NA NA NA NA 3 2 3 ...
    $ posatt71: int 1 1 1 1 1 2 1 NA NA NA ...
    $ posatt72: int 1 2 1 1 1 2 4 NA NA NA ...
    $ posatt73: int 1 1 1 1 1 2 1 NA NA NA ...
    $ posatt : int 3 4 3 3 3 6 6 NA NA NA ...

  • Dataset data.graham.ex8c:

    'data.frame': 2756 obs. of 16 variables:
    $ s1 : int 1 1 1 1 1 1 1 1 1 1 ...
    $ s2 : int 0 0 0 0 0 0 0 0 0 0 ...
    $ s3 : int 0 0 0 0 0 0 0 0 0 0 ...
    $ s4 : int 0 0 0 0 0 0 0 0 0 0 ...
    $ s5 : int 0 0 0 0 0 0 0 0 0 0 ...
    $ s6 : int 0 0 0 0 0 0 0 0 0 0 ...
    $ s7 : int 0 0 0 0 0 0 0 0 0 0 ...
    $ s8 : int 0 0 0 0 0 0 0 0 0 0 ...
    $ s9 : int 0 0 0 0 0 0 0 0 0 0 ...
    $ s10 : int 0 0 0 0 0 0 0 0 0 0 ...
    $ s11 : int 0 0 0 0 0 0 0 0 0 0 ...
    $ xalc7 : int 1 1 1 7 3 6 1 5 4 3 ...
    $ rskreb72: int NA NA NA NA NA NA NA 3 2 3 ...
    $ likepa71: int 4 2 3 3 2 NA 1 4 3 3 ...
    $ posatt71: int 1 1 1 1 1 2 1 NA NA NA ...
    $ alc8 : int 1 8 4 8 5 7 1 3 5 3 ...

Source

The datasets were downloaded from http://methodology.psu.edu/pubs/books/missing.

References

Graham, J. W. (2012). Missing data. New York: Springer. doi: 10.1007/978-1-4614-4018-5

Examples

if (FALSE) {
# Worked examples for the Graham (2012) datasets. Everything is wrapped in
# if (FALSE) so that it is never executed automatically; run it interactively.
# Besides the packages attached below, the code calls functions from
# miceadds, psych and sirt through the `::` operator.
library(mitools)
library(mice)
library(Amelia)
library(jomo)

#############################################################################
# EXAMPLE 1: data.graham.ex8a | Imputation under multivariate normal model
#############################################################################

data(data.graham.ex8a)
dat <- data.graham.ex8a
# keep only the first ten variables (skill1-skill5 and iplanV1-iplanV5)
dat <- dat[, 1:10]
vars <- colnames(dat)
V <- length(vars)
# remove persons with completely missing data
dat <- dat[rowMeans(is.na(dat)) < 1, ]
summary(dat)

# some descriptive statistics
psych::describe(dat)

#**************
# imputation under a multivariate normal model
M <- 7  # number of imputations, shared by all three packages below

#--------- mice package
# define imputation method: one "norm" entry per variable, named by variable
impM <- rep("norm", V)
names(impM) <- vars
# mice imputation
imp1a <- mice::mice(dat, method = impM, m = M, maxit = 4)
summary(imp1a)
# convert into a list of datasets
datlist1a <- miceadds::mids2datlist(imp1a)

#--------- Amelia package
imp1b <- Amelia::amelia(dat, m = M)
summary(imp1b)
datlist1b <- imp1b$imputations

#--------- jomo package
imp1c <- jomo::jomo1con(Y = dat, nburn = 100, nbetween = 10, nimp = M)
str(imp1c)
# convert into a list of datasets
datlist1c <- miceadds::jomo2datlist(imp1c)

# alternatively one can use the jomo wrapper function
imp1c1 <- jomo::jomo(Y = dat, nburn = 100, nbetween = 10, nimp = M)

#############################################################################
# EXAMPLE 2: data.graham.ex8b | Imputation with categorical variables
#############################################################################

data(data.graham.ex8b)
dat <- data.graham.ex8b

# descriptive statistics
psych::describe(dat)

#*******************************
# imputation in mice using predictive mean matching
imp1a <- mice::mice(dat, m = 5, maxit = 10)
datlist1a <- mitools::imputationList(miceadds::mids2datlist(imp1a))
print(datlist1a)

#*******************************
# imputation in jomo treating all variables as categorical

# Note that variables must have values from 1 to N
# use categorize function from sirt package here
dat.categ <- sirt::categorize(dat, categorical = colnames(dat), lowest = 1)
dat0 <- dat.categ$data

# number of categories per variable: because the recoded values start at 1,
# the column maximum equals the number of categories. vapply() works column
# by column and avoids the matrix coercion that apply() does on a data frame.
Y_numcat <- vapply(dat0, max, numeric(1), na.rm = TRUE)
imp1b <- jomo::jomo1cat(
  Y.cat = dat0, Y.numcat = Y_numcat, nburn = 100, nbetween = 10, nimp = 5
)

# recode original categories
datlist1b <- sirt::decategorize(imp1b, categ_design = dat.categ$categ_design)
# convert into a list of datasets
datlist1b <- miceadds::jomo2datlist(datlist1b)
datlist1b <- mitools::imputationList(datlist1b)

# Alternatively, jomo can be used but categorical variables must be
# declared as factors
dat <- dat0
# define two variables as factors
fac_vars <- miceadds::scan.vec(" rskreb71 rskreb72")
dat[fac_vars] <- lapply(dat[fac_vars], as.factor)
# use jomo
imp1b1 <- jomo::jomo(Y = dat, nburn = 30, nbetween = 10, nimp = 5)

#****************************
# compare frequency tables for both imputation packages

# relative frequencies of the observed values of `variable`
fun_prop <- function(variable) {
  prop.table(table(variable))
}

# variable rskreb71
res1a <- with(datlist1a, fun_prop(rskreb71))
res1b <- with(datlist1b, fun_prop(rskreb71))
summary(miceadds::NMIcombine(qhat = res1a, NMI = FALSE))
summary(miceadds::NMIcombine(qhat = res1b, NMI = FALSE))

# variable posatt
res2a <- with(datlist1a, fun_prop(posatt))
res2b <- with(datlist1b, fun_prop(posatt))
summary(miceadds::NMIcombine(qhat = res2a, NMI = FALSE))
summary(miceadds::NMIcombine(qhat = res2b, NMI = FALSE))
}