###### Applied Linear Regression I: scatter plot ----

## Install the R package "alr3" first (only when it is not already available).
## See also http://users.stat.umn.edu/~sandy/alr3ed/website/R.html
if (!requireNamespace("alr3", quietly = TRUE)) {
  install.packages("alr3", dependencies = TRUE)
}

# Load R package "alr3"; the `heights` data used below is expected to come
# from it.
library(alr3)
# alrWeb("primer")

# Obtain scripts of R commands at
# http://users.stat.umn.edu/~sandy/alr3ed/website/Links/Rscripts.zip
# alrWeb(script = "chapter1")

# Scatter plot of Dheight (Y axis) vs. Mheight (X axis).
with(
  heights,
  plot(Mheight, Dheight,
       xlim = c(55, 75), ylim = c(55, 75), pch = 21, cex = 0.7)
)

# Simulate random normal vectors.
n <- 1000
x <- rnorm(n, mean = 62, sd = 4)
y <- rnorm(n, mean = 63, sd = 3)
plot(x, y)

# z is a weighted combination of x and y, so it is correlated with x.
z <- x * 0.3 + y * 0.7
plot(x, z)

# Use the "jittering" trick to check if there is any "overplotting" problem,
# that is, "many points at exactly the same location".
# Uniform noise on [-0.5, 0.5] is added to the Y values only.
with(
  heights,
  plot(Mheight, Dheight + runif(length(Dheight), -0.5, 0.5),
       xlim = c(55, 75), ylim = c(55, 75), pch = 21, cex = 0.7)
)

# Divide the graphical area into 1 * 2 parts to compare the raw and the
# jittered plot side by side; par() returns the previous settings so they
# can be restored afterwards.
old_par <- par(mfrow = c(1, 2))
with(
  heights,
  plot(Mheight, Dheight,
       xlim = c(55, 75), ylim = c(55, 75), pch = 21, cex = 0.7)
)
with(
  heights,
  plot(Mheight, Dheight + runif(length(Dheight), -0.5, 0.5),
       xlim = c(55, 75), ylim = c(55, 75), pch = 21, cex = 0.7)
)
par(old_par)


#### Applied Linear Regression II: missing data ----
### See also http://www.statmethods.net/input/missingdata.html
### or http://www.ats.ucla.edu/stat/r/faq/missing.htm with more examples
### or http://www.stat.columbia.edu/~gelman/arm/missing.pdf for more readings

## Read data with missing values into R.
# With the default separator (any whitespace) this call does not work.
# NOTE(review): presumably empty tab-delimited fields are not recognised as
# missing under whitespace splitting -- confirm against xy5.txt.
# It is wrapped in tryCatch() so the failure is reported without aborting
# the rest of the script.
xy5 <- tryCatch(
  read.table("xy5.txt", header = TRUE),
  error = function(e) {
    message("read.table() with default 'sep' failed: ", conditionMessage(e))
    NULL
  }
)
xy5 <- read.table("xy5.txt", sep = "\t", header = TRUE)  # separated by TABS

sum(is.na(xy5))      # how many "NA" in total
rowSums(is.na(xy5))  # how many "NA" in each row
colSums(is.na(xy5))  # how many "NA" in each column

## Check if there is any missing value.
y <- c(1, 2, 3, NA)
is.na(y)
sum(is.na(y))  # number of missing values

# List rows of data that have missing values.
# Simulated data set: 20 uniform draws arranged (column-wise) into a
# 5 by 4 matrix.
x <- matrix(runif(5 * 4, -1, 1), nrow = 5, ncol = 4)
x[2, 3] <- NA  # two missing values
x[3, 4] <- NA
x
# drop = FALSE keeps the result a matrix even if only one row is incomplete.
x[!complete.cases(x), , drop = FALSE]

## Ignore missing values.
mean(y)                # NA, because the missing value propagates
mean(y, na.rm = TRUE)  # ignore missing values when using "mean"

## Create new dataset by removing missing data.
y <- c(3, 5, 2, NA)
y <- na.omit(y)
y

x
x <- na.omit(x)  # drops every row that contains at least one NA
x

xy5
xy5 <- na.omit(xy5)
xy5

## Some R packages relevant to missing value imputation
# Amelia            Amelia II: A Program for Missing Data
#                   (Version: 1.5-4, Published: 2011-08-23)
# ForImp            Imputation of missing values through a forward imputation
#                   algorithm (Version: 1.0, Published: 2011-06-15)
# mi                Missing Data Imputation and Model Checking
#                   (Version: 0.09-14, Published: 2011-04-25)
# missForest        Nonparametric Missing Value Imputation using Random Forest
# mitools           Tools for multiple imputation of missing data
# VIM               Visualization and Imputation of Missing Values
# vmv               Visualization of Missing Values
# arrayImpute       Missing imputation for microarray data
# arrayMissPattern  Exploratory analysis of Missing patterns for microarray
#                   data
# MissingDataGUI    A GUI for Missing Data Exploration

## Some R packages for analysis with missing values
# cat               Analysis of categorical-variable datasets with missing
#                   values
# missMDA           Handling missing values with/in multivariate data analysis
#                   (principal component methods)
# mlmmm             ML estimation under multivariate linear mixed models with
#                   missing values
# monomvn           Estimation for multivariate normal and Student-t data with
#                   monotone missingness
# mvnmle            ML estimation for multivariate normal data with missing
#                   values
# NestedCohort      Survival Analysis for Cohorts with Missing Covariate
#                   Information
# norm              Analysis of multivariate normal datasets with missing
#                   values