This file contains some examples of the functions related to the MacroPCA routine. More specifically, MacroPCA
, MacroPCApredict
and cellMap
will be illustrated.
library("cellWise")
library("gridExtra") # has grid.arrange()
We will first look at a small artificial example.
set.seed(12345) # for reproducibility
n <- 50; d <- 10
A <- matrix(0.9, d, d); diag(A) = 1
round(A,1) # true covariance matrix
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,] 1.0 0.9 0.9 0.9 0.9 0.9 0.9 0.9 0.9 0.9
## [2,] 0.9 1.0 0.9 0.9 0.9 0.9 0.9 0.9 0.9 0.9
## [3,] 0.9 0.9 1.0 0.9 0.9 0.9 0.9 0.9 0.9 0.9
## [4,] 0.9 0.9 0.9 1.0 0.9 0.9 0.9 0.9 0.9 0.9
## [5,] 0.9 0.9 0.9 0.9 1.0 0.9 0.9 0.9 0.9 0.9
## [6,] 0.9 0.9 0.9 0.9 0.9 1.0 0.9 0.9 0.9 0.9
## [7,] 0.9 0.9 0.9 0.9 0.9 0.9 1.0 0.9 0.9 0.9
## [8,] 0.9 0.9 0.9 0.9 0.9 0.9 0.9 1.0 0.9 0.9
## [9,] 0.9 0.9 0.9 0.9 0.9 0.9 0.9 0.9 1.0 0.9
## [10,] 0.9 0.9 0.9 0.9 0.9 0.9 0.9 0.9 0.9 1.0
library(MASS) # only needed for the following line:
x <- mvrnorm(n, rep(0,d), A)
x[sample(1:(n * d), 50, FALSE)] <- NA
x[sample(1:(n * d), 25, FALSE)] <- 10
x[sample(1:(n * d), 25, FALSE)] <- -10
x <- cbind(1:n, x)
# When not specifying MacroPCApars all defaults are used:
MacroPCA.out <- MacroPCA(x, k = 0)
##
## The input data has 50 rows and 11 columns.
##
## The data contained 1 columns that were identical to the case number
## (number of the row).
## Their column names are:
##
## [1] X1
##
## These columns will be ignored in the analysis.
## We continue with the remaining 10 columns:
##
## [1] X2 X3 X4 X5 X6 X7 X8 X9 X10 X11
##
## The final data set we will analyze has 50 rows and 10 columns.
##
##
## Eigenvalues: 5.41553 0.1193142 0.07484633 0.06937444 0.05188761 0.04727873 0.03680524 0.0280979 0.01900759 0.0135016
##
## The cumulative percentage of explained variability by the first 10 components is:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10
## 92.2 94.2 95.5 96.7 97.5 98.3 99.0 99.4 99.8 100.0
##
## Based on explained variability >= 80% one would select k = 1.
##
## Please use this information and the scree plot to select a value of k
## and rerun MacroPCA with it.
MacroPCA.out <- MacroPCA(x, k = 1)
##
## The input data has 50 rows and 11 columns.
##
## The data contained 1 columns that were identical to the case number
## (number of the row).
## Their column names are:
##
## [1] X1
##
## These columns will be ignored in the analysis.
## We continue with the remaining 10 columns:
##
## [1] X2 X3 X4 X5 X6 X7 X8 X9 X10 X11
##
## The final data set we will analyze has 50 rows and 10 columns.
##
##
## Eigenvalues: 5.651754 0.1260469 0.0698702 0.058418 0.05244722 0.04240901 0.03944442 0.03030861 0.01952611 0.01014903
##
## XciSVD$rank, Xcih.SVD$rank, k and kmax: 10 10 1 10
##
## Perform extra reweighting step (k = 1 < rank = 10 )
##
## Final step: use eigenvectors of MCD scatter.
cellMap(D = MacroPCA.out$remX, R = MacroPCA.out$stdResid,
columnlabels = 1:d, rowlabels = 1:n)
# Red cells have higher value than predicted, blue cells lower,
# white cells are missing values, and all other cells are yellow.
The Top Gear data contains information on 297 cars.
library(robustHD)
data(TopGear)
dim(TopGear)
myTopGear = TopGear
rownames(myTopGear) = paste(myTopGear[,1],myTopGear[,2])
# These rownames are make and model of the cars.
rownames(myTopGear)[165] = "Mercedes-Benz G" # name was too long
myTopGear = myTopGear[,-31] # removes subjective variable `Verdict'
# Transform some skewed variables:
transTG = myTopGear
transTG$Price = log(myTopGear$Price)
transTG$Displacement = log(myTopGear$Displacement)
transTG$BHP = log(myTopGear$BHP)
transTG$Torque = log(myTopGear$Torque)
transTG$TopSpeed = log(myTopGear$TopSpeed)
# Check the data:
checkData = checkDataSet(transTG, silent = TRUE)
##
## The final data set we will analyze has 296 rows and 11 columns.
##
# With option silent = FALSE we obtain more information:
#
# The input data has 297 rows and 31 columns.
#
# The input data contained 19 non-numeric columns (variables).
# Their column names are:
#
# [1] Maker Model Type Fuel
# [5] DriveWheel AdaptiveHeadlights AdjustableSteering AlarmSystem
# [9] Automatic Bluetooth ClimateControl CruiseControl
# [13] ElectricSeats Leather ParkingSensors PowerSteering
# [17] SatNav ESP Origin
#
# These columns will be ignored in the analysis.
# We continue with the remaining 12 numeric columns:
#
# [1] Price Cylinders Displacement BHP Torque Acceleration TopSpeed
# [8] MPG Weight Length Width Height
#
# The data contained 1 rows with over 50% of NAs.
# Their row names are:
#
# [1] Citroen C5 Tourer
#
# These rows will be ignored in the analysis.
# We continue with the remaining 296 rows:
#
# [1] Alfa Romeo Giulietta Alfa Romeo MiTo
# .......
# [295] Volvo XC70 Volvo XC90
#
# The data contained 1 columns with zero or tiny median absolute deviation.
# Their column names are:
#
# [1] Cylinders
#
# These columns will be ignored in the analysis.
# We continue with the remaining 11 columns:
#
# [1] Price Displacement BHP Torque Acceleration TopSpeed MPG
# [8] Weight Length Width Height
#
# The final data set we will analyze has 296 rows and 11 columns.
# The remainder of the dataset:
remTG = checkData$remX
dim(remTG)
## [1] 296 11
colSums(is.na(remTG)) # we still have quite a few NA's, especially in Weight:
## Price Displacement BHP Torque Acceleration TopSpeed
## 0 8 3 3 0 3
## MPG Weight Length Width Height
## 11 32 10 15 10
# Check robust scale of the variables:
locscale = estLocScale(remTG)
round(locscale$scale,2)
## Price Displacement BHP Torque Acceleration TopSpeed
## 0.60 0.38 0.59 0.60 3.51 0.17
## MPG Weight Length Width Height
## 17.08 381.02 420.89 91.95 131.82
# The scales are clearly different, so we will standardize before PCA.
# This is the argument scale=TRUE in MacroPCApars below.
# Small option lists (MacroPCA will automatically extend them with the
# default choices for the other options):
DDCpars <- list(fastDDC = FALSE, silent = TRUE)
MacroPCApars <- list(DDCpars = DDCpars, scale = TRUE, silent = TRUE)
# Note that MacroPCA needs DDCpars because it first runs DDC.
# To choose the number k of principal components we can run MacroPCA with k=0:
MacroPCAtransTG0 <- MacroPCA(transTG,k=0,MacroPCApars=MacroPCApars)
##
## The final data set we will analyze has 296 rows and 11 columns.
##
##
## The cumulative percentage of explained variability by the first 10 components is:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10
## 70.3 83.4 90.4 93.3 95.3 96.7 97.8 98.7 99.3 99.7
##
## Based on explained variability >= 80% one would select k = 2.
##
## Please use this information and the scree plot to select a value of k
## and rerun MacroPCA with it.
MacroPCAtransTG <- MacroPCA(transTG,k=2,MacroPCApars=MacroPCApars) # takes about 1 second.
##
## The final data set we will analyze has 296 rows and 11 columns.
##
names(MacroPCAtransTG)
## [1] "MacroPCApars" "remX" "DDC" "scaleX" "k"
## [6] "loadings" "eigenvalues" "center" "alpha" "h"
## [11] "It" "diff" "X.NAimp" "scores" "OD"
## [16] "cutoffOD" "SD" "cutoffSD" "indrows" "residScale"
## [21] "stdResid" "indcells" "NAimp" "Cellimp" "Fullimp"
MacroPCAtransTG$MacroPCApars # these are all the options used, starting with DDCpars:
## $DDCpars
## $DDCpars$fracNA
## [1] 0.5
##
## $DDCpars$numDiscrete
## [1] 3
##
## $DDCpars$precScale
## [1] 1e-12
##
## $DDCpars$cleanNAfirst
## [1] "automatic"
##
## $DDCpars$tolProb
## [1] 0.99
##
## $DDCpars$corrlim
## [1] 0.5
##
## $DDCpars$combinRule
## [1] "wmean"
##
## $DDCpars$returnBigXimp
## [1] FALSE
##
## $DDCpars$silent
## [1] TRUE
##
## $DDCpars$nLocScale
## [1] 25000
##
## $DDCpars$fastDDC
## [1] FALSE
##
## $DDCpars$standType
## [1] "1stepM"
##
## $DDCpars$corrType
## [1] "gkwls"
##
## $DDCpars$transFun
## [1] "wrap"
##
## $DDCpars$nbngbrs
## [1] 100
##
##
## $kmax
## [1] 10
##
## $alpha
## [1] 0.5
##
## $scale
## [1] TRUE
##
## $maxdir
## [1] 250
##
## $distprob
## [1] 0.99
##
## $silent
## [1] TRUE
##
## $maxiter
## [1] 20
##
## $tol
## [1] 0.005
##
## $bigOutput
## [1] TRUE
MacroPCAtransTG$cutoffOD # cutoff value for orthogonal distances OD:
## [1] 1.737869
MacroPCAtransTG$cutoffSD # cutoff value for score distances OD:
## [1] 3.034854
length(MacroPCAtransTG$indrows) # list of flagged rows
## [1] 35
length(MacroPCAtransTG$indcells) # list of flagged cells
## [1] 126
ICPCAtransTG <- ICPCA(remTG,k=2,scale=TRUE,tolProb=0.99)
names(ICPCAtransTG)
## [1] "scaleX" "k" "loadings" "eigenvalues" "center"
## [6] "covmatrix" "It" "diff" "X.NAimp" "scores"
## [11] "OD" "cutoffOD" "SD" "cutoffSD" "indrows"
## [16] "residScale" "stdResid" "indcells"
# Compare imputed values for missings:
MacroPCAtransTG$X.NAimp[c(94,176),8]
## Ford Kuga Mini Coupe
## 1654.907 1125.878
ICPCAtransTG$X.NAimp[c(94,176),8]
## Ford Kuga Mini Coupe
## 1774.286 1001.698
# CAR MAGAZINE: Ford Kuga Kuga 2.0 TDCi weighs 1605kg
# CAR MAGAZINE: MINI Coupe 1.6T Cooper weighs 1165kg
# Make untransformed submatrix of X for labeling the cells
# in the residual plot:
tempTG = myTopGear[checkData$rowInAnalysis,checkData$colInAnalysis]
dim(tempTG)
## [1] 296 11
tempTG$Price = tempTG$Price/1000 # to avoid printing long numbers
rowlabels = rownames(tempTG)
columnlabels = colnames(tempTG)
showrows = c(12,42,50,51,52,59,72,94,98,135,150,164,176,
180,195,196,198,209,210,215,219,234,259,277) # these 24 cars will be shown
# Make the ggplot2 objects for the residual maps by the function cellMap:
ggpICPCA = cellMap(D=tempTG,
R=ICPCAtransTG$stdResid,
indcells=ICPCAtransTG$indcells,
indrows=ICPCAtransTG$indrows,
standOD=ICPCAtransTG$OD/ICPCAtransTG$cutoffOD,
showVals="D",
rowlabels=rowlabels,
columnlabels=columnlabels,
mTitle="ICPCA residual map",
showrows=showrows,
adjustrowlabels=0.5)
plot(ggpICPCA)
ggpMacroPCA = cellMap(D=tempTG,
R=MacroPCAtransTG$stdResid,
indcells=MacroPCAtransTG$indcells,
indrows=MacroPCAtransTG$indrows,
standOD=MacroPCAtransTG$OD/MacroPCAtransTG$cutoffOD,
showVals="D",
rowlabels=rowlabels,
columnlabels=columnlabels,
mTitle="MacroPCA residual map",
showrows=showrows,
adjustrowlabels=0.5)
plot(ggpMacroPCA)
# Creating the combined pdf:
# pdf(file="TopGear_IPCA_MacroPCA_residualMap.pdf",width=20,height=16)
# gridExtra::grid.arrange(ggpICPCA,ggpMacroPCA,ncol=2) # arranges two plots on a page
# dev.copy(pdf,file="TopGear_IPCA_MacroPCA_residualMap.pdf",width=20,height=16)
# dev.off()
### Creating the outlier maps
outlierMap(MacroPCAtransTG,title="MacroPCA outlier map",
labelOut=FALSE)
plotLabs = rep("",nrow(tempTG))
plotLabs[42] = rowlabels[42] # BMW i3
plotLabs[50] = "Bugatti"
plotLabs[52] = rowlabels[52] # Caterham Super 7
plotLabs[59] = rowlabels[59] # Chevrolet Volt
plotLabs[180] = rowlabels[180] # Mitsubishi i-MiEV
plotLabs[195] = rowlabels[195] # Noble M600
plotLabs[196] = rowlabels[196] # Pagani Huayra
plotLabs[219] = rowlabels[219] # Renault Twizy
plotLabs[234] = rowlabels[234] # Ssangyong Rodius
plotLabs[259] = rowlabels[259] # Vauxhall Ampera
textPos = cbind(MacroPCAtransTG$SD,MacroPCAtransTG$OD)
textPos[42,1] = textPos[42,1] -0.5 # BMW i3
textPos[42,2] = textPos[42,2] +0.8 # BMW i3
textPos[50,1] = textPos[50,1] -0.03 # Bugatti Veyron
textPos[50,2] = textPos[50,2] +0.05 # Bugatti Veyron
textPos[52,1] = textPos[52,1] +0.3 # Caterham Super 7
textPos[52,2] = textPos[52,2] -0.5 # Caterham Super 7
textPos[59,1] = textPos[59,1] -0.05 # Chevrolet Volt
textPos[59,2] = textPos[59,2] -0.1 # Chevrolet Volt
textPos[180,1] = textPos[180,1] -1.2 # Mitsubishi i-MiEV
textPos[180,2] = textPos[180,2] +0.6 # Mitsubishi i-MiEV
textPos[195,1] = textPos[195,1] -0.05 # Noble M600
textPos[195,2] = textPos[195,2] +0.35 # Noble M600
textPos[196,1] = textPos[196,1] -0.6 # Pagani Huayra
textPos[196,2] = textPos[196,2] +0.7 # Pagani Huayra
textPos[219,1] = textPos[219,1] -0.03 # Renault Twizy
textPos[234,1] = textPos[234,1] +0.1 # Ssangyong Rodius
textPos[234,2] = textPos[234,2] +0.6 # Ssangyong Rodius
textPos[259,1] = textPos[259,1] -1.35 # Vauxhall Ampera
textPos[259,2] = textPos[259,2] +0.65 # Vauxhall Ampera
text(textPos,plotLabs,cex=0.8,pos=4)
# dev.copy(pdf,"TopGear_MacroPCA_outlierMap.pdf",width=6,height=6)
# dev.off()
outlierMap(ICPCAtransTG,title="ICPCA outlier map",labelOut=FALSE)
plotLabs = rep("",nrow(tempTG))
plotLabs[42] = rowlabels[42] # BMW i3
plotLabs[50] = rowlabels[50] # Bugatti Veyron
plotLabs[52] = rowlabels[52] # Caterham Super 7
plotLabs[59] = rowlabels[59] # Chevrolet Volt
plotLabs[180] = rowlabels[180] # Mitsubishi i-MiEV
plotLabs[195] = rowlabels[195] # Noble M600
plotLabs[196] = rowlabels[196] # Pagani Huayra
plotLabs[219] = rowlabels[219] # Renault Twizy
plotLabs[234] = rowlabels[234] # Ssangyong Rodius
plotLabs[259] = rowlabels[259] # Vauxhall Ampera
textPos = cbind(ICPCAtransTG$SD,ICPCAtransTG$OD)
textPos[50,1] = textPos[50,1] -0.05 # Bugatti Veyron
textPos[50,2] = textPos[50,2] +0.05 # Bugatti Veyron
textPos[52,1] = textPos[52,1] -0.07 # Caterham Super 7
textPos[52,2] = textPos[52,2] -0.37 # Caterham Super 7
textPos[59,1] = textPos[59,1] -0.05 # Chevrolet Volt
textPos[59,2] = textPos[59,2] -0.3 # Chevrolet Volt
textPos[180,1] = textPos[180,1] -1.2 # Mitsubishi i-MiEV
textPos[180,2] = textPos[180,2] +0.23 # Mitsubishi i-MiEV
textPos[195,1] = textPos[195,1] +0.25 # Noble M600
textPos[195,2] = textPos[195,2] +0.2 # Noble M600
textPos[196,1] = textPos[196,1] -0.05 # Pagani Huayra
textPos[196,2] = textPos[196,2] +0.32 # Pagani Huayra
textPos[219,1] = textPos[219,1] -0.8 # Renault Twizy
textPos[219,2] = textPos[219,2] +0.4 # Renault Twizy
textPos[234,1] = textPos[234,1] -0.05 # Ssangyong Rodius
textPos[234,2] = textPos[234,2] +0.2 # Ssangyong Rodius
textPos[259,1] = textPos[259,1] -0.9 # Vauxhall Ampera
textPos[259,2] = textPos[259,2] +0.4 # Vauxhall Ampera
text(textPos,plotLabs, cex=0.8, pos=4)
# dev.copy(pdf,"TopGear_ICPCA_outlierMap.pdf",width=6,height=6)
# dev.off()
# For comparison, remake the residual map of the entire dataset, but now showing
# the values of the residuals instead of the data values:
columnlabels = colnames(remTG)
rowlabels = rownames(remTG)
ggpMacroPCAres = cellMap(D=remTG,
R=MacroPCAtransTG$stdResid,
indcells=MacroPCAtransTG$indcells,
indrows=MacroPCAtransTG$indrows,
standOD=MacroPCAtransTG$NAimp$OD/MacroPCAtransTG$NAimp$cutoffOD,
showVals="R",
rowlabels=rowlabels,
columnlabels=columnlabels,
mTitle="MacroPCA residual map",
showrows=showrows,
adjustrowlabels=0.5)
plot(ggpMacroPCAres)
# Define the "initial" dataset as the rows not in these 24:
initX = remTG[-showrows,]
dim(initX)
## [1] 272 11
# Fit initX:
MacroPCAinitX = MacroPCA(initX,k=2,MacroPCApars=MacroPCApars)
##
## The final data set we will analyze has 271 rows and 11 columns.
##
# Define the "new" data to predict:
newX = remTG[showrows,]
dim(newX)
## [1] 24 11
# Make predictions by MacroPCApredict.
# Its inputs are:
#
# Xnew : the new data (test data), which must be a
# matrix or a data frame.
# It must always be provided.
# InitialMacroPCA : the output of the MacroPCA function on the
# initial (training) dataset. Must be provided.
# MacroPCApars : the input options to be used for the prediction.
# By default the options of InitialMacroPCA
# are used. For the complete list of options
# see the function MacroPCA.
predictMacroPCA = MacroPCApredict(newX,MacroPCAinitX)
# We did not need to specify the third argument because it is taken
# from the initial fit MacroPCAinitX .
names(predictMacroPCA)
## [1] "MacroPCApars" "scaleX" "k" "loadings" "eigenvalues"
## [6] "center" "It" "diff" "scores" "OD"
## [11] "cutoffOD" "SD" "cutoffSD" "indrows" "residScale"
## [16] "stdResid" "indcells" "X.NAimp" "NAimp" "Cellimp"
## [21] "Fullimp" "DDC"
# The outputs are similar to those of of MacroPCA.
# Make the residual map:
columnlabels = colnames(newX)
rowlabels = rownames(newX)
ggpMacroPCApredict = cellMap(D=newX,
R=predictMacroPCA$stdResid,
indcells=predictMacroPCA$indcells,
indrows=predictMacroPCA$indrows,
standOD=predictMacroPCA$OD/predictMacroPCA$cutoffOD,
showVals="R",
rowlabels=rowlabels,
columnlabels=columnlabels,
mTitle="MacroPCApredict residual map",
adjustrowlabels=0.5)
plot(ggpMacroPCApredict) # is very similar to that based on all the data!
# Creating the combined pdf:
# pdf(file="TopGear_MacroPCApredict_residualMap.pdf",width=20,height=16)
# gridExtra::grid.arrange(ggpMacroPCAres,ggpMacroPCApredict,ncol=2)
# dev.off()
The glass data consists of spectra with 750 wavelengths of 180 archaeological glass samples.
data(data_glass)
library(rrcov) # for robust PCA:
dim(data_glass)
## [1] 180 750
# Do not scale the spectra in the glass data:
MacroPCApars$scale = FALSE
# Check data
checkData = checkDataSet(data_glass, silent=TRUE)
##
## The final data set we will analyze has 180 rows and 737 columns.
##
# With checkData = checkDataSet(glass, silent=FALSE) we obtain more information:
#
# The input data has 180 rows and 750 columns.
#
# The data contained 11 discrete columns with 3 or fewer values.
# Their column names are:
#
# [1] V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11
#
# These columns will be ignored in the analysis.
# We continue with the remaining 739 columns:
#
# [1] V12 V13 V14 V15 V16 V17 V18 V19 V20 V21 V22 V23 V24 V25
# ......
# [729] V740 V741 V742 V743 V744 V745 V746 V747 V748 V749 V750
#
# The data contained 2 columns with zero or tiny median absolute deviation.
# Their column names are:
#
# [1] V12 V13
#
# These columns will be ignored in the analysis.
# We continue with the remaining 737 columns:
#
# [1] V14 V15 V16 V17 V18 V19 V20 V21 V22 V23 V24 V25 V26 V27
# ......
# [729] V742 V743 V744 V745 V746 V747 V748 V749 V750
#
# The final data set we will analyze has 180 rows and 737 columns.
remglass = checkData$remX
n <- nrow(remglass); n
## [1] 180
d = ncol(remglass); d # the first 13 variables had scale 0:
## [1] 737
# Compare ICPCA and MacroPCA:
ICPCAglass <- ICPCA(remglass,k=4,scale=F,tolProb=0.99)
MacroPCAglass = MacroPCA(remglass,k=4,MacroPCApars=MacroPCApars) # takes 8 seconds
nrowsinblock = 5
rowlabels = rep("",floor(n/nrowsinblock));
rowlabels[1] = "1"
rowlabels[floor(n/nrowsinblock)] = "n";
rowtitle = "glass samples"
ncolumnsinblock = 5
columnlabels = rep("",floor(d/ncolumnsinblock));
columnlabels[1] = "1";
columnlabels[floor(d/ncolumnsinblock)] = "d"
columntitle = "wavelengths"
ggpICPCA <- cellMap(D=remglass,
R=ICPCAglass$stdResid,
indcells=ICPCAglass$indcells,
indrows=ICPCAglass$indrows,
standOD=ICPCAglass$OD/ICPCAglass$cutoffOD,
showVals=NULL,
rowlabels=rowlabels,
columnlabels=columnlabels,
mTitle="ICPCA residual map",
rowtitle=rowtitle,
columntitle=columntitle,
nrowsinblock=nrowsinblock,
ncolumnsinblock=ncolumnsinblock,
autolabel=FALSE,
sizetitles=1.75)
ggpMacroPCA <- cellMap(D=remglass,
R=MacroPCAglass$stdResid,
indcells=MacroPCAglass$indcells,
indrows=MacroPCAglass$indrows,
standOD=MacroPCAglass$OD/MacroPCAglass$cutoffOD,
showVals=NULL,
rowlabels=rowlabels,
columnlabels=columnlabels,
mTitle="MacroPCA residual map",
rowtitle=rowtitle,
columntitle=columntitle,
nrowsinblock=nrowsinblock,
ncolumnsinblock=ncolumnsinblock,
autolabel=FALSE,
sizetitles=1.75)
grid.arrange(ggpMacroPCA,ggpICPCA,nrow=2)
# pdfName = "Glass_MacroPCA_ICPCA_residualMap.pdf"
# dev.copy(pdf,pdfName,width=20,height=12)
# dev.off()
library(rrcov) # only needed for PcaHubert()
ROBPCAglass = PcaHubert(remglass,k=4,alpha=0.5)
ROBPCAindrows = which(ROBPCAglass@od > ROBPCAglass@cutoff.od)
# Calculate ROBPCA residuals and standardize them:
Xhat = sweep(ROBPCAglass@scores %*% t(ROBPCAglass@loadings),
2,ROBPCAglass@center,"+")
Xresid = remglass - Xhat
scaleRes = estLocScale(Xresid,type="1stepM",center=F)$scale
stdResidROBPCA = sweep(Xresid,2,scaleRes,"/")
# Univariate outlier detection indicates outlying cells:
cutoffResid = sqrt(qchisq(0.99,df=1))
ROBPCAindcells = which(abs(stdResidROBPCA) > cutoffResid)
ggpROBPCA <- cellMap(D=remglass,
R=stdResidROBPCA,
indcells=NULL,#ROBPCAindcells,
indrows=ROBPCAindrows,
standOD=NULL,
showVals=NULL,
rowlabels=rowlabels,
columnlabels=columnlabels,
mTitle="ROBPCA residual map",
rowtitle=rowtitle,
columntitle=columntitle,
nrowsinblock=nrowsinblock,
ncolumnsinblock=ncolumnsinblock,
autolabel=FALSE,
sizetitles=1.75)
grid.arrange(ggpMacroPCA,ggpROBPCA,nrow=2)
# pdfName = "Glass_MacroPCA_ROBPCA_residualMap.pdf"
# dev.copy(pdf,pdfName,width=20,height=12)
# dev.off()
fastDDCpars = list(fastDDC=TRUE, silent=TRUE)
fastMacroPCApars = list(DDCpars=fastDDCpars, scale=FALSE, silent=TRUE)
fastMacroPCAglass = MacroPCA(data_glass,k=4,MacroPCApars=fastMacroPCApars) # 2 seconds
##
## The final data set we will analyze has 180 rows and 737 columns.
##
ggpfastMacroPCA <- cellMap(D=remglass,
R=fastMacroPCAglass$stdResid,
indcells=fastMacroPCAglass$indcells,
indrows=fastMacroPCAglass$indrows,
standOD=fastMacroPCAglass$OD/fastMacroPCAglass$cutoffOD,
showVals=NULL,
columnlabels=columnlabels,
rowlabels=rowlabels,
mTitle="MacroPCA with fastDDC=T",
columntitle=columntitle,
rowtitle=rowtitle,
ncolumnsinblock=ncolumnsinblock,
nrowsinblock=nrowsinblock,
autolabel=FALSE,
sizetitles=1.75)
grid.arrange(ggpMacroPCA,ggpfastMacroPCA,nrow=2) # The results are similar:
# pdfName = "Glass_MacroPCA_residualMaps.pdf"
# dev.copy(pdf,pdfName,width=20,height=12)
# dev.off()
Finally we analyze the DPOSS data. This is a random subset of 20'000 stars from the Digitized Palomar Sky Survey described by Odewahn et al (1998).
data(data_dposs)
colnames(data_dposs); dim(data_dposs)
## [1] "MAperF" "MTotF" "MCoreF" "AreaF" "IR2F" "csfF" "EllipF" "MAperJ"
## [9] "MTotJ" "MCoreJ" "AreaJ" "IR2J" "csfJ" "EllipJ" "MAperN" "MTotN"
## [17] "MCoreN" "AreaN" "IR2N" "csfN" "EllipN"
## [1] 20000 21
n = nrow(data_dposs); n
## [1] 20000
# Count missing cells
missmat = is.na(data_dposs)
sizemat = nrow(missmat)*ncol(missmat); sizemat
## [1] 420000
100*sum(as.vector(missmat))/sizemat # 50.2% of the values are missing:
## [1] 50.20952
# Count rows with missings
missrow = length(which(rowSums(missmat) > 0))
100*missrow/nrow(missmat) # 84.6% of the rows contain missing values:
## [1] 84.61
# PERFORM ICPCA AND MACROPCA
ICPCAdposs = ICPCA(data_dposs,k=4,scale=TRUE)
names(ICPCAdposs)
## [1] "scaleX" "k" "loadings" "eigenvalues" "center"
## [6] "covmatrix" "It" "diff" "X.NAimp" "scores"
## [11] "OD" "cutoffOD" "SD" "cutoffSD" "indrows"
## [16] "residScale" "stdResid" "indcells"
# MacroPCA options with fracNA allowing for many NA's:
DDCPars = list(fastDDC=F,fracNA=1.0)
MacroPCAPars = list(DDCpars=DDCPars,scale=TRUE,silent=T)
MacroPCAdposs = MacroPCA(data_dposs,k=4,MacroPCApars=MacroPCAPars) # takes 6 seconds
## SCREE PLOTS
barplot(ICPCAdposs$eigenvalues,
main="ICPCA scree plot", ylab="eigenvalues",
names.arg=1:length(ICPCAdposs$eigenvalues))
# dev.copy(pdf,"DPOSS_ICPCA_screeplot.pdf",width=6,height=6)
# dev.off()
barplot(MacroPCAdposs$eigenvalues,
main="MacroPCA scree plot", ylab="eigenvalues",
names.arg=1:length(MacroPCAdposs$eigenvalues))
# Not as concentrated in the first eigenvalue.
# dev.copy(pdf,"DPOSS_MacroPCA_screeplot.pdf",width=6,height=6)
# dev.off()
## LOADINGS
ICPCAdposs$loadings[,2] = -ICPCAdposs$loadings[,2]
matplot(ICPCAdposs$loadings[,1:2],main="ICPCA loadings",
xlab="variables",ylab="Loadings",col=c("black","blue"),
ylim=c(-0.4,0.6),type="l",lty=c(1,2),lwd=2)
abline(v=7.5,col="red")
abline(v=14.5,col="red")
# dev.copy(pdf,"DPOSS_ICPCA_loadings.pdf",width=5,height=5)
# dev.off()
matplot(MacroPCAdposs$loadings[,1:2],main="MacroPCA loadings",
xlab="variables",ylab="Loadings",col=c("black","blue"),
ylim=c(-0.3,0.5),type="l",lty=c(1,2),lwd=2)
abline(v=7.5,col="red")
abline(v=14.5,col="red")
# dev.copy(pdf,"DPOSS_MacroPCA_loadings.pdf",width=5,height=5)
# dev.off()
# Select the 150 rows with highest OD from MacroPCA
dpossOD = MacroPCAdposs$OD
summary(dpossOD)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.04344 0.45833 0.99144 2.01535 2.31592 145.53136
n = length(dpossOD); n
## [1] 20000
sortOD = order(dpossOD,1:n)
summary(sortOD); sortOD[1:5]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1 5001 10000 10000 15000 20000
## [1] 10370 6891 1320 17270 7993
indHighOD = sortOD[n-150+1:n]
indHighOD = na.omit(indHighOD)
summary(indHighOD); length(indHighOD)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 157 5009 8222 9549 15101 19979
## [1] 150
indLowOD = sortOD[1:(n-150)]
indLowOD = na.omit(indLowOD)
summary(indLowOD); length(indLowOD)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1 5001 10014 10004 15000 20000
## [1] 19850
## OUTLIER MAPS
dpossColList = list(class1=list(col="black",index=indLowOD),
class2=list(col="red",index=indHighOD))
outlierMap(ICPCAdposs,title="ICPCA outlier map",
col=dpossColList,labelOut=FALSE)
# dev.copy(pdf,"DPOSS_ICPCA_outlierMap.pdf",width=8,height=8)
# dev.off()
outlierMap(MacroPCAdposs,title="MacroPCA outlier map",
col=dpossColList,labelOut=FALSE)
# dev.copy(pdf,"DPOSS_MacroPCA_outlierMap.pdf",width=8,height=8)
# dev.off()
## ICPCA SCORE PLOTS
ICPCAdposs$scores[,2] = -ICPCAdposs$scores[,2]
plot(ICPCAdposs$scores[,1:2],main="ICPCA scores",xlab="PC1",ylab="PC2")
points(ICPCAdposs$scores[indHighOD,1:2],pch=16,col="red")
# dev.copy(pdf,"DPOSS_ICPCA_Scores12.pdf",width=5,height=5)
# dev.off()
plot(ICPCAdposs$scores[,c(1,3)],main="ICPCA scores",xlab="PC1",ylab="PC3")
points(ICPCAdposs$scores[indHighOD,c(1,3)],pch=16,col="red")
plot(ICPCAdposs$scores[,2:3],main="ICPCA scores",xlab="PC2",ylab="PC3")
points(ICPCAdposs$scores[indHighOD,2:3],pch=16,col="red")
# MacroPCA SCORE PLOTS
MacroPCAdposs$scores[,2] = -MacroPCAdposs$scores[,2]
MacroPCAdposs$scores[,3] = -MacroPCAdposs$scores[,3]
plot(MacroPCAdposs$scores[,1:2],main="MacroPCA scores",xlab="PC1",ylab="PC2")
points(MacroPCAdposs$scores[indHighOD,1:2],pch=16,col="red")
# dev.copy(pdf,"DPOSS_MacroPCA_Scores12.pdf",width=5,height=5)
# dev.off()
plot(MacroPCAdposs$scores[,c(1,3)],main="MacroPCA scores",xlab="PC1",ylab="PC3")
points(MacroPCAdposs$scores[indHighOD,c(1,3)],pch=16,col="red")
plot(MacroPCAdposs$scores[,2:3],main="MacroPCA scores",xlab="PC2",ylab="PC3")
points(MacroPCAdposs$scores[indHighOD,2:3],pch=16,col="red")
# RESIDUAL MAPS
dpossH = na.omit(indHighOD)
summary(dpossH); length(as.vector(dpossH))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 157 5009 8222 9549 15101 19979
## [1] 150
dpossL = na.omit(indLowOD)
summary(dpossL); length(as.vector(dpossL))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1 5001 10014 10004 15000 20000
## [1] 19850
set.seed(0)
dpossH = sample(dpossH,150)
dpossL = sample(dpossL,300)
showrowsdposs = c(dpossH,dpossL)
summary(showrowsdposs)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 150 5110 9818 10115 15554 19991
length(showrowsdposs)
## [1] 450
nrowsinblock = 25
rowtitle = ""
ncolumnsinblock = 1
columntitle = "variables"
rowlabels = c("OS1","OS2","OS3","OS4","OS5","OS6",
"S1","S2","S3","S4","S5","S6","S7","S8",
"S9","S10","S11","S12")
columnlabels = c("MAperF","MTotF","MCoreF","AreaF","IR2F",
"csfF","EllipF",
"MAperJ","MTotJ","MCoreJ","AreaJ","IR2J",
"csfJ","EllipJ",
"MAperN","MTotN","MCoreN","AreaN","IR2N",
"csfN","EllipN")
ggpICPCAdposs = cellMap(D=remdposs,
R=ICPCAdposs$stdResid,
indcells=ICPCAdposs$indcells,
indrows=ICPCAdposs$indrows,
standOD=ICPCAdposs$OD/ICPCAdposs$cutoffOD,
showVals=NULL,
rowlabels=rowlabels,
columnlabels=columnlabels,
mTitle="ICPCA residual map",
rowtitle=rowtitle,
columntitle=columntitle,
showrows=showrowsdposs,
nrowsinblock=nrowsinblock,
ncolumnsinblock=ncolumnsinblock,
autolabel=F,
sizetitles=1.75)
plot(ggpICPCAdposs) # not much to see:
# dev.copy(pdf,"DPOSS_ICPCA_residualMap.pdf",width=8,height=6)
# dev.off()
ggpMacroPCAdposs = cellMap(D=remdposs,
R=MacroPCAdposs$stdResid,
indcells=MacroPCAdposs$indcells,
indrows=MacroPCAdposs$indrows,
standOD=MacroPCAdposs$OD/MacroPCAdposs$cutoffOD,
showVals=NULL,
rowlabels=rowlabels,
columnlabels=columnlabels,
mTitle="MacroPCA residual map",
rowtitle=rowtitle,
columntitle=columntitle,
showrows=showrowsdposs,
nrowsinblock=nrowsinblock,
ncolumnsinblock=ncolumnsinblock,
autolabel=F,
sizetitles=1.75)
plot(ggpMacroPCAdposs) # interesting structure:
# dev.copy(pdf,"DPOSS_MacroPCA_residualMap.pdf",width=8,height=6)
# dev.off()