#Data
data("iris")
str(iris)
'data.frame': 150 obs. of 5 variables:
$ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
$ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
$ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
$ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
$ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
summary(iris)
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100 setosa :50
1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300 versicolor:50
Median :5.800 Median :3.000 Median :4.350 Median :1.300 virginica :50
Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
#Partition Data
set.seed(111)
ind <- sample(2, nrow(iris), replace = TRUE, prob = c(0.8, 0.2))
training <- iris[ind==1,]
testing <- iris[ind==2,]
# Scatter Plot & Correlations
# In case, psych package is missing, packages > install > psych
library(psych)
pairs.panels(training[,-5],
gap = 0,
bg = c("red", "yellow", "blue")[training$Species],
pch = 21)
# Principal Component Analysis
pc <- prcomp(training[,-5],
center = TRUE,
scale. = TRUE)
attributes(pc)
$names
[1] "sdev" "rotation" "center" "scale" "x"
$class
[1] "prcomp"
pc$center
Sepal.Length Sepal.Width Petal.Length Petal.Width
5.790000 3.069167 3.597500 1.111667
mean(training$Sepal.Length)
[1] 5.79
pc$scale
Sepal.Length Sepal.Width Petal.Length Petal.Width
0.8234382 0.4588615 1.7872782 0.7556158
sd(training$Sepal.Length)
[1] 0.8234382
print(pc)
Standard deviations (1, .., p=4):
[1] 1.7173318 0.9403519 0.3843232 0.1371332
Rotation (n x k) = (4 x 4):
PC1 PC2 PC3 PC4
Sepal.Length 0.5147163 -0.39817685 0.7242679 0.2279438
Sepal.Width -0.2926048 -0.91328503 -0.2557463 -0.1220110
Petal.Length 0.5772530 -0.02932037 -0.1755427 -0.7969342
Petal.Width 0.5623421 -0.08065952 -0.6158040 0.5459403
Each principal component is normalized linear combination of original variables.
summary(pc)
Importance of components%s:
PC1 PC2 PC3 PC4
Standard deviation 1.7173 0.9404 0.38432 0.1371
Proportion of Variance 0.7373 0.2211 0.03693 0.0047
Cumulative Proportion 0.7373 0.9584 0.99530 1.0000
PC1 captures 73% of variation.
#Orthogonality of PCs
pairs.panels(pc$x,
gap=0,
bg = c("red", "yellow", "blue")[training$Species],
pch = 21)
#Bi-Plot
library(devtools)
#if ggbiplot is missing run this once.
#install_github("ggbiplot", "vqv")
library(ggbiplot)
g <- ggbiplot(pc,
obs.scale = 1,
var.scale = 1,
groups = training$Species,
ellipse = TRUE,
circle = TRUE,
ellipse.prob = 0.68)
g <- g + scale_color_discrete(name='')
g <- g + theme(legend.direction = 'horizontal',
legend.position = 'top')
print(g)
#Prediction with PCs
trg <- predict(pc, training)
trg <- data.frame(trg, training[5])
tst <- predict(pc, testing)
tst <- data.frame(tst, testing[5])
# Multinomial Logistic regression with Firsth two PCs
library(nnet)
trg$Species <- relevel(trg$Species, ref = "setosa")
mymodel <- multinom(Species~PC1+PC2, data = trg)
# weights: 12 (6 variable)
initial value 131.833475
iter 10 value 20.607042
iter 20 value 18.331120
iter 30 value 18.204474
iter 40 value 18.199783
iter 50 value 18.199009
iter 60 value 18.198506
final value 18.198269
converged
summary(mymodel)
Call:
multinom(formula = Species ~ PC1 + PC2, data = trg)
Coefficients:
(Intercept) PC1 PC2
versicolor 7.2345029 14.05161 3.167254
virginica -0.5757544 20.12094 3.625377
Std. Errors:
(Intercept) PC1 PC2
versicolor 187.5986 106.3766 127.8815
virginica 187.6093 106.3872 127.8829
Residual Deviance: 36.39654
AIC: 48.39654
# Confusion Matrix & Misclassification Error - training
p <- predict(mymodel, trg)
tab <- table(p, trg$Species)
tab
p setosa versicolor virginica
setosa 45 0 0
versicolor 0 35 3
virginica 0 5 32
During Classification, made mistook 5 labels to category 3 instead of 2.
# Calculating miss classification error - training
1 - sum(diag(tab))/sum(tab)
[1] 0.06666667
p1 <- predict(mymodel, tst)
tab1 <- table(p1, tst$Species)
1 - sum(diag(tab1))/sum(tab1)
[1] 0.1333333
# What if we choose 3 principle components
mymodel <- multinom(Species~PC1+PC2+PC3, data = trg)
# weights: 15 (8 variable)
initial value 131.833475
iter 10 value 13.118022
iter 20 value 2.460562
iter 30 value 2.286200
iter 40 value 1.186481
iter 50 value 1.175090
iter 60 value 1.165399
iter 70 value 1.150867
iter 80 value 1.144306
iter 90 value 1.105235
iter 100 value 1.018050
final value 1.018050
stopped after 100 iterations
No convergence achieved after 100 iterations.