f irisdata basicstatsanalysis · 102 5.8 2.7 5.1 1.9 virginica 149 6.2 3.4 5.4 2.3 virginica 150...
TRANSCRIPT
Workshop:Getting Startedwith R.UTM14Oct2018.©Dr.NorhaizaAhmad
14 October 2018 Dr. Norhaiza Ahmad
Department of Mathematical SciencesFaculty of Science
Universiti Teknologi Malaysiahttp://science.utm.my/norhaiza/
Getting Started with
for newbiesSTATISTICAL ANALYSIS ON IRIS DATA
Workshop:Getting Startedwith R.UTM14Oct2018.©Dr.NorhaizaAhmad
IRIS DATASET
The Iris flower dataset is a collection of data to quantify the morphologic variation of Iris flowers. The flowers were collected in the Gaspé Peninsula from the same pasture, and picked on the same day and measured at the same time by the same person with the same apparatus. The dataset consists of 50 samples from each of three species of Iris (Iris setosa (far left), Iris versicolor (centre) and Iris virginica). Four components of the flowers' features were measured from each sample: the length and the width of the sepals and petals, in centimetres.
Workshop:Getting Startedwith R.UTM14Oct2018.©Dr.NorhaizaAhmad
Irisdataset•The iris data is included in the R base package as a dataframe.
> irisSepal.Length Sepal.Width Petal.Length Petal.Width Species
1 5.1 3.5 1.4 0.2 setosa2 4.9 3.0 1.4 0.2 setosa..50 5.0 3.3 1.4 0.2 setosa51 7.0 3.2 4.7 1.4 versicolor52 6.4 3.2 4.5 1.5 versicolor..100 5.7 2.8 4.1 1.3 versicolor101 6.3 3.3 6.0 2.5 virginica102 5.8 2.7 5.1 1.9 virginica..149 6.2 3.4 5.4 2.3 virginica150 5.9 3.0 5.1 1.8 virginica
Workshop:Getting Startedwith R.UTM14Oct2018.©Dr.NorhaizaAhmad
TASK
Call up the IRIS dataset in R and analyze the dataset using the codes given to you.
Workshop:Getting Startedwith R.UTM14Oct2018.©Dr.NorhaizaAhmad
# Iris dataset
# Print the whole data frame, then only its first and last rows.
iris # multivariate data on flower measurements
head(iris)
tail(iris)
>head(iris)Sepal.Length Sepal.Width Petal.LengthPetal.Width Species15.13.51.40.2setosa24.93.01.40.2setosa34.73.21.30.2setosa44.63.11.50.2setosa55.03.61.40.2setosa65.43.91.70.4setosa
Workshop:Getting Startedwith R.UTM14Oct2018.©Dr.NorhaizaAhmad
Irisdatasetsummary(iris)Sepal.Length Sepal.Width Petal.Length Petal.Width Species Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100 setosa :50 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300 versicolor:50 Median :5.800 Median :3.000 Median :4.350 Median :1.300 virginica :50 Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800 Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
#mean and median appear close- indication data symmetric
Workshop:Getting Startedwith R.UTM14Oct2018.©Dr.NorhaizaAhmad
Irisdataset
# Say we want to analyse species versicolor only.
# Create a subset of species versicolor, keeping the four measurement
# columns (columns 1 to 4) and leaving out the Species column.
iris.vs <- iris[iris$Species == "versicolor", 1:4]
# or, selecting the same rows by position (rows 51 to 100):
iris.vs <- iris[51:100, 1:4]

# Column names of the subset.
names(iris.vs)
[1] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
Workshop:Getting Startedwith R.UTM14Oct2018.©Dr.NorhaizaAhmad
# Iris dataset
# DISPLAY DATA
# Histogram of the first column of the versicolor subset, default labels.
hist(iris.vs[, 1])

# tidy: use the column name as the plot title
hist(iris.vs[, 1], main = names(iris)[1], xlab = NULL)

# or
# change layout of graphs: draw both versions side by side
par(mfrow = c(1, 2)) # 1 row, 2 col layout
hist(iris.vs[, 1])
hist(iris.vs[, 1], main = names(iris)[1], xlab = NULL)
Workshop:Getting Startedwith R.UTM14Oct2018.©Dr.NorhaizaAhmad
# Iris dataset
# DISPLAY DATA
# Graphs for all measurements of iris versicolor, one histogram per column
# in a 2 x 2 layout; each panel is titled with its column name.
par(mfrow = c(2, 2))
for (j in seq_along(iris.vs)) {
  hist(iris.vs[, j], main = names(iris.vs)[j], xlab = NULL)
}

# or
# multi scatter-plots between variables
pairs(iris.vs)
Workshop:Getting Startedwith R.UTM14Oct2018.©Dr.NorhaizaAhmad
# Iris dataset
# Correlation between variables
cor(iris.vs)
# pairs Sepal.Length vs Petal.Length, and Petal.Length vs Petal.Width
# are most strongly correlated with respective correlations of
# 0.7540 and 0.7867

# or
# test a single pair: column 3 (Petal.Length) against column 4 (Petal.Width)
cor.test(iris.vs[, 3], iris.vs[, 4])
Sepal.Length Sepal.Width Petal.Length Petal.WidthSepal.Length 1.0000000 0.5259107 0.7540490 0.5464611Sepal.Width 0.5259107 1.0000000 0.5605221 0.6639987Petal.Length 0.7540490 0.5605221 1.0000000 0.7866681Petal.Width 0.5464611 0.6639987 0.7866681 1.0000000
#Correlation test between Petal.Length vs Petal Width
Pearson's product-moment correlation
data: iris.vs[, 3] and iris.vs[, 4] t = 8.828, df = 48, p-value = 1.272e-11alternative hypothesis: true correlation is not equal to 0 95 percent confidence interval:0.6508311 0.8737034 sample estimates:
cor0.7866681 #Significant linear correlation
Workshop:Getting Startedwith R.UTM14Oct2018.©Dr.NorhaizaAhmad
Irisdataset
# Simple linear regression on the versicolor subset: the response is column 3
# (Petal.Length) and the predictor is column 4 (Petal.Width). The outer
# parentheses print the fitted model as well as storing it in irisVS.lm.
(irisVS.lm=lm(iris.vs[,3]~iris.vs[,4]))
#how to build a statistical model to predict a new Petal length given a new petal width? Use simple linear regression model
or
Call:lm(formula = iris.vs[, 3] ~ iris.vs[, 4])
Coefficients:(Intercept) iris.vs[, 4]
1.781 1.869
#Petal Length = 1.781 + 1.869*Petal Width
Workshop:Getting Startedwith R.UTM14Oct2018.©Dr.NorhaizaAhmad
Irisdataset
# Is there a difference between the average Petal Length of species
# Setosa and Versicolor?
# assume the data are normally distributed

# Subset of species setosa, measurement columns only.
iris.s <- iris[iris$Species == "setosa", 1:4]

# Two-sample t-test on column 3 (Petal.Length) of each subset.
t.test(iris.s[, 3], iris.vs[, 3])
> t.test(iris.s[,3],iris.vs[,3])
Welch Two Sample t-test
data: iris.s[, 3] and iris.vs[, 3] t = -39.4927, df = 62.14, p-value < 2.2e-16alternative hypothesis: true difference in means is not equal to 0 95 percent confidence interval:-2.939618 -2.656382 sample estimates:mean of x mean of y
1.462 4.260
#Significant evidence to Reject the null hypothesis that there is no difference between the average petal length of species iris Setosa & Versicolor
Workshop:Getting Startedwith R.UTM14Oct2018.©Dr.NorhaizaAhmad
# ------------
# Advanced - use package ggplot2 : IRIS
library(ggplot2)

# Set up the "graph paper": the data and the x/y aesthetic mapping.
p1 <- ggplot(data = iris, aes(x = Petal.Length, y = Petal.Width))
p1

# Use a geom to specify what to plot: points coloured by species.
p2 <- p1 + geom_point(aes(color = Species))
p2

# Add a linear regression model to fit the data.
p3 <- p2 + geom_smooth(method = "lm")
p3

# Create/modify the axis labels and the title.
p4 <- p3 +
  xlab("Petal Length (cm)") +
  ylab("Petal Width (cm)") +
  ggtitle("PetalLgth vs Petal Width")
p4
Use package ggplot2 : IRIS
Workshop:Getting Startedwith R.UTM14Oct2018.©Dr.NorhaizaAhmad
OtherExample
Workshop:Getting Startedwith R.UTM14Oct2018.©Dr.NorhaizaAhmad
15
Example:StudentAdmissionsdata
Aggregate data on applicants to postgraduate school at Berkeley for the six largest departments, classified by admission and gender.
Admission Levels: Admitted/Rejected
Gender: Male/Female
Department: A-F
> UCBdtAdmit Gender Dept Freq
1 Admitted Male A 5122 Rejected Male A 3133 Admitted Female A 894 Rejected Female A 195 Admitted Male B 3536 Rejected Male B 2077 Admitted Female B 178 Rejected Female B 89 Admitted Male C 12010 Rejected Male C 20511 Admitted Female C 20212 Rejected Female C 39113 Admitted Male D 13814 Rejected Male D 27915 Admitted Female D 13116 Rejected Female D 24417 Admitted Male E 5318 Rejected Male E 13819 Admitted Female E 9420 Rejected Female E 29921 Admitted Male F 2222 Rejected Male F 35123 Admitted Female F 2424 Rejected Female F 317
Workshop:Getting Startedwith R.UTM14Oct2018.©Dr.NorhaizaAhmad
16
SimpleVisual:StudentAdmissions-packageplyr
More males than females admitted to the university.
Highest admission for department A compared to the rest. Lowest admission for department F. Dept. A & B discriminate gender for admission.
Highest admission for department A compared to the rest. Lowest admission for department F. Dept. A & B discriminate gender for admission.
Workshop:Getting Startedwith R.UTM14Oct2018.©Dr.NorhaizaAhmad
# -------------
# Advanced - use package plyr : Students Admission
library(plyr)
library(ggplot2) # the plots below call ggplot(); harmless if already attached
library(datasets)

# Convert UCBAdmissions to a data frame with one row per
# Admit x Gender x Dept combination and its count in Freq.
UCBdt <- as.data.frame(UCBAdmissions)

# Share of admitted and rejected applicants within one group of rows.
# Rows are picked by their Admit label rather than by position, so the
# labels stay correct whatever the row order inside the group.
admission_shares <- function(group) {
  total <- sum(group$Freq)
  c(
    Admitted = sum(group$Freq[group$Admit == "Admitted"]) / total,
    Rejected = sum(group$Freq[group$Admit == "Rejected"]) / total
  )
}

# Admission shares per gender, and per gender within each department.
overall <- ddply(UCBdt, .(Gender), admission_shares)
departmentwise <- ddply(UCBdt, .(Gender, Dept), admission_shares)

# A barplot for overall admission percentage for each gender
# (values are proportions, hence the 0-1 y axis).
p1 <- ggplot(data = overall, aes(x = Gender, y = Admitted)) +
  geom_bar(stat = "identity", width = 0.2) +
  ggtitle("Overall admission percentage") +
  ylim(0, 1)
p1

# A 1x6 panel of barplots, each of which represents the
# number of admitted students for a department
p2 <- ggplot(
  data = UCBdt[UCBdt$Admit == "Admitted", ],
  aes(x = Gender, y = Freq)
) +
  geom_bar(stat = "identity") +
  facet_grid(. ~ Dept) +
  ggtitle("Number of admitted students\nfor each department") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
p2

# A 1x6 panel of barplots, each of which represents the
# admission percentage for a department
p3 <- ggplot(data = departmentwise, aes(x = Gender, y = Admitted)) +
  geom_bar(stat = "identity") +
  facet_grid(. ~ Dept) +
  ylim(0, 1) +
  ggtitle("Admission percentage\nfor each department") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
p3

# A 1x6 panel of barplots, each of which represents the
# number of applicants for a department
p4 <- ggplot(data = UCBdt, aes(x = Gender, y = Freq)) +
  geom_bar(stat = "identity") +
  facet_grid(. ~ Dept) +
  ggtitle("Number of Applicants\nfor each department") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
p4
# --------------------