load the data set: TeachingRatings.rda

setInternet2(TRUE) # solution for https files

download.file("https://sites.google.com/site/statsr4us/intro/software/rcmdr-1/TeachingRatings.rda", "TeachingRatings.rda")
## Warning in download.file("https://sites.google.com/site/statsr4us/intro/
## software/rcmdr-1/TeachingRatings.rda", : downloaded length 6252 != reported
## length 6252
load("TeachingRatings.rda")

setwd("C:/Users/Murtaza/Google Drive/AEBE/Data/ch.06")

names(TeachingRatings)
##  [1] "minority"    "age"         "gender"      "credits"     "beauty"     
##  [6] "eval"        "division"    "native"      "tenure"      "students"   
## [11] "allstudents" "prof"
## Warning: package 'xtable' was built under R version 3.1.3
## Warning: package 'psych' was built under R version 3.1.3

Figure 6.1: Histogram of Appl returns

library(quantmod)  ##  load the package "quantmod"
## Warning: package 'quantmod' was built under R version 3.1.3
## Loading required package: xts
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 3.1.3
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: TTR
## Warning: package 'TTR' was built under R version 3.1.3
## Version 0.4-0 included new data defaults. See ?getSymbols.
getSymbols("AAPL") ##  get daily Apple stock data from Yahoo Finance
##     As of 0.4-0, 'getSymbols' uses env=parent.frame() and
##  auto.assign=TRUE by default.
## 
##  This  behavior  will be  phased out in 0.5-0  when the call  will
##  default to use auto.assign=FALSE. getOption("getSymbols.env") and 
##  getOptions("getSymbols.auto.assign") are now checked for alternate defaults
## 
##  This message is shown once per session and may be disabled by setting 
##  options("getSymbols.warning4.0"=FALSE). See ?getSymbols for more details.
## Warning in download.file(paste(yahoo.URL, "s=", Symbols.name, "&a=",
## from.m, : downloaded length 165393 != reported length 200
## [1] "AAPL"
dim(AAPL)          ##  find the sizes of the data downloaded
## [1] 2271    6
getSymbols("AAPL",from="2011-01-01",to="2013-12-31")  ##  specify the data span
## Warning in download.file(paste(yahoo.URL, "s=", Symbols.name, "&a=",
## from.m, : downloaded length 55540 != reported length 200
## [1] "AAPL"
y<-diff(log(AAPL$AAPL.Close))

x1<-rbind(mean=mean(y, na.rm=T), sdev=sd(y, na.rm=T))
colnames(x1)<-"descriptive stats" ; x1
##      descriptive stats
## mean      0.0007064645
## sdev      0.0177433396
x <- coredata(y[-1,])

h<-hist(x, breaks=20, col="dark grey", xlab="daily returns for Apple, 2011-2013",
        main="Histogram with Normal Curve" ) 
xfit<-seq(min(x),max(x),length=400) 
yfit<-dnorm(xfit,mean=mean(x),sd=sd(x)) 
yfit <- yfit*diff(h$mids[1:2])*length(x) 
lines(xfit, yfit, col="black", lwd=2)

Distributions

Fig. 6.6 Normal Distrinution

x=seq(-4,4,length=200)
y=1/sqrt(2*pi)*exp(-x^2/2)
plot(x,y,type="l",lwd=2,col="dark grey")

Comparing normal and t-distribution with varying degrees of freedom

Fig. 6.7

# Display the Student's t distributions with various
# degrees of freedom and compare to the normal distribution
# http://www.statmethods.net/advgraphs/probability.html

x <- seq(-4, 4, length=100)
hx <- dnorm(x)

degf <- c(1, 3, 8, 30)
colors <- c("red", "blue", "darkgreen", "gold", "black")
labels <- c("df=1", "df=3", "df=8", "df=30", "normal")

plot(x, hx, type="l", lty=2, xlab="x value",
  ylab="Density", main="Comparing t with normal distribution",
  sub="adapted from http://www.statmethods.net/advgraphs/probability.html")

for (i in 1:4){
  lines(x, dt(x,degf[i]), lwd=2, lty=i+2, col=colors[i])
}

legend("topright", inset=.05, title="Distributions",
  labels, lwd=2, lty=c(3,4,5,6,2), col=colors)

Teaching Evaluations

Table 6.2 Summary Stats, teaching evaluations

attach(TeachingRatings)

tab <- xtable(describe(cbind(eval, age, beauty, students, allstudents), 
                       skew=F, ranges=F))
rownames(tab)<- c("teaching evaluation score", "instructor's age", "beauty score", 
                  "students responding to survey","students registered in course")
print(tab, type="html")
vars n mean sd se
teaching evaluation score 1 463.00 4.00 0.55 0.03
instructor’s age 2 463.00 48.37 9.80 0.46
beauty score 3 463.00 0.00 0.79 0.04
students responding to survey 4 463.00 36.62 45.02 2.09
students registered in course 5 463.00 55.18 75.07 3.49

Fig. 6.8 Histogram of teaching evaluation

library(lattice)
## Warning: package 'lattice' was built under R version 3.1.3
histogram(TeachingRatings$eval, nint=15, aspect=2, 
          xlab="teaching evaluation score", col=c("dark grey"))

Fig. 6.9: The normal curve for evaluations

x=seq(1,7,length=200)
y=dnorm(x,mean=3.998,sd=.554)
plot(x,y,type="l",lwd=2,col="dark grey", main="Normal distribution",
          xlab="teaching evaluation score", ylab= "density")
          mtext("mean = 3.998, sd=0.554")

Fig. 6.10: Histogram of normalized score

z.eval<-as.matrix((TeachingRatings$eval-3.998)/.554)
histogram(z.eval, nint=15, aspect=2, 
          xlab="nomalized beauty score", col=c("dark grey"))

F6.11: Prob > 4.5

dnorm(4.5,mean=3.998,sd=.554)
## [1] 0.4776437
1-pnorm(4.5,mean=3.998,sd=.554)
## [1] 0.1824316
x=seq(1,7,length=200)
y=dnorm(x,mean=3.998,sd=.554)
plot(x,y,type="l",lwd=2,col="black",  main="Normal distribution",
     sub="probability = 0.1824",
          xlab="teaching evaluation score", ylab= "density")
          mtext("probability of obtaining teching evaluation >4.5")
ticks = c(4.5)
axis(side = 1, at = ticks)
x=seq(4.5,7,length=200)
y=dnorm(x,mean=3.998,sd=.554)
polygon(c(4.5,x,7),c(0,y,0),col="gray")

F6.12 Prob <= 4.5

dnorm(4.5,mean=3.998,sd=.554)
## [1] 0.4776437
pnorm(4.5,mean=3.998,sd=.554)
## [1] 0.8175684
x=seq(1,7,length=200)
y=dnorm(x,mean=3.998,sd=.554)
plot(x,y,type="l",lwd=2,col="black",  main="Normal distribution",
     sub="probability = 0.8176",
          xlab="teaching evaluation score", ylab= "density")
          mtext("probability of obtaining teching evaluation < 4.5")
ticks = c(4.5)
axis(side = 1, at = ticks)
x=seq(-7, 4.5,length=200)
y=dnorm(x,mean=3.998,sd=.554)
polygon(c(-7, x, 4.5),c(0,y,0),col="gray")

F6.14: 3.5< Prob > 4.5

pnorm(4.2,mean=3.998,sd=.554)-pnorm(3.5,mean=3.998,sd=.554)
## [1] 0.4579544
x=seq(2,6,length=200)
y=dnorm(x,mean=3.998,sd=.554)
plot(x,y,type="l",lwd=2,col="black",  main="Normal distribution",
     sub="probability = 0.46",
          xlab="teaching evaluation score", ylab= "density")
          mtext("3.5 <= Teaching evaluation <= 4.2")
ticks = c(3.5, 4.2)
axis(side = 1, at = ticks, line =.8)
x=seq(3.5, 4.2,length=200)
y=dnorm(x,mean=3.998,sd=.554)
polygon(c(3.5,x,4.2),c(0,y,0),col="gray")

Basket Ball

setInternet2(TRUE) # solution for https files
download.file("https://sites.google.com/site/econometriks/docs/basket.rda", "basket.rda")
load("basket.rda")

Fig. 6.15: Chamberlain and Jordan

basket$player<-as.factor(basket$player)
attach(basket) 

tapply(PTS, player,mean);tapply(PTS, player,sd)
##   Michael Jordan Wilt Chamberlain 
##         29.45333         30.63750
##   Michael Jordan Wilt Chamberlain 
##         4.763832        10.591813
x=seq(0,60,length=200)
y=dnorm(x,mean=29.45,sd=4.764)
plot(x,y,type="l",lwd=2,col="black", main="",
     sub="", xlab="average points scored per game", ylab= "density")
y=dnorm(x,mean=30.63,sd=10.59)
lines(x,y,type="l",lwd=2,col="dark grey", main="",
     sub="", xlab=" ", ylab= "")
mtext("Michael \nJordan",1,line=-10,at=40,col="black")
mtext("Wilt \nChamberlain",1,line=-6,at=10,col="blue")

Figures 6.16-6.18: The rejection regions

##two tailed
x=seq(-4,4,length=200)
y=dnorm(x,mean=0,sd=1)
plot(x,y,type="l",lwd=2,col="black", main="Two-tailed test",
     sub="", xlab=" ", ylab= "density", xaxt="n")
mtext("Normal distribution")
axis(1, c(-4, -3, 0, 3,4), line=0); axis(1, c(-1.96, 1.96), line=0)

x=seq(-4, -1.96, length=200)
y=dnorm(x,mean=0,sd=1)
polygon(c(-4,x,-1.96),c(0,y,0),col="gray")

x=seq(1.96, 4, length=200)
y=dnorm(x,mean=0,sd=1)
polygon(c(1.96, x, 4),c(0,y,0),col="gray")

##less than
x=seq(-4,4,length=200)
y=dnorm(x,mean=0,sd=1)
plot(x,y,type="l",lwd=2,col="black", main="One-tailed test",
     sub="", xlab=" ", ylab= "density", xaxt="n")
mtext("Normal distribution, left sided")

axis(1, c(-4, -3, 0, 3,4), line=0); axis(1, c(-1.64, 1.64), line=0)

x=seq(-4, -1.64, length=200)
y=dnorm(x,mean=0,sd=1)
polygon(c(-4,x,-1.64),c(0,y,0),col="gray")

##greater than
x=seq(-4,4,length=200)
y=dnorm(x,mean=0,sd=1)
plot(x,y,type="l",lwd=2,col="black", main="One-tailed test",
     sub="", xlab=" ", ylab= "density", xaxt="n")
mtext("Normal distribution, right sided")
axis(1, c(-4, -3, 0, 3,4), line=0); axis(1, c(-1.64, 1.64), line=0)

x=seq(1.64, 4, length=200)
y=dnorm(x,mean=0,sd=1)
polygon(c(1.64, x, 4.5),c(0,y,0),col="gray")

Fig. 6.19: Central limit Theorem

y=qt(.05, c(30,60,90, 200, 400, 500, 1000, 4000),lower.tail = T)
x= c(30,60,90, 200, 400, 500, 1000, 4000)
plot(x,y, pch=20, col="black", main="Critical values for t (lower tail)",
     xlab="degrees of freedom", ylab="critical t values for 5%")

y
## [1] -1.697261 -1.670649 -1.661961 -1.652508 -1.648672 -1.647907 -1.646379
## [8] -1.645235

One sample, known population SDEV mean test

-1.42>-1.96

Two-sided

prob=pnorm(-1.96,mean=0,sd=1);prob*2
## [1] 0.04999579
prob=pnorm(10.7,mean=12,sd=5.5/sqrt(36));prob*2
## [1] 0.1561377
library(NCStats)
## Loading required package: FSA
## Warning: package 'FSA' was built under R version 3.1.3
## 
## 
##  ############################################
##  ##      FSA package, version 0.8.4        ##
##  ##    Derek H. Ogle, Northland College    ##
##  ##                                        ##
##  ## Run ?FSA for documentation.            ##
##  ## Run citation('FSA') for citation ...   ##
##  ##   please cite if used in publication.  ##
##  ##                                        ##
##  ## See derekogle.com/fishR/ for more      ##
##  ##   thorough analytical vignettes.       ##
##  ############################################
## 
## Attaching package: 'FSA'
## The following object is masked from 'package:psych':
## 
##     headtail
## Loading required package: car
## Warning: package 'car' was built under R version 3.1.3
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
## Loading required package: FSAdata
## Warning: package 'FSAdata' was built under R version 3.1.3
## 
## 
##  ##############################################
##  ## FSAdata package, version 0.3.2           ##
##  ##   by Derek H. Ogle, Northland College    ##
##  ##                                          ##
##  ## Run ?FSAdata for documentation with      ##
##  ##   search hints to find data for specific ##
##  ##   types of fisheries analyses.           ##
##  ##############################################
## 
## 
##  ############################################
##  ## NCStats package, version 0.4.2         ##
##  ##   by Derek H. Ogle, Northland College  ##
##  ##                                        ##
##  ##    type ?NCStats for documentation.    ##
##  ############################################
## 
## Attaching package: 'NCStats'
## The following objects are masked from 'package:FSA':
## 
##     compIntercepts, compSlopes, geomean, geosd
z.test(10.7,mu = 12,sd = 5.5/sqrt(36),alternative = "two.sided")
## One Sample z-test with 10.7 
## z = -1.4182, n = 1.000, Std. Dev. = 0.917, SE of the sample mean =
## 0.917, p-value = 0.1561
## alternative hypothesis: true mean is not equal to 12 
## 95 percent confidence interval:
##   8.903366 12.496634 
## sample estimates:
## mean of 10.7 
##         10.7
xbar=10.7; sdev=5.5; mu=12; n=36
z1<-(xbar-mu)/(sdev/sqrt(n));z1
## [1] -1.418182
zvalue=round(z1,2);zvalue
## [1] -1.42
x=seq(-4,4,length=200)
y=dnorm(x,mean=0,sd=1)
plot(x,y,type="l",lwd=2,col="black", main="Comparison of mean test",
     sub="", xlab=" ", ylab= "density")
mtext("Normal distribution")
x=seq(-4, -1.96, length=200)
y=dnorm(x,mean=0,sd=1)
polygon(c(-4,x,-1.96),c(0,y,0),col="gray")

x=seq(1.96, 4, length=200)
y=dnorm(x,mean=0,sd=1)
polygon(c(1.96, x, 4),c(0,y,0),col="gray")

x=seq(-4,zvalue, length=200)
y=dnorm(x,mean=0,sd=1)
polygon(c(-4,x, zvalue),c(0,y,0),col=, lwd=3)
ticks = c(zvalue)
axis(side = 1, at = ticks, line=0.3)

## Axes and labels
axis(1, -4:4, labels = round(-4:4*sdev/sqrt(n)+mu,3), line=3, 
     col="blue",col.ticks="blue",col.axis="blue")

axis(1, zvalue, labels = round(zvalue*sdev/sqrt(n)+mu,1), line=2.2, 
     col="blue",col.ticks="blue",col.axis="blue")
mtext("raw \ndata",1,line=3,at=-5,col="blue")
mtext("normalized\ndata",1,line=1.3,at=-4.7,col="black")

# LABELS
mylabel2 = bquote(italic(z-value) == .(format(zvalue, digits = 3)))
text(x = 3, y = .4, labels = mylabel2)
mylabel3 = bquote(italic(p-value) == .(format(prob*2, digits = 5)))
text(x = 3, y = .37, labels = mylabel3)

The 95% region

x=seq(-4,4,length=200)
y=dnorm(x,mean=0,sd=1)
plot(x,y,type="l",lwd=2,col="black", main="Comparison of mean test",
     sub=" ", xlab=" ", ylab= "density")
mtext("95% confidence interval")

x=seq(-1.96, 1.96, length=200)
y=dnorm(x,mean=0,sd=1)
polygon(c(-1.96,x,1.96),c(0,y,0),col="gray")

x=seq(-4,zvalue, length=200)
y=dnorm(x,mean=0,sd=1)
polygon(c(-4,x, zvalue),c(0,y,0),col=, lwd=3)

ticks = c(zvalue)
axis(side = 1, at = ticks, line=0.3)

axis(1, -4:4, labels = round(-4:4*sdev/sqrt(n)+mu,3), line=3, 
     col="blue",col.ticks="blue",col.axis="blue")

axis(1, zvalue, labels = round(zvalue*sdev/sqrt(n)+mu,3), line=2.2, 
     col="blue",col.ticks="blue",col.axis="blue")

mtext("raw \ndata",1,line=3,at=-5,col="blue")
mtext("normalized\ndata",1,line=1.3,at=-4.7,col="black")

Fig. 6.23: Left sided

xbar=6.5; sdev=1.5; mu=7; n=20


prob=pnorm(xbar,mean=mu,sd=sdev/sqrt(n));prob
## [1] 0.06801856
z.test(xbar,mu = mu,sd = sdev/sqrt(n),alternative = "less")
## One Sample z-test with xbar 
## z = -1.4907, n = 1.000, Std. Dev. = 0.335, SE of the sample mean =
## 0.335, p-value = 0.06802
## alternative hypothesis: true mean is less than 7 
## 95 percent confidence interval:
##      -Inf 7.051701 
## sample estimates:
## mean of xbar 
##          6.5
z1<-(xbar-mu)/(sdev/sqrt(n));z1
## [1] -1.490712
zvalue=round(z1,2);zvalue
## [1] -1.49
x=seq(-4,4,length=200)
y=dnorm(x,mean=0,sd=1)
plot(x,y,type="l",lwd=2,col="black", main="Comparison of mean test",
     sub="", xlab=" ", ylab= "density")
mtext("one-tailed test")

x=seq(-4, -1.64, length=200)
y=dnorm(x,mean=0,sd=1)
polygon(c(-4,x,-1.64),c(0,y,0),col="gray")

x=seq(-4, zvalue, length=200)
y=dnorm(x,mean=0,sd=1)
polygon(c(-4, x, zvalue),c(0,y,0),col=, lwd=2)

ticks = c(-1.64); axis(side = 1, at = ticks, line=.8)
ticks = c(zvalue); axis(side = 1, at = ticks, line=.05)

axis(1, -4:4, labels = round(-4:4*sdev/sqrt(n)+mu,2), line=3, 
     col="blue",col.ticks="blue",col.axis="blue")

axis(1, zvalue, labels = round(zvalue*sdev/sqrt(n)+mu,2), line=3, 
     col="blue",col.ticks="blue",col.axis="blue")

mtext("raw \ndata",1,line=3,at=-5,col="blue")
mtext("normalized\ndata",1,line=1.3,at=-4.7,col="black")

# LABELS
mylabel2 = bquote(italic(z-value) == .(format(zvalue, digits = 3)))
text(x = 3, y = .4, labels = mylabel2)
mylabel3 = bquote(italic(p-value) == .(format(prob, digits = 5)))
text(x = 3, y = .37, labels = mylabel3)

Fig. 6.24: Right sided

xbar=550; sdev=50; mu=500; n=5

prob=pnorm(550,mean=500,sd=50/sqrt(5));1-prob
## [1] 0.01267366
ztest=z.test(550,mu = 500,sd = 50/sqrt(5),alternative = "greater");ztest
## One Sample z-test with 550 
## z = 2.2361, n = 1.000, Std. Dev. = 22.361, SE of the sample mean =
## 22.361, p-value = 0.01267
## alternative hypothesis: true mean is greater than 500 
## 95 percent confidence interval:
##  513.22    Inf 
## sample estimates:
## mean of 550 
##         550
plot(ztest)

z1<-(xbar-mu)/(sdev/sqrt(n));z1
## [1] 2.236068
zvalue=round(z1,2);zvalue
## [1] 2.24
x=seq(-4,4,length=200)
y=dnorm(x,mean=0,sd=1)
plot(x,y,type="l",lwd=2,col="black", main="Comparison of mean test",
     sub="",  xlab=" ", ylab= "density")
mtext("one-tailed, greater than test")

x=seq(1.64, 4, length=200)
y=dnorm(x,mean=0,sd=1)
polygon(c(1.64, x, 4),c(0,y,0),col="gray")

x=seq(zvalue, 4, length=200)
y=dnorm(x,mean=0,sd=1)
polygon(c(zvalue, x, 4),c(0,y,0),col=, lwd=3)

ticks = c(zvalue, 1.64)
axis(side = 1, at = ticks, line=.7)

## Axes and labels
axis(1, -4:4, labels = round(-4:4*sdev/sqrt(n)+mu,2), line=3, 
     col="blue",col.ticks="blue",col.axis="blue")

axis(1, zvalue, labels = round(zvalue*sdev/sqrt(n)+mu,2), line=-6, 
     col="blue",col.ticks="blue",col.axis="blue")

mtext("raw \ndata",1,line=3,at=-5,col="blue")
mtext("normalized\ndata",1,line=1.3,at=-4.7,col="black")

# LABELS
mylabel2 = bquote(italic(z-test) == .(format(zvalue, digits = 3)))
text(x = 3, y = .4, labels = mylabel2)
mylabel3 = bquote(italic(p-value) == .(format(1-prob, digits = 5)))
text(x = 3, y = .37, labels = mylabel3)

T distribution

Figure 6.25:Positive Value, two-tailed test

xbar=170000; sdev=25000; mu1=166000; n1=35
t.testx=(xbar-mu1)/(sdev/sqrt(n1)); t.testx
## [1] 0.9465728
prob=pt(t.testx,df=n1-1, lower.tail=F)
pvalue=prob*2 # two tailed prob; 
pvalue
## [1] 0.3505358
tvalue=round(t.testx,2);tvalue
## [1] 0.95
tcrit=abs(qt(0.05/2, n1-1)); tcrit ## two-sided critical t value
## [1] 2.032245
x=seq(-4,4,length=200)
y=dt(x,df=n1-1)
plot(x,y,type="l",lwd=2,col="black", main="Comparison of mean test",
     sub="", xlab=" ", ylab= "density")
          mtext("t distribution")
x=seq(-4, -tcrit, length=200)
y=dt(x,df=n1-1)
polygon(c(-4,x,-tcrit),c(0,y,0),col="gray")

x=seq(tcrit, 4, length=200)
y=dt(x,df=n1-1)
polygon(c(tcrit, x, 4),c(0,y,0),col="gray")

x=seq(tvalue, 4, length=200)
y=dt(x,df=n1-1)
polygon(c(tvalue, x, 4),c(0,y,0),col=, lwd=3)
ticks = c(tvalue)
axis(side = 1, at = ticks)

axis(1, -4:4, labels = round(-4:4*sdev/sqrt(n)+mu1,0), line=3, 
     col="blue",col.ticks="blue",col.axis="blue")

axis(1, tvalue, labels = round(tvalue*sdev/sqrt(n)+mu1,0), line=.72, 
     col="blue",col.ticks="blue",col.axis="blue")
mtext("raw \ndata",1,line=3,at=-5,col="blue")
mtext("normalized\ndata",1,line=1.3,at=-4.7,col="black")

# LABELS
mylabel1 = bquote(italic(dof) == .(format(n1-1, digits = 3)))
text(x = 3, y = .4, labels = mylabel1)
mylabel2 = bquote(italic(t-test) == .(format(tvalue, digits = 3)))
text(x = 3, y = .37, labels = mylabel2)
mylabel3 = bquote(italic(p-value) == .(format(pvalue, digits = 5)))
text(x = 3, y = .34, labels = mylabel3)

Evaluation and Gender

Table 6.5: 2-sample t-test example from t.test

x=TeachingRatings

t.mean<-tapply(x$eval,x$gender, mean)
t.sd<- tapply(x$eval,x$gender, sd)
round(cbind(mean=t.mean, s.dev=t.sd),2)
##        mean s.dev
## male   4.07  0.56
## female 3.90  0.54

Figure 6.26:

t2.ex <- t.test(eval~gender,var.equal = FALSE, data=x)
t2.ex
## Welch Two Sample t-test with eval by gender 
## t = 3.2667, df = 425.756, p-value = 0.001176
## alternative hypothesis: true difference in means is not equal to 0 
## 95 percent confidence interval:
##  0.06691754 0.26909088 
## sample estimates:
##   mean in group male mean in group female 
##             4.069030             3.901026
pt(c(3.267), df=425.76, lower.tail=F)*2
## [1] 0.001174933
plot(t2.ex)

Fig. 6.27: positive value, two sided

x=TeachingRatings
t2 <- t.test(eval~gender,var.equal = FALSE, data=x); t2
## Welch Two Sample t-test with eval by gender 
## t = 3.2667, df = 425.756, p-value = 0.001176
## alternative hypothesis: true difference in means is not equal to 0 
## 95 percent confidence interval:
##  0.06691754 0.26909088 
## sample estimates:
##   mean in group male mean in group female 
##             4.069030             3.901026
df.x=as.numeric(t2[2]); df.x
## [1] 425.7558
tvalue=round(as.numeric(t2[1]),2);tvalue
## [1] 3.27
tcrit=abs(qt(0.05/2, df.x)); tcrit ## two-sided critical t value
## [1] 1.965551
x=seq(-4,4,length=200)
y=dt(x,df=df.x)
plot(x,y,type="l",lwd=2,col="black", main="Comparison of mean test",
     sub="", xlab=" ", ylab= "density")

mtext("t distribution")

x=seq(-4, -tcrit, length=200)
y=dt(x,df=df.x)
polygon(c(-4,x,-tcrit),c(0,y,0),col="gray")

x=seq(tcrit, 4, length=200)
y=dt(x,df=df.x)
polygon(c(tcrit, x, 4),c(0,y,0),col="gray")

x=seq(tvalue, 4, length=200)
y=dt(x,df=df.x)
polygon(c(tvalue, x, 4),c(0,y,0),lwd=3)
ticks = c(tvalue)
axis(side = 1, at = ticks)

# LABELS
mylabel1 = bquote(italic(dof) == .(format(df.x, digits = 5)))
text(x = 3, y = .4, labels = mylabel1)
mylabel2 = bquote(italic(t-test) == .(format(tvalue, digits = 3)))
text(x = 3, y = .37, labels = mylabel2)
mylabel3 = bquote(italic(p-value) == .(format(t2[3], digits = 3)))
text(x = 3, y = .34, labels = mylabel3)

Fig. 6.28: positive value, greater than

x=TeachingRatings
t2 <- t.test(eval~gender,var.equal = FALSE, data=x, alternative='greater');

df.x=as.numeric(t2[2]); df.x
## [1] 425.7558
tvalue=round(as.numeric(t2[1]),2);tvalue
## [1] 3.27
tcrit=abs(qt(0.05, df.x)); tcrit ## one-sided critical t value
## [1] 1.64844
x=seq(-4,4,length=200)
y=dt(x,df=df.x)
plot(x,y,type="l",lwd=2,col="black", main="Comparison of mean test",
     sub="", xlab=" ", ylab= "density")

mtext("t distribution")

x=seq(tcrit, 4, length=200)
y=dt(x,df=df.x)
polygon(c(tcrit, x, 4),c(0,y,0),col="gray")

x=seq(tvalue, 4, length=200)
y=dt(x,df=df.x)
polygon(c(tvalue, x, 4),c(0,y,0),lwd=3)

ticks = c(tvalue)
axis(side = 1, at = ticks)
ticks = c(round(tcrit,2))
axis(side = 1, at = ticks, line=1)

# LABELS
mylabel1 = bquote(italic(dof) == .(format(df.x, digits = 5)))
text(x = 3, y = .4, labels = mylabel1)
mylabel2 = bquote(italic(t-test) == .(format(tvalue, digits = 3)))
text(x = 3, y = .37, labels = mylabel2)
mylabel3 = bquote(italic(p-value) == .(format(t2[3], digits = 3)))
text(x = 3, y = .34, labels = mylabel3)

Fig. 6.29: T - test output

x=TeachingRatings
t2 <- t.test(eval~gender,var.equal = FALSE, data=x, alternative='greater'); t2
## Welch Two Sample t-test with eval by gender 
## t = 3.2667, df = 425.756, p-value = 0.0005881
## alternative hypothesis: true difference in means is greater than 0 
## 95 percent confidence interval:
##  0.0832263       Inf 
## sample estimates:
##   mean in group male mean in group female 
##             4.069030             3.901026
plot(t2)

Fig. 6.30: positive value, two sided, UnEqual variances

x=TeachingRatings
t2 <- t.test(eval~gender,var.equal = T, data=x); t2
##  Two Sample t-test with eval by gender 
## t = 3.2499, df = 461, p-value = 0.001239
## alternative hypothesis: true difference in means is not equal to 0 
## 95 percent confidence interval:
##  0.06641797 0.26959045 
## sample estimates:
##   mean in group male mean in group female 
##             4.069030             3.901026

Fig. 6.31: two tailed, equal variance

x=TeachingRatings
t2 <- t.test(eval~gender,var.equal = T, data=x); t2
##  Two Sample t-test with eval by gender 
## t = 3.2499, df = 461, p-value = 0.001239
## alternative hypothesis: true difference in means is not equal to 0 
## 95 percent confidence interval:
##  0.06641797 0.26959045 
## sample estimates:
##   mean in group male mean in group female 
##             4.069030             3.901026
df.x=as.numeric(t2[2]); df.x
## [1] 461
tvalue=round(as.numeric(t2[1]),2);tvalue
## [1] 3.25
tcrit=abs(qt(0.05/2, df.x)); tcrit ## two-sided critical t value
## [1] 1.965123
x=seq(-4,4,length=200)
y=dt(x,df=df.x)
plot(x,y,type="l",lwd=2,col="black", main="Comparison of mean test",
     sub="", xlab=" ", ylab= "density")

mtext("t distribution")

x=seq(-4, -tcrit, length=200)
y=dt(x,df=df.x)
polygon(c(-4,x,-tcrit),c(0,y,0),col="gray")

x=seq(tcrit, 4, length=200)
y=dt(x,df=df.x)
polygon(c(tcrit, x, 4),c(0,y,0),col="gray")

x=seq(tvalue, 4, length=200)
y=dt(x,df=df.x)
polygon(c(tvalue, x, 4),c(0,y,0),lwd=3)
ticks = c(tvalue)
axis(side = 1, at = ticks)

# LABELS
mylabel1 = bquote(italic(dof) == .(format(df.x, digits = 5)))
text(x = 3, y = .4, labels = mylabel1)
mylabel2 = bquote(italic(t-test) == .(format(tvalue, digits = 3)))
text(x = 3, y = .37, labels = mylabel2)
mylabel3 = bquote(italic(p-value) == .(format(t2[3], digits = 3)))
text(x = 3, y = .34, labels = mylabel3)

Fig. 6.32: T-test: positive value, Greater, Equal variances

x=TeachingRatings
t2 <- t.test(eval~gender,var.equal = TRUE, data=x, alternative='greater'); t2
##  Two Sample t-test with eval by gender 
## t = 3.2499, df = 461, p-value = 0.0006194
## alternative hypothesis: true difference in means is greater than 0 
## 95 percent confidence interval:
##  0.08280296        Inf 
## sample estimates:
##   mean in group male mean in group female 
##             4.069030             3.901026

Fig. 6.33: positive value, Greater, Equal variances

x=TeachingRatings
t2 <- t.test(eval~gender,var.equal = TRUE, data=x, alternative='greater'); t2
##  Two Sample t-test with eval by gender 
## t = 3.2499, df = 461, p-value = 0.0006194
## alternative hypothesis: true difference in means is greater than 0 
## 95 percent confidence interval:
##  0.08280296        Inf 
## sample estimates:
##   mean in group male mean in group female 
##             4.069030             3.901026
df.x=as.numeric(t2[2]); df.x
## [1] 461
tvalue=round(as.numeric(t2[1]),2);tvalue
## [1] 3.25
tcrit=abs(qt(0.05, df.x)); tcrit ## two-sided critical t value
## [1] 1.648166
x=seq(-4,4,length=200)
y=dt(x,df=df.x)
plot(x,y,type="l",lwd=2,col="black", main="Comparison of mean test",
     sub="", xlab=" ", ylab= "density")

mtext("t distribution")


x=seq(tcrit, 4, length=200)
y=dt(x,df=df.x)
polygon(c(tcrit, x, 4),c(0,y,0),col="gray")

x=seq(tvalue, 4, length=200)
y=dt(x,df=df.x)
polygon(c(tvalue, x, 4),c(0,y,0),lwd=3)

ticks = c(tvalue)
axis(side = 1, at = ticks)
ticks = c(round(tcrit,2))
axis(side = 1, at = ticks, line=1)

# LABELS
mylabel1 = bquote(italic(dof) == .(format(df.x, digits = 5)))
text(x = 3, y = .4, labels = mylabel1)
mylabel2 = bquote(italic(t-test) == .(format(tvalue, digits = 3)))
text(x = 3, y = .37, labels = mylabel2)
mylabel3 = bquote(italic(p-value) == .(format(t2[3], digits = 3)))
text(x = 3, y = .34, labels = mylabel3)

Worked out Example

x1=110000;s1=5000; n1=65
x2=125000;s2=15000; n2=45

# Assuming equal variances
vpool= (s1^2*(n1-1)+s2^2*(n2-1))/(n1+n2-2); vpool
## [1] 106481481
sdev = sqrt(vpool*(n1+n2)/(n1*n2)); sdev
## [1] 2001.108
t = (x1-x2)/sdev ;t
## [1] -7.495849
dof= n1+n2-2; dof
## [1] 108

Regression, Annova, Chisquare

Figures 6.35 and 6.36: Regression

x<-TeachingRatings

# Gender and eval
x$gen2<-factor(x$gender, levels = c("female", "male"))
cbind(mean.eval=tapply(x$eval, x$gen2,mean),observations=table(x$gen2))
##        mean.eval observations
## female  3.901026          195
## male    4.069030          268
plot(x$gen2,x$eval, pch=20)

# Fig. 6.35
t2 <- t.test(eval~gender,var.equal = TRUE, data=x); t2
##  Two Sample t-test with eval by gender 
## t = 3.2499, df = 461, p-value = 0.001239
## alternative hypothesis: true difference in means is not equal to 0 
## 95 percent confidence interval:
##  0.06641797 0.26959045 
## sample estimates:
##   mean in group male mean in group female 
##             4.069030             3.901026
# Fig. 6.36
summary(lm(eval~gen2, data=x))
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.90103    0.03933   99.19  < 2e-16 ***
## gen2male     0.16800    0.05169    3.25  0.00124 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5492 on 461 degrees of freedom
## Multiple R-squared: 0.0224,  Adjusted R-squared: 0.02028 
## F-statistic: 10.56 on 1 and 461 DF,  p-value: 0.001239

Figures 6.37 and 6.38 Anova

#age as a three factor
x$f.age <- cut(x$age, breaks = 3)
x$f.age <- factor(x$f.age,labels=c("young", "mid age", "old"))
# plot(x$f.age,x$eval, pch=20)
cbind(mean.eval=tapply(x$eval, x$f.age,mean),observations=table(x$f.age))
##         mean.eval observations
## young    4.004348          161
## mid age  4.036036          222
## old      3.881250           80
# plot(x$age,x$eval,pch=20)

# Fig. 6.37
summary(lm(eval~f.age, data=x))
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   4.00435    0.04361  91.831   <2e-16 ***
## f.agemid age  0.03169    0.05727   0.553    0.580    
## f.ageold     -0.12310    0.07568  -1.626    0.105    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5533 on 460 degrees of freedom
## Multiple R-squared: 0.00997, Adjusted R-squared: 0.005665 
## F-statistic: 2.316 on 2 and 460 DF,  p-value: 0.09981
# Fig. 6.38
summary(aov(eval~f.age, data=x))
##              Df Sum Sq Mean Sq F value Pr(>F)  
## f.age         2   1.42  0.7090   2.316 0.0998 .
## Residuals   460 140.82  0.3061                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Fig. 6.39, Annova with beauty

## beauty
x$f.beauty<-cut(x$beauty, breaks=3)
x$f.beauty<-factor(x$f.beauty, labels=c("low beauty", "average looking", "good looking"))
cbind(mean.eval=tapply(x$eval, x$f.beauty,mean),observations=table(x$f.beauty))
##                 mean.eval observations
## low beauty       3.945856          181
## average looking  3.988177          203
## good looking     4.144304           79
# Fig. 6.39:
summary(aov(eval~f.beauty, data=x))
##              Df Sum Sq Mean Sq F value Pr(>F)  
## f.beauty      2    2.2  1.1013   3.618 0.0276 *
## Residuals   460  140.0  0.3044                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Fig. 6.40: CHI SQUARE

t1<-table(x$gender,x$tenure);t1
##         
##           no yes
##   male    52 216
##   female  50 145
round(prop.table(t1,1)*100,2)
##         
##             no   yes
##   male   19.40 80.60
##   female 25.64 74.36
chisq.test(t1, correct=F)
## Pearson's Chi-squared test with t1 
## X-squared = 2.5571, df = 1, p-value = 0.1098
r1<-margin.table(t1, 1) #  (summed over rows) 
c1<-margin.table(t1, 2) #  (summed over columns)
r1;c1
## 
##   male female 
##    268    195
## 
##  no yes 
## 102 361
e1<-r1%*%t(c1)/sum(t1);e1
##         
##                no     yes
##   male   59.04104 208.959
##   female 42.95896 152.041
t2<-(t1-e1)^2/e1;t2;sum(t2)
##         
##                 no       yes
##   male   0.8396905 0.2372533
##   female 1.1540362 0.3260712
## [1] 2.557051
qchisq(.95, df = 1)
## [1] 3.841459
1-pchisq(sum(t2),(length(r1)-1)*(length(c1)-1))
## [1] 0.1098032