# Hypothesis Testing

#One sided confidence intervals

#For p
library(regclass)
#Load the CUSTREACQUIRE data set (provided by the regclass package loaded above)
#and print a summary of every column.
#NOTE: string literals must use straight quotes ("); the curly quotes produced
#by word processors are not valid R syntax.
data("CUSTREACQUIRE")
summary(CUSTREACQUIRE)

#Breakdown of the Reacquire column
summary(CUSTREACQUIRE$Reacquire)
#Sample proportion of rows whose Reacquire value is "Yes"
mean(CUSTREACQUIRE$Reacquire == "Yes")

#Old reacquire policy got 60% of churned customers. New one is cheaper, and may not be as effective
#Ho: p=0.6 vs. HA: p < 0.6
#One-sided exact binomial test with 295 successes out of 500 trials.
#The result is stored so its components (e.g. $conf.int) can be inspected,
#then printed.
reacquire.test <- binom.test(295, 500, alternative = "less", p = 0.6)
reacquire.test
# 95 percent confidence interval:
# 0.0000000 0.6267122
#60% is still a plausible value for p, retain Ho

#Is the average lifetime value 2 larger than lifetime value 1?
#Ho: mu2 = mu1 vs. HA: mu2 > mu1
#Ho: mu2 - mu1 = 0 vs HA: mu2 - mu1 > 0
#Keep only the rows where Reacquire is "Yes", then run a paired, one-sided
#t-test on Lifetime2 - Lifetime1 within that subset.
SUB <- subset(CUSTREACQUIRE, Reacquire == "Yes")
summary(SUB)
t.test(SUB$Lifetime2, SUB$Lifetime1, paired = TRUE, alternative = "greater")
# 95 percent confidence interval:
# 110.8438 Inf

#Median Age < 53?
median(CUSTREACQUIRE$Age)
#One-sided bootstrap percentile intervals:
# < alternative wants (-Inf, quantile(,.95) )
# > alternative wants ( quantile(,.05), Inf )
#Ho: median = 53
#HA: median < 53

#Bootstrap the median: resample Age with replacement and record the median of
#each resample. The result vector is preallocated instead of being grown
#inside the loop, and seq_len() is used for the loop index.
n.boot <- 4999
boot.medians <- numeric(n.boot)
for (i in seq_len(n.boot)) {
  boot.sample <- sample(CUSTREACQUIRE$Age, replace = TRUE)
  boot.medians[i] <- median(boot.sample)
}
hist(boot.medians)

#One-sided 95% interval for the "<" alternative: upper bound is the 95th
#percentile of the bootstrap medians.
c(-Inf, quantile(boot.medians, .95))

#Clustering

#Build the observed data: 100 points placed uniformly on the square
#[30,40] x [70,80]; rows 1-7 and 21-28 are then overwritten with points drawn
#from two small sub-squares. Seeds are fixed before each draw.
set.seed(471)
DATA <- data.frame(x = runif(100, 30, 40), y = runif(100, 70, 80))
set.seed(472)
DATA[1:7, ] <- data.frame(x = runif(7, 31.1, 32.9), y = runif(7, 71.1, 72.9))
set.seed(473)
DATA[21:28, ] <- data.frame(x = runif(8, 35.1, 36.8), y = runif(8, 76.0, 77.8))
plot(DATA, pch = 20, cex = 2)

#Is there clustering? Or is there not?

#Step 1
#Ho: distribution of points IS random (=)
#HA: distribution of points IS NOT random (not =)

#Step 2
#Test statistic: the average distance to each point's 5th nearest neighbor

#Helper: average distance from each point to its k-th nearest neighbor.
#  pts: data frame / matrix of coordinates (one row per point)
#  k:   which nearest neighbor to use (default 5)
#Each column of the distance matrix contains the point's distance to itself
#(0), which sorts first, so the k-th nearest neighbor is at position k + 1.
avg.knn.dist <- function(pts, k = 5) {
  mean(apply(as.matrix(dist(pts)), 2, function(x) sort(x)[k + 1]))
}

DISTANCE <- as.matrix(dist(DATA))
#Get the distance to the 5th nearest neighbor for each point
apply(DISTANCE, 2, function(x) sort(x)[6])
#Get average 5th nearest neighbor distance and keep it, so the p-value below
#uses the exact observed value rather than a hand-copied rounded number
obs.stat <- avg.knn.dist(DATA)
obs.stat

#What is the distribution "under the null" of the test statistic
#i.e. what is the distribution of the average 5th nearest neighbor distance when points
#are randomly placed on this square?
#Simulate this with a Monte Carlo simulation
null.stat <- numeric(500)
for (i in seq_along(null.stat)) {
  #Place points in the square at random. A separate object is used so the
  #observed DATA is not overwritten by the simulation.
  SIM <- data.frame(x = runif(100, 30, 40), y = runif(100, 70, 80))
  null.stat[i] <- avg.knn.dist(SIM)
}
hist(null.stat)

#Step 3: get pvalue
#Proportion of simulated statistics at or below the observed one
p.value <- mean(null.stat <= obs.stat)
p.value
#p-value reported in the original notes: 0.106 (computed there with the
#rounded cutoff 1.29358)

#Interpretation of this p-value
#If the points are placed purely at random, the probability of measuring an average
#5th nearest neighbor distance of 1.29358 or smaller (i.e. a value giving at least as
#much evidence for the alternative) is 10.6%.

#There’s a 10.6% chance of observing our data or data that is even more evidence for the
#alternative (i.e. that there’s clustering) when the points are actually being placed
#at random (i.e. the null is true).

#if p-value < 5% reject Ho, if 5% or bigger, retain Ho
#WE RETAIN HO!