library(purrr)
n0 <- 1000
### Sample from standard normal distribution
X <- rnorm(n0)
### Function that returns sample skewness
getb1 <- function(X){
### Numerator
num <- mean((X - mean(X)) ^ 3)
### Denominator
den <- (sum((X - mean(X)) ^ 2) / (length(X) - 1)) ^ (3 / 2)
### Sample skewness
b1 <- num / den
b1
}
### Function that returns sample kurtosis
getb2 <- function(X){
### Numerator
num <- mean((X - mean(X)) ^ 4)
### Denominator
den <- (mean((X - mean(X)) ^ 2)) ^ 2
### Sample kurtosis
b2 <- num / den
b2
}
### Quick check
getb1(X)
[1] 0.09172504
getb2(X)
[1] 2.853368
n <- 50
### Any number greater than 500 would work
B <- 1000
### Sample from standard normal distribution
X <- rnorm(n)
bstar <- map(seq_len(B), function(b){
### Bootstrap sample
Xstar <- sample(X, n, replace = TRUE)
### Bootstrap estimate
b1 <- getb1(Xstar)
b2 <- getb2(Xstar)
c(skewness = b1, kurtosis = b2)
})
### Combine as a data.frame (for map later)
bstar <- as.data.frame(do.call(rbind, bstar))
### Standard error
(SEstar <- map_dbl(bstar, function(x){
sqrt(mean((x - mean(x)) ^ 2))
}))
skewness kurtosis
0.2860932 0.5055912
### Estimate from the original sample
b <- c(skewness = getb1(X), kurtosis = getb2(X))
### Set default confidence level
alp <- 0.1
### Lower bound
normCI.L <- b - qnorm(alp / 2, lower.tail = FALSE) * SEstar
### Upper bound
normCI.U <- b + qnorm(alp / 2, lower.tail = FALSE) * SEstar
### Normal bootstrap CI
rbind(lower = normCI.L, upper = normCI.U)
skewness kurtosis
lower -0.2826211 1.993980
upper 0.6585419 3.657227
### Sample quantile
qstar <- map(bstar, quantile, c(alp / 2, 1 - alp / 2))
### Combine as a matrix
qstar <- do.call(cbind, qstar)
### Lower bound
pivCI.L <- 2 * b - qstar[2, ]
### Upper bound
### Note that both are minus
pivCI.U <- 2 * b - qstar[1, ]
### Pivotal boostrap CI
rbind(lower = pivCI.L, upper = pivCI.U)
skewness kurtosis
lower -0.2370707 2.001658
upper 0.6915765 3.607317
### Percentile bootstrap CI
perCI <- qstar
rownames(perCI) <- c("lower", "upper")
perCI
skewness kurtosis
lower -0.3156556 2.043890
upper 0.6129915 3.649549
M <- 500
mu3 <- 0
mu4 <- 3
### Repeat for M times
q1c <- map(seq_len(M), function(m) {
set.seed(m)
### Sample from standard normal distribution
X <- rnorm(n)
bstar <- map(seq_len(B), function(b) {
### Bootstrap sample
Xstar <- sample(X, n, replace = TRUE)
### Bootstrap estimate
b1 <- getb1(Xstar)
b2 <- getb2(Xstar)
c(skewness = b1, kurtosis = b2)
})
### Combine as a data.frame (for map later)
bstar <- as.data.frame(do.call(rbind, bstar))
### Standard error
SEstar <- map_dbl(bstar, function(x) {
sqrt(mean((x - mean(x))^2))
})
### Estimate from the original sample
b <- c(skewness = getb1(X), kurtosis = getb2(X))
### Lower bound
normCI.L <- b - qnorm(alp / 2, lower.tail = FALSE) * SEstar
### Upper bound
normCI.U <- b + qnorm(alp / 2, lower.tail = FALSE) * SEstar
### Normal bootstrap CI
normCI <- rbind(lower = normCI.L, upper = normCI.U)
### Sample quantile
qstar <- map(bstar, quantile, c(alp / 2, 1 - alp / 2))
### Combine as a matrix
qstar <- do.call(cbind, qstar)
### Lower bound
pivCI.L <- 2 * b - qstar[2, ]
### Upper bound
### Note that both are minus
pivCI.U <- 2 * b - qstar[1, ]
### Pivotal boostrap CI
pivCI <- rbind(lower = pivCI.L, upper = pivCI.U)
### Percentile bootstrap CI
perCI <- qstar
rownames(perCI) <- c("lower", "upper")
### Save all CI as a list
CI <- list(normCI = normCI, pivCI = pivCI, perCI = perCI)
### Indicator for coverage, TRUE as actually covered
q1cm <- map(CI, function(ci) {
### Check for each CI
map2_lgl(as.data.frame(ci), c(mu3, mu4), function(x, y) {
(x[1] < y) & (y < x[2])
})
})
### Logical vector ouput
unlist(q1cm, recursive = FALSE)
})
### Compute the proportion
colMeans(do.call(rbind, q1c))
normCI.skewness normCI.kurtosis pivCI.skewness pivCI.kurtosis perCI.skewness
0.826 0.750 0.786 0.690 0.858
perCI.kurtosis
0.788
Another way is to write a function that output CI for each method first, and then map over different methods.
### A function that returns estimates from the original sample and bootstrap samples
getTTstar <- function(getT){
### Sample from standard normal distribution
X <- rnorm(n)
### Estimate from the original sample
T <- getT(X)
Tstar <- map_dbl(seq_len(B), function(b) {
### Bootstrap sample
Xstar <- sample(X, n, replace = TRUE)
### Bootstrap estimate
tstar <- getT(Xstar)
tstar
})
list(T = T, Tstar = Tstar)
}
### A function that asks for statistic as input: getT = getb1
getCI <- function(getT){
TTstar <- getTTstar(getT)
### Estimate from the original sample
T <- TTstar$T
### Bootstrap estimates
Tstar <- TTstar$Tstar
### Standard error
SEstar <- sqrt(mean((Tstar - mean(Tstar))^2))
### Lower bound
normCI.L <- T - qnorm(alp / 2, lower.tail = FALSE) * SEstar
### Upper bound
normCI.U <- T + qnorm(alp / 2, lower.tail = FALSE) * SEstar
### Normal bootstrap CI
normCI <- c(lower = normCI.L, upper = normCI.U)
### Sample quantile
qstar <- quantile(Tstar, c(alp / 2, 1 - alp / 2))
### Lower bound
pivCI.L <- 2 * T - qstar[2]
### Upper bound
### Note that both are minus
pivCI.U <- 2 * T - qstar[1]
### Pivotal boostrap CI
pivCI <- c(lower = pivCI.L, upper = pivCI.U)
### Percentile bootstrap CI
perCI <- qstar
names(perCI) <- c("lower", "upper")
### Save all results as a list
### All outputs for q1b
q1b <- list(SEstar = SEstar, normCI = normCI, pivCI = pivCI, perCI = perCI)
q1b
}
### skewness
getCI(getb1)
$SEstar
[1] 0.2839336
$normCI
lower upper
-0.7305260 0.2035323
$pivCI
lower.95% upper.5%
-0.7127563 0.2143669
$perCI
lower upper
-0.7413606 0.1857625
### kurtosis
getCI(getb2)
$SEstar
[1] 0.3653069
$normCI
lower upper
1.751062 2.952815
$pivCI
lower.95% upper.5%
1.636960 2.815057
$perCI
lower upper
1.888820 3.066917
q1c <- map2(list(getb1, getb2), c(mu3, mu4), function(getT, y) {
### Indicator for coverage, TRUE as actually covered
I <- map(seq_len(M), function(m) {
set.seed(m)
### Check for each CI
map_lgl(getCI(getT)[-1], function(x) {
(x[1] < y) & (y < x[2])
})
})
### Compute the proportion
colMeans(do.call(rbind, I))
})
names(q1c) <- c("skewness", "kurtosis")
q1c
$skewness
normCI pivCI perCI
0.826 0.786 0.858
$kurtosis
normCI pivCI perCI
0.750 0.690 0.788
In this case, the coverage for all 3 confidence intervals are close, with percentile confidence interval having the highest coverage rate. Also note that the coverage rate is closer to \(90 \%\) when estimating the skewness, which means these confidence intervals are more reliable compared with those from kurtosis. When estimating kurtosis, we are likely to have a larger variation due to the fact that it is a measure of “tailedness”. which might lead us to lower coverage rate.
uber <- read.csv("uber.csv")
str(uber)
'data.frame': 1156 obs. of 7 variables:
$ START_DATE.: chr "1/1/2016 21:11" "1/2/2016 1:25" "1/2/2016 20:25" "1/5/2016 17:31" ...
$ END_DATE. : chr "1/1/2016 21:17" "1/2/2016 1:37" "1/2/2016 20:38" "1/5/2016 17:45" ...
$ CATEGORY. : chr "Business" "Business" "Business" "Business" ...
$ START. : chr "Fort Pierce" "Fort Pierce" "Fort Pierce" "Fort Pierce" ...
$ STOP. : chr "Fort Pierce" "Fort Pierce" "Fort Pierce" "Fort Pierce" ...
$ MILES. : num 5.1 5 4.8 4.7 63.7 4.3 7.1 0.8 8.3 16.5 ...
$ PURPOSE. : chr "Meal/Entertain" "" "Errand/Supplies" "Meeting" ...
A point estimate for the difference \(\mu_1 - \mu_2\) between the population means would be the difference between the sample means.
### Note that there are actually 3 levels
(lvs <- unique(uber$CATEGORY.))
[1] "Business" "Personal" ""
### Indicator for population 1 and 2
I1 <- (uber$CATEGORY. == lvs[1])
I2 <- (uber$CATEGORY. == lvs[2])
### Seperate the data set
X1 <- uber[I1, "MILES."]
X2 <- uber[I2, "MILES."]
### Point estimate
getT <- function(X1, X2){
mean(X1) - mean(X2)
}
getT(X1, X2)
[1] 1.335065
Assume the common standard deviation is \(\sigma\), by CLT, we have \[ \widehat{\mu}_1 := \overline{X}_1 \sim N \left(\mu_1, \frac{\sigma^2}{n_1} \right) , \qquad \text{and} \qquad \widehat{\mu}_2 := \overline{X}_2 \sim N \left(\mu_2, \frac{\sigma^2}{n_2} \right) , \] where \(n_1\) and \(n_2\) are the number of observations for population 1 and 2 respectively.
As all observations are independent, we have \(\widehat{\mu}_1\) and \(\widehat{\mu}_2\) to be independent. Therefore, we have \[ \widehat{\mu}_1 - \widehat{\mu}_2 \sim N \left(\mu_1 - \mu_2, \frac{\sigma^2}{n_1} + \frac{\sigma^2}{n_2} \right) = N \left(\mu_1 - \mu_2, \sigma^2 \left(\frac{1}{n_1} + \frac{1}{n_2} \right) \right) . \] So, a point estimate of the standard error for \(\widehat{\mu}_1 - \widehat{\mu}_2\) is \(\sqrt{\widehat{\sigma^2} \left(1 / n_1 + 1 / n_2 \right)}\), where \(\widehat{\sigma^2}\) is the variance estimate from the whole sample.
### Variance estimate
sig2.est <- var(c(X1, X2))
n1 <- length(X1)
n2 <- length(X2)
### SE estimate
sqrt(sig2.est * (1 / n1 + 1 / n2))
[1] 2.54548
B <- 1000
### A function that returns estimates from the original sample and bootstrap samples
getTTstar <- function(getT){
### Estimate from the original sample
T <- getT(X1, X2)
Tstar <- map_dbl(seq_len(B), function(b) {
### Bootstrap sample
X1star <- sample(X1, n1, replace = TRUE)
X2star <- sample(X2, n2, replace = TRUE)
### Bootstrap estimate
tstar <- getT(X1star, X2star)
tstar
})
list(T = T, Tstar = Tstar)
}
### Set the confidence level
alp <- 0.05
getCI(getT)
$SEstar
[1] 2.369901
$normCI
lower upper
-3.309855 5.979985
$pivCI
lower.97.5% upper.2.5%
-2.180867 6.794369
$perCI
lower upper
-4.124239 4.850997
We obtain a reasonable estimate from bootstrap compared with the result from part b.
t.test(X1, X2, conf.level = 0.95)
Welch Two Sample t-test
data: X1 and X2
t = 0.53268, df = 87.643, p-value = 0.5956
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-3.646029 6.316159
sample estimates:
mean of x mean of y
10.655844 9.320779
The confindence interval from bootstrap as shown in part c is similar to what we obtained from t.test
. Both of them cover 0, so we do NOT have enough evidence to reject the null hypothesis that the difference in population means is 0, which means it is plausible that the population means from two groups are the same.
(Not graded)
A point estimate for the difference \(\theta_1 - \theta_2\) between the population medians would be the difference between the sample medians.
### Point estimate
getT <- function(X1, X2){
median(X1) - median(X2)
}
getT(X1, X2)
[1] 1.9
(Not graded)
getCI(getT)
$SEstar
[1] 0.6323559
$normCI
lower upper
0.6606052 3.1393948
$pivCI
lower.97.5% upper.2.5%
0.89875 3.20250
$perCI
lower upper
0.59750 2.90125
getT <- function(X) {
### Two-sided
abs(mu0 - mean(X))
}
perm.test <- function(X, mu0 = 3, getT, B = 1000, alp) {
### Test statistic from original data set
T <- getT(X)
### Vector of permutation test statistic from permutation data set
Tperm <- map_dbl(seq_len(B), function(b) {
### Sign for permutation
sign <- sample(c(-1, 1), length(X), replace = TRUE)
### Permutation data set
Xperm <- sign * (X - mu0) + mu0
### Test statistic
getT(Xperm)
})
### p-value
pv <- mean(Tperm >= T)
### Indicator, TRUE means reject the null hypothesis
I <- (pv <= alp)
list(p.value = pv, Decision = I)
}
n <- 1000
B <- 1000
mu0 <- 3
### Set the confidence level
alp <- 0.1
### Unit test
### Symmetric X around mu0
X <- rnorm(n, mean = mu0)
testthat::expect_equal(perm.test(X, mu0, getT, B, alp)$Decision, FALSE)
### Usymmetric X around mu0
X <- rexp(n)
testthat::expect_equal(perm.test(X, mu0, getT, B, alp)$Decision, TRUE)
drug1 <- sleep$extra[sleep$group == 1]
drug2 <- sleep$extra[sleep$group == 2]
str(drug1)
num [1:10] 0.7 -1.6 -0.2 -1.2 -0.1 3.4 3.7 0.8 0 2
str(drug2)
num [1:10] 1.9 0.8 1.1 0.1 -0.1 4.4 5.5 1.6 4.6 3.4
getT <- function(X1, X2) {
abs(mean(X1) - mean(X2))
}
### Test statistic from original sample
T <- getT(drug1, drug2)
n <- length(drug1)
B <- 1000
### Method 1 for creating permutation data sets:
Tperm <- map_dbl(seq_len(B), function(b) {
### Sign for permutation
sign <- sample(c(-1, 1), length(drug1), replace = TRUE)
### Difference between observations without absolute value
drug.diff <- drug1 - drug2
### Permutation difference
### -1 means labels are shuffled for that subject
drug.diff.perm <- sign * drug.diff
### Permutation test statistic
### Absolute value of difference in mean is equal to absolution value of mean in difference when we have same number of observations
abs(mean(drug.diff.perm))
})
### p-value
(pv <- mean(Tperm >= T))
[1] 0.007
### Decision, TRUE means reject the null hypothesis
pv <= alp
[1] TRUE
### Method 2 for creating permutation data sets:
drug <- data.frame(rbind(drug1, drug2))
Tperm <- map_dbl(seq_len(B), function(b) {
### Create chosen index
I <- sample(1:2, n, replace = TRUE)
### Permutation data sets
drug1perm <- map2_dbl(drug, I, `[`)
drug2perm <- map2_dbl(drug, -I, `[`)
### Permutation test statistic
getT(drug1perm, drug2perm)
})
### p-value
(pv <- mean(Tperm >= T))
[1] 0.006
### Decision, TRUE means reject the null hypothesis
pv <= alp
[1] TRUE
As p-value is less than \(0.1\), we have sufficient evidence to reject the null and claim that two group means are different.
(Not graded)
n <- 20
B <- 1000
alp <- 0.1
getT <- function(X) {
### Two-sided
abs(mu0 - mean(X))
}
mu0 <- 3
### True mean is 3
mu <- 3
### Normal
X <- rnorm(n, mean = mu, sd = 1)
perm.test(X, mu0, getT, B, alp)
$p.value
[1] 0.203
$Decision
[1] FALSE
t.test(X, mu = mu0, alternative = "two.sided", conf.level = 1 - alp)
One Sample t-test
data: X
t = -1.325, df = 19, p-value = 0.2009
alternative hypothesis: true mean is not equal to 3
90 percent confidence interval:
2.148410 3.112694
sample estimates:
mean of x
2.630552
### Exponential
X <- rexp(n, rate = 1 / mu)
perm.test(X, mu0, getT, B, alp)
$p.value
[1] 0.86
$Decision
[1] FALSE
t.test(X, mu = mu0, alternative = "two.sided", conf.level = 1 - alp)
One Sample t-test
data: X
t = 0.18494, df = 19, p-value = 0.8552
alternative hypothesis: true mean is not equal to 3
90 percent confidence interval:
2.009781 4.227402
sample estimates:
mean of x
3.118592
### True mean is 4
mu <- 4
### Normal
X <- rnorm(n, mean = mu, sd = 1)
perm.test(X, mu0, getT, B, alp)
$p.value
[1] 0.001
$Decision
[1] TRUE
t.test(X, mu = mu0, alternative = "two.sided", conf.level = 1 - alp)
One Sample t-test
data: X
t = 4.2467, df = 19, p-value = 0.0004363
alternative hypothesis: true mean is not equal to 3
90 percent confidence interval:
3.696159 4.652432
sample estimates:
mean of x
4.174296
### Exponential
X <- rexp(n, rate = 1 / mu)
perm.test(X, mu0, getT, B, alp)
$p.value
[1] 0.266
$Decision
[1] FALSE
t.test(X, mu = mu0, alternative = "two.sided", conf.level = 1 - alp)
One Sample t-test
data: X
t = 1.148, df = 19, p-value = 0.2652
alternative hypothesis: true mean is not equal to 3
90 percent confidence interval:
2.376994 6.084246
sample estimates:
mean of x
4.23062
(Not graded)
### Try 1000 instead of 100
M <- 1000
### True mean is 3
mu <- 3
### Normal
q3d <- map(seq_len(M), function(m){
X <- rnorm(n, mean = mu, sd = 1)
### TRUE means reject the null hypothesis
perm.dec <- perm.test(X, mu0, getT, B, alp)$Decision
t.dec <- t.test(X, mu = mu0, alternative = "two.sided", conf.level = 1 - alp)$p.value <= alp
c(perm = perm.dec, t = t.dec)
})
q3d <- do.call(rbind, q3d)
### Proportion of rejctions
### Type I error / size
(sizeq3d1 <- colMeans(q3d))
perm t
0.105 0.103
### Exponential
q3d <- map(seq_len(M), function(m){
X <- rexp(n, rate = 1 / mu)
### TRUE means reject the null hypothesis
perm.dec <- perm.test(X, mu0, getT, B, alp)$Decision
t.dec <- t.test(X, mu = mu0, alternative = "two.sided", conf.level = 1 - alp)$p.value <= alp
c(perm = perm.dec, t = t.dec)
})
q3d <- do.call(rbind, q3d)
### Proportion of rejctions
### Type I error / size
(sizeq3d2 <- colMeans(q3d))
perm t
0.131 0.129
### True mean is 4
mu <- 4
### Normal
q3d <- map(seq_len(M), function(m){
X <- rnorm(n, mean = mu, sd = 1)
### TRUE means reject the null hypothesis
perm.dec <- perm.test(X, mu0, getT, B, alp)$Decision
t.dec <- t.test(X, mu = mu0, alternative = "two.sided", conf.level = 1 - alp)$p.value <= alp
c(perm = perm.dec, t = t.dec)
})
q3d <- do.call(rbind, q3d)
### Proportion of rejctions
### Power
(powerq3d1 <- colMeans(q3d))
perm t
0.997 0.997
### Exponential
q3d <- map(seq_len(M), function(m){
X <- rexp(n, rate = 1 / mu)
### TRUE means reject the null hypothesis
perm.dec <- perm.test(X, mu0, getT, B, alp)$Decision
t.dec <- t.test(X, mu = mu0, alternative = "two.sided", conf.level = 1 - alp)$p.value <= alp
c(perm = perm.dec, t = t.dec)
})
q3d <- do.call(rbind, q3d)
### Proportion of rejctions
### Power
(powerq3d2 <- colMeans(q3d))
perm t
0.225 0.224
The size for exponential case with true mean equal to 3 is 0.131, which is inflated because the exponential distribution is not symmetric.
(Not graded)
We can rewrite the pmf for \(X\) and \(Y\) separately, \[ P (X = x) = \begin{cases} \frac{1}{2} - \frac{1}{n}, & \text{if } x = 0, \\ \frac{1}{2}, & \text{if } x = 1, \\ \frac{1}{n}, & \text{if } x = 1. \end{cases} = \begin{cases} \frac{1}{2} - \frac{1}{n}, & \text{if } x = 0, \\ \frac{1}{2} + \frac{1}{n}, & \text{if } x = 1. \end{cases} , \] \[ P (Y = y) = \begin{cases} \frac{1}{2} - \frac{1}{n}, & \text{if } y = 0, \\ \frac{1}{2}, & \text{if } y = 1, \\ \frac{1}{n}, & \text{if } y = - \frac{n}{2}. \end{cases} . \] So, we have \[ E (X) = \left(\frac{1}{2} - \frac{1}{n} \right) \times 0 + \left(\frac{1}{2} + \frac{1}{n} \right) \times 1 = \frac{1}{2} + \frac{1}{n} , \] \[ E (Y) = \left(\frac{1}{2} - \frac{1}{n} \right) \times 0 + \frac{1}{2} \times 1 + \frac{1}{n} \times \left(- \frac{n}{2} \right) = 0 , \] \[ E (X Y) = \left(\frac{1}{2} - \frac{1}{n} \right) \times (0 \times 0) + \frac{1}{2} \times (1 \times 1) + \frac{1}{n} \times \left(1 \times \left(- \frac{n}{2} \right) \right) = 0 . \] So, we have the correlation as \[\begin{align*} Cor (X, Y) = & \frac{E (X Y) - E (X) E (Y)}{\sqrt{E (X ^ 2) - E (X) ^ 2} \sqrt{E (Y ^ 2) - E (Y) ^ 2}} \\ = & \frac{0 - \left(\frac{1}{2} + \frac{1}{n} \right) \times 0}{\sqrt{E (X ^ 2) - E (X) ^ 2} \sqrt{E (Y ^ 2) - E (Y) ^ 2}} \\ = & 0 . \end{align*}\]
n <- 50
### Generate sample
XY <- sample(list(c(X = 0, Y = 0), c(X = 1, Y = 1), c(X = 1, Y = - n / 2)), n, replace = TRUE, prob = c(1 / 2 - 1 / n, 1 / 2, 1 / n))
XY <- data.frame(do.call(rbind, XY))
### Visualize
plot(XY)
M <- 100
alp <- 0.05
B <- 1000
getT <- function(XY) {
with(XY, cor(X, Y))
}
q4c <- map_lgl(seq_len(M), function(m) {
### Generate sample
XY <- sample(list(c(X = 0, Y = 0), c(X = 1, Y = 1), c(X = 1, Y = -n / 2)), n, replace = TRUE, prob = c(1 / 2 - 1 / n, 1 / 2, 1 / n))
XY <- data.frame(do.call(rbind, XY))
### Test statistic for the original sample
T <- getT(XY)
Tperm <- map_dbl(seq_len(B), function(b) {
### Index for shuffling
I <- sample(n)
XYperm <- XY
### Shuffling X
XYperm$X <- XYperm$X[I]
### Test statistic for permutation sample
getT(XYperm)
})
### p-value
pv <- mean(Tperm >= T)
### Decision, TRUE means reject the null hypothesis
pv <= alp
})
### Proportion of rejections
### Power
mean(q4c)
[1] 0.29
If we are really testing for correlation as we think, then we should obtain the size as the sample is uncorrelated (null hypothesis is true), and close to \(\alpha = 0.05\), which is not the case here. We are actually testing for independence and the number represent the power of test, as the sample we generated are dependent (null hypothesis is false). That means the permutation test for correlation is not applicable as it is actually testing for independence.