library(purrr)

1.

a.

n0 <- 1000

### Sample from standard normal distribution
X <- rnorm(n0)

### Function that returns sample skewness
getb1 <- function(X){
  ### Numerator
  num <- mean((X - mean(X)) ^ 3)
  
  ### Denominator
  den <- (sum((X - mean(X)) ^ 2) / (length(X) - 1)) ^ (3 / 2)
  
  ### Sample skewness
  b1 <- num / den
  b1
}

### Function that returns sample kurtosis
getb2 <- function(X){
  ### Numerator
  num <- mean((X - mean(X)) ^ 4)
  
  ### Denominator
  den <- (mean((X - mean(X)) ^ 2)) ^ 2
  
  ### Sample kurtosis
  b2 <- num / den
  b2
}

### Quick check
getb1(X)

[1] 0.09172504

getb2(X)

[1] 2.853368

b.

n <- 50

### Any number greater than 500 would work
B <- 1000

### Sample from standard normal distribution
X <- rnorm(n)

bstar <- map(seq_len(B), function(b){
  ### Bootstrap sample
  Xstar <- sample(X, n, replace = TRUE)
  
  ### Bootstrap estimate
  b1 <- getb1(Xstar)
  b2 <- getb2(Xstar)
  
  c(skewness = b1, kurtosis = b2)
})

### Combine as a data.frame (for map later)
bstar <- as.data.frame(do.call(rbind, bstar))

i.

### Standard error
(SEstar <- map_dbl(bstar, function(x){
  sqrt(mean((x - mean(x)) ^ 2))
}))

 skewness  kurtosis 
0.2860932 0.5055912

ii.

### Estimate from the original sample
b <- c(skewness = getb1(X), kurtosis = getb2(X))

### Set default confidence level
alp <- 0.1

### Lower bound
normCI.L <- b - qnorm(alp / 2, lower.tail = FALSE) * SEstar

### Upper bound
normCI.U <- b + qnorm(alp / 2, lower.tail = FALSE) * SEstar

### Normal bootstrap CI
rbind(lower = normCI.L, upper = normCI.U)

        skewness kurtosis
lower -0.2826211 1.993980
upper  0.6585419 3.657227

iii.

### Sample quantile
qstar <- map(bstar, quantile, c(alp / 2, 1 - alp / 2))

### Combine as a matrix
qstar <- do.call(cbind, qstar)

### Lower bound
pivCI.L <- 2 * b  - qstar[2, ]

### Upper bound
### Note that both are minus
pivCI.U <- 2 * b  - qstar[1, ]

### Pivotal boostrap CI
rbind(lower = pivCI.L, upper = pivCI.U)

        skewness kurtosis
lower -0.2370707 2.001658
upper  0.6915765 3.607317

iv.

### Percentile bootstrap CI
perCI <- qstar
rownames(perCI) <- c("lower", "upper")
perCI

        skewness kurtosis
lower -0.3156556 2.043890
upper  0.6129915 3.649549

c.

M <- 500
mu3 <- 0
mu4 <- 3

### Repeat for M times
q1c <- map(seq_len(M), function(m) {
  set.seed(m)
  
  ### Sample from standard normal distribution
  X <- rnorm(n)

  bstar <- map(seq_len(B), function(b) {
    ### Bootstrap sample
    Xstar <- sample(X, n, replace = TRUE)

    ### Bootstrap estimate
    b1 <- getb1(Xstar)
    b2 <- getb2(Xstar)

    c(skewness = b1, kurtosis = b2)
  })

  ### Combine as a data.frame (for map later)
  bstar <- as.data.frame(do.call(rbind, bstar))

  ### Standard error
  SEstar <- map_dbl(bstar, function(x) {
    sqrt(mean((x - mean(x))^2))
  })

  ### Estimate from the original sample
  b <- c(skewness = getb1(X), kurtosis = getb2(X))

  ### Lower bound
  normCI.L <- b - qnorm(alp / 2, lower.tail = FALSE) * SEstar

  ### Upper bound
  normCI.U <- b + qnorm(alp / 2, lower.tail = FALSE) * SEstar

  ### Normal bootstrap CI
  normCI <- rbind(lower = normCI.L, upper = normCI.U)

  ### Sample quantile
  qstar <- map(bstar, quantile, c(alp / 2, 1 - alp / 2))

  ### Combine as a matrix
  qstar <- do.call(cbind, qstar)

  ### Lower bound
  pivCI.L <- 2 * b - qstar[2, ]

  ### Upper bound
  ### Note that both are minus
  pivCI.U <- 2 * b - qstar[1, ]

  ### Pivotal boostrap CI
  pivCI <- rbind(lower = pivCI.L, upper = pivCI.U)

  ### Percentile bootstrap CI
  perCI <- qstar
  rownames(perCI) <- c("lower", "upper")
  
  ### Save all CI as a list
  CI <- list(normCI = normCI, pivCI = pivCI, perCI = perCI)
  
  ### Indicator for coverage, TRUE as actually covered
  q1cm <- map(CI, function(ci) {
    ### Check for each CI
    map2_lgl(as.data.frame(ci), c(mu3, mu4), function(x, y) {
      (x[1] < y) & (y < x[2])
    })
  })
  
  ### Logical vector ouput
  unlist(q1cm, recursive = FALSE)
})

### Compute the proportion
colMeans(do.call(rbind, q1c))

normCI.skewness normCI.kurtosis  pivCI.skewness  pivCI.kurtosis  perCI.skewness 
          0.826           0.750           0.786           0.690           0.858 
 perCI.kurtosis 
          0.788

Another way is to write a function that output CI for each method first, and then map over different methods.

### A function that returns estimates from the original sample and bootstrap samples
getTTstar <- function(getT){
  ### Sample from standard normal distribution
  X <- rnorm(n)
  
  ### Estimate from the original sample
  T <- getT(X)
  
  Tstar <- map_dbl(seq_len(B), function(b) {
    ### Bootstrap sample
    Xstar <- sample(X, n, replace = TRUE)

    ### Bootstrap estimate
    tstar <- getT(Xstar)
    tstar
  })
  
  list(T = T, Tstar = Tstar)
}

### A function that asks for statistic as input: getT = getb1
getCI <- function(getT){
  TTstar <- getTTstar(getT)
  ### Estimate from the original sample
  T <- TTstar$T
  ### Bootstrap estimates
  Tstar <- TTstar$Tstar

  ### Standard error
  SEstar <- sqrt(mean((Tstar - mean(Tstar))^2))

  ### Lower bound
  normCI.L <- T - qnorm(alp / 2, lower.tail = FALSE) * SEstar

  ### Upper bound
  normCI.U <- T + qnorm(alp / 2, lower.tail = FALSE) * SEstar

  ### Normal bootstrap CI
  normCI <- c(lower = normCI.L, upper = normCI.U)

  ### Sample quantile
  qstar <- quantile(Tstar, c(alp / 2, 1 - alp / 2))

  ### Lower bound
  pivCI.L <- 2 * T - qstar[2]

  ### Upper bound
  ### Note that both are minus
  pivCI.U <- 2 * T - qstar[1]

  ### Pivotal boostrap CI
  pivCI <- c(lower = pivCI.L, upper = pivCI.U)

  ### Percentile bootstrap CI
  perCI <- qstar
  names(perCI) <- c("lower", "upper")
  
  ### Save all results as a list
  ### All outputs for q1b
  q1b <- list(SEstar = SEstar, normCI = normCI, pivCI = pivCI, perCI = perCI)
  q1b
}
### skewness
getCI(getb1)

$SEstar
[1] 0.2839336

$normCI
     lower      upper 
-0.7305260  0.2035323 

$pivCI
 lower.95%   upper.5% 
-0.7127563  0.2143669 

$perCI
     lower      upper 
-0.7413606  0.1857625

### kurtosis
getCI(getb2)

$SEstar
[1] 0.3653069

$normCI
   lower    upper 
1.751062 2.952815 

$pivCI
lower.95%  upper.5% 
 1.636960  2.815057 

$perCI
   lower    upper 
1.888820 3.066917

q1c <- map2(list(getb1, getb2), c(mu3, mu4), function(getT, y) {
  ### Indicator for coverage, TRUE as actually covered
  I <- map(seq_len(M), function(m) {
    set.seed(m)
    
    ### Check for each CI
    map_lgl(getCI(getT)[-1], function(x) {
      (x[1] < y) & (y < x[2])
    })
  })

  ### Compute the proportion
  colMeans(do.call(rbind, I))
})

names(q1c) <- c("skewness", "kurtosis")
q1c

$skewness
normCI  pivCI  perCI 
 0.826  0.786  0.858 

$kurtosis
normCI  pivCI  perCI 
 0.750  0.690  0.788

In this case, the coverage for all 3 confidence intervals are close, with percentile confidence interval having the highest coverage rate. Also note that the coverage rate is closer to \(90 \%\) when estimating the skewness, which means these confidence intervals are more reliable compared with those from kurtosis. When estimating kurtosis, we are likely to have a larger variation due to the fact that it is a measure of “tailedness”. which might lead us to lower coverage rate.

2.

uber <- read.csv("uber.csv")
str(uber)

'data.frame':   1156 obs. of  7 variables:
 $ START_DATE.: chr  "1/1/2016 21:11" "1/2/2016 1:25" "1/2/2016 20:25" "1/5/2016 17:31" ...
 $ END_DATE.  : chr  "1/1/2016 21:17" "1/2/2016 1:37" "1/2/2016 20:38" "1/5/2016 17:45" ...
 $ CATEGORY.  : chr  "Business" "Business" "Business" "Business" ...
 $ START.     : chr  "Fort Pierce" "Fort Pierce" "Fort Pierce" "Fort Pierce" ...
 $ STOP.      : chr  "Fort Pierce" "Fort Pierce" "Fort Pierce" "Fort Pierce" ...
 $ MILES.     : num  5.1 5 4.8 4.7 63.7 4.3 7.1 0.8 8.3 16.5 ...
 $ PURPOSE.   : chr  "Meal/Entertain" "" "Errand/Supplies" "Meeting" ...

a.

A point estimate for the difference \(\mu_1 - \mu_2\) between the population means would be the difference between the sample means.

### Note that there are actually 3 levels
(lvs <- unique(uber$CATEGORY.))

[1] "Business" "Personal" ""

### Indicator for population 1 and 2
I1 <- (uber$CATEGORY. == lvs[1])
I2 <- (uber$CATEGORY. == lvs[2])

### Seperate the data set
X1 <- uber[I1, "MILES."]
X2 <- uber[I2, "MILES."]

### Point estimate
getT <- function(X1, X2){
  mean(X1) - mean(X2)
}
getT(X1, X2)

[1] 1.335065

b.

Assume the common standard deviation is \(\sigma\), by CLT, we have \[ \widehat{\mu}_1 := \overline{X}_1 \sim N \left(\mu_1, \frac{\sigma^2}{n_1} \right) , \qquad \text{and} \qquad \widehat{\mu}_2 := \overline{X}_2 \sim N \left(\mu_2, \frac{\sigma^2}{n_2} \right) , \] where \(n_1\) and \(n_2\) are the number of observations for population 1 and 2 respectively.

As all observations are independent, we have \(\widehat{\mu}_1\) and \(\widehat{\mu}_2\) to be independent. Therefore, we have \[ \widehat{\mu}_1 - \widehat{\mu}_2 \sim N \left(\mu_1 - \mu_2, \frac{\sigma^2}{n_1} + \frac{\sigma^2}{n_2} \right) = N \left(\mu_1 - \mu_2, \sigma^2 \left(\frac{1}{n_1} + \frac{1}{n_2} \right) \right) . \] So, a point estimate of the standard error for \(\widehat{\mu}_1 - \widehat{\mu}_2\) is \(\sqrt{\widehat{\sigma^2} \left(1 / n_1 + 1 / n_2 \right)}\), where \(\widehat{\sigma^2}\) is the variance estimate from the whole sample.

### Variance estimate
sig2.est <- var(c(X1, X2))
n1 <- length(X1)
n2 <- length(X2)

### SE estimate
sqrt(sig2.est * (1 / n1 + 1 / n2))

[1] 2.54548

c.

B <- 1000

### A function that returns estimates from the original sample and bootstrap samples
getTTstar <- function(getT){
  ### Estimate from the original sample
  T <- getT(X1, X2)
  
  Tstar <- map_dbl(seq_len(B), function(b) {
    ### Bootstrap sample
    X1star <- sample(X1, n1, replace = TRUE)
    X2star <- sample(X2, n2, replace = TRUE)

    ### Bootstrap estimate
    tstar <- getT(X1star, X2star)
    tstar
  })
  
  list(T = T, Tstar = Tstar)
}

### Set the confidence level
alp <- 0.05

getCI(getT)

$SEstar
[1] 2.369901

$normCI
    lower     upper 
-3.309855  5.979985 

$pivCI
lower.97.5%  upper.2.5% 
  -2.180867    6.794369 

$perCI
    lower     upper 
-4.124239  4.850997

We obtain a reasonable estimate from bootstrap compared with the result from part b.

d.

t.test(X1, X2, conf.level = 0.95)


    Welch Two Sample t-test

data:  X1 and X2
t = 0.53268, df = 87.643, p-value = 0.5956
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -3.646029  6.316159
sample estimates:
mean of x mean of y 
10.655844  9.320779

The confindence interval from bootstrap as shown in part c is similar to what we obtained from t.test. Both of them cover 0, so we do NOT have enough evidence to reject the null hypothesis that the difference in population means is 0, which means it is plausible that the population means from two groups are the same.

e.

(Not graded)

A point estimate for the difference \(\theta_1 - \theta_2\) between the population medians would be the difference between the sample medians.

### Point estimate
getT <- function(X1, X2){
  median(X1) - median(X2)
}
getT(X1, X2)

[1] 1.9

f.

(Not graded)

getCI(getT)

$SEstar
[1] 0.6323559

$normCI
    lower     upper 
0.6606052 3.1393948 

$pivCI
lower.97.5%  upper.2.5% 
    0.89875     3.20250 

$perCI
  lower   upper 
0.59750 2.90125

3.

a.

getT <- function(X) {
  ### Two-sided
  abs(mu0 - mean(X))
}

perm.test <- function(X, mu0 = 3, getT, B = 1000, alp) {
  ### Test statistic from original data set
  T <- getT(X)
  
  ### Vector of permutation test statistic from permutation data set
  Tperm <- map_dbl(seq_len(B), function(b) {
    ### Sign for permutation
    sign <- sample(c(-1, 1), length(X), replace = TRUE)
    
    ### Permutation data set
    Xperm <- sign * (X - mu0) + mu0
    
    ### Test statistic
    getT(Xperm)
  })

  ### p-value
  pv <- mean(Tperm >= T)
  
  ### Indicator, TRUE means reject the null hypothesis
  I <- (pv <= alp)
  
  list(p.value = pv, Decision = I)
}

n <- 1000
B <- 1000
mu0 <- 3

### Set the confidence level
alp <- 0.1


### Unit test
### Symmetric X around mu0
X <- rnorm(n, mean = mu0)
testthat::expect_equal(perm.test(X, mu0, getT, B, alp)$Decision, FALSE)

### Usymmetric X around mu0
X <- rexp(n)
testthat::expect_equal(perm.test(X, mu0, getT, B, alp)$Decision, TRUE)

b.

drug1 <- sleep$extra[sleep$group == 1]
drug2 <- sleep$extra[sleep$group == 2]
str(drug1)

 num [1:10] 0.7 -1.6 -0.2 -1.2 -0.1 3.4 3.7 0.8 0 2

str(drug2)

 num [1:10] 1.9 0.8 1.1 0.1 -0.1 4.4 5.5 1.6 4.6 3.4

getT <- function(X1, X2) {
  abs(mean(X1) - mean(X2))
}

### Test statistic from original sample
T <- getT(drug1, drug2)

n <- length(drug1)
B <- 1000

### Method 1 for creating permutation data sets:
Tperm <- map_dbl(seq_len(B), function(b) {
  ### Sign for permutation
  sign <- sample(c(-1, 1), length(drug1), replace = TRUE)
  
  ### Difference between observations without absolute value
  drug.diff <- drug1 - drug2
  
  ### Permutation difference
  ### -1 means labels are shuffled for that subject
  drug.diff.perm <- sign * drug.diff
  
  ### Permutation test statistic
  ### Absolute value of difference in mean is equal to absolution value of mean in difference when we have same number of observations
  abs(mean(drug.diff.perm))
})

### p-value
(pv <- mean(Tperm >= T))

[1] 0.007

### Decision, TRUE means reject the null hypothesis
pv <= alp

[1] TRUE

### Method 2 for creating permutation data sets:
drug <- data.frame(rbind(drug1, drug2))

Tperm <- map_dbl(seq_len(B), function(b) {
  ### Create chosen index
  I <- sample(1:2, n, replace = TRUE)
  
  ### Permutation data sets
  drug1perm <- map2_dbl(drug, I, `[`)
  drug2perm <- map2_dbl(drug, -I, `[`)
  
  ### Permutation test statistic
  getT(drug1perm, drug2perm)
})

### p-value
(pv <- mean(Tperm >= T))

[1] 0.006

### Decision, TRUE means reject the null hypothesis
pv <= alp

[1] TRUE

As p-value is less than \(0.1\), we have sufficient evidence to reject the null and claim that two group means are different.

c.

(Not graded)

n <- 20
B <- 1000
alp <- 0.1

getT <- function(X) {
  ### Two-sided
  abs(mu0 - mean(X))
}
mu0 <- 3

### True mean is 3
mu <- 3

### Normal
X <- rnorm(n, mean = mu, sd = 1)
perm.test(X, mu0, getT, B, alp)

$p.value
[1] 0.203

$Decision
[1] FALSE

t.test(X, mu = mu0, alternative = "two.sided", conf.level = 1 - alp)


    One Sample t-test

data:  X
t = -1.325, df = 19, p-value = 0.2009
alternative hypothesis: true mean is not equal to 3
90 percent confidence interval:
 2.148410 3.112694
sample estimates:
mean of x 
 2.630552

### Exponential
X <- rexp(n, rate = 1 / mu)
perm.test(X, mu0, getT, B, alp)

$p.value
[1] 0.86

$Decision
[1] FALSE

t.test(X, mu = mu0, alternative = "two.sided", conf.level = 1 - alp)


    One Sample t-test

data:  X
t = 0.18494, df = 19, p-value = 0.8552
alternative hypothesis: true mean is not equal to 3
90 percent confidence interval:
 2.009781 4.227402
sample estimates:
mean of x 
 3.118592

### True mean is 4
mu <- 4

### Normal
X <- rnorm(n, mean = mu, sd = 1)
perm.test(X, mu0, getT, B, alp)

$p.value
[1] 0.001

$Decision
[1] TRUE

t.test(X, mu = mu0, alternative = "two.sided", conf.level = 1 - alp)


    One Sample t-test

data:  X
t = 4.2467, df = 19, p-value = 0.0004363
alternative hypothesis: true mean is not equal to 3
90 percent confidence interval:
 3.696159 4.652432
sample estimates:
mean of x 
 4.174296

### Exponential
X <- rexp(n, rate = 1 / mu)
perm.test(X, mu0, getT, B, alp)

$p.value
[1] 0.266

$Decision
[1] FALSE

t.test(X, mu = mu0, alternative = "two.sided", conf.level = 1 - alp)


    One Sample t-test

data:  X
t = 1.148, df = 19, p-value = 0.2652
alternative hypothesis: true mean is not equal to 3
90 percent confidence interval:
 2.376994 6.084246
sample estimates:
mean of x 
  4.23062

d.

(Not graded)

### Try 1000 instead of 100
M <- 1000

### True mean is 3
mu <- 3

### Normal
q3d <- map(seq_len(M), function(m){
  X <- rnorm(n, mean = mu, sd = 1)
  
  ### TRUE means reject the null hypothesis
  perm.dec <- perm.test(X, mu0, getT, B, alp)$Decision
  t.dec <- t.test(X, mu = mu0, alternative = "two.sided", conf.level = 1 - alp)$p.value <= alp
  
  c(perm = perm.dec, t = t.dec)
})
q3d <- do.call(rbind, q3d)

### Proportion of rejctions
### Type I error / size
(sizeq3d1 <- colMeans(q3d))

 perm     t 
0.105 0.103

### Exponential
q3d <- map(seq_len(M), function(m){
  X <- rexp(n, rate = 1 / mu)
  
  ### TRUE means reject the null hypothesis
  perm.dec <- perm.test(X, mu0, getT, B, alp)$Decision
  t.dec <- t.test(X, mu = mu0, alternative = "two.sided", conf.level = 1 - alp)$p.value <= alp
  
  c(perm = perm.dec, t = t.dec)
})
q3d <- do.call(rbind, q3d)

### Proportion of rejctions
### Type I error / size
(sizeq3d2 <- colMeans(q3d))

 perm     t 
0.131 0.129

### True mean is 4
mu <- 4

### Normal
q3d <- map(seq_len(M), function(m){
  X <- rnorm(n, mean = mu, sd = 1)
  
  ### TRUE means reject the null hypothesis
  perm.dec <- perm.test(X, mu0, getT, B, alp)$Decision
  t.dec <- t.test(X, mu = mu0, alternative = "two.sided", conf.level = 1 - alp)$p.value <= alp
  
  c(perm = perm.dec, t = t.dec)
})
q3d <- do.call(rbind, q3d)

### Proportion of rejctions
### Power
(powerq3d1 <- colMeans(q3d))

 perm     t 
0.997 0.997

### Exponential
q3d <- map(seq_len(M), function(m){
  X <- rexp(n, rate = 1 / mu)
  
  ### TRUE means reject the null hypothesis
  perm.dec <- perm.test(X, mu0, getT, B, alp)$Decision
  t.dec <- t.test(X, mu = mu0, alternative = "two.sided", conf.level = 1 - alp)$p.value <= alp
  
  c(perm = perm.dec, t = t.dec)
})
q3d <- do.call(rbind, q3d)

### Proportion of rejctions
### Power
(powerq3d2 <- colMeans(q3d))

 perm     t 
0.225 0.224

The size for exponential case with true mean equal to 3 is 0.131, which is inflated because the exponential distribution is not symmetric.

4. (Optional)

(Not graded)

a.

We can rewrite the pmf for \(X\) and \(Y\) separately, \[ P (X = x) = \begin{cases} \frac{1}{2} - \frac{1}{n}, & \text{if } x = 0, \\ \frac{1}{2}, & \text{if } x = 1, \\ \frac{1}{n}, & \text{if } x = 1. \end{cases} = \begin{cases} \frac{1}{2} - \frac{1}{n}, & \text{if } x = 0, \\ \frac{1}{2} + \frac{1}{n}, & \text{if } x = 1. \end{cases} , \] \[ P (Y = y) = \begin{cases} \frac{1}{2} - \frac{1}{n}, & \text{if } y = 0, \\ \frac{1}{2}, & \text{if } y = 1, \\ \frac{1}{n}, & \text{if } y = - \frac{n}{2}. \end{cases} . \] So, we have \[ E (X) = \left(\frac{1}{2} - \frac{1}{n} \right) \times 0 + \left(\frac{1}{2} + \frac{1}{n} \right) \times 1 = \frac{1}{2} + \frac{1}{n} , \] \[ E (Y) = \left(\frac{1}{2} - \frac{1}{n} \right) \times 0 + \frac{1}{2} \times 1 + \frac{1}{n} \times \left(- \frac{n}{2} \right) = 0 , \] \[ E (X Y) = \left(\frac{1}{2} - \frac{1}{n} \right) \times (0 \times 0) + \frac{1}{2} \times (1 \times 1) + \frac{1}{n} \times \left(1 \times \left(- \frac{n}{2} \right) \right) = 0 . \] So, we have the correlation as \[\begin{align*} Cor (X, Y) = & \frac{E (X Y) - E (X) E (Y)}{\sqrt{E (X ^ 2) - E (X) ^ 2} \sqrt{E (Y ^ 2) - E (Y) ^ 2}} \\ = & \frac{0 - \left(\frac{1}{2} + \frac{1}{n} \right) \times 0}{\sqrt{E (X ^ 2) - E (X) ^ 2} \sqrt{E (Y ^ 2) - E (Y) ^ 2}} \\ = & 0 . \end{align*}\]

b.

n <- 50

### Generate sample
XY <- sample(list(c(X = 0, Y = 0), c(X = 1, Y = 1), c(X = 1, Y = - n / 2)), n, replace = TRUE, prob = c(1 / 2 - 1 / n, 1 / 2, 1 / n))
XY <- data.frame(do.call(rbind, XY))

### Visualize
plot(XY)

c.

M <- 100
alp <- 0.05
B <- 1000

getT <- function(XY) {
  with(XY, cor(X, Y))
}

q4c <- map_lgl(seq_len(M), function(m) {
  ### Generate sample
  XY <- sample(list(c(X = 0, Y = 0), c(X = 1, Y = 1), c(X = 1, Y = -n / 2)), n, replace = TRUE, prob = c(1 / 2 - 1 / n, 1 / 2, 1 / n))
  XY <- data.frame(do.call(rbind, XY))

  ### Test statistic for the original sample
  T <- getT(XY)

  Tperm <- map_dbl(seq_len(B), function(b) {
    ### Index for shuffling
    I <- sample(n)

    XYperm <- XY

    ### Shuffling X
    XYperm$X <- XYperm$X[I]

    ### Test statistic for permutation sample
    getT(XYperm)
  })

  ### p-value
  pv <- mean(Tperm >= T)

  ### Decision, TRUE means reject the null hypothesis
  pv <= alp
})

### Proportion of rejections
### Power
mean(q4c)

[1] 0.29

If we are really testing for correlation as we think, then we should obtain the size as the sample is uncorrelated (null hypothesis is true), and close to \(\alpha = 0.05\), which is not the case here. We are actually testing for independence and the number represent the power of test, as the sample we generated are dependent (null hypothesis is false). That means the permutation test for correlation is not applicable as it is actually testing for independence.

Homework 9

STAT 486/586 – Introduction to Statistical Computing

Yuhang (Tom) LIN

Due: April 19, 2022

1.

a.

b.

i.

ii.

iii.

iv.

c.

2.

a.

b.

c.

d.

e.

f.

3.

a.

b.

c.

d.

4. (Optional)

a.

b.

c.