library(stringr)

Strings

1.

### Assemble the quotation and its attribution, separated by a blank line
Q1 <- paste(
  "\"Age is an issue of mind over matter. If you don't mind, it doesn't matter.\"",
  "",
  "-- Mark Twain",
  sep = "\n"
)
### writeLines() renders the escapes, so the quotes and line breaks show literally
writeLines(Q1)
"Age is an issue of mind over matter. If you don't mind, it doesn't matter."

-- Mark Twain

2.

### Test strings: some are a word written twice (ignoring case), some are not
Q2 <- c('apple', 'appleapple', 'BananaBanana', 'Banana', 'cherryCherry')

### Get the first half of the string
### str_length() is the stringr function for string length, and %/% makes the
### rounding explicit: odd lengths round down instead of passing a fractional
### position to str_sub()
(first.half <- str_sub(Q2, 1, str_length(Q2) %/% 2))
[1] "ap"     "apple"  "Banana" "Ban"    "cherry"
### Get the second half of the string
### Negative positions count from the end of the string; str_length() %/% 2
### keeps the offset a whole number for odd-length strings
(second.half <- str_sub(Q2, -(str_length(Q2) %/% 2), -1))
[1] "le"     "apple"  "Banana" "ana"    "Cherry"
### Compare the two halves element-wise, ignoring letter case
str_to_upper(second.half) == str_to_upper(first.half)
[1] FALSE  TRUE  TRUE FALSE  TRUE

3.

### Build the message paragraph by paragraph; paragraphs are separated by a
### blank line and the whole message is wrapped in double quotes
Q3 <- paste(
  "\"Dear Friends,",
  "The exuberance of the change of seasons at Iowa State University never grows old — from the blooms of spring to the joys of another graduating class. During this holiday season, let your spirits be lifted by family and friends — and by this short video of captivating images of this magnificent campus.",
  "Our best wishes to you and yours for a happy, hope-filled 2022!",
  "Wendy Wintersteen and Robert Waggoner\"",
  sep = "\n\n"
)
writeLines(Q3)
"Dear Friends,

The exuberance of the change of seasons at Iowa State University never grows old — from the blooms of spring to the joys of another graduating class. During this holiday season, let your spirits be lifted by family and friends — and by this short video of captivating images of this magnificent campus.

Our best wishes to you and yours for a happy, hope-filled 2022!

Wendy Wintersteen and Robert Waggoner"
### Method 1
### Split on runs of non-alphanumeric characters and count the non-empty
### pieces. The opening and closing quotation marks leave an empty string at
### each end of the split; dropping empty pieces directly avoids hard-coding
### how many of them there are.
sum(nzchar(str_split(Q3, pattern = "[^[:alnum:]]+")[[1]]))
[1] 71
### Method 2
### Pull out every run of alphanumeric characters and count the matches
length(str_extract_all(Q3, pattern = "[:alnum:]+")[[1]])
[1] 71
### Method 3
### Locate every run of word characters; each match is one row of the result
nrow(str_locate_all(Q3, "\\w+")[[1]])
[1] 71

4.

(Not graded)

1.

### Read the screenplay into a character vector, one element per line,
### then show its compact structure
script <- readLines(con = "matrix_script.txt")
str(object = script)
 chr [1:7166] "FADE IN:" "" "ON COMPUTER SCREEN" ...
### Show the first six lines of the screenplay
head(x = script, n = 6L)
[1] "FADE IN:"                                                
[2] ""                                                        
[3] "ON COMPUTER SCREEN"                                      
[4] "so close it has no boundaries."                          
[5] ""                                                        
[6] "A blinding cursor pulses in the electric darkness like a"

2.

### Count the lines that consist solely of "NEO"
### (the pattern is anchored at both ends, so a line matches at most once)
sum(str_detect(script, "^NEO$"))
[1] 212

3.

### Total occurrences of the literal text "Trin" across all lines
sum(str_count(string = script, pattern = fixed("Trin")))
[1] 128
### "Trin" optionally followed by one more character: extract the matches
### on every line and add up how many each line produced
sum(lengths(str_extract_all(script, "Trin.?")))
[1] 128
### "Trin" followed by any further word characters: each match is one row
### of the per-line location matrix
sum(vapply(str_locate_all(script, "Trin\\w*"), nrow, integer(1)))
[1] 128
### Greedy ".*" swallows the rest of the line, so each line contributes at
### most one match and the total can fall short of the other counts.
### The original note points to line 640 as the line where this happens.
length(unlist(str_extract_all(script, "Trin.*")))
[1] 127

Factors and Logical variables

### Load the survey data as a tibble; readr is called by namespace, not attached
happy <- readr::read_csv(file = "happy.csv")

1.

### happy
### Distinct values in order of first appearance (NA included)
lv.happy <- unique(happy$happy)
lv.happy
[1] "not too happy" "pretty happy"  "very happy"    NA             
### Spell the levels out from least to most happy instead of relying on the
### order in which the values happen to first appear in the file
happy$happy <- factor(happy$happy, levels = c("not too happy", "pretty happy", "very happy"))
levels(happy$happy)
[1] "not too happy" "pretty happy"  "very happy"   
### marital
### Distinct values in order of first appearance (NA included)
lv.marital <- unique(happy[["marital"]])
print(lv.marital)
[1] "never married" "separated"     "divorced"      "widowed"      
[5] "married"       NA             
### Switch the order
### Levels are named explicitly; positional indices into unique() would
### silently pick the wrong labels if the row order of the data changed
happy$marital <- factor(happy$marital, levels = c("never married", "married", "separated", "divorced", "widowed"))
levels(happy$marital)
[1] "never married" "married"       "separated"     "divorced"     
[5] "widowed"      
### degree
### Distinct values in order of first appearance (NA included)
lv.degree <- unique(happy[["degree"]])
lv.degree
[1] "bachelor"       "lt high school" "high school"    "graduate"      
[5] "junior college" NA              
### Switch the order
### Levels are named explicitly, lowest to highest degree; positional indices
### into unique() would silently break if the row order of the data changed
happy$degree <- factor(happy$degree, levels = c("lt high school", "high school", "junior college", "bachelor", "graduate"))
levels(happy$degree)
[1] "lt high school" "high school"    "junior college" "bachelor"      
[5] "graduate"      
### health
### Distinct values in order of first appearance (NA included)
lv.health <- unique(happy[["health"]])
print(lv.health)
[1] "good"      "fair"      "excellent" "poor"      NA         
### Switch the order
### Levels are named explicitly, worst to best health; positional indices
### into unique() would silently break if the row order of the data changed
happy$health <- factor(happy$health, levels = c("poor", "fair", "good", "excellent"))
levels(happy$health)
[1] "poor"      "fair"      "good"      "excellent"

2.

(i).

### Frequency of each happiness level (NA is not tabulated)
table(happy[["happy"]])

not too happy  pretty happy    very happy 
         7332         32256         18122 

(ii).

### Frequency of each health level, with missing values left out
table(happy$health, useNA = "no")

     poor      fair      good excellent 
     2638      8768     20788     13827 

(iii).

### Two-way table: happiness in rows, health in columns
table(happy[["happy"]], happy[["health"]])
               
                 poor  fair  good excellent
  not too happy   900  1785  2040       888
  pretty happy   1141  4758 11744      6045
  very happy      424  1651  5563      5959

3.

(Not graded)

### Cross-tabulate marital status (rows) against happiness (columns)
marital.happy <- table(happy$marital, happy$happy)

### Convert counts to row proportions
### (same values as dividing by rowSums(), which recycles down the columns)
marital.happy <- prop.table(marital.happy, margin = 1)

### Type I
### Transposed so that each bar is one marital status, split by happiness
barplot(height = t(marital.happy), legend.text = TRUE, args.legend = list(bg = "transparent"))

### Create plotting vector
### Flatten the table column by column and label every entry
### "<marital>.<happy>", using the first three letters of each level name
vec.plot <- setNames(
  as.vector(marital.happy),
  as.vector(outer(
    str_sub(rownames(marital.happy), end = 3),
    str_sub(colnames(marital.happy), end = 3),
    FUN = function(row.tag, col.tag) paste(row.tag, col.tag, sep = ".")
  ))
)

### Type II
barplot(height = vec.plot)

### Combine happy levels
### happy: pretty happy, very happy
temp <- happy
### Merge by level name (not by position) so the mapping is explicit
levels(temp$happy) <- list(
  "not too happy" = "not too happy",
  "happy" = c("pretty happy", "very happy")
)
### which() drops rows whose happiness is NA; indexing with the raw logical
### vector would return an all-NA row for each of them
dat.tab <- temp[which(temp$happy == "happy"), "marital"]
### prop.table() divides by the number of tabulated (non-missing) marital
### values, so the bars sum to 1; dividing by nrow() also counted the rows
### with a missing marital status
barplot(prop.table(table(dat.tab)))

4.

(Not graded)

a.

### "Married" here means any marital level other than "never married",
### and "happy" means any happy level other than "not too happy".
### Only rows with NA in marital or happy are dropped; removing every row
### that has an NA in any column would give a different result.
Q4a <- happy[!(happy$marital %in% c("never married", NA)) & !(happy$happy %in% c("not too happy", NA)), ]
str(Q4a)
tibble [40,211 x 10] (S3: tbl_df/tbl/data.frame)
 $ happy  : Factor w/ 3 levels "not too happy",..: 2 2 2 2 2 2 3 3 3 3 ...
 $ year   : num [1:40211] 1972 1972 1972 1972 1972 ...
 $ age    : num [1:40211] 48 61 30 30 56 54 41 24 62 46 ...
 $ sex    : chr [1:40211] "female" "female" "female" "female" ...
 $ marital: Factor w/ 5 levels "never married",..: 3 3 3 3 3 3 3 3 3 3 ...
 $ degree : Factor w/ 5 levels "lt high school",..: 2 2 2 2 1 1 1 2 1 4 ...
 $ finrela: chr [1:40211] "average" "above average" "below average" "average" ...
 $ health : Factor w/ 4 levels "poor","fair",..: 4 3 2 3 2 4 4 2 2 4 ...
 $ partyid: chr [1:40211] "independent" "strong democrat" "strong democrat" "strong democrat" ...
 $ wtssall: num [1:40211] 0.889 0.889 0.889 0.889 1.334 ...

b.

### "|" has lower precedence than "&", so the two conditions must be grouped
### before the NA filters are applied; ungrouped, the filters attach only to
### the happy condition and rows with a missing marital status slip through
### as all-NA rows.
### NOTE(review): the str() output recorded below was produced before this
### fix; re-run to refresh it.
Q4b <- happy[((happy$marital!="never married") | (happy$happy!="not too happy")) & !is.na(happy$marital) & !is.na(happy$happy), ]
str(Q4b)
tibble [59,452 x 10] (S3: tbl_df/tbl/data.frame)
 $ happy  : Factor w/ 3 levels "not too happy",..: 1 2 1 2 2 1 2 2 2 2 ...
 $ year   : num [1:59452] 1972 1972 1972 1972 1972 ...
 $ age    : num [1:59452] 70 48 27 61 26 28 21 30 30 56 ...
 $ sex    : chr [1:59452] "male" "female" "female" "female" ...
 $ marital: Factor w/ 5 levels "never married",..: 3 3 3 3 1 4 1 3 3 3 ...
 $ degree : Factor w/ 5 levels "lt high school",..: 1 2 4 2 2 2 2 2 2 1 ...
 $ finrela: chr [1:59452] "above average" "average" "average" "above average" ...
 $ health : Factor w/ 4 levels "poor","fair",..: 2 4 3 3 3 4 4 2 3 2 ...
 $ partyid: chr [1:59452] "not str democrat" "independent" "not str democrat" "strong democrat" ...
 $ wtssall: num [1:59452] 0.889 0.889 0.889 0.889 0.445 ...

c.

### Rows where at least one condition holds, minus rows where both hold.
### NA results are dropped by na.rm, so a TRUE paired with an NA still counts
### toward the first sum but not the second.
local({
  married <- happy$marital != "never married"
  cheerful <- happy$happy != "not too happy"
  sum(married | cheerful, na.rm = TRUE) - sum(married & cheerful, na.rm = TRUE)
})
[1] 19230

d.

### Number of rows that are both "never married" and "not too happy";
### which() keeps only the TRUE positions, so NA comparisons are ignored
length(which((happy$marital == "never married") & (happy$happy == "not too happy")))
[1] 1920

5.

a.

### Collapse the three Democratic-leaning labels into a single "democrat" value
happy$partyid <- replace(
  happy$partyid,
  happy$partyid %in% c("ind,near dem", "not str democrat", "strong democrat"),
  "democrat"
)

b.

### Collapse the three Republican-leaning labels into a single "republican" value
happy$partyid[!is.na(match(happy$partyid, c("ind,near rep", "not str republican", "strong republican")))] <- "republican"

c.

### Counts per party after recoding; rows with NA partyid are not tabulated,
### so results may differ if NA rows were removed from the data beforehand
table(happy[["partyid"]])

   democrat independent other party  republican 
      30426        9474         995       21186