### A few activities for learning some R basics 

## Vectors in R are created with c()
vec1 <- c(1, 4, 3, 6)  # a numeric vector
# a character vector (its elements are "characters", also called "strings")
vec <- c("apple", "2 bananas", "pineapple", "another fruit")

### Without knowing anything about strings, we can ask R which of these
### elements is exactly equal to "apple":
vec == "apple"

### But what if we want to ask R which of these elements *contain* "apple"?
grepl("apple", vec)
grepl("app", vec)

## The syntax is grepl(pattern, text); it returns one TRUE/FALSE per element.
## "apple" is the regular expression -- the search pattern.

### grep() is a related command that by default gives you the indices of the
### matching elements
grep("apple", vec)

## ... or, with value = TRUE, the matching elements themselves
grep("apple", vec, value = TRUE)

### Now what if you wanted to know which entries contain the word "apple",
### but not as part of another word?

## First, two commands that are useful for this:
## sub() replaces the first match in each element, gsub() replaces every match.
sub("a", "A", vec)
gsub("a", "A", vec)

# Brute-force option: requires you to know exactly what you want to avoid.
vec2 <- sub("pineapple", "pineXXX", vec)
grepl("apple", vec2)

# More subtle approach: anchor the pattern.
# Inside a regular expression, "^" means "the start of the string".
grepl("^apple", vec)

# So what would the following give us?  Elements 1 & 4 ("apple" and
# "another fruit" are the only entries that start with "a").
grepl("^a", vec)

# Inside a regular expression, "$" means "the end of the string".
# So how do we identify the entries that end with "e"?
grepl("e$", vec)

### More metacharacters worth knowing
#   .    the wildcard
#   \\d  a digit
#   \\D  a non-digit
#   \\s  a space
#   +    one or more of the preceding item
#   *    zero or more of the preceding item
grepl(pattern = "\\s", x = vec)  # which entries contain a space?
grepl(pattern = "\\d", x = vec)  # which entries contain a digit?

# Character class: [abc] matches any one of a, b, or c
grepl(pattern = "b[ai]n", x = c("banana", "binary", "bones"))

# Alternation: "abc|def" means "match abc or def"
grepl(pattern = "app|ban", x = vec)


## On to the activity
# Data: Times Guide to the House of Commons -- short text biographies for
# every candidacy in the 1950 - 1970 elections.
d <- read.csv(
  file = "http://andy.egge.rs/data/THC_candidates.csv",
  as.is = TRUE
)

## Let's try to get some covariates from the bios.
# How could we record whether they went to Oxford or Cambridge?
# Simplest approach: flag every bio that mentions the name at all.
d$oxford_match <- grepl(pattern = "Oxford", x = d$bio)
d$cambridge_match <- grepl(pattern = "Cambridge", x = d$bio)
# some issues there but no big deal


# Let's try to get the date of birth.
# Start by just asking whether the (lower-cased) bio says
#   "age NN" / "aged NN", "born in NNNN", or "NN years of age".
# One logical column is added per pattern, in the order listed.
d[c("age_xx", "born_in_xx", "xx_years_of_age")] <- lapply(
  c("aged? \\d\\d\\D", "born in \\d{4}\\D", "\\d\\d years of age"),
  grepl,
  x = tolower(d$bio)
)

# But now we actually want to get the age out of the text.
# Let's just focus on "born in XXXX"
# and try the str_match() function, which is in stringr.
# library() (rather than require()) stops right here if stringr is missing,
# instead of failing later with "could not find function".
library(stringr)

# Work on the first 50 bios only so the printed output stays readable;
# head() does not pad with NA rows if there are fewer than 50.
dd <- head(d, 50)

# the matched text only
str_match(dd$bio, "born in \\d{4}")
# parentheses create a capture group: the year comes back in its own column
str_match(dd$bio, "born in (\\d{4})")
# the 40 characters following "born in ", to see what the text looks like
str_match(dd$bio, "born in .{40}")
# allow up to 40 arbitrary characters (as few as possible) before the year
str_match(dd$bio, "born in .{0,40}?(\\d{4})")

# and then we would do a similar thing for age or aged, and XX years of age 


# Can you think of a way to code the gender of the candidates?

# How do we extract the first word of the bio?
head(str_match(string = d$bio, pattern = "^\\w+"))
# Tabulate the first words, least to most frequent
sort(table(str_match(string = d$bio, pattern = "^\\w+")))

# Bios that open with a female title
d$female <- grepl(pattern = "^Mrs|^Miss|^Lady|^Dame|^Viscountess", x = d$bio)
# Bios that open with a gender-neutral title but mention "she" or "her"
d$female.maybe <- grepl(pattern = "^Dr|^The|^Prof", x = d$bio) &
  grepl(pattern = "\\sshe\\s|\\sher\\s", x = d$bio)
# Count the "maybe" cases as female too
d$female <- d$female | d$female.maybe

## If there is still time, we can look at how these things change over time.

# Did the proportion of female candidates increase over this period?
d$Date <- as.Date(d$date)

# Cross-tabulate female (rows) by election date (columns), all candidates.
# Row 2 is taken to be the female == TRUE row (assumes both values occur).
counts <- table(d$female, d$Date)
fem.counts <- counts[2, ]
tot.counts <- counts[1, ] + counts[2, ]
plot(
  x = sort(unique(d$Date)), y = fem.counts,
  type = "b", pch = 19,
  xlab = "Election date", ylab = "Number of female candidates",
  ylim = c(0, max(fem.counts))
)
plot(
  x = sort(unique(d$Date)), y = fem.counts / tot.counts,
  type = "b", pch = 19,
  xlab = "Election date", ylab = "Proportion of female candidates",
  ylim = c(0, .1)
)

# Same two plots, restricted to the winners
use <- d$winner == 1
counts <- table(d$female[use], d$Date[use])
fem.counts <- counts[2, ]
tot.counts <- counts[1, ] + counts[2, ]
plot(
  x = sort(unique(d$Date)), y = fem.counts,
  type = "b", pch = 19,
  xlab = "Election date", ylab = "Number of female winners",
  ylim = c(0, max(fem.counts))
)
plot(
  x = sort(unique(d$Date)), y = fem.counts / tot.counts,
  type = "b", pch = 19,
  xlab = "Election date", ylab = "Proportion of female winners",
  ylim = c(0, .06)
)


## That was your introduction to "metacharacters" in regular expressions.

# Exercise: how would we identify the elements of vec that end with an "a"?
grepl(pattern = "a$", x = vec)

## let's learn a few more metacharacters and see what we can use 


## Download the data
d <- read.csv("http://andy.egge.rs/data/candidates.csv")

# Case-insensitive flags for words appearing anywhere in the bio.
# Note that "miss" also matches inside longer words (e.g. "mission").
d$miss <- grepl("miss", d$bio, ignore.case = TRUE)
d$barrister <- grepl("barrister", d$bio, ignore.case = TRUE)
# No trailing "|" here: an empty alternative matches every string, which
# would make this column TRUE for every row.
d$legal <- grepl("barrister|solicitor", d$bio, ignore.case = TRUE)

# Bios that start with a female title
grepl("^Miss|^Mrs|^Lady", d$bio)


### Activity using the SOTU and inaugural data

# Read the speeches as a plain data frame. stringsAsFactors = FALSE keeps the
# text columns as character vectors on every R version. (The previous
# tbl_df() wrapper needed a package this script never loads.)
inaug <- read.csv("inaugTexts.csv", stringsAsFactors = FALSE)

## In how many speeches does the word "citizen" appear?
sum(grepl("citizen", inaug$inaugSpeech, ignore.case = TRUE))

# How about England, English, Britain, British?
sum(grepl("england|english|britain|british", inaug$inaugSpeech, ignore.case = TRUE))

# ... but not New England or British Columbia.
# Brute force again: overwrite those phrases with placeholder words that the
# search pattern cannot match, then count on the purged text.
inaug$inaugSpeechPurged <- gsub("New England", "Eggers", inaug$inaugSpeech)
inaug$inaugSpeechPurged <- gsub("British Columbia", "Krawatzek", inaug$inaugSpeechPurged)
sum(grepl("england|english|britain|british", inaug$inaugSpeechPurged, ignore.case = TRUE))

# Look at the text around each instance of England/English/Britain/British
pattern <- "England|English|Britain|British"
# (to list just the matched words instead:)
# str_extract_all(inaug$inaugSpeechPurged, pattern)

# number of characters of context to print on each side of a match
diff <- 100
for (i in seq_len(nrow(inaug))) {
  # gregexpr() returns a list with one element per input string; that element
  # holds the start position of every match, or -1 if there is none
  m <- gregexpr(pattern, inaug$inaugSpeechPurged[i])
  if (is.na(m[[1]][1]) || m[[1]][1] == -1) {
    next  # missing text or no match in this speech
  }
  cat("####", inaug$Year[i], ": ", inaug$President[i], "\n\n", sep = "")
  # loop over every match in the speech, not just the first one
  for (j in m[[1]]) {
    cat(j, "\n")
    # clamp the start at 1: str_sub() reads a start <= 0 as an offset from
    # the end of the string, which would give the wrong excerpt for matches
    # near the beginning of a speech
    cat(str_sub(inaug$inaugSpeechPurged[i], max(1, j - diff), j + diff), "\n")
  }
  cat(" --------  \n\n\n")
}

### Let's count the tokens in each speech
# First add a column that says whether a word appears in the text; a column
# that says how many times it appears comes later.
inaug$strong.binary <- grepl("strong", tolower(inaug$inaugSpeech))
inaug$weak.binary <- grepl("weak", tolower(inaug$inaugSpeech))

# library() (rather than require()) stops here if quanteda is not installed
library(quanteda)
# tokens() is quanteda's tokenizer (it replaced the older tokenize());
# remove_punct = TRUE drops punctuation so that only words are counted
token_list <- tokens(tolower(as.character(inaug$inaugSpeech)), remove_punct = TRUE)

# How many tokens/words were used in each speech?
inaug$token.count <- unname(ntoken(token_list))


# How many times does a given word appear in each text?
# Some of the students could try to program that themselves:
### count the number of times a given token appears in each element of
### token_list