### A few activities for learning some R basics

## Vectors in R are created by c().
vec1 = c(1, 4, 3, 6) # can make them out of numbers
vec = c("apple", "2 bananas", "pineapple", "another fruit") # or out of "characters", a.k.a. "strings"

### Without knowing anything about strings, we could ask R to tell us which
### of these elements is equal to "apple":
vec == "apple"

### But what if we want to ask R which of these elements contain "apple"?
grepl("apple", vec)
grepl("app", vec)
## The syntax is grepl(pattern, text).
## "apple" is the regular expression -- the search pattern.

### grep() is a related command that by default gives you the indices of the
### matching elements
grep("apple", vec)
## or the matching elements themselves
grep("apple", vec, value = TRUE)

### Now what if you wanted to know which entries contain the word "apple",
### but not as part of another word?
## First, another command that could be useful for this:
sub("a", "A", vec)  # replaces only the first match in each element
gsub("a", "A", vec) # replaces every match in each element

# Brute-force option that requires you to know exactly what you want to avoid
vec2 = sub("pineapple", "pineXXX", vec)
grepl("apple", vec2)

# More subtle approach
grepl("^apple", vec)
# Inside a regular expression, "^" means "the start of the string of characters."
# So what would the following give us? TRUE for elements 1 & 4.
grepl("^a", vec)

# Inside a regular expression, "$" means "the end of the string of characters".
# So how do we identify the entries that end with e?
grepl("e$", vec)

### Some other useful metacharacters
# "." is the wildcard
# \\d is a digit
# \\D is non-digit
# \\s means a space
# + means one or more
# * means zero or more
grepl("\\s", vec)
grepl("\\d", vec)

# [abc] matches any one of a, b, or c
grepl("b[ai]n", c("banana", "binary", "bones"))

# Another useful thing: "abc|def" means "match abc or def"
grepl("app|ban", vec)

## OK so on to the activity
# Times Guide to the House of Commons: short text biographies for every
# candidacy in the 1950 - 1970 elections.
# Load the candidate biographies; as.is = TRUE keeps text columns as character
# vectors instead of converting them to factors.
d = read.csv("http://andy.egge.rs/data/THC_candidates.csv", as.is = TRUE)

## Let's try to get some covariates from the bios.
# How could we record whether they went to Oxford or Cambridge?
d$oxford_match = grepl("Oxford", d$bio)
d$cambridge_match = grepl("Cambridge", d$bio)
# Some issues there but no big deal.

# Let's try to get the date of birth.
# We start by just asking: does it say age (numbers), or born in (numbers)?
d$age_xx = grepl("aged? \\d\\d\\D", tolower(d$bio))
d$born_in_xx = grepl("born in \\d{4}\\D", tolower(d$bio))
d$xx_years_of_age = grepl("\\d\\d years of age", tolower(d$bio))

# But now we actually want to get the age out of it.
# Let's just focus on "born in XXXX"
# and let's try the str_match() function, which is in stringr.
library(stringr) # library() stops with an error if stringr is not installed

# Work on the first 50 bios only; head() never pads with NA rows when the
# data has fewer than 50 rows.
dd = head(d, 50)
str_match(dd$bio, "born in \\d{4}")
str_match(dd$bio, "born in (\\d{4})")           # the parentheses capture the year
str_match(dd$bio, "born in .{40}")
str_match(dd$bio, "born in .{0,40}?(\\d{4})")   # lazily skip up to 40 characters before the year
# And then we would do a similar thing for "age" or "aged", and "XX years of age".

# Can you think of a way to code the gender of the candidates?
# How do we extract the first word of the bio?
head(str_match(d$bio, "^\\w+")) # extract the first word of the bio
sort(table(str_match(d$bio, "^\\w+")))

# Bios that open with a female title.
d$female = grepl("^Mrs|^Miss|^Lady|^Dame|^Viscountess", d$bio)
# Bios that open with a gender-neutral word but mention "she" or "her".
d$female.maybe = grepl("^Dr|^The|^Prof", d$bio) & grepl("\\sshe\\s|\\sher\\s", d$bio)
d$female[d$female.maybe] = TRUE

## OK then if there is still time we could look at how these things change over time.
# Did the proportion of female candidates increase over this period?
d$Date = as.Date(d$date)

# Cross-tabulate the gender flag by election date. Fixing the factor levels
# guarantees both a "FALSE" and a "TRUE" row, so rows can be picked by name.
counts = table(factor(d$female, levels = c(FALSE, TRUE)), d$Date)
fem.counts = counts["TRUE", ]
# Take the x values from the table's own columns so x and y always line up.
election.dates = as.Date(colnames(counts))
plot(election.dates, fem.counts, type = "b", pch = 19,
     xlab = "Election date", ylab = "Number of female candidates",
     ylim = c(0, max(fem.counts)))

tot.counts = counts["FALSE", ] + counts["TRUE", ]
plot(election.dates, fem.counts/tot.counts, type = "b", pch = 19,
     xlab = "Election date", ylab = "Proportion of female candidates",
     ylim = c(0, .1))

# Same two plots, restricted to winners. Rows with a missing `winner` value
# are excluded explicitly rather than being carried along as NA.
use = !is.na(d$winner) & d$winner == 1
counts = table(factor(d$female[use], levels = c(FALSE, TRUE)), d$Date[use])
fem.counts = counts["TRUE", ]
election.dates = as.Date(colnames(counts))
plot(election.dates, fem.counts, type = "b", pch = 19,
     xlab = "Election date", ylab = "Number of female winners",
     ylim = c(0, max(fem.counts)))

tot.counts = counts["FALSE", ] + counts["TRUE", ]
plot(election.dates, fem.counts/tot.counts, type = "b", pch = 19,
     xlab = "Election date", ylab = "Proportion of female winners",
     ylim = c(0, .06))

## So this is your introduction to "metacharacters" in regular expressions.
# How would we identify the elements of vec that end with an a?
grepl("a$", vec)

## Let's learn a few more metacharacters and see what we can use.
## Download the data
d = read.csv("http://andy.egge.rs/data/candidates.csv")
d$miss = grepl("miss", d$bio, ignore.case = TRUE)
d$barrister = grepl("barrister", d$bio, ignore.case = TRUE)
# No trailing "|" here: an empty alternative matches every string, which
# would make this column TRUE for all rows.
d$legal = grepl("barrister|solicitor", d$bio, ignore.case = TRUE)
grepl("^Miss|^Mrs|^Lady", d$bio)

### Activity using the SOTU and inaugural data for something
# stringsAsFactors = FALSE keeps the speech text and names as character
# vectors on every R version; as_tibble() replaces the deprecated tbl_df().
inaug = dplyr::as_tibble(read.csv("inaugTexts.csv", stringsAsFactors = FALSE))

## In how many speeches does the word "citizen" appear?
sum(grepl("citizen", inaug$inaugSpeech, ignore.case = TRUE))

# How about England, English, Britain, British?
sum(grepl("england|english|britain|british", inaug$inaugSpeech, ignore.case = TRUE))

# ... but not New England, British Columbia: overwrite those phrases with
# placeholder words that cannot match the pattern.
inaug$inaugSpeechPurged = gsub("New England", "Eggers", inaug$inaugSpeech)
inaug$inaugSpeechPurged = gsub("British Columbia", "Krawatzek", inaug$inaugSpeechPurged)
sum(grepl("england|english|britain|british", inaug$inaugSpeechPurged, ignore.case = TRUE))

# Look at the text around the instances of England/English/Britain/British.
# Note that this pattern is case-sensitive, unlike the counts above.
pattern = "England|English|Britain|British"
# Alternative: str_extract_all(inaug$inaugSpeechPurged, pattern)
diff = 100 # number of characters of context shown on each side of a match

for (i in seq_len(nrow(inaug))) {
  speech = inaug$inaugSpeechPurged[i]
  hits = gregexpr(pattern, speech)[[1]] # start positions of all matches, or -1
  if (is.na(hits[1]) || hits[1] == -1) {
    next # no match (or missing text) in this speech
  }
  # as.character() so a factor-valued name prints as text, not its integer code
  cat("####", inaug$Year[i], ": ", as.character(inaug$President[i]), "\n\n", sep = "")
  # Visit every match, not only the first one.
  for (j in hits) {
    cat(j, "\n")
    # Clamp the start at 1 so a match near the beginning of the speech still
    # gets its leading context instead of an out-of-range start position.
    cat(substr(speech, max(1, j - diff), j + diff), "\n")
  }
  cat(" -------- \n\n\n")
}

### Let's count the tokens in each one.
# Let's just add a column that says whether the word appears in the text,
# and another column to say how many times.
inaug$strong.binary = grepl("strong", tolower(inaug$inaugSpeech))
inaug$weak.binary = grepl("weak", tolower(inaug$inaugSpeech))

library(quanteda) # library() stops with an error if quanteda is not installed
# NOTE(review): tokenize()/removePunct looks like the older quanteda API; newer
# releases appear to use tokens(x, remove_punct = TRUE) -- confirm against the
# installed version.
token_list = tokenize(tolower(as.character(inaug$inaugSpeech)), removePunct = TRUE) # tokenize from quanteda
inaug$token.count = vapply(token_list, length, integer(1)) # how many tokens/words were used?

# How many times does something appear in the text?
# Some of the students could try to program that themselves
### and then count the number of times a given token appeared.