### Joe The Corpus Rogan
### By Dan Turner
### NGRAM FILE GENERATION SCRIPT
# This file performs some basic summarization of ngram frequencies in the transcripts.
# It reads transcripts from /t-raw/ and uses scrapelist.rds to generate ngram frequencies.
# The output goes in /ngram/
### Updated 1-19-2020
###########
## Setup ##
###########
# Load packages
library(rvest) # Web scraping
library(stringr) # String handling
library(readr) # Nice reading and writing
library(ngram) # Ngram tokenization and phrase tables
library(lubridate) # Date handling
library(tm) # Text mining utilities (stopword lists)
# WD
setwd("~/Git/joethecorpus")
# Episodes dataframe
eps <- readRDS("~/Git/joethecorpus/scrapelist.rds")
# We only care about the ones with transcripts
eps <- subset(eps, eps$t.avail == TRUE)
#############################
## Load and clean the text ##
#############################
# First let's see what episodes we have transcripts for (filename sans .txt)
t.list <- list.files(path = "./t-raw", recursive = TRUE,
pattern = "\\.txt$",
full.names = TRUE)
# Same as above, but just the numbers
t.list.n = as.double( sub("\\.txt$", "", list.files("./t-raw")) )
# Read every transcript at once line by line
lines = lapply(t.list, readLines)
# Combine the above with the episode number list (repeating each number once per line of its transcript)
lines = data.frame("n" = rep(t.list.n, times = sapply(lines, length)), "txt" = unlist(lines))
# Combine the above with the episode list
eps = merge.data.frame(lines, eps, by.x = "n", by.y = "n")
# Garbage collection
rm(lines, t.list, t.list.n)
# Clean the text
eps$txt = str_remove_all(eps$txt, "[0-9][0-9]:[0-9][0-9]:[0-9][0-9]") # Remove timestamps
eps$txt = gsub("►", "", eps$txt) # Remove play arrows
eps$txt = gsub("[^[:alnum:][:space:]']", "", eps$txt) # Remove punctuation except "< ' >
eps$txt = str_squish(eps$txt) # Remove extra whitespace
##################################################
## Identify common terms across all transcripts ##
##################################################
# Extract the most frequent n tokens
cutoff = 1000
# Let's get 'cutoff' number of most frequent tokens across all transcripts
all.txt = paste(eps$txt, sep="", collapse=" ") # Space between pieces so words don't fuse across transcripts
# Remove nonalphabetical characters, retain spaces
#all.txt = str_squish( gsub("[^[:alnum:][:space:]]", "", all.txt) )
# Filter out stopwords here if desired
#stopwords(kind = "en")
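# A minimal sketch of what that filtering could look like (commented out and untested;
# assumes the tm English stopword list suits your application, and note the text here
# has not been lower-cased):
# stop.words = stopwords(kind = "en")
# all.words  = unlist(strsplit(all.txt, " ", fixed = TRUE))
# all.txt    = paste(all.words[!all.words %in% stop.words], collapse = " ")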
# Generate 1-, 2-, and 3-grams for all transcripts
all.gram1 = ngram(all.txt, n = 1, sep = " ")
all.gram2 = ngram(all.txt, n = 2, sep = " ")
all.gram3 = ngram(all.txt, n = 3, sep = " ")
# Frequency table generation
all.gram1 = get.phrasetable(all.gram1)
all.gram2 = get.phrasetable(all.gram2)
all.gram3 = get.phrasetable(all.gram3)
# Trim the frequency tables to the cutoff, keeping just the ngram strings for matching later
# Squish out unwanted white space (this is necessary after each tokenization)
top.gram1 = str_squish( head(all.gram1$ngrams, cutoff) )
top.gram2 = str_squish( head(all.gram2$ngrams, cutoff) )
top.gram3 = str_squish( head(all.gram3$ngrams, cutoff) )
# After squishing we may have duplicates, because trailing whitespace had made some ngrams look unique
top.gram1 = unique(top.gram1)
top.gram2 = unique(top.gram2)
top.gram3 = unique(top.gram3)
# Write the all.gram data
write_csv(all.gram1, "jtcr_unigram.csv")
write_csv(all.gram2, "jtcr_bigram.csv")
write_csv(all.gram3, "jtcr_trigram.csv")
write_lines(all.txt, "jtcr_oneline.txt")
# Garbage collection
rm(all.gram1, all.gram2, all.gram3, all.txt)
###################################################################
## Calculate 1-, 2-, and 3-gram term frequencies by month > year ##
###################################################################
# Prepare the ngram dataframes
# check.names = FALSE keeps the ngram strings (which contain spaces) intact as column names
gram1 = data.frame(matrix(ncol=(3 + length(top.gram1)), nrow=0, dimnames=list(NULL, c("year", "month", "eplist", top.gram1))), check.names = FALSE)
gram2 = data.frame(matrix(ncol=(3 + length(top.gram2)), nrow=0, dimnames=list(NULL, c("year", "month", "eplist", top.gram2))), check.names = FALSE)
gram3 = data.frame(matrix(ncol=(3 + length(top.gram3)), nrow=0, dimnames=list(NULL, c("year", "month", "eplist", top.gram3))), check.names = FALSE)
# Extract the month and year for each episode and write to a new column
eps$date_year = year(eps$date)
eps$date_month = month(eps$date)
# # FOR TESTING
# month = 1
# year = 2019
# Set up a month > year dataframe
for (year in unique( year(eps$date) ) ){
for (month in 1:12){
# Get all the podcasts in this month and year
mo.set = subset(subset(eps, date_year == year), date_month == month)
# If there's no podcast for this month and year, skip
if (nrow(mo.set) < 1) next
# Collect the transcripts for these episodes
mo.txt = paste(mo.set$txt, sep="", collapse=" ") # Space between pieces so words don't fuse across transcripts
# Tokenization
mo.gram1 = ngram(mo.txt, n = 1, sep = " ")
mo.gram2 = ngram(mo.txt, n = 2, sep = " ")
mo.gram3 = ngram(mo.txt, n = 3, sep = " ")
# Don't need the text anymore
rm(mo.txt)
# Frequency table generation
mo.gram1 = get.phrasetable(mo.gram1)
mo.gram2 = get.phrasetable(mo.gram2)
mo.gram3 = get.phrasetable(mo.gram3)
# Clean the monthly grams of extra whitespace
mo.gram1$ngrams = str_squish( mo.gram1$ngrams)
mo.gram2$ngrams = str_squish( mo.gram2$ngrams)
mo.gram3$ngrams = str_squish( mo.gram3$ngrams)
##################################################
## Write out the monthly ngram phrasetable data ##
##################################################
# New rows for gramNs
bottom1 = nrow(gram1) + 1
bottom2 = nrow(gram2) + 1
bottom3 = nrow(gram3) + 1
# Prepare the new row in the gram1 dataframe
gram1[bottom1,] <- rep(0, ncol(gram1))
gram1$year[bottom1] <- year
gram1$month[bottom1] <- month
gram1$eplist[bottom1] <- list(mo.set$n)
# Prepare the new row in the gram2 dataframe
gram2[bottom2,] <- rep(0, ncol(gram2))
gram2$year[bottom2] <- year
gram2$month[bottom2] <- month
gram2$eplist[bottom2] <- list(mo.set$n)
# Prepare the new row in the gram3 dataframe
gram3[bottom3,] <- rep(0, ncol(gram3))
gram3$year[bottom3] <- year
gram3$month[bottom3] <- month
gram3$eplist[bottom3] <- list(mo.set$n)
# Stage the loops for each gramN: what grams are available?
stage.gram1 = intersect(mo.gram1$ngrams, top.gram1)
stage.gram2 = intersect(mo.gram2$ngrams, top.gram2)
stage.gram3 = intersect(mo.gram3$ngrams, top.gram3)
# Populate gram1
for (g in stage.gram1){
gram1[bottom1, g] <- mo.gram1$freq[ mo.gram1$ngrams == g ]
}
# Garbage collection
rm(stage.gram1, bottom1, mo.gram1)
# Populate gram2
for (g in stage.gram2){
gram2[bottom2, g] <- mo.gram2$freq[ mo.gram2$ngrams == g ]
}
# Garbage collection
rm(stage.gram2, bottom2, mo.gram2)
# Populate gram3
for (g in stage.gram3){
gram3[bottom3, g] <- mo.gram3$freq[ mo.gram3$ngrams == g ]
}
# Garbage collection
rm(stage.gram3, bottom3, mo.gram3)
} # /month loop
} # /year loop
# Note: this takes quite a long time. The for loops could be rewritten into something
# faster, but since this only ever needs to run once I don't mind waiting, so I wrote a
# long comment instead of splitting the loop into parallelized foreach calls or the like.
# You could do something like that if it interests you (see the sketch below).
# Also note there are a couple of places where the data flow splits off so you can handle
# the text in a way more tailored to your application, such as filtering out stopwords if
# that's a priority for you. I did not do that here because some 'stopwords' are important
# in a multigram context. For example, in "Joe The Corpus Rogan", 'the' is not the same
# gram as in 'this is the end of the comment'.
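# A rough, untested sketch of how the month > year loop could be parallelized, assuming
# the 'foreach' and 'doParallel' packages are installed. Shown for gram1 only; the body
# mirrors the loop above and returns one row per month:
# library(foreach)
# library(doParallel)
# registerDoParallel(cores = 4)
# gram1 <- foreach(ym = split(eps, list(eps$date_year, eps$date_month), drop = TRUE),
#                  .combine = rbind, .packages = c("ngram", "stringr")) %dopar% {
#   mo.txt   = paste(ym$txt, sep = "", collapse = " ")
#   mo.gram1 = get.phrasetable(ngram(mo.txt, n = 1, sep = " "))
#   mo.gram1$ngrams = str_squish(mo.gram1$ngrams)
#   row  = setNames(data.frame(matrix(0, nrow = 1, ncol = length(top.gram1))), top.gram1)
#   hits = intersect(mo.gram1$ngrams, top.gram1)
#   row[1, hits] = mo.gram1$freq[match(hits, mo.gram1$ngrams)]
#   cbind(year = ym$date_year[1], month = ym$date_month[1], row)
# }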
# Write the mo.gram data
write_csv(gram1, "jtcr_unigram_mo.csv")
write_csv(gram2, "jtcr_bigram_mo.csv")
write_csv(gram3, "jtcr_trigram_mo.csv")
# Garbage collection
rm(top.gram1, top.gram2, top.gram3, g, cutoff, year, month, mo.set)
# That's all! Now we have the building blocks for basic text analysis, and some advanced stuff.
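# A minimal usage sketch (commented out; assumes the monthly CSVs above were written and
# that a column such as 'jiujitsu' happens to be among the top unigrams; adjust to taste):
# gram1.mo = read_csv("jtcr_unigram_mo.csv")
# plot(gram1.mo$year + (gram1.mo$month - 1) / 12, gram1.mo$jiujitsu, type = "l",
#      xlab = "Year", ylab = "Monthly count of 'jiujitsu'")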
# Done