forked from turnerdan/joethecorpusrogan
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtrim.R
124 lines (89 loc) · 3.96 KB
/
trim.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
### Joe The Corpus Rogan
### By Dan Turner
### TRANSCRIPT & AUDIO PAIRING SCRIPT
# This script takes transcripts with inline timestamps and creates CSVs that tell Praat where to align the text.
# Transcripts are read from /t-raw/, the podcast data frame is scrapelist.rds in the root, and the trim files are written to /podchunks/.
### Updated 1-18-2020
###########
## Setup ##
###########
# Load packages
library(rvest) # Web scraping
library(stringr) # String handling
library(readr) # Nice reading and writing
library(lubridate) # Time interval calculator
# Run everything relative to the project root
setwd("~/Git/joethecorpus")

# Load the episodes data frame produced by the scraper
eps <- readRDS("~/Git/joethecorpus/scrapelist.rds")

# Keep only the episodes flagged as having a transcript available.
# which() drops rows where t.avail is NA, matching what subset() does.
has.transcript <- which(eps$t.avail == TRUE)
eps <- eps[has.transcript, , drop = FALSE]
########################################
## Extract timestamps and transcripts ##
########################################
# Columns for the timestamps in the transcripts, and the text itself.
# t.span is a list column (one character vector of stamps per episode).
# t.txt starts out as character and is coerced to a list column the first
# time a split transcript is stored in it below.
eps$t.span <- list(NA)
eps$t.txt <- ""

# Inline timestamps look like HH:MM:SS. Defined once so that the extraction
# and the split below are guaranteed to use the same pattern.
stamp.pattern <- "[0-9][0-9]:[0-9][0-9]:[0-9][0-9]"

# First let's see what episodes we have transcripts for (filename sans .txt)
t.list <- list.files(path = "./t-raw/")
# Strip only a literal, trailing ".txt"; an unescaped "." would match any
# character and an unanchored pattern could eat text inside the name.
t.list.n <- as.double(sub("\\.txt$", "", t.list))

# Report whether every episode in eps has a transcript file on disk
if (length(setdiff(eps$n, t.list.n)) > 0) {
  print("Warning: Missing transcripts.")
} else {
  print("All available transcripts scraped.")
}

# Extract the timestamps by looping the transcripts.
# seq_along() makes this a no-op when /t-raw/ is empty (1:0 would not be).
for (tr in seq_along(t.list)) {
  # What is the corresponding row in eps for this podcast?
  tr.ep <- which(eps$n == t.list.n[tr])

  # Nothing to store for a transcript file with no matching episode
  if (length(tr.ep) == 0) {
    next
  }

  # Read the whole file in as a single string
  raw.text <- read_file(paste0("t-raw/", t.list[tr]))

  # Pull out every timestamp. regmatches() yields character(0) when the file
  # has no timestamps, instead of a bogus substring built from gregexpr's -1.
  stamps <- regmatches(raw.text, gregexpr(stamp.pattern, raw.text))[[1]]

  # Write the stamps to the data frame
  eps$t.span[tr.ep] <- list(stamps)

  # Split the transcript at the timestamps. Piece 1 is whatever precedes the
  # first stamp; piece i + 1 is the text that follows stamp i.
  t.split <- strsplit(raw.text, stamp.pattern)

  # Save the split transcript to the data frame
  eps$t.txt[tr.ep] <- t.split
}
#############################################
## Trim transcripts to timestamp intervals ##
#############################################
# Loop the episodes and write one CSV of intervals per episode to /podchunks/.
# Each row is one interval: its index, its cleaned text, and its left/right
# bounds in seconds. seq_len() makes this a no-op when eps has no rows.
for (transcript in seq_len(nrow(eps))) {

  # Load the episode number, the split transcript text, and its timestamps
  transcript.n <- eps$n[transcript]
  transcript.text <- eps$t.txt[[transcript]]
  transcript.cnt <- length(transcript.text)
  stamps <- eps$t.span[[transcript]]
  stamps <- stamps[!is.na(stamps)]

  # Episodes whose transcript was never loaded still hold the placeholders
  # (NA stamps, "" text), and a file without timestamps splits into a single
  # piece. Neither can be cut into intervals, so skip them rather than write
  # a malformed CSV.
  if (length(stamps) == 0 || transcript.cnt < 2) {
    message("Skipping episode ", transcript.n, ": no timestamped transcript.")
    next
  }
  n.intervals <- length(stamps)

  # Convert the interval starts from HMS to seconds
  transcript.start <- period_to_seconds(hms(stamps))

  # Each interval ends where the next one starts; the last end is filled in
  # from the episode duration once the frame is built.
  transcript.end <- c(transcript.start[-1], NA)

  # Clean the transcript of non-alphanumeric characters and spaces
  transcript.text <- gsub("[^[:alnum:][:space:]]", "", transcript.text)

  # Make it all lowercase
  transcript.text <- tolower(transcript.text)

  # Piece 1 precedes the first stamp, so interval i owns piece i + 1.
  # A transcript that ends on a timestamp has no final piece; use "" for it.
  interval.text <- transcript.text[seq_len(n.intervals) + 1]
  interval.text[is.na(interval.text)] <- ""

  # Create a temporary data frame with our interval number, its text, and its
  # bounds for Praat
  trim.frame <- data.frame(
    interval = seq_len(n.intervals),
    text = interval.text,
    leftbound = transcript.start,
    rightbound = transcript.end
  )

  # Peg the last time stamp at the end of the episode
  # NOTE(review): presumably eps$dur is the duration in seconds, matching the
  # other bounds -- confirm against the scraper.
  trim.frame$rightbound[nrow(trim.frame)] <- eps$dur[transcript]

  # Write the result to /podchunks/
  write_csv(trim.frame, paste0("podchunks/", transcript.n, ".csv"))
} #/transcript loop
# Done
# Next: Create TextGrids with the attached Praat script, podchunk.praat
# Then force-align the result with the Montreal Forced Aligner, using align.R