-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathlessons.Rmd
176 lines (125 loc) · 7.15 KB
/
lessons.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
---
title: "R Notebook"
output: html_notebook
---
```{r lesson 1}
# Lesson 1: basic objects (scalars, vectors, data frames) and two ways of
# splitting "|"-delimited cells into separate values.

# install.packages("splitstackshape")  # run once if the package is missing
library(splitstackshape) # cSplit()
library(dplyr)           # %>% and select()
library(stringr)         # str_replace(), str_extract()

# Single values and a character vector.
person1 <- "Gabina"
person2 <- "Ania"
persons <- c("Gabina", "Ania")

# A data frame whose cells each hold two "|"-separated values.
persons <- data.frame(
  name = c("Gabina|Vojta", "Czarek|Ania"),
  gender = c("female|male", "male|female")
)

# Column lengths must be compatible: a length-1 column is recycled, but a
# length-3 column next to a length-2 column makes data.frame() fail, so
# `test` has exactly one value per row.
persons3 <- data.frame(
  name = c("Gabina", "Ania"),
  gender = "female|female",
  test = c("test1", "test2")
)

# Read the CSV, keep its first two columns and split both of them on "|"
# into long format.
persons <- read.csv2(
  "C:/Users/Cezary/Desktop/names.csv",
  encoding = "UTF-8",
  header = TRUE,
  stringsAsFactors = FALSE
) %>%
  select(1, 2) %>%
  cSplit(., c("name", "gender"), sep = "|", direction = "long")

# The same split done by hand with a regular expression: group 1 is the text
# before the last "|", group 3 the text after it. Values without a "|" do not
# match and are returned unchanged.
split_pattern <- "(^.*)(\\|)(.*$)"
name1 <- str_replace(persons$name, split_pattern, "\\1")
name2 <- str_replace(persons$name, split_pattern, "\\3")
gender1 <- str_replace(persons$gender, split_pattern, "\\1")
gender2 <- str_replace(persons$gender, split_pattern, "\\3")
persons2 <- data.frame(
  name = c(name1, name2),
  gender = c(gender1, gender2)
)

test1 <- persons %>%
  cSplit(., c("name", "gender"), sep = "|", direction = "long")

# Greedy match of the whole string; the look-ahead can match the empty rest.
name <- persons$name
name <- str_extract(name, "(.*)(?=.*$)")

# `plist` is not created anywhere in this chunk, so only split it when an
# earlier session step has already defined it.
if (exists("plist")) {
  plist <- cSplit(plist, "name_single", sep = "|", direction = "long")
}

number1 <- 1

# Convert the table into a plain list of its columns.
persons <- as.list(persons)
```
```{r lesson 2}
# Load every package used by the remaining chunks of this notebook.
pacman::p_load(
  googlesheets, zoo, openxlsx, stringr, splitstackshape, plyr, dplyr,
  stringdist, fuzzyjoin, data.table, svMisc, tidyverse, rvest, RSelenium,
  RJDBC, rjson, jsonlite, sqldf, XML, methods
)

# Zero-row table; the download loop appends its results to it.
json_file <- data.frame(nlp_json = character(), stringsAsFactors = FALSE)

# Start a Selenium-driven Chrome session and keep a handle to its client.
# Maintenance commands kept for reference:
#   binman::rm_platform("phantomjs")
#   wdman::selenium(retcommand = TRUE)
#   binman::list_versions("chromedriver")
rD <- rsDriver(
  port = 4444L,
  browser = "chrome",
  chromever = "78.0.3904.11"
)
remDr <- rD[["client"]]
### Seifert ###
library(jsonlite)
library(tidyverse)

# data.bn.org.pl queries for "seifert jaroslav": first as author, second as
# subject, each with limit=100.
url <- c(
  "https://data.bn.org.pl/api/bibs.json?author=seifert+jaroslav&limit=100",
  "https://data.bn.org.pl/api/bibs.json?limit=100&subject=seifert+jaroslav"
)

# Keep the `bibs` table of each response without its 28th column.
# NOTE(review): presumably column 28 is a nested column that cannot be
# written to CSV - confirm against the API response.
seifert_author <- jsonlite::fromJSON(url[1]) %>%
  .$bibs %>%
  select(-28)
seifert_subject <- jsonlite::fromJSON(url[2]) %>%
  .$bibs %>%
  select(-28)

write.csv2(
  seifert_author,
  file = "C:/Users/Cezary/Desktop/seifert_author.csv",
  row.names = FALSE, na = "", fileEncoding = "UTF-8"
)
write.csv2(
  seifert_subject,
  file = "C:/Users/Cezary/Desktop/seifert_subject.csv",
  row.names = FALSE, na = "", fileEncoding = "UTF-8"
)

# Download the authority record once and reuse it for every field below
# (the record was previously requested three times).
authority_fields <- jsonlite::fromJSON(
  "https://data.bn.org.pl/api/authorities.json?id=1275862"
) %>%
  .$authorities %>%
  .$marc %>%
  .$fields %>%
  `[[`(1)

# Flatten a nested subfield structure into one "|"-separated string, drop
# the "|NA" placeholders and re-attach a following "(<digits>" group to the
# preceding value with a space instead of "|".
collapse_subfields <- function(subfields) {
  collapsed <- paste(unlist(subfields), collapse = "|")
  collapsed <- str_remove_all(collapsed, "\\|NA")
  str_replace_all(collapsed, "(\\|)(\\(\\d+)", " \\2")
}

# NOTE(review): fields 100 and 400 presumably hold the main and variant name
# headings, and 024 the external identifier - confirm against the MARC spec.
name <- paste(
  collapse_subfields(authority_fields$`100`[[3]]),
  collapse_subfields(authority_fields$`400`[[3]]),
  sep = "|"
)
viaf <- str_remove_all(
  paste(unlist(authority_fields$`024`$subfields), collapse = "|"),
  "(\\|NA)|(\\|viaf)"
)
# --------------------------------------------------------------
# Open every address in `url` in the Selenium-driven browser, read the raw
# JSON text shown on the page and collect one row per address.
# (The separator above is a comment: a bare run of dashes is parsed as unary
# minus operators applied to the next statement and fails at run time.)
pages <- vector("list", length(url))
for (i in seq_along(url)) {
  progress(i, max.value = length(url))
  remDr$navigate(url[i])
  webElem <- remDr$findElement(
    using = "css selector",
    "body:nth-child(2) > pre:nth-child(1)"
  )
  # getElementText() returns a list; flatten it to a character value.
  pages[[i]] <- data.frame(
    i = i,
    json_text = as.character(webElem$getElementText()),
    stringsAsFactors = FALSE
  )
}

# Append all downloaded pages at once; the zero-row placeholder table is
# dropped by rbind(), so its different column name does not matter.
json_file <- rbind(json_file, do.call(rbind, pages))

# Parse the first downloaded page and keep its `bibs` element. The call is
# namespace-qualified because rjson and jsonlite both export fromJSON().
nlp_json <- jsonlite::fromJSON(as.character(json_file$json_text)[1])
nlp_json <- nlp_json$bibs
####
# Scratch experiments: different ways of flattening the parsed `nlp_json`
# object into a data frame.
test <- nlp_json[1]
names(test)
df <- data.frame(matrix(unlist(test), nrow = length(test), byrow = TRUE))

# Template line that expects a named list `l`, which this chunk never
# creates; it is only evaluated when such an object already exists.
if (exists("l")) {
  data.frame(id = names(l), nobs = unlist(l))
}

df <- data.frame(
  matrix(unlist(nlp_json[1]), nrow = length(nlp_json[2]), byrow = TRUE)
)
df <- as.data.frame(as.matrix(nlp_json))
onelist <- data.frame(list(df$V1))

# An unfinished `nlp_json2 <- nlp_json$` line used to precede this statement
# and merged with it into one chained assignment; it has been removed.
nlp_json2 <- data.frame(nlp_json$bibs)

################
# Pad character vectors of unequal length with NA so that they fit into one
# matrix with a row per vector.
word.list <- list(letters[1:4], letters[1:5], letters[1:2], letters[1:6])
n.obs <- lengths(word.list)
seq.max <- seq_len(max(n.obs))
# Indexing past the end of a vector yields NA, which provides the padding.
mat <- t(vapply(word.list, "[", character(length(seq.max)), i = seq.max))

####################
test <- data.frame(nlp_json[1])
```
```{r}
# Distinct ids from the "pbl_bn" worksheet of the "mapowanie_osob_bn_pbl"
# Google Sheet.
pbl_bn_viaf <- gs_read(gs_title("mapowanie_osob_bn_pbl"), ws = "pbl_bn") %>%
  select(id) %>%
  unique()

# Flatten a nested subfield structure into one "|"-separated string, drop
# the "|NA" placeholders and re-attach a following "(<digits>" group to the
# preceding value with a space instead of "|".
collapse_subfields <- function(subfields) {
  collapsed <- paste(unlist(subfields), collapse = "|")
  collapsed <- str_remove_all(collapsed, "\\|NA")
  str_replace_all(collapsed, "(\\|)(\\(\\d+)", " \\2")
}

# Look up the authority record of every id and collect its name forms and
# VIAF value. The request now uses the id of the current row; previously each
# iteration queried the fixed record 1275862, so all rows got the same data.
# NOTE(review): assumes the sheet ids are valid values for the `id` query
# parameter of authorities.json - confirm.
ids <- pbl_bn_viaf$id
rows <- vector("list", length(ids))
for (i in seq_along(ids)) {
  progress(i, max.value = length(ids))
  record <- tryCatch(
    {
      # format() keeps large numeric ids out of scientific notation.
      record_url <- paste0(
        "https://data.bn.org.pl/api/authorities.json?id=",
        format(ids[i], scientific = FALSE, trim = TRUE)
      )
      # One request per id; all fields are read from the same response.
      fields <- jsonlite::fromJSON(record_url) %>%
        .$authorities %>%
        .$marc %>%
        .$fields %>%
        `[[`(1)
      list(
        name = paste(
          collapse_subfields(fields$`100`[[3]]),
          collapse_subfields(fields$`400`[[3]]),
          sep = "|"
        ),
        viaf = str_remove_all(
          paste(unlist(fields$`024`$subfields), collapse = "|"),
          "(\\|NA)|(\\|viaf)"
        )
      )
    },
    # Any failure (network error, missing record or field) marks both values
    # as unavailable, exactly as before, without assigning to globals.
    error = function(e) {
      list(name = "brak danych (CR)", viaf = "brak danych (CR)")
    }
  )
  rows[[i]] <- data.frame(
    id = ids[i],
    name = record$name,
    viaf = record$viaf,
    stringsAsFactors = FALSE
  )
}

# Bind all rows once instead of growing the table inside the loop.
pbl_viaf <- if (length(rows) > 0) {
  do.call(rbind, rows)
} else {
  data.frame(stringsAsFactors = FALSE)
}

write.csv2(
  pbl_viaf,
  file = "C:/Users/Cezary/Desktop/pbl_bn_viaf.csv",
  row.names = FALSE, na = "", fileEncoding = "UTF-8"
)
# Inspect the subfields of field 024 of one authority record.
a <- jsonlite::fromJSON(
  "https://data.bn.org.pl/api/authorities.json?id=1275862"
) %>%
  .$authorities %>%
  .$marc %>%
  .$fields %>%
  `[[`(1) %>%
  .$`024` %>%
  .$subfields
# `sep` has no effect when paste() receives a single vector; `collapse` is
# what joins its elements into one "|"-separated string.
paste(unlist(a), collapse = "|")
```
```{r Seifert}
### Seifert ###
library(jsonlite)
library(tidyverse)

# The two data.bn.org.pl queries: "seifert jaroslav" as author and as subject.
url <- c(
  "https://data.bn.org.pl/api/bibs.json?author=seifert+jaroslav&limit=100",
  "https://data.bn.org.pl/api/bibs.json?limit=100&subject=seifert+jaroslav"
)

# Take the `bibs` table of each response and drop its 28th column.
seifert_author <- select(jsonlite::fromJSON(url[1])$bibs, -28)
seifert_subject <- select(jsonlite::fromJSON(url[2])$bibs, -28)

# Save both tables as semicolon-separated CSV files.
# "[nazwa użytkownika]" is Polish for "[user name]"; it looks like a
# placeholder to be replaced with the local Windows account name.
write.csv2(
  seifert_author,
  file = "C:/Users/[nazwa użytkownika]/Desktop/seifert_author.csv",
  row.names = FALSE,
  na = "",
  fileEncoding = "UTF-8"
)
write.csv2(
  seifert_subject,
  file = "C:/Users/[nazwa użytkownika]/Desktop/seifert_subject.csv",
  row.names = FALSE,
  na = "",
  fileEncoding = "UTF-8"
)
```
```{r excercise}
# Write the exercise dataset to disk.
# NOTE(review): `data1` is not created in this chunk; it must already exist
# in the session.
write.csv2(
  data1,
  file = "C:/Users/Cezary/Desktop/exercise.csv",
  row.names = FALSE,
  na = "",
  fileEncoding = "UTF-8"
)

# Exercise:
# 1. Load the CSV file into RStudio.
# 2. Select only these columns from the dataset: id, 008, 100, 245.
# 3. Extract the year of publication from column 008 into a new variable.
# 4. Filter the dataset for 2009 only.
# 5. Save the data frame as a CSV file on your local disk.

# Load the names table used in the earlier lesson.
persons <- read.csv2(
  "C:/Users/Cezary/Desktop/names.csv",
  encoding = "UTF-8",
  header = TRUE,
  stringsAsFactors = FALSE
)
```