-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmain.py
121 lines (96 loc) · 3.88 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import json
import time
import requests
from selenium import webdriver
from bs4 import BeautifulSoup as bs
def get_video_links(channel_url, verbose, sleep_time):
videos_url = channel_url + '/videos'
# if you get error look at readme file for instructions
driver = webdriver.Firefox()
driver.get(videos_url)
time.sleep(sleep_time)
# scroll dow to the button of the page
if verbose:
print("Opening the channel in FireFox and scrolling to the bottom of the page ....")
while True:
old_height = driver.execute_script("return document.documentElement.scrollHeight")
driver.execute_script("window.scrollTo(0, " + str(old_height) + ");")
time.sleep(sleep_time)
new_height = driver.execute_script("return document.documentElement.scrollHeight")
if new_height == old_height:
break
# parse the html and get the links for the videos
soup = bs(driver.page_source, "html.parser")
video_tags = soup.findAll('a', attrs={'class': 'yt-simple-endpoint style-scope ytd-grid-video-renderer'})
if verbose:
print("##################")
print("Video links:")
links = []
for tag in video_tags:
if 'href' in tag.attrs:
links.append(tag.attrs['href'])
if verbose:
print(tag.attrs['href'])
return links
def get_video_info(video_url):
""" gets a youtube video url and returns its info in a json format"""
information = {'url': video_url}
response = requests.get(video_url)
soup = bs(response.content, "html.parser")
description_tag = soup.find('p', id='eow-description')
if description_tag:
information['description'] = description_tag.text
else:
information['description'] = ''
return information
def get_article_title(description):
""" gets a description of a video in 2 min paper channel and returns the title of article"""
keyword = 'The paper "'
try:
if keyword in description:
description = description[description.index(keyword) + len(keyword):]
title = description[0:description.index('"')]
else:
title = 'unknown'
except:
title = 'unknown'
return title
def crawl_youtube_channel(channel_url, verbose=False, sleep_time=3, links_path=None):
""" gets a Youtube channel url and returns a dictionary containing info about the videos"""
if links_path:
links = []
links_file = open(links_path, "r")
lines = links_file.readlines()
for line in lines:
links.append(line)
links_file.close()
else:
links = get_video_links(channel_url, verbose=verbose, sleep_time=sleep_time)
result = {}
video_ID = 1
unknowns = 0
counter = 1
for link in links:
information = get_video_info("https://www.youtube.com/" + link)
information['article'] = get_article_title(information['description'])
del information['description']
if information['article'] != 'unknown':
result[video_ID] = information
video_ID += 1
if verbose:
print('>>> processing video : ' + str(counter) + ' success')
else:
if verbose:
print('>>> processing video : ' + str(counter) + ' Failed')
unknowns += 1
counter += 1
print('$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$')
print(str(unknowns) + ' failure')
print(str(len(links) - unknowns) + ' out of ' + str(len(links)) + ' success')
return result
if __name__ == '__main__':
# provide the youtube channel url here
youtube_url = 'https://www.youtube.com/channel/UCbfYPyITQ-7l4upoX8nvctg'
data = crawl_youtube_channel(youtube_url, verbose=True, links_path='Data/Two-min-papers_video-links.txt')
with open('Data/' + youtube_url.split('/')[-1] + '.txt', 'w') as outfile:
json.dump(data, outfile, indent=4)