# DownloadCommentsBySubreddit.py
# https://pushshift.io/api-parameters/
# https://www.reddit.com/r/pushshift/comments/8h31ei/documentation_pushshift_api_v40_partial/
# all the replies to a specific comment
# https://www.reddit.com/r/pushshift/comments/a7zzhn/using_parent_id_to_search_for_comments/
import pandas as pd
from psaw import PushshiftAPI
import requests
import os
import praw
import sys
import datetime
from functions import CreateLinkPushshift
###############################################################################
# Reddit API credentials used to build the PRAW client.
# Each value can be overridden through an environment variable; the literals
# below are only fallbacks so the script keeps working unchanged.
# NOTE(security): the fallback id/secret are committed to source control.
# They should be rotated and the fallbacks removed once the environment
# variables (REDDIT_CLIENT_ID / REDDIT_CLIENT_SECRET / REDDIT_USER_AGENT)
# are set wherever this script runs.
my_client_id = os.environ.get('REDDIT_CLIENT_ID', '74CBTlXRo31zAg')
my_client_secret = os.environ.get('REDDIT_CLIENT_SECRET',
                                  'qvpDreHLSY2-QicrieFmc_R-NfM')
my_user_agent = os.environ.get('REDDIT_USER_AGENT', 'newVisualization')

reddit = praw.Reddit(client_id=my_client_id,
                     client_secret=my_client_secret,
                     user_agent=my_user_agent)

# Pushshift client built on top of the PRAW session. It is not referenced by
# the rest of the visible script, which queries Pushshift through `requests`.
api = PushshiftAPI(reddit)
# Output directory layout: every folder is nested inside the previous one.
parent_folder = '/home/jpre/Documents/DTU/COVIDpolitics2021/COVIDvsPOLITICS/data/'
data_folder = f'{parent_folder}data/'
posts_folder = f'{data_folder}posts/'
comment_folder = f'{posts_folder}comments/'

# Maps a search type to the key ('body' or 'title') that is looked up in every
# returned element to obtain its text.
BODYTITLE_DICT = dict(comment='body', submission='title')
# Time span to scan: 2019-01-01 00:00:00 through 2019-12-31 23:59:59.
# To inspect an epoch value by hand: datetime.datetime.fromtimestamp(1582260266)
# NOTE(review): both endpoints are naive datetimes, so .timestamp() interprets
# them in the local time zone of the machine running the script -- confirm
# whether UTC boundaries were intended.
first_t = datetime.datetime(2019, 1, 1, 0, 0, 0)
last_t = datetime.datetime(2019, 12, 31, 23, 59, 59)

# Width of one query window, in seconds (4 hours).
delta_t = 4 * 60 * 60

# Epoch second at which each successive window starts.
range_of_t = range(
    int(first_t.timestamp()),
    int(last_t.timestamp()),
    delta_t,
)
###############################################################################
# Search configuration.
# Kind of search to run; the main options are: comment, submission, subreddit.
type_of_search = 'comment'

# Query parameters sent with every request; filled in by the download loop.
# Parameter reference: https://pushshift.io/api-parameters/
filters = dict()

# Metadata keys read from every returned element, in the order in which they
# are written (tab-separated) to the metadata file.
fields = [
    'created_utc',
    'retrieved_on',
    'author',
    'subreddit',
    'score',
    'id',
    'parent_id',
    'link_id',
    'author_flair_text',
    'author_flair_type',
    'total_awards_received',
]

# Key holding the element's text for the chosen search type.
b_t = BODYTITLE_DICT[type_of_search]
########################################
# Set to True to write the output into a 'test/' subfolder instead of the
# regular output folder.
istest = False
########################################

# Marker written after the text of every element in the data file.
# Raw string: the backslashes are literal characters (same bytes as before,
# without relying on the invalid escape sequences '\y' and '\_').
END_OF_ELEMENT = r'_/zvzvzvzv/EndOfElement\yxyxyxy\_'

# Seconds to wait for Pushshift before giving up on one time window, so a
# stalled connection cannot hang the whole run.
REQUEST_TIMEOUT = 60

# Subreddits to download; each one gets its own output folder.
relevant_subreddits = ['republican']

for foldername in relevant_subreddits:
    filters['subreddit'] = foldername

    # Folder where the data of this subreddit is stored.
    file_folder = comment_folder + foldername + '2019/'
    if istest:
        file_folder = file_folder + 'test/'
    os.makedirs(file_folder, exist_ok=True)

    # data1.txt holds the element texts, metadata1.txt one tab-separated line
    # of `fields` per element. The `with` block closes both files even if the
    # run is interrupted; UTF-8 is forced so non-ASCII text can always be
    # written regardless of the system locale.
    with open(file_folder + 'data1' + '.txt', 'w',
              encoding='utf-8') as file_elements, \
            open(file_folder + 'metadata1' + '.txt', 'w',
                 encoding='utf-8') as file_metadata_elements:
        for day_i in range_of_t:
            # Query window [day_i, day_i + delta_t - 1], in epoch seconds.
            # To find comments from a specific post instead:
            #     filters['link_id'] = post_id
            filters['after'] = day_i
            filters['before'] = filters['after'] + delta_t - 1

            # NOTE(review): N is presumably the maximum number of results per
            # request; if so, windows with more than 100 elements are
            # truncated -- confirm against functions.CreateLinkPushshift.
            search_link = CreateLinkPushshift(type_of_search,
                                              search_criteria=filters,
                                              fields_to_retrieve=fields + [b_t],
                                              N=100)

            # Example of a resulting link:
            # "https://api.pushshift.io/reddit/comment/search/?link_id=bc99el"
            try:
                # Get the data from Pushshift as JSON.
                retrieved_data = requests.get(search_link,
                                              timeout=REQUEST_TIMEOUT)
                retrieved_data.raise_for_status()
                returned_elements = retrieved_data.json()['data']
            except (requests.RequestException, ValueError,
                    KeyError, TypeError) as err:
                # Best effort: report the failed window and move on, instead
                # of silently swallowing every possible error.
                print('Skipping window starting at {0}: {1!r}'.format(day_i, err),
                      file=sys.stderr)
                continue

            for element in returned_elements:
                # element is a dictionary
                try:
                    element_text = element[b_t] + END_OF_ELEMENT
                    element_info = '\t'.join(
                        [str(element[field]) for field in fields]) + '\n'
                except (KeyError, TypeError) as err:
                    # Skip only the malformed element (nothing has been
                    # written for it yet, so both files stay aligned) and keep
                    # the rest of the window.
                    print('Skipping element in window {0}: {1!r}'.format(day_i, err),
                          file=sys.stderr)
                    continue
                file_elements.write(element_text)
                file_metadata_elements.write(element_info)