-
Notifications
You must be signed in to change notification settings - Fork 17
/
Copy pathreader.py
39 lines (31 loc) · 1.14 KB
/
reader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import pandas as pd
def get_comments(train_path='train.csv', test_path='test_cleaned.csv'):
    """Load the toxic-comment train and test sets from CSV files.

    Parameters
    ----------
    train_path : str or path-like, optional
        Location of the training CSV. Defaults to ``'train.csv'``, which is
        resolved against the current working directory.
    test_path : str or path-like, optional
        Location of the test CSV. Defaults to ``'test_cleaned.csv'``, which
        is resolved against the current working directory.

    Returns
    -------
    x_train : pandas.Series
        The ``comment_text`` column of the training file, exactly as read
        (no cleaning or tokenizing is done here).
    y_train : pandas.DataFrame
        The label columns of the training file, one column per category:
        toxic, severe_toxic, obscene, threat, insult, identity_hate.
        Values are expected to be 0 or 1; this function does not check them.
    x_test : pandas.Series
        The ``comment_text`` column of the test file, exactly as read.
    y_test : pandas.DataFrame
        The same label columns taken from the test file.

    Raises
    ------
    FileNotFoundError
        If either CSV file does not exist.
    KeyError
        If a file lacks ``comment_text`` or any of the label columns.

    Examples
    --------
    >>> from reader import get_comments
    >>> x_train, y_train, x_test, y_test = get_comments()
    >>> x_train, y_train, x_test, y_test = get_comments(
    ...     'data/train.csv', 'data/test_cleaned.csv')

    Notes
    -----
    No splitting happens here: the train/test partition is whatever the two
    input files contain. The original note says the data is split as
    described in the original Kaggle competition.
    """
    label_columns = [
        "toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate",
    ]

    train_frame = pd.read_csv(train_path)
    test_frame = pd.read_csv(test_path)

    x_train = train_frame['comment_text']
    y_train = train_frame[label_columns]
    x_test = test_frame['comment_text']
    y_test = test_frame[label_columns]
    return x_train, y_train, x_test, y_test
if __name__ == "__main__":
    # Smoke check when run as a script: load both sets and preview the first
    # rows of each, in the order get_comments() returns them.
    x_train, y_train, x_test, y_test = get_comments()
    for preview in (x_train, y_train, x_test, y_test):
        print(preview.head())