-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_utils.py
125 lines (103 loc) · 4.56 KB
/
data_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import requests
import os
from zipfile import ZipFile
from io import BytesIO
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset
class Download_read_csv():
    """Optionally download a MovieLens archive and load its ratings table.

    On construction the archive is downloaded and extracted when ``download``
    is truthy, then ``<root>/<filename>/ratings.csv`` is read into
    ``self.ratings`` with the ``timestamp`` column removed.

    Args:
        root: Directory the archive is saved to and extracted into.
        filename: Archive base name; also the sub-directory of ``root`` that
            is expected to contain ``ratings.csv`` after extraction.
        filetype: Archive suffix appended to ``filename`` to build the URL
            (presumably ``".zip"`` -- the payload is opened with ``ZipFile``).
        download: When truthy, fetch and extract the archive before reading.
    """

    def __init__(self, root, filename, filetype, download):
        self.root = root
        self.filename = filename
        self.filetype = filetype
        self.download = download
        if self.download:
            self.download_movielens()
        # The ratings are read from disk whether or not a download happened.
        self.ratings = self.read_ratings_csv()

    def download_movielens(self) -> None:
        """Fetch the archive from grouplens.org, save it and extract it.

        Raises:
            requests.HTTPError: If the server answers with an error status.
                Checked before anything is written, so an error page is never
                saved as the archive.
        """
        archive_name = self.filename + self.filetype
        url = "http://files.grouplens.org/datasets/movielens/" + archive_name

        # A timeout keeps an unresponsive server from blocking forever.
        req = requests.get(url, timeout=60)
        req.raise_for_status()

        # Create the target directory on first use.
        os.makedirs(self.root, exist_ok=True)
        with open(os.path.join(self.root, archive_name), 'wb') as output_file:
            output_file.write(req.content)

        # Extract straight from the in-memory payload; the context manager
        # closes the archive handle afterwards.
        with ZipFile(BytesIO(req.content)) as archive:
            archive.extractall(path=self.root)
        print('Dataset Download Complete')

    def read_ratings_csv(self):
        """Return ``ratings.csv`` as a DataFrame without its ``timestamp`` column."""
        csv_path = os.path.join(self.root, self.filename, 'ratings.csv')
        ratings = pd.read_csv(csv_path)
        return ratings.drop(columns="timestamp")

    def data_processing(self, random_state=None):
        """Split the ratings leave-one-out per user and binarise them.

        One randomly chosen rating per user becomes the test set; every other
        row stays in the training set. All ``rating`` values are then set to 1
        (explicit feedback -> implicit feedback). ``self.ratings`` itself is
        left unmodified.

        Args:
            random_state: Optional seed forwarded to ``DataFrame.sample`` so
                the split can be reproduced. ``None`` keeps it random.

        Returns:
            Tuple ``(train_ratings, test_ratings)`` of DataFrames.
        """
        # Shuffle, then keep the first row seen for each user.
        shuffled = self.ratings.sample(frac=1, random_state=random_state)
        test_ratings = shuffled.drop_duplicates(['userId']).copy()

        # Remove exactly the held-out rows by index label. Assumes the index
        # of self.ratings is unique, as produced by read_csv.
        train_ratings = self.ratings.drop(index=test_ratings.index).copy()

        # Both frames are explicit copies, so these assignments cannot touch
        # self.ratings and need no warning suppression.
        train_ratings.loc[:, 'rating'] = 1
        test_ratings.loc[:, 'rating'] = 1
        return train_ratings, test_ratings
class MovieLens(Dataset):
    """Dataset of (user, item, label) triples with sampled negative feedback.

    Every (userId, movieId) pair in ``ratings`` becomes a positive sample
    (label 1) and is followed by up to ``ng_num`` randomly drawn items the
    user never interacted with in ``total_ratings`` (label 0).

    Args:
        total_ratings: DataFrame with ``userId`` and ``movieId`` columns
            holding every known interaction; defines the item universe and
            which items count as "seen" for a user.
        ratings: DataFrame with ``userId`` and ``movieId`` columns holding
            the positive interactions to emit.
        ng_num: Number of negative samples to draw per positive sample.
    """

    def __init__(self, total_ratings, ratings, ng_num):
        super().__init__()
        self.total_ratings = total_ratings
        self.ratings = ratings
        self.ng_num = ng_num

        self.num_users, self.num_items = self.get_num()
        self.all_movieIds = self.get_allmovieIds()
        self.users, self.items, self.labels = self.negative_feedback_augmentation()

    def __len__(self):
        """Return the number of samples (positives plus negatives)."""
        return len(self.users)

    def __getitem__(self, index):
        """Return the ``(user, item, label)`` sample at ``index``."""
        return self.users[index], self.items[index], self.labels[index]

    def get_num(self):
        """Return ``(max userId + 1, max movieId + 1)`` over ``total_ratings``."""
        num_users = self.total_ratings['userId'].max() + 1
        num_items = self.total_ratings['movieId'].max() + 1
        return num_users, num_items

    def get_allmovieIds(self):
        """Return the distinct movie ids present in ``total_ratings``."""
        return self.total_ratings['movieId'].unique()

    def negative_feedback_augmentation(self):
        """Build the sample tensors, adding sampled negatives per positive.

        ratings.csv holds explicit feedback. Following the NCF paper, items a
        user has not interacted with are assumed to be negative feedback
        (label 0), while every existing interaction is labelled 1.

        For each positive pair, ``ng_num`` distinct unseen items are drawn by
        rejection sampling. If the user has fewer than ``ng_num`` unseen
        items, only the available ones are drawn, so the loop always ends.

        Returns:
            Tuple ``(users, items, labels)`` of equally long 1-D tensors.
        """
        users, items, labels = [], [], []
        user_item_set = set(zip(self.ratings['userId'], self.ratings['movieId']))
        total_user_item_set = set(zip(self.total_ratings['userId'], self.total_ratings['movieId']))

        # Group the seen items per user once, so each rejection test is a set
        # lookup and the number of unseen items is known up front.
        seen_by_user = {}
        for user, item in total_user_item_set:
            seen_by_user.setdefault(user, set()).add(item)
        num_candidates = len(self.all_movieIds)

        for user, item in user_item_set:
            # Positive instance.
            users.append(user)
            items.append(item)
            labels.append(1)

            seen = seen_by_user.get(user, ())
            # Never ask for more negatives than there are unseen items.
            num_negatives = min(self.ng_num, num_candidates - len(seen))

            # Negative instances: redraw while the candidate was seen by the
            # user or already drawn for this positive.
            drawn = set()
            while len(drawn) < num_negatives:
                candidate = np.random.choice(self.all_movieIds)
                if candidate in seen or candidate in drawn:
                    continue
                drawn.add(candidate)
                users.append(user)
                items.append(candidate)
                labels.append(0)

        return torch.tensor(users), torch.tensor(items), torch.tensor(labels)