forked from foursking1/jd
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtrain.py
82 lines (69 loc) · 3.15 KB
/
train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
__author__ = 'foursking'
from gen_feat import make_train_set
from gen_feat import make_test_set
from sklearn.model_selection import train_test_split
import xgboost as xgb
from gen_feat import report
def xgboost_make_submission():
train_start_date = '2016-03-10'
train_end_date = '2016-04-11'
test_start_date = '2016-04-11'
test_end_date = '2016-04-16'
sub_start_date = '2016-03-15'
sub_end_date = '2016-04-16'
user_index, training_data, label = make_train_set(train_start_date, train_end_date, test_start_date, test_end_date)
X_train, X_test, y_train, y_test = train_test_split(training_data.values, label.values, test_size=0.2, random_state=0)
dtrain=xgb.DMatrix(X_train, label=y_train)
dtest=xgb.DMatrix(X_test, label=y_test)
param = {'learning_rate' : 0.1, 'n_estimators': 1000, 'max_depth': 3,
'min_child_weight': 5, 'gamma': 0, 'subsample': 1.0, 'colsample_bytree': 0.8,
'scale_pos_weight': 1, 'eta': 0.05, 'silent': 1, 'objective': 'binary:logistic'}
num_round = 283
param['nthread'] = 4
#param['eval_metric'] = "auc"
plst = param.items()
plst += [('eval_metric', 'logloss')]
evallist = [(dtest, 'eval'), (dtrain, 'train')]
bst=xgb.train(plst, dtrain, num_round, evallist)
sub_user_index, sub_trainning_data = make_test_set(sub_start_date, sub_end_date,)
sub_trainning_data = xgb.DMatrix(sub_trainning_data.values)
y = bst.predict(sub_trainning_data)
sub_user_index['label'] = y
pred = sub_user_index[sub_user_index['label'] >= 0.03]
pred = pred[['user_id', 'sku_id']]
pred = pred.groupby('user_id').first().reset_index()
pred['user_id'] = pred['user_id'].astype(int)
pred.to_csv('./sub/submission.csv', index=False, index_label=False)
def xgboost_cv():
train_start_date = '2016-03-05'
train_end_date = '2016-04-06'
test_start_date = '2016-04-11'
test_end_date = '2016-04-16'
sub_start_date = '2016-02-05'
sub_end_date = '2016-03-05'
sub_test_start_date = '2016-03-05'
sub_test_end_date = '2016-03-10'
user_index, training_data, label = make_train_set(train_start_date, train_end_date, test_start_date, test_end_date)
X_train, X_test, y_train, y_test = train_test_split(training_data, label, test_size=0.2, random_state=0)
dtrain=xgb.DMatrix(X_train, label=y_train)
dtest=xgb.DMatrix(X_test, label=y_test)
param = {'max_depth': 10, 'eta': 0.05, 'silent': 1, 'objective': 'binary:logistic'}
num_round = 4000
param['nthread'] = 4
param['eval_metric'] = "auc"
plst = param.items()
plst += [('eval_metric', 'logloss')]
evallist = [(dtest, 'eval'), (dtrain, 'train')]
bst=xgb.train( plst, dtrain, num_round, evallist)
sub_user_index, sub_trainning_date, sub_label = make_train_set(sub_start_date, sub_end_date,
sub_test_start_date, sub_test_end_date)
test = xgb.DMatrix(sub_trainning_date)
#y = bst.predict(test)
pred = sub_user_index.copy()
y_true = sub_user_index.copy()
pred['label'] = y
y_true['label'] = label
report(pred, y_true)
if __name__ == '__main__':
#xgboost_cv()
xgboost_make_submission()