# -*- coding: utf-8 -*-
"""
Created on Tue Aug 1 17:02:01 2017
@author: Cynthia Long
"""
'''
#Python Project Template
# 1. Prepare Problem
# a) Load libraries
# b) Load dataset
# 2. Summarize Data
# a) Descriptive statistics
# b) Data visualizations--8.3
# 3. Prepare Data
# a) Data Cleaning --8.4
# b) Feature Selection --8.9
# c) Data Transforms --8.11
# 4. Evaluate Algorithms
# a) Split-out validation dataset
# b) Test options and evaluation metric
# c) Spot Check Algorithms
# d) Compare Algorithms
# 5. Improve Accuracy
# a) Algorithm Tuning
# b) Ensembles
# 6. Finalize Model
# a) Predictions on validation dataset
# b) Create standalone model on entire training dataset
# c) Save model for later use'''
'''
Data Introduction**************************************************************
'''
#This dataset contains anonymized features pertaining to a time-varying value for a financial instrument.
#Each instrument has an id.
#Time is represented by the 'timestamp' feature and the variable to predict is 'y'.
#No further information will be provided on the meaning of the features, the transformations that were applied to them, the timescale, or the type of instruments that are included in the data. Moreover, in accordance with competition rules, participants must not use data other than the data linked from the competition website for the purpose of use in this competition to develop and test their models and submissions.
#In addition to the data, you will need to familiarize yourself with the Kernels environment and the competition data API. The API is designed to prevent accessing data beyond the timestamp for which you are predicting and informs you which ids require predictions at which timestamps. The API also provides a "reward" for each timestamp, in the form of an average R value over the predicted values for the previous day. You may choose to use this reward to do reinforcement-style learning. Your code should expect and handle missing values.
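# A minimal, hedged sketch of how the Kernels API loop described above might look when run
# inside the competition environment (the kagglegym module and the observation/target names
# are assumptions drawn from that environment; this offline script works from CSV extracts
# instead, so the sketch is left commented out):
#
# import kagglegym
# env = kagglegym.make()
# observation = env.reset()                 # observation.train holds the visible training data
# while True:
#     target = observation.target           # ids requiring a 'y' prediction at this timestamp
#     target['y'] = 0.0                     # plug the model's predictions in here
#     observation, reward, done, info = env.step(target)   # reward: average R over the previous day
#     if done:
#         break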
'''
#1. Prepare Problem************************************************************
#a) Load libraries
#b) Load dataset
#c) Slice dataset by ids & Save them
'''
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
data_t=pd.read_csv('data_t.csv')
y_t=pd.read_csv('y_t.csv',names=['y'])
del data_t['Unnamed: 0']   # drop the index column written out by to_csv
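# How the data_t.csv / y_t.csv extracts above may have been produced (template step 1c,
# "slice dataset by ids & save"); a hedged sketch only -- the train.h5 file name, the 'train'
# HDF key, and the 'id'/'y' column names are assumptions based on the competition's data format:
#
# with pd.HDFStore('train.h5', 'r') as store:
#     df = store.get('train')                               # full training frame
# one_id = df[df['id'] == df['id'].iloc[0]]                 # pick a single instrument by id
# one_id.drop(['id', 'y'], axis=1).to_csv('data_t.csv')     # features, index kept (hence 'Unnamed: 0')
# one_id['y'].to_csv('y_t.csv', index=False, header=False)  # target only, headerless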
#'''
## 2. Summarize Data************************************************************
## a) Descriptive statistics
## b) Data visualizations
#'''
'''
3. Prepare Data***************************************************************
a) Data Cleaning --8.4
b) Feature Selection --8.9
c) Data Transforms --8.11
'''
#n_nan=data_t.isnull().sum(axis=0)
#print(n_nan)
#percent=n_nan/data_t.shape[0]
#data_T=data_t.T
#data_v0=data_T[percent<=0.4]
#data_v=data_v0.T
m=round(data_t.shape[0]*0.6)              # drop columns with more than 40% missing values
data_vcols=data_t.dropna(axis=1,thresh=m)
l=round(data_t.shape[1]*0.6)              # drop rows with more than 40% missing values
data_vrc=data_vcols.dropna(axis=0,thresh=l)
data_i=data_vrc.interpolate(method='linear')
# find every row that still contains NaN: nan_rows = data_i[data_i.isnull().T.any().T]
del data_vcols,data_vrc,l,m
data_i=data_i.dropna(axis=0)
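# Quick illustration of the thresh semantics used above: dropna(thresh=k) keeps a row/column
# only if it has at least k non-NaN values, so thresh = 0.6 * n removes anything with more
# than 40% missing. For example,
#   pd.DataFrame({'a': [1, None, None], 'b': [1, 2, 3]}).dropna(axis=1, thresh=2)
# keeps only column 'b', because column 'a' has just one non-NaN value.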
from sklearn.ensemble import IsolationForest
clf=IsolationForest()
## Training
# fit() is the generic training entry point: fit(X) fits the estimator on the data X and
# usually returns the estimator itself; fit(X, y=None) is the unsupervised form, fit(X, Y) the supervised one.
#ilf.fit(data[X_cols])
clf.fit(data_i)
#shape = data.shape[0]
#batch = 10**6
n_rows=data_i.shape[0]   # number of rows to score with the outlier detector
batch=10**3
#
#all_pred = []
all_ol=[]
#for i in range(shape/batch+1):
# start = i * batch
# end = (i+1) * batch
# test = data[X_cols][start:end]
# # predict
# pred = ilf.predict(test)
# all_pred.extend(pred)
# score the data in batches (adapted from the snippet credited below); stepping by `batch`
# lets the final slice pick up any remainder automatically
for start in range(0, n_rows, batch):
    end = start + batch
    test = data_i.iloc[start:end]
    outliers = clf.predict(test)      # +1 = inlier, -1 = outlier
    all_ol.extend(outliers)
all_ol=pd.DataFrame(all_ol)
all_ol.columns=['ol']
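# Note: the batching above mirrors the adapted snippet (credited below) for very large inputs;
# with enough memory the same labels come from a single call, e.g.:
# all_ol = pd.DataFrame(clf.predict(data_i), columns=['ol'])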
#data['pred'] = all_pred
#data.to_csv('outliers.csv', columns=["pred",], header=False)
data_i=data_i.reset_index()
data_i1=pd.concat([data_i,all_ol],axis=1)
ol_flag=data_i1['ol']            # IsolationForest label per row: +1 inlier, -1 outlier
normal=ol_flag>0
data_i2=data_i1[normal]
data_n=data_i2.drop('ol',axis=1)
orig_index=data_n['index']
orig_index=pd.DataFrame(orig_index)
del data_n['index']
names=data_n.columns
# Batching snippet adapted from author 云戒 (Yunjie), http://www.jianshu.com/p/1b020e2605e2, via Jianshu (简书).
# Copyright remains with the author; commercial reuse requires the author's permission, non-commercial reuse must credit the source.
#scaler = MinMaxScaler(feature_range=(0, 1))
#rescaledX = scaler.fit_transform(X)
## summarize transformed data
#set_printoptions(precision=3)
y_t['index']=range(y_t.shape[0])
y_t1=pd.merge(y_t,orig_index,how='inner',on='index')
del y_t1['index']
del all_ol,batch,n_rows,data_i,data_i1,data_i2
del end,ol_flag,normal,outliers,start,test
from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler(feature_range=(0,1))
res_data=scaler.fit_transform(data_n)
from numpy import set_printoptions
set_printoptions(precision=3)
print(res_data[:5])
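# MinMaxScaler rescales each column x to (x - min(x)) / (max(x) - min(x)), so every feature
# lands in the requested [0, 1] range; scaler.inverse_transform() maps scaled values back to
# the original units if needed.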
res_data=pd.DataFrame(res_data)
#%%
res_data.columns=names
var=res_data.var()
var.plot()
var_rank=var.sort_values(ascending=False)
plt.plot(range(res_data.shape[1]),var_rank)
plt.show()
from sklearn.feature_selection import VarianceThreshold
sel = VarianceThreshold(threshold=0.05)
data_fs0=sel.fit_transform(res_data)
del var
data_fs01=pd.DataFrame(data_fs0)
feature_var=pd.DataFrame(var_rank).T
feature0=feature_var.iloc[:,0:14]
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
X=np.array(res_data)
Y=np.array(y_t1).ravel()
test = SelectKBest(score_func=f_regression, k=14)
fit = test.fit(X,Y)
set_printoptions(precision=3)
#print(fit.scores_)
s=pd.DataFrame(fit.scores_).T
s.columns=names   # align the score row with the feature columns before concatenation
#
#res_data=pd.DataFrame(res_data)
data_fs1=pd.concat([res_data,s],axis=0)
#data_fs1=data_fs1.reset_index()
#del data_fs1['index']
data_fs1.columns=names
data_fs11=data_fs1.T
data_fs11.columns=list(range(data_fs1.shape[0]-1))+['SKBrank']
data_fs12=data_fs11.sort_values('SKBrank',ascending=False).head(14).T
data_fs13=data_fs12.drop('SKBrank')
feature01=np.array(feature0.columns)
feature1=np.array(data_fs13.columns)
for i in feature01:
    if i in feature1:
        print(i)
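# The same top-k choice can also be read directly off the fitted selector: get_support()
# returns a boolean mask over the original feature columns.
print(res_data.columns[fit.get_support()])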
#print("Selected Features: %s")
#print(fit.support_)
#
#print("Feature Ranking: %s")
#print(fit.ranking_)
#RFErank=pd.DataFrame(fit.ranking_)
#data_fs2=pd.concat([res_data,RFErank],axis=0)
## Feature Extraction with RFE
#from pandas import read_csv
#from sklearn.feature_selection import RFE
#from sklearn.linear_model import LogisticRegression
## load data
#filename = 'pima-indians-diabetes.data.csv'
#names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
#dataframe = read_csv(filename, names=names)
#array = dataframe.values
#X = array[:,0:8]
#Y = array[:,8]
## feature extraction
#model = LogisticRegression()
#rfe = RFE(model, 3)
#fit = rfe.fit(X, Y)
#print("Num Features: %d") % fit.n_features_
#print("Selected Features: %s") % fit.support_
#print("Feature Ranking: %s") % fit.ranking
#
## Feature Extraction with PCA
#from sklearn.decomposition import PCA
#X=np.array(data_fs14)
#Y=np.array(y_t1).ravel()
## feature extraction
#pca = PCA(n_components=15)
#fit = pca.fit(X)
## summarize components
##print("Explained Variance: %s") % fit.explained_variance_ratio_
#data_fspca=pca.fit_transform(X)
#print(data_fspca)
#%%
################################################################################################
# Tree-based models, tried both with and without explicit missing-value handling;
# Gradient Boosted Regression Trees (GBRT) are used here.
#from random import randint,uniform
#from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
#from sklearn.metrics import mean_squared_error
y_t2=np.ravel(y_t1)
from sklearn.ensemble import GradientBoostingRegressor
#from sklearn.model_selection import cross_val_score
X_train,X_test,y_train,y_test=train_test_split(data_fs0,y_t2,test_size=0.33, random_state=42)
est = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1,\
max_depth=6, random_state=0, loss='lad')
est.fit(X_train,y_train)
print(est.score(X_test,y_test))
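# est.score() above reports R^2 on the held-out split; the fitted GBRT also exposes
# per-feature importances, which can be inspected like this (illustrative only):
importances = pd.Series(est.feature_importances_, index=data_fs01.columns).sort_values(ascending=False)
print(importances.head(10))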
from sklearn.ensemble import RandomForestRegressor
regr = RandomForestRegressor(max_depth=2, random_state=0,oob_score=True)
regr.fit(data_fs0, y_t2)
print(regr.oob_score_)
#from scipy.stats import randint
#from sklearn.tree import DecisionTreeClassifier
#from sklearn.model_selection import RandomizedSearchCV
#
## Setup the parameters and distributions to sample from: param_dist
#param_dist = {"max_depth": [3, None],
# "max_features": randint(1, 9),
# "min_samples_leaf": randint(1, 9),
# "criterion": ["gini", "entropy"]}
#
## Instantiate a Decision Tree classifier: tree
#tree = DecisionTreeClassifier()
#
## Instantiate the RandomizedSearchCV object: tree_cv
#tree_cv = RandomizedSearchCV(tree, param_dist, cv=5)
#
## Fit it to the data
#tree_cv.fit(X,y)
#
## Print the tuned parameters and score
#print("Tuned Decision Tree Parameters: {}".format(tree_cv.best_params_))
#print("Best score is {}".format(tree_cv.best_score_))
################################################################################################
# Hyperparameter tuning and model selection via cross-validation, scored with R^2 / MSE (see the sketch below)
################################################################################################
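# A hedged sketch of this step (estimators and folds below are illustrative placeholders,
# not tuned choices): score candidate models with cross-validation on R^2 and MSE and keep
# the best performer.
#
# from sklearn.model_selection import cross_val_score
# for name, model in [('GBRT', GradientBoostingRegressor(random_state=0)),
#                     ('RF', RandomForestRegressor(n_estimators=100, random_state=0))]:
#     r2 = cross_val_score(model, data_fs0, y_t2, cv=5, scoring='r2')
#     mse = -cross_val_score(model, data_fs0, y_t2, cv=5, scoring='neg_mean_squared_error')
#     print(name, r2.mean(), mse.mean())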
# Use a Pipeline to chain imputation + standardization + model tuning end to end (see the sketch below)
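# A hedged sketch of the planned pipeline (the imputation strategy and parameter grid are
# illustrative assumptions, and Imputer is SimpleImputer in newer scikit-learn versions):
#
# from sklearn.pipeline import Pipeline
# from sklearn.preprocessing import Imputer
# from sklearn.model_selection import GridSearchCV
# pipe = Pipeline([('impute', Imputer(strategy='mean')),
#                  ('scale', MinMaxScaler(feature_range=(0, 1))),
#                  ('gbrt', GradientBoostingRegressor(random_state=0))])
# grid = {'gbrt__n_estimators': [100, 300], 'gbrt__max_depth': [3, 6]}
# search = GridSearchCV(pipe, grid, cv=5, scoring='r2')
# search.fit(data_t, y_t['y'])              # raw features; imputation inside the pipeline handles the NaNs
# print(search.best_params_, search.best_score_)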