-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathbitcoin.py
139 lines (119 loc) · 4.8 KB
/
bitcoin.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
import statsmodels.formula.api as smf
import sklearn.metrics as sm
import pandas as pd
import numpy as np
import math
import sys
# The path to the data folder should be given as input
if len(sys.argv) != 2:
print('bitcoin.py <path to data folder>')
sys.exit(1)
data_path = sys.argv[1]
# Reading the vectors from the given csv files
train1_90 = pd.read_csv(data_path+'/train1_90.csv')
train1_180 = pd.read_csv(data_path+'/train1_180.csv')
train1_360 = pd.read_csv(data_path+'/train1_360.csv')
train2_90 = pd.read_csv(data_path+'/train2_90.csv')
train2_180 = pd.read_csv(data_path+'/train2_180.csv')
train2_360 = pd.read_csv(data_path+'/train2_360.csv')
test_90 = pd.read_csv(data_path+'/test_90.csv')
test_180 = pd.read_csv(data_path+'/test_180.csv')
test_360 = pd.read_csv(data_path+'/test_360.csv')
def measure(a, b):
# Calculating the mean and std deviation for a and b
meanOfA, meanOfB = np.mean(a), np.mean(b)
stdOfA, stdOfB = np.std(a), np.std(b)
n, result = len(a), 0
for i in range(0, n):
numer = ((meanOfA - a[i]) * (meanOfB - b[i]))
denom = float(stdOfA * stdOfB * n)
result += (numer/denom)
return result
def computeDelta(wt, X, Xi):
"""
This function computes equation 6 of the paper, but with the euclidean distance
replaced by the similarity function given in Equation 9.
Parameters
----------
wt : int
This is the constant c at the top of the right column on page 4.
X : A row of Panda Dataframe
Corresponds to (x, y) in Equation 6.
Xi : Panda Dataframe
Corresponds to a dataframe of (xi, yi) in Equation 6.
Returns
-------
float
The output of equation 6, a prediction of the average price change.
"""
numerator, denominator = 0, 0
n = len(X) - 1
matrx = Xi.as_matrix()
for i in range(0, len(matrx)):
numerator += (matrx[i][n] * math.exp(wt *
measure(X[0:n], matrx[i][0:n])))
denominator += math.exp(wt * measure(X[0:n], matrx[i][0:n]))
E_emp = float(numerator)/denominator
return E_emp
# Perform the Bayesian Regression to predict the average price change for each dataset of train2 using train1 as input.
# These will be used to estimate the coefficients (w0, w1, w2, and w3) in equation 8.
weight = 2 # This constant was not specified in the paper, but we will use 2.
trainDeltaP90 = np.empty(0)
trainDeltaP180 = np.empty(0)
trainDeltaP360 = np.empty(0)
for i in xrange(0, len(train1_90.index)):
trainDeltaP90 = np.append(trainDeltaP90, computeDelta(
weight, train2_90.iloc[i], train1_90))
for i in xrange(0, len(train1_180.index)):
trainDeltaP180 = np.append(trainDeltaP180, computeDelta(
weight, train2_180.iloc[i], train1_180))
for i in xrange(0, len(train1_360.index)):
trainDeltaP360 = np.append(trainDeltaP360, computeDelta(
weight, train2_360.iloc[i], train1_360))
# Actual deltaP values for the train2 data.
trainDeltaP = np.asarray(train2_360[['Yi']])
trainDeltaP = np.reshape(trainDeltaP, -1)
# Combine all the training data
d = {'deltaP': trainDeltaP,
'deltaP90': trainDeltaP90,
'deltaP180': trainDeltaP180,
'deltaP360': trainDeltaP360}
trainData = pd.DataFrame(d)
# Feed the data: [deltaP, deltaP90, deltaP180, deltaP360] to train the linear model.
# Use the statsmodels ols function.
# Use the variable name model for your fitted model
model = smf.ols(
formula='deltaP ~ deltaP90 + deltaP180 + deltaP360', data=trainData).fit()
# Print the weights from the model
print model.params
# Perform the Bayesian Regression to predict the average price change for each dataset of test using train1 as input.
# This should be similar to above where it was computed for train2.
weight = 2
testDeltaP90 = testDeltaP180 = testDeltaP360 = np.empty(0)
for i in xrange(0, len(train1_90.index)):
testDeltaP90 = np.append(testDeltaP90, computeDelta(
weight, test_90.iloc[i], train1_90))
for i in xrange(0, len(train1_180.index)):
testDeltaP180 = np.append(testDeltaP180, computeDelta(
weight, test_180.iloc[i], train1_180))
for i in xrange(0, len(train1_360.index)):
testDeltaP360 = np.append(testDeltaP360, computeDelta(
weight, test_360.iloc[i], train1_360))
# Actual deltaP values for test data.
testDeltaP = np.asarray(test_360[['Yi']])
testDeltaP = np.reshape(testDeltaP, -1)
# Combine all the test data
d = {'deltaP': testDeltaP,
'deltaP90': testDeltaP90,
'deltaP180': testDeltaP180,
'deltaP360': testDeltaP360}
testData = pd.DataFrame(d)
# Predict price variation on the test data set.
result = model.predict(testData)
compare = {'Actual': testDeltaP,
'Predicted': result}
compareDF = pd.DataFrame(compare)
# Compute the MSE and print the result
MSE = 0.0
MSE = sm.mean_squared_error(compareDF['Actual'], compareDF['Predicted'])
print "The MSE is %f" % (MSE)