# exodus-stock/predictor.py
import os
import sys

# Silence TensorFlow's C++ logging and force CPU before TF is imported.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

import json
import math
import time
import cProfile
import pstats
from datetime import date, datetime, timedelta

import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.animation as animation
import matplotlib.pyplot as plt
import seaborn as sns
import yfinance as yf
from matplotlib import style
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm

sns.set()
style.use('ggplot')
tf.compat.v1.random.set_random_seed(1234)
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
# In[34]:
symbols = os.listdir()  # unused below; automaton() takes the symbol from sys.argv[1]
tf.compat.v1.disable_eager_execution()
# INITIAL VARS
test_size = 14        # days to forecast and evaluate against
simulation_size = 1   # forecasts generated per attempt
# MODEL VARS
num_layers = 2
size_layer = 128
timestamp = 7         # truncated-BPTT window length
epoch = 20
dropout_rate = 0.8    # passed to the model and used as output_keep_prob
prediction_gap = sys.argv[2]
future_day = test_size
learning_rate = 0.01
graph_loss = []
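# Expected invocation (a sketch; the ticker and gap values are illustrative):
#   python predictor.py AAPL 10
# sys.argv[1] is the ticker symbol consumed by automaton(); sys.argv[2] is the
# initial accuracy band ("prediction gap"), interpreted as tenths of a price
# unit by checkaccuracy().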
# In[35]:
# Necessary Dirs
# def date_manage(date1, date2=None):
#     if date2 is None:
#         date2 = date1 + timedelta(days=365)
#     date_col = []
#     for n in range(int((date2 - date1).days) + 1):
#         date_col.append(date1 + timedelta(n))
#     weekdays = [5, 6]
#     date_result = []
#     for dt in date_col:
#         if dt.weekday() not in weekdays:
#             date_result.append(dt.strftime("%Y-%m-%d"))
#     return date_result
# In[36]:
def loss_animate(ax, i):
    # Unused animation callback; plots the accumulated training-loss history.
    json_loss = pd.DataFrame(graph_loss)
    ax.plot(json_loss)
    return ax
def loader(symbol, test_size, date):
    # Pull three years of daily candles ending at `date` from Yahoo Finance.
    df = yf.Ticker(symbol)
    df = df.history(start=date - timedelta(days=365 * 3), end=date, interval="1d")
    df = df.reset_index(level=0)
    df = df.drop(columns=['Stock Splits'], axis=1)
    # Volatility bands: EWMA of High/Low +/- 2 rolling standard deviations.
    df['Up'] = df['High'].ewm(span=6, adjust=False).mean() + 2 * df['High'].rolling(window=6).std()
    df['Down'] = df['Low'].ewm(span=8, adjust=False).mean() - 2 * df['Low'].rolling(window=8).std()
    df = df.dropna()
    df = df.drop(df.tail(5).index)
    date_ori = pd.to_datetime(df.iloc[:, 0]).tolist()
    date_ori = pd.Series(date_ori).dt.strftime(date_format='%Y-%m-%d').tolist()
    print(len(df), len(date_ori))
    return df, date_ori
def trueloader(symbol, test_size, date):
    df2 = yf.Ticker(symbol)
    df2 = df2.history(start=date - timedelta(days=365 * 3), end=date, interval="1d")
    df2 = df2.reset_index(level=0)
    df2 = df2.drop(columns=['Dividends'], axis=1)
    df2 = df2.drop(columns=['Stock Splits'], axis=1)
    df2 = df2.drop(df2.head(7).index)
    return df2
# In[38]:
def preproc(df):
    # Min-max scale columns 1..8 (Open..Down: OHLC, Volume, Dividends, bands).
    minmax = MinMaxScaler().fit(df.iloc[:, 1:9].astype('float32'))
    df_log = minmax.transform(df.iloc[:, 1:9].astype('float32'))
    df_log = pd.DataFrame(df_log)
    return df_log, minmax
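# A minimal sketch of the data pipeline above (assumes network access to
# Yahoo Finance; the ticker and date are illustrative):
#   d = date(2020, 1, 8)
#   df, date_col = loader("AAPL", test_size, d)   # OHLCV + Up/Down bands
#   df2 = trueloader("AAPL", test_size, d)        # ground-truth closes
#   df_log, minmax = preproc(df)                  # scaled frame + fitted scaler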
# In[39]:
class Model:
    def __init__(
        self,
        learning_rate,
        num_layers,
        size,
        size_layer,
        output_size,
        forget_bias=0.1,
    ):
        def lstm_cell(size_layer):
            return tf.compat.v1.nn.rnn_cell.LSTMCell(size_layer, state_is_tuple=False)

        rnn_cells = tf.compat.v1.nn.rnn_cell.MultiRNNCell(
            [lstm_cell(size_layer) for _ in range(num_layers)],
            state_is_tuple=False,
        )
        self.X = tf.compat.v1.placeholder(tf.float32, (None, None, size))
        self.Y = tf.compat.v1.placeholder(tf.float32, (None, output_size))
        # `forget_bias` is passed dropout_rate at the call sites and acts as
        # the dropout keep probability, not an LSTM forget-gate bias.
        drop = tf.compat.v1.nn.rnn_cell.DropoutWrapper(
            rnn_cells, output_keep_prob=forget_bias
        )
        self.hidden_layer = tf.compat.v1.placeholder(
            tf.float32, (None, num_layers * 2 * size_layer)
        )
        self.outputs, self.last_state = tf.compat.v1.nn.dynamic_rnn(
            drop, self.X, initial_state=self.hidden_layer, dtype=tf.float32
        )
        self.logits = tf.compat.v1.layers.dense(self.outputs[-1], output_size)
        self.cost = tf.reduce_mean(tf.square(self.Y - self.logits))
        self.optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate).minimize(
            self.cost
        )
def calculate_accuracy(real, predict):
    # 100 * (1 - RMSPE); the +1 shift guards against division by zero.
    real = np.array(real) + 1
    predict = np.array(predict) + 1
    percentage = 1 - np.sqrt(np.mean(np.square((real - predict) / real)))
    return percentage * 100
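# Worked example: real = [10, 11], predict = [10, 12] are shifted to [11, 12]
# and [11, 13]; RMSPE = sqrt(mean([0, (1/12)^2])) ~= 0.0589, so the returned
# "accuracy" is ~94.1.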
def anchor(signal, weight):
    # Exponential smoothing: each point keeps `weight` of the running value.
    buffer = []
    last = signal[0]
    for i in signal:
        smoothed_val = last * weight + (1 - weight) * i
        buffer.append(smoothed_val)
        last = smoothed_val
    return buffer
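# Example: anchor([1.0, 2.0, 3.0], 0.3) -> [1.0, 1.7, 2.61], since each output
# keeps 30% of the previous smoothed value and 70% of the new observation.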
# In[40]:
def main_train(df_beta, df_train, df, minmax):
    modelnn = Model(
        learning_rate, num_layers, df_beta.shape[1], size_layer, df_beta.shape[1], dropout_rate
    )
    sess = tf.compat.v1.Session(
        config=tf.compat.v1.ConfigProto(
            intra_op_parallelism_threads=64, inter_op_parallelism_threads=64
        )
    )
    sess.run(tf.compat.v1.global_variables_initializer())
    pbar = tqdm(range(10), desc='Main train loop')  # Default 500 range
    for i in pbar:
        init_value = np.zeros((1, num_layers * 2 * size_layer))
        total_loss, total_acc = [], []
        # Truncated BPTT: step through the series in `timestamp`-sized windows,
        # carrying the LSTM state across windows within an epoch.
        for k in range(0, df_train.shape[0] - 1, timestamp):
            index = min(k + timestamp, df_train.shape[0] - 1)
            batch_x = np.expand_dims(
                df_train.iloc[k:index, :].values, axis=0
            )
            batch_y = df_train.iloc[k + 1:index + 1, :].values
            logits, last_state, _, loss = sess.run(
                [modelnn.logits, modelnn.last_state, modelnn.optimizer, modelnn.cost],
                feed_dict={
                    modelnn.X: batch_x,
                    modelnn.Y: batch_y,
                    modelnn.hidden_layer: init_value,
                },
            )
            init_value = last_state
            total_loss.append(loss)
            total_acc.append(calculate_accuracy(batch_y[:, 0], logits[:, 0]))
        graph_loss.append(np.mean(total_loss))
        # np.save(loss_file, np.array(graph_loss))
        pbar.set_postfix(cost=np.mean(total_loss), min_acc=np.min(total_acc), mean_acc=np.mean(total_acc))
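# forecast() below rebuilds the graph from scratch, retrains for `epoch`
# passes, then predicts in three stages: teacher-forced windows over the
# training span, the leftover tail shorter than one window, and finally
# autoregressive steps that feed each prediction back in as the next input.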
def forecast(df_beta, df_train, df, minmax):
    tf.compat.v1.reset_default_graph()
    modelnn = Model(
        learning_rate, num_layers, df_beta.shape[1], size_layer, df_beta.shape[1], dropout_rate
    )
    sess = tf.compat.v1.Session(
        config=tf.compat.v1.ConfigProto(
            intra_op_parallelism_threads=64, inter_op_parallelism_threads=64
        )
    )
    sess.run(tf.compat.v1.global_variables_initializer())
    date_ori = pd.to_datetime(df.iloc[:, 0]).tolist()
    pbar = tqdm(range(epoch), desc='train loop')
    for i in pbar:
        total_loss, total_acc = [], []
        for k in range(0, df_train.shape[0] - 1, timestamp):
            # Note: the state is re-zeroed every window here, unlike main_train.
            init_value = np.zeros((1, num_layers * 2 * size_layer))
            index = min(k + timestamp, df_train.shape[0] - 1)
            batch_x = np.expand_dims(
                df_train.iloc[k:index, :].values, axis=0
            )
            batch_y = df_train.iloc[k + 1:index + 1, :].values
            logits, last_state, _, loss = sess.run(
                [modelnn.logits, modelnn.last_state, modelnn.optimizer, modelnn.cost],
                feed_dict={
                    modelnn.X: batch_x,
                    modelnn.Y: batch_y,
                    modelnn.hidden_layer: init_value,
                },
            )
            init_value = last_state
            total_loss.append(loss)
            total_acc.append(calculate_accuracy(batch_y[:, 0], logits[:, 0]))
        graph_loss.append(np.mean(total_loss))
        # np.save(loss_file, np.array(graph_loss))
        pbar.set_postfix(cost=np.mean(total_loss), min_acc=np.min(total_acc), mean_acc=np.mean(total_acc))

    future_day = test_size
    output_predict = np.zeros((df_train.shape[0] + future_day, df_train.shape[1]))
    output_predict[0] = df_train.iloc[0]
    upper_b = (df_train.shape[0] // timestamp) * timestamp
    init_value = np.zeros((1, num_layers * 2 * size_layer))
    # Teacher-forced pass over whole timestamp-sized windows.
    for k in range(0, upper_b, timestamp):
        out_logits, last_state = sess.run(
            [modelnn.logits, modelnn.last_state],
            feed_dict={
                modelnn.X: np.expand_dims(
                    df_train.iloc[k:k + timestamp], axis=0
                ),
                modelnn.hidden_layer: init_value,
            },
        )
        init_value = last_state
        output_predict[k + 1:k + timestamp + 1] = out_logits
    # Leftover tail shorter than one window.
    if upper_b != df_train.shape[0]:
        out_logits, last_state = sess.run(
            [modelnn.logits, modelnn.last_state],
            feed_dict={
                modelnn.X: np.expand_dims(df_train.iloc[upper_b:], axis=0),
                modelnn.hidden_layer: init_value,
            },
        )
        output_predict[upper_b + 1:df_train.shape[0] + 1] = out_logits
        future_day -= 1
        date_ori.append(date_ori[-1] + timedelta(days=1))
        init_value = last_state
    # Autoregressive future steps: feed predictions back in as inputs.
    for i in range(future_day):
        o = output_predict[-future_day - timestamp + i:-future_day + i]
        out_logits, last_state = sess.run(
            [modelnn.logits, modelnn.last_state],
            feed_dict={
                modelnn.X: np.expand_dims(o, axis=0),
                modelnn.hidden_layer: init_value,
            },
        )
        init_value = last_state
        output_predict[-future_day + i] = out_logits[-1]
        date_ori.append(date_ori[-1] + timedelta(days=1))
    output_predict = minmax.inverse_transform(output_predict)
    deep_future = anchor(output_predict[:, 0], 0.3)
    sess.close()
    return deep_future
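# The returned series has length len(df_train) + test_size: a smoothed
# reconstruction of the training span followed by test_size future steps.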
# In[41]:
def newaccuration(accepted_results, truetrend):
    # Score each candidate by how many of its day-over-day Up/Down moves match
    # the true trend. Variable names are Indonesian: hasil = score,
    # indexbagus = best index, salah = mispredicted positions.
    hasilutama = 0
    indexbagus = 0
    truest = 0
    predictest = 0
    for i, x in enumerate(accepted_results):
        a = x[-(test_size + 2):]
        print("a", a)
        b = truetrend[-(test_size + 2):]  # match a's length so b[xy] stays in range
        hasil = 0
        true = []
        predict = []
        for xy in range(1, len(a)):
            if a[xy] < a[xy - 1]:
                predict.append("Down")
            else:
                predict.append("Up")
            if b[xy] < b[xy - 1]:
                true.append("Down")
            else:
                true.append("Up")
        print(true)
        print(predict)
        for xz in range(len(true)):
            if true[xz] == predict[xz]:
                hasil = hasil + 1
        if hasil > hasilutama:
            hasilutama = hasil
            indexbagus = i
            truest = true
            predictest = predict
    salah = []
    for xz in range(len(truest)):
        if truest[xz] != predictest[xz]:
            salah.append(xz)
    print("TRUEST", truest)
    print("predictest", predictest)
    return hasilutama, indexbagus, salah
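# Example (illustrative values): a candidate tail [10, 12, 11] against truth
# [10, 11, 12] gives predict = ["Up", "Down"] vs true = ["Up", "Up"], so
# hasil = 1 and salah = [1].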
# In[42]:
def betaforecast(simulationsize, dfx, dftrain, df, df2, minmax):
    results = []
    for i in range(simulationsize):
        forecast_res = forecast(df, dftrain, dfx, minmax)
        results.append(forecast_res)

    def rejected(r):
        # A forecast is rejected when any predicted tail value falls below the
        # historical minimum Close or above twice the historical maximum.
        tail = np.array(r[-test_size:])
        return (tail < np.min(dfx['Close'])).sum() != 0 or \
               (tail > np.max(dfx['Close']) * 2).sum() != 0

    while rejected(results[0]):
        print("++++++++++++++++++++++++")
        print("Forecast Recalled...")
        results[0] = forecast(df, dftrain, dfx, minmax)
    return results[0]
# In[43]:
def interval(p1, p2):
    return abs(p1 - p2)
# In[44]:
def checkaccuracy2(true):
    # Mean absolute day-to-day change over the series, skipping the final week.
    avg = []
    for x in range(len(true) - 7):
        avg.append(interval(true[x], true[x + 1]))
    average = sum(avg) / len(avg)
    return average
# In[45]:
def checkaccuracy(predict, true, filterx, test_size):
    print("True Length: ", len(true))
    print("Predict Length: ", len(predict))
    temp_predict = predict[-test_size:]
    temp_true = true[-test_size:]
    count = 0
    print("------------------------------------")
    for x in range(test_size):
        # Accept a day when the prediction sits within +/- filterx/10 of truth.
        acc_var1 = temp_true[x] - (filterx / 10)
        acc_var2 = temp_predict[x]
        acc_var3 = temp_true[x] + (filterx / 10)
        acc_condition = acc_var1 <= acc_var2 <= acc_var3
        print("Day " + str(x + 1) + ", Price : " + str(int(temp_true[x])) + " ,Gap = " + str(int(abs(temp_predict[x] - temp_true[x]))) + " : ", acc_condition)
        if acc_condition:
            count = count + 1
    print("------------------------------------")
    if count > 7:
        print("Result True")
        return True
    else:
        print("Result False")
        return False
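# Example: with filterx = 10 the acceptance band is +/- 1.0 around each true
# Close, and the forecast passes once more than 7 of the test_size days land
# inside it.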
# In[46]:
def findthreshold(simulationsize, dfx, dftrain, df, df2, minmax):
    results = []
    for i in range(simulationsize):
        results.append(forecast(df, dftrain, dfx, minmax))
    accepted_results = []
    for r in results:
        if (np.array(r[-test_size:]) < np.min(dfx['Close'])).sum() == 0 and \
           (np.array(r[-test_size:]) > np.max(dfx['Close']) * 2).sum() == 0:
            accepted_results.append(r)
    finalavg = 999999
    for o in accepted_results:
        avg = []
        for x in range(len(o) - 5):
            avg.append(interval(o[x], df2[x]))
        average = sum(avg) / len(avg)
        if average <= finalavg:
            finalavg = average
    return finalavg
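# findthreshold() is only referenced from a commented-out call in automaton();
# it returns the smallest mean absolute error between any accepted simulation
# and the true Close series.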
def temp_data(date, xi, resultfinal, df2, date_col, x):
    print("Called . . . ")
    # Persist one attempt as JSON: [prediction, true closes, dates].
    os.makedirs("TempData/%s/%s" % (x, date), exist_ok=True)
    with open("TempData/%s/%s/%s%s.vezpal2" % (x, date, x, xi), "w+") as oop:
        main = []
        main.append(resultfinal)  # prediction
        main.append(list(df2['Close']))
        main.append(date_col)
        json.dump(main, oop)
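# Example output path (symbol and date illustrative):
#   TempData/AAPL/2020-01-08/AAPL0.vezpal2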
def automaton(simulationsize, date):
    symbols = sys.argv[1]
    times = []
    x = symbols
    temp_time = []
    temp_time.append(x)
    counter = 0
    validity = 0
    df, date_col = loader(x, test_size, date)
    dfx = df
    df2 = trueloader(x, test_size, date)
    df, minmax = preproc(df)
    dftrain = df
    wrong = [1, 2, 3, 4, 5]
    # avg = checkaccuracy2(list(df2["Close"]))
    # avg = findthreshold(50, dfx, dftrain, df, list(df2["Close"]), minmax)
    start = time.time()
    filterx = int(prediction_gap)
    able = False
    print("============== || Initial Train || =============")
    main_train(df, dftrain, dfx, minmax)
    for xi in range(5):
        decision = False
        while decision == False:
            print()
            print("====== [ Forecasting Attempt : " + str(counter + 1) + " ] ===========")
            print("====== [ Progress : " + str(xi) + "/5 ] ")
            resultfinal = betaforecast(simulationsize, dfx, dftrain, df, df2, minmax)
            decision = checkaccuracy(resultfinal, list(df2["Close"]), filterx, test_size)
            if decision == True:
                able = True
                print("ABLE")
                print(str(filterx))
            if counter > 10 and decision != True:
                # Relax the accuracy band after ten failed attempts.
                counter = 0
                filterx = filterx + 10
                print("Filter X new value : " + str(filterx))
            print("Decision Status : ", decision)
            print("**************************************")
            if filterx > 1000:
                # Give up on this attempt once the band has grown absurdly wide.
                print("====== [ GG, we gave up] =====")
                break
            counter = counter + 1
        temp_data(date, xi, resultfinal, df2, date_col, x)
        print("[ Loop : " + x + " done ] =========================")
        print()
        # Persist the accepted forecast for backtesting, mirroring temp_data().
        os.makedirs("Backtest/%s/%s" % (x, date), exist_ok=True)
        with open("Backtest/%s/%s/%s%s.vezpal2" % (x, date, x, xi), "w+") as oop:
            main = []
            main.append(resultfinal)  # prediction
            main.append(list(df2['Close']))
            main.append(date_col)
            json.dump(main, oop)
    print("Time for %s :" % x, time.time() - start)
    temp_time.append(time.time() - start)
    times.append(temp_time)
    return times
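# Each automaton() call therefore produces five accepted forecasts for one
# symbol-week, each saved to both TempData/ and Backtest/, and returns the
# per-symbol timing list.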
def predictor(simulationsize, current):
    # Walk forward one week at a time for a year of backtests.
    for x in range(52):
        tf.compat.v1.reset_default_graph()
        current += timedelta(days=7)
        automaton(simulationsize, current)
current_date = date(2020, 1, 1)
os.makedirs("Loss/" + str(current_date), exist_ok=True)
# loss_file = time.strftime("%Y%m%d-%H%M%S")
# loss_file = "Loss/" + str(date.today()) + "/" + loss_file
global_start = time.time()
# profile = cProfile.Profile()
# main_func = "predictor(simulation_size,current_date)"
predictor(simulation_size, current_date)
# ps = pstats.Stats(profile.run(main_func))
print("Overall time consumption ", str(time.time() - global_start))
# ps.dump_stats("./Cprofile_model_01.ps")