source code

This commit is contained in:
George 2024-06-19 10:33:24 -07:00
parent 4a5c1569d1
commit fe44f5b4ff
3 changed files with 1557 additions and 0 deletions

1023
indicators.py Normal file

File diff suppressed because it is too large Load Diff

163
marketsim.py Normal file
View File

@ -0,0 +1,163 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Feb 5 21:56:42 2020
@author: cpan
"""
"""MC2-P1: Market simulator."""
import pandas as pd
import numpy as np
# import datetime as dt
# import os
from util import get_data, plot_data
def normalize_data(df):
    """Scale every column so that its first row equals 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Values to normalize (typically prices indexed by date).

    Returns
    -------
    pandas.DataFrame
        ``df`` divided, column by column, by its own first row.
    """
    first_row = df.iloc[0, :]
    return df / first_row
def fill_missing_values(df_data):
    """Fill gaps in ``df_data`` in place: forward fill first, then backward fill.

    Parameters
    ----------
    df_data : pandas.DataFrame
        Frame to repair. It is modified in place.

    Returns
    -------
    None
    """
    # ``fillna(method=...)`` is deprecated in pandas; ``ffill``/``bfill`` are
    # the direct replacements and keep the same in-place semantics.
    df_data.ffill(inplace=True)
    # Backward fill covers only what forward fill could not reach, i.e.
    # missing values at the very start of a column.
    df_data.bfill(inplace=True)
def get_orders(orders_file):
    """Load an order book, drop incomplete rows and sort it by index.

    Parameters
    ----------
    orders_file : pandas.DataFrame or anything ``pandas.read_csv`` accepts
        A DataFrame is used as given. Anything else is read as CSV with the
        'Date' column parsed as dates and used as the index.

    Returns
    -------
    pandas.DataFrame
        Orders without rows containing missing values, sorted by index.
    """
    if not isinstance(orders_file, pd.DataFrame):
        orders = pd.read_csv(orders_file, index_col='Date', parse_dates=True,
                             na_values=['nan'])
    else:
        orders = orders_file
    return orders.dropna().sort_index()
def compute_daily_returns(df):
    """Return the row-over-row fractional change of every column.

    Parameters
    ----------
    df : pandas.DataFrame
        Values ordered in time, one column per series.

    Returns
    -------
    pandas.DataFrame
        Same shape and labels as ``df``; each entry is ``value / previous
        value - 1``. The first row has no predecessor and is set to 0.
    """
    # Dividing by the shifted frame builds a fresh float result, so the ratios
    # are never written back into a copy of the input. That write-back fails
    # (or warns) when ``df`` holds integers.
    daily_returns = df / df.shift(1) - 1
    daily_returns.iloc[0, :] = 0
    return daily_returns
def compute_portfolio_stats(price, allocs=None, rfr=0.0, sf=252.0):
    """Compute summary statistics for a portfolio with fixed allocations.

    Parameters
    ----------
    price : pandas.DataFrame
        Prices, one column per asset.
    allocs : list of float, optional
        Weight applied to each column of ``price``, in column order.
        Defaults to ``[0.1, 0.2, 0.3, 0.4]``.
    rfr : float
        Per-period risk-free rate subtracted from the mean return.
    sf : float
        Sampling frequency; the Sharpe ratio is scaled by ``sqrt(sf)``.

    Returns
    -------
    tuple
        ``(cr, adr, sddr, sr)``: cumulative return, mean of the daily returns,
        standard deviation of the daily returns and Sharpe ratio. Each is the
        pandas object produced by the corresponding DataFrame operation.
    """
    if allocs is None:
        # The previous default was the literal [0.1, 0.2, 0, 3, 0.4], a typo
        # for 0.3 that gave five weights summing to 3.7. Building the list
        # here also avoids sharing a mutable default between calls.
        allocs = [0.1, 0.2, 0.3, 0.4]
    norm_price = normalize_data(price)
    norm_positions_val = norm_price * allocs
    if len(norm_positions_val.columns) == 1:
        # A single asset is already the portfolio; keep its column name.
        norm_portfolio_val = norm_positions_val
    else:
        norm_portfolio_val = norm_positions_val.sum(axis=1).to_frame('PORTFOLIO')
    cr = norm_portfolio_val.iloc[-1] / norm_portfolio_val.iloc[0] - 1
    daily_returns = compute_daily_returns(norm_portfolio_val)
    daily_returns = daily_returns[1:]  # remove first row (all zeros)
    adr = daily_returns.mean()
    sddr = daily_returns.std()
    sr = np.sqrt(sf) * (adr - rfr) / sddr
    return cr, adr, sddr, sr
def plot_against_SPY(df):
    """Plot the normalized columns of ``df`` together with SPY.

    Parameters
    ----------
    df : pandas.DataFrame
        Values indexed by date. The caller's frame is not modified.

    Returns
    -------
    pandas.DataFrame or pandas.Series
        The SPY data that was plotted: the result of ``get_data`` when ``df``
        has no 'SPY' column, otherwise the existing 'SPY' column.
    """
    combined = df.copy()
    if 'SPY' in combined.columns:
        spy_prices = combined['SPY']
    else:
        # Fetch SPY for exactly the dates present in the input and attach it
        # positionally.
        trading_days = pd.to_datetime(combined.index.values)
        spy_prices = get_data(['SPY'], trading_days)
        combined['SPY'] = spy_prices.values
    plot_data(normalize_data(combined))
    return spy_prices
def compute_portvals(orders_file = "./orders/orders.csv", start_val = 1000000,
                     commission=9.95, impact=0.005):
    """Simulate an order book and return the value of every holding per day.

    Parameters
    ----------
    orders_file : str, file-like or pandas.DataFrame
        Orders with 'Symbol', 'Order' ('BUY'/'SELL', case-insensitive) and
        'Shares' columns, indexed by date (see ``get_orders``).
    start_val : number
        Starting cash.
    commission : float
        Fixed fee deducted from cash for every executed order.
    impact : float
        Fraction of the traded value deducted from cash for every executed
        order.

    Returns
    -------
    pandas.DataFrame
        One row per price date with a leading 'Portfolio' column (row total),
        one column per traded symbol (position value) and a 'Cash' column.

    Notes
    -----
    A BUY that cash cannot cover (fees included) and a SELL for more shares
    than are held are silently skipped.
    """
    orders = get_orders(orders_file)
    tickers = list(set(orders['Symbol'].values))

    # Price every traded symbol over the calendar span of the order book.
    order_days = orders.index.values
    calendar = pd.date_range(order_days[0], order_days[-1])
    prices = get_data(tickers, calendar)
    # fill_missing_values(prices) # included in get_data() function
    prices = prices[tickers]
    # Cash is modelled as an asset whose price is always 1.
    prices['Cash'] = np.ones(prices.shape[0])

    # Holdings table shaped like the prices: zero units everywhere, with the
    # starting cash placed on the first day.
    positions = prices * 0.0
    positions.loc[positions.index.values[0], ['Cash']] = start_val

    for day, order in orders.iterrows():
        ticker = order['Symbol']
        fill_price = prices.loc[day, ticker]
        shares = order['Shares']
        side = order['Order'].upper()
        executed = False

        if side == 'BUY':
            notional = shares*fill_price
            # Execute only when cash covers price, commission and impact.
            if not positions.loc[day, 'Cash'] < notional + commission + notional*impact:
                positions.loc[day, ticker] += shares
                positions.loc[day, "Cash"] -= notional
                executed = True
        elif side == 'SELL':
            # Execute only when enough shares are held (no short selling).
            if not positions.loc[day, ticker] < shares:
                notional = shares*fill_price
                positions.loc[day, ticker] -= shares
                positions.loc[day, "Cash"] += notional
                executed = True

        if executed:
            # Costs are charged the same way for both sides: the flat
            # commission, then the impact on the traded value.
            positions.loc[day, "Cash"] -= commission
            positions.loc[day, "Cash"] -= notional*impact

        # Carry the holdings of this day forward to all later days so the
        # next order is checked against up-to-date positions.
        following = positions.index.get_loc(day) + 1
        positions.iloc[following:, :] = positions.iloc[following-1].values

    port_vals = prices*positions
    port_vals.insert(0, 'Portfolio', port_vals.sum(axis=1))
    return port_vals
def test_code():
    """Run the simulator on a sample order file and print stats against SPY."""
    orders_path = "./orders/orders-05.csv"
    starting_cash = 1000000

    # Process orders
    portvals = compute_portvals(orders_file = orders_path, start_val = starting_cash)
    if not isinstance(portvals, pd.DataFrame):
        print("warning, code did not return a DataFrame")
    else:
        # Keep only the first column (the portfolio total).
        portvals = portvals[portvals.columns[0]].to_frame()

    # Date span covered by the simulation
    first_day = pd.to_datetime(portvals.index.values[0])
    last_day = pd.to_datetime(portvals.index.values[-1])

    price_SPY = plot_against_SPY(portvals)

    # Portfolio stats calculated similar to assess_portfolio
    risk_free = 0
    samples = 252
    cr, adr, sddr, sr = compute_portfolio_stats(portvals, [1.0], risk_free, samples)
    crSP, adrSP, sddrSP, srSP = compute_portfolio_stats(price_SPY, [1.0], risk_free, samples)

    # Compare portfolio against SPY: each line shows portfolio, then benchmark.
    comparison = [
        ("Sharpe Ratio: {}, {}", sr, srSP),
        ("Cumulative Return: {}, {}", cr, crSP),
        ("Standard Deviation: {}, {}", sddr, sddrSP),
        ("Average Daily Return: {}, {}", adr, adrSP),
    ]
    print("\nDate Range: {} to {}".format(first_day.date(), last_day.date()))
    for template, ours, benchmark in comparison:
        print()
        print(template.format(ours, benchmark))
    print()
    print("Final Portfolio Value: {:.2f}".format(portvals['Portfolio'].iloc[-1]))
# Run the market-simulator demo only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    test_code()

371
util.py Normal file
View File

@ -0,0 +1,371 @@
"""
Use Yahoo Finance data
"""
import warnings
# Suppress FutureWarnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import datetime as dt
import os
import pandas as pd
import numpy as np
import yfinance as yf
import requests
from lxml import html
from io import StringIO
from time import sleep
WEBSITE = 'https://www.isolo.org/dokuwiki/knowledge_base/investing/watchlist'
BATCHSIZE = 20
TIMEGAP = 0.2
def fill_missing_data(df):
    """Return a copy of ``df`` with gaps forward-filled, then backward-filled.

    The input is left untouched. The backward pass only affects values that
    are still missing after the forward pass, i.e. those at the start of a
    column.
    """
    return df.ffill().bfill()
def symbol_to_path(symbol, base_dir=None):
    """Return the CSV file path for a ticker symbol.

    When ``base_dir`` is not given, the directory is taken from the
    ``MARKET_DATA_DIR`` environment variable, falling back to '../data/'.
    """
    if base_dir is not None:
        folder = base_dir
    else:
        folder = os.environ.get("MARKET_DATA_DIR", '../data/')
    file_name = "{}.csv".format(str(symbol))
    return os.path.join(folder, file_name)
def get_data_full(symbols, start_date, addSPY=True, colname = 'Adj Close'):
    """
    Read stock data (adjusted close) for given symbols from Yahoo Finance
    from start_date to the latest date available (usually the current date).

    Parameters
    ----------
    symbols : list of str
        Ticker symbols to download.
    start_date : str or datetime
        First date to request.
    addSPY : bool
        When true, 'SPY' is prepended to the request if it is absent.
    colname : str
        Column of the downloaded data to return.

    Returns
    -------
    pandas.DataFrame
        ``colname`` values, one column per symbol.
    """
    if addSPY and 'SPY' not in symbols:  # add SPY for reference, if absent
        symbols = ['SPY'] + symbols
    df = yf.download(symbols, start = start_date)[colname]
    # A single-symbol request may come back as a Series (flat columns) or
    # already as a one-column DataFrame, depending on the yfinance version.
    # Only the Series needs converting; calling to_frame() on a DataFrame
    # raises AttributeError.
    if len(symbols) == 1 and isinstance(df, pd.Series):
        df = df.to_frame(symbols[0])
    return df
# def get_data_full(symbols, start_date, addSPY=True, colname = 'Adj Close'):
"""
Read stock data (adjusted close) for given symbols from CSV files
from start_date to the latest date available in the CSV files.
"""
# df_temp = pd.read_csv(symbol_to_path('SPY'), index_col='Date',
# parse_dates=True, usecols=['Date', colname], na_values=['nan'])
# df_temp = df_temp.rename(columns={colname: 'SPY'})
# end_date = df_temp.index.values[-1]
# dates = pd.date_range(start_date, end_date)
# df = pd.DataFrame(index=dates)
# df = df.join(df_temp)
# df = df.dropna()
# # if addSPY and 'SPY' not in symbols: # add SPY for reference, if absent
# # symbols = ['SPY'] + symbols
# for symbol in symbols:
# df_temp = pd.read_csv(symbol_to_path(symbol), index_col='Date',
# parse_dates=True, usecols=['Date', colname], na_values=['nan'])
# df_temp = df_temp.rename(columns={colname: symbol})
# df = df.join(df_temp)
# # if symbol == 'SPY': # drop dates SPY did not trade
# # df = df.dropna(subset=["SPY"])
# if not addSPY:
# df = df[symbols]
# return df
def get_data_range(df, dates):
    """
    Extract the rows of ``df`` whose index values appear in ``dates``.

    An empty frame indexed by ``dates`` is inner-joined with ``df``, so only
    labels present in both are kept.
    """
    return pd.DataFrame(index=dates).join(df, how='inner')
def get_data(symbols, dates, addSPY=True, colname = 'Adj Close'):
    """
    Read stock data (adjusted close) for given symbols from Yahoo Finance.

    Parameters
    ----------
    symbols : list of str
        Ticker symbols to download.
    dates : sequence of dates
        Only the first and last entries are used; they bound the request and
        are both included.
    addSPY : bool
        SPY is always downloaded and used to drop rows where it has no value.
        When false, the result is reduced to the symbols originally asked for.
    colname : str
        Column of the downloaded data to return.

    Returns
    -------
    pandas.DataFrame
        ``colname`` values, one column per symbol, restricted to rows where
        SPY has data, with remaining gaps forward- then backward-filled.
    """
    org_sym = symbols
    sd = dates[0]
    ed = dates[-1]
    # SPY is fetched regardless of addSPY because it defines the valid rows.
    if 'SPY' not in symbols:
        symbols = ['SPY'] + symbols
    # yfinance treats ``end`` as exclusive. Request up to the start of the day
    # after ``ed`` so that the last requested date is part of the result.
    end_exclusive = pd.Timestamp(ed).normalize() + pd.Timedelta(days=1)
    df = yf.download(symbols, start=sd, end=end_exclusive)[colname]
    # A single-symbol request may come back as a Series or as a one-column
    # DataFrame depending on the yfinance version; only convert the Series.
    if len(symbols) == 1 and isinstance(df, pd.Series):
        df = df.to_frame(symbols[0])
    df = df.dropna(subset=['SPY'])
    df = fill_missing_data(df)
    if not addSPY:
        df = df[org_sym]
    return df
def yf_batch_download(symbols, start, end, batch_size, time_gap):
    """
    Download in small batches to avoid connection closure by host.

    Parameters
    ----------
    symbols : list
        stock symbols.
    start : datetime
        start date.
    end : datetime
        stop date.
    batch_size : integer
        maximum number of symbols per request; must be at least 1.
    time_gap : float
        pause between consecutive requests, in seconds or fraction of seconds.

    Returns
    -------
    df : dataframe
        stock price volume information, the batches concatenated column-wise.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")
    n = len(symbols)
    # Step through the symbols in strides of batch_size so no request exceeds
    # the limit (the previous arithmetic let the last request carry up to
    # 2 * batch_size - 1 symbols). An empty symbol list still issues one
    # request, as it did before.
    batch_starts = range(0, n, batch_size) if n else [0]
    df = pd.DataFrame()
    for position, lo in enumerate(batch_starts):
        if position:
            # Pause between requests only, not after the final one.
            sleep(time_gap)
        tmp = yf.download(symbols[lo:lo + batch_size], start, end)
        df = pd.concat([df, tmp], axis=1)
    return df
def get_price_volume(symbols, dates, addSPY=False):
    """
    Read stock data (adjusted close and volume) for given symbols.

    The original design read from a local pickle and only went to Yahoo
    Finance for what was missing, refreshing symbols listed in _refresh.csv
    (needed after stock splits, spin-offs and similar events). That local
    cache is commented out below, so every call downloads from Yahoo Finance.

    Parameters
    ----------
    symbols : list of str
        Ticker symbols. 'SPY' is always added to the request.
    dates : sequence of dates
        Only the first and last entries are used as request bounds.
    addSPY : bool
        When false, the results are reduced to the symbols originally asked
        for.

    Returns
    -------
    (price, volume) : tuple of pandas objects
        'Adj Close' values with gaps forward- then backward-filled, and
        'Volume' values with gaps replaced by 0, for rows where SPY has an
        adjusted close.
    """
    # DATAFILE = "_stkdata.pickle"
    # REFRESH = "_refresh.csv"
    org_sym = symbols
    sd = dates[0]
    ed = dates[-1]
    # if addSPY and 'SPY' not in symbols: # add SPY for reference, if absent
    if 'SPY' not in symbols:
        symbols = ['SPY'] + symbols
    df = yf_batch_download(symbols, start=sd, end=ed, \
                           batch_size=BATCHSIZE, time_gap=TIMEGAP)
    # With one symbol the columns are rebuilt as (field, symbol) pairs so the
    # frame has the same two-level layout as a multi-symbol download.
    # NOTE(review): assumes a single-symbol download has flat columns; confirm
    # against the installed yfinance version.
    if len(symbols) == 1:
        tuples = list(zip(df.columns.values.tolist(), \
                          [symbols[0]]*len(df.columns.values)))
        df.columns = pd.MultiIndex.from_tuples(tuples, names=[None, None])
    # if not os.path.exists(DATAFILE):
    # df = yf_batch_download(symbols, start=sd, end=ed, \
    # batch_size=BATCHSIZE, time_gap=TIMEGAP)
    # if len(symbols) == 1:
    # tuples = list(zip(df.columns.values.tolist(), \
    # [symbols[0]]*len(df.columns.values)))
    # df.columns = pd.MultiIndex.from_tuples(tuples, names=[None, None])
    # else:
    # df = pd.read_pickle(DATAFILE)
    # exist_syms = df["Adj Close"].columns.values.tolist()
    # if os.path.exists(REFRESH):
    # try:
    # refresh_df = pd.read_csv(REFRESH, header=None)
    # refresh_syms = refresh_df.values.tolist()
    # refresh_syms = [x for sublist in refresh_syms for x in sublist]
    # remove_syms = [x for x in exist_syms if x in refresh_syms]
    # if remove_syms:
    # df.drop(columns=remove_syms, axis=1, level=1, inplace=True)
    # exist_syms = [x for x in exist_syms if x not in refresh_syms]
    # except:
    # pass
    # With the cache disabled nothing counts as already present. As a result
    # intersect_syms below is always empty and new_stks ends up holding every
    # requested symbol.
    exist_syms = []
    last_day = pd.to_datetime(df.index.values[-1])
    first_day = pd.to_datetime(df.index.values[0])
    intersect_syms = list(set(org_sym) & set(exist_syms))
    # reduce df to only contain intersect_syms
    # (intersect_syms is empty here, so this keeps the rows but no columns)
    df = df.loc[:, (slice(None), intersect_syms)]
    if sd < first_day:
        # fill gap from online
        # NOTE(review): this requests an empty symbol list; kept from the
        # cached design. Confirm what yfinance returns in that case.
        tmp_df = yf_batch_download(intersect_syms, start=sd, end=first_day, \
                                   batch_size=BATCHSIZE, time_gap=TIMEGAP)
        df = pd.concat([tmp_df, df])
    if ed >= last_day:
        # fill gap from online incl last two days to get mkt close data
        if ed.date() == last_day.date():
            tmp_df = yf_batch_download(intersect_syms, start=ed, end=ed, \
                                       batch_size=BATCHSIZE, time_gap=TIMEGAP)
        else:
            tmp_df = yf_batch_download(intersect_syms, start=last_day, end=ed, \
                                       batch_size=BATCHSIZE, time_gap=TIMEGAP)
        # The last existing row is dropped before the refreshed rows are
        # appended.
        df = pd.concat([df[:-1], tmp_df])
    # get data online when new stks were added
    new_stks = np.setdiff1d(symbols, exist_syms).tolist()
    if not new_stks == []:
        # This second download over the full date range is what actually
        # supplies the columns of the result.
        tmp_df = yf_batch_download(new_stks, start=sd, end=ed, \
                                   batch_size=BATCHSIZE, time_gap=TIMEGAP)
        if len(new_stks) == 1:
            tuples = list(zip(tmp_df.columns.values.tolist(), \
                              [new_stks[0]]*len(tmp_df.columns.values)))
            tmp_df.columns = pd.MultiIndex.from_tuples(tuples, names=[None, None])
        df = df.join(tmp_df)
    # df.to_pickle(DATAFILE) # save to local, overwrite existing file
    # if os.path.exists(REFRESH):
    # with open(REFRESH, 'w'):
    # pass
    # Keep only rows where SPY has an adjusted close.
    df = df.dropna(subset=[('Adj Close', 'SPY')])
    price = df['Adj Close']
    price = fill_missing_data(price)
    volume = df['Volume']
    # Missing volume is treated as no trading.
    volume = volume.fillna(0)
    # if len(symbols) == 1:
    # price.name = symbols[0]
    # volume.name = symbols[0]
    # price = price.to_frame()
    # volume = volume.to_frame()
    if addSPY==False:
        price = price[org_sym]
        volume = volume[org_sym]
    return price, volume
def get_price_volume_online(symbols, dates, addSPY=False):
    """
    Read stock data (adjusted close and volume) for given symbols from Yahoo
    Finance.

    Parameters
    ----------
    symbols : list of str
        Ticker symbols. 'SPY' is always added to the request.
    dates : sequence of dates
        Only the first and last entries are used as request bounds.
    addSPY : bool
        When false, the results are reduced to the symbols originally asked
        for.

    Returns
    -------
    (price, volume) : tuple of pandas.DataFrame
        'Adj Close' values with gaps forward- then backward-filled, and
        'Volume' values with gaps replaced by 0, for rows where SPY has an
        adjusted close. One column per symbol.
    """
    org_sym = symbols
    sd = dates[0]
    ed = dates[-1]
    # SPY is fetched regardless of addSPY because it defines the valid rows.
    if 'SPY' not in symbols:
        symbols = ['SPY'] + symbols
    # NOTE(review): ``end`` is passed through unchanged; yfinance documents it
    # as exclusive, so confirm callers do not expect the last date included.
    df = yf.download(symbols, start=sd, end = ed)
    # Choose the SPY key from the actual column layout instead of the symbol
    # count. A one-symbol download has flat columns in some yfinance versions
    # and two-level (field, symbol) columns in others. Flat columns can only
    # belong to SPY, because SPY is always part of the request.
    if isinstance(df.columns, pd.MultiIndex):
        spy_key = ('Adj Close', 'SPY')
    else:
        spy_key = 'Adj Close'
    df = df.dropna(subset=[spy_key])
    price = fill_missing_data(df['Adj Close'])
    # Missing volume is treated as no trading.
    volume = df['Volume'].fillna(0)
    # Flat columns yield Series; promote them to one-column frames named after
    # the symbol so both layouts return the same shape.
    if isinstance(price, pd.Series):
        price = price.to_frame(symbols[0])
    if isinstance(volume, pd.Series):
        volume = volume.to_frame(symbols[0])
    if not addSPY:
        price = price[org_sym]
        volume = volume[org_sym]
    return price, volume
def get_watchlist(website: str = WEBSITE):
    """Download the watchlist embedded in a wiki page and parse it as CSV.

    Parameters
    ----------
    website : str
        URL of the page holding the watchlist. The list is read from the
        text of a ``<pre>`` element at a fixed position in the page.

    Returns
    -------
    pandas.DataFrame
        Watchlist indexed by its 'Symbol' column. Lines starting with '#'
        are ignored and empty fields are kept as empty strings.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    IndexError
        If the page does not contain the expected ``<pre>`` element.
    """
    # Use the caller's URL; previously the module constant was always fetched
    # and the argument was ignored.
    page = requests.get(website)
    # page = requests.get(website, verify=False) # skip certificate check for https
    # Fail on HTTP errors here rather than while parsing an error page.
    page.raise_for_status()
    tree = html.fromstring(page.content)
    watchlist = tree.xpath('//*[@id="dokuwiki__content"]/div[1]/div/div[3]/div/pre/text()')[0]
    file_name = StringIO(watchlist)
    df = pd.read_csv(file_name, index_col = 'Symbol',
                     comment = '#', na_filter=False)
    return df
# def get_watchlist(file_name: str = 'watchlist.csv'):
# df = pd.read_csv(file_name, index_col = 'Symbol',
# comment = '#', na_filter=False)
# return df
# def get_data(symbols, dates, addSPY=True, colname = 'Adj Close'):
# """
# Read stock data (adjusted close) for given symbols from CSV files.
# (done) TODO: there are nan values in the data when addSPY=False is passed. The
# strategy should be using SPY to clean the data first including fill
# forward and fill backward, then to drop the SPY if addSPY=False
# """
# org_sym = symbols
# df = pd.DataFrame(index=dates)
# # if addSPY and 'SPY' not in symbols: # add SPY for reference, if absent
# # symbols = ['SPY'] + symbols
# if 'SPY' not in symbols:
# symbols = ['SPY'] + symbols
# for symbol in symbols:
# df_temp = pd.read_csv(symbol_to_path(symbol), index_col='Date',
# parse_dates=True, usecols=['Date', colname], na_values=['nan'])
# df_temp = df_temp.rename(columns={colname: symbol})
# df = df.join(df_temp)
# if symbol == 'SPY': # drop dates SPY did not trade
# df = df.dropna(subset=["SPY"])
# # fill missing data
# df = fill_missing_data(df)
# if addSPY == False: # drop SPY
# # df = df.drop(columns=['SPY'])
# df = df[org_sym]
# return df
def plot_data(df, axs=None, title=None, xlabel='', ylabel=''):
    """Plot stock prices with a custom title and meaningful axis labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot, drawn with ``df.plot``.
    axs : axes object, optional
        Existing axes to draw on. When omitted (or an empty list, the former
        default) ``df.plot`` creates its own.
    title : str, optional
        Plot title, passed through to ``df.plot``.
    xlabel, ylabel : str
        Axis labels.

    Returns
    -------
    None
    """
    # ``None`` replaces the mutable ``[]`` defaults. An explicitly passed
    # empty list is still treated as "no axes given" for existing callers.
    no_axes_given = axs is None or (isinstance(axs, list) and not axs)
    if no_axes_given:
        ax = df.plot(title=title)
    else:
        ax = df.plot(ax=axs, title=title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.grid()
# def plot_data(df, title=[], xlabel='', ylabel=''):
# import matplotlib.pyplot as plt
# """Plot stock prices with a custom title and meaningful axis labels."""
# ax = df.plot(title=title, fontsize=12, figsize=(10, 7))
# ax.set_xlabel(xlabel)
# ax.set_ylabel(ylabel)
# plt.grid()
# plt.show()
def get_orders_data_file(basefilename):
    """Open an orders file for reading and return the file object.

    The directory comes from the ``ORDERS_DATA_DIR`` environment variable,
    defaulting to 'orders/'. The caller is responsible for closing the file.
    """
    orders_dir = os.environ.get("ORDERS_DATA_DIR", 'orders/')
    full_path = os.path.join(orders_dir, basefilename)
    return open(full_path)
def get_learner_data_file(basefilename):
    """Open a learner data file in text read mode and return the file object.

    The directory comes from the ``LEARNER_DATA_DIR`` environment variable,
    defaulting to 'Data/'. The caller is responsible for closing the file.
    """
    learner_dir = os.environ.get("LEARNER_DATA_DIR", 'Data/')
    full_path = os.path.join(learner_dir, basefilename)
    return open(full_path, 'r')
def get_robot_world_file(basefilename):
    """Open a robot world file for reading and return the file object.

    The directory comes from the ``ROBOT_WORLDS_DIR`` environment variable,
    defaulting to 'testworlds/'. The caller is responsible for closing the
    file.
    """
    worlds_dir = os.environ.get("ROBOT_WORLDS_DIR", 'testworlds/')
    full_path = os.path.join(worlds_dir, basefilename)
    return open(full_path)
def test_code():
    """Smoke test: fetch about a month of price/volume data for two tickers."""
    tickers = ['GOOG', 'AMZN']
    # Lookback window expressed in years, plus one extra day.
    lookback_years = 0.08
    window = dt.timedelta(days = 365 * lookback_years + 1)
    end_day = dt.datetime.today()
    start_day = end_day - window
    # If the start or end day falls on a non-trading day, you might get
    # warnings saying "No data found for this date range, symbol may be
    # delisted". This is normal behavior.
    prices, volume = get_price_volume(tickers, pd.date_range(start_day, end_day),
                                      addSPY=False)
# Run the download smoke test only when this file is executed as a script,
# not when it is imported.
if __name__ == '__main__':
    test_code()