Compare commits

..

No commits in common. "fetch_on_demand_simple_auth" and "obsolete" have entirely different histories.

12 changed files with 1587 additions and 1940 deletions

View File

@ -1,3 +0,0 @@
.*
__pycache__
!.env

4
.gitignore vendored
View File

@ -1,4 +0,0 @@
.*
__pycache__
!.gitignore
!.dockerignore

View File

@ -1,8 +0,0 @@
# Image that runs indicators.py on Python 3.12.
FROM python:3.12-slim

# Build prerequisites (presumably for compiling psycopg2 -- confirm).
# The apt package lists are removed in the same layer so they do not end up
# in the final image.
RUN apt-get update \
    && apt-get install -y libpq-dev gcc \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app
COPY . /app

# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip install --no-cache-dir -U pip \
    && pip install --no-cache-dir -r requirements.txt

# Overwrite the installed yahoo_fin module with the project's own copy.
# COPY is used instead of ADD because this is a plain local file (no URL,
# no archive extraction needed).
COPY stock_info.py /usr/local/lib/python3.12/site-packages/yahoo_fin/

EXPOSE 8050
CMD ["python", "./indicators.py"]

0
README.md Normal file
View File

File diff suppressed because it is too large Load Diff

163
marketsim.py Normal file
View File

@ -0,0 +1,163 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Feb 5 21:56:42 2020
@author: cpan
"""
"""MC2-P1: Market simulator."""
import pandas as pd
import numpy as np
# import datetime as dt
# import os
from util import get_data, plot_data
def normalize_data(df):
    """Return ``df`` with every column divided by its value in the first row."""
    first_row = df.iloc[0, :]
    return df / first_row
def fill_missing_values(df_data):
    """Fill gaps in ``df_data`` in place: forward fill first, then backward fill.

    Forward filling carries the last known value into later gaps; the
    backward fill then only has leading gaps left to fill.

    Parameters
    ----------
    df_data : pandas.DataFrame
        Modified in place.

    Returns
    -------
    None
    """
    # DataFrame.fillna(method=...) is deprecated in recent pandas releases;
    # ffill()/bfill() are the direct replacements.
    df_data.ffill(inplace=True)
    df_data.bfill(inplace=True)
def get_orders(orders_file):
    """Load orders, drop incomplete rows and sort them by index.

    Parameters
    ----------
    orders_file : pandas.DataFrame or anything ``pandas.read_csv`` accepts
        A DataFrame is used as given; otherwise the CSV is read with its
        'Date' column parsed and used as the index.

    Returns
    -------
    pandas.DataFrame
        Orders without rows containing missing values, sorted by index.
    """
    if not isinstance(orders_file, pd.DataFrame):
        raw_orders = pd.read_csv(orders_file, index_col='Date',
                                 parse_dates=True, na_values=['nan'])
    else:
        raw_orders = orders_file
    return raw_orders.dropna().sort_index()
def compute_daily_returns(df):
    """Return row-over-row returns of ``df``; the first row is set to 0.

    The input frame is left untouched; a copy with the same index and
    columns is returned.
    """
    returns = df.copy()
    # Divide by the raw values of the previous rows so the division is
    # positional rather than index-aligned.
    ratio = df[1:] / df[:-1].values
    returns[1:] = ratio - 1
    returns.iloc[0, :] = 0
    return returns
def compute_portfolio_stats(price, allocs=[0.1, 0.2, 0.3, 0.4], rfr=0.0, sf=252.0):
    """Compute summary statistics of a portfolio built from ``price``.

    Parameters
    ----------
    price : pandas.DataFrame
        One column per asset.
    allocs : list of float
        Weight applied to each normalized price column (one per column).
        The default is never mutated here. It used to read
        ``[0.1,0.2,0,3,0.4]`` -- a ``0,3`` typo that produced five weights.
    rfr : float
        Risk-free rate subtracted from the average return in the Sharpe ratio.
    sf : float
        Sampling frequency; the Sharpe ratio is scaled by ``sqrt(sf)``.

    Returns
    -------
    cr, adr, sddr, sr
        Cumulative return, mean of the daily returns, standard deviation of
        the daily returns and Sharpe ratio, as produced by the pandas
        operations below (one entry per portfolio column).
    """
    norm_price = normalize_data(price)
    norm_positions_val = norm_price * allocs
    if len(norm_positions_val.columns) == 1:
        # Single asset: the weighted column already is the portfolio.
        norm_portfolio_val = norm_positions_val
    else:
        norm_portfolio_val = norm_positions_val.sum(axis=1).to_frame('PORTFOLIO')
    cr = norm_portfolio_val.iloc[-1] / norm_portfolio_val.iloc[0] - 1
    daily_returns = compute_daily_returns(norm_portfolio_val)
    daily_returns = daily_returns[1:]  # remove first row (all zeros)
    adr = daily_returns.mean()
    sddr = daily_returns.std()
    sr = np.sqrt(sf) * (adr - rfr) / sddr
    return cr, adr, sddr, sr
def plot_against_SPY(df):
    """Plot the normalized columns of ``df`` together with SPY.

    If ``df`` has no 'SPY' column, SPY data is fetched with ``get_data`` for
    the dates in ``df``'s index and added to the plotted copy. ``df`` itself
    is not modified.

    Returns
    -------
    The SPY data that was plotted: the fetched frame, or the existing 'SPY'
    column when ``df`` already had one.
    """
    plot_frame = df.copy()
    if 'SPY' in plot_frame.columns:
        spy = plot_frame['SPY']
    else:
        spy = get_data(['SPY'], pd.to_datetime(plot_frame.index.values))
        plot_frame['SPY'] = spy.values
    plot_data(normalize_data(plot_frame))
    return spy
def compute_portvals(orders_file = "./orders/orders.csv", start_val = 1000000,
                     commission=9.95, impact=0.005):
    """
    Simulate a list of orders and return the value of every position per day.

    Parameters
    ----------
    orders_file : anything accepted by get_orders()
        Orders indexed by date with 'Symbol', 'Order' and 'Shares' columns.
    start_val : starting cash.
    commission : fixed amount deducted from cash for every executed order.
    impact : fraction of the traded value (shares * price) deducted from
        cash for every executed order.

    Returns
    -------
    pandas.DataFrame
        'Portfolio' (sum of the row) as first column, followed by the value
        held in each symbol and in 'Cash'.

    Notes
    -----
    A BUY is skipped when the cash held on that date does not cover price,
    commission and impact. A SELL is skipped when fewer shares are held than
    ordered. Rows whose 'Order' is neither BUY nor SELL change nothing.
    """
    #read in order data
    orders_df = get_orders(orders_file)
    #scan symbols
    symbols = list(set(orders_df['Symbol'].values))
    #get date range
    dates = pd.date_range(orders_df.index.values[0], orders_df.index.values[-1])
    #read in prices
    prices = get_data(symbols, dates)
    # fill_missing_values(prices) # included in get_data() function
    prices = prices[symbols]
    #add an extra column 'Cash' and initialize it to all ones
    # (a price of 1 makes prices * positions yield the cash amount itself)
    prices['Cash'] = np.ones(prices.shape[0])
    #duplicate price df into a units df, intialize it to all zeros
    positions=prices*0.0
    #initialize cash position with starting value
    positions.loc[positions.index.values[0],['Cash']]=start_val
    #adjust positions to show how stock units and cash are changing over time with orders
    for index, row in orders_df.iterrows():
        stock_sym = row['Symbol']
        # NOTE(review): looks up the price by the order's date label, so the
        # date presumably has to exist in the price index -- confirm.
        order_price = prices.loc[index, stock_sym]
        order_shrs = row['Shares']
        if row['Order'].upper() == 'BUY':
            if positions.loc[index, 'Cash'] < order_shrs*order_price +\
            commission + order_shrs*order_price*impact:
                # print('Not enough cash to excute the order:\n', row)
                pass
            else:
                #update positions on transaction days
                positions.loc[index, stock_sym] += order_shrs
                positions.loc[index, "Cash"] -= order_shrs*order_price
                #deduct commission
                positions.loc[index,"Cash"] -= commission
                #impact = no. of orders in transaction * price of each share * impact.
                positions.loc[index,"Cash"] -= order_shrs*order_price*impact
        elif row['Order'].upper() == 'SELL':
            if positions.loc[index, stock_sym] < order_shrs:
                # print('Not enough shares to sell to fill the order:\n', row)
                pass
            else:
                positions.loc[index, stock_sym] -= order_shrs
                positions.loc[index, "Cash"] += order_shrs*order_price
                #deduct commission
                positions.loc[index,"Cash"] -= commission
                #impact = no. of orders in transaction * price of each share * impact.
                positions.loc[index,"Cash"] -= order_shrs*order_price*impact
        # propagate positions beyond transaction days
        # (every later row is overwritten with the current row, so the next
        # order sees the holdings left by this one)
        start_row = positions.index.get_loc(index) + 1
        positions.iloc[start_row:, :] = positions.iloc[start_row-1].values
    #calculate port_vals
    port_vals=prices*positions
    port_vals.insert(0, 'Portfolio', port_vals.sum(axis=1))
    return port_vals
def test_code():
    """Run the simulator on a sample orders file and print its stats next to SPY's."""
    orders_path = "./orders/orders-05.csv"
    starting_cash = 1000000
    # Process orders
    portvals = compute_portvals(orders_file=orders_path, start_val=starting_cash)
    if not isinstance(portvals, pd.DataFrame):
        print("warning, code did not return a DataFrame")
    else:
        # keep only the first column, as a one-column frame
        first_col = portvals.columns[0]
        portvals = portvals[first_col].to_frame()
    # Date span covered by the simulation
    period_index = portvals.index.values
    start_date = pd.to_datetime(period_index[0])
    end_date = pd.to_datetime(period_index[-1])
    price_SPY = plot_against_SPY(portvals)
    # portfolio stats calculated similar to assess_portfolio
    rfr = 0
    sf = 252
    single_alloc = [1.0]
    cr, adr, sddr, sr = compute_portfolio_stats(portvals, single_alloc, rfr, sf)
    crSP, adrSP, sddrSP, srSP = compute_portfolio_stats(price_SPY, [1.0], rfr, sf)
    # Compare portfolio against $SPX
    print("\nDate Range: {} to {}".format(start_date.date(), end_date.date()))
    print()
    print("Sharpe Ratio: {}, {}".format(sr, srSP))
    print()
    print("Cumulative Return: {}, {}".format(cr, crSP))
    print()
    print("Standard Deviation: {}, {}".format(sddr, sddrSP))
    print()
    print("Average Daily Return: {}, {}".format(adr, adrSP))
    print()
    final_value = portvals['Portfolio'].iloc[-1]
    print("Final Portfolio Value: {:.2f}".format(final_value))
# Run the sample simulation only when this file is executed as a script.
if __name__ == "__main__":
    test_code()

View File

@ -1,63 +1,54 @@
appdirs==1.4.4
Authlib==1.3.2
beautifulsoup4==4.12.3
blinker==1.8.2
bs4==0.0.2
cachelib==0.9.0
certifi==2024.8.30
cffi==1.17.1
certifi==2024.6.2
charset-normalizer==3.3.2
click==8.1.7
colorama==0.4.6
cryptography==43.0.1
cssselect==1.2.0
dash==2.18.1
dash==2.17.1
dash-core-components==2.0.0
dash-html-components==2.0.0
dash-table==5.0.0
dash_auth==2.3.0
fake-useragent==1.5.1
feedparser==6.0.11
Flask==3.0.3
Flask-Caching==2.3.0
idna==3.10
importlib_metadata==8.5.0
idna==3.7
importlib_metadata==8.0.0
itsdangerous==2.2.0
Jinja2==3.1.4
lxml==5.3.0
lxml_html_clean==0.2.2
lxml==5.2.2
MarkupSafe==2.1.5
nest-asyncio==1.6.0
numpy==2.1.1
numpy==2.0.0
packaging==24.1
pandas==2.2.3
pandas==2.2.2
parse==1.20.2
plotly==5.24.1
psycopg2==2.9.9
pycparser==2.22
pyee==11.1.1
plotly==5.22.0
pyee==11.1.0
pyppeteer==2.0.0
pyquery==2.0.1
pyquery==2.0.0
python-dateutil==2.9.0.post0
python-dotenv==1.0.1
pytz==2024.2
pytz==2024.1
requests==2.32.3
requests-html==0.10.0
retrying==1.3.4
scipy==1.14.1
sec-cik-mapper==2.1.0
setuptools==75.1.0
scipy==1.14.0
setuptools==70.1.1
sgmllib3k==1.0.0
six==1.16.0
soupsieve==2.6
tenacity==9.0.0
tqdm==4.66.5
soupsieve==2.5
tenacity==8.4.2
tqdm==4.66.4
typing_extensions==4.12.2
tzdata==2024.2
urllib3==1.26.20
tzdata==2024.1
urllib3==1.26.19
w3lib==2.2.1
waitress==3.0.0
websockets==10.4
Werkzeug==3.0.4
Werkzeug==3.0.3
yahoo-fin==0.8.9.1
zipp==3.20.2
zipp==3.19.2

File diff suppressed because it is too large Load Diff

View File

@ -1,6 +0,0 @@
# Names exported by a star-import of this package.
# get_crossing and get_sma_slope are imported below but not listed here, so
# they are available as package attributes without being star-exported.
__all__ = ["Security", "remove_from_db", "insert_into_db", "get_watchlist", "hash_password"]
# Re-export the public names from the submodules
from .security import Security, get_crossing, get_sma_slope
from .dbutil import remove_from_db, insert_into_db, get_watchlist, hash_password

View File

@ -1,125 +0,0 @@
import hashlib
import psycopg2
import sys
import os
import pandas as pd
def connect_db():
    """Open a PostgreSQL connection configured from the environment.

    Reads DB_PATH (host), DB_NAME, DB_USERNAME and DB_PASSWORD. On any
    failure the error is printed and the process exits with status 1.

    Returns:
        psycopg2 connector: psycopg2 postgresql connector
    """
    try:
        settings = dict(
            host=os.environ['DB_PATH'],
            database=os.environ['DB_NAME'],
            user=os.environ['DB_USERNAME'],
            password=os.environ['DB_PASSWORD'],
        )
        return psycopg2.connect(**settings)
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
        sys.exit(1)
def get_watchlist(username : str):
    """Read list of tickers/descriptions from database

    The table is created if it does not exist yet and seeded with SPY when it
    is empty, so the result always has at least one row.

    Args:
        username (str): database table prefix; a falsy value selects the
            default table "stock_watch_list"

    Returns:
        Pandas DataFrame: it has two columns - first column is ticker, second column is description
    """
    if username:
        table_name = f"{username + '_watch_list'}"
    else:  # username is None, use default table
        table_name = "stock_watch_list"
    # NOTE(review): the table name is interpolated into the SQL text.
    # Identifiers cannot be bound as query parameters, so callers must only
    # pass trusted user names here.
    QUERY1 = f'''CREATE TABLE IF NOT EXISTS {table_name}
    (
        tick character varying(5) NOT NULL,
        description text,
        PRIMARY KEY (tick)
    );'''
    QUERY2 = f"INSERT INTO {table_name} SELECT 'SPY', 'SPDR S&P 500 ETF Trust' WHERE NOT EXISTS (SELECT NULL FROM {table_name});"
    QUERY3 = f"SELECT * FROM {table_name};"
    conn = connect_db()
    try:
        # "with conn" only commits or rolls back the transaction ...
        with conn:
            with conn.cursor() as curs:
                curs.execute(QUERY1)
                curs.execute(QUERY2)
                curs.execute(QUERY3)
                tuples_list = curs.fetchall()
    finally:
        # ... it does not close the connection, so close it explicitly to
        # avoid leaking one connection per call.
        conn.close()
    df = pd.DataFrame(tuples_list)
    return df
def remove_from_db(username, tick):
    """Remove a row from database table using ticker as key

    Args:
        username (str): database table prefix; a falsy value selects the
            default table "stock_watch_list"
        tick (str): ticker
    """
    if username:
        table_name = f"{username + '_watch_list'}"
    else:  # username is None, use default table
        table_name = "stock_watch_list"
    # NOTE(review): the table name is interpolated into the SQL text because
    # identifiers cannot be bound as parameters; pass trusted user names only.
    # The ticker value is bound by the driver instead of being spliced into
    # the statement, which rules out SQL injection through it.
    QUERY = f"DELETE FROM {table_name} WHERE tick = %s;"
    conn = connect_db()
    try:
        with conn:  # commits on success, rolls back on error
            with conn.cursor() as curs:
                curs.execute(QUERY, (tick,))
    finally:
        # The connection context manager does not close the connection.
        conn.close()
def insert_into_db(username : str, tick : str, name : str):
    """Insert ticker and description into database

    The table is created if needed and seeded with SPY when empty. A ticker
    that is already present is left unchanged (ON CONFLICT DO NOTHING).

    Args:
        username (str): database table prefix - each user has its own list of tickers
        tick (str): stock or mutual fund ticker
        name (str): company name for stock, series ID for mutual fund
    """
    if username:
        table_name = f"{username + '_watch_list'}"
    else:  # username is None, use default table
        table_name = "stock_watch_list"
    # NOTE(review): the table name is interpolated into the SQL text because
    # identifiers cannot be bound as parameters; pass trusted user names only.
    QUERY1 = f'''CREATE TABLE IF NOT EXISTS {table_name}
    (
        tick character varying(5) NOT NULL,
        description text,
        PRIMARY KEY (tick)
    );'''
    QUERY2 = f"INSERT INTO {table_name} SELECT 'SPY', 'SPDR S&P 500 ETF Trust' WHERE NOT EXISTS (SELECT NULL FROM {table_name});"
    # Values are bound by the driver: a description containing an apostrophe
    # no longer breaks the statement, and the values cannot inject SQL.
    QUERY3 = f"INSERT INTO {table_name} VALUES (%s, %s) ON CONFLICT DO NOTHING;"
    conn = connect_db()
    try:
        with conn:  # commits on success, rolls back on error
            with conn.cursor() as curs:
                curs.execute(QUERY1)
                curs.execute(QUERY2)
                curs.execute(QUERY3, (tick, name))
    finally:
        # The connection context manager does not close the connection.
        conn.close()
def hash_password(password : str):
    """Return the SHA-256 hex digest of ``password`` (UTF-8 encoded).

    Args:
        password (str): any text

    Returns:
        str: hash string (64 lowercase hexadecimal characters)
    """
    # NOTE(review): plain unsalted SHA-256; changing the scheme would
    # invalidate any hashes already stored -- confirm before touching.
    return hashlib.sha256(password.encode('utf-8')).hexdigest()

View File

@ -1,341 +0,0 @@
import numpy as np
from numpy.fft import fft, ifft
import scipy.signal as sig
class Security:
    """
    Price (and optionally volume) history with technical-indicator helpers.

    This can be a list of stocks, bonds, or other investment vehicles.
    price - Pandas DataFrame with datetime as index sorted in chronological
    order. ``symbol``, ``price`` and ``volume`` are read-only properties.
    """

    def __init__(self, sym, price, volume=None, rfr: float = 0.01, sf: float = 252.0):
        """
        Parameters
        ----------
        sym : identifier stored as the read-only ``symbol`` property.
        price : TYPE pandas.DataFrame
            DESCRIPTION. historical adj. daily close prices of stocks under
            consideration
        volume : TYPE pandas.DataFrame
            DESCRIPTION. daily trading volume. The default is None.
        rfr : TYPE float, optional
            DESCRIPTION. annualized risk free rate. The default is 0.01.
        sf : TYPE sample frequency, optional
            DESCRIPTION. The default is 252 (daily). There are 252 trading
            days in a year. Monthly sampling frequency would be 12. And
            weekly sampling frequency is 52.
        """
        self._symbol = sym
        self._price = price
        self._volume = volume
        self._rfr = rfr
        self._sf = sf

    @property
    def symbol(self):
        return self._symbol

    @symbol.setter
    def symbol(self, value):
        raise AttributeError('security symbol is read only')

    @property
    def price(self):
        return self._price

    @price.setter
    def price(self, value):
        raise AttributeError('security price is read only')

    @property
    def volume(self):
        """Trading volume; raises ValueError when none was supplied."""
        if self._volume is None:
            raise ValueError('trading volume information not available')
        return self._volume

    @volume.setter
    def volume(self, value):
        raise AttributeError('security volume is read only')

    @staticmethod
    def _seeded_ema(data, window):
        """EMA of ``data`` seeded with the plain mean of its first ``window`` rows.

        Works on a copy so the frame stored on the instance is never
        modified. Rows before ``window - 1`` are NaN in the result.
        """
        seeded = data.copy()
        seed = seeded.iloc[0:window].mean()
        seeded.iloc[window - 1] = seed
        seeded.iloc[0:(window - 1)] = np.nan
        return seeded.ewm(span=window, adjust=False).mean()

    def sma(self, window):
        """Simple moving average of price over ``window`` rows."""
        return self.price.rolling(window).mean()

    def vwma(self, window):
        """
        Volume weighted moving average. When plotted against sma, it gives an
        early indicator when VWMA crosses SMA. When VWMA is above SMA, it
        indicates a strong upward trend and vice versa.
        """
        price_vol = self.price * self.volume
        return price_vol.rolling(window).sum() / self.volume.rolling(window).sum()

    def vosma(self, window):
        """Simple moving average of volume over ``window`` rows."""
        return self.volume.rolling(window).mean()

    def ema(self, window):
        """Exponential moving average of price (span = ``window``).

        The stored price frame is left untouched. Previously this method
        wrote the seed value and NaNs into it, which corrupted every later
        indicator call (e.g. the second EMA computed by ``macd``).
        """
        return self._seeded_ema(self.price, window)

    def voema(self, window):
        """Exponential moving average of volume (span = ``window``).

        The stored volume frame is left untouched.
        """
        return self._seeded_ema(self.volume, window)

    def rsi(self, window = 14):
        """
        Traditional interpretation and usage of the RSI are that values of 70
        or above indicate that a security is becoming overbought or overvalued
        and may be primed for a trend reversal or corrective pullback in price.
        An RSI reading of 30 or below indicates an oversold or undervalued
        condition.
        """
        # use exponential averaging
        d_chg = self.price.diff()
        d_up, d_dn = d_chg.copy(), d_chg.copy()
        d_up[d_up < 0] = 0
        d_dn[d_dn > 0] = 0
        # EMA pre-process the first point: row `window` holds the plain mean
        # of the first `window` changes, earlier rows are blanked out
        temp = d_up.iloc[1:(window+1)].mean()
        d_up.iloc[window] = temp
        d_up.iloc[1:window] = np.nan
        temp = d_dn.iloc[1:(window+1)].mean()
        d_dn.iloc[window] = temp
        d_dn.iloc[1:window] = np.nan
        # process the EMA
        avg_up = d_up.ewm(span=window, adjust=False).mean()
        avg_dn = d_dn.ewm(span=window, adjust=False).mean()
        rs = avg_up / abs(avg_dn.values)
        exp_rsi = 100 - 100 / (1+rs)
        return exp_rsi

    def volume_rsi(self, window = 14):
        """
        The volume RSI (Relative Strength Index) is quite similar to the price
        based RSI with the difference that up-volume and down-volume are used
        in the RSI formula instead of changes in price. If price RSI shows the
        relation between up-moves and down-moves within an analyzed period of
        time by revealing which moves are stronger, the volume RSI indicator
        shows the relation between volume traded during these price up-moves
        and down-moves respectively by revealing whether up-volume (bullish
        money flow) or down-volume (bearish money flow) is stronger.

        The same as price RSI, volume RSI oscillates around the 50%
        center-line in the range from 0 to 100%. The simplest way of using the
        volume RSI is to generate trading signals on the crossovers of the
        indicator and the 50% center-line:
        volume RSI readings above 50% are considered bullish as bullish volume
        dominates over bearish volume; readings below 50% are considered
        bearish as bearish volume overcomes bullish volume.
        Buy when the indicator moves above the 50% line after being below it;
        Sell when the indicator drops below the 50% line after being above it.
        """
        # use exponential averaging
        volume = self.volume
        up_vol, dn_vol = volume.copy(), volume.copy()
        d_chg = self.price.diff()
        up_vol[d_chg < 0] = 0
        dn_vol[d_chg > 0] = 0
        up_vol.iloc[0] = np.nan
        dn_vol.iloc[0] = np.nan
        # EMA pre-process the first point
        temp = up_vol.iloc[1:(window+1)].mean()
        up_vol.iloc[window] = temp
        up_vol.iloc[1:window] = np.nan
        temp = dn_vol.iloc[1:(window+1)].mean()
        dn_vol.iloc[window] = temp
        dn_vol.iloc[1:window] = np.nan
        # EMA processing
        avg_up = up_vol.ewm(span=window, adjust=False).mean()
        avg_dn = dn_vol.ewm(span=window, adjust=False).mean()
        rs = avg_up / avg_dn.values
        exp_rsi = 100 - 100 / (1+rs)
        return exp_rsi

    def daily_returns(self):
        """Row-over-row percentage change of price."""
        return self.price.pct_change()

    @property
    def annualized_return(self):
        """Mean of the returns scaled by the sample frequency."""
        dr = self.daily_returns()
        return self._sf * dr.mean()

    @property
    def annualized_stdev(self):
        """Standard deviation of the returns scaled by sqrt(sample frequency)."""
        dr = self.daily_returns()
        return np.sqrt(self._sf) * dr.std()

    @property
    def sharpe(self):
        """(annualized return - risk free rate) / annualized standard deviation."""
        # Fixed: this used to read self.annualize_stdev, which does not exist
        # and raised AttributeError on every access.
        return (self.annualized_return - self._rfr) / self.annualized_stdev

    def rolling_stdev(self, window):
        """Rolling standard deviation of price over ``window`` rows."""
        return self.price.rolling(window).std()

    def bollinger(self, window):
        """
        Parameters
        ----------
        window : TYPE int
            DESCRIPTION - averaging window in days.

        Returns
        -------
        lower, upper : TYPE pandas.DataFrame
            DESCRIPTION - lower band (minus 2 sigma) and the upper band.
        """
        avg = self.sma(window)
        sdd2 = self.rolling_stdev(window).mul(2)
        lower = avg.sub(sdd2.values)
        upper = avg.add(sdd2.values)
        return lower, upper

    def macd(self, short_wd = 12, long_wd = 26, sig_wd = 9):
        """
        MACD Line: (12-day EMA - 26-day EMA)
        Signal Line: 9-day EMA of MACD Line
        MACD Histogram: MACD Line - Signal Line

        MACD is calculated by subtracting the 26-period EMA from the 12-period
        EMA. MACD triggers technical signals when it crosses above (to buy) or
        below (to sell) its signal line. The speed of crossovers is also taken
        as a signal of a market being overbought or oversold. MACD helps
        investors understand whether the bullish or bearish movement in the
        price is strengthening or weakening.

        The MACD histogram represents signal line crossovers, which are the
        most common MACD signals. As a moving average of the indicator, the
        signal line trails the MACD and makes it easier to spot MACD turns. A
        bullish crossover occurs when the MACD turns up and crosses above the
        signal line. A bearish crossover occurs when the MACD turns down and
        crosses below the signal line. Crossovers can last a few days or a few
        weeks, depending on the strength of the move.

        Returns
        -------
        macd_line, macd_sig, macd_hist, norm_hist
            norm_hist is the histogram divided by the long EMA.
        """
        macd_short = self.ema(short_wd)
        macd_long = self.ema(long_wd)
        macd_line = macd_short - macd_long.values
        macd_sig = macd_line.ewm(span=sig_wd, adjust=False).mean()
        macd_hist = macd_line - macd_sig.values
        norm_hist = macd_hist.div(macd_long.values)
        return macd_line, macd_sig, macd_hist, norm_hist
def get_crossing(stocks):
    """
    Flag where the 50 day moving average sits relative to the 200 day one.

    Parameters
    ----------
    stocks : TYPE instance of class 'Security' (uses ``sma`` and ``price``)

    Returns
    -------
    cross : TYPE pandas DataFrame
        DESCRIPTION - +1 when 50 day moving average is above 200 day moving
        average. -1 when vice versa. Transition days are of value +3 and -3
        respectively (the sign plus its own day-over-day change).
    """
    fast = stocks.sma(50)
    slow = stocks.sma(200)
    regime = np.sign(fast.sub(slow.values))
    regime_change = regime.diff()
    cross = regime.add(regime_change.values)
    cross.columns = stocks.price.columns
    return cross
def get_sma_slope(stocks, wd = 50):
    """
    Flag the direction of the ``wd`` day moving average.

    Parameters
    ----------
    stocks : TYPE object exposing ``sma(window)``
    wd : TYPE int, optional
        DESCRIPTION. Moving average window. The default is 50.

    Returns
    -------
    slope : TYPE pandas DataFrame
        DESCRIPTION - +1 when the n day moving average is rising, -1 when
        falling. Transition days are of value +3 and -3 respectively.
    """
    direction = np.sign(stocks.sma(wd).diff())
    direction_change = direction.diff()
    return direction.add(direction_change.values)
def fill_missing_data(df):
    """Fill gaps in ``df`` in place: forward fill first, then backward fill.

    Parameters
    ----------
    df : pandas DataFrame, modified in place.

    Returns
    -------
    None
    """
    df.ffill(inplace=True)
    # Fixed: this called the nonexistent ``df.bfilln`` and raised
    # AttributeError on every call.
    df.bfill(inplace=True)
def fft_convolve(signal, window):
    """Multiply the FFTs of ``signal`` and ``window`` and transform back.

    Both inputs must have the same length for the element-wise product.
    The result is the complex array returned by ``ifft``.
    """
    spectrum_product = fft(signal) * fft(window)
    return ifft(spectrum_product)
def zero_pad(array, n):
    """Return a zero array of length ``n`` whose head holds ``array``.

    array: numpy array
    n: length of result
    returns: new NumPy array
    """
    padded = np.zeros(n)
    head = len(array)
    padded[:head] = array
    return padded
def smooth(price, hsize=10, sigma=3):
    """
    Smooth every column of ``price`` with a normalized Gaussian window.

    Parameters
    ----------
    price : TYPE DataFrame.
        DESCRIPTION - with time index and no invalid values
    hsize : TYPE integer
        DESCRIPTION - window length; this adds phase delay, similar to an
        SMA window
    sigma : TYPE float
        DESCRIPTION - gaussian standard deviation, affects smoothness

    Returns
    -------
    TYPE DataFrame
        DESCRIPTION - smoothed price; the first ``hsize - 1`` rows are NaN.

    Doesn't offer much benefit over sma. Only of theoretical value; kept for
    future experiments with different smoothing functions.
    """
    # scipy.signal.gaussian is no longer available in current SciPy; the
    # window function lives in scipy.signal.windows.
    from scipy.signal.windows import gaussian

    data = price.copy()
    window = gaussian(M=hsize, std=sigma)
    window /= window.sum()  # unit gain
    padded = zero_pad(window, data.shape[0])
    for col in data.columns:
        ys = data[col].values
        smoo = abs(fft_convolve(ys, padded))
        # blank out the rows before the window is full
        smoo[0:hsize-1] = np.nan
        data[col] = smoo
    return data

384
util.py Normal file
View File

@ -0,0 +1,384 @@
"""
Use Yahoo Finance data
"""
import warnings
# Suppress FutureWarnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import datetime as dt
import os
import pandas as pd
import numpy as np
# import yfinance as yf
import yahoo_fin.stock_info as si
import requests
from lxml import html
from io import StringIO
from time import sleep
WEBSITE = 'https://www.isolo.org/dokuwiki/knowledge_base/investing/watchlist'
BATCHSIZE = 20
TIMEGAP = 0.2
def fill_missing_data(df):
    """Return a copy of ``df`` forward filled, then backward filled.

    The input frame is not modified.
    """
    return df.ffill().bfill()
def symbol_to_path(symbol, base_dir=None):
    """Return CSV file path given ticker symbol.

    Without ``base_dir`` the directory comes from the MARKET_DATA_DIR
    environment variable, falling back to '../data/'.
    """
    directory = base_dir
    if directory is None:
        directory = os.environ.get("MARKET_DATA_DIR", '../data/')
    file_name = "{}.csv".format(str(symbol))
    return os.path.join(directory, file_name)
# def get_data_full(symbols, start_date, addSPY=True, colname = 'Adj Close'):
# """
# Read stock data (adjusted close) for given symbols from Yahoo Finance
# from start_date to the latest date available (usually the current date).
# """
# if addSPY and 'SPY' not in symbols: # add SPY for reference, if absent
# symbols = ['SPY'] + symbols
# df = yf.download(symbols, start = start_date)[colname]
# if len(symbols) == 1:
# df.name = symbols[0]
# df = df.to_frame()
# return df
# def get_data_full(symbols, start_date, addSPY=True, colname = 'Adj Close'):
"""
Read stock data (adjusted close) for given symbols from CSV files
from start_date to the latest date available in the CSV files.
"""
# df_temp = pd.read_csv(symbol_to_path('SPY'), index_col='Date',
# parse_dates=True, usecols=['Date', colname], na_values=['nan'])
# df_temp = df_temp.rename(columns={colname: 'SPY'})
# end_date = df_temp.index.values[-1]
# dates = pd.date_range(start_date, end_date)
# df = pd.DataFrame(index=dates)
# df = df.join(df_temp)
# df = df.dropna()
# # if addSPY and 'SPY' not in symbols: # add SPY for reference, if absent
# # symbols = ['SPY'] + symbols
# for symbol in symbols:
# df_temp = pd.read_csv(symbol_to_path(symbol), index_col='Date',
# parse_dates=True, usecols=['Date', colname], na_values=['nan'])
# df_temp = df_temp.rename(columns={colname: symbol})
# df = df.join(df_temp)
# # if symbol == 'SPY': # drop dates SPY did not trade
# # df = df.dropna(subset=["SPY"])
# if not addSPY:
# df = df[symbols]
# return df
def get_data_range(df, dates):
    """
    Extract the rows of the full data set whose index falls in ``dates``.

    Implemented as an inner join of an empty frame indexed by ``dates`` with
    ``df``, so only dates present in both are returned.
    """
    return pd.DataFrame(index=dates).join(df, how='inner')
def yf_download(symbols, start, end):
    """Download adjusted close and volume for each symbol via yahoo_fin.

    Returns a DataFrame with two-level columns ("param", "tick"): the first
    level is "Adj Close" / "Volume", the second the ticker. Symbols are
    outer-joined on the index.

    Note: ``end`` is accepted but not used; data is requested from ``start``
    onwards only.
    """
    empty_columns = pd.MultiIndex(levels=[["Adj Close", "Volume"], []],
                                  codes=[[], []], names=["param", "tick"])
    df = pd.DataFrame(columns=empty_columns)
    for sym in symbols:
        quotes = si.get_data(sym, start_date=start)[["adjclose", "volume"]]
        quotes = quotes.rename(columns={"adjclose": "Adj Close", "volume": "Volume"})
        # put the ticker underneath each field name
        quotes.columns = pd.MultiIndex.from_product(
            [list(quotes.columns)] + [[sym]], names=["param", "tick"])
        df = df.join(quotes, how='outer')
    return df
# def get_data(symbols, dates, addSPY=True, colname = 'Adj Close'):
# """
# Read stock data (adjusted close) for given symbols from Yahoo Finance
# """
# org_sym = symbols
# sd = dates[0]
# ed = dates[-1]
# # if addSPY and 'SPY' not in symbols: # add SPY for reference, if absent
# if 'SPY' not in symbols:
# symbols = ['SPY'] + symbols
# df = yf.download(symbols, start=sd, end = ed)[colname]
# if len(symbols) == 1:
# df.name = symbols[0]
# df = df.to_frame()
# df = df.dropna(subset=['SPY'])
# df = fill_missing_data(df)
# if addSPY==False:
# # df = df.drop(columns=['SPY'])
# df = df[org_sym]
# return df
def yf_batch_download(symbols, start, end, batch_size, time_gap):
    """
    download in small batches to avoid connection closure by host

    Every batch holds at most ``batch_size`` symbols. Previously the last
    batch took everything that was left and could hold up to
    ``2 * batch_size - 1`` symbols (or all of them when there were fewer
    than ``2 * batch_size``).

    Parameters
    ----------
    symbols : list
        stock symbols.
    start : datetime
        start date.
    end : datetime
        stop date.
    batch_size : integer
        batch size, at least 1.
    time_gap : float
        pause between two batches, in seconds or fraction of seconds.

    Returns
    -------
    df : dataframe
        stock price volume information, batches concatenated column-wise in
        the order of ``symbols``.

    Raises
    ------
    ValueError
        if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")
    n = len(symbols)
    df = pd.DataFrame()
    # max(n, 1) keeps the previous behaviour of issuing a single (empty)
    # download when no symbols are given.
    for batch_no, lo in enumerate(range(0, max(n, 1), batch_size)):
        if batch_no:
            sleep(time_gap)  # pause between batches, not before the first
        tmp = yf_download(symbols[lo:lo + batch_size], start, end)
        df = pd.concat([df, tmp], axis=1)
    return df
def get_price_volume(symbols, dates, addSPY=False):
    """
    Download adjusted close and volume for ``symbols`` over ``dates``.

    SPY is always added to the download (if absent) and used to drop rows on
    which SPY has no adjusted close. Prices are then forward/backward filled
    and missing volume is set to 0.

    The local-cache logic this function was written around (pickle file plus
    a refresh list) is commented out: ``exist_syms`` is always empty, so all
    symbols are treated as new and fetched online.
    NOTE(review): as a consequence the first download appears to be thrown
    away and everything is downloaded a second time -- confirm and simplify.

    Parameters
    ----------
    symbols : list of ticker symbols.
    dates : sequence of dates; only the first and last entries are used.
    addSPY : when False the returned frames are restricted to the symbols
        originally requested.

    Returns
    -------
    price, volume : the 'Adj Close' and 'Volume' parts of the downloaded data.
    """
    # DATAFILE = "_stkdata.pickle"
    # REFRESH = "_refresh.csv"
    org_sym = symbols
    sd = dates[0]
    ed = dates[-1]
    # if addSPY and 'SPY' not in symbols: # add SPY for reference, if absent
    if 'SPY' not in symbols:
        symbols = ['SPY'] + symbols
    df = yf_batch_download(symbols, start=sd, end=ed, \
                           batch_size=BATCHSIZE, time_gap=TIMEGAP)
    if len(symbols) == 1:
        tuples = list(zip(df.columns.values.tolist(), \
                          [symbols[0]]*len(df.columns.values)))
        df.columns = pd.MultiIndex.from_tuples(tuples, names=[None, None])
    # if not os.path.exists(DATAFILE):
    #     df = yf_batch_download(symbols, start=sd, end=ed, \
    #                            batch_size=BATCHSIZE, time_gap=TIMEGAP)
    #     if len(symbols) == 1:
    #         tuples = list(zip(df.columns.values.tolist(), \
    #                           [symbols[0]]*len(df.columns.values)))
    #         df.columns = pd.MultiIndex.from_tuples(tuples, names=[None, None])
    # else:
    #     df = pd.read_pickle(DATAFILE)
    #     exist_syms = df["Adj Close"].columns.values.tolist()
    # if os.path.exists(REFRESH):
    #     try:
    #         refresh_df = pd.read_csv(REFRESH, header=None)
    #         refresh_syms = refresh_df.values.tolist()
    #         refresh_syms = [x for sublist in refresh_syms for x in sublist]
    #         remove_syms = [x for x in exist_syms if x in refresh_syms]
    #         if remove_syms:
    #             df.drop(columns=remove_syms, axis=1, level=1, inplace=True)
    #             exist_syms = [x for x in exist_syms if x not in refresh_syms]
    #     except:
    #         pass
    # With the cache disabled nothing counts as already available.
    exist_syms = []
    last_day = pd.to_datetime(df.index.values[-1])
    first_day = pd.to_datetime(df.index.values[0])
    intersect_syms = list(set(org_sym) & set(exist_syms))
    # reduce df to only contain intersect_syms
    df = df.loc[:, (slice(None), intersect_syms)]
    if sd < first_day:
        # fill gap from online
        tmp_df = yf_batch_download(intersect_syms, start=sd, end=first_day, \
                                   batch_size=BATCHSIZE, time_gap=TIMEGAP)
        df = pd.concat([tmp_df, df])
    if ed >= last_day:
        # fill gap from online incl last two days to get mkt close data
        if ed.date() == last_day.date():
            tmp_df = yf_batch_download(intersect_syms, start=ed, end=ed, \
                                       batch_size=BATCHSIZE, time_gap=TIMEGAP)
        else:
            tmp_df = yf_batch_download(intersect_syms, start=last_day, end=ed, \
                                       batch_size=BATCHSIZE, time_gap=TIMEGAP)
        df = pd.concat([df[:-1], tmp_df])
    # get data online when new stks were added
    new_stks = np.setdiff1d(symbols, exist_syms).tolist()
    if not new_stks == []:
        tmp_df = yf_batch_download(new_stks, start=sd, end=ed, \
                                   batch_size=BATCHSIZE, time_gap=TIMEGAP)
        if len(new_stks) == 1:
            tuples = list(zip(tmp_df.columns.values.tolist(), \
                              [new_stks[0]]*len(tmp_df.columns.values)))
            tmp_df.columns = pd.MultiIndex.from_tuples(tuples, names=[None, None])
        df = df.join(tmp_df)
    # df.to_pickle(DATAFILE) # save to local, overwrite existing file
    # if os.path.exists(REFRESH):
    #     with open(REFRESH, 'w'):
    #         pass
    # keep only the rows on which SPY has an adjusted close
    df = df.dropna(subset=[('Adj Close', 'SPY')])
    price = df['Adj Close']
    price = fill_missing_data(price)
    volume = df['Volume']
    volume = volume.fillna(0)
    # if len(symbols) == 1:
    #     price.name = symbols[0]
    #     volume.name = symbols[0]
    #     price = price.to_frame()
    #     volume = volume.to_frame()
    if addSPY==False:
        # restrict the result to the symbols the caller asked for
        price = price[org_sym]
        volume = volume[org_sym]
    return price, volume
# def get_price_volume_online(symbols, dates, addSPY=False):
# """
# Read stock data (adjusted close and volume) for given symbols from Yahoo
# Finance
# """
# org_sym = symbols
# sd = dates[0]
# ed = dates[-1]
# # if addSPY and 'SPY' not in symbols: # add SPY for reference, if absent
# if 'SPY' not in symbols:
# symbols = ['SPY'] + symbols
# df = yf.download(symbols, start=sd, end = ed)
# if len(symbols) == 1:
# df = df.dropna(subset = ['Adj Close'])
# else:
# df = df.dropna(subset=[('Adj Close', 'SPY')])
# price = df['Adj Close']
# price = fill_missing_data(price)
# volume = df['Volume']
# volume = volume.fillna(0)
# if len(symbols) == 1:
# price.name = symbols[0]
# volume.name = symbols[0]
# price = price.to_frame()
# volume = volume.to_frame()
# if addSPY==False:
# price = price[org_sym]
# volume = volume[org_sym]
# return price, volume
def get_watchlist(website: str = WEBSITE):
    """Download the watchlist embedded in a wiki page and parse it as CSV.

    Parameters
    ----------
    website : str
        URL of the page to read. The argument used to be ignored: the
        module-level WEBSITE was always fetched, whatever the caller passed.

    Returns
    -------
    pandas.DataFrame
        Parsed watchlist indexed by its 'Symbol' column. Lines starting with
        '#' are treated as comments and empty fields are kept as empty
        strings (``na_filter=False``).

    Raises
    ------
    IndexError
        if the page does not contain the expected ``<pre>`` block.
    """
    page = requests.get(website)
    # page = requests.get(website, verify=False) # skip certificate check for https
    tree = html.fromstring(page.content)
    # The watchlist is the text of a <pre> element at a fixed position inside
    # the dokuwiki content container.
    watchlist = tree.xpath('//*[@id="dokuwiki__content"]/div[1]/div/div[3]/div/pre/text()')[0]
    file_name = StringIO(watchlist)
    df = pd.read_csv(file_name, index_col = 'Symbol',
                     comment = '#', na_filter=False)
    return df
# def get_watchlist(file_name: str = 'watchlist.csv'):
# df = pd.read_csv(file_name, index_col = 'Symbol',
# comment = '#', na_filter=False)
# return df
# def get_data(symbols, dates, addSPY=True, colname = 'Adj Close'):
# """
# Read stock data (adjusted close) for given symbols from CSV files.
# (done) TODO: there are nan values in the data when addSPY=False is passed. The
# strategy should be using SPY to clean the data first including fill
# forward and fill backward, then to drop the SPY if addSPY=False
# """
# org_sym = symbols
# df = pd.DataFrame(index=dates)
# # if addSPY and 'SPY' not in symbols: # add SPY for reference, if absent
# # symbols = ['SPY'] + symbols
# if 'SPY' not in symbols:
# symbols = ['SPY'] + symbols
# for symbol in symbols:
# df_temp = pd.read_csv(symbol_to_path(symbol), index_col='Date',
# parse_dates=True, usecols=['Date', colname], na_values=['nan'])
# df_temp = df_temp.rename(columns={colname: symbol})
# df = df.join(df_temp)
# if symbol == 'SPY': # drop dates SPY did not trade
# df = df.dropna(subset=["SPY"])
# # fill missing data
# df = fill_missing_data(df)
# if addSPY == False: # drop SPY
# # df = df.drop(columns=['SPY'])
# df = df[org_sym]
# return df
def plot_data(df, axs=[], title=[], xlabel='', ylabel=''):
    """Plot ``df`` with a title, axis labels and a grid.

    When ``axs`` is left at its default the plot goes to whatever ``df.plot``
    creates; otherwise ``axs`` is handed to ``df.plot`` as ``ax``. The default
    lists are only compared and passed on, never modified.
    """
    plot_args = {'title': title}
    if not (axs == []):
        plot_args['ax'] = axs
    ax = df.plot(**plot_args)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.grid()
# def plot_data(df, title=[], xlabel='', ylabel=''):
# import matplotlib.pyplot as plt
# """Plot stock prices with a custom title and meaningful axis labels."""
# ax = df.plot(title=title, fontsize=12, figsize=(10, 7))
# ax.set_xlabel(xlabel)
# ax.set_ylabel(ylabel)
# plt.grid()
# plt.show()
def get_orders_data_file(basefilename):
    """Open ``basefilename`` from the orders directory and return the file object.

    The directory is taken from ORDERS_DATA_DIR, defaulting to 'orders/'.
    The caller is responsible for closing the returned file.
    """
    orders_dir = os.environ.get("ORDERS_DATA_DIR", 'orders/')
    return open(os.path.join(orders_dir, basefilename))
def get_learner_data_file(basefilename):
    """Open ``basefilename`` from the learner data directory for reading.

    The directory is taken from LEARNER_DATA_DIR, defaulting to 'Data/'.
    The caller is responsible for closing the returned file.
    """
    learner_dir = os.environ.get("LEARNER_DATA_DIR", 'Data/')
    return open(os.path.join(learner_dir, basefilename), 'r')
def get_robot_world_file(basefilename):
    """Open ``basefilename`` from the robot worlds directory and return the file object.

    The directory is taken from ROBOT_WORLDS_DIR, defaulting to 'testworlds/'.
    The caller is responsible for closing the returned file.
    """
    worlds_dir = os.environ.get("ROBOT_WORLDS_DIR", 'testworlds/')
    return open(os.path.join(worlds_dir, basefilename))
def test_code():
    """Smoke test: download recent price/volume data for two tickers."""
    tickers = ['GOOG', 'AMZN']
    # lookback years
    lookback_years = 0.08
    end_day = dt.datetime.today()
    start_day = end_day - dt.timedelta(days = 365 * lookback_years + 1)
    # If end_day or start_day falls on to a non-trading day, you might get
    # warnings saying "No data found for this date range, symbol may be
    # delisted". This is normal behavior.
    window = pd.date_range(start_day, end_day)
    prices, volume = get_price_volume(tickers, window, addSPY=False)
# Run the download smoke test only when this file is executed as a script.
if __name__ == '__main__':
    test_code()