source code

2024-06-19 10:33:24 -07:00 · 2024-06-19 10:33:24 -07:00 · fe44f5b4ff
commit fe44f5b4ff
parent 4a5c1569d1
3 changed files with 1557 additions and 0 deletions
--- a/indicators.py
+++ b/indicators.py
--- a/marketsim.py
+++ b/marketsim.py
@ -0,0 +1,163 @@
 # -*- coding: utf-8 -*-
 """
 Created on Wed Feb  5 21:56:42 2020
@author: cpan
 """
 """MC2-P1: Market simulator."""
 import pandas as pd
 import numpy as np
 # import datetime as dt
 # import os
 from util import get_data, plot_data
 def normalize_data(df):
    """Scale every column of ``df`` by its first-row value.

    After scaling, the first row is 1.0 in every column and later rows are
    expressed as multiples of that starting value.
    """
    first_row = df.iloc[0, :]
    return df / first_row
 def fill_missing_values(df_data):
    """Fill gaps in ``df_data`` in place: forward fill first, then backward fill.

    Forward fill carries the last known value across later gaps; the backward
    fill afterwards only affects gaps at the very start, where there is no
    earlier value to carry forward.

    The frame is modified in place and nothing is returned.
    """
    # fillna(method=...) is deprecated in pandas; ffill()/bfill() are the
    # supported equivalents and accept the same inplace flag.
    df_data.ffill(inplace=True)
    df_data.bfill(inplace=True)
 def get_orders(orders_file):
    """Load trade orders and return them cleaned and in chronological order.

    Parameters
    ----------
    orders_file : str, file-like or pandas.DataFrame
        Either something ``pandas.read_csv`` can read (with a ``Date`` column
        that becomes the index) or an already-built orders DataFrame indexed
        by date.

    Returns
    -------
    pandas.DataFrame
        A new frame with incomplete rows removed and rows sorted by date.
        The caller's DataFrame, if one was passed, is not modified.
    """
    if isinstance(orders_file, pd.DataFrame):
        orders_df = orders_file
    else:
        orders_df = pd.read_csv(orders_file, index_col='Date', parse_dates=True,
                                na_values=['nan'])
    # Both input kinds get the same clean-up: compute_portvals() takes the
    # first and last index values as the simulation date range, so the orders
    # must be sorted whichever way they arrived.
    orders_df = orders_df.dropna()
    # A stable sort keeps same-day orders in their original relative order,
    # which matters because each order is checked against current holdings.
    orders_df = orders_df.sort_index(kind='mergesort')
    return orders_df
 def compute_daily_returns(df):
    """Return the row-over-row fractional change of ``df``.

    Each row from the second onward holds ``current / previous - 1``; the
    first row, which has no predecessor, is set to 0. The input is left
    untouched.
    """
    returns = df.copy()
    # .values drops the index so the division pairs rows by position
    previous_rows = df[:-1].values
    returns[1:] = (df[1:] / previous_rows) - 1
    returns.iloc[0, :] = 0
    return returns
 def compute_portfolio_stats(price, allocs=None, rfr=0.0, sf=252.0):
    """Compute summary statistics for a portfolio built from ``price``.

    Parameters
    ----------
    price : pandas.DataFrame
        One column per asset; rows are consecutive samples.
    allocs : list-like, optional
        Weight applied to each column of the normalized prices. Defaults to
        ``[0.1, 0.2, 0.3, 0.4]``.
    rfr : float
        Risk-free rate per sample, subtracted from the mean return.
    sf : float
        Sampling frequency (samples per year) used to annualize the Sharpe
        ratio.

    Returns
    -------
    tuple
        ``(cr, adr, sddr, sr)``: cumulative return, mean of the per-sample
        returns, their standard deviation, and the annualized Sharpe ratio.
        Each is a one-element pandas Series because the portfolio value is
        kept as a single-column DataFrame.
    """
    if allocs is None:
        # The old literal default read `[0.1,0.2,0,3,0.4]`: a comma typo that
        # produced five weights. A None sentinel also avoids a mutable default.
        allocs = [0.1, 0.2, 0.3, 0.4]
    norm_price = normalize_data(price)
    norm_positions_val = norm_price * allocs
    if len(norm_positions_val.columns) == 1:
        # a single asset already is the portfolio
        norm_portfolio_val = norm_positions_val
    else:
        norm_portfolio_val = norm_positions_val.sum(axis=1).to_frame('PORTFOLIO')
    cr = norm_portfolio_val.iloc[-1] / norm_portfolio_val.iloc[0] - 1
    daily_returns = compute_daily_returns(norm_portfolio_val)
    # the first row is a placeholder zero, so leave it out of the statistics
    daily_returns = daily_returns[1:]
    adr = daily_returns.mean()
    sddr = daily_returns.std()
    sr = np.sqrt(sf) * (adr - rfr) / sddr
    return cr, adr, sddr, sr
 def plot_against_SPY(df):
    """Plot ``df`` next to SPY, all series normalized to their first value.

    When ``df`` has no ``SPY`` column, SPY prices are fetched with
    ``get_data`` for the dates in ``df``'s index and added to a copy of the
    frame; ``df`` itself is not changed.

    Returns the SPY data that was plotted: the ``get_data`` result when it
    had to be fetched, otherwise the existing ``SPY`` column.
    """
    combined = df.copy()
    if 'SPY' in combined.columns:
        spy_prices = combined['SPY']
    else:
        index_dates = pd.to_datetime(combined.index.values)
        spy_prices = get_data(['SPY'], index_dates)
        # assigned by position, so the fetched rows must line up with df's rows
        combined['SPY'] = spy_prices.values
    plot_data(normalize_data(combined))
    return spy_prices
 def compute_portvals(orders_file = "./orders/orders.csv", start_val = 1000000,
                     commission=9.95, impact=0.005):
    """Simulate a list of orders and return the portfolio value over time.

    Parameters
    ----------
    orders_file : str, file-like or pandas.DataFrame
        Passed to ``get_orders``; must provide ``Symbol``, ``Order``
        (BUY/SELL, case-insensitive) and ``Shares`` columns, indexed by date.
    start_val : number
        Starting cash.
    commission : float
        Fixed fee deducted from cash for every executed order.
    impact : float
        Fraction of the traded value deducted from cash for every executed
        order.

    Returns
    -------
    pandas.DataFrame
        Indexed by the dates of the price data. The first column,
        ``Portfolio``, is the total value; the remaining columns are the
        value held in each symbol (in order of first appearance in the
        orders) and in ``Cash``.

    Notes
    -----
    A BUY is skipped when cash cannot cover price, commission and impact; a
    SELL is skipped when fewer shares are held than are being sold. Rows
    whose ``Order`` is neither BUY nor SELL change nothing.
    """
    # read in order data
    orders_df = get_orders(orders_file)
    # Symbols in order of first appearance. The earlier list(set(...)) made
    # the column order of the result vary from run to run.
    symbols = orders_df['Symbol'].unique().tolist()
    # calendar range spanned by the orders
    dates = pd.date_range(orders_df.index.values[0], orders_df.index.values[-1])
    # read in prices (get_data() already fills missing values)
    prices = get_data(symbols, dates)
    # explicit copy so that adding the Cash column never writes into a view
    prices = prices[symbols].copy()
    # cash is "priced" at 1.0 so that value = price * units works for it too
    prices['Cash'] = np.ones(prices.shape[0])
    # units held per day, same shape as prices, initialized to all zeros
    positions = prices*0.0
    # initialize cash position with starting value
    positions.loc[positions.index.values[0], ['Cash']] = start_val
    # walk the orders, adjusting share units and cash as they execute
    for index, row in orders_df.iterrows():
        stock_sym = row['Symbol']
        order_price = prices.loc[index, stock_sym]
        order_shrs = row['Shares']
        order_side = row['Order'].upper()
        trade_value = order_shrs*order_price
        # impact cost = traded value * impact
        impact_cost = trade_value*impact
        if order_side == 'BUY':
            if positions.loc[index, 'Cash'] < trade_value + commission + impact_cost:
                # not enough cash to execute the order: skip it
                pass
            else:
                # update positions on the transaction day
                positions.loc[index, stock_sym] += order_shrs
                positions.loc[index, "Cash"] -= trade_value
                positions.loc[index, "Cash"] -= commission
                positions.loc[index, "Cash"] -= impact_cost
        elif order_side == 'SELL':
            if positions.loc[index, stock_sym] < order_shrs:
                # not enough shares held to fill the order: skip it
                pass
            else:
                positions.loc[index, stock_sym] -= order_shrs
                positions.loc[index, "Cash"] += trade_value
                positions.loc[index, "Cash"] -= commission
                positions.loc[index, "Cash"] -= impact_cost
        # carry the holdings of this day forward to every later day
        start_row = positions.index.get_loc(index) + 1
        positions.iloc[start_row:, :] = positions.iloc[start_row-1].values
    # value of each holding per day, plus the total in the first column
    port_vals = prices*positions
    port_vals.insert(0, 'Portfolio', port_vals.sum(axis=1))
    return port_vals
 def test_code():
    """Simulate a sample orders file and print its statistics next to SPY's.

    Runs ``compute_portvals`` on ``./orders/orders-05.csv``, keeps the first
    column of the result, plots it against SPY and prints Sharpe ratio,
    cumulative return, standard deviation and average daily return for both,
    followed by the final portfolio value.
    """
    orders_path = "./orders/orders-05.csv"
    starting_cash = 1000000
    # Process orders
    portvals = compute_portvals(orders_file = orders_path, start_val = starting_cash)
    if not isinstance(portvals, pd.DataFrame):
        print("warning, code did not return a DataFrame")
    else:
        # just keep the first column
        first_column = portvals.columns[0]
        portvals = portvals[first_column].to_frame()
    # Date span covered by the simulation
    index_values = portvals.index.values
    first_day = pd.to_datetime(index_values[0])
    last_day = pd.to_datetime(index_values[-1])
    price_SPY = plot_against_SPY(portvals)
    # Portfolio stats, computed the same way for the portfolio and for SPY
    risk_free_rate = 0
    samples_per_year = 252
    cr, adr, sddr, sr = compute_portfolio_stats(
        portvals, [1.0], risk_free_rate, samples_per_year)
    crSP, adrSP, sddrSP, srSP = compute_portfolio_stats(
        price_SPY, [1.0], risk_free_rate, samples_per_year)
    # Report: portfolio value first, SPY value second
    print("\nDate Range: {} to {}".format(first_day.date(), last_day.date()))
    print()
    print("Sharpe Ratio: {}, {}".format(sr, srSP))
    print()
    print("Cumulative Return: {}, {}".format(cr, crSP))
    print()
    print("Standard Deviation: {}, {}".format(sddr, sddrSP))
    print()
    print("Average Daily Return: {}, {}".format(adr, adrSP))
    print()
    final_value = portvals['Portfolio'].iloc[-1]
    print("Final Portfolio Value: {:.2f}".format(final_value))
 if __name__ == "__main__":
    test_code()
--- a/util.py
+++ b/util.py
@ -0,0 +1,371 @@
 """
 Use Yahoo Finance data
 """
 import warnings
 # Suppress FutureWarnings
 warnings.simplefilter(action='ignore', category=FutureWarning)
 import datetime as dt
 import os
 import pandas as pd
 import numpy as np
 import yfinance as yf
 import requests
 from lxml import html
 from io import StringIO
 from time import sleep
 # Wiki page that get_watchlist() downloads and parses for the watchlist CSV.
 WEBSITE = 'https://www.isolo.org/dokuwiki/knowledge_base/investing/watchlist'
 # Batch size handed to yf_batch_download() by get_price_volume().
 BATCHSIZE = 20
 # Pause between batch downloads, in seconds (the time_gap argument of
 # yf_batch_download(), which passes it to sleep()).
 TIMEGAP = 0.2
 def fill_missing_data(df):
    """Return a copy of ``df`` with gaps forward-filled, then backward-filled.

    The backward fill only matters for gaps at the start, which the forward
    fill cannot reach. ``df`` itself is not modified.
    """
    return df.ffill().bfill()
 def symbol_to_path(symbol, base_dir=None):
    """Build the path of the CSV file that holds data for ``symbol``.

    The directory is ``base_dir`` when given; otherwise the
    ``MARKET_DATA_DIR`` environment variable, falling back to ``../data/``.
    """
    if base_dir is not None:
        directory = base_dir
    else:
        directory = os.environ.get("MARKET_DATA_DIR", '../data/')
    file_name = "{}.csv".format(str(symbol))
    return os.path.join(directory, file_name)
 def get_data_full(symbols, start_date, addSPY=True, colname = 'Adj Close'):
    """
    Read stock data (adjusted close by default) for the given symbols from
    Yahoo Finance, from start_date to the latest date available (no end date
    is passed to the download).

    Parameters
    ----------
    symbols : list
        Ticker symbols.
    start_date : str or datetime
        First date to download.
    addSPY : bool
        When true, SPY is prepended to the symbols if it is not already there.
    colname : str
        Which field of the download to return.

    Returns
    -------
    df : dataframe
        One column per symbol.
    """
    if addSPY and 'SPY' not in symbols:  # add SPY for reference, if absent
        symbols = ['SPY'] + symbols
    df = yf.download(symbols, start = start_date)[colname]
    # Depending on the yfinance version, a single-symbol download yields either
    # a Series (which needs naming and wrapping) or a one-column DataFrame
    # (which is already in the right shape and has no to_frame()).
    if len(symbols) == 1 and isinstance(df, pd.Series):
        df = df.to_frame(name=symbols[0])
    return df
 # def get_data_full(symbols, start_date, addSPY=True, colname = 'Adj Close'):
 #     """
 #     Read stock data (adjusted close) for given symbols from CSV files
 #     from start_date to the latest date available in the CSV files.
 #     """
 #     df_temp = pd.read_csv(symbol_to_path('SPY'), index_col='Date',
 #             parse_dates=True, usecols=['Date', colname], na_values=['nan'])
 #     df_temp = df_temp.rename(columns={colname: 'SPY'})
 #     end_date = df_temp.index.values[-1]
 #     dates = pd.date_range(start_date, end_date)
 #     df = pd.DataFrame(index=dates)
 #     df = df.join(df_temp)
 #     df = df.dropna()
 #     # if addSPY and 'SPY' not in symbols:  # add SPY for reference, if absent
 #     #     symbols = ['SPY'] + symbols
 #     for symbol in symbols:
 #         df_temp = pd.read_csv(symbol_to_path(symbol), index_col='Date',
 #                 parse_dates=True, usecols=['Date', colname], na_values=['nan'])
 #         df_temp = df_temp.rename(columns={colname: symbol})
 #         df = df.join(df_temp)
 #         # if symbol == 'SPY':  # drop dates SPY did not trade
 #         #     df = df.dropna(subset=["SPY"])
 #     if not addSPY:
 #         df = df[symbols]
 #     return df
 def get_data_range(df, dates):
    """
    Return the rows of df whose index values also appear in dates.

    An empty frame indexed by dates is inner-joined with df, so the result
    keeps df's columns and only the dates present on both sides.
    """
    date_frame = pd.DataFrame(index=dates)
    return date_frame.join(df, how='inner')
 def get_data(symbols, dates, addSPY=True, colname = 'Adj Close'):
    """
    Read stock data (adjusted close by default) for given symbols from Yahoo
    Finance, covering dates[0] through dates[-1] inclusive.

    SPY is always downloaded alongside the requested symbols: rows where SPY
    has no value are dropped, and the remaining gaps are forward- then
    backward-filled. SPY is removed again at the end when addSPY is false.

    Parameters
    ----------
    symbols : list
        Ticker symbols.
    dates : sequence of dates
        Only the first and last entries are used, as the download range.
    addSPY : bool
        Keep the SPY column in the result.
    colname : str
        Which field of the download to return.

    Returns
    -------
    df : dataframe
        One column per symbol.
    """
    org_sym = symbols
    sd = dates[0]
    # yfinance treats `end` as exclusive. Move it to the start of the following
    # day so that the last requested date is part of the result; callers such
    # as compute_portvals() look up prices on that date.
    ed = pd.Timestamp(dates[-1]).normalize() + pd.Timedelta(days=1)
    if 'SPY' not in symbols:
        symbols = ['SPY'] + symbols
    df = yf.download(symbols, start=sd, end=ed)[colname]
    # A single-symbol download may come back as a Series or as a one-column
    # DataFrame depending on the yfinance version; only a Series needs wrapping.
    if len(symbols) == 1 and isinstance(df, pd.Series):
        df = df.to_frame(name=symbols[0])
    df = df.dropna(subset=['SPY'])
    df = fill_missing_data(df)
    if not addSPY:
        # keep only what the caller originally asked for
        df = df[org_sym]
    return df
 def yf_batch_download(symbols, start, end, batch_size, time_gap):
    """
    Download in small batches to avoid connection closure by host.

    The symbols are split at multiples of batch_size. Every request except
    the final one covers exactly batch_size symbols and is followed by a
    pause of time_gap; the final request takes everything from the last full
    batch boundary to the end of the list, so it also carries any remainder.
    With fewer symbols than batch_size the start index of that final slice is
    negative, which slicing clamps, so the whole list goes out in one request.

    Parameters
    ----------
    symbols : list
        stock symbols.
    start : datetime
        start date.
    end : datetime
        stop date.
    batch_size : integer
        batch size.
    time_gap : float
        in seconds or fraction of seconds.

    Returns
    -------
    df : dataframe
        the downloads of all requests, concatenated column-wise.
    """
    total = len(symbols)
    full_batches = total // batch_size
    combined = pd.DataFrame()
    for batch_no in range(full_batches - 1):
        lo = batch_no * batch_size
        hi = lo + batch_size
        chunk = yf.download(symbols[lo:hi], start, end)
        combined = pd.concat([combined, chunk], axis=1)
        sleep(time_gap)
    # final request: last full batch plus whatever is left over
    tail_start = (full_batches - 1) * batch_size
    chunk = yf.download(symbols[tail_start:total], start, end)
    combined = pd.concat([combined, chunk], axis=1)
    return combined
 def get_price_volume(symbols, dates, addSPY=False):
    """
    Download adjusted close prices and volumes for the given symbols.

    As the code stands, data always comes from yf_batch_download(); the
    local-file cache (_stkdata.pickle) and refresh list (_refresh.csv) that an
    earlier version used are commented out below.

    SPY is always downloaded with the requested symbols. Rows where SPY has
    no adjusted close are dropped, remaining price gaps are forward- then
    backward-filled, and missing volumes become 0.

    Parameters
    ----------
    symbols : list
        stock symbols.
    dates : sequence of dates
        only dates[0] and dates[-1] are used, as start and end of the download.
    addSPY : bool
        when False, the results are restricted to the symbols as passed in.

    Returns
    -------
    price : dataframe
        adjusted close, one column per symbol.
    volume : dataframe
        volume, one column per symbol.
    """
    # DATAFILE = "_stkdata.pickle"
    # REFRESH = "_refresh.csv"
    org_sym = symbols  # remembered so SPY can be dropped again at the end
    sd = dates[0]
    ed = dates[-1]
    # if addSPY and 'SPY' not in symbols:  # add SPY for reference, if absent
    if 'SPY' not in symbols:
        symbols = ['SPY'] + symbols
    df = yf_batch_download(symbols, start=sd, end=ed, \
                            batch_size=BATCHSIZE, time_gap=TIMEGAP)
    # Because SPY is prepended whenever it is absent, a list can only have
    # length 1 here when the request was for SPY alone (or was empty).
    if len(symbols) == 1:
        # pair every downloaded column label with the symbol so the columns
        # become a two-level (field, symbol) index
        # NOTE(review): presumably this mirrors the layout of a multi-symbol
        # download - confirm against the yfinance version in use
        tuples = list(zip(df.columns.values.tolist(), \
                            [symbols[0]]*len(df.columns.values)))
        df.columns = pd.MultiIndex.from_tuples(tuples, names=[None, None])
    # if not os.path.exists(DATAFILE):
    #     df = yf_batch_download(symbols, start=sd, end=ed, \
    #                            batch_size=BATCHSIZE, time_gap=TIMEGAP)
    #     if len(symbols) == 1:
    #         tuples = list(zip(df.columns.values.tolist(), \
    #                           [symbols[0]]*len(df.columns.values)))
    #         df.columns = pd.MultiIndex.from_tuples(tuples, names=[None, None])
    # else:
    #     df = pd.read_pickle(DATAFILE)
    #     exist_syms = df["Adj Close"].columns.values.tolist()
    #     if os.path.exists(REFRESH):
    #         try:
    #             refresh_df = pd.read_csv(REFRESH, header=None)
    #             refresh_syms = refresh_df.values.tolist()
    #             refresh_syms = [x for sublist in refresh_syms for x in sublist]
    #             remove_syms = [x for x in exist_syms if x in refresh_syms]
    #             if remove_syms:
    #                 df.drop(columns=remove_syms, axis=1, level=1, inplace=True)
    #                 exist_syms = [x for x in exist_syms if x not in refresh_syms]
    #         except:
    #             pass
        # NOTE(review): everything from here down to the df.join() below is
        # indented under the `if len(symbols) == 1:` branch above, so it runs
        # only in the single-symbol case. It looks like the tail of the
        # commented-out cache `else:` branch - confirm whether it should be
        # live code at all.
        exist_syms = []  # no cached symbols, so intersect_syms below is empty
        last_day = pd.to_datetime(df.index.values[-1])
        first_day = pd.to_datetime(df.index.values[0])
        intersect_syms = list(set(org_sym) & set(exist_syms))
        # reduce df to only contain intersect_syms
        df = df.loc[:, (slice(None), intersect_syms)]
        if sd < first_day:
            # fill gap from online
            tmp_df = yf_batch_download(intersect_syms, start=sd, end=first_day, \
                               batch_size=BATCHSIZE, time_gap=TIMEGAP)
            df = pd.concat([tmp_df, df])
        if ed >= last_day:
            # fill gap from online incl last two days to get mkt close data
            if ed.date() == last_day.date():
                tmp_df = yf_batch_download(intersect_syms, start=ed, end=ed, \
                               batch_size=BATCHSIZE, time_gap=TIMEGAP)
            else:
                tmp_df = yf_batch_download(intersect_syms, start=last_day, end=ed, \
                               batch_size=BATCHSIZE, time_gap=TIMEGAP)
            # the last existing row is replaced by the freshly downloaded rows
            df = pd.concat([df[:-1], tmp_df])
        # get data online when new stks were added
        # (with exist_syms empty, every symbol counts as new here)
        new_stks = np.setdiff1d(symbols, exist_syms).tolist()
        if not new_stks == []:
            tmp_df = yf_batch_download(new_stks, start=sd, end=ed, \
                               batch_size=BATCHSIZE, time_gap=TIMEGAP)
            if len(new_stks) == 1:
                # same (field, symbol) relabelling as above
                tuples = list(zip(tmp_df.columns.values.tolist(), \
                                  [new_stks[0]]*len(tmp_df.columns.values)))
                tmp_df.columns = pd.MultiIndex.from_tuples(tuples, names=[None, None])
            df = df.join(tmp_df)
    # df.to_pickle(DATAFILE) # save to local, overwrite existing file
    # if os.path.exists(REFRESH):
    #     with open(REFRESH, 'w'):
    #         pass
    # keep only rows where SPY has an adjusted close
    df = df.dropna(subset=[('Adj Close', 'SPY')])
    price = df['Adj Close']
    price = fill_missing_data(price)
    volume = df['Volume']
    # a missing volume is treated as no trading
    volume = volume.fillna(0)
    # if len(symbols) == 1:
    #     price.name = symbols[0]
    #     volume.name = symbols[0]
    #     price = price.to_frame()
    #     volume = volume.to_frame()
    if addSPY==False:
        # restrict to the symbols exactly as the caller passed them
        price = price[org_sym]
        volume = volume[org_sym]
    return price, volume
 def get_price_volume_online(symbols, dates, addSPY=False):
    """
    Download adjusted close prices and volumes for the given symbols from
    Yahoo Finance in a single request.

    SPY is always part of the download. Rows without an SPY adjusted close
    are dropped, price gaps are forward- then backward-filled, and missing
    volumes become 0. dates[0] and dates[-1] are passed as start and end.
    When addSPY is False the results are restricted to the symbols as
    passed in.

    Returns a (price, volume) pair of dataframes.
    """
    requested = symbols
    sd, ed = dates[0], dates[-1]
    # if addSPY and 'SPY' not in symbols:  # add SPY for reference, if absent
    if 'SPY' not in symbols:
        symbols = ['SPY'] + symbols
    only_one = len(symbols) == 1
    raw = yf.download(symbols, start=sd, end = ed)
    # with a single symbol the column key is the plain field name; otherwise
    # it is a (field, symbol) pair
    if only_one:
        spy_close = 'Adj Close'
    else:
        spy_close = ('Adj Close', 'SPY')
    raw = raw.dropna(subset=[spy_close])
    price = fill_missing_data(raw['Adj Close'])
    volume = raw['Volume'].fillna(0)
    if only_one:
        # name the single series after its symbol and wrap it in a frame
        price.name = symbols[0]
        volume.name = symbols[0]
        price = price.to_frame()
        volume = volume.to_frame()
    if addSPY==False:
        price, volume = price[requested], volume[requested]
    return price, volume
 def get_watchlist(website: str = WEBSITE):
    """Download the watchlist page and parse the CSV text embedded in it.

    Parameters
    ----------
    website : str
        URL of the page to fetch. Defaults to the module-level WEBSITE.

    Returns
    -------
    df : dataframe
        The watchlist indexed by its ``Symbol`` column. Lines starting with
        ``#`` are treated as comments and empty fields are kept as empty
        strings rather than NaN.

    Raises
    ------
    IndexError
        If the page does not contain the expected ``<pre>`` block.
    """
    # Use the argument, not the module constant, so callers can point this at
    # another page. The timeout keeps a dead host from hanging the call.
    page = requests.get(website, timeout=30)
    # page = requests.get(website, verify=False) # skip certificate check for https
    tree = html.fromstring(page.content)
    # the watchlist is the text of a <pre> block inside the wiki content area
    watchlist = tree.xpath('//*[@id="dokuwiki__content"]/div[1]/div/div[3]/div/pre/text()')[0]
    file_name = StringIO(watchlist)
    df = pd.read_csv(file_name, index_col = 'Symbol',
                      comment = '#', na_filter=False)
    return df
 # def get_watchlist(file_name: str = 'watchlist.csv'):
 #     df = pd.read_csv(file_name, index_col = 'Symbol',
 #                      comment = '#', na_filter=False)
 #     return df
 # def get_data(symbols, dates, addSPY=True, colname = 'Adj Close'):
 #     """
 #     Read stock data (adjusted close) for given symbols from CSV files.
 #     (done) TODO: there are nan values in the data when addSPY=False is passed. The
 #     strategy should be using SPY to clean the data first including fill
 #     forward and fill backward, then to drop the SPY if addSPY=False
 #     """
 #     org_sym = symbols
 #     df = pd.DataFrame(index=dates)
 #     # if addSPY and 'SPY' not in symbols:  # add SPY for reference, if absent
 #     #     symbols = ['SPY'] + symbols
 #     if 'SPY' not in symbols:
 #         symbols = ['SPY'] + symbols
 #     for symbol in symbols:
 #         df_temp = pd.read_csv(symbol_to_path(symbol), index_col='Date',
 #                 parse_dates=True, usecols=['Date', colname], na_values=['nan'])
 #         df_temp = df_temp.rename(columns={colname: symbol})
 #         df = df.join(df_temp)
 #         if symbol == 'SPY':  # drop dates SPY did not trade
 #             df = df.dropna(subset=["SPY"])
 #     # fill missing data
 #     df = fill_missing_data(df)
 #     if addSPY == False: # drop SPY
 #         # df = df.drop(columns=['SPY'])
 #         df = df[org_sym]
 #     return df
 def plot_data(df, axs=None, title=None, xlabel='', ylabel=''):
    """Plot stock prices with a custom title and meaningful axis labels.

    Parameters
    ----------
    df : dataframe
        Data to plot; its ``plot`` method does the drawing.
    axs : axes, optional
        Existing axes to draw on. When omitted (or an empty list, which was
        the former default), ``df.plot`` creates its own.
    title : str, optional
        Passed through to ``df.plot``.
    xlabel, ylabel : str
        Axis labels.
    """
    plot_kwargs = {'title': title}
    # None replaces the old mutable `[]` default; an explicitly passed empty
    # list is still accepted and means "no axes given". Checking the type
    # first avoids comparing an arbitrary axes object (or array) with a list.
    no_axes_given = axs is None or (isinstance(axs, (list, tuple)) and len(axs) == 0)
    if not no_axes_given:
        plot_kwargs['ax'] = axs
    ax = df.plot(**plot_kwargs)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.grid()
 # def plot_data(df, title=[], xlabel='', ylabel=''):
 #     import matplotlib.pyplot as plt
 #     """Plot stock prices with a custom title and meaningful axis labels."""
 #     ax = df.plot(title=title, fontsize=12, figsize=(10, 7))
 #     ax.set_xlabel(xlabel)
 #     ax.set_ylabel(ylabel)
 #     plt.grid()
 #     plt.show()
 def get_orders_data_file(basefilename):
    """Open an orders file for reading and return the file object.

    The directory comes from the ORDERS_DATA_DIR environment variable,
    defaulting to 'orders/'. The caller is responsible for closing the file.
    """
    orders_dir = os.environ.get("ORDERS_DATA_DIR", 'orders/')
    full_path = os.path.join(orders_dir, basefilename)
    return open(full_path)
 def get_learner_data_file(basefilename):
    """Open a learner data file for reading and return the file object.

    The directory comes from the LEARNER_DATA_DIR environment variable,
    defaulting to 'Data/'. The caller is responsible for closing the file.
    """
    learner_dir = os.environ.get("LEARNER_DATA_DIR", 'Data/')
    full_path = os.path.join(learner_dir, basefilename)
    return open(full_path, 'r')
 def get_robot_world_file(basefilename):
    """Open a robot world file for reading and return the file object.

    The directory comes from the ROBOT_WORLDS_DIR environment variable,
    defaulting to 'testworlds/'. The caller is responsible for closing the
    file.
    """
    worlds_dir = os.environ.get("ROBOT_WORLDS_DIR", 'testworlds/')
    full_path = os.path.join(worlds_dir, basefilename)
    return open(full_path)
 def test_code():
    """Smoke test: fetch recent prices and volumes for two symbols.

    The window ends today and reaches back 0.08 years plus one day.
    """
    tickers = ['GOOG', 'AMZN']
    # lookback years
    lookback_years = 0.08
    window_end = dt.datetime.today()
    window_start = window_end - dt.timedelta(days = 365 * lookback_years + 1)
    # If the window starts or ends on a non-trading day, you might get
    # warnings saying "No data found for this date range, symbol may be
    # delisted". This is normal behavior.
    window = pd.date_range(window_start, window_end)
    prices, volume = get_price_volume(tickers, window, addSPY=False)
 if __name__ == '__main__':
    test_code()