12 changed files with 1587 additions and 1940 deletions
--- a/.dockerignore
+++ b/.dockerignore
@ -1,3 +0,0 @@
-.*
-__pycache__
-!.env
--- a/.gitignore
+++ b/.gitignore
@ -1,4 +0,0 @@
-.*
-__pycache__
-!.gitignore
-!.dockerignore
--- a/8
+++ b/8
@ -1,8 +0,0 @@
-FROM python:3.12-slim
-RUN apt-get update && apt-get install -y libpq-dev gcc
-WORKDIR /app
-COPY . /app
-RUN pip install -U pip && pip install -r requirements.txt
-ADD stock_info.py /usr/local/lib/python3.12/site-packages/yahoo_fin/
-EXPOSE 8050
-CMD ["python", "./indicators.py"]
--- a/README.md
+++ b/README.md
--- a/indicators.py
+++ b/indicators.py
--- a/marketsim.py
+++ b/marketsim.py
@ -0,0 +1,163 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Feb  5 21:56:42 2020
+
+@author: cpan
+"""
+
+"""MC2-P1: Market simulator."""
+
+import pandas as pd
+import numpy as np
+# import datetime as dt
+# import os
+from util import get_data, plot_data
+
+def normalize_data(df):
+    """Rescale each column of ``df`` so that its first row equals 1.
+
+    Every value is divided by the first-row value of the same column.
+    """
+    first_row = df.iloc[0]
+    return df.div(first_row, axis="columns")
+
+def fill_missing_values(df_data):
+    """Fill missing values in ``df_data`` in place.
+
+    Gaps are filled forward first (carrying the last known value onward) and
+    then backward, so that leading gaps take the first known value.
+
+    Uses ``ffill``/``bfill`` directly: ``fillna(method=...)`` is deprecated
+    in the pandas 2.x series pinned by this project and warns on every call.
+
+    Returns None; the caller's DataFrame is modified.
+    """
+    df_data.ffill(inplace=True)
+    df_data.bfill(inplace=True)
+
+def get_orders(orders_file):
+    """Return the orders as a DataFrame.
+
+    A DataFrame argument is handed back unchanged. Anything else is given to
+    ``pd.read_csv`` with the 'Date' column as a parsed index; rows containing
+    missing values are dropped and the result is sorted by its index.
+    """
+    if isinstance(orders_file, pd.DataFrame):
+        return orders_file
+
+    loaded = pd.read_csv(orders_file, index_col='Date', parse_dates=True,
+                         na_values=['nan'])
+    return loaded.dropna().sort_index()
+
+def compute_daily_returns(df):
+    """Return row-over-row fractional changes of ``df``.
+
+    Each row from the second onward holds ``current / previous - 1``; the
+    first row, which has no predecessor, is set to 0. ``df`` is not modified.
+    """
+    # Divide by the raw array of the preceding rows so the division is
+    # positional rather than aligned on index labels.
+    change = df[1:] / df[:-1].values - 1
+    result = df.copy()
+    result[1:] = change
+    result.iloc[0, :] = 0
+    return result
+
+def compute_portfolio_stats(price, allocs=(0.1, 0.2, 0.3, 0.4), rfr=0.0, sf=252.0):
+    """Compute summary statistics for a portfolio built from ``price``.
+
+    Parameters
+    ----------
+    price : pandas.DataFrame
+        One column per asset; rows are consecutive samples.
+    allocs : sequence of float
+        Weight applied to each normalized price column, one per column.
+        The default was previously written ``[0.1,0.2,0,3,0.4]`` (a comma
+        typo giving five weights summing to 3.7); it is now the intended
+        four weights summing to 1, held in an immutable tuple.
+    rfr : float
+        Risk-free rate per sample, subtracted from the mean return.
+    sf : float
+        Samples per year used to annualize the Sharpe ratio.
+
+    Returns
+    -------
+    cr, adr, sddr, sr
+        Cumulative return, mean of the per-sample returns, their standard
+        deviation, and the annualized Sharpe ratio. Each is produced by a
+        column-wise reduction of a DataFrame, so each comes back as a
+        pandas Series rather than a bare float.
+    """
+    norm_price = normalize_data(price)
+    norm_positions_val = norm_price * allocs
+    if len(norm_positions_val.columns) == 1:
+        # A single asset already is the portfolio; keep its column name.
+        norm_portfolio_val = norm_positions_val
+    else:
+        norm_portfolio_val = norm_positions_val.sum(axis=1).to_frame('PORTFOLIO')
+    cr = norm_portfolio_val.iloc[-1] / norm_portfolio_val.iloc[0] - 1
+    daily_returns = compute_daily_returns(norm_portfolio_val)
+    daily_returns = daily_returns[1:]  # drop the first row (forced to zero)
+    adr = daily_returns.mean()
+    sddr = daily_returns.std()
+    sr = np.sqrt(sf) * (adr - rfr) / sddr
+    return cr, adr, sddr, sr
+
+def plot_against_SPY(df):
+    """Plot the normalized columns of ``df`` together with SPY.
+
+    Works on a copy of ``df``. If the copy has no 'SPY' column, SPY data is
+    fetched with ``get_data`` for the dates in the index and added as a
+    column. The copy is then normalized and passed to ``plot_data``.
+
+    Returns the SPY data used: the existing 'SPY' column when present,
+    otherwise whatever ``get_data`` returned.
+    """
+    combined = df.copy()
+    if 'SPY' in combined.columns:
+        spy = combined['SPY']
+    else:
+        spy = get_data(['SPY'], pd.to_datetime(combined.index.values))
+        combined['SPY'] = spy.values
+    plot_data(normalize_data(combined))
+    return spy
+
+def compute_portvals(orders_file = "./orders/orders.csv", start_val = 1000000,
+                     commission=9.95, impact=0.005):
+    """Simulate a list of orders and return the value of every holding over time.
+
+    Parameters
+    ----------
+    orders_file : path or pandas.DataFrame
+        Passed to ``get_orders``. The resulting frame must be indexed by
+        date and provide 'Symbol', 'Order' and 'Shares' columns.
+    start_val : number
+        Cash placed in the portfolio on the first row of the price table.
+    commission : float
+        Fixed amount deducted from cash for every executed order.
+    impact : float
+        Fraction of the traded amount (shares * price) deducted from cash
+        for every executed order, for buys and sells alike.
+
+    Returns
+    -------
+    pandas.DataFrame
+        One row per row of the price table. The first column, 'Portfolio',
+        is the row-wise total; it is followed by one value column per traded
+        symbol and a 'Cash' column.
+
+    Notes
+    -----
+    A BUY is silently skipped when cash on that date is below cost plus
+    commission plus impact. A SELL is silently skipped when fewer shares
+    are held than the order asks for, so positions never go short.
+    """
+    # Load the orders (file path or ready-made DataFrame).
+    orders_df = get_orders(orders_file)
+    # Distinct symbols that appear in the orders (set order is arbitrary).
+    symbols = list(set(orders_df['Symbol'].values))
+    # Calendar range from the first to the last order row.
+    dates = pd.date_range(orders_df.index.values[0], orders_df.index.values[-1])
+    # Fetch prices for those symbols and dates.
+    prices = get_data(symbols, dates)
+    # fill_missing_values(prices) # included in get_data() function
+    prices = prices[symbols]
+    # Cash is modelled as an asset priced at 1.0 so that prices * positions
+    # also yields the cash value.
+    prices['Cash'] = np.ones(prices.shape[0])
+
+    # Holdings table with the same shape as prices, starting at zero.
+    positions=prices*0.0
+    # Put the starting cash on the first row only; later rows are filled by
+    # the propagation step at the end of each loop iteration.
+    positions.loc[positions.index.values[0],['Cash']]=start_val
+
+    # Apply the orders one at a time, updating share counts and cash.
+    for index, row in orders_df.iterrows():
+        stock_sym = row['Symbol']
+        # NOTE(review): this lookup needs the order date to be present in the
+        # price index - confirm that get_data returns a row for every order date.
+        order_price = prices.loc[index, stock_sym]
+        order_shrs = row['Shares']
+
+        if row['Order'].upper() == 'BUY':
+            if positions.loc[index, 'Cash'] < order_shrs*order_price +\
+                                commission + order_shrs*order_price*impact:
+                # Not enough cash to execute the order: skip it silently.
+                # print('Not enough cash to excute the order:\n', row)
+                pass
+            else:
+                # Add the shares and pay for them on the transaction day.
+                positions.loc[index, stock_sym] += order_shrs
+                positions.loc[index, "Cash"] -= order_shrs*order_price
+                # Deduct the fixed commission.
+                positions.loc[index,"Cash"] -= commission
+                # Deduct market impact: shares * price per share * impact.
+                positions.loc[index,"Cash"] -= order_shrs*order_price*impact
+        elif row['Order'].upper() == 'SELL':
+            if positions.loc[index, stock_sym] < order_shrs:
+                # Not enough shares held to fill the order: skip it silently.
+                # print('Not enough shares to sell to fill the order:\n', row)
+                pass
+            else:
+                positions.loc[index, stock_sym] -= order_shrs
+                positions.loc[index, "Cash"] += order_shrs*order_price
+                # Deduct the fixed commission.
+                positions.loc[index,"Cash"] -= commission
+                # Deduct market impact: shares * price per share * impact.
+                positions.loc[index,"Cash"] -= order_shrs*order_price*impact
+
+        # Copy this day's holdings onto every later row so that the next
+        # order (and the final valuation) sees the updated state.
+        start_row = positions.index.get_loc(index) + 1
+        positions.iloc[start_row:, :] = positions.iloc[start_row-1].values
+
+    # Value of each holding per day, with the row total inserted first.
+    port_vals=prices*positions
+    port_vals.insert(0, 'Portfolio', port_vals.sum(axis=1))
+
+    return port_vals
+
+def test_code():
+    """Simulate a sample orders file and print its statistics next to SPY's."""
+    orders_path = "./orders/orders-05.csv"
+    starting_cash = 1000000
+
+    # Run the simulator.
+    portvals = compute_portvals(orders_file=orders_path, start_val=starting_cash)
+    if not isinstance(portvals, pd.DataFrame):
+        print("warning, code did not return a DataFrame")
+    else:
+        # Keep only the first column, still as a DataFrame.
+        first_col = portvals.columns[0]
+        portvals = portvals[first_col].to_frame()
+
+    # Date span covered by the simulation, and the SPY series for comparison.
+    first_day = pd.to_datetime(portvals.index.values[0])
+    last_day = pd.to_datetime(portvals.index.values[-1])
+    price_SPY = plot_against_SPY(portvals)
+
+    # Same statistics for the portfolio and for SPY.
+    risk_free, samples_per_year = 0, 252
+    cr, adr, sddr, sr = compute_portfolio_stats(
+        portvals, [1.0], risk_free, samples_per_year)
+    crSP, adrSP, sddrSP, srSP = compute_portfolio_stats(
+        price_SPY, [1.0], risk_free, samples_per_year)
+
+    print("\nDate Range: {} to {}".format(first_day.date(), last_day.date()))
+    comparisons = (
+        ("Sharpe Ratio: {}, {}", sr, srSP),
+        ("Cumulative Return: {}, {}", cr, crSP),
+        ("Standard Deviation: {}, {}", sddr, sddrSP),
+        ("Average Daily Return: {}, {}", adr, adrSP),
+    )
+    for template, ours, benchmark in comparisons:
+        print()
+        print(template.format(ours, benchmark))
+    print()
+    print("Final Portfolio Value: {:.2f}".format(portvals['Portfolio'].iloc[-1]))
+
+if __name__ == "__main__":
+    test_code()
--- a/requirements.txt
+++ b/requirements.txt
@ -1,63 +1,54 @@
 appdirs==1.4.4
-Authlib==1.3.2
 beautifulsoup4==4.12.3
 blinker==1.8.2
 bs4==0.0.2
 cachelib==0.9.0
-certifi==2024.8.30
-cffi==1.17.1
+certifi==2024.6.2
 charset-normalizer==3.3.2
 click==8.1.7
 colorama==0.4.6
-cryptography==43.0.1
 cssselect==1.2.0
-dash==2.18.1
+dash==2.17.1
 dash-core-components==2.0.0
 dash-html-components==2.0.0
 dash-table==5.0.0
-dash_auth==2.3.0
 fake-useragent==1.5.1
 feedparser==6.0.11
 Flask==3.0.3
 Flask-Caching==2.3.0
-idna==3.10
-importlib_metadata==8.5.0
+idna==3.7
+importlib_metadata==8.0.0
 itsdangerous==2.2.0
 Jinja2==3.1.4
-lxml==5.3.0
-lxml_html_clean==0.2.2
+lxml==5.2.2
 MarkupSafe==2.1.5
 nest-asyncio==1.6.0
-numpy==2.1.1
+numpy==2.0.0
 packaging==24.1
-pandas==2.2.3
+pandas==2.2.2
 parse==1.20.2
-plotly==5.24.1
-psycopg2==2.9.9
-pycparser==2.22
-pyee==11.1.1
+plotly==5.22.0
+pyee==11.1.0
 pyppeteer==2.0.0
-pyquery==2.0.1
+pyquery==2.0.0
 python-dateutil==2.9.0.post0
-python-dotenv==1.0.1
-pytz==2024.2
+pytz==2024.1
 requests==2.32.3
 requests-html==0.10.0
 retrying==1.3.4
-scipy==1.14.1
-sec-cik-mapper==2.1.0
-setuptools==75.1.0
+scipy==1.14.0
+setuptools==70.1.1
 sgmllib3k==1.0.0
 six==1.16.0
-soupsieve==2.6
-tenacity==9.0.0
-tqdm==4.66.5
+soupsieve==2.5
+tenacity==8.4.2
+tqdm==4.66.4
 typing_extensions==4.12.2
-tzdata==2024.2
-urllib3==1.26.20
+tzdata==2024.1
+urllib3==1.26.19
 w3lib==2.2.1
 waitress==3.0.0
 websockets==10.4
-Werkzeug==3.0.4
+Werkzeug==3.0.3
 yahoo-fin==0.8.9.1
-zipp==3.20.2
+zipp==3.19.2
--- a/stock_info.py
+++ b/stock_info.py
--- a/subroutines/init.py
+++ b/subroutines/init.py
@ -1,6 +0,0 @@
-# Define the __all__ variable
-__all__ = ["Security", "remove_from_db", "insert_into_db", "get_watchlist", "hash_password"]
-
-# Import the submodules
-from .security import Security, get_crossing, get_sma_slope
-from .dbutil import remove_from_db, insert_into_db, get_watchlist, hash_password
--- a/subroutines/dbutil.py
+++ b/subroutines/dbutil.py
@ -1,125 +0,0 @@
-import hashlib
-import psycopg2
-import sys
-import os
-import pandas as pd
-
-def connect_db():
-    """Connect to database
-
-    Returns:
-        psycopg2 connector: psycopg2 postgresql connector
-    """
-    conn = None
-    try:
-        conn = psycopg2.connect(
-                    host=os.environ['DB_PATH'],
-                    database=os.environ['DB_NAME'],
-                    user=os.environ['DB_USERNAME'],
-                    password=os.environ['DB_PASSWORD'],
-                )
-    except (Exception, psycopg2.DatabaseError) as error:
-        print(error)
-        sys.exit(1)
-    return conn
-
-def get_watchlist(username : str):
-    """Read list of tickers/descriptions from database
-
-    Args:
-        username (str): database table prefix
-
-    Returns:
-        Pandas DataFrame: it has two columns - first column is ticker, second column is description
-    """
-    if username:
-        table_name = f"{username + '_watch_list'}"
-    else: # username is None, use default table
-        table_name = "stock_watch_list"
-
-    QUERY1 = f'''CREATE TABLE IF NOT EXISTS {table_name} 
-        (
-            tick character varying(5) NOT NULL,
-            description text,
-            PRIMARY KEY (tick)
-        );'''
-    QUERY2 = f"INSERT INTO {table_name} SELECT 'SPY', 'SPDR S&P 500 ETF Trust' WHERE NOT EXISTS (SELECT NULL FROM {table_name});"
-
-    QUERY3 = f"SELECT * FROM {table_name};"
-
-    with connect_db() as conn:
-        with conn.cursor() as curs:
-            curs.execute(QUERY1)
-            curs.execute(QUERY2)
-            curs.execute(QUERY3)
-            tuples_list = curs.fetchall()
-    
-    df = pd.DataFrame(tuples_list)
-    return df
-
-def remove_from_db(username, tick):
-    """Remove a row from database table using ticker as key
-
-    Args:
-        username (str): database table prefix
-        tick (str): ticker
-    """
-    if username:
-        table_name = f"{username + '_watch_list'}"
-    else: # username is None, use default table
-        table_name = "stock_watch_list"
-    
-    QUERY = f"DELETE FROM {table_name} WHERE tick = '{tick}';"
-
-    with connect_db() as conn:
-        with conn.cursor() as curs:
-            curs.execute(QUERY)
-
-def insert_into_db(username : str, tick : str, name : str):
-    """Insert ticker and description into database
-
-    Args:
-        username (str): database table prefix - each user has its own list of tickers
-        tick (str): stock or mutual fund ticker
-        name (str): company name for stock, series ID for mutual fund
-    """
-    if username:
-        table_name = f"{username + '_watch_list'}"
-    else: # username is None, use default table
-        table_name = "stock_watch_list"
-
-    QUERY1 = f'''CREATE TABLE IF NOT EXISTS {table_name} 
-        (
-            tick character varying(5) NOT NULL,
-            description text,
-            PRIMARY KEY (tick)
-        );'''
-    QUERY2 = f"INSERT INTO {table_name} SELECT 'SPY', 'SPDR S&P 500 ETF Trust' WHERE NOT EXISTS (SELECT NULL FROM {table_name});"
-
-    QUERY3 = f"INSERT INTO {table_name} VALUES ('{tick}', '{name}') ON CONFLICT DO NOTHING;"
-
-    with connect_db() as conn:
-        with conn.cursor() as curs:
-            curs.execute(QUERY1)
-            curs.execute(QUERY2)
-            curs.execute(QUERY3)
-
-def hash_password(password : str):
-    """Generate hash from string using sha256
-
-    Args:
-        password (str): any text
-
-    Returns:
-        str: hash string
-    """
-    # Encode the password as bytes
-    password_bytes = password.encode('utf-8')
-    
-    # Use SHA-256 hash function to create a hash object
-    hash_object = hashlib.sha256(password_bytes)
-    
-    # Get the hexadecimal representation of the hash
-    password_hash = hash_object.hexdigest()
-    
-    return password_hash
--- a/subroutines/security.py
+++ b/subroutines/security.py
@ -1,341 +0,0 @@
-import numpy as np
-from numpy.fft import fft, ifft
-import scipy.signal as sig
-
-class Security:
-    """
-    This can be a list of stocks, bonds, or otherinvestment vehicles.
-    price - Pandas DataFrame with datetime as index sorted to chronical order
-    """
-    def __init__(self, sym, price, volume=None, rfr: float = 0.01, sf: float = 252.0):
-        """
-        Parameters
-        ----------
-        price : TYPE pandas.DataFrame
-            DESCRIPTION. historical adj. daily close prices of stocks under
-            consideration
-        volume : TYPE pandas.DataFrame
-            DESCRIPTION. daily trading volume. The default is none.
-        rfr : TYPE float, optional
-            DESCRIPTION. annualized risk free rate. The default is 0.01.
-        sf : TYPE sample frequency, optional
-            DESCRIPTION. The default is 252 (daily). there are 252 trading
-            days in a year. Monthly sampling frequency would be 12. And
-            weekly sampling frequenc is 52.
-        """
-        self._symbol = sym
-        self._price = price
-        self._volume = volume
-        # self._symbol = price.columns.values
-        self._rfr = rfr
-        self._sf = sf
-
-    @property
-    def symbol(self):
-        return self._symbol
-    @symbol.setter
-    def symbol(self, value):
-        raise AttributeError('security symbol is read only')
-
-    @property
-    def price(self):
-        return self._price
-
-    @price.setter
-    def price(self, value):
-        raise AttributeError('security price is read only')
-
-    @property
-    def volume(self):
-        if self._volume is None:
-            raise ValueError('trading volume information not available')
-        return self._volume
-
-    @volume.setter
-    def volume(self, value):
-        raise AttributeError('security volume is read only')
-
-    def sma(self, window):
-        return self.price.rolling(window).mean()
-
-    def vwma(self, window):
-        """
-        Volume weighted moving average. When plotted against sma, it gives an
-        early indicator when VWMA crosses SMA. When VWMA is above SMA, it
-        indicates a strong upward trend and vice versa.
-        """
-        price_vol = self.price * self.volume
-        return price_vol.rolling(window).sum() / self.volume.rolling(window).sum()
-
-    def vosma(self, window):
-        return self.volume.rolling(window).mean()
-
-    def ema(self, window): # default to 14 day window
-        # EMA pre-process the first point
-        price = self.price
-        temp = price.iloc[0:window].mean()
-        price.iloc[window-1] = temp
-        price.iloc[0:(window-1)] = np.nan
-
-        # process the EMA
-        avg = price.ewm(span=window, adjust=False).mean()
-        return avg
-
-    def voema(self, window): # default to 14 day window
-        # EMA pre-process the first point
-        vol = self.volume
-        temp = vol.iloc[0:window].mean()
-        vol.iloc[window-1] = temp
-        vol.iloc[0:(window-1)] = np.nan
-
-        # process the EMA
-        avg = vol.ewm(span=window, adjust=False).mean()
-        return avg
-
-    def rsi(self, window = 14):
-        """
-        Traditional interpretation and usage of the RSI are that values of 70
-        or above indicate that a security is becoming overbought or overvalued
-        and may be primed for a trend reversal or corrective pullback in price.
-        An RSI reading of 30 or below indicates an oversold or undervalued
-        condition.
-        """
-        # use exponential averaging
-        d_chg = self.price.diff()
-        d_up, d_dn = d_chg.copy(), d_chg.copy()
-        d_up[d_up < 0] = 0
-        d_dn[d_dn > 0] = 0
-
-        # EMA pre-process the first point
-        temp = d_up.iloc[1:(window+1)].mean()
-        d_up.iloc[window] = temp
-        d_up.iloc[1:window] = np.nan
-        temp = d_dn.iloc[1:(window+1)].mean()
-        d_dn.iloc[window] = temp
-        d_dn.iloc[1:window] = np.nan
-
-        # process the EMA
-        avg_up = d_up.ewm(span=window, adjust=False).mean()
-        avg_dn = d_dn.ewm(span=window, adjust=False).mean()
-        rs = avg_up / abs(avg_dn.values)
-        exp_rsi = 100 - 100 / (1+rs)
-        return exp_rsi
-
-
-    def volume_rsi(self, window = 14):
-        """
-        The volume RSI (Relative Strength Index) is quite similar to the price
-        based RSI with difference that up-volume and down-volume are used in
-        the RSI formula instead changes in price. If price RSI shows relation
-        between up-moves and down-moves within an analyzed period of time by
-        revealing which moves are stronger, the volume RSI indicator shows the
-        relation between volume traded during these price up-moves and
-        down-moves respectfully by revealing whether up-volume (bullish money
-        flow) or down-volume (bearish money flow) is stronger.
-
-        The same as price RSI, volume RSI oscillates around 50% center-line in
-        the range from 0 to 100%. In technical analysis this indicator could be
-        used in the same way as well. The simplest way of using the volume RSI
-        would be to generate trading signals on the crossovers of the indicator
-        and 50% center-line around which it oscillates. Here you have to
-        remember following:
-
-        volume RSI reading above 50% are considered bullish as bullish volume
-        dominates over bearish volume; volume RSI readings below 50% are
-        considered bearish as bearish volume overcomes bullish volume.
-        Respectfully, technical analysis would suggest to generate buy/sell
-        signals by following rules:
-
-        Buy when indicators moves above 50% line after being below it;
-        Sell when indicator drops below 50% line after being above it.
-        """
-        # use exponential averaging
-        volume = self.volume
-
-        up_vol, dn_vol = volume.copy(), volume.copy()
-        d_chg = self.price.diff()
-
-        up_vol[d_chg < 0] = 0
-        dn_vol[d_chg > 0] = 0
-        up_vol.iloc[0] = np.nan
-        dn_vol.iloc[0] = np.nan
-
-        # EMA pre-process the first point
-        temp = up_vol.iloc[1:(window+1)].mean()
-        up_vol.iloc[window] = temp
-        up_vol.iloc[1:window] = np.nan
-        temp = dn_vol.iloc[1:(window+1)].mean()
-        dn_vol.iloc[window] = temp
-        dn_vol.iloc[1:window] = np.nan
-
-        # EMA processing
-        avg_up = up_vol.ewm(span=window, adjust=False).mean()
-        avg_dn = dn_vol.ewm(span=window, adjust=False).mean()
-        rs = avg_up / avg_dn.values
-        exp_rsi = 100 - 100 / (1+rs)
-        return exp_rsi
-
-    def daily_returns(self):
-        return self.price.pct_change()
-
-    @property
-    def annualized_return(self):
-        dr = self.daily_returns()
-        return self._sf * dr.mean()
-
-    @property
-    def annualized_stdev(self):
-        dr = self.daily_returns()
-        return np.sqrt(self._sf) * dr.std()
-
-    @property
-    def sharpe(self):
-        return (self.annualized_return - self._rfr) / self.annualize_stdev
-
-    def rolling_stdev(self, window):
-        return self.price.rolling(window).std()
-
-    def bollinger(self, window):
-        """
-        Parameters
-        ----------
-        window : TYPE int, optional
-            DESCRIPTION - averaging window in days.
-
-        Returns
-        -------
-        lower, upper : TYPE pandas.DataFrame
-            DESCRIPTION - lower band (minus 2 sigma) and the upper band.
-        """
-        avg = self.sma(window)
-        sdd2 = self.rolling_stdev(window).mul(2)
-        lower = avg.sub(sdd2.values)
-        upper = avg.add(sdd2.values)
-        # low_up = lower.join(upper, lsuffix='_L', rsuffix='_U')
-
-        return lower, upper
-
-    def macd(self, short_wd = 12, long_wd = 26, sig_wd = 9):
-        """
-        MACD Line: (12-day EMA - 26-day EMA)
-        Signal Line: 9-day EMA of MACD Line
-        MACD Histogram: MACD Line - Signal Line
-
-        MACD is calculated by subtracting the 26-period EMA from the 12-period
-        EMA. MACD triggers technical signals when it crosses above (to buy) or
-        below (to sell) its signal line. The speed of crossovers is also taken
-        as a signal of a market is overbought or oversold. MACD helps investors
-        understand whether the bullish or bearish movement in the price is
-        strengthening or weakening
-
-        MACD historgram represents signal line crossovers that are the most
-        common MACD signals. The signal line is a 9-day EMA of the MACD line.
-        As a moving average of the indicator, it trails the MACD and makes it
-        easier to spot MACD turns. A bullish crossover occurs when the MACD
-        turns up and crosses above the signal line. A bearish crossover occurs
-        when the MACD turns down and crosses below the signal line. Crossovers
-        can last a few days or a few weeks, depending on the strength of the
-        move.
-        """
-        macd_short = self.ema(short_wd)
-        macd_long = self.ema(long_wd)
-        macd_line = macd_short - macd_long.values
-        macd_sig = macd_line.ewm(span=sig_wd, adjust=False).mean()
-        macd_hist = macd_line - macd_sig.values
-        norm_hist = macd_hist.div(macd_long.values)
-        return macd_line, macd_sig, macd_hist, norm_hist
-
-def get_crossing(stocks):
-    """
-    Parameters
-    ----------
-    stocks : TYPE instance of class 'security'
-
-    Returns
-    -------
-    cross : TYPE pandas DataFrame
-        DESCRIPTION - +1 when 50 day moving average is above 200 day moving
-        average. -1 when vice versa. transition days are of value +3 and -3
-        respectively.
-    """
-    sma50 = stocks.sma(50)
-    sma200 = stocks.sma(200)
-    cross = np.sign(sma50.sub(sma200.values))
-    cross_diff = cross.diff()
-    cross = cross.add(cross_diff.values)
-    cross.columns = stocks.price.columns
-
-    return cross
-
-def get_sma_slope(stocks, wd = 50):
-    """
-    Parameters
-    ----------
-    stocks : TYPE
-        DESCRIPTION.
-    wd : TYPE, optional
-        DESCRIPTION. The default is 50.
-
-    Returns
-    -------
-    slope : TYPE pandas DataFrame
-        DESCRIPTION - +1 when n day moving average is positive. -1 when
-        negative. transition days are of value +3 and -3 respectively.
-    """
-    sma = stocks.sma(wd)
-    slope = np.sign(sma.diff())
-    slope_diff = slope.diff()
-    slope = slope.add(slope_diff.values)
-
-    return slope
-
-def fill_missing_data(df):
-    df.ffill(inplace=True)
-    df.bfilln(inplace=True)
-
-def fft_convolve(signal, window):
-    fft_signal = fft(signal)
-    fft_window = fft(window)
-    return ifft(fft_signal * fft_window)
-
-def zero_pad(array, n):
-    """Extends an array with zeros.
-
-    array: numpy array
-    n: length of result
-
-    returns: new NumPy array
-    """
-    res = np.zeros(n)
-    res[: len(array)] = array
-    return res
-
-def smooth(price, hsize=10, sigma=3):
-    """
-    Parameters
-    ----------
-    price : TYPE DataFrame.
-        DESCRIPTION - with time index and no invalid values
-    hsize : TYPE integer
-        DESCRIPTION - this adds phase delay. similar to SMA window
-    sigma : TYPE float
-        DESCRIPTION - gaussian standard deviation affects smoothness
-
-    Returns
-    -------
-    TYPE DataFrame
-        DESCRIPTION - smoothed price
-    Doesn't offer much benefit over sma. Only theoretical values. For future
-    different smooth functiona experiments
-    """
-    data = price.copy()
-    window = sig.gaussian(M=hsize, std=sigma)
-    window /= window.sum()
-    padded = zero_pad(window, data.shape[0])
-    for col in data.columns:
-        ys = data[col].values
-        smoo = abs(fft_convolve(ys, padded))
-        smoo[0:hsize-1] = np.nan
-        data[col] = smoo
-
-    return data
--- a/util.py
+++ b/util.py
@ -0,0 +1,384 @@
+"""
+Use Yahoo Finance data
+"""
+
+import warnings
+
+# Suppress FutureWarnings
+warnings.simplefilter(action='ignore', category=FutureWarning)
+
+import datetime as dt
+import os
+import pandas as pd
+import numpy as np
+# import yfinance as yf
+import yahoo_fin.stock_info as si
+import requests
+from lxml import html
+from io import StringIO
+from time import sleep
+
+WEBSITE = 'https://www.isolo.org/dokuwiki/knowledge_base/investing/watchlist'
+BATCHSIZE = 20
+TIMEGAP = 0.2
+
+def fill_missing_data(df):
+    """Return ``df`` with gaps filled forward, then backward.
+
+    The input is left untouched; a new object is returned.
+    """
+    return df.ffill().bfill()
+
+def symbol_to_path(symbol, base_dir=None):
+    """Build the CSV file path for a ticker symbol.
+
+    When ``base_dir`` is None the directory comes from the MARKET_DATA_DIR
+    environment variable, falling back to '../data/'.
+    """
+    directory = base_dir
+    if directory is None:
+        directory = os.environ.get("MARKET_DATA_DIR", '../data/')
+    filename = "{}.csv".format(str(symbol))
+    return os.path.join(directory, filename)
+
+# def get_data_full(symbols, start_date, addSPY=True, colname = 'Adj Close'):
+#     """
+#     Read stock data (adjusted close) for given symbols from Yahoo Finance
+#     from start_date to the latest date available (usually the current date).
+#     """
+#     if addSPY and 'SPY' not in symbols:  # add SPY for reference, if absent
+#         symbols = ['SPY'] + symbols
+
+#     df = yf.download(symbols, start = start_date)[colname]
+#     if len(symbols) == 1:
+#         df.name = symbols[0]
+#         df = df.to_frame()
+#     return df
+
+# def get_data_full(symbols, start_date, addSPY=True, colname = 'Adj Close'):
+#     """
+#     Read stock data (adjusted close) for given symbols from CSV files
+#     from start_date to the latest date available in the CSV files.
+#     """
+#     df_temp = pd.read_csv(symbol_to_path('SPY'), index_col='Date',
+#             parse_dates=True, usecols=['Date', colname], na_values=['nan'])
+#     df_temp = df_temp.rename(columns={colname: 'SPY'})
+#     end_date = df_temp.index.values[-1]
+#     dates = pd.date_range(start_date, end_date)
+#     df = pd.DataFrame(index=dates)
+#     df = df.join(df_temp)
+#     df = df.dropna()
+#     # if addSPY and 'SPY' not in symbols:  # add SPY for reference, if absent
+#     #     symbols = ['SPY'] + symbols
+#     for symbol in symbols:
+#         df_temp = pd.read_csv(symbol_to_path(symbol), index_col='Date',
+#                 parse_dates=True, usecols=['Date', colname], na_values=['nan'])
+#         df_temp = df_temp.rename(columns={colname: symbol})
+#         df = df.join(df_temp)
+#         # if symbol == 'SPY':  # drop dates SPY did not trade
+#         #     df = df.dropna(subset=["SPY"])
+#     if not addSPY:
+#         df = df[symbols]
+#     return df
+
+def get_data_range(df, dates):
+    """Return the rows of ``df`` whose index labels appear in ``dates``.
+
+    An empty frame indexed by ``dates`` is inner-joined with ``df``, so only
+    labels present in both are kept.
+    """
+    skeleton = pd.DataFrame(index=dates)
+    return skeleton.join(df, how='inner')
+
+def yf_download(symbols, start, end):
+    """Download adjusted close and volume for each symbol via yahoo_fin.
+
+    Each symbol is fetched with ``si.get_data`` from ``start`` onward and its
+    'adjclose'/'volume' columns are relabelled 'Adj Close'/'Volume'. The
+    result has two-level columns named ("param", "tick") and is the outer
+    join of all symbols on the row index.
+
+    ``end`` is accepted but not used by this function.
+    """
+    level_names = ["param", "tick"]
+    empty_columns = pd.MultiIndex(levels=[["Adj Close", "Volume"], []],
+                                  codes=[[], []], names=level_names)
+    combined = pd.DataFrame(columns=empty_columns)
+    for sym in symbols:
+        quotes = si.get_data(sym, start_date=start)[["adjclose", "volume"]]
+        quotes = quotes.rename(columns={"adjclose": "Adj Close", "volume": "Volume"})
+        # Tag both columns with the ticker as the second column level.
+        quotes.columns = pd.MultiIndex.from_product(
+            [list(quotes.columns), [sym]], names=level_names)
+        combined = combined.join(quotes, how='outer')
+
+    return combined
+
+# def get_data(symbols, dates, addSPY=True, colname = 'Adj Close'):
+#     """
+#     Read stock data (adjusted close) for given symbols from Yahoo Finance
+#     """
+#     org_sym = symbols
+#     sd = dates[0]
+#     ed = dates[-1]
+#     # if addSPY and 'SPY' not in symbols:  # add SPY for reference, if absent
+#     if 'SPY' not in symbols:
+#         symbols = ['SPY'] + symbols
+#     df = yf.download(symbols, start=sd, end = ed)[colname]
+#     if len(symbols) == 1:
+#         df.name = symbols[0]
+#         df = df.to_frame()
+
+#     df = df.dropna(subset=['SPY'])
+#     df = fill_missing_data(df)
+
+#     if addSPY==False:
+#         # df = df.drop(columns=['SPY'])
+#         df = df[org_sym]
+
+#     return df
+
+def yf_batch_download(symbols, start, end, batch_size, time_gap):
+    """
+    Download in small batches to avoid connection closure by host.
+
+    The symbols are split into consecutive groups of at most ``batch_size``.
+    Previously the remainder was folded into the final call, which could
+    then carry almost twice ``batch_size`` symbols.
+
+    Parameters
+    ----------
+    symbols : list
+        stock symbols.
+    start : datetime
+        start date.
+    end : datetime
+        stop date.
+    batch_size : integer
+        maximum number of symbols per download call; must be >= 1.
+    time_gap : float
+        pause between consecutive batches, in seconds or fraction of seconds.
+
+    Returns
+    -------
+    df : dataframe
+        stock price volume information, the per-batch frames concatenated
+        column-wise in symbol order.
+
+    Raises
+    ------
+    ValueError
+        if ``batch_size`` is smaller than 1.
+    """
+    if batch_size < 1:
+        raise ValueError("batch_size must be a positive integer")
+
+    n = len(symbols)
+    # Always issue at least one call so that an empty symbol list still
+    # yields the (empty) frame layout that yf_download produces.
+    offsets = list(range(0, n, batch_size)) or [0]
+
+    df = pd.DataFrame()
+    for k, lo in enumerate(offsets):
+        if k:
+            # Pause only between batches, never before the first or after the last.
+            sleep(time_gap)
+        tmp = yf_download(symbols[lo:lo + batch_size], start, end)
+        df = pd.concat([df, tmp], axis=1)
+
+    return df
+
+def get_price_volume(symbols, dates, addSPY=False):
+    """
+    Read stock data (adjusted close and volume) for given symbols from local
+    file unless data is not in local. It only gets date from Yahoo Finance
+    when necessary to increase speed and reduce internet data.
+    
+    It will refresh local data if the symbols are on the _refresh.csv. This
+    is necessary when stock splits, spins off or something else happens.
+    """
+    # DATAFILE = "_stkdata.pickle"
+    # REFRESH = "_refresh.csv"
+    org_sym = symbols
+    sd = dates[0]
+    ed = dates[-1]
+    # if addSPY and 'SPY' not in symbols:  # add SPY for reference, if absent
+    if 'SPY' not in symbols:
+        symbols = ['SPY'] + symbols
+
+    df = yf_batch_download(symbols, start=sd, end=ed, \
+                            batch_size=BATCHSIZE, time_gap=TIMEGAP)
+    if len(symbols) == 1:
+        tuples = list(zip(df.columns.values.tolist(), \
+                            [symbols[0]]*len(df.columns.values)))
+        df.columns = pd.MultiIndex.from_tuples(tuples, names=[None, None])           
+
+    # if not os.path.exists(DATAFILE):
+    #     df = yf_batch_download(symbols, start=sd, end=ed, \
+    #                            batch_size=BATCHSIZE, time_gap=TIMEGAP)
+    #     if len(symbols) == 1:
+    #         tuples = list(zip(df.columns.values.tolist(), \
+    #                           [symbols[0]]*len(df.columns.values)))
+    #         df.columns = pd.MultiIndex.from_tuples(tuples, names=[None, None])            
+    # else:
+    #     df = pd.read_pickle(DATAFILE)
+    #     exist_syms = df["Adj Close"].columns.values.tolist()
+    #     if os.path.exists(REFRESH):
+    #         try:
+    #             refresh_df = pd.read_csv(REFRESH, header=None)
+    #             refresh_syms = refresh_df.values.tolist()
+    #             refresh_syms = [x for sublist in refresh_syms for x in sublist]
+    #             remove_syms = [x for x in exist_syms if x in refresh_syms]
+    #             if remove_syms:
+    #                 df.drop(columns=remove_syms, axis=1, level=1, inplace=True)
+    #                 exist_syms = [x for x in exist_syms if x not in refresh_syms]
+    #         except:
+    #             pass
+
+        exist_syms = []
+
+        last_day = pd.to_datetime(df.index.values[-1])
+        first_day = pd.to_datetime(df.index.values[0])
+        intersect_syms = list(set(org_sym) & set(exist_syms))
+        # reduce df to only contain intersect_syms
+        df = df.loc[:, (slice(None), intersect_syms)]
+        
+        if sd < first_day:
+            # fill gap from online
+            tmp_df = yf_batch_download(intersect_syms, start=sd, end=first_day, \
+                               batch_size=BATCHSIZE, time_gap=TIMEGAP)
+            df = pd.concat([tmp_df, df])
+            
+        if ed >= last_day:
+            # fill gap from online incl last two days to get mkt close data
+            if ed.date() == last_day.date():
+                tmp_df = yf_batch_download(intersect_syms, start=ed, end=ed, \
+                               batch_size=BATCHSIZE, time_gap=TIMEGAP)
+            else:
+                tmp_df = yf_batch_download(intersect_syms, start=last_day, end=ed, \
+                               batch_size=BATCHSIZE, time_gap=TIMEGAP)
+            df = pd.concat([df[:-1], tmp_df])
+
+        # get data online when new stks were added    
+        new_stks = np.setdiff1d(symbols, exist_syms).tolist()
+        if not new_stks == []:
+            tmp_df = yf_batch_download(new_stks, start=sd, end=ed, \
+                               batch_size=BATCHSIZE, time_gap=TIMEGAP)
+            if len(new_stks) == 1:
+                tuples = list(zip(tmp_df.columns.values.tolist(), \
+                                  [new_stks[0]]*len(tmp_df.columns.values)))
+                tmp_df.columns = pd.MultiIndex.from_tuples(tuples, names=[None, None])
+            df = df.join(tmp_df)
+
+    # df.to_pickle(DATAFILE) # save to local, overwrite existing file
+    # if os.path.exists(REFRESH):
+    #     with open(REFRESH, 'w'):
+    #         pass
+
+    df = df.dropna(subset=[('Adj Close', 'SPY')])
+    price = df['Adj Close']
+    price = fill_missing_data(price)
+    volume = df['Volume']
+    volume = volume.fillna(0)
+
+    # if len(symbols) == 1:
+    #     price.name = symbols[0]
+    #     volume.name = symbols[0]
+    #     price = price.to_frame()
+    #     volume = volume.to_frame()
+
+    if addSPY==False:
+        price = price[org_sym]
+        volume = volume[org_sym]
+   
+    return price, volume      
+    
+
+# def get_price_volume_online(symbols, dates, addSPY=False):
+#     """
+#     Read stock data (adjusted close and volume) for given symbols from Yahoo
+#     Finance
+#     """
+#     org_sym = symbols
+#     sd = dates[0]
+#     ed = dates[-1]
+#     # if addSPY and 'SPY' not in symbols:  # add SPY for reference, if absent
+#     if 'SPY' not in symbols:
+#         symbols = ['SPY'] + symbols
+#     df = yf.download(symbols, start=sd, end = ed)
+#     if len(symbols) == 1:
+#         df = df.dropna(subset = ['Adj Close'])
+#     else:
+#         df = df.dropna(subset=[('Adj Close', 'SPY')])
+#     price = df['Adj Close']
+#     price = fill_missing_data(price)
+#     volume = df['Volume']
+#     volume = volume.fillna(0)
+
+#     if len(symbols) == 1:
+#         price.name = symbols[0]
+#         volume.name = symbols[0]
+#         price = price.to_frame()
+#         volume = volume.to_frame()
+
+#     if addSPY==False:
+#         price = price[org_sym]
+#         volume = volume[org_sym]
+
+#     return price, volume
+
+def get_watchlist(website: str = WEBSITE):
+    """
+    Download the watchlist that is embedded in a web page.
+
+    The page at `website` is fetched, the first text node matched by the
+    hard-coded XPath below is extracted, and that text is parsed as CSV.
+
+    website: URL of the page to fetch. Defaults to the module-level WEBSITE.
+
+    Returns a DataFrame indexed by the 'Symbol' column. Text following a '#'
+    is treated as a comment, and missing-value detection is turned off
+    (na_filter=False).
+
+    Raises IndexError when the XPath matches nothing on the page.
+    """
+    # Use the caller-supplied URL. The argument used to be ignored and the
+    # global WEBSITE was always requested instead.
+    page = requests.get(website)
+    # page = requests.get(website, verify=False) # skip certificate check for https
+    tree = html.fromstring(page.content)
+    # Only the first matching text node is used.
+    watchlist = tree.xpath('//*[@id="dokuwiki__content"]/div[1]/div/div[3]/div/pre/text()')[0]
+    # Wrap the text in a file-like object so read_csv can parse it.
+    csv_buffer = StringIO(watchlist)
+    df = pd.read_csv(csv_buffer, index_col='Symbol',
+                     comment='#', na_filter=False)
+    return df
+
+# def get_watchlist(file_name: str = 'watchlist.csv'):
+#     df = pd.read_csv(file_name, index_col = 'Symbol',
+#                      comment = '#', na_filter=False)
+#     return df
+
+# def get_data(symbols, dates, addSPY=True, colname = 'Adj Close'):
+#     """
+#     Read stock data (adjusted close) for given symbols from CSV files.
+
+#     (done) TODO: there are nan values in the data when addSPY=False is passed. The
+#     strategy should be using SPY to clean the data first including fill
+#     forward and fill backward, then to drop the SPY if addSPY=False
+#     """
+#     org_sym = symbols
+#     df = pd.DataFrame(index=dates)
+#     # if addSPY and 'SPY' not in symbols:  # add SPY for reference, if absent
+#     #     symbols = ['SPY'] + symbols
+#     if 'SPY' not in symbols:
+#         symbols = ['SPY'] + symbols
+#     for symbol in symbols:
+#         df_temp = pd.read_csv(symbol_to_path(symbol), index_col='Date',
+#                 parse_dates=True, usecols=['Date', colname], na_values=['nan'])
+#         df_temp = df_temp.rename(columns={colname: symbol})
+#         df = df.join(df_temp)
+#         if symbol == 'SPY':  # drop dates SPY did not trade
+#             df = df.dropna(subset=["SPY"])
+#     # fill missing data
+#     df = fill_missing_data(df)
+#     if addSPY == False: # drop SPY
+#         # df = df.drop(columns=['SPY'])
+#         df = df[org_sym]
+
+#     return df
+
+
+def plot_data(df, axs=None, title=None, xlabel='', ylabel=''):
+    """
+    Plot stock prices with a custom title and meaningful axis labels.
+
+    df: object exposing a pandas-style plot() method (e.g. a DataFrame).
+    axs: existing axes to draw on. None or an empty list (the historical
+         default) lets df.plot() create the axes itself.
+    title: passed through unchanged to df.plot().
+    xlabel, ylabel: labels applied to the x and y axes of the result.
+
+    Returns None; the axes returned by df.plot() are labelled and get a grid.
+    """
+    # None replaces the former mutable [] defaults. An explicit empty list is
+    # still recognised so existing callers that pass axs=[] keep working, and
+    # the isinstance check avoids comparing arbitrary objects against [].
+    no_axes_given = axs is None or (isinstance(axs, list) and not axs)
+    if no_axes_given:
+        ax = df.plot(title=title)
+    else:
+        ax = df.plot(ax=axs, title=title)
+    ax.set_xlabel(xlabel)
+    ax.set_ylabel(ylabel)
+    ax.grid()
+
+
+# def plot_data(df, title=[], xlabel='', ylabel=''):
+#     import matplotlib.pyplot as plt
+#     """Plot stock prices with a custom title and meaningful axis labels."""
+#     ax = df.plot(title=title, fontsize=12, figsize=(10, 7))
+#     ax.set_xlabel(xlabel)
+#     ax.set_ylabel(ylabel)
+#     plt.grid()
+#     plt.show()
+
+def get_orders_data_file(basefilename):
+    """
+    Open an orders data file and return the open file object.
+
+    The directory is taken from the ORDERS_DATA_DIR environment variable and
+    falls back to 'orders/'. The caller is responsible for closing the file.
+    """
+    orders_dir = os.environ.get("ORDERS_DATA_DIR", 'orders/')
+    orders_path = os.path.join(orders_dir, basefilename)
+    return open(orders_path)
+
+def get_learner_data_file(basefilename):
+    """
+    Open a learner data file in read mode and return the open file object.
+
+    The directory is taken from the LEARNER_DATA_DIR environment variable and
+    falls back to 'Data/'. The caller is responsible for closing the file.
+    """
+    learner_dir = os.environ.get("LEARNER_DATA_DIR", 'Data/')
+    learner_path = os.path.join(learner_dir, basefilename)
+    return open(learner_path, 'r')
+
+def get_robot_world_file(basefilename):
+    """
+    Open a robot world file and return the open file object.
+
+    The directory is taken from the ROBOT_WORLDS_DIR environment variable and
+    falls back to 'testworlds/'. The caller is responsible for closing the
+    file.
+    """
+    worlds_dir = os.environ.get("ROBOT_WORLDS_DIR", 'testworlds/')
+    world_path = os.path.join(worlds_dir, basefilename)
+    return open(world_path)
+
+
+def test_code():
+    """Fetch roughly one month of GOOG and AMZN data as a quick manual check."""
+    tickers = ['GOOG', 'AMZN']
+    # Lookback window, expressed as a fraction of a year.
+    lookback_years = 0.08
+    end_date = dt.datetime.today()
+    lookback = dt.timedelta(days=365 * lookback_years + 1)
+    start_date = end_date - lookback
+    # If either end of the range falls on a non-trading day, you might get
+    # warnings saying "No data found for this date range, symbol may be
+    # delisted". This is normal behavior.
+    date_span = pd.date_range(start_date, end_date)
+    prices, volume = get_price_volume(tickers, date_span, addSPY=False)
+
+
+# Run test_code() only when this file is executed directly, not when it is
+# imported as a module.
+if __name__ == '__main__':
+    test_code()