From f800fdf4231b6a1ee9c9b9d0915276d52dac8e3b Mon Sep 17 00:00:00 2001 From: cpan Date: Sun, 16 Feb 2020 10:59:22 +0000 Subject: [PATCH] Upload files to '' --- thinkdsp.py | 1830 +++++++++++++++++++++++++++++ thinkplot.py | 890 ++++++++++++++ thinkstats2.py | 3041 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 5761 insertions(+) create mode 100644 thinkdsp.py create mode 100644 thinkplot.py create mode 100644 thinkstats2.py diff --git a/thinkdsp.py b/thinkdsp.py new file mode 100644 index 0000000..066f19e --- /dev/null +++ b/thinkdsp.py @@ -0,0 +1,1830 @@ +"""This file contains code used in "Think DSP", +by Allen B. Downey, available from greenteapress.com + +Copyright 2013 Allen B. Downey +License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html +""" + +from __future__ import print_function, division + +import array +import copy +import math + +import numpy as np +import random +import scipy +import scipy.stats +import scipy.fftpack +import struct +import subprocess +import thinkplot +import warnings + +from fractions import gcd +from wave import open as open_wave + +import matplotlib.pyplot as pyplot + +try: + from IPython.display import Audio +except: + warnings.warn( + "Can't import Audio from IPython.display; " "Wave.make_audio() will not work." + ) + +PI2 = math.pi * 2 + + +def random_seed(x): + """Initialize the random and np.random generators. + + x: int seed + """ + random.seed(x) + np.random.seed(x) + + +class UnimplementedMethodException(Exception): + """Exception if someone calls a method that should be overridden.""" + + +class WavFileWriter: + """Writes wav files.""" + + def __init__(self, filename="sound.wav", framerate=11025): + """Opens the file and sets parameters. + + filename: string + framerate: samples per second + """ + self.filename = filename + self.framerate = framerate + self.nchannels = 1 + self.sampwidth = 2 + self.bits = self.sampwidth * 8 + self.bound = 2 ** (self.bits - 1) - 1 + + self.fmt = "h" + self.dtype = np.int16 + + self.fp = open_wave(self.filename, "w") + self.fp.setnchannels(self.nchannels) + self.fp.setsampwidth(self.sampwidth) + self.fp.setframerate(self.framerate) + + def write(self, wave): + """Writes a wave. + + wave: Wave + """ + zs = wave.quantize(self.bound, self.dtype) + self.fp.writeframes(zs.tostring()) + + def close(self, duration=0): + """Closes the file. + + duration: how many seconds of silence to append + """ + if duration: + self.write(rest(duration)) + + self.fp.close() + + +def read_wave(filename="sound.wav"): + """Reads a wave file. + + filename: string + + returns: Wave + """ + fp = open_wave(filename, "r") + + nchannels = fp.getnchannels() + nframes = fp.getnframes() + sampwidth = fp.getsampwidth() + framerate = fp.getframerate() + + z_str = fp.readframes(nframes) + + fp.close() + + dtype_map = {1: np.int8, 2: np.int16, 3: "special", 4: np.int32} + if sampwidth not in dtype_map: + raise ValueError("sampwidth %d unknown" % sampwidth) + + if sampwidth == 3: + xs = np.fromstring(z_str, dtype=np.int8).astype(np.int32) + ys = (xs[2::3] * 256 + xs[1::3]) * 256 + xs[0::3] + else: + ys = np.fromstring(z_str, dtype=dtype_map[sampwidth]) + + # if it's in stereo, just pull out the first channel + if nchannels == 2: + ys = ys[::2] + + # ts = np.arange(len(ys)) / framerate + wave = Wave(ys, framerate=framerate) + wave.normalize() + return wave + + +def play_wave(filename="sound.wav", player="aplay"): + """Plays a wave file. + + filename: string + player: string name of executable that plays wav files + """ + cmd = "%s %s" % (player, filename) + popen = subprocess.Popen(cmd, shell=True) + popen.communicate() + + +def find_index(x, xs): + """Find the index corresponding to a given value in an array.""" + n = len(xs) + start = xs[0] + end = xs[-1] + i = round((n - 1) * (x - start) / (end - start)) + return int(i) + + +class _SpectrumParent: + """Contains code common to Spectrum and DCT. + """ + + def __init__(self, hs, fs, framerate, full=False): + """Initializes a spectrum. + + hs: array of amplitudes (real or complex) + fs: array of frequencies + framerate: frames per second + full: boolean to indicate full or real FFT + """ + self.hs = np.asanyarray(hs) + self.fs = np.asanyarray(fs) + self.framerate = framerate + self.full = full + + @property + def max_freq(self): + """Returns the Nyquist frequency for this spectrum.""" + return self.framerate / 2 + + @property + def amps(self): + """Returns a sequence of amplitudes (read-only property).""" + return np.absolute(self.hs) + + @property + def power(self): + """Returns a sequence of powers (read-only property).""" + return self.amps ** 2 + + def copy(self): + """Makes a copy. + + Returns: new Spectrum + """ + return copy.deepcopy(self) + + def max_diff(self, other): + """Computes the maximum absolute difference between spectra. + + other: Spectrum + + returns: float + """ + assert self.framerate == other.framerate + assert len(self) == len(other) + + hs = self.hs - other.hs + return np.max(np.abs(hs)) + + def ratio(self, denom, thresh=1, val=0): + """The ratio of two spectrums. + + denom: Spectrum + thresh: values smaller than this are replaced + val: with this value + + returns: new Wave + """ + ratio_spectrum = self.copy() + ratio_spectrum.hs /= denom.hs + ratio_spectrum.hs[denom.amps < thresh] = val + return ratio_spectrum + + def invert(self): + """Inverts this spectrum/filter. + + returns: new Wave + """ + inverse = self.copy() + inverse.hs = 1 / inverse.hs + return inverse + + @property + def freq_res(self): + return self.framerate / 2 / (len(self.fs) - 1) + + def render_full(self, high=None): + """Extracts amps and fs from a full spectrum. + + high: cutoff frequency + + returns: fs, amps + """ + hs = np.fft.fftshift(self.hs) + amps = np.abs(hs) + fs = np.fft.fftshift(self.fs) + i = 0 if high is None else find_index(-high, fs) + j = None if high is None else find_index(high, fs) + 1 + return fs[i:j], amps[i:j] + + def plot(self, high=None, **options): + """Plots amplitude vs frequency. + + Note: if this is a full spectrum, it ignores low and high + + high: frequency to cut off at + """ + if self.full: + fs, amps = self.render_full(high) + thinkplot.plot(fs, amps, **options) + else: + i = None if high is None else find_index(high, self.fs) + thinkplot.plot(self.fs[:i], self.amps[:i], **options) + + def plot_power(self, high=None, **options): + """Plots power vs frequency. + + high: frequency to cut off at + """ + if self.full: + fs, amps = self.render_full(high) + thinkplot.plot(fs, amps ** 2, **options) + else: + i = None if high is None else find_index(high, self.fs) + thinkplot.plot(self.fs[:i], self.power[:i], **options) + + def estimate_slope(self): + """Runs linear regression on log power vs log frequency. + + returns: slope, inter, r2, p, stderr + """ + x = np.log(self.fs[1:]) + y = np.log(self.power[1:]) + t = scipy.stats.linregress(x, y) + return t + + def peaks(self): + """Finds the highest peaks and their frequencies. + + returns: sorted list of (amplitude, frequency) pairs + """ + t = list(zip(self.amps, self.fs)) + t.sort(reverse=True) + return t + + +class Spectrum(_SpectrumParent): + """Represents the spectrum of a signal.""" + + def __len__(self): + """Length of the spectrum.""" + return len(self.hs) + + def __add__(self, other): + """Adds two spectrums elementwise. + + other: Spectrum + + returns: new Spectrum + """ + if other == 0: + return self.copy() + + assert all(self.fs == other.fs) + hs = self.hs + other.hs + return Spectrum(hs, self.fs, self.framerate, self.full) + + __radd__ = __add__ + + def __mul__(self, other): + """Multiplies two spectrums elementwise. + + other: Spectrum + + returns: new Spectrum + """ + assert all(self.fs == other.fs) + hs = self.hs * other.hs + return Spectrum(hs, self.fs, self.framerate, self.full) + + def convolve(self, other): + """Convolves two Spectrums. + + other: Spectrum + + returns: Spectrum + """ + assert all(self.fs == other.fs) + if self.full: + hs1 = np.fft.fftshift(self.hs) + hs2 = np.fft.fftshift(other.hs) + hs = np.convolve(hs1, hs2, mode="same") + hs = np.fft.ifftshift(hs) + else: + # not sure this branch would mean very much + hs = np.convolve(self.hs, other.hs, mode="same") + + return Spectrum(hs, self.fs, self.framerate, self.full) + + @property + def real(self): + """Returns the real part of the hs (read-only property).""" + return np.real(self.hs) + + @property + def imag(self): + """Returns the imaginary part of the hs (read-only property).""" + return np.imag(self.hs) + + @property + def angles(self): + """Returns a sequence of angles (read-only property).""" + return np.angle(self.hs) + + def scale(self, factor): + """Multiplies all elements by the given factor. + + factor: what to multiply the magnitude by (could be complex) + """ + self.hs *= factor + + def low_pass(self, cutoff, factor=0): + """Attenuate frequencies above the cutoff. + + cutoff: frequency in Hz + factor: what to multiply the magnitude by + """ + self.hs[abs(self.fs) > cutoff] *= factor + + def high_pass(self, cutoff, factor=0): + """Attenuate frequencies below the cutoff. + + cutoff: frequency in Hz + factor: what to multiply the magnitude by + """ + self.hs[abs(self.fs) < cutoff] *= factor + + def band_stop(self, low_cutoff, high_cutoff, factor=0): + """Attenuate frequencies between the cutoffs. + + low_cutoff: frequency in Hz + high_cutoff: frequency in Hz + factor: what to multiply the magnitude by + """ + # TODO: test this function + fs = abs(self.fs) + indices = (low_cutoff < fs) & (fs < high_cutoff) + self.hs[indices] *= factor + + def pink_filter(self, beta=1): + """Apply a filter that would make white noise pink. + + beta: exponent of the pink noise + """ + denom = self.fs ** (beta / 2.0) + denom[0] = 1 + self.hs /= denom + + def differentiate(self): + """Apply the differentiation filter. + + returns: new Spectrum + """ + new = self.copy() + new.hs *= PI2 * 1j * new.fs + return new + + def integrate(self): + """Apply the integration filter. + + returns: new Spectrum + """ + new = self.copy() + new.hs /= PI2 * 1j * new.fs + return new + + def make_integrated_spectrum(self): + """Makes an integrated spectrum. + """ + cs = np.cumsum(self.power) + cs /= cs[-1] + return IntegratedSpectrum(cs, self.fs) + + def make_wave(self): + """Transforms to the time domain. + + returns: Wave + """ + if self.full: + ys = np.fft.ifft(self.hs) + else: + ys = np.fft.irfft(self.hs) + + # NOTE: whatever the start time was, we lose it when + # we transform back; we could fix that by saving start + # time in the Spectrum + # ts = self.start + np.arange(len(ys)) / self.framerate + return Wave(ys, framerate=self.framerate) + + +class IntegratedSpectrum: + """Represents the integral of a spectrum.""" + + def __init__(self, cs, fs): + """Initializes an integrated spectrum: + + cs: sequence of cumulative amplitudes + fs: sequence of frequencies + """ + self.cs = np.asanyarray(cs) + self.fs = np.asanyarray(fs) + + def plot_power(self, low=0, high=None, expo=False, **options): + """Plots the integrated spectrum. + + low: int index to start at + high: int index to end at + """ + cs = self.cs[low:high] + fs = self.fs[low:high] + + if expo: + cs = np.exp(cs) + + thinkplot.plot(fs, cs, **options) + + def estimate_slope(self, low=1, high=-12000): + """Runs linear regression on log cumulative power vs log frequency. + + returns: slope, inter, r2, p, stderr + """ + # print self.fs[low:high] + # print self.cs[low:high] + x = np.log(self.fs[low:high]) + y = np.log(self.cs[low:high]) + t = scipy.stats.linregress(x, y) + return t + + +class Dct(_SpectrumParent): + """Represents the spectrum of a signal using discrete cosine transform.""" + + @property + def amps(self): + """Returns a sequence of amplitudes (read-only property). + + Note: for DCTs, amps are positive or negative real. + """ + return self.hs + + def __add__(self, other): + """Adds two DCTs elementwise. + + other: DCT + + returns: new DCT + """ + if other == 0: + return self + + assert self.framerate == other.framerate + hs = self.hs + other.hs + return Dct(hs, self.fs, self.framerate) + + __radd__ = __add__ + + def make_wave(self): + """Transforms to the time domain. + + returns: Wave + """ + N = len(self.hs) + ys = scipy.fftpack.idct(self.hs, type=2) / 2 / N + # NOTE: whatever the start time was, we lose it when + # we transform back + # ts = self.start + np.arange(len(ys)) / self.framerate + return Wave(ys, framerate=self.framerate) + + +class Spectrogram: + """Represents the spectrum of a signal.""" + + def __init__(self, spec_map, seg_length): + """Initialize the spectrogram. + + spec_map: map from float time to Spectrum + seg_length: number of samples in each segment + """ + self.spec_map = spec_map + self.seg_length = seg_length + + def any_spectrum(self): + """Returns an arbitrary spectrum from the spectrogram.""" + index = next(iter(self.spec_map)) + return self.spec_map[index] + + @property + def time_res(self): + """Time resolution in seconds.""" + spectrum = self.any_spectrum() + return float(self.seg_length) / spectrum.framerate + + @property + def freq_res(self): + """Frequency resolution in Hz.""" + return self.any_spectrum().freq_res + + def times(self): + """Sorted sequence of times. + + returns: sequence of float times in seconds + """ + ts = sorted(iter(self.spec_map)) + return ts + + def frequencies(self): + """Sequence of frequencies. + + returns: sequence of float freqencies in Hz. + """ + fs = self.any_spectrum().fs + return fs + + def plot(self, high=None, **options): + """Make a pseudocolor plot. + + high: highest frequency component to plot + """ + fs = self.frequencies() + i = None if high is None else find_index(high, fs) + fs = fs[:i] + ts = self.times() + + # make the array + size = len(fs), len(ts) + array = np.zeros(size, dtype=np.float) + + # copy amplitude from each spectrum into a column of the array + for j, t in enumerate(ts): + spectrum = self.spec_map[t] + array[:, j] = spectrum.amps[:i] + + thinkplot.pcolor(ts, fs, array, **options) + + def make_wave(self): + """Inverts the spectrogram and returns a Wave. + + returns: Wave + """ + res = [] + for t, spectrum in sorted(self.spec_map.items()): + wave = spectrum.make_wave() + n = len(wave) + + window = 1 / np.hamming(n) + wave.window(window) + + i = wave.find_index(t) + start = i - n // 2 + end = start + n + res.append((start, end, wave)) + + starts, ends, waves = zip(*res) + low = min(starts) + high = max(ends) + + ys = np.zeros(high - low, np.float) + for start, end, wave in res: + ys[start:end] = wave.ys + + # ts = np.arange(len(ys)) / self.framerate + return Wave(ys, framerate=wave.framerate) + + +class Wave: + """Represents a discrete-time waveform. + + """ + + def __init__(self, ys, ts=None, framerate=None): + """Initializes the wave. + + ys: wave array + ts: array of times + framerate: samples per second + """ + self.ys = np.asanyarray(ys) + self.framerate = framerate if framerate is not None else 11025 + + if ts is None: + self.ts = np.arange(len(ys)) / self.framerate + else: + self.ts = np.asanyarray(ts) + + def copy(self): + """Makes a copy. + + Returns: new Wave + """ + return copy.deepcopy(self) + + def __len__(self): + return len(self.ys) + + @property + def start(self): + return self.ts[0] + + @property + def end(self): + return self.ts[-1] + + @property + def duration(self): + """Duration (property). + + returns: float duration in seconds + """ + return len(self.ys) / self.framerate + + def __add__(self, other): + """Adds two waves elementwise. + + other: Wave + + returns: new Wave + """ + if other == 0: + return self + + assert self.framerate == other.framerate + + # make an array of times that covers both waves + start = min(self.start, other.start) + end = max(self.end, other.end) + n = int(round((end - start) * self.framerate)) + 1 + ys = np.zeros(n) + ts = start + np.arange(n) / self.framerate + + def add_ys(wave): + i = find_index(wave.start, ts) + + # make sure the arrays line up reasonably well + diff = ts[i] - wave.start + dt = 1 / wave.framerate + if (diff / dt) > 0.1: + warnings.warn( + "Can't add these waveforms; their " "time arrays don't line up." + ) + + j = i + len(wave) + ys[i:j] += wave.ys + + add_ys(self) + add_ys(other) + + return Wave(ys, ts, self.framerate) + + __radd__ = __add__ + + def __or__(self, other): + """Concatenates two waves. + + other: Wave + + returns: new Wave + """ + if self.framerate != other.framerate: + raise ValueError("Wave.__or__: framerates do not agree") + + ys = np.concatenate((self.ys, other.ys)) + # ts = np.arange(len(ys)) / self.framerate + return Wave(ys, framerate=self.framerate) + + def __mul__(self, other): + """Multiplies two waves elementwise. + + Note: this operation ignores the timestamps; the result + has the timestamps of self. + + other: Wave + + returns: new Wave + """ + # the spectrums have to have the same framerate and duration + assert self.framerate == other.framerate + assert len(self) == len(other) + + ys = self.ys * other.ys + return Wave(ys, self.ts, self.framerate) + + def max_diff(self, other): + """Computes the maximum absolute difference between waves. + + other: Wave + + returns: float + """ + assert self.framerate == other.framerate + assert len(self) == len(other) + + ys = self.ys - other.ys + return np.max(np.abs(ys)) + + def convolve(self, other): + """Convolves two waves. + + Note: this operation ignores the timestamps; the result + has the timestamps of self. + + other: Wave or NumPy array + + returns: Wave + """ + if isinstance(other, Wave): + assert self.framerate == other.framerate + window = other.ys + else: + window = other + + ys = np.convolve(self.ys, window, mode="full") + # ts = np.arange(len(ys)) / self.framerate + return Wave(ys, framerate=self.framerate) + + def diff(self): + """Computes the difference between successive elements. + + returns: new Wave + """ + ys = np.diff(self.ys) + ts = self.ts[1:].copy() + return Wave(ys, ts, self.framerate) + + def cumsum(self): + """Computes the cumulative sum of the elements. + + returns: new Wave + """ + ys = np.cumsum(self.ys) + ts = self.ts.copy() + return Wave(ys, ts, self.framerate) + + def quantize(self, bound, dtype): + """Maps the waveform to quanta. + + bound: maximum amplitude + dtype: numpy data type or string + + returns: quantized signal + """ + return quantize(self.ys, bound, dtype) + + def apodize(self, denom=20, duration=0.1): + """Tapers the amplitude at the beginning and end of the signal. + + Tapers either the given duration of time or the given + fraction of the total duration, whichever is less. + + denom: float fraction of the segment to taper + duration: float duration of the taper in seconds + """ + self.ys = apodize(self.ys, self.framerate, denom, duration) + + def hamming(self): + """Apply a Hamming window to the wave. + """ + self.ys *= np.hamming(len(self.ys)) + + def window(self, window): + """Apply a window to the wave. + + window: sequence of multipliers, same length as self.ys + """ + self.ys *= window + + def scale(self, factor): + """Multplies the wave by a factor. + + factor: scale factor + """ + self.ys *= factor + + def shift(self, shift): + """Shifts the wave left or right in time. + + shift: float time shift + """ + # TODO: track down other uses of this function and check them + self.ts += shift + + def roll(self, roll): + """Rolls this wave by the given number of locations. + """ + self.ys = np.roll(self.ys, roll) + + def truncate(self, n): + """Trims this wave to the given length. + + n: integer index + """ + self.ys = truncate(self.ys, n) + self.ts = truncate(self.ts, n) + + def zero_pad(self, n): + """Trims this wave to the given length. + + n: integer index + """ + self.ys = zero_pad(self.ys, n) + self.ts = self.start + np.arange(n) / self.framerate + + def normalize(self, amp=1.0): + """Normalizes the signal to the given amplitude. + + amp: float amplitude + """ + self.ys = normalize(self.ys, amp=amp) + + def unbias(self): + """Unbiases the signal. + """ + self.ys = unbias(self.ys) + + def find_index(self, t): + """Find the index corresponding to a given time.""" + n = len(self) + start = self.start + end = self.end + i = round((n - 1) * (t - start) / (end - start)) + return int(i) + + def segment(self, start=None, duration=None): + """Extracts a segment. + + start: float start time in seconds + duration: float duration in seconds + + returns: Wave + """ + if start is None: + start = self.ts[0] + i = 0 + else: + i = self.find_index(start) + + j = None if duration is None else self.find_index(start + duration) + return self.slice(i, j) + + def slice(self, i, j): + """Makes a slice from a Wave. + + i: first slice index + j: second slice index + """ + ys = self.ys[i:j].copy() + ts = self.ts[i:j].copy() + return Wave(ys, ts, self.framerate) + + def make_spectrum(self, full=False): + """Computes the spectrum using FFT. + + returns: Spectrum + """ + n = len(self.ys) + d = 1 / self.framerate + + if full: + hs = np.fft.fft(self.ys) + fs = np.fft.fftfreq(n, d) + else: + hs = np.fft.rfft(self.ys) + fs = np.fft.rfftfreq(n, d) + + return Spectrum(hs, fs, self.framerate, full) + + def make_dct(self): + """Computes the DCT of this wave. + """ + N = len(self.ys) + hs = scipy.fftpack.dct(self.ys, type=2) + fs = (0.5 + np.arange(N)) / 2 + return Dct(hs, fs, self.framerate) + + def make_spectrogram(self, seg_length, win_flag=True): + """Computes the spectrogram of the wave. + + seg_length: number of samples in each segment + win_flag: boolean, whether to apply hamming window to each segment + + returns: Spectrogram + """ + if win_flag: + window = np.hamming(seg_length) + i, j = 0, seg_length + step = int(seg_length // 2) + + # map from time to Spectrum + spec_map = {} + + while j < len(self.ys): + segment = self.slice(i, j) + if win_flag: + segment.window(window) + + # the nominal time for this segment is the midpoint + t = (segment.start + segment.end) / 2 + spec_map[t] = segment.make_spectrum() + + i += step + j += step + + return Spectrogram(spec_map, seg_length) + + def get_xfactor(self, options): + try: + xfactor = options["xfactor"] + options.pop("xfactor") + except KeyError: + xfactor = 1 + return xfactor + + def plot(self, **options): + """Plots the wave. + + """ + xfactor = self.get_xfactor(options) + thinkplot.plot(self.ts * xfactor, self.ys, **options) + + def plot_vlines(self, **options): + """Plots the wave with vertical lines for samples. + + """ + xfactor = self.get_xfactor(options) + thinkplot.vlines(self.ts * xfactor, 0, self.ys, **options) + + def corr(self, other): + """Correlation coefficient two waves. + + other: Wave + + returns: float coefficient of correlation + """ + corr = np.corrcoef(self.ys, other.ys)[0, 1] + return corr + + def cov_mat(self, other): + """Covariance matrix of two waves. + + other: Wave + + returns: 2x2 covariance matrix + """ + return np.cov(self.ys, other.ys) + + def cov(self, other): + """Covariance of two unbiased waves. + + other: Wave + + returns: float + """ + total = sum(self.ys * other.ys) / len(self.ys) + return total + + def cos_cov(self, k): + """Covariance with a cosine signal. + + freq: freq of the cosine signal in Hz + + returns: float covariance + """ + n = len(self.ys) + factor = math.pi * k / n + ys = [math.cos(factor * (i + 0.5)) for i in range(n)] + total = 2 * sum(self.ys * ys) + return total + + def cos_transform(self): + """Discrete cosine transform. + + returns: list of frequency, cov pairs + """ + n = len(self.ys) + res = [] + for k in range(n): + cov = self.cos_cov(k) + res.append((k, cov)) + + return res + + def write(self, filename="sound.wav"): + """Write a wave file. + + filename: string + """ + print("Writing", filename) + wfile = WavFileWriter(filename, self.framerate) + wfile.write(self) + wfile.close() + + def play(self, filename="sound.wav"): + """Plays a wave file. + + filename: string + """ + self.write(filename) + play_wave(filename) + + def make_audio(self): + """Makes an IPython Audio object. + """ + audio = Audio(data=self.ys.real, rate=self.framerate) + return audio + + +def unbias(ys): + """Shifts a wave array so it has mean 0. + + ys: wave array + + returns: wave array + """ + return ys - ys.mean() + + +def normalize(ys, amp=1.0): + """Normalizes a wave array so the maximum amplitude is +amp or -amp. + + ys: wave array + amp: max amplitude (pos or neg) in result + + returns: wave array + """ + high, low = abs(max(ys)), abs(min(ys)) + return amp * ys / max(high, low) + + +def shift_right(ys, shift): + """Shifts a wave array to the right and zero pads. + + ys: wave array + shift: integer shift + + returns: wave array + """ + res = np.zeros(len(ys) + shift) + res[shift:] = ys + return res + + +def shift_left(ys, shift): + """Shifts a wave array to the left. + + ys: wave array + shift: integer shift + + returns: wave array + """ + return ys[shift:] + + +def truncate(ys, n): + """Trims a wave array to the given length. + + ys: wave array + n: integer length + + returns: wave array + """ + return ys[:n] + + +def quantize(ys, bound, dtype): + """Maps the waveform to quanta. + + ys: wave array + bound: maximum amplitude + dtype: numpy data type of the result + + returns: quantized signal + """ + if max(ys) > 1 or min(ys) < -1: + warnings.warn("Warning: normalizing before quantizing.") + ys = normalize(ys) + + zs = (ys * bound).astype(dtype) + return zs + + +def apodize(ys, framerate, denom=20, duration=0.1): + """Tapers the amplitude at the beginning and end of the signal. + + Tapers either the given duration of time or the given + fraction of the total duration, whichever is less. + + ys: wave array + framerate: int frames per second + denom: float fraction of the segment to taper + duration: float duration of the taper in seconds + + returns: wave array + """ + # a fixed fraction of the segment + n = len(ys) + k1 = n // denom + + # a fixed duration of time + k2 = int(duration * framerate) + + k = min(k1, k2) + + w1 = np.linspace(0, 1, k) + w2 = np.ones(n - 2 * k) + w3 = np.linspace(1, 0, k) + + window = np.concatenate((w1, w2, w3)) + return ys * window + + +class Signal: + """Represents a time-varying signal.""" + + def __add__(self, other): + """Adds two signals. + + other: Signal + + returns: Signal + """ + if other == 0: + return self + return SumSignal(self, other) + + __radd__ = __add__ + + @property + def period(self): + """Period of the signal in seconds (property). + + Since this is used primarily for purposes of plotting, + the default behavior is to return a value, 0.1 seconds, + that is reasonable for many signals. + + returns: float seconds + """ + return 0.1 + + def plot(self, framerate=11025): + """Plots the signal. + + The default behavior is to plot three periods. + + framerate: samples per second + """ + duration = self.period * 3 + wave = self.make_wave(duration, start=0, framerate=framerate) + wave.plot() + + def make_wave(self, duration=1, start=0, framerate=11025): + """Makes a Wave object. + + duration: float seconds + start: float seconds + framerate: int frames per second + + returns: Wave + """ + n = round(duration * framerate) + ts = start + np.arange(n) / framerate + ys = self.evaluate(ts) + return Wave(ys, ts, framerate=framerate) + + +def infer_framerate(ts): + """Given ts, find the framerate. + + Assumes that the ts are equally spaced. + + ts: sequence of times in seconds + + returns: frames per second + """ + # TODO: confirm that this is never used and remove it + dt = ts[1] - ts[0] + framerate = 1.0 / dt + return framerate + + +class SumSignal(Signal): + """Represents the sum of signals.""" + + def __init__(self, *args): + """Initializes the sum. + + args: tuple of signals + """ + self.signals = args + + @property + def period(self): + """Period of the signal in seconds. + + Note: this is not correct; it's mostly a placekeeper. + + But it is correct for a harmonic sequence where all + component frequencies are multiples of the fundamental. + + returns: float seconds + """ + return max(sig.period for sig in self.signals) + + def evaluate(self, ts): + """Evaluates the signal at the given times. + + ts: float array of times + + returns: float wave array + """ + ts = np.asarray(ts) + return sum(sig.evaluate(ts) for sig in self.signals) + + +class Sinusoid(Signal): + """Represents a sinusoidal signal.""" + + def __init__(self, freq=440, amp=1.0, offset=0, func=np.sin): + """Initializes a sinusoidal signal. + + freq: float frequency in Hz + amp: float amplitude, 1.0 is nominal max + offset: float phase offset in radians + func: function that maps phase to amplitude + """ + self.freq = freq + self.amp = amp + self.offset = offset + self.func = func + + @property + def period(self): + """Period of the signal in seconds. + + returns: float seconds + """ + return 1.0 / self.freq + + def evaluate(self, ts): + """Evaluates the signal at the given times. + + ts: float array of times + + returns: float wave array + """ + ts = np.asarray(ts) + phases = PI2 * self.freq * ts + self.offset + ys = self.amp * self.func(phases) + return ys + + +def CosSignal(freq=440, amp=1.0, offset=0): + """Makes a cosine Sinusoid. + + freq: float frequency in Hz + amp: float amplitude, 1.0 is nominal max + offset: float phase offset in radians + + returns: Sinusoid object + """ + return Sinusoid(freq, amp, offset, func=np.cos) + + +def SinSignal(freq=440, amp=1.0, offset=0): + """Makes a sine Sinusoid. + + freq: float frequency in Hz + amp: float amplitude, 1.0 is nominal max + offset: float phase offset in radians + + returns: Sinusoid object + """ + return Sinusoid(freq, amp, offset, func=np.sin) + + +def Sinc(freq=440, amp=1.0, offset=0): + """Makes a Sinc function. + + freq: float frequency in Hz + amp: float amplitude, 1.0 is nominal max + offset: float phase offset in radians + + returns: Sinusoid object + """ + return Sinusoid(freq, amp, offset, func=np.sinc) + + +class ComplexSinusoid(Sinusoid): + """Represents a complex exponential signal.""" + + def evaluate(self, ts): + """Evaluates the signal at the given times. + + ts: float array of times + + returns: float wave array + """ + ts = np.asarray(ts) + phases = PI2 * self.freq * ts + self.offset + ys = self.amp * np.exp(1j * phases) + return ys + + +class SquareSignal(Sinusoid): + """Represents a square signal.""" + + def evaluate(self, ts): + """Evaluates the signal at the given times. + + ts: float array of times + + returns: float wave array + """ + ts = np.asarray(ts) + cycles = self.freq * ts + self.offset / PI2 + frac, _ = np.modf(cycles) + ys = self.amp * np.sign(unbias(frac)) + return ys + + +class SawtoothSignal(Sinusoid): + """Represents a sawtooth signal.""" + + def evaluate(self, ts): + """Evaluates the signal at the given times. + + ts: float array of times + + returns: float wave array + """ + ts = np.asarray(ts) + cycles = self.freq * ts + self.offset / PI2 + frac, _ = np.modf(cycles) + ys = normalize(unbias(frac), self.amp) + return ys + + +class ParabolicSignal(Sinusoid): + """Represents a parabolic signal.""" + + def evaluate(self, ts): + """Evaluates the signal at the given times. + + ts: float array of times + + returns: float wave array + """ + ts = np.asarray(ts) + cycles = self.freq * ts + self.offset / PI2 + frac, _ = np.modf(cycles) + ys = (frac - 0.5) ** 2 + ys = normalize(unbias(ys), self.amp) + return ys + + +class CubicSignal(ParabolicSignal): + """Represents a cubic signal.""" + + def evaluate(self, ts): + """Evaluates the signal at the given times. + + ts: float array of times + + returns: float wave array + """ + ys = ParabolicSignal.evaluate(self, ts) + ys = np.cumsum(ys) + ys = normalize(unbias(ys), self.amp) + return ys + + +class GlottalSignal(Sinusoid): + """Represents a periodic signal that resembles a glottal signal.""" + + def evaluate(self, ts): + """Evaluates the signal at the given times. + + ts: float array of times + + returns: float wave array + """ + ts = np.asarray(ts) + cycles = self.freq * ts + self.offset / PI2 + frac, _ = np.modf(cycles) + ys = frac ** 2 * (1 - frac) + ys = normalize(unbias(ys), self.amp) + return ys + + +class TriangleSignal(Sinusoid): + """Represents a triangle signal.""" + + def evaluate(self, ts): + """Evaluates the signal at the given times. + + ts: float array of times + + returns: float wave array + """ + ts = np.asarray(ts) + cycles = self.freq * ts + self.offset / PI2 + frac, _ = np.modf(cycles) + ys = np.abs(frac - 0.5) + ys = normalize(unbias(ys), self.amp) + return ys + + +class Chirp(Signal): + """Represents a signal with variable frequency.""" + + def __init__(self, start=440, end=880, amp=1.0): + """Initializes a linear chirp. + + start: float frequency in Hz + end: float frequency in Hz + amp: float amplitude, 1.0 is nominal max + """ + self.start = start + self.end = end + self.amp = amp + + @property + def period(self): + """Period of the signal in seconds. + + returns: float seconds + """ + return ValueError("Non-periodic signal.") + + def evaluate(self, ts): + """Evaluates the signal at the given times. + + ts: float array of times + + returns: float wave array + """ + freqs = np.linspace(self.start, self.end, len(ts) - 1) + return self._evaluate(ts, freqs) + + def _evaluate(self, ts, freqs): + """Helper function that evaluates the signal. + + ts: float array of times + freqs: float array of frequencies during each interval + """ + dts = np.diff(ts) + dps = PI2 * freqs * dts + phases = np.cumsum(dps) + phases = np.insert(phases, 0, 0) + ys = self.amp * np.cos(phases) + return ys + + +class ExpoChirp(Chirp): + """Represents a signal with varying frequency.""" + + def evaluate(self, ts): + """Evaluates the signal at the given times. + + ts: float array of times + + returns: float wave array + """ + start, end = np.log10(self.start), np.log10(self.end) + freqs = np.logspace(start, end, len(ts) - 1) + return self._evaluate(ts, freqs) + + +class SilentSignal(Signal): + """Represents silence.""" + + def evaluate(self, ts): + """Evaluates the signal at the given times. + + ts: float array of times + + returns: float wave array + """ + return np.zeros(len(ts)) + + +class Impulses(Signal): + """Represents silence.""" + + def __init__(self, locations, amps=1): + self.locations = np.asanyarray(locations) + self.amps = amps + + def evaluate(self, ts): + """Evaluates the signal at the given times. + + ts: float array of times + + returns: float wave array + """ + ys = np.zeros(len(ts)) + indices = np.searchsorted(ts, self.locations) + ys[indices] = self.amps + return ys + + +class _Noise(Signal): + """Represents a noise signal (abstract parent class).""" + + def __init__(self, amp=1.0): + """Initializes a white noise signal. + + amp: float amplitude, 1.0 is nominal max + """ + self.amp = amp + + @property + def period(self): + """Period of the signal in seconds. + + returns: float seconds + """ + return ValueError("Non-periodic signal.") + + +class UncorrelatedUniformNoise(_Noise): + """Represents uncorrelated uniform noise.""" + + def evaluate(self, ts): + """Evaluates the signal at the given times. + + ts: float array of times + + returns: float wave array + """ + ys = np.random.uniform(-self.amp, self.amp, len(ts)) + return ys + + +class UncorrelatedGaussianNoise(_Noise): + """Represents uncorrelated gaussian noise.""" + + def evaluate(self, ts): + """Evaluates the signal at the given times. + + ts: float array of times + + returns: float wave array + """ + ys = np.random.normal(0, self.amp, len(ts)) + return ys + + +class BrownianNoise(_Noise): + """Represents Brownian noise, aka red noise.""" + + def evaluate(self, ts): + """Evaluates the signal at the given times. + + Computes Brownian noise by taking the cumulative sum of + a uniform random series. + + ts: float array of times + + returns: float wave array + """ + dys = np.random.uniform(-1, 1, len(ts)) + # ys = scipy.integrate.cumtrapz(dys, ts) + ys = np.cumsum(dys) + ys = normalize(unbias(ys), self.amp) + return ys + + +class PinkNoise(_Noise): + """Represents Brownian noise, aka red noise.""" + + def __init__(self, amp=1.0, beta=1.0): + """Initializes a pink noise signal. + + amp: float amplitude, 1.0 is nominal max + """ + self.amp = amp + self.beta = beta + + def make_wave(self, duration=1, start=0, framerate=11025): + """Makes a Wave object. + + duration: float seconds + start: float seconds + framerate: int frames per second + + returns: Wave + """ + signal = UncorrelatedUniformNoise() + wave = signal.make_wave(duration, start, framerate) + spectrum = wave.make_spectrum() + + spectrum.pink_filter(beta=self.beta) + + wave2 = spectrum.make_wave() + wave2.unbias() + wave2.normalize(self.amp) + return wave2 + + +def rest(duration): + """Makes a rest of the given duration. + + duration: float seconds + + returns: Wave + """ + signal = SilentSignal() + wave = signal.make_wave(duration) + return wave + + +def make_note(midi_num, duration, sig_cons=CosSignal, framerate=11025): + """Make a MIDI note with the given duration. + + midi_num: int MIDI note number + duration: float seconds + sig_cons: Signal constructor function + framerate: int frames per second + + returns: Wave + """ + freq = midi_to_freq(midi_num) + signal = sig_cons(freq) + wave = signal.make_wave(duration, framerate=framerate) + wave.apodize() + return wave + + +def make_chord(midi_nums, duration, sig_cons=CosSignal, framerate=11025): + """Make a chord with the given duration. + + midi_nums: sequence of int MIDI note numbers + duration: float seconds + sig_cons: Signal constructor function + framerate: int frames per second + + returns: Wave + """ + freqs = [midi_to_freq(num) for num in midi_nums] + signal = sum(sig_cons(freq) for freq in freqs) + wave = signal.make_wave(duration, framerate=framerate) + wave.apodize() + return wave + + +def midi_to_freq(midi_num): + """Converts MIDI note number to frequency. + + midi_num: int MIDI note number + + returns: float frequency in Hz + """ + x = (midi_num - 69) / 12.0 + freq = 440.0 * 2 ** x + return freq + + +def sin_wave(freq, duration=1, offset=0): + """Makes a sine wave with the given parameters. + + freq: float cycles per second + duration: float seconds + offset: float radians + + returns: Wave + """ + signal = SinSignal(freq, offset=offset) + wave = signal.make_wave(duration) + return wave + + +def cos_wave(freq, duration=1, offset=0): + """Makes a cosine wave with the given parameters. + + freq: float cycles per second + duration: float seconds + offset: float radians + + returns: Wave + """ + signal = CosSignal(freq, offset=offset) + wave = signal.make_wave(duration) + return wave + + +def mag(a): + """Computes the magnitude of a numpy array. + + a: numpy array + + returns: float + """ + return np.sqrt(np.dot(a, a)) + + +def zero_pad(array, n): + """Extends an array with zeros. + + array: numpy array + n: length of result + + returns: new NumPy array + """ + res = np.zeros(n) + res[: len(array)] = array + return res + + +def main(): + + cos_basis = cos_wave(440) + sin_basis = sin_wave(440) + + wave = cos_wave(440, offset=math.pi / 2) + cos_cov = cos_basis.cov(wave) + sin_cov = sin_basis.cov(wave) + print(cos_cov, sin_cov, mag((cos_cov, sin_cov))) + return + + wfile = WavFileWriter() + for sig_cons in [ + SinSignal, + TriangleSignal, + SawtoothSignal, + GlottalSignal, + ParabolicSignal, + SquareSignal, + ]: + print(sig_cons) + sig = sig_cons(440) + wave = sig.make_wave(1) + wave.apodize() + wfile.write(wave) + wfile.close() + return + + signal = GlottalSignal(440) + signal.plot() + pyplot.show() + return + + wfile = WavFileWriter() + for m in range(60, 0, -1): + wfile.write(make_note(m, 0.25)) + wfile.close() + return + + wave1 = make_note(69, 1) + wave2 = make_chord([69, 72, 76], 1) + wave = wave1 | wave2 + + wfile = WavFileWriter() + wfile.write(wave) + wfile.close() + return + + sig1 = CosSignal(freq=440) + sig2 = CosSignal(freq=523.25) + sig3 = CosSignal(freq=660) + sig4 = CosSignal(freq=880) + sig5 = CosSignal(freq=987) + sig = sig1 + sig2 + sig3 + sig4 + + # wave = Wave(sig, duration=0.02) + # wave.plot() + + wave = sig.make_wave(duration=1) + # wave.normalize() + + wfile = WavFileWriter(wave) + wfile.write() + wfile.close() + + +if __name__ == "__main__": + main() diff --git a/thinkplot.py b/thinkplot.py new file mode 100644 index 0000000..e68f246 --- /dev/null +++ b/thinkplot.py @@ -0,0 +1,890 @@ +"""This file contains code for use with "Think Stats", +by Allen B. Downey, available from greenteapress.com + +Copyright 2014 Allen B. Downey +License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html +""" + +from __future__ import print_function + +import math +import matplotlib +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd + +import warnings + +# customize some matplotlib attributes +#matplotlib.rc('figure', figsize=(4, 3)) + +#matplotlib.rc('font', size=14.0) +#matplotlib.rc('axes', labelsize=22.0, titlesize=22.0) +#matplotlib.rc('legend', fontsize=20.0) + +#matplotlib.rc('xtick.major', size=6.0) +#matplotlib.rc('xtick.minor', size=3.0) + +#matplotlib.rc('ytick.major', size=6.0) +#matplotlib.rc('ytick.minor', size=3.0) + + +class _Brewer(object): + """Encapsulates a nice sequence of colors. + + Shades of blue that look good in color and can be distinguished + in grayscale (up to a point). + + Borrowed from http://colorbrewer2.org/ + """ + color_iter = None + + colors = ['#f7fbff', '#deebf7', '#c6dbef', + '#9ecae1', '#6baed6', '#4292c6', + '#2171b5','#08519c','#08306b'][::-1] + + # lists that indicate which colors to use depending on how many are used + which_colors = [[], + [1], + [1, 3], + [0, 2, 4], + [0, 2, 4, 6], + [0, 2, 3, 5, 6], + [0, 2, 3, 4, 5, 6], + [0, 1, 2, 3, 4, 5, 6], + [0, 1, 2, 3, 4, 5, 6, 7], + [0, 1, 2, 3, 4, 5, 6, 7, 8], + ] + + current_figure = None + + @classmethod + def Colors(cls): + """Returns the list of colors. + """ + return cls.colors + + @classmethod + def ColorGenerator(cls, num): + """Returns an iterator of color strings. + + n: how many colors will be used + """ + for i in cls.which_colors[num]: + yield cls.colors[i] + raise StopIteration('Ran out of colors in _Brewer.') + + @classmethod + def InitIter(cls, num): + """Initializes the color iterator with the given number of colors.""" + cls.color_iter = cls.ColorGenerator(num) + fig = plt.gcf() + cls.current_figure = fig + + @classmethod + def ClearIter(cls): + """Sets the color iterator to None.""" + cls.color_iter = None + cls.current_figure = None + + @classmethod + def GetIter(cls, num): + """Gets the color iterator.""" + fig = plt.gcf() + if fig != cls.current_figure: + cls.InitIter(num) + cls.current_figure = fig + + if cls.color_iter is None: + cls.InitIter(num) + + return cls.color_iter + + +def _UnderrideColor(options): + """If color is not in the options, chooses a color. + """ + if 'color' in options: + return options + + # get the current color iterator; if there is none, init one + color_iter = _Brewer.GetIter(5) + + try: + options['color'] = next(color_iter) + except StopIteration: + # if you run out of colors, initialize the color iterator + # and try again + warnings.warn('Ran out of colors. Starting over.') + _Brewer.ClearIter() + _UnderrideColor(options) + + return options + + +def PrePlot(num=None, rows=None, cols=None): + """Takes hints about what's coming. + + num: number of lines that will be plotted + rows: number of rows of subplots + cols: number of columns of subplots + """ + if num: + _Brewer.InitIter(num) + + if rows is None and cols is None: + return + + if rows is not None and cols is None: + cols = 1 + + if cols is not None and rows is None: + rows = 1 + + # resize the image, depending on the number of rows and cols + size_map = {(1, 1): (8, 6), + (1, 2): (12, 6), + (1, 3): (12, 6), + (1, 4): (12, 5), + (1, 5): (12, 4), + (2, 2): (10, 10), + (2, 3): (16, 10), + (3, 1): (8, 10), + (4, 1): (8, 12), + } + + if (rows, cols) in size_map: + fig = plt.gcf() + fig.set_size_inches(*size_map[rows, cols]) + + # create the first subplot + if rows > 1 or cols > 1: + ax = plt.subplot(rows, cols, 1) + global SUBPLOT_ROWS, SUBPLOT_COLS + SUBPLOT_ROWS = rows + SUBPLOT_COLS = cols + else: + ax = plt.gca() + + return ax + + +def SubPlot(plot_number, rows=None, cols=None, **options): + """Configures the number of subplots and changes the current plot. + + rows: int + cols: int + plot_number: int + options: passed to subplot + """ + rows = rows or SUBPLOT_ROWS + cols = cols or SUBPLOT_COLS + return plt.subplot(rows, cols, plot_number, **options) + + +def _Underride(d, **options): + """Add key-value pairs to d only if key is not in d. + + If d is None, create a new dictionary. + + d: dictionary + options: keyword args to add to d + """ + if d is None: + d = {} + + for key, val in options.items(): + d.setdefault(key, val) + + return d + + +def Clf(): + """Clears the figure and any hints that have been set.""" + global LOC + LOC = None + _Brewer.ClearIter() + plt.clf() + fig = plt.gcf() + fig.set_size_inches(8, 6) + + +def Figure(**options): + """Sets options for the current figure.""" + _Underride(options, figsize=(6, 8)) + plt.figure(**options) + + +def Plot(obj, ys=None, style='', **options): + """Plots a line. + + Args: + obj: sequence of x values, or Series, or anything with Render() + ys: sequence of y values + style: style string passed along to plt.plot + options: keyword args passed to plt.plot + """ + options = _UnderrideColor(options) + label = getattr(obj, 'label', '_nolegend_') + options = _Underride(options, linewidth=3, alpha=0.7, label=label) + + xs = obj + if ys is None: + if hasattr(obj, 'Render'): + xs, ys = obj.Render() + if isinstance(obj, pd.Series): + ys = obj.values + xs = obj.index + + if ys is None: + plt.plot(xs, style, **options) + else: + plt.plot(xs, ys, style, **options) + + +def Vlines(xs, y1, y2, **options): + """Plots a set of vertical lines. + + Args: + xs: sequence of x values + y1: sequence of y values + y2: sequence of y values + options: keyword args passed to plt.vlines + """ + options = _UnderrideColor(options) + options = _Underride(options, linewidth=1, alpha=0.5) + plt.vlines(xs, y1, y2, **options) + + +def Hlines(ys, x1, x2, **options): + """Plots a set of horizontal lines. + + Args: + ys: sequence of y values + x1: sequence of x values + x2: sequence of x values + options: keyword args passed to plt.vlines + """ + options = _UnderrideColor(options) + options = _Underride(options, linewidth=1, alpha=0.5) + plt.hlines(ys, x1, x2, **options) + + +def axvline(x, **options): + """Plots a vertical line. + + Args: + x: x location + options: keyword args passed to plt.axvline + """ + options = _UnderrideColor(options) + options = _Underride(options, linewidth=1, alpha=0.5) + plt.axvline(x, **options) + + +def axhline(y, **options): + """Plots a horizontal line. + + Args: + y: y location + options: keyword args passed to plt.axhline + """ + options = _UnderrideColor(options) + options = _Underride(options, linewidth=1, alpha=0.5) + plt.axhline(y, **options) + + +def tight_layout(**options): + """Adjust subplots to minimize padding and margins. + """ + options = _Underride(options, + wspace=0.1, hspace=0.1, + left=0, right=1, + bottom=0, top=1) + plt.tight_layout() + plt.subplots_adjust(**options) + + +def FillBetween(xs, y1, y2=None, where=None, **options): + """Fills the space between two lines. + + Args: + xs: sequence of x values + y1: sequence of y values + y2: sequence of y values + where: sequence of boolean + options: keyword args passed to plt.fill_between + """ + options = _UnderrideColor(options) + options = _Underride(options, linewidth=0, alpha=0.5) + plt.fill_between(xs, y1, y2, where, **options) + + +def Bar(xs, ys, **options): + """Plots a line. + + Args: + xs: sequence of x values + ys: sequence of y values + options: keyword args passed to plt.bar + """ + options = _UnderrideColor(options) + options = _Underride(options, linewidth=0, alpha=0.6) + plt.bar(xs, ys, **options) + + +def Scatter(xs, ys=None, **options): + """Makes a scatter plot. + + xs: x values + ys: y values + options: options passed to plt.scatter + """ + options = _Underride(options, color='blue', alpha=0.2, + s=30, edgecolors='none') + + if ys is None and isinstance(xs, pd.Series): + ys = xs.values + xs = xs.index + + plt.scatter(xs, ys, **options) + + +def HexBin(xs, ys, **options): + """Makes a scatter plot. + + xs: x values + ys: y values + options: options passed to plt.scatter + """ + options = _Underride(options, cmap=matplotlib.cm.Blues) + plt.hexbin(xs, ys, **options) + + +def Pdf(pdf, **options): + """Plots a Pdf, Pmf, or Hist as a line. + + Args: + pdf: Pdf, Pmf, or Hist object + options: keyword args passed to plt.plot + """ + low, high = options.pop('low', None), options.pop('high', None) + n = options.pop('n', 101) + xs, ps = pdf.Render(low=low, high=high, n=n) + options = _Underride(options, label=pdf.label) + Plot(xs, ps, **options) + + +def Pdfs(pdfs, **options): + """Plots a sequence of PDFs. + + Options are passed along for all PDFs. If you want different + options for each pdf, make multiple calls to Pdf. + + Args: + pdfs: sequence of PDF objects + options: keyword args passed to plt.plot + """ + for pdf in pdfs: + Pdf(pdf, **options) + + +def Hist(hist, **options): + """Plots a Pmf or Hist with a bar plot. + + The default width of the bars is based on the minimum difference + between values in the Hist. If that's too small, you can override + it by providing a width keyword argument, in the same units + as the values. + + Args: + hist: Hist or Pmf object + options: keyword args passed to plt.bar + """ + # find the minimum distance between adjacent values + xs, ys = hist.Render() + + # see if the values support arithmetic + try: + xs[0] - xs[0] + except TypeError: + # if not, replace values with numbers + labels = [str(x) for x in xs] + xs = np.arange(len(xs)) + plt.xticks(xs+0.5, labels) + + if 'width' not in options: + try: + options['width'] = 0.9 * np.diff(xs).min() + except TypeError: + warnings.warn("Hist: Can't compute bar width automatically." + "Check for non-numeric types in Hist." + "Or try providing width option." + ) + + options = _Underride(options, label=hist.label) + options = _Underride(options, align='center') + if options['align'] == 'left': + options['align'] = 'edge' + elif options['align'] == 'right': + options['align'] = 'edge' + options['width'] *= -1 + + Bar(xs, ys, **options) + + +def Hists(hists, **options): + """Plots two histograms as interleaved bar plots. + + Options are passed along for all PMFs. If you want different + options for each pmf, make multiple calls to Pmf. + + Args: + hists: list of two Hist or Pmf objects + options: keyword args passed to plt.plot + """ + for hist in hists: + Hist(hist, **options) + + +def Pmf(pmf, **options): + """Plots a Pmf or Hist as a line. + + Args: + pmf: Hist or Pmf object + options: keyword args passed to plt.plot + """ + xs, ys = pmf.Render() + low, high = min(xs), max(xs) + + width = options.pop('width', None) + if width is None: + try: + width = np.diff(xs).min() + except TypeError: + warnings.warn("Pmf: Can't compute bar width automatically." + "Check for non-numeric types in Pmf." + "Or try providing width option.") + points = [] + + lastx = np.nan + lasty = 0 + for x, y in zip(xs, ys): + if (x - lastx) > 1e-5: + points.append((lastx, 0)) + points.append((x, 0)) + + points.append((x, lasty)) + points.append((x, y)) + points.append((x+width, y)) + + lastx = x + width + lasty = y + points.append((lastx, 0)) + pxs, pys = zip(*points) + + align = options.pop('align', 'center') + if align == 'center': + pxs = np.array(pxs) - width/2.0 + if align == 'right': + pxs = np.array(pxs) - width + + options = _Underride(options, label=pmf.label) + Plot(pxs, pys, **options) + + +def Pmfs(pmfs, **options): + """Plots a sequence of PMFs. + + Options are passed along for all PMFs. If you want different + options for each pmf, make multiple calls to Pmf. + + Args: + pmfs: sequence of PMF objects + options: keyword args passed to plt.plot + """ + for pmf in pmfs: + Pmf(pmf, **options) + + +def Diff(t): + """Compute the differences between adjacent elements in a sequence. + + Args: + t: sequence of number + + Returns: + sequence of differences (length one less than t) + """ + diffs = [t[i+1] - t[i] for i in range(len(t)-1)] + return diffs + + +def Cdf(cdf, complement=False, transform=None, **options): + """Plots a CDF as a line. + + Args: + cdf: Cdf object + complement: boolean, whether to plot the complementary CDF + transform: string, one of 'exponential', 'pareto', 'weibull', 'gumbel' + options: keyword args passed to plt.plot + + Returns: + dictionary with the scale options that should be passed to + Config, Show or Save. + """ + xs, ps = cdf.Render() + xs = np.asarray(xs) + ps = np.asarray(ps) + + scale = dict(xscale='linear', yscale='linear') + + for s in ['xscale', 'yscale']: + if s in options: + scale[s] = options.pop(s) + + if transform == 'exponential': + complement = True + scale['yscale'] = 'log' + + if transform == 'pareto': + complement = True + scale['yscale'] = 'log' + scale['xscale'] = 'log' + + if complement: + ps = [1.0-p for p in ps] + + if transform == 'weibull': + xs = np.delete(xs, -1) + ps = np.delete(ps, -1) + ps = [-math.log(1.0-p) for p in ps] + scale['xscale'] = 'log' + scale['yscale'] = 'log' + + if transform == 'gumbel': + xs = np.delete(xs, 0) + ps = np.delete(ps, 0) + ps = [-math.log(p) for p in ps] + scale['yscale'] = 'log' + + options = _Underride(options, label=cdf.label) + Plot(xs, ps, **options) + return scale + + +def Cdfs(cdfs, complement=False, transform=None, **options): + """Plots a sequence of CDFs. + + cdfs: sequence of CDF objects + complement: boolean, whether to plot the complementary CDF + transform: string, one of 'exponential', 'pareto', 'weibull', 'gumbel' + options: keyword args passed to plt.plot + """ + for cdf in cdfs: + Cdf(cdf, complement, transform, **options) + + +def Contour(obj, pcolor=False, contour=True, imshow=False, **options): + """Makes a contour plot. + + d: map from (x, y) to z, or object that provides GetDict + pcolor: boolean, whether to make a pseudocolor plot + contour: boolean, whether to make a contour plot + imshow: boolean, whether to use plt.imshow + options: keyword args passed to plt.pcolor and/or plt.contour + """ + try: + d = obj.GetDict() + except AttributeError: + d = obj + + _Underride(options, linewidth=3, cmap=matplotlib.cm.Blues) + + xs, ys = zip(*d.keys()) + xs = sorted(set(xs)) + ys = sorted(set(ys)) + + X, Y = np.meshgrid(xs, ys) + func = lambda x, y: d.get((x, y), 0) + func = np.vectorize(func) + Z = func(X, Y) + + x_formatter = matplotlib.ticker.ScalarFormatter(useOffset=False) + axes = plt.gca() + axes.xaxis.set_major_formatter(x_formatter) + + if pcolor: + plt.pcolormesh(X, Y, Z, **options) + if contour: + cs = plt.contour(X, Y, Z, **options) + plt.clabel(cs, inline=1, fontsize=10) + if imshow: + extent = xs[0], xs[-1], ys[0], ys[-1] + plt.imshow(Z, extent=extent, **options) + + +def Pcolor(xs, ys, zs, pcolor=True, contour=False, **options): + """Makes a pseudocolor plot. + + xs: + ys: + zs: + pcolor: boolean, whether to make a pseudocolor plot + contour: boolean, whether to make a contour plot + options: keyword args passed to plt.pcolor and/or plt.contour + """ + _Underride(options, linewidth=3, cmap=matplotlib.cm.Blues) + + X, Y = np.meshgrid(xs, ys) + Z = zs + + x_formatter = matplotlib.ticker.ScalarFormatter(useOffset=False) + axes = plt.gca() + axes.xaxis.set_major_formatter(x_formatter) + + if pcolor: + plt.pcolormesh(X, Y, Z, **options) + + if contour: + cs = plt.contour(X, Y, Z, **options) + plt.clabel(cs, inline=1, fontsize=10) + + +def Text(x, y, s, **options): + """Puts text in a figure. + + x: number + y: number + s: string + options: keyword args passed to plt.text + """ + options = _Underride(options, + fontsize=16, + verticalalignment='top', + horizontalalignment='left') + plt.text(x, y, s, **options) + + +LEGEND = True +LOC = None + +def Config(**options): + """Configures the plot. + + Pulls options out of the option dictionary and passes them to + the corresponding plt functions. + """ + names = ['title', 'xlabel', 'ylabel', 'xscale', 'yscale', + 'xticks', 'yticks', 'axis', 'xlim', 'ylim'] + + for name in names: + if name in options: + getattr(plt, name)(options[name]) + + global LEGEND + LEGEND = options.get('legend', LEGEND) + + # see if there are any elements with labels; + # if not, don't draw a legend + ax = plt.gca() + handles, labels = ax.get_legend_handles_labels() + + if LEGEND and len(labels) > 0: + global LOC + LOC = options.get('loc', LOC) + frameon = options.get('frameon', True) + + try: + plt.legend(loc=LOC, frameon=frameon) + except UserWarning: + pass + + # x and y ticklabels can be made invisible + val = options.get('xticklabels', None) + if val is not None: + if val == 'invisible': + ax = plt.gca() + labels = ax.get_xticklabels() + plt.setp(labels, visible=False) + + val = options.get('yticklabels', None) + if val is not None: + if val == 'invisible': + ax = plt.gca() + labels = ax.get_yticklabels() + plt.setp(labels, visible=False) + +def set_font_size(title_size=16, label_size=16, ticklabel_size=14, legend_size=14): + """Set font sizes for the title, labels, ticklabels, and legend. + """ + def set_text_size(texts, size): + for text in texts: + text.set_size(size) + + ax = plt.gca() + + # TODO: Make this function more robust if any of these elements + # is missing. + + # title + ax.title.set_size(title_size) + + # x axis + ax.xaxis.label.set_size(label_size) + set_text_size(ax.xaxis.get_ticklabels(), ticklabel_size) + + # y axis + ax.yaxis.label.set_size(label_size) + set_text_size(ax.yaxis.get_ticklabels(), ticklabel_size) + + # legend + legend = ax.get_legend() + if legend is not None: + set_text_size(legend.texts, legend_size) + + +def bigger_text(): + sizes = dict(title_size=16, label_size=16, ticklabel_size=14, legend_size=14) + set_font_size(**sizes) + + +def Show(**options): + """Shows the plot. + + For options, see Config. + + options: keyword args used to invoke various plt functions + """ + clf = options.pop('clf', True) + Config(**options) + plt.show() + if clf: + Clf() + + +def Plotly(**options): + """Shows the plot. + + For options, see Config. + + options: keyword args used to invoke various plt functions + """ + clf = options.pop('clf', True) + Config(**options) + import plotly.plotly as plotly + url = plotly.plot_mpl(plt.gcf()) + if clf: + Clf() + return url + + +def Save(root=None, formats=None, **options): + """Saves the plot in the given formats and clears the figure. + + For options, see Config. + + Note: With a capital S, this is the original save, maintained for + compatibility. New code should use save(), which works better + with my newer code, especially in Jupyter notebooks. + + Args: + root: string filename root + formats: list of string formats + options: keyword args used to invoke various plt functions + """ + clf = options.pop('clf', True) + + save_options = {} + for option in ['bbox_inches', 'pad_inches']: + if option in options: + save_options[option] = options.pop(option) + + # TODO: falling Config inside Save was probably a mistake, but removing + # it will require some work + Config(**options) + + if formats is None: + formats = ['pdf', 'png'] + + try: + formats.remove('plotly') + Plotly(clf=False) + except ValueError: + pass + + if root: + for fmt in formats: + SaveFormat(root, fmt, **save_options) + if clf: + Clf() + + +def save(root, formats=None, **options): + """Saves the plot in the given formats and clears the figure. + + For options, see plt.savefig. + + Args: + root: string filename root + formats: list of string formats + options: keyword args passed to plt.savefig + """ + if formats is None: + formats = ['pdf', 'png'] + + try: + formats.remove('plotly') + Plotly(clf=False) + except ValueError: + pass + + for fmt in formats: + SaveFormat(root, fmt, **options) + + +def SaveFormat(root, fmt='eps', **options): + """Writes the current figure to a file in the given format. + + Args: + root: string filename root + fmt: string format + """ + _Underride(options, dpi=300) + filename = '%s.%s' % (root, fmt) + print('Writing', filename) + plt.savefig(filename, format=fmt, **options) + + +# provide aliases for calling functions with lower-case names +preplot = PrePlot +subplot = SubPlot +clf = Clf +figure = Figure +plot = Plot +vlines = Vlines +hlines = Hlines +fill_between = FillBetween +text = Text +scatter = Scatter +pmf = Pmf +pmfs = Pmfs +hist = Hist +hists = Hists +diff = Diff +cdf = Cdf +cdfs = Cdfs +contour = Contour +pcolor = Pcolor +config = Config +show = Show + + +def main(): + color_iter = _Brewer.ColorGenerator(7) + for color in color_iter: + print(color) + + +if __name__ == '__main__': + main() diff --git a/thinkstats2.py b/thinkstats2.py new file mode 100644 index 0000000..42c2bbc --- /dev/null +++ b/thinkstats2.py @@ -0,0 +1,3041 @@ +"""This file contains code for use with "Think Stats" and +"Think Bayes", both by Allen B. Downey, available from greenteapress.com + +Copyright 2014 Allen B. Downey +License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html +""" + +from __future__ import print_function, division + +"""This file contains class definitions for: + +Hist: represents a histogram (map from values to integer frequencies). + +Pmf: represents a probability mass function (map from values to probs). + +_DictWrapper: private parent class for Hist and Pmf. + +Cdf: represents a discrete cumulative distribution function + +Pdf: represents a continuous probability density function + +""" + +import bisect +import copy +import logging +import math +import random +import re + +from collections import Counter +from operator import itemgetter + +import thinkplot + +import numpy as np +import pandas + +import scipy +from scipy import stats +from scipy import special +from scipy import ndimage + +from scipy.special import gamma + +from io import open + +ROOT2 = math.sqrt(2) + +def RandomSeed(x): + """Initialize the random and np.random generators. + + x: int seed + """ + random.seed(x) + np.random.seed(x) + + +def Odds(p): + """Computes odds for a given probability. + + Example: p=0.75 means 75 for and 25 against, or 3:1 odds in favor. + + Note: when p=1, the formula for odds divides by zero, which is + normally undefined. But I think it is reasonable to define Odds(1) + to be infinity, so that's what this function does. + + p: float 0-1 + + Returns: float odds + """ + if p == 1: + return float('inf') + return p / (1 - p) + + +def Probability(o): + """Computes the probability corresponding to given odds. + + Example: o=2 means 2:1 odds in favor, or 2/3 probability + + o: float odds, strictly positive + + Returns: float probability + """ + return o / (o + 1) + + +def Probability2(yes, no): + """Computes the probability corresponding to given odds. + + Example: yes=2, no=1 means 2:1 odds in favor, or 2/3 probability. + + yes, no: int or float odds in favor + """ + return yes / (yes + no) + + +class Interpolator(object): + """Represents a mapping between sorted sequences; performs linear interp. + + Attributes: + xs: sorted list + ys: sorted list + """ + + def __init__(self, xs, ys): + self.xs = xs + self.ys = ys + + def Lookup(self, x): + """Looks up x and returns the corresponding value of y.""" + return self._Bisect(x, self.xs, self.ys) + + def Reverse(self, y): + """Looks up y and returns the corresponding value of x.""" + return self._Bisect(y, self.ys, self.xs) + + def _Bisect(self, x, xs, ys): + """Helper function.""" + if x <= xs[0]: + return ys[0] + if x >= xs[-1]: + return ys[-1] + i = bisect.bisect(xs, x) + frac = 1.0 * (x - xs[i - 1]) / (xs[i] - xs[i - 1]) + y = ys[i - 1] + frac * 1.0 * (ys[i] - ys[i - 1]) + return y + + +# When we plot Hist, Pmf and Cdf objects, they don't appear in +# the legend unless we override the default label. +DEFAULT_LABEL = '_nolegend_' + + +class _DictWrapper(object): + """An object that contains a dictionary.""" + + def __init__(self, obj=None, label=None): + """Initializes the distribution. + + obj: Hist, Pmf, Cdf, Pdf, dict, pandas Series, list of pairs + label: string label + """ + self.label = label if label is not None else DEFAULT_LABEL + self.d = {} + + # flag whether the distribution is under a log transform + self.log = False + + if obj is None: + return + + if isinstance(obj, (_DictWrapper, Cdf, Pdf)): + self.label = label if label is not None else obj.label + + if isinstance(obj, dict): + self.d.update(obj.items()) + elif isinstance(obj, (_DictWrapper, Cdf, Pdf)): + self.d.update(obj.Items()) + elif isinstance(obj, pandas.Series): + self.d.update(obj.value_counts().iteritems()) + else: + # finally, treat it like a list + self.d.update(Counter(obj)) + + if len(self) > 0 and isinstance(self, Pmf): + self.Normalize() + + def __hash__(self): + return id(self) + + def __str__(self): + cls = self.__class__.__name__ + if self.label == DEFAULT_LABEL: + return '%s(%s)' % (cls, str(self.d)) + else: + return self.label + + def __repr__(self): + cls = self.__class__.__name__ + if self.label == DEFAULT_LABEL: + return '%s(%s)' % (cls, repr(self.d)) + else: + return '%s(%s, %s)' % (cls, repr(self.d), repr(self.label)) + + def __eq__(self, other): + try: + return self.d == other.d + except AttributeError: + return False + + def __len__(self): + return len(self.d) + + def __iter__(self): + return iter(self.d) + + def iterkeys(self): + """Returns an iterator over keys.""" + return iter(self.d) + + def __contains__(self, value): + return value in self.d + + def __getitem__(self, value): + return self.d.get(value, 0) + + def __setitem__(self, value, prob): + self.d[value] = prob + + def __delitem__(self, value): + del self.d[value] + + def Copy(self, label=None): + """Returns a copy. + + Make a shallow copy of d. If you want a deep copy of d, + use copy.deepcopy on the whole object. + + label: string label for the new Hist + + returns: new _DictWrapper with the same type + """ + new = copy.copy(self) + new.d = copy.copy(self.d) + new.label = label if label is not None else self.label + return new + + def Scale(self, factor): + """Multiplies the values by a factor. + + factor: what to multiply by + + Returns: new object + """ + new = self.Copy() + new.d.clear() + + for val, prob in self.Items(): + new.Set(val * factor, prob) + return new + + def Log(self, m=None): + """Log transforms the probabilities. + + Removes values with probability 0. + + Normalizes so that the largest logprob is 0. + """ + if self.log: + raise ValueError("Pmf/Hist already under a log transform") + self.log = True + + if m is None: + m = self.MaxLike() + + for x, p in self.d.items(): + if p: + self.Set(x, math.log(p / m)) + else: + self.Remove(x) + + def Exp(self, m=None): + """Exponentiates the probabilities. + + m: how much to shift the ps before exponentiating + + If m is None, normalizes so that the largest prob is 1. + """ + if not self.log: + raise ValueError("Pmf/Hist not under a log transform") + self.log = False + + if m is None: + m = self.MaxLike() + + for x, p in self.d.items(): + self.Set(x, math.exp(p - m)) + + def GetDict(self): + """Gets the dictionary.""" + return self.d + + def SetDict(self, d): + """Sets the dictionary.""" + self.d = d + + def Values(self): + """Gets an unsorted sequence of values. + + Note: one source of confusion is that the keys of this + dictionary are the values of the Hist/Pmf, and the + values of the dictionary are frequencies/probabilities. + """ + return self.d.keys() + + def Items(self): + """Gets an unsorted sequence of (value, freq/prob) pairs.""" + return self.d.items() + + def SortedItems(self): + """Gets a sorted sequence of (value, freq/prob) pairs. + + It items are unsortable, the result is unsorted. + """ + def isnan(x): + try: + return math.isnan(x) + except TypeError: + return False + + if any([isnan(x) for x in self.Values()]): + msg = 'Keys contain NaN, may not sort correctly.' + logging.warning(msg) + + try: + return sorted(self.d.items()) + except TypeError: + return self.d.items() + + def Render(self, **options): + """Generates a sequence of points suitable for plotting. + + Note: options are ignored + + Returns: + tuple of (sorted value sequence, freq/prob sequence) + """ + return zip(*self.SortedItems()) + + def MakeCdf(self, label=None): + """Makes a Cdf.""" + label = label if label is not None else self.label + return Cdf(self, label=label) + + def Print(self): + """Prints the values and freqs/probs in ascending order.""" + for val, prob in self.SortedItems(): + print(val, prob) + + def Set(self, x, y=0): + """Sets the freq/prob associated with the value x. + + Args: + x: number value + y: number freq or prob + """ + self.d[x] = y + + def Incr(self, x, term=1): + """Increments the freq/prob associated with the value x. + + Args: + x: number value + term: how much to increment by + """ + self.d[x] = self.d.get(x, 0) + term + + def Mult(self, x, factor): + """Scales the freq/prob associated with the value x. + + Args: + x: number value + factor: how much to multiply by + """ + self.d[x] = self.d.get(x, 0) * factor + + def Remove(self, x): + """Removes a value. + + Throws an exception if the value is not there. + + Args: + x: value to remove + """ + del self.d[x] + + def Total(self): + """Returns the total of the frequencies/probabilities in the map.""" + total = sum(self.d.values()) + return total + + def MaxLike(self): + """Returns the largest frequency/probability in the map.""" + return max(self.d.values()) + + def Largest(self, n=10): + """Returns the largest n values, with frequency/probability. + + n: number of items to return + """ + return sorted(self.d.items(), reverse=True)[:n] + + def Smallest(self, n=10): + """Returns the smallest n values, with frequency/probability. + + n: number of items to return + """ + return sorted(self.d.items(), reverse=False)[:n] + + +class Hist(_DictWrapper): + """Represents a histogram, which is a map from values to frequencies. + + Values can be any hashable type; frequencies are integer counters. + """ + def Freq(self, x): + """Gets the frequency associated with the value x. + + Args: + x: number value + + Returns: + int frequency + """ + return self.d.get(x, 0) + + def Freqs(self, xs): + """Gets frequencies for a sequence of values.""" + return [self.Freq(x) for x in xs] + + def IsSubset(self, other): + """Checks whether the values in this histogram are a subset of + the values in the given histogram.""" + for val, freq in self.Items(): + if freq > other.Freq(val): + return False + return True + + def Subtract(self, other): + """Subtracts the values in the given histogram from this histogram.""" + for val, freq in other.Items(): + self.Incr(val, -freq) + + +class Pmf(_DictWrapper): + """Represents a probability mass function. + + Values can be any hashable type; probabilities are floating-point. + Pmfs are not necessarily normalized. + """ + + def Prob(self, x, default=0): + """Gets the probability associated with the value x. + + Args: + x: number value + default: value to return if the key is not there + + Returns: + float probability + """ + return self.d.get(x, default) + + def Probs(self, xs): + """Gets probabilities for a sequence of values.""" + return [self.Prob(x) for x in xs] + + def Percentile(self, percentage): + """Computes a percentile of a given Pmf. + + Note: this is not super efficient. If you are planning + to compute more than a few percentiles, compute the Cdf. + + percentage: float 0-100 + + returns: value from the Pmf + """ + p = percentage / 100 + total = 0 + for val, prob in sorted(self.Items()): + total += prob + if total >= p: + return val + + def ProbGreater(self, x): + """Probability that a sample from this Pmf exceeds x. + + x: number + + returns: float probability + """ + if isinstance(x, _DictWrapper): + return PmfProbGreater(self, x) + else: + t = [prob for (val, prob) in self.d.items() if val > x] + return sum(t) + + def ProbLess(self, x): + """Probability that a sample from this Pmf is less than x. + + x: number + + returns: float probability + """ + if isinstance(x, _DictWrapper): + return PmfProbLess(self, x) + else: + t = [prob for (val, prob) in self.d.items() if val < x] + return sum(t) + + def ProbEqual(self, x): + """Probability that a sample from this Pmf is exactly x. + + x: number + + returns: float probability + """ + if isinstance(x, _DictWrapper): + return PmfProbEqual(self, x) + else: + return self[x] + + # NOTE: I've decided to remove the magic comparators because they + # have the side-effect of making Pmf sortable, but in fact they + # don't support sorting. + + def Normalize(self, fraction=1): + """Normalizes this PMF so the sum of all probs is fraction. + + Args: + fraction: what the total should be after normalization + + Returns: the total probability before normalizing + """ + if self.log: + raise ValueError("Normalize: Pmf is under a log transform") + + total = self.Total() + if total == 0: + raise ValueError('Normalize: total probability is zero.') + + factor = fraction / total + for x in self.d: + self.d[x] *= factor + + return total + + def Random(self): + """Chooses a random element from this PMF. + + Note: this is not very efficient. If you plan to call + this more than a few times, consider converting to a CDF. + + Returns: + float value from the Pmf + """ + target = random.random() + total = 0 + for x, p in self.d.items(): + total += p + if total >= target: + return x + + # we shouldn't get here + raise ValueError('Random: Pmf might not be normalized.') + + def Sample(self, n): + """Generates a random sample from this distribution. + + n: int length of the sample + returns: NumPy array + """ + return self.MakeCdf().Sample(n) + + def Mean(self): + """Computes the mean of a PMF. + + Returns: + float mean + """ + return sum(p * x for x, p in self.Items()) + + def Median(self): + """Computes the median of a PMF. + + Returns: + float median + """ + return self.MakeCdf().Percentile(50) + + def Var(self, mu=None): + """Computes the variance of a PMF. + + mu: the point around which the variance is computed; + if omitted, computes the mean + + returns: float variance + """ + if mu is None: + mu = self.Mean() + + return sum(p * (x-mu)**2 for x, p in self.Items()) + + def Expect(self, func): + """Computes the expectation of func(x). + + Returns: + expectation + """ + return np.sum(p * func(x) for x, p in self.Items()) + + def Std(self, mu=None): + """Computes the standard deviation of a PMF. + + mu: the point around which the variance is computed; + if omitted, computes the mean + + returns: float standard deviation + """ + var = self.Var(mu) + return math.sqrt(var) + + def Mode(self): + """Returns the value with the highest probability. + + Returns: float probability + """ + _, val = max((prob, val) for val, prob in self.Items()) + return val + + # The mode of a posterior is the maximum aposteori probability (MAP) + MAP = Mode + + # If the distribution contains likelihoods only, the peak is the + # maximum likelihood estimator. + MaximumLikelihood = Mode + + def CredibleInterval(self, percentage=90): + """Computes the central credible interval. + + If percentage=90, computes the 90% CI. + + Args: + percentage: float between 0 and 100 + + Returns: + sequence of two floats, low and high + """ + cdf = self.MakeCdf() + return cdf.CredibleInterval(percentage) + + def __add__(self, other): + """Computes the Pmf of the sum of values drawn from self and other. + + other: another Pmf or a scalar + + returns: new Pmf + """ + try: + return self.AddPmf(other) + except AttributeError: + return self.AddConstant(other) + + __radd__ = __add__ + + def AddPmf(self, other): + """Computes the Pmf of the sum of values drawn from self and other. + + other: another Pmf + + returns: new Pmf + """ + pmf = Pmf() + for v1, p1 in self.Items(): + for v2, p2 in other.Items(): + pmf[v1 + v2] += p1 * p2 + return pmf + + def AddConstant(self, other): + """Computes the Pmf of the sum a constant and values from self. + + other: a number + + returns: new Pmf + """ + if other == 0: + return self.Copy() + + pmf = Pmf() + for v1, p1 in self.Items(): + pmf.Set(v1 + other, p1) + return pmf + + def __sub__(self, other): + """Computes the Pmf of the diff of values drawn from self and other. + + other: another Pmf + + returns: new Pmf + """ + try: + return self.SubPmf(other) + except AttributeError: + return self.AddConstant(-other) + + def SubPmf(self, other): + """Computes the Pmf of the diff of values drawn from self and other. + + other: another Pmf + + returns: new Pmf + """ + pmf = Pmf() + for v1, p1 in self.Items(): + for v2, p2 in other.Items(): + pmf.Incr(v1 - v2, p1 * p2) + return pmf + + def __mul__(self, other): + """Computes the Pmf of the product of values drawn from self and other. + + other: another Pmf + + returns: new Pmf + """ + try: + return self.MulPmf(other) + except AttributeError: + return self.MulConstant(other) + + def MulPmf(self, other): + """Computes the Pmf of the diff of values drawn from self and other. + + other: another Pmf + + returns: new Pmf + """ + pmf = Pmf() + for v1, p1 in self.Items(): + for v2, p2 in other.Items(): + pmf.Incr(v1 * v2, p1 * p2) + return pmf + + def MulConstant(self, other): + """Computes the Pmf of the product of a constant and values from self. + + other: a number + + returns: new Pmf + """ + pmf = Pmf() + for v1, p1 in self.Items(): + pmf.Set(v1 * other, p1) + return pmf + + def __div__(self, other): + """Computes the Pmf of the ratio of values drawn from self and other. + + other: another Pmf + + returns: new Pmf + """ + try: + return self.DivPmf(other) + except AttributeError: + return self.MulConstant(1/other) + + __truediv__ = __div__ + + def DivPmf(self, other): + """Computes the Pmf of the ratio of values drawn from self and other. + + other: another Pmf + + returns: new Pmf + """ + pmf = Pmf() + for v1, p1 in self.Items(): + for v2, p2 in other.Items(): + pmf.Incr(v1 / v2, p1 * p2) + return pmf + + def Max(self, k): + """Computes the CDF of the maximum of k selections from this dist. + + k: int + + returns: new Cdf + """ + cdf = self.MakeCdf() + cdf.ps **= k + return cdf + + +class Joint(Pmf): + """Represents a joint distribution. + + The values are sequences (usually tuples) + """ + + def Marginal(self, i, label=None): + """Gets the marginal distribution of the indicated variable. + + i: index of the variable we want + + Returns: Pmf + """ + pmf = Pmf(label=label) + for vs, prob in self.Items(): + pmf.Incr(vs[i], prob) + return pmf + + def Conditional(self, i, j, val, label=None): + """Gets the conditional distribution of the indicated variable. + + Distribution of vs[i], conditioned on vs[j] = val. + + i: index of the variable we want + j: which variable is conditioned on + val: the value the jth variable has to have + + Returns: Pmf + """ + pmf = Pmf(label=label) + for vs, prob in self.Items(): + if vs[j] != val: + continue + pmf.Incr(vs[i], prob) + + pmf.Normalize() + return pmf + + def MaxLikeInterval(self, percentage=90): + """Returns the maximum-likelihood credible interval. + + If percentage=90, computes a 90% CI containing the values + with the highest likelihoods. + + percentage: float between 0 and 100 + + Returns: list of values from the suite + """ + interval = [] + total = 0 + + t = [(prob, val) for val, prob in self.Items()] + t.sort(reverse=True) + + for prob, val in t: + interval.append(val) + total += prob + if total >= percentage / 100: + break + + return interval + + +def MakeJoint(pmf1, pmf2): + """Joint distribution of values from pmf1 and pmf2. + + Assumes that the PMFs represent independent random variables. + + Args: + pmf1: Pmf object + pmf2: Pmf object + + Returns: + Joint pmf of value pairs + """ + joint = Joint() + for v1, p1 in pmf1.Items(): + for v2, p2 in pmf2.Items(): + joint.Set((v1, v2), p1 * p2) + return joint + + +def MakeHistFromList(t, label=None): + """Makes a histogram from an unsorted sequence of values. + + Args: + t: sequence of numbers + label: string label for this histogram + + Returns: + Hist object + """ + return Hist(t, label=label) + + +def MakeHistFromDict(d, label=None): + """Makes a histogram from a map from values to frequencies. + + Args: + d: dictionary that maps values to frequencies + label: string label for this histogram + + Returns: + Hist object + """ + return Hist(d, label) + + +def MakePmfFromList(t, label=None): + """Makes a PMF from an unsorted sequence of values. + + Args: + t: sequence of numbers + label: string label for this PMF + + Returns: + Pmf object + """ + return Pmf(t, label=label) + + +def MakePmfFromDict(d, label=None): + """Makes a PMF from a map from values to probabilities. + + Args: + d: dictionary that maps values to probabilities + label: string label for this PMF + + Returns: + Pmf object + """ + return Pmf(d, label=label) + + +def MakePmfFromItems(t, label=None): + """Makes a PMF from a sequence of value-probability pairs + + Args: + t: sequence of value-probability pairs + label: string label for this PMF + + Returns: + Pmf object + """ + return Pmf(dict(t), label=label) + + +def MakePmfFromHist(hist, label=None): + """Makes a normalized PMF from a Hist object. + + Args: + hist: Hist object + label: string label + + Returns: + Pmf object + """ + if label is None: + label = hist.label + + return Pmf(hist, label=label) + + +def MakeMixture(metapmf, label='mix'): + """Make a mixture distribution. + + Args: + metapmf: Pmf that maps from Pmfs to probs. + label: string label for the new Pmf. + + Returns: Pmf object. + """ + mix = Pmf(label=label) + for pmf, p1 in metapmf.Items(): + for x, p2 in pmf.Items(): + mix[x] += p1 * p2 + return mix + + +def MakeUniformPmf(low, high, n): + """Make a uniform Pmf. + + low: lowest value (inclusive) + high: highest value (inclusize) + n: number of values + """ + pmf = Pmf() + for x in np.linspace(low, high, n): + pmf.Set(x, 1) + pmf.Normalize() + return pmf + + +class Cdf: + """Represents a cumulative distribution function. + + Attributes: + xs: sequence of values + ps: sequence of probabilities + label: string used as a graph label. + """ + def __init__(self, obj=None, ps=None, label=None): + """Initializes. + + If ps is provided, obj must be the corresponding list of values. + + obj: Hist, Pmf, Cdf, Pdf, dict, pandas Series, list of pairs + ps: list of cumulative probabilities + label: string label + """ + self.label = label if label is not None else DEFAULT_LABEL + + if isinstance(obj, (_DictWrapper, Cdf, Pdf)): + if not label: + self.label = label if label is not None else obj.label + + if obj is None: + # caller does not provide obj, make an empty Cdf + self.xs = np.asarray([]) + self.ps = np.asarray([]) + if ps is not None: + logging.warning("Cdf: can't pass ps without also passing xs.") + return + else: + # if the caller provides xs and ps, just store them + if ps is not None: + if isinstance(ps, str): + logging.warning("Cdf: ps can't be a string") + + self.xs = np.asarray(obj) + self.ps = np.asarray(ps) + return + + # caller has provided just obj, not ps + if isinstance(obj, Cdf): + self.xs = copy.copy(obj.xs) + self.ps = copy.copy(obj.ps) + return + + if isinstance(obj, _DictWrapper): + dw = obj + else: + dw = Hist(obj) + + if len(dw) == 0: + self.xs = np.asarray([]) + self.ps = np.asarray([]) + return + + xs, freqs = zip(*sorted(dw.Items())) + self.xs = np.asarray(xs) + self.ps = np.cumsum(freqs, dtype=np.float) + self.ps /= self.ps[-1] + + def __str__(self): + cls = self.__class__.__name__ + if self.label == DEFAULT_LABEL: + return '%s(%s, %s)' % (cls, str(self.xs), str(self.ps)) + else: + return self.label + + def __repr__(self): + cls = self.__class__.__name__ + if self.label == DEFAULT_LABEL: + return '%s(%s, %s)' % (cls, str(self.xs), str(self.ps)) + else: + return '%s(%s, %s, %s)' % (cls, str(self.xs), str(self.ps), + repr(self.label)) + + def __len__(self): + return len(self.xs) + + def __getitem__(self, x): + return self.Prob(x) + + def __setitem__(self): + raise UnimplementedMethodException() + + def __delitem__(self): + raise UnimplementedMethodException() + + def __eq__(self, other): + return np.all(self.xs == other.xs) and np.all(self.ps == other.ps) + + def Print(self): + """Prints the values and freqs/probs in ascending order.""" + for val, prob in zip(self.xs, self.ps): + print(val, prob) + + def Copy(self, label=None): + """Returns a copy of this Cdf. + + label: string label for the new Cdf + """ + if label is None: + label = self.label + return Cdf(list(self.xs), list(self.ps), label=label) + + def MakePmf(self, label=None): + """Makes a Pmf.""" + if label is None: + label = self.label + return Pmf(self, label=label) + + def Items(self): + """Returns a sorted sequence of (value, probability) pairs. + + Note: in Python3, returns an iterator. + """ + a = self.ps + b = np.roll(a, 1) + b[0] = 0 + return zip(self.xs, a-b) + + def Shift(self, term): + """Adds a term to the xs. + + term: how much to add + """ + new = self.Copy() + # don't use +=, or else an int array + float yields int array + new.xs = new.xs + term + return new + + def Scale(self, factor): + """Multiplies the xs by a factor. + + factor: what to multiply by + """ + new = self.Copy() + # don't use *=, or else an int array * float yields int array + new.xs = new.xs * factor + return new + + def Prob(self, x): + """Returns CDF(x), the probability that corresponds to value x. + + Args: + x: number + + Returns: + float probability + """ + if x < self.xs[0]: + return 0 + index = bisect.bisect(self.xs, x) + p = self.ps[index-1] + return p + + def Probs(self, xs): + """Gets probabilities for a sequence of values. + + xs: any sequence that can be converted to NumPy array + + returns: NumPy array of cumulative probabilities + """ + xs = np.asarray(xs) + index = np.searchsorted(self.xs, xs, side='right') + ps = self.ps[index-1] + ps[xs < self.xs[0]] = 0 + return ps + + ProbArray = Probs + + def Value(self, p): + """Returns InverseCDF(p), the value that corresponds to probability p. + + Args: + p: number in the range [0, 1] + + Returns: + number value + """ + if p < 0 or p > 1: + raise ValueError('Probability p must be in range [0, 1]') + + index = bisect.bisect_left(self.ps, p) + return self.xs[index] + + def Values(self, ps=None): + """Returns InverseCDF(p), the value that corresponds to probability p. + + If ps is not provided, returns all values. + + Args: + ps: NumPy array of numbers in the range [0, 1] + + Returns: + NumPy array of values + """ + if ps is None: + return self.xs + + ps = np.asarray(ps) + if np.any(ps < 0) or np.any(ps > 1): + raise ValueError('Probability p must be in range [0, 1]') + + index = np.searchsorted(self.ps, ps, side='left') + return self.xs[index] + + ValueArray = Values + + def Percentile(self, p): + """Returns the value that corresponds to percentile p. + + Args: + p: number in the range [0, 100] + + Returns: + number value + """ + return self.Value(p / 100) + + def Percentiles(self, ps): + """Returns the value that corresponds to percentiles ps. + + Args: + ps: numbers in the range [0, 100] + + Returns: + array of values + """ + ps = np.asarray(ps) + return self.Values(ps / 100) + + def PercentileRank(self, x): + """Returns the percentile rank of the value x. + + x: potential value in the CDF + + returns: percentile rank in the range 0 to 100 + """ + return self.Prob(x) * 100 + + def PercentileRanks(self, xs): + """Returns the percentile ranks of the values in xs. + + xs: potential value in the CDF + + returns: array of percentile ranks in the range 0 to 100 + """ + return self.Probs(x) * 100 + + def Random(self): + """Chooses a random value from this distribution.""" + return self.Value(random.random()) + + def Sample(self, n): + """Generates a random sample from this distribution. + + n: int length of the sample + returns: NumPy array + """ + ps = np.random.random(n) + return self.ValueArray(ps) + + def Mean(self): + """Computes the mean of a CDF. + + Returns: + float mean + """ + old_p = 0 + total = 0 + for x, new_p in zip(self.xs, self.ps): + p = new_p - old_p + total += p * x + old_p = new_p + return total + + def CredibleInterval(self, percentage=90): + """Computes the central credible interval. + + If percentage=90, computes the 90% CI. + + Args: + percentage: float between 0 and 100 + + Returns: + sequence of two floats, low and high + """ + prob = (1 - percentage / 100) / 2 + interval = self.Value(prob), self.Value(1 - prob) + return interval + + ConfidenceInterval = CredibleInterval + + def _Round(self, multiplier=1000): + """ + An entry is added to the cdf only if the percentile differs + from the previous value in a significant digit, where the number + of significant digits is determined by multiplier. The + default is 1000, which keeps log10(1000) = 3 significant digits. + """ + # TODO(write this method) + raise UnimplementedMethodException() + + def Render(self, **options): + """Generates a sequence of points suitable for plotting. + + An empirical CDF is a step function; linear interpolation + can be misleading. + + Note: options are ignored + + Returns: + tuple of (xs, ps) + """ + def interleave(a, b): + c = np.empty(a.shape[0] + b.shape[0]) + c[::2] = a + c[1::2] = b + return c + + a = np.array(self.xs) + xs = interleave(a, a) + shift_ps = np.roll(self.ps, 1) + shift_ps[0] = 0 + ps = interleave(shift_ps, self.ps) + return xs, ps + + def Max(self, k): + """Computes the CDF of the maximum of k selections from this dist. + + k: int + + returns: new Cdf + """ + cdf = self.Copy() + cdf.ps **= k + return cdf + + +def MakeCdfFromItems(items, label=None): + """Makes a cdf from an unsorted sequence of (value, frequency) pairs. + + Args: + items: unsorted sequence of (value, frequency) pairs + label: string label for this CDF + + Returns: + cdf: list of (value, fraction) pairs + """ + return Cdf(dict(items), label=label) + + +def MakeCdfFromDict(d, label=None): + """Makes a CDF from a dictionary that maps values to frequencies. + + Args: + d: dictionary that maps values to frequencies. + label: string label for the data. + + Returns: + Cdf object + """ + return Cdf(d, label=label) + + +def MakeCdfFromList(seq, label=None): + """Creates a CDF from an unsorted sequence. + + Args: + seq: unsorted sequence of sortable values + label: string label for the cdf + + Returns: + Cdf object + """ + return Cdf(seq, label=label) + + +def MakeCdfFromHist(hist, label=None): + """Makes a CDF from a Hist object. + + Args: + hist: Pmf.Hist object + label: string label for the data. + + Returns: + Cdf object + """ + if label is None: + label = hist.label + + return Cdf(hist, label=label) + + +def MakeCdfFromPmf(pmf, label=None): + """Makes a CDF from a Pmf object. + + Args: + pmf: Pmf.Pmf object + label: string label for the data. + + Returns: + Cdf object + """ + if label is None: + label = pmf.label + + return Cdf(pmf, label=label) + + +class UnimplementedMethodException(Exception): + """Exception if someone calls a method that should be overridden.""" + + +class Suite(Pmf): + """Represents a suite of hypotheses and their probabilities.""" + + def Update(self, data): + """Updates each hypothesis based on the data. + + data: any representation of the data + + returns: the normalizing constant + """ + for hypo in self.Values(): + like = self.Likelihood(data, hypo) + self.Mult(hypo, like) + return self.Normalize() + + def LogUpdate(self, data): + """Updates a suite of hypotheses based on new data. + + Modifies the suite directly; if you want to keep the original, make + a copy. + + Note: unlike Update, LogUpdate does not normalize. + + Args: + data: any representation of the data + """ + for hypo in self.Values(): + like = self.LogLikelihood(data, hypo) + self.Incr(hypo, like) + + def UpdateSet(self, dataset): + """Updates each hypothesis based on the dataset. + + This is more efficient than calling Update repeatedly because + it waits until the end to Normalize. + + Modifies the suite directly; if you want to keep the original, make + a copy. + + dataset: a sequence of data + + returns: the normalizing constant + """ + for data in dataset: + for hypo in self.Values(): + like = self.Likelihood(data, hypo) + self.Mult(hypo, like) + return self.Normalize() + + def LogUpdateSet(self, dataset): + """Updates each hypothesis based on the dataset. + + Modifies the suite directly; if you want to keep the original, make + a copy. + + dataset: a sequence of data + + returns: None + """ + for data in dataset: + self.LogUpdate(data) + + def Likelihood(self, data, hypo): + """Computes the likelihood of the data under the hypothesis. + + hypo: some representation of the hypothesis + data: some representation of the data + """ + raise UnimplementedMethodException() + + def LogLikelihood(self, data, hypo): + """Computes the log likelihood of the data under the hypothesis. + + hypo: some representation of the hypothesis + data: some representation of the data + """ + raise UnimplementedMethodException() + + def Print(self): + """Prints the hypotheses and their probabilities.""" + for hypo, prob in sorted(self.Items()): + print(hypo, prob) + + def MakeOdds(self): + """Transforms from probabilities to odds. + + Values with prob=0 are removed. + """ + for hypo, prob in self.Items(): + if prob: + self.Set(hypo, Odds(prob)) + else: + self.Remove(hypo) + + def MakeProbs(self): + """Transforms from odds to probabilities.""" + for hypo, odds in self.Items(): + self.Set(hypo, Probability(odds)) + + +def MakeSuiteFromList(t, label=None): + """Makes a suite from an unsorted sequence of values. + + Args: + t: sequence of numbers + label: string label for this suite + + Returns: + Suite object + """ + hist = MakeHistFromList(t, label=label) + d = hist.GetDict() + return MakeSuiteFromDict(d) + + +def MakeSuiteFromHist(hist, label=None): + """Makes a normalized suite from a Hist object. + + Args: + hist: Hist object + label: string label + + Returns: + Suite object + """ + if label is None: + label = hist.label + + # make a copy of the dictionary + d = dict(hist.GetDict()) + return MakeSuiteFromDict(d, label) + + +def MakeSuiteFromDict(d, label=None): + """Makes a suite from a map from values to probabilities. + + Args: + d: dictionary that maps values to probabilities + label: string label for this suite + + Returns: + Suite object + """ + suite = Suite(label=label) + suite.SetDict(d) + suite.Normalize() + return suite + + +class Pdf(object): + """Represents a probability density function (PDF).""" + + def Density(self, x): + """Evaluates this Pdf at x. + + Returns: float or NumPy array of probability density + """ + raise UnimplementedMethodException() + + def GetLinspace(self): + """Get a linspace for plotting. + + Not all subclasses of Pdf implement this. + + Returns: numpy array + """ + raise UnimplementedMethodException() + + def MakePmf(self, **options): + """Makes a discrete version of this Pdf. + + options can include + label: string + low: low end of range + high: high end of range + n: number of places to evaluate + + Returns: new Pmf + """ + label = options.pop('label', '') + xs, ds = self.Render(**options) + return Pmf(dict(zip(xs, ds)), label=label) + + def Render(self, **options): + """Generates a sequence of points suitable for plotting. + + If options includes low and high, it must also include n; + in that case the density is evaluated an n locations between + low and high, including both. + + If options includes xs, the density is evaluate at those location. + + Otherwise, self.GetLinspace is invoked to provide the locations. + + Returns: + tuple of (xs, densities) + """ + low, high = options.pop('low', None), options.pop('high', None) + if low is not None and high is not None: + n = options.pop('n', 101) + xs = np.linspace(low, high, n) + else: + xs = options.pop('xs', None) + if xs is None: + xs = self.GetLinspace() + + ds = self.Density(xs) + return xs, ds + + def Items(self): + """Generates a sequence of (value, probability) pairs. + """ + return zip(*self.Render()) + + +class NormalPdf(Pdf): + """Represents the PDF of a Normal distribution.""" + + def __init__(self, mu=0, sigma=1, label=None): + """Constructs a Normal Pdf with given mu and sigma. + + mu: mean + sigma: standard deviation + label: string + """ + self.mu = mu + self.sigma = sigma + self.label = label if label is not None else '_nolegend_' + + def __str__(self): + return 'NormalPdf(%f, %f)' % (self.mu, self.sigma) + + def GetLinspace(self): + """Get a linspace for plotting. + + Returns: numpy array + """ + low, high = self.mu-3*self.sigma, self.mu+3*self.sigma + return np.linspace(low, high, 101) + + def Density(self, xs): + """Evaluates this Pdf at xs. + + xs: scalar or sequence of floats + + returns: float or NumPy array of probability density + """ + return stats.norm.pdf(xs, self.mu, self.sigma) + + +class ExponentialPdf(Pdf): + """Represents the PDF of an exponential distribution.""" + + def __init__(self, lam=1, label=None): + """Constructs an exponential Pdf with given parameter. + + lam: rate parameter + label: string + """ + self.lam = lam + self.label = label if label is not None else '_nolegend_' + + def __str__(self): + return 'ExponentialPdf(%f)' % (self.lam) + + def GetLinspace(self): + """Get a linspace for plotting. + + Returns: numpy array + """ + low, high = 0, 5.0/self.lam + return np.linspace(low, high, 101) + + def Density(self, xs): + """Evaluates this Pdf at xs. + + xs: scalar or sequence of floats + + returns: float or NumPy array of probability density + """ + return stats.expon.pdf(xs, scale=1.0/self.lam) + + +class EstimatedPdf(Pdf): + """Represents a PDF estimated by KDE.""" + + def __init__(self, sample, label=None): + """Estimates the density function based on a sample. + + sample: sequence of data + label: string + """ + self.label = label if label is not None else '_nolegend_' + self.kde = stats.gaussian_kde(sample) + low = min(sample) + high = max(sample) + self.linspace = np.linspace(low, high, 101) + + def __str__(self): + return 'EstimatedPdf(label=%s)' % str(self.label) + + def GetLinspace(self): + """Get a linspace for plotting. + + Returns: numpy array + """ + return self.linspace + + def Density(self, xs): + """Evaluates this Pdf at xs. + + returns: float or NumPy array of probability density + """ + return self.kde.evaluate(xs) + + def Sample(self, n): + """Generates a random sample from the estimated Pdf. + + n: size of sample + """ + # NOTE: we have to flatten because resample returns a 2-D + # array for some reason. + return self.kde.resample(n).flatten() + + +def CredibleInterval(pmf, percentage=90): + """Computes a credible interval for a given distribution. + + If percentage=90, computes the 90% CI. + + Args: + pmf: Pmf object representing a posterior distribution + percentage: float between 0 and 100 + + Returns: + sequence of two floats, low and high + """ + cdf = pmf.MakeCdf() + prob = (1 - percentage / 100) / 2 + interval = cdf.Value(prob), cdf.Value(1 - prob) + return interval + + +def PmfProbLess(pmf1, pmf2): + """Probability that a value from pmf1 is less than a value from pmf2. + + Args: + pmf1: Pmf object + pmf2: Pmf object + + Returns: + float probability + """ + total = 0 + for v1, p1 in pmf1.Items(): + for v2, p2 in pmf2.Items(): + if v1 < v2: + total += p1 * p2 + return total + + +def PmfProbGreater(pmf1, pmf2): + """Probability that a value from pmf1 is less than a value from pmf2. + + Args: + pmf1: Pmf object + pmf2: Pmf object + + Returns: + float probability + """ + total = 0 + for v1, p1 in pmf1.Items(): + for v2, p2 in pmf2.Items(): + if v1 > v2: + total += p1 * p2 + return total + + +def PmfProbEqual(pmf1, pmf2): + """Probability that a value from pmf1 equals a value from pmf2. + + Args: + pmf1: Pmf object + pmf2: Pmf object + + Returns: + float probability + """ + total = 0 + for v1, p1 in pmf1.Items(): + for v2, p2 in pmf2.Items(): + if v1 == v2: + total += p1 * p2 + return total + + +def RandomSum(dists): + """Chooses a random value from each dist and returns the sum. + + dists: sequence of Pmf or Cdf objects + + returns: numerical sum + """ + total = sum(dist.Random() for dist in dists) + return total + + +def SampleSum(dists, n): + """Draws a sample of sums from a list of distributions. + + dists: sequence of Pmf or Cdf objects + n: sample size + + returns: new Pmf of sums + """ + pmf = Pmf(RandomSum(dists) for i in range(n)) + return pmf + + +def EvalNormalPdf(x, mu, sigma): + """Computes the unnormalized PDF of the normal distribution. + + x: value + mu: mean + sigma: standard deviation + + returns: float probability density + """ + return stats.norm.pdf(x, mu, sigma) + + +def MakeNormalPmf(mu, sigma, num_sigmas, n=201): + """Makes a PMF discrete approx to a Normal distribution. + + mu: float mean + sigma: float standard deviation + num_sigmas: how many sigmas to extend in each direction + n: number of values in the Pmf + + returns: normalized Pmf + """ + pmf = Pmf() + low = mu - num_sigmas * sigma + high = mu + num_sigmas * sigma + + for x in np.linspace(low, high, n): + p = EvalNormalPdf(x, mu, sigma) + pmf.Set(x, p) + pmf.Normalize() + return pmf + + +def EvalBinomialPmf(k, n, p): + """Evaluates the binomial PMF. + + Returns the probabily of k successes in n trials with probability p. + """ + return stats.binom.pmf(k, n, p) + + +def MakeBinomialPmf(n, p): + """Evaluates the binomial PMF. + + Returns the distribution of successes in n trials with probability p. + """ + pmf = Pmf() + for k in range(n+1): + pmf[k] = stats.binom.pmf(k, n, p) + return pmf + + +def EvalGammaPdf(x, a): + """Computes the Gamma PDF. + + x: where to evaluate the PDF + a: parameter of the gamma distribution + + returns: float probability + """ + return x**(a-1) * np.exp(-x) / gamma(a) + + +def MakeGammaPmf(xs, a): + """Makes a PMF discrete approx to a Gamma distribution. + + lam: parameter lambda in events per unit time + xs: upper bound of the Pmf + + returns: normalized Pmf + """ + xs = np.asarray(xs) + ps = EvalGammaPdf(xs, a) + pmf = Pmf(dict(zip(xs, ps))) + pmf.Normalize() + return pmf + + +def EvalGeometricPmf(k, p, loc=0): + """Evaluates the geometric PMF. + + With loc=0: Probability of `k` trials to get one success. + With loc=-1: Probability of `k` trials before first success. + + k: number of trials + p: probability of success on each trial + """ + return stats.geom.pmf(k, p, loc=loc) + + +def MakeGeometricPmf(p, loc=0, high=10): + """Evaluates the binomial PMF. + + With loc=0: PMF of trials to get one success. + With loc=-1: PMF of trials before first success. + + p: probability of success + high: upper bound where PMF is truncated + """ + pmf = Pmf() + for k in range(high): + pmf[k] = stats.geom.pmf(k, p, loc=loc) + pmf.Normalize() + return pmf + + +def EvalHypergeomPmf(k, N, K, n): + """Evaluates the hypergeometric PMF. + + Returns the probabily of k successes in n trials from a population + N with K successes in it. + """ + return stats.hypergeom.pmf(k, N, K, n) + + +def EvalPoissonPmf(k, lam): + """Computes the Poisson PMF. + + k: number of events + lam: parameter lambda in events per unit time + + returns: float probability + """ + return stats.poisson.pmf(k, lam) + + +def MakePoissonPmf(lam, high, step=1): + """Makes a PMF discrete approx to a Poisson distribution. + + lam: parameter lambda in events per unit time + high: upper bound of the Pmf + + returns: normalized Pmf + """ + pmf = Pmf() + for k in range(0, high + 1, step): + p = stats.poisson.pmf(k, lam) + pmf.Set(k, p) + pmf.Normalize() + return pmf + + +def EvalExponentialPdf(x, lam): + """Computes the exponential PDF. + + x: value + lam: parameter lambda in events per unit time + + returns: float probability density + """ + return lam * math.exp(-lam * x) + + +def EvalExponentialCdf(x, lam): + """Evaluates CDF of the exponential distribution with parameter lam.""" + return 1 - math.exp(-lam * x) + + +def MakeExponentialPmf(lam, high, n=200): + """Makes a PMF discrete approx to an exponential distribution. + + lam: parameter lambda in events per unit time + high: upper bound + n: number of values in the Pmf + + returns: normalized Pmf + """ + pmf = Pmf() + for x in np.linspace(0, high, n): + p = EvalExponentialPdf(x, lam) + pmf.Set(x, p) + pmf.Normalize() + return pmf + + +def EvalWeibullPdf(x, lam, k): + """Computes the Weibull PDF. + + x: value + lam: parameter lambda in events per unit time + k: parameter + + returns: float probability density + """ + arg = (x / lam) + return k / lam * arg**(k-1) * np.exp(-arg**k) + + +def EvalWeibullCdf(x, lam, k): + """Evaluates CDF of the Weibull distribution.""" + arg = (x / lam) + return 1 - np.exp(-arg**k) + + +def MakeWeibullPmf(lam, k, high, n=200): + """Makes a PMF discrete approx to a Weibull distribution. + + lam: parameter lambda in events per unit time + k: parameter + high: upper bound + n: number of values in the Pmf + + returns: normalized Pmf + """ + xs = np.linspace(0, high, n) + ps = EvalWeibullPdf(xs, lam, k) + ps[np.isinf(ps)] = 0 + return Pmf(dict(zip(xs, ps))) + + +def EvalParetoPdf(x, xm, alpha): + """Computes the Pareto. + + xm: minimum value (scale parameter) + alpha: shape parameter + + returns: float probability density + """ + return stats.pareto.pdf(x, alpha, scale=xm) + + +def MakeParetoPmf(xm, alpha, high, num=101): + """Makes a PMF discrete approx to a Pareto distribution. + + xm: minimum value (scale parameter) + alpha: shape parameter + high: upper bound value + num: number of values + + returns: normalized Pmf + """ + xs = np.linspace(xm, high, num) + ps = stats.pareto.pdf(xs, alpha, scale=xm) + pmf = Pmf(dict(zip(xs, ps))) + return pmf + +def StandardNormalCdf(x): + """Evaluates the CDF of the standard Normal distribution. + + See http://en.wikipedia.org/wiki/Normal_distribution + #Cumulative_distribution_function + + Args: + x: float + + Returns: + float + """ + return (math.erf(x / ROOT2) + 1) / 2 + + +def EvalNormalCdf(x, mu=0, sigma=1): + """Evaluates the CDF of the normal distribution. + + Args: + x: float + + mu: mean parameter + + sigma: standard deviation parameter + + Returns: + float + """ + return stats.norm.cdf(x, loc=mu, scale=sigma) + + +def EvalNormalCdfInverse(p, mu=0, sigma=1): + """Evaluates the inverse CDF of the normal distribution. + + See http://en.wikipedia.org/wiki/Normal_distribution#Quantile_function + + Args: + p: float + + mu: mean parameter + + sigma: standard deviation parameter + + Returns: + float + """ + return stats.norm.ppf(p, loc=mu, scale=sigma) + + +def EvalLognormalCdf(x, mu=0, sigma=1): + """Evaluates the CDF of the lognormal distribution. + + x: float or sequence + mu: mean parameter + sigma: standard deviation parameter + + Returns: float or sequence + """ + return stats.lognorm.cdf(x, loc=mu, scale=sigma) + + +def RenderExpoCdf(lam, low, high, n=101): + """Generates sequences of xs and ps for an exponential CDF. + + lam: parameter + low: float + high: float + n: number of points to render + + returns: numpy arrays (xs, ps) + """ + xs = np.linspace(low, high, n) + ps = 1 - np.exp(-lam * xs) + #ps = stats.expon.cdf(xs, scale=1.0/lam) + return xs, ps + + +def RenderNormalCdf(mu, sigma, low, high, n=101): + """Generates sequences of xs and ps for a Normal CDF. + + mu: parameter + sigma: parameter + low: float + high: float + n: number of points to render + + returns: numpy arrays (xs, ps) + """ + xs = np.linspace(low, high, n) + ps = stats.norm.cdf(xs, mu, sigma) + return xs, ps + + +def RenderParetoCdf(xmin, alpha, low, high, n=50): + """Generates sequences of xs and ps for a Pareto CDF. + + xmin: parameter + alpha: parameter + low: float + high: float + n: number of points to render + + returns: numpy arrays (xs, ps) + """ + if low < xmin: + low = xmin + xs = np.linspace(low, high, n) + ps = 1 - (xs / xmin) ** -alpha + #ps = stats.pareto.cdf(xs, scale=xmin, b=alpha) + return xs, ps + + +class Beta: + """Represents a Beta distribution. + + See http://en.wikipedia.org/wiki/Beta_distribution + """ + def __init__(self, alpha=1, beta=1, label=None): + """Initializes a Beta distribution.""" + self.alpha = alpha + self.beta = beta + self.label = label if label is not None else '_nolegend_' + + def Update(self, data): + """Updates a Beta distribution. + + data: pair of int (heads, tails) + """ + heads, tails = data + self.alpha += heads + self.beta += tails + + def Mean(self): + """Computes the mean of this distribution.""" + return self.alpha / (self.alpha + self.beta) + + def MAP(self): + """Computes the value with maximum a posteori probability.""" + a = self.alpha - 1 + b = self.beta - 1 + return a / (a + b) + + def Random(self): + """Generates a random variate from this distribution.""" + return random.betavariate(self.alpha, self.beta) + + def Sample(self, n): + """Generates a random sample from this distribution. + + n: int sample size + """ + size = n, + return np.random.beta(self.alpha, self.beta, size) + + def EvalPdf(self, x): + """Evaluates the PDF at x.""" + return x ** (self.alpha - 1) * (1 - x) ** (self.beta - 1) + + def MakePmf(self, steps=101, label=None): + """Returns a Pmf of this distribution. + + Note: Normally, we just evaluate the PDF at a sequence + of points and treat the probability density as a probability + mass. + + But if alpha or beta is less than one, we have to be + more careful because the PDF goes to infinity at x=0 + and x=1. In that case we evaluate the CDF and compute + differences. + + The result is a little funny, because the values at 0 and 1 + are not symmetric. Nevertheless, it is a reasonable discrete + model of the continuous distribution, and behaves well as + the number of values increases. + """ + if label is None and self.label is not None: + label = self.label + + if self.alpha < 1 or self.beta < 1: + cdf = self.MakeCdf() + pmf = cdf.MakePmf() + return pmf + + xs = [i / (steps - 1.0) for i in range(steps)] + probs = [self.EvalPdf(x) for x in xs] + pmf = Pmf(dict(zip(xs, probs)), label=label) + return pmf + + def MakeCdf(self, steps=101): + """Returns the CDF of this distribution.""" + xs = [i / (steps - 1.0) for i in range(steps)] + ps = special.betainc(self.alpha, self.beta, xs) + cdf = Cdf(xs, ps) + return cdf + + def Percentile(self, ps): + """Returns the given percentiles from this distribution. + + ps: scalar, array, or list of [0-100] + """ + ps = np.asarray(ps) / 100 + xs = special.betaincinv(self.alpha, self.beta, ps) + return xs + + +class Dirichlet(object): + """Represents a Dirichlet distribution. + + See http://en.wikipedia.org/wiki/Dirichlet_distribution + """ + + def __init__(self, n, conc=1, label=None): + """Initializes a Dirichlet distribution. + + n: number of dimensions + conc: concentration parameter (smaller yields more concentration) + label: string label + """ + if n < 2: + raise ValueError('A Dirichlet distribution with ' + 'n<2 makes no sense') + + self.n = n + self.params = np.ones(n, dtype=np.float) * conc + self.label = label if label is not None else '_nolegend_' + + def Update(self, data): + """Updates a Dirichlet distribution. + + data: sequence of observations, in order corresponding to params + """ + m = len(data) + self.params[:m] += data + + def Random(self): + """Generates a random variate from this distribution. + + Returns: normalized vector of fractions + """ + p = np.random.gamma(self.params) + return p / p.sum() + + def Likelihood(self, data): + """Computes the likelihood of the data. + + Selects a random vector of probabilities from this distribution. + + Returns: float probability + """ + m = len(data) + if self.n < m: + return 0 + + x = data + p = self.Random() + q = p[:m] ** x + return q.prod() + + def LogLikelihood(self, data): + """Computes the log likelihood of the data. + + Selects a random vector of probabilities from this distribution. + + Returns: float log probability + """ + m = len(data) + if self.n < m: + return float('-inf') + + x = self.Random() + y = np.log(x[:m]) * data + return y.sum() + + def MarginalBeta(self, i): + """Computes the marginal distribution of the ith element. + + See http://en.wikipedia.org/wiki/Dirichlet_distribution + #Marginal_distributions + + i: int + + Returns: Beta object + """ + alpha0 = self.params.sum() + alpha = self.params[i] + return Beta(alpha, alpha0 - alpha) + + def PredictivePmf(self, xs, label=None): + """Makes a predictive distribution. + + xs: values to go into the Pmf + + Returns: Pmf that maps from x to the mean prevalence of x + """ + alpha0 = self.params.sum() + ps = self.params / alpha0 + return Pmf(zip(xs, ps), label=label) + + +def BinomialCoef(n, k): + """Compute the binomial coefficient "n choose k". + + n: number of trials + k: number of successes + + Returns: float + """ + return scipy.misc.comb(n, k) + + +def LogBinomialCoef(n, k): + """Computes the log of the binomial coefficient. + + http://math.stackexchange.com/questions/64716/ + approximating-the-logarithm-of-the-binomial-coefficient + + n: number of trials + k: number of successes + + Returns: float + """ + return n * math.log(n) - k * math.log(k) - (n - k) * math.log(n - k) + + +def NormalProbability(ys, jitter=0): + """Generates data for a normal probability plot. + + ys: sequence of values + jitter: float magnitude of jitter added to the ys + + returns: numpy arrays xs, ys + """ + n = len(ys) + xs = np.random.normal(0, 1, n) + xs.sort() + + if jitter: + ys = Jitter(ys, jitter) + else: + ys = np.array(ys) + ys.sort() + + return xs, ys + + +def Jitter(values, jitter=0.5): + """Jitters the values by adding a uniform variate in (-jitter, jitter). + + values: sequence + jitter: scalar magnitude of jitter + + returns: new numpy array + """ + n = len(values) + return np.random.normal(0, jitter, n) + values + + +def NormalProbabilityPlot(sample, fit_color='0.8', **options): + """Makes a normal probability plot with a fitted line. + + sample: sequence of numbers + fit_color: color string for the fitted line + options: passed along to Plot + """ + xs, ys = NormalProbability(sample) + mean, var = MeanVar(sample) + std = math.sqrt(var) + + fit = FitLine(xs, mean, std) + thinkplot.Plot(*fit, color=fit_color, label='model') + + xs, ys = NormalProbability(sample) + thinkplot.Plot(xs, ys, **options) + + +def Mean(xs): + """Computes mean. + + xs: sequence of values + + returns: float mean + """ + return np.mean(xs) + + +def Var(xs, mu=None, ddof=0): + """Computes variance. + + xs: sequence of values + mu: option known mean + ddof: delta degrees of freedom + + returns: float + """ + xs = np.asarray(xs) + + if mu is None: + mu = xs.mean() + + ds = xs - mu + return np.dot(ds, ds) / (len(xs) - ddof) + + +def Std(xs, mu=None, ddof=0): + """Computes standard deviation. + + xs: sequence of values + mu: option known mean + ddof: delta degrees of freedom + + returns: float + """ + var = Var(xs, mu, ddof) + return math.sqrt(var) + + +def MeanVar(xs, ddof=0): + """Computes mean and variance. + + Based on http://stackoverflow.com/questions/19391149/ + numpy-mean-and-variance-from-single-function + + xs: sequence of values + ddof: delta degrees of freedom + + returns: pair of float, mean and var + """ + xs = np.asarray(xs) + mean = xs.mean() + s2 = Var(xs, mean, ddof) + return mean, s2 + + +def Trim(t, p=0.01): + """Trims the largest and smallest elements of t. + + Args: + t: sequence of numbers + p: fraction of values to trim off each end + + Returns: + sequence of values + """ + n = int(p * len(t)) + t = sorted(t)[n:-n] + return t + + +def TrimmedMean(t, p=0.01): + """Computes the trimmed mean of a sequence of numbers. + + Args: + t: sequence of numbers + p: fraction of values to trim off each end + + Returns: + float + """ + t = Trim(t, p) + return Mean(t) + + +def TrimmedMeanVar(t, p=0.01): + """Computes the trimmed mean and variance of a sequence of numbers. + + Side effect: sorts the list. + + Args: + t: sequence of numbers + p: fraction of values to trim off each end + + Returns: + float + """ + t = Trim(t, p) + mu, var = MeanVar(t) + return mu, var + + +def CohenEffectSize(group1, group2): + """Compute Cohen's d. + + group1: Series or NumPy array + group2: Series or NumPy array + + returns: float + """ + diff = group1.mean() - group2.mean() + + n1, n2 = len(group1), len(group2) + var1 = group1.var() + var2 = group2.var() + + pooled_var = (n1 * var1 + n2 * var2) / (n1 + n2) + d = diff / math.sqrt(pooled_var) + return d + + +def Cov(xs, ys, meanx=None, meany=None): + """Computes Cov(X, Y). + + Args: + xs: sequence of values + ys: sequence of values + meanx: optional float mean of xs + meany: optional float mean of ys + + Returns: + Cov(X, Y) + """ + xs = np.asarray(xs) + ys = np.asarray(ys) + + if meanx is None: + meanx = np.mean(xs) + if meany is None: + meany = np.mean(ys) + + cov = np.dot(xs-meanx, ys-meany) / len(xs) + return cov + + +def Corr(xs, ys): + """Computes Corr(X, Y). + + Args: + xs: sequence of values + ys: sequence of values + + Returns: + Corr(X, Y) + """ + xs = np.asarray(xs) + ys = np.asarray(ys) + + meanx, varx = MeanVar(xs) + meany, vary = MeanVar(ys) + + corr = Cov(xs, ys, meanx, meany) / math.sqrt(varx * vary) + + return corr + + +def SerialCorr(series, lag=1): + """Computes the serial correlation of a series. + + series: Series + lag: integer number of intervals to shift + + returns: float correlation + """ + xs = series[lag:] + ys = series.shift(lag)[lag:] + corr = Corr(xs, ys) + return corr + + +def SpearmanCorr(xs, ys): + """Computes Spearman's rank correlation. + + Args: + xs: sequence of values + ys: sequence of values + + Returns: + float Spearman's correlation + """ + xranks = pandas.Series(xs).rank() + yranks = pandas.Series(ys).rank() + return Corr(xranks, yranks) + + +def MapToRanks(t): + """Returns a list of ranks corresponding to the elements in t. + + Args: + t: sequence of numbers + + Returns: + list of integer ranks, starting at 1 + """ + # pair up each value with its index + pairs = enumerate(t) + + # sort by value + sorted_pairs = sorted(pairs, key=itemgetter(1)) + + # pair up each pair with its rank + ranked = enumerate(sorted_pairs) + + # sort by index + resorted = sorted(ranked, key=lambda trip: trip[1][0]) + + # extract the ranks + ranks = [trip[0]+1 for trip in resorted] + return ranks + + +def LeastSquares(xs, ys): + """Computes a linear least squares fit for ys as a function of xs. + + Args: + xs: sequence of values + ys: sequence of values + + Returns: + tuple of (intercept, slope) + """ + meanx, varx = MeanVar(xs) + meany = Mean(ys) + + slope = Cov(xs, ys, meanx, meany) / varx + inter = meany - slope * meanx + + return inter, slope + + +def FitLine(xs, inter, slope): + """Fits a line to the given data. + + xs: sequence of x + + returns: tuple of numpy arrays (sorted xs, fit ys) + """ + fit_xs = np.sort(xs) + fit_ys = inter + slope * fit_xs + return fit_xs, fit_ys + + +def Residuals(xs, ys, inter, slope): + """Computes residuals for a linear fit with parameters inter and slope. + + Args: + xs: independent variable + ys: dependent variable + inter: float intercept + slope: float slope + + Returns: + list of residuals + """ + xs = np.asarray(xs) + ys = np.asarray(ys) + res = ys - (inter + slope * xs) + return res + + +def CoefDetermination(ys, res): + """Computes the coefficient of determination (R^2) for given residuals. + + Args: + ys: dependent variable + res: residuals + + Returns: + float coefficient of determination + """ + return 1 - Var(res) / Var(ys) + + +def CorrelatedGenerator(rho): + """Generates standard normal variates with serial correlation. + + rho: target coefficient of correlation + + Returns: iterable + """ + x = random.gauss(0, 1) + yield x + + sigma = math.sqrt(1 - rho**2) + while True: + x = random.gauss(x * rho, sigma) + yield x + + +def CorrelatedNormalGenerator(mu, sigma, rho): + """Generates normal variates with serial correlation. + + mu: mean of variate + sigma: standard deviation of variate + rho: target coefficient of correlation + + Returns: iterable + """ + for x in CorrelatedGenerator(rho): + yield x * sigma + mu + + +def RawMoment(xs, k): + """Computes the kth raw moment of xs. + """ + return sum(x**k for x in xs) / len(xs) + + +def CentralMoment(xs, k): + """Computes the kth central moment of xs. + """ + mean = RawMoment(xs, 1) + return sum((x - mean)**k for x in xs) / len(xs) + + +def StandardizedMoment(xs, k): + """Computes the kth standardized moment of xs. + """ + var = CentralMoment(xs, 2) + std = math.sqrt(var) + return CentralMoment(xs, k) / std**k + + +def Skewness(xs): + """Computes skewness. + """ + return StandardizedMoment(xs, 3) + + +def Median(xs): + """Computes the median (50th percentile) of a sequence. + + xs: sequence or anything else that can initialize a Cdf + + returns: float + """ + cdf = Cdf(xs) + return cdf.Value(0.5) + + +def IQR(xs): + """Computes the interquartile of a sequence. + + xs: sequence or anything else that can initialize a Cdf + + returns: pair of floats + """ + cdf = Cdf(xs) + return cdf.Value(0.25), cdf.Value(0.75) + + +def PearsonMedianSkewness(xs): + """Computes the Pearson median skewness. + """ + median = Median(xs) + mean = RawMoment(xs, 1) + var = CentralMoment(xs, 2) + std = math.sqrt(var) + gp = 3 * (mean - median) / std + return gp + + +class FixedWidthVariables(object): + """Represents a set of variables in a fixed width file.""" + + def __init__(self, variables, index_base=0): + """Initializes. + + variables: DataFrame + index_base: are the indices 0 or 1 based? + + Attributes: + colspecs: list of (start, end) index tuples + names: list of string variable names + """ + self.variables = variables + + # note: by default, subtract 1 from colspecs + self.colspecs = variables[['start', 'end']] - index_base + + # convert colspecs to a list of pair of int + self.colspecs = self.colspecs.astype(np.int).values.tolist() + self.names = variables['name'] + + def ReadFixedWidth(self, filename, **options): + """Reads a fixed width ASCII file. + + filename: string filename + + returns: DataFrame + """ + df = pandas.read_fwf(filename, + colspecs=self.colspecs, + names=self.names, + **options) + return df + + +def ReadStataDct(dct_file, **options): + """Reads a Stata dictionary file. + + dct_file: string filename + options: dict of options passed to open() + + returns: FixedWidthVariables object + """ + type_map = dict(byte=int, int=int, long=int, float=float, + double=float, numeric=float) + + var_info = [] + with open(dct_file, **options) as f: + for line in f: + match = re.search( r'_column\(([^)]*)\)', line) + if not match: + continue + start = int(match.group(1)) + t = line.split() + vtype, name, fstring = t[1:4] + name = name.lower() + if vtype.startswith('str'): + vtype = str + else: + vtype = type_map[vtype] + long_desc = ' '.join(t[4:]).strip('"') + var_info.append((start, vtype, name, fstring, long_desc)) + + columns = ['start', 'type', 'name', 'fstring', 'desc'] + variables = pandas.DataFrame(var_info, columns=columns) + + # fill in the end column by shifting the start column + variables['end'] = variables.start.shift(-1) + variables.loc[len(variables)-1, 'end'] = 0 + + dct = FixedWidthVariables(variables, index_base=1) + return dct + + +def Resample(xs, n=None): + """Draw a sample from xs with the same length as xs. + + xs: sequence + n: sample size (default: len(xs)) + + returns: NumPy array + """ + if n is None: + n = len(xs) + return np.random.choice(xs, n, replace=True) + + +def SampleRows(df, nrows, replace=False): + """Choose a sample of rows from a DataFrame. + + df: DataFrame + nrows: number of rows + replace: whether to sample with replacement + + returns: DataDf + """ + indices = np.random.choice(df.index, nrows, replace=replace) + sample = df.loc[indices] + return sample + + +def ResampleRows(df): + """Resamples rows from a DataFrame. + + df: DataFrame + + returns: DataFrame + """ + return SampleRows(df, len(df), replace=True) + + +def ResampleRowsWeighted(df, column='finalwgt'): + """Resamples a DataFrame using probabilities proportional to given column. + + df: DataFrame + column: string column name to use as weights + + returns: DataFrame + """ + weights = df[column].copy() + weights /= sum(weights) + indices = np.random.choice(df.index, len(df), replace=True, p=weights) + sample = df.loc[indices] + return sample + + +def PercentileRow(array, p): + """Selects the row from a sorted array that maps to percentile p. + + p: float 0--100 + + returns: NumPy array (one row) + """ + rows, cols = array.shape + index = int(rows * p / 100) + return array[index,] + + +def PercentileRows(ys_seq, percents): + """Given a collection of lines, selects percentiles along vertical axis. + + For example, if ys_seq contains simulation results like ys as a + function of time, and percents contains (5, 95), the result would + be a 90% CI for each vertical slice of the simulation results. + + ys_seq: sequence of lines (y values) + percents: list of percentiles (0-100) to select + + returns: list of NumPy arrays, one for each percentile + """ + nrows = len(ys_seq) + ncols = len(ys_seq[0]) + array = np.zeros((nrows, ncols)) + + for i, ys in enumerate(ys_seq): + array[i,] = ys + + array = np.sort(array, axis=0) + + rows = [PercentileRow(array, p) for p in percents] + return rows + + +def Smooth(xs, sigma=2, **options): + """Smooths a NumPy array with a Gaussian filter. + + xs: sequence + sigma: standard deviation of the filter + """ + return ndimage.filters.gaussian_filter1d(xs, sigma, **options) + + +class HypothesisTest(object): + """Represents a hypothesis test.""" + + def __init__(self, data): + """Initializes. + + data: data in whatever form is relevant + """ + self.data = data + self.MakeModel() + self.actual = self.TestStatistic(data) + self.test_stats = None + self.test_cdf = None + + def PValue(self, iters=1000): + """Computes the distribution of the test statistic and p-value. + + iters: number of iterations + + returns: float p-value + """ + self.test_stats = [self.TestStatistic(self.RunModel()) + for _ in range(iters)] + self.test_cdf = Cdf(self.test_stats) + + count = sum(1 for x in self.test_stats if x >= self.actual) + return count / iters + + def MaxTestStat(self): + """Returns the largest test statistic seen during simulations. + """ + return max(self.test_stats) + + def PlotCdf(self, label=None): + """Draws a Cdf with vertical lines at the observed test stat. + """ + def VertLine(x): + """Draws a vertical line at x.""" + thinkplot.Plot([x, x], [0, 1], color='0.8') + + VertLine(self.actual) + thinkplot.Cdf(self.test_cdf, label=label) + + def TestStatistic(self, data): + """Computes the test statistic. + + data: data in whatever form is relevant + """ + raise UnimplementedMethodException() + + def MakeModel(self): + """Build a model of the null hypothesis. + """ + pass + + def RunModel(self): + """Run the model of the null hypothesis. + + returns: simulated data + """ + raise UnimplementedMethodException() + + +def main(): + pass + + +if __name__ == '__main__': + main() \ No newline at end of file