diff --git a/data_analyst/README.md b/data_analyst/README.md new file mode 100644 index 0000000..adf0c45 --- /dev/null +++ b/data_analyst/README.md @@ -0,0 +1 @@ +# Scritps to perform various data analysis tasks diff --git a/data_analyst/lomb_scargle.py b/data_analyst/lomb_scargle.py new file mode 100644 index 0000000..d03d315 --- /dev/null +++ b/data_analyst/lomb_scargle.py @@ -0,0 +1,2 @@ +# * https://docs.astropy.org/en/stable/timeseries/lombscargle.html +# * https://medium.com/@samuelerickson977/analyzing-the-s-p-500-index-with-a-lomb-scargle-periodogram-aaaad8ac5bce diff --git a/data_analyst/requirements.txt b/data_analyst/requirements.txt new file mode 100644 index 0000000..a9e2534 --- /dev/null +++ b/data_analyst/requirements.txt @@ -0,0 +1,3 @@ +numpy +matplotlib +scipy diff --git a/data_analyst/smoothing/main.py b/data_analyst/smoothing/main.py new file mode 100644 index 0000000..0b86328 --- /dev/null +++ b/data_analyst/smoothing/main.py @@ -0,0 +1,125 @@ +import numpy as np +import matplotlib.pyplot as plt +from math import factorial +import csv +import scipy +import argparse + + +# FROM: https://scipy.github.io/old-wiki/pages/Cookbook/SavitzkyGolay +# NOTE: Not the same as https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.savgol_filter.html +def savitzky_golay(y, window_size, order, deriv=0, rate=1): + r"""Smooth (and optionally differentiate) data with a Savitzky-Golay filter. + The Savitzky-Golay filter removes high frequency noise from data. + It has the advantage of preserving the original shape and + features of the signal better than other types of filtering + approaches, such as moving averages techniques. + Parameters + ---------- + y : array_like, shape (N,) + the values of the time history of the signal. + window_size : int + the length of the window. Must be an odd integer number. + order : int + the order of the polynomial used in the filtering. + Must be less then `window_size` - 1. + deriv: int + the order of the derivative to compute (default = 0 means only smoothing) + Returns + ------- + ys : ndarray, shape (N) + the smoothed signal (or it's n-th derivative). + Notes + ----- + The Savitzky-Golay is a type of low-pass filter, particularly + suited for smoothing noisy data. The main idea behind this + approach is to make for each point a least-square fit with a + polynomial of high order over a odd-sized window centered at + the point. + Examples + -------- + t = np.linspace(-4, 4, 500) + y = np.exp( -t**2 ) + np.random.normal(0, 0.05, t.shape) + ysg = savitzky_golay(y, window_size=31, order=4) + import matplotlib.pyplot as plt + plt.plot(t, y, label='Noisy signal') + plt.plot(t, np.exp(-t**2), 'k', lw=1.5, label='Original signal') + plt.plot(t, ysg, 'r', label='Filtered signal') + plt.legend() + plt.show() + References + ---------- + .. [1] A. Savitzky, M. J. E. Golay, Smoothing and Differentiation of + Data by Simplified Least Squares Procedures. Analytical + Chemistry, 1964, 36 (8), pp 1627-1639. + .. [2] Numerical Recipes 3rd Edition: The Art of Scientific Computing + W.H. Press, S.A. Teukolsky, W.T. Vetterling, B.P. Flannery + Cambridge University Press ISBN-13: 9780521880688 + """ + + try: + window_size = np.abs(np.int64(window_size)) + order = np.abs(np.int64(order)) + except ValueError: + raise ValueError("window_size and order have to be of type int") + if window_size % 2 != 1 or window_size < 1: + raise TypeError("window_size size must be a positive odd number") + if window_size < order + 2: + raise TypeError("window_size is too small for the polynomials order") + order_range = range(order+1) + half_window = (window_size -1) // 2 + # precompute coefficients + b = np.asmatrix([[k**i for i in order_range] for k in range(-half_window, half_window+1)]) + m = np.linalg.pinv(b).A[deriv] * rate**deriv * factorial(deriv) + # pad the signal at the extremes with + # values taken from the signal itself + firstvals = y[0] - np.abs( y[1:half_window+1][::-1] - y[0] ) + lastvals = y[-1] + np.abs(y[-half_window-1:-1][::-1] - y[-1]) + y = np.concatenate((firstvals, y, lastvals)) + return np.convolve( m[::-1], y, mode='valid') + + +def data_1(): + x = [] + y = [] + with open('smooth.csv') as csvfile: + spamreader = csv.reader(csvfile, delimiter=',') + data = list([x for x in spamreader][1:]) + for row in data: + x.append(float(row[0])) + y.append(float(row[1].replace(",",""))) + x = np.array(x) + y = np.array(y) + # yhat = scipy.signal.savgol_filter(y, 31, 2) + yhat = savitzky_golay(y, 31 , 2) + plt.plot(x,y) + plt.plot(x,yhat, color='red') + plt.show() + + +# TODO(gitbuda): Implement skip x values (e.g. if x are dates -> x axis is a huge mess). +def smooth(csv_file_path, x_axis_column_name, y_axis_column_name): + x = [] + y = [] + with open(csv_file_path) as csvfile: + reader = csv.DictReader(csvfile) + for row in reader: + print(row) + x.append(row[x_axis_column_name]) + y.append(float(row[y_axis_column_name])) + x = np.array(x) + y = np.array(y) + # yhat = scipy.signal.savgol_filter(y, 31, 2) + yhat = savitzky_golay(y, 31 , 2) + plt.plot(x,y) + plt.plot(x,yhat, color='red') + plt.show() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--path") + parser.add_argument("--x") + parser.add_argument("--y") + args = parser.parse_args() + smooth(args.path, args.x, args.y)