diff --git a/.gitignore b/.gitignore index 5ceb386..92260b8 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ venv +build diff --git a/Makefile b/Makefile index 8ecf6ed..c749928 100644 --- a/Makefile +++ b/Makefile @@ -13,12 +13,16 @@ freeze: venv . venv/bin/activate; pip freeze > requirements.txt test:\ +src/weather\ src/mnist clean: rm -rf venv + rm -rf build src/%: venv + mkdir -p build . venv/bin/activate; python src/$*.py mnist: src/mnist +weather: src/weather diff --git a/readme.md b/readme.md index fb721e5..5572065 100644 --- a/readme.md +++ b/readme.md @@ -13,6 +13,11 @@ make mnist ``` +[Time series forecasting](https://www.tensorflow.org/tutorials/structured_data/time_series) + +```shell +make weather +``` ## Clean diff --git a/requirements.txt b/requirements.txt index e5b8ef8..c8a0fb5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,11 @@ absl-py==0.12.0 astunparse==1.6.3 +backcall==0.2.0 cachetools==4.2.1 certifi==2020.12.5 chardet==4.0.0 +cycler==0.10.0 +decorator==5.0.7 flatbuffers==1.12 gast==0.4.0 google-auth==1.29.0 @@ -11,18 +14,36 @@ google-pasta==0.2.0 grpcio==1.34.1 h5py==3.1.0 idna==2.10 +ipython==7.22.0 +ipython-genutils==0.2.0 +jedi==0.18.0 keras-nightly==2.5.0.dev2021032900 Keras-Preprocessing==1.1.2 +kiwisolver==1.3.1 Markdown==3.3.4 +matplotlib==3.4.1 numpy==1.19.5 oauthlib==3.1.0 opt-einsum==3.3.0 +pandas==1.2.4 +parso==0.8.2 +pexpect==4.8.0 +pickleshare==0.7.5 +Pillow==8.2.0 +prompt-toolkit==3.0.18 protobuf==3.15.8 +ptyprocess==0.7.0 pyasn1==0.4.8 pyasn1-modules==0.2.8 +Pygments==2.8.1 +pyparsing==2.4.7 +python-dateutil==2.8.1 +pytz==2021.1 requests==2.25.1 requests-oauthlib==1.3.0 rsa==4.7.2 +scipy==1.6.2 +seaborn==0.11.1 six==1.15.0 tensorboard==2.5.0 tensorboard-data-server==0.6.0 @@ -30,7 +51,9 @@ tensorboard-plugin-wit==1.8.0 tensorflow==2.5.0rc1 tensorflow-estimator==2.5.0rc0 termcolor==1.1.0 +traitlets==5.0.5 typing-extensions==3.7.4.3 urllib3==1.26.4 +wcwidth==0.2.5 Werkzeug==1.0.1 wrapt==1.12.1 diff --git a/src/util/baseline.py b/src/util/baseline.py new file mode 100644 index 0000000..ab0ddf9 --- /dev/null +++ b/src/util/baseline.py @@ -0,0 +1,13 @@ +import tensorflow as tf + + +class Baseline(tf.keras.Model): + def __init__(self, label_index=None): + super().__init__() + self.label_index = label_index + + def call(self, inputs): + if self.label_index is None: + return inputs + result = inputs[:, :, self.label_index] + return result[:, :, tf.newaxis] diff --git a/src/util/savefig.py b/src/util/savefig.py new file mode 100644 index 0000000..69ccf32 --- /dev/null +++ b/src/util/savefig.py @@ -0,0 +1,19 @@ +import os + +from matplotlib import pyplot + +root_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +fig_count = 0 + + +def build_path(): + return os.path.join(root_path, 'build') + + +def savefig(plt: pyplot): + global fig_count + fig_count += 1 + fig_path = os.path.join(build_path(), f'weather_fig{fig_count}.png') + plt.savefig(fig_path, bbox_inches='tight') + plt.close() + print(f'saved: {fig_path}') diff --git a/src/util/window_generator.py b/src/util/window_generator.py new file mode 100644 index 0000000..1d2bb8f --- /dev/null +++ b/src/util/window_generator.py @@ -0,0 +1,132 @@ +import matplotlib.pyplot as plt +import numpy as np +import tensorflow as tf + + +class WindowGenerator: + def __init__(self, input_width, label_width, shift, + train_df, val_df, test_df, + label_columns=None): + # Store the raw data. + self.example = None + self.train_df = train_df + self.val_df = val_df + self.test_df = test_df + + # Work out the label column indices. + self.label_columns = label_columns + if label_columns is not None: + self.label_columns_indices = {name: i for i, name in + enumerate(label_columns)} + self.column_indices = {name: i for i, name in + enumerate(train_df.columns)} + + # Work out the window parameters. + self.input_width = input_width + self.label_width = label_width + self.shift = shift + + self.total_window_size = input_width + shift + + self.input_slice = slice(0, input_width) + self.input_indices = np.arange(self.total_window_size)[self.input_slice] + + self.label_start = self.total_window_size - self.label_width + self.labels_slice = slice(self.label_start, None) + self.label_indices = np.arange(self.total_window_size)[self.labels_slice] + + def __repr__(self): + return '\n'.join([ + f'Total window size: {self.total_window_size}', + f'Input indices: {self.input_indices}', + f'Label indices: {self.label_indices}', + f'Label column name(s): {self.label_columns}']) + + def split_window(self, features): + inputs = features[:, self.input_slice, :] + labels = features[:, self.labels_slice, :] + if self.label_columns is not None: + labels = tf.stack( + [labels[:, :, self.column_indices[name]] for name in self.label_columns], + axis=-1) + + # Slicing doesn't preserve static shape information, so set the shapes + # manually. This way the `tf.data.Datasets` are easier to inspect. + inputs.set_shape([None, self.input_width, None]) + labels.set_shape([None, self.label_width, None]) + + return inputs, labels + + def plot(self, model=None, plot_col='T (degC)', max_subplots=3): + inputs, labels = self.example + plt.figure(figsize=(12, 8)) + plot_col_index = self.column_indices[plot_col] + max_n = min(max_subplots, len(inputs)) + for n in range(max_n): + plt.subplot(max_n, 1, n + 1) + plt.ylabel(f'{plot_col} [normed]') + plt.plot(self.input_indices, inputs[n, :, plot_col_index], + label='Inputs', marker='.', zorder=-10) + + if self.label_columns: + label_col_index = self.label_columns_indices.get(plot_col, None) + else: + label_col_index = plot_col_index + + if label_col_index is None: + continue + + plt.scatter(self.label_indices, labels[n, :, label_col_index], + edgecolors='k', label='Labels', c='#2ca02c', s=64) + if model is not None: + predictions = model(inputs) + plt.scatter(self.label_indices, predictions[n, :, label_col_index], + marker='X', edgecolors='k', label='Predictions', + c='#ff7f0e', s=64) + + if n == 0: + plt.legend() + + plt.xlabel('Time [h]') + return plt + + def make_dataset(self, data): + data = np.array(data, dtype=np.float32) + ds = tf.keras.preprocessing.timeseries_dataset_from_array( + data=data, + targets=None, + sequence_length=self.total_window_size, + sequence_stride=1, + shuffle=True, + batch_size=32, ) + + ds = ds.map(self.split_window) + + return ds + + @property + def train(self): + return self.make_dataset(self.train_df) + + @property + def val(self): + return self.make_dataset(self.val_df) + + @property + def test(self): + return self.make_dataset(self.test_df) + + @property + def example(self): + """Get and cache an example batch of `inputs, labels` for plotting.""" + result = getattr(self, '_example', None) + if result is None: + # No example batch was found, so get one from the `.train` dataset + result = next(iter(self.train)) + # And cache it for next time + self._example = result + return result + + @example.setter + def example(self, value): + self._example = value diff --git a/src/weather.py b/src/weather.py new file mode 100644 index 0000000..49dbe8b --- /dev/null +++ b/src/weather.py @@ -0,0 +1,237 @@ +import datetime +import os + +import matplotlib as mpl +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import seaborn as sns +import tensorflow as tf + +from util.baseline import Baseline +from util.savefig import savefig +from util.window_generator import WindowGenerator + +mpl.rcParams['figure.figsize'] = (8, 6) +mpl.rcParams['axes.grid'] = False + +# The weather dataset +zip_path = tf.keras.utils.get_file( + origin='https://storage.googleapis.com/tensorflow/tf-keras-datasets/jena_climate_2009_2016.csv.zip', + fname='jena_climate_2009_2016.csv.zip', + extract=True) +csv_path, _ = os.path.splitext(zip_path) + +df = pd.read_csv(csv_path) +# slice [start:stop:step], starting from index 5 take every 6th record. +df = df[5::6] +date_time = pd.to_datetime(df.pop('Date Time'), format='%d.%m.%Y %H:%M:%S') +print(df.head()) + +plot_cols = ['T (degC)', 'p (mbar)', 'rho (g/m**3)'] +plot_features = df[plot_cols] +plot_features.index = date_time +_ = plot_features.plot(subplots=True) +savefig(plt) + +plot_features = df[plot_cols][:480] +plot_features.index = date_time[:480] +_ = plot_features.plot(subplots=True) +savefig(plt) + +# Inspect and cleanup +wv = df['wv (m/s)'] +bad_wv = wv == -9999.0 +wv[bad_wv] = 0.0 +max_wv = df['max. wv (m/s)'] +bad_max_wv = max_wv == -9999.0 +max_wv[bad_max_wv] = 0.0 +# The above inplace edits are reflected in the DataFrame +df['wv (m/s)'].min() +print(df.describe().transpose()) + +# Feature engineering +plt.hist2d(df['wd (deg)'], df['wv (m/s)'], bins=(50, 50), vmax=400) +plt.colorbar() +plt.xlabel('Wind Direction [deg]') +plt.ylabel('Wind Velocity [m/s]') +savefig(plt) + +wv = df.pop('wv (m/s)') +max_wv = df.pop('max. wv (m/s)') +# Convert to radians. +wd_rad = df.pop('wd (deg)') * np.pi / 180 +# Calculate the wind x and y components. +df['Wx'] = wv * np.cos(wd_rad) +df['Wy'] = wv * np.sin(wd_rad) +# Calculate the max wind x and y components. +df['max Wx'] = max_wv * np.cos(wd_rad) +df['max Wy'] = max_wv * np.sin(wd_rad) + +plt.hist2d(df['Wx'], df['Wy'], bins=(50, 50), vmax=400) +plt.colorbar() +plt.xlabel('Wind X [m/s]') +plt.ylabel('Wind Y [m/s]') +ax = plt.gca() +ax.axis('tight') +savefig(plt) + +# Time +timestamp_s = date_time.map(datetime.datetime.timestamp) +day = 24 * 60 * 60 +year = (365.2425) * day + +df['Day sin'] = np.sin(timestamp_s * (2 * np.pi / day)) +df['Day cos'] = np.cos(timestamp_s * (2 * np.pi / day)) +df['Year sin'] = np.sin(timestamp_s * (2 * np.pi / year)) +df['Year cos'] = np.cos(timestamp_s * (2 * np.pi / year)) + +plt.plot(np.array(df['Day sin'])[:25]) +plt.plot(np.array(df['Day cos'])[:25]) +plt.xlabel('Time [h]') +plt.title('Time of day signal') +savefig(plt) + +fft = tf.signal.rfft(df['T (degC)']) +f_per_dataset = np.arange(0, len(fft)) + +n_samples_h = len(df['T (degC)']) +hours_per_year = 24 * 365.2524 +years_per_dataset = n_samples_h / (hours_per_year) + +f_per_year = f_per_dataset / years_per_dataset +plt.step(f_per_year, np.abs(fft)) +plt.xscale('log') +plt.ylim(0, 400000) +plt.xlim([0.1, max(plt.xlim())]) +plt.xticks([1, 365.2524], labels=['1/Year', '1/day']) +_ = plt.xlabel('Frequency (log scale)') +savefig(plt) + +# Split the data +column_indices = {name: i for i, name in enumerate(df.columns)} + +n = len(df) +train_df = df[0:int(n * 0.7)] +val_df = df[int(n * 0.7):int(n * 0.9)] +test_df = df[int(n * 0.9):] + +num_features = df.shape[1] + +# Normalize the data +train_mean = train_df.mean() +train_std = train_df.std() + +train_df = (train_df - train_mean) / train_std +val_df = (val_df - train_mean) / train_std +test_df = (test_df - train_mean) / train_std + +df_std = (df - train_mean) / train_std +df_std = df_std.melt(var_name='Column', value_name='Normalized') +plt.figure(figsize=(12, 6)) +ax = sns.violinplot(x='Column', y='Normalized', data=df_std) +_ = ax.set_xticklabels(df.keys(), rotation=90) +savefig(plt) + +# Data windowing + +# w1 = WindowGenerator(input_width=24, label_width=1, shift=24, +# label_columns=['T (degC)'], +# train_df=train_df, val_df=val_df, test_df=test_df) +# print(w1) +# w2 = WindowGenerator(input_width=6, label_width=1, shift=1, +# label_columns=['T (degC)'], +# train_df=train_df, val_df=val_df, test_df=test_df) +# print(w2) +# # Stack three slices, the length of the total window: +# example_window = tf.stack([np.array(train_df[:w2.total_window_size]), +# np.array(train_df[100:100+w2.total_window_size]), +# np.array(train_df[200:200+w2.total_window_size])]) +# example_inputs, example_labels = w2.split_window(example_window) +# print('All shapes are: (batch, time, features)') +# print(f'Window shape: {example_window.shape}') +# print(f'Inputs shape: {example_inputs.shape}') +# print(f'labels shape: {example_labels.shape}') +# +# w2.example = example_inputs, example_labels +# plt = w2.plot() +# savefig(plt) +# plt = w2.plot(plot_col='p (mbar)') +# savefig(plt) +# for example_inputs, example_labels in w2.train.take(1): +# print(f'Inputs shape (batch, time, features): {example_inputs.shape}') +# print(f'Labels shape (batch, time, features): {example_labels.shape}') + +# Single step models +single_step_window = WindowGenerator( + input_width=1, label_width=1, shift=1, + label_columns=['T (degC)'], + train_df=train_df, val_df=val_df, test_df=test_df) +print(single_step_window) +for example_inputs, example_labels in single_step_window.train.take(1): + print(f'Inputs shape (batch, time, features): {example_inputs.shape}') + print(f'Labels shape (batch, time, features): {example_labels.shape}') + +# Baseline +baseline = Baseline(label_index=column_indices['T (degC)']) + +baseline.compile(loss=tf.losses.MeanSquaredError(), + metrics=[tf.metrics.MeanAbsoluteError()]) + +val_performance = {} +performance = {} +val_performance['Baseline'] = baseline.evaluate(single_step_window.val) +performance['Baseline'] = baseline.evaluate(single_step_window.test, verbose=0) + +wide_window = WindowGenerator( + input_width=24, label_width=24, shift=1, + label_columns=['T (degC)'], + train_df=train_df, val_df=val_df, test_df=test_df) + +print(wide_window) +print('Input shape:', wide_window.example[0].shape) +print('Output shape:', baseline(wide_window.example[0]).shape) +plt = wide_window.plot(baseline) +savefig(plt) + +# Linear model +linear = tf.keras.Sequential([ + tf.keras.layers.Dense(units=1) +]) +print('Input shape:', single_step_window.example[0].shape) +print('Output shape:', linear(single_step_window.example[0]).shape) +MAX_EPOCHS = 20 + + +def compile_and_fit(model, window, patience=2): + early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', + patience=patience, + mode='min') + + model.compile(loss=tf.losses.MeanSquaredError(), + optimizer=tf.optimizers.Adam(), + metrics=[tf.metrics.MeanAbsoluteError()]) + + history = model.fit(window.train, epochs=MAX_EPOCHS, + validation_data=window.val, + callbacks=[early_stopping]) + return history + + +history = compile_and_fit(linear, single_step_window) + +val_performance['Linear'] = linear.evaluate(single_step_window.val) +performance['Linear'] = linear.evaluate(single_step_window.test, verbose=0) + +print('Input shape:', wide_window.example[0].shape) +print('Output shape:', baseline(wide_window.example[0]).shape) + +plt = wide_window.plot(linear) +savefig(plt) + +plt.bar(x=range(len(train_df.columns)), + height=linear.layers[0].kernel[:, 0].numpy()) +axis = plt.gca() +axis.set_xticks(range(len(train_df.columns))) +_ = axis.set_xticklabels(train_df.columns, rotation=90) +savefig(plt)