diff --git a/data_analysis/data_stats.py b/data_analysis/data_stats.py
index 10d5ee7..3c4886b 100644
--- a/data_analysis/data_stats.py
+++ b/data_analysis/data_stats.py
@@ -143,8 +143,10 @@ def stats_generator() -> pd.DataFrame:
 def load_stats(**kwargs) -> dict:
     """Loads stats from database or local
 
-    Args:
-        **kwargs: passed to `cached_table_fetch`. See its docstring for more info.
+    Parameters
+    ----------
+    **kwargs :
+        passed to `cached_table_fetch`. See its docstring for more info.
 
     Returns
     -------
diff --git a/data_analysis/ml_stats.py b/data_analysis/ml_stats.py
new file mode 100644
index 0000000..ad3c1c9
--- /dev/null
+++ b/data_analysis/ml_stats.py
@@ -0,0 +1,167 @@
+import os
+import sys
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from config import MODEL_PATH, ENCODER_PATH
+from database import cached_table_fetch
+import matplotlib.pyplot as plt
+import pandas as pd
+import datetime
+from helpers import RtdRay, ttl_lru_cache
+from sklearn.dummy import DummyClassifier
+import pickle
+
+
+def majority_baseline(x, y):
+    clf = DummyClassifier(strategy="most_frequent", random_state=0)
+    clf.fit(x, y)
+    return round((clf.predict(x) == y.to_numpy()).sum() / len(x), 6)
+
+
+def model_score(model, x, y):
+    return model.score(x, y)
+
+
+def test_model(model, x_test, y_test, model_name) -> dict:
+    baseline = majority_baseline(x_test, y_test)
+    accuracy = (model.predict(x_test) == y_test).sum() / len(y_test)
+
+    stats = {}
+
+    stats["model"] = model_name
+    stats["baseline"] = round(baseline * 100, 6)
+    stats["accuracy"] = round(accuracy * 100, 6)
+    stats["improvement"] = round((accuracy - baseline) * 100, 6)
+    print(stats)
+    return stats
+
+
+def model_roc(model, x_test, y_test, model_name):
+    from sklearn.metrics import auc, roc_curve
+
+    prediction = model.predict_proba(x_test)[:, 1]
+    fpr, tpr, thresholds = roc_curve(y_test, prediction, pos_label=1)
+    roc_auc = auc(fpr, tpr)
+
+    lw = 2
+    fig, ax = plt.subplots()
+    ax.plot(
+        fpr, tpr, color="darkorange", lw=lw, label="ROC curve (area = %0.2f)" % roc_auc
+    )
+    ax.plot([0, 1], [0, 1], color="navy", lw=lw, linestyle="--")
+    ax.set_xlim([0.0, 1.0])
+    ax.set_ylim([0.0, 1.05])
+    ax.set_xlabel("False Positive Rate")
+    ax.set_ylabel("True Positive Rate")
+    ax.set_title(
+        "Receiver operating characteristic model {}".format(model_name))
+    ax.legend(loc="lower right")
+
+    # fig1, ax1 = plt.subplots()
+    # ax1.set_title('Predictions')
+    # ax1.boxplot(prediction)
+    plt.show()
+
+
+def generate_stats(n_models=range(15), date=datetime.datetime.today()) -> pd.DataFrame:
+    """
+    Generates stats for the machine learning models for a specific day
+
+    Parameters
+    ----------
+    n_models : iterable of int
+        The models for which to compute the stats
+    date : datetime.datetime
+        The day for which to compute the stats
+
+    Returns
+    -------
+    pd.DataFrame
+        DataFrame containing the generated stats
+    """
+    status_encoder = {}
+    status_encoder["ar"] = pickle.load(
+        open(ENCODER_PATH.format(encoder="ar_cs"), "rb"))
+    status_encoder["dp"] = pickle.load(
+        open(ENCODER_PATH.format(encoder="dp_cs"), "rb"))
+
+    # Get midnight of the given date
+    midnight = datetime.datetime.combine(date, datetime.time.min)
+    test = RtdRay.load_for_ml_model(
+        min_date=midnight - datetime.timedelta(days=1),
+        max_date=midnight,
+        long_distance_only=False,
+        return_status=True,
+    ).compute()
+
+    ar_test = test.loc[~test["ar_delay"].isna(), ["ar_delay", "ar_cs"]]
+    dp_test = 
test.loc[~test["dp_delay"].isna(), ["dp_delay", "dp_cs"]] + # ar_test = test[['ar_delay', 'ar_cs']].dropna(subset=["ar_delay"]) + # dp_test = test[['dp_delay', 'dp_cs']].dropna(subset=["dp_delay"]) + + ar_test_x = test.loc[~test["ar_delay"].isna()].drop(columns=["ar_delay", "dp_delay", "ar_cs", "dp_cs"], axis=0) + dp_test_x = test.loc[~test["dp_delay"].isna()].drop(columns=["ar_delay", "dp_delay", "ar_cs", "dp_cs"], axis=0) + + ar_test_x.drop(columns=["obstacles_priority_24", "obstacles_priority_37", "obstacles_priority_63", "obstacles_priority_65", "obstacles_priority_70", "obstacles_priority_80"], inplace = True) + dp_test_x.drop(columns=["obstacles_priority_24", "obstacles_priority_37", "obstacles_priority_63", "obstacles_priority_65", "obstacles_priority_70", "obstacles_priority_80"], inplace = True) + + del test + + stats = [] + for model_number in n_models: + model_name = f"ar_{model_number}" + + test_y = (ar_test["ar_delay"] <= model_number) & ( + ar_test["ar_cs"] != status_encoder["ar"]["c"] + ) + + model = pickle.load(open(MODEL_PATH.format(model_name), "rb")) + + stats.append(test_model(model, ar_test_x, test_y, model_name)) + + # model_number += 1 + model_name = f"dp_{model_number}" + test_y = (dp_test["dp_delay"] >= model_number) & ( + dp_test["dp_cs"] != status_encoder["dp"]["c"] + ) + + model = pickle.load(open(MODEL_PATH.format(model_name), "rb")) + stats.append(test_model(model, dp_test_x, test_y, model_name)) + + stats = pd.DataFrame(stats) + stats["date"] = midnight - datetime.timedelta(days=1) + + return stats + + +@ttl_lru_cache(maxsize=1, seconds_to_live=60*60) +def load_stats(**kwargs) -> dict: + """Loads stats from database or local + + Parameters + ---------- + **kwargs : + passed to `cached_table_fetch`. See its docstring for more info. 
+
+    Returns
+    -------
+    dict
+        Loaded stats
+    """
+    stats = cached_table_fetch('ml_model_stats', if_exists='append', **kwargs)
+
+    return stats.iloc[0].to_dict()
+
+
+if __name__ == '__main__':
+    stats = load_stats(
+        table_generator=generate_stats,
+        generate=True,
+    )
+
+    print(stats)
\ No newline at end of file
diff --git a/database/cached_table_fetch.py b/database/cached_table_fetch.py
index f84808c..c5ac2f6 100644
--- a/database/cached_table_fetch.py
+++ b/database/cached_table_fetch.py
@@ -14,6 +14,7 @@ def cached_table_fetch(
     prefer_cache: Optional[bool] = False,
     generate: Optional[bool] = False,
     table_generator: Optional[Callable[[], pd.DataFrame]] = None,
+    if_exists: Optional[str] = 'replace',
     push: Optional[bool] = True,
     **kwargs
 ) -> pd.DataFrame:
@@ -32,6 +33,9 @@ def cached_table_fetch(
         Whether to use table_generator to generate the DataFrame and not look for cache or database, by default False
     table_generator : Callable[[], pd.DataFrame], optional
         Callable that generates the data of table tablename, by default None
+    if_exists : {'fail', 'replace', 'append'}, default 'replace'
+        What to do if the table already exists.
+        See https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_sql.html
     push : bool, optional
         Whether to push data to the db after calling table_generator, by default False
 
@@ -51,7 +55,7 @@
             raise ValueError('Cannot generate if no table_generator was supplied')
         df = table_generator()
         if push:
-            cached_table_push(df, tablename)
+            cached_table_push(df, tablename, if_exists=if_exists)
         return df
 
     if prefer_cache:
@@ -126,7 +130,7 @@ def pd_to_psql(df, uri, table_name, schema_name=None, if_exists='fail', sep=',')
     return True
 
 
-def cached_table_push(df: pd.DataFrame, tablename: str, fast: bool = True, **kwargs):
+def cached_table_push(df: pd.DataFrame, tablename: str, fast: Optional[bool] = True, if_exists: Optional[str] = 'replace', **kwargs):
     """
     Save df to local cache file and replace the table in the database.
 
@@ -140,12 +144,16 @@ def cached_table_push(df: pd.DataFrame, tablename: str, fast: bool = True, **kwa
         Whether to use a faster push method or not, by default False
         True: use the fast method, which might not be as accurate
         False: use the slow method, which is more accurate
+    if_exists : {'fail', 'replace', 'append'}, default 'replace'
+        What to do if the table already exists.
+        See https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_sql.html
+
     """
     cache_path = CACHE_PATH + '/' + tablename + '.pkl'
     df.to_pickle(cache_path)
 
     # d6stack is way faster than pandas at inserting data to sql.
     # It exports the dataframe to a csv and then inserts it to the database.
     if fast:
-        pd_to_psql(df, DB_CONNECT_STRING, tablename, if_exists='replace')
+        pd_to_psql(df, DB_CONNECT_STRING, tablename, if_exists=if_exists)
     else:
-        df.to_sql(tablename, DB_CONNECT_STRING, if_exists='replace', method='multi', chunksize=10_000, **kwargs)
+        df.to_sql(tablename, DB_CONNECT_STRING, if_exists=if_exists, method='multi', chunksize=10_000, **kwargs)
diff --git a/helpers/RtdRay.py b/helpers/RtdRay.py
index ac9411d..71d6427 100644
--- a/helpers/RtdRay.py
+++ b/helpers/RtdRay.py
@@ -434,6 +434,9 @@ def load_for_ml_model(return_date_id=False, label_encode=True, return_times=Fals
         Whether to label encode categorical columns, by default True
     return_times : bool, optional
         Whether to return planned and changed arrival and departure times, by default False
+    return_status : bool, optional
+        Whether to return the columns 'ar_cs' and 'dp_cs', which contain the arrival/departure status of the train,
+        e.g. 'c' = canceled
 
     Returns
     -------
diff --git a/k8s/jupyer-notebooks.yaml b/k8s/jupyer-notebooks.yaml
index e38a4d5..88203ac 100644
--- a/k8s/jupyer-notebooks.yaml
+++ b/k8s/jupyer-notebooks.yaml
@@ -1,3 +1,35 @@
+apiVersion: v1
+kind: PersistentVolume
+metadata:
+  name: jupyter-pv
+spec:
+  capacity:
+    storage: 10G
+  storageClassName: jupyter
+  accessModes:
+    - ReadOnlyMany
+  hostPath:
+    path: "/mnt/jupyter/"
+
+---
+
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: jupyter-pvc
+  labels:
+    type: local
+spec:
+  accessModes:
+    - ReadOnlyMany
+  storageClassName: jupyter
+  resources:
+    requests:
+      storage: 1G
+  volumeName: jupyter-pv
+
+---
+
 apiVersion: apps/v1
 kind: Deployment
 metadata:
@@ -24,8 +56,8 @@ spec:
         volumeMounts:
         - name: tz-berlin # set timezone to CEST
           mountPath: /etc/localtime
-        - mountPath: /mnt/config
-          name: config-pvc-storage
+        - mountPath: /home/jovyan
+          name: jupyter-pvc-storage
         - mountPath: /mnt/cache
           name: cache-pvc-storage
       dnsPolicy: ClusterFirst
@@ -34,12 +66,12 @@ spec:
       securityContext: {}
      terminationGracePeriodSeconds: 30
      volumes:
-      - name: config-pvc-storage
-        persistentVolumeClaim:
-          claimName: config-pvc
      - name: cache-pvc-storage
        persistentVolumeClaim:
          claimName: cache-pvc
+      - name: jupyter-pvc-storage
+        persistentVolumeClaim:
+          claimName: jupyter-pvc
      - name: tz-berlin # set timezone to CEST
        hostPath:
          path: /usr/share/zoneinfo/Europe/Berlin
diff --git a/update_butler/__main__.py b/update_butler/__main__.py
index ee05a98..7c3da8a 100644
--- a/update_butler/__main__.py
+++ b/update_butler/__main__.py
@@ -38,6 +38,17 @@
     print("--Done")
 
+    print("--ML Stats")
+
+    from data_analysis import ml_stats
+
+    ml_stats.load_stats(
+        table_generator=ml_stats.generate_stats,
+        generate=True,
+    )
+
+    print("--Done")
+
     print("--Per Station Data")
 
     import datetime
@@ -64,6 +75,6 @@
     print("Training ML Models...")
-    # TODO
+    print("Done")
 
diff --git a/webserver/api.py b/webserver/api.py
index cd62814..a918c85 100644
--- a/webserver/api.py
+++ b/webserver/api.py
@@ -14,7 +14,7 @@
 )
 from webserver import predictor, streckennetz, per_station_time
 from webserver.db_logger import log_activity
-from data_analysis import data_stats
+from data_analysis import data_stats, ml_stats
 from config import CACHE_PATH
 
 bp = Blueprint("api", __name__, url_prefix="/api")
@@ -219,4 +219,20 @@ def obstacle_plot(date_range):
     # even though os.path.isfile('cache/plot_cache/'+ plot_name + '.png') works
     return send_file(
         f"{CACHE_PATH}/plot_cache/{plot_name}.png", mimetype="image/png"
-    )
\ No newline at end of file
+    )
+
+@bp.route("/ml_stats")
+@log_activity
+def stats_ml():
+    """
+    Retrieves stats for the machine learning models
+
+    Returns
+    -------
+    dict
+        The statistics for the ML models
+    """
+
+    return ml_stats.load_stats()
diff --git a/webserver/predictor.py b/webserver/predictor.py
index 2ff73b3..35b407b 100644
--- a/webserver/predictor.py
+++ b/webserver/predictor.py
@@ -2,8 +2,11 @@
 import pandas as pd
 import datetime
 import numpy as np
+import os
 from pytz import timezone
+from xgboost import XGBClassifier
 from webserver import streckennetz
+from helpers import RtdRay
 from config import ENCODER_PATH, MODEL_PATH
 
@@ -14,6 +17,40 @@ def from_utc(utc_time: str) -> datetime.datetime:
     ).astimezone(timezone("Europe/Berlin")).replace(tzinfo=None)
 
 
+def train_model(train_x, train_y, **model_parameters):
+    est = XGBClassifier(
+        n_jobs=-1,
+        objective="binary:logistic",
+        eval_metric="logloss",
+        random_state=0,
+        tree_method="gpu_hist",
+        use_label_encoder=False,
+        **model_parameters,
+    )
+    est.fit(train_x, train_y)
+    return est
+
+
+parameters = {
+    -1: {'learning_rate': 0.4, 'max_depth': 14, 'n_estimators': 100, 'gamma': 2.8, },
+    0: {'learning_rate': 0.4, 'max_depth': 14, 'n_estimators': 100, 'gamma': 2.8, },
+    1: {'learning_rate': 0.4, 'max_depth': 14, 'n_estimators': 100, 'gamma': 2.8, },
+    2: {'learning_rate': 0.4, 'max_depth': 13, 'n_estimators': 100, 'gamma': 2.8, },
+    3: {'learning_rate': 0.4, 'max_depth': 13, 'n_estimators': 100, 'gamma': 2.8, },
+    4: {'learning_rate': 0.4, 'max_depth': 12, 'n_estimators': 100, 'gamma': 2.8, },
+    5: {'learning_rate': 0.4, 'max_depth': 12, 'n_estimators': 100, 'gamma': 2.8, },
+    6: {'learning_rate': 0.4, 'max_depth': 11, 'n_estimators': 100, 'gamma': 2.8, },
+    7: {'learning_rate': 0.4, 'max_depth': 11, 'n_estimators': 100, 'gamma': 2.8, },
+    8: {'learning_rate': 0.4, 'max_depth': 10, 'n_estimators': 100, 'gamma': 2.8, },
+    9: {'learning_rate': 0.4, 'max_depth': 10, 'n_estimators': 100, 'gamma': 2.8, },
+    10: {'learning_rate': 0.4, 'max_depth': 10, 'n_estimators': 100, 'gamma': 2.8, },
+    11: {'learning_rate': 0.4, 'max_depth': 9, 'n_estimators': 100, 'gamma': 2.8, },
+    12: {'learning_rate': 0.4, 'max_depth': 9, 'n_estimators': 100, 'gamma': 2.8, },
+    13: {'learning_rate': 0.4, 'max_depth': 8, 'n_estimators': 100, 'gamma': 2.8, },
+    14: {'learning_rate': 0.4, 'max_depth': 8, 'n_estimators': 100, 'gamma': 2.8, },
+}
+
+
 class Predictor:
     def __init__(self, n_models=40):
         self.n_models = n_models
@@ -25,21 +62,25 @@ def __init__(self, n_models=40):
         self.ar_models = []
         self.dp_models = []
         for model in range(self.n_models):
-            self.ar_models.append(pickle.load(open(MODEL_PATH.format('ar_' + str(model)), "rb")))
-            self.dp_models.append(pickle.load(open(MODEL_PATH.format('dp_' + str(model)), "rb")))
+            self.ar_models.append(pickle.load(
+                open(MODEL_PATH.format('ar_' + str(model)), "rb")))
+            self.dp_models.append(pickle.load(
+                open(MODEL_PATH.format('dp_' + str(model)), "rb")))
 
     def predict_ar(self, features):
         features = features.to_numpy()
         prediction = np.empty((len(features), self.n_models))
         for model in range(self.n_models):
-            prediction[:, model] = self.ar_models[model].predict_proba(features, validate_features=False)[:, 1]
+            prediction[:, model] = self.ar_models[model].predict_proba(
+                features, validate_features=False)[:, 1]
         return prediction
 
     def predict_dp(self, features):
         features = features.to_numpy()
         prediction = np.empty((len(features), self.n_models))
         for model in range(self.n_models):
-            prediction[:, model] = self.dp_models[model].predict_proba(features, validate_features=False)[:, 1]
+            prediction[:, model] = self.dp_models[model].predict_proba(
+                features, validate_features=False)[:, 1]
         return prediction
 
     def predict_con(self, ar_prediction, dp_prediction, transfer_time):
@@ -47,9 +88,11 @@ def predict_con(self, ar_prediction, dp_prediction, transfer_time):
         for tra_time in range(self.n_models):
             mask = transfer_time == tra_time
             if mask.any():
-                con_score[mask] = ar_prediction[mask, max(tra_time - 2, 0)] * dp_prediction[mask, max(0, 2 - tra_time)]
+                con_score[mask] = ar_prediction[mask, max(
+                    tra_time - 2, 0)] * dp_prediction[mask, max(0, 2 - tra_time)]
                 con_score[mask] = con_score[mask] + np.sum((
-                    ar_prediction[mask, max(tra_time-2, 0)+1:dp_prediction.shape[1] - max(0, 2 - tra_time)]
+                    ar_prediction[mask, max(
+                        tra_time-2, 0)+1:dp_prediction.shape[1] - max(0, 2 - tra_time)]
                     - ar_prediction[mask, max(tra_time-2, 0):dp_prediction.shape[1] - 1 - max(0, 2 - tra_time)])
                     * dp_prediction[mask, max(0, 2 - tra_time)+1:dp_prediction.shape[1] + min(2-tra_time, 0)], axis=1)
         return np.minimum(con_score, np.ones(len(con_score)))
@@ -71,23 +114,9 @@ def get_pred_data(self, segments: list):
             'stay_time': 'float'
         }
         ar_data = pd.DataFrame(
-            columns=[
-                'station',
-                'lat',
-                'lon',
-                'o',
-                'c',
-                'n',
-                'distance_to_start',
-                'distance_to_end',
-                'pp',
-                'stop_id',
-                'minute',
-                'day',
-                'stay_time'
-            ],
-            index=range(len(segments))
-        )
+            columns=list(dtypes.keys()),
+            index=range(len(segments))
+        )
         dp_data = ar_data.copy()
         for i, segment in enumerate(segments):
             # Encode categoricals
@@ -99,7 +128,8 @@ def get_pred_data(self, segments: list):
                     ar_data.at[i, cat] = self.cat_encoders[cat][segment['ar_' + cat]]
             except KeyError:
                 ar_data.at[i, cat] = -1
-                print('unknown {cat}: {value}'.format(cat=cat, value=segment['ar_' + cat]))
+                print('unknown {cat}: {value}'.format(
+                    cat=cat, value=segment['ar_' + cat]))
             try:
                 if cat == 'pp':
                     dp_data.at[i, cat] = self.cat_encoders[cat][segment['dp_' + 'cp']]
                 else:
                     dp_data.at[i, cat] = self.cat_encoders[cat][segment['dp_' + cat]]
             except KeyError:
                 dp_data.at[i, cat] = -1
-                print('unknown {cat}: {value}'.format(cat=cat, value=segment['dp_' + cat]))
-
+                print('unknown {cat}: {value}'.format(
+                    cat=cat, value=segment['dp_' + cat]))
+
             ar_data.at[i, 'lat'] = segment['ar_lat']
             ar_data.at[i, 'lon'] = segment['ar_lon']
             dp_data.at[i, 'lat'] = segment['dp_lat']
@@ -134,9 +165,11 @@
                 date=segment['dp_pt']
             )
 
-            ar_data.at[i, 'minute'] = segment['ar_ct'].time().minute + segment['ar_ct'].time().hour * 60
+            ar_data.at[i, 'minute'] = segment['ar_ct'].time(
+            ).minute + segment['ar_ct'].time().hour * 60
             ar_data.at[i, 'day'] = segment['ar_ct'].weekday()
-            dp_data.at[i, 'minute'] = segment['dp_ct'].time().minute + segment['dp_ct'].time().hour * 60
+            dp_data.at[i, 'minute'] = segment['dp_ct'].time(
+            ).minute + segment['dp_ct'].time().hour * 60
             dp_data.at[i, 'day'] = segment['dp_ct'].weekday()
 
             ar_data.at[i, 'stay_time'] = segment['stay_times'][ar_data.at[i, 'stop_id']]
@@ -144,6 +177,70 @@
 
         return ar_data.astype(dtypes), dp_data.astype(dtypes)
 
+    def train_models(self, min_date=datetime.datetime.today() - datetime.timedelta(days=7 * 4), **load_parameters):
+        train = RtdRay.load_for_ml_model(
+            min_date=min_date,
+            long_distance_only=False,
+            return_status=True,
+            **load_parameters).compute()
+
+        # We need this in order to treat canceled trains as delayed
+        status_encoder = {}
+        status_encoder["ar"] = pickle.load(
+            open(ENCODER_PATH.format(encoder="ar_cs"), "rb"))
status_encoder["dp"] = pickle.load( + open(ENCODER_PATH.format(encoder="dp_cs"), "rb")) + + ar_train = train.loc[~train["ar_delay"].isna()] + dp_train = train.loc[~train["dp_delay"].isna()] + del train + + ar_labels = {} + dp_labels = {} + for label in range(self.n_models): + ar_labels[label] = (ar_train["ar_delay"] <= label) & ( + ar_train["ar_cs"] != status_encoder["ar"]["c"] + ) + dp_labels[label + 1] = (dp_train["dp_delay"] >= (label + 1)) & ( + dp_train["dp_cs"] != status_encoder["dp"]["c"] + ) + + del ar_train["ar_delay"] + del ar_train["dp_delay"] + del ar_train["ar_cs"] + del ar_train["dp_cs"] + + del dp_train["ar_delay"] + del dp_train["dp_delay"] + del dp_train["ar_cs"] + del dp_train["dp_cs"] + + newpath = "cache/models" + if not os.path.exists(newpath): + os.makedirs(newpath) + + for label in range(self.n_models): + model_name = f"ar_{label}" + print("training", model_name) + self.ar_models[label] = self.train_model( + dp_train, dp_labels[label], **parameters[label-1] + ) + pickle.dump( + self.ar_models[label], + open(MODEL_PATH.format(model_name), "wb"), + ) + + model_name = f"dp_{label}" + print("training", model_name) + self.dp_models[label] = self.train_model( + dp_train, dp_labels[label], **parameters[label-1] + ) + pickle.dump( + self.dp_models[label], + # **parameters[label] # n_estimators=50, max_depth=6 + open(MODEL_PATH.format(model_name), "wb"), + ) + if __name__ == "__main__": pred = Predictor() diff --git a/webserver/random_forest.py b/webserver/random_forest.py deleted file mode 100644 index 1539888..0000000 --- a/webserver/random_forest.py +++ /dev/null @@ -1,260 +0,0 @@ -from sklearn.ensemble import ExtraTreesClassifier -from sklearn.metrics import multilabel_confusion_matrix -import numpy as np -import pandas as pd -from joblib import dump, load -import sys -import os -#import logging - -basepath = os.path.dirname(os.path.realpath(__file__)) - -def train_and_test(X, y, X_test, y_test, name): - clf = ExtraTreesClassifier(n_estimators=70, max_depth=12, - random_state=0, n_jobs = None, criterion = 'entropy') - clf.fit(X, y) - - #save the trained model - dump(clf, basepath + '/ml_models/' + name + '_extra.joblib') - - #make and print some acc statistics - test_pred = clf.predict(X_test) - acc = test_pred[test_pred == y_test] - print('Feature importances', clf.feature_importances_) - print('acc_' + name + ':', len(acc)/ len(test_pred)) - - #print confusion matrix - c_m = multilabel_confusion_matrix(y_test, test_pred).ravel() - dadx = [' ad0', ' ad5', 'ad10', 'ad15', ' dd0', ' dd5', 'dd10', 'dd15'] - print('\ttn, \tfp, \tfn, \ttp, \tacc1, \t\t\tacc2') - for i in range(8): - print(dadx[i], c_m[4*i+0], c_m[4*i+1], c_m[4*i+2], c_m[4*i+3], c_m[4*i+3]/(c_m[4*i+3]+c_m[4*i+1]), c_m[4*i+0]/(c_m[4*i+0]+c_m[4*i+2]), sep='\t') - - -def np_sets(dataset): - # make the balance of delayed trains in the dataset better - set1 = dataset[dataset['adelay10'] == True] - buffer1 = dataset[dataset['adelay10'] == False] - buffer1 = buffer1.sample(n=len(set1)) - set1 = pd.concat([set1, buffer1], ignore_index=True, sort=False) - set1 = set1.sample(n=len(set1)).reset_index(drop=True) - dataset = set1 - del set1 - - #only train on these features - dataset = dataset[['adelay0', 'adelay5', 'adelay10', 'adelay15', 'ddelay0', 'ddelay5', 'ddelay10', 'ddelay15', - 'month', - 'dayofweek', - 'hour', - 'track_length_since_start', - 'time_since_first_station', - 'station_number', - 'lat', - 'lon', - 'track_length', - 'stay_time', - 'time_since_last_station', - 'start_lat', - 'start_lon', - 
'destination_lat', - 'destination_lon', - 'total_lenth', - 'total_time', - 'delta_lon', - 'delta_lat']] - # 'relative_humidity', 'dew_point_c', 'air_pressure_hpa', 'temperature_c', 'trainno', 'weather_condition', 'type', 'bhf', 'wind_speed_kmh', - - #classes - columns = ['adelay0', 'adelay5', 'adelay10', 'adelay15', 'ddelay0', 'ddelay5', 'ddelay10', 'ddelay15'] - - #split into train and test set / labels - train_set1 = dataset.sample(frac=0.8,random_state=0) - test_set1 = dataset.drop(train_set1.index) - - train_labels = train_set1[columns].copy() - train_set1.drop(columns, axis=1, inplace=True) - - test_labels = test_set1[columns].copy() - test_set1.drop(columns, axis=1, inplace=True) - - train_set1 = train_set1.to_numpy() - train_labels = train_labels.to_numpy() - - test_set1 = test_set1.to_numpy() - test_labels = test_labels.to_numpy() - - return train_set1, train_labels, test_set1, test_labels - -def handle_non_numerical_data(df): - columns = df.columns.values - - for column in columns: - text_digit_vals = {} - def convert_to_int(val): - return text_digit_vals[val] - - if df[column].dtype != np.int64 and df[column].dtype != np.float64: - unique_elements = df[column].unique() - x = 0 - for unique in unique_elements: - if unique not in text_digit_vals: - text_digit_vals[unique] = x - x+=1 - - df.loc[:,column] = df[column].apply(convert_to_int) - return df - -def train(): - print('creating models...', end=' ') - - path = 'data/combinedData/na_droped_trains2.csv' - - dataset = pd.read_csv(path, index_col=False, compression='zip') #C:/Users/McToel/Desktop/regression.csv combinedData/na_droped_trains.csv C:/Users/Serverkonto/Desktop/regression.csv - dataset = dataset.sample(frac=1.0,random_state=0) - - date = dataset['date'].astype('datetime64[D]') - dataset['month'] = date.dt.month - dataset['dayofweek'] = date.dt.dayofweek - dataset['hour'] = dataset['zeit'] - del date - dataset = dataset[['adelay', - 'ddelay', - 'month', - 'dayofweek', - 'hour', - 'track_length_since_start', - 'time_since_first_station', - 'station_number', - 'lat', - 'lon', - 'track_length', - 'stay_time', - 'time_since_last_station', - 'start_lat', - 'start_lon', - 'destination_lat', - 'destination_lon', - 'total_lenth', - 'total_time', - 'delta_lon', - 'delta_lat']] - - #dataset[['bhf', 'weather_condition', 'trainno', 'type']] = handle_non_numerical_data(dataset[['bhf', 'weather_condition', 'trainno', 'type']]) - - # one_hot = pd.get_dummies(dataset['type']) - # dataset = dataset.drop('type',axis = 1) - # dataset = dataset.join(one_hot) - - dataset['adelay0'] = dataset['adelay'] <= 5 - dataset['adelay5'] = (dataset['adelay'] > 5) #& (dataset['adelay'] <= 10) - dataset['adelay10'] = (dataset['adelay'] > 10) #& (dataset['adelay'] <= 15) - dataset['adelay15'] = dataset['adelay'] > 15 - - dataset['ddelay0'] = dataset['ddelay'] <= 5 - dataset['ddelay5'] = (dataset['ddelay'] > 5) #& (dataset['ddelay'] <= 10) - dataset['ddelay10'] = (dataset['ddelay'] > 10) #& (dataset['ddelay'] <= 15) - dataset['ddelay15'] = dataset['ddelay'] > 15 - - d_set, d_lab, test_set, test_lab = np_sets(dataset) - del dataset - train_and_test(d_set, d_lab, test_set, test_lab, 'multiclass') - - - - -class predictor: - def __init__(self): - try: - self.multiclass_rf = load(basepath + '/ml_models/multiclass_extra.joblib') - except FileNotFoundError: - if input("Models not present! Create Models? 
y/[n]") == "y": - train() - self.__init__() - else: - sys.exit('Please create Models') - - def predict(self, features_df): - #this is to make sure that we olny pass the right features in the right order into the predictor. - features_df = features_df[[ 'month', - 'dayofweek', - 'hour', - 'track_length_since_start', - 'time_since_first_station', - 'station_number', - 'lat', - 'lon', - 'track_length', - 'stay_time', - 'time_since_last_station', - 'start_lat', - 'start_lon', - 'destination_lat', - 'destination_lon', - 'total_lenth', - 'total_time', - 'delta_lon', - 'delta_lat']] - - #convert df to 2d numpy array - features = features_df.to_numpy() - features = features.reshape(1, -1) - del features_df - - #make a probability prediction for one train - delays = np.array(self.multiclass_rf.predict_proba(features))[:, 0,1] # , check_input=False - - #return delay probabilitys in a dict - return {'adelay0': delays[0], - 'adelay5': delays[1], - 'adelay10': delays[2], - 'adelay15': delays[3], - 'ddelay0': delays[4], - 'ddelay5': delays[5], - 'ddelay10': delays[6], - 'ddelay15': delays[7] } - - def predict_con(self, features1, features2, transfertime): - if (features1 == False): - # if we have no data in our database to predict the delay of the train, - # we set the yearly average delay for regio trains as delay probability. - # Values from https://www.deutschebahn.com/de/konzern/konzernprofil/zahlen_fakten/puenktlichkeitswerte-1187696 - pred1 = {'adelay0': 0.95, 'adelay5': 1 - 0.95, 'adelay10': 1 - 0.97, 'adelay15': 1 - 0.99} - else: - features1 = pd.DataFrame(features1,index=[0]) - pred1 = self.predict(features1) - - if (features2 == False): - # if we have no data in our database to predict the delay of the train, - # we set the yearly average delay for regio trains as delay probability. - # Values from https://www.deutschebahn.com/de/konzern/konzernprofil/zahlen_fakten/puenktlichkeitswerte-1187696 - pred2 = {'ddelay0': 0.95, 'ddelay5': 1 - 0.95, 'ddelay10': 1 - 0.97, 'ddelay15': 1 - 0.99} - else: - features2 = pd.DataFrame(features2,index=[0]) - pred2 = self.predict(features2) - - con_score = 0 - if (transfertime > 17): - con_score = 1 #if the transfer time is higher than our highest lable, we can only predict the connection as working - - elif (transfertime > 12): - p1 = pred1['adelay15'] * (1 - pred2['ddelay5']) - con_score = 1 - (p1) - - elif (transfertime > 7): - #if the arrival train has 10 min delay, and the departure one does not have 5 min delay - p1 = (pred1['adelay10'] - pred1['adelay15']) * (1 - pred2['ddelay5']) - - #if the arrival train has 15 min delay, and the departure one does not have 10 min delay - p2 = pred1['adelay15'] * (1 - pred2['ddelay10']) - con_score = 1 - (p1+p2) - - else: - p1 = (pred1['adelay5'] - pred1['adelay10']) * (1 - pred2['ddelay5']) - p2 = (pred1['adelay10'] - pred1['adelay15']) * (1 - pred2['ddelay10']) - p3 = pred1['adelay15'] * (1 - pred2['ddelay15']) - con_score = 1 - (p1+p2+p3) - - return con_score, pred1['adelay5'], pred2['ddelay5'] - - -#train() \ No newline at end of file