diff --git a/README.md b/README.md index 198badd737e5a44f0b019257f898a09633f55d3c..9efa3a793ae664c6d943fc4d84d79b3c070a2aac 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ -## Run Simulation +## Install Simulation Extension ``` @@ -43,6 +43,7 @@ wget "https://platform.elaad.io/download-data/filedownload.php?file=elaadnl_open cp -r ./wattson-ev/wattson ./wattson/ cp -r ./wattson-ev/scenarios ./wattson/ +cp ./wattson-ev/run_wattson.sh ./wattson/ @@ -50,6 +51,33 @@ sudo python3 wattson/setup.py wattson sudo python3 -m pip install -e ./wattson sudo pip install ujson pyoscp websockets ocpp flask CherryPy cheroot openpyxl +``` + + +## Run Simulation +``` +#Single simulation run with e.g.: sudo python3 -m wattson wattson/scenarios/powerowl_example --no-cli --seed 5 + +#Full run (incl. attack and normal runs) with: +cd wattson +sudo bash run_wattson.sh ``` + + +## IDS Dependencies and Execution: + +``` +#Install IDS dependencies with: +sudo pip install scikit-learn==1.3.2 + +#Execute full IDS run (training and testing) with: +cd wattson/ids +bash run_ids.sh +``` + + + + + diff --git a/ids/features_aux.py b/ids/features_aux.py new file mode 100644 index 0000000000000000000000000000000000000000..4530bc681e046ba9c0f4c74abe3e3e8d12320887 --- /dev/null +++ b/ids/features_aux.py @@ -0,0 +1,512 @@ + +from collections import defaultdict +from datetime import datetime +import gc +import gzip +import os +from pathlib import Path +import pickle +import re +import warnings +import numpy as np +import pandas as pd +from tqdm import tqdm + +import json +from powerowl.simulators.pandapower import PandaPowerGridModel + + +import logging + +logger = logging.getLogger("WATTSON_EV_IDS.FeaturesAUX") + + + + +def discretize_hour_balancing(ts): + hour = ts.hour + if hour in [7, 8, 9]: + return "peak" + elif hour in [16, 17, 18, 19, 20, 21]: + return "peak" + elif hour in [22, 23, 24, 0, 1, 2, 3, 4, 5, 6]: + return "low" + elif hour in [10, 11, 12, 13, 14, 15]: + return "low" +def discretize_hour2(hour): + if hour in [5, 6, 7, 8, 9, 10, 11]: + return "Morning" + elif hour in [12, 13, 14, 15, 16]: + return "Afternoon" + elif hour in [17, 18, 19, 20, 21]: + return "Evening" + elif hour in [21, 22, 23, 0, 1, 2, 3, 4]: + return "Night" +def discretize_hour_only(hour): + if hour in [9, 10, 11, 12, 13, 14, 15, 16]: + return "Work" + elif hour in [22, 23, 0, 1, 2, 3, 4, 5]: + return "Sleep" + elif hour in [17, 18, 19, 20, 21, 6, 7, 8]: + return "Play" +def discretize_hour_ts(ts): + hour = ts.hour + return discretize_hour_only(hour) +def discretize_day_is_work(ts): + day = ts.weekday() + if day in [5, 6]: # weekend + return False + else: + return True +def discretize_hour_day(ts): + hour = ts.hour + day = ts.weekday() + if day in [5, 6]: # weekend + if hour in [22, 23, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]: + return "High-Home" + elif hour in [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]: + return "High-Home" + return "High-Leisure" + else: + if hour in [22, 23, 0, 1, 2, 3, 4, 5, 6, 7]: + return "High-Home" + elif hour in [8, 9, 10, 11, 12, 13, 14, 15, 16]: + return "High-Work" + elif hour in [17, 18, 19, 20, 21]: + return "High-Home" + return "High-Leisure" + else: + raise Exception("discretize_hour_only", hour) + +def get_date_exog(sum_df, prefix): + with warnings.catch_warnings(): + warnings.simplefilter('ignore', pd.errors.PerformanceWarning) + # sum_df['day_name'] = sum_df.index.to_series().dt.day_name() + sum_df[prefix+'dayofweek'] = sum_df.index.to_series().dt.dayofweek + sum_df[prefix+'hour'] = 
sum_df.index.to_series().dt.hour + sum_df[prefix+'discretize_hour_only'] = sum_df.index.to_series().apply( + discretize_hour_ts) + sum_df[prefix+'discretize_hour_day'] = sum_df.index.to_series().apply( + discretize_hour_day) + sum_df[prefix+'discretize_day_is_work'] = sum_df.index.to_series().apply( + discretize_day_is_work) + sum_df[prefix+'discretize_hour_balancing'] = sum_df.index.to_series().apply( + discretize_hour_balancing) + sum_df = sum_df.copy() + + +def get_date_exog_col(sum_df, col, prefix): + with warnings.catch_warnings(): + warnings.simplefilter('ignore', pd.errors.PerformanceWarning) + # sum_df['day_name'] = sum_df[[col]].to_series().dt.day_name() + sum_df[prefix+col+'_dayofweek'] = sum_df[col].dt.dayofweek + sum_df[prefix+col+'_hour'] = sum_df[col].dt.hour + sum_df[prefix+col+'_discretize_hour_only'] = sum_df[col].apply( + discretize_hour_ts) + sum_df[prefix+col+'_discretize_hour_day'] = sum_df[col].apply( + discretize_hour_day) + sum_df[prefix+col+'_discretize_day_is_work'] = sum_df[col].apply( + discretize_day_is_work) + sum_df[prefix+col+'_discretize_hour_balancing'] = sum_df[col].apply( + discretize_hour_balancing) + sum_df = sum_df.copy() + + +def normalize_cols(df, suffix, ntype="std", skip_norm="_norm"): + with warnings.catch_warnings(): + warnings.simplefilter('ignore', pd.errors.PerformanceWarning) + if ntype == "minmax": + for c in df.columns: + if skip_norm is not None and skip_norm in c: + continue + div = (df[c].max() - df[c].min()) + if div != 0: + df[c+suffix] = (df[c] - df[c].min()) / div + else: + df[c+suffix] = 0 + elif ntype == "std": + for c in df.columns: + if skip_norm is not None and skip_norm in c: + continue + div = df[c].std() + if div != 0: + df[c+suffix] = (df[c] - df[c].mean()) / div + else: + df[c+suffix] = 0 + else: raise ValueError(f"unk {ntype=}") + df = df.copy() + + + +def add_lags(df, num_lags=5, do_date=False, fillna="bfill", only_do_col=None): + init_cols = df.columns + do_cols = [c for c in init_cols if "is_attack" not in c] + if not do_date: + do_cols = [c for c in do_cols if "date_exog_" not in c] + if only_do_col is not None: + do_cols = [only_do_col] + new_cols=[] + with warnings.catch_warnings(): + warnings.simplefilter('ignore', pd.errors.PerformanceWarning) + for l in range(1, num_lags+1): + _new_cols = [(c+"_lag_"+str(l), c) for c in do_cols] + df[[c[0] for c in _new_cols]] = df[[c for c in do_cols]].shift(l) + new_cols.extend(_new_cols) + #df = df.copy() + for new_col, c in new_cols: + if fillna == "mean": + df[new_col].fillna(df[c].mean(), inplace=True) + elif fillna == "bfill": + df[new_col].fillna(method="bfill", inplace=True) + df.rename(columns={c: c+"_lag_0" for c in init_cols}, inplace=True) + return df + +def add_lags2(df, num_lags=5, do_date=False, fillna="bfill"): + init_cols = df.columns + do_cols = [c for c in init_cols if "is_attack" not in c] + if not do_date: + do_cols = [c for c in do_cols if "date_exog_" not in c] + new_cols=[] + with warnings.catch_warnings(): + warnings.simplefilter('ignore', pd.errors.PerformanceWarning) + for l in range(1, num_lags+1): + for c in do_cols: + new_col=c+"_lag_"+str(l) + new_cols.append((new_col, c)) + df[new_col] = df[c].shift(l) + #df = df.copy() + for new_col, c in new_cols: + if fillna == "mean": + df[new_col].fillna(df[c].mean(), inplace=True) + elif fillna == "bfill": + df[new_col].fillna(method="bfill", inplace=True) + df.rename(columns={c: c+"_lag_0" for c in init_cols}, inplace=True) + return df + +def add_all_lags(cpo_df_interp, num_lags=5, do_date=False, 
fillna="bfill", only_do_col=None): + for group, cp_df_interp in tqdm(cpo_df_interp.items(), desc="add_all_lags"): + cp_df_interp_l = add_lags(cp_df_interp, num_lags=num_lags, do_date=do_date, fillna=fillna, only_do_col=only_do_col) + yield group, cp_df_interp_l + + + +def get_grid_pp(DIR): + grid_dir = os.path.join(DIR,"controller-export/power-grid/") + grid_files = [f for f in os.listdir(grid_dir) if os.path.isfile(os.path.join(grid_dir, f))] + + for f in grid_files[:10]: + regex_grid = re.compile(r"^WALL\-[\-\dT\+]+__SIM\-([\-\dT\+]+)\.powerowl\.p\.gz$") + #WALL-2023-12-27T15-50-38-049544+00-00__SIM-2023-11-03T02-28-11-789274+00-00.powerowl.p.gz + result = regex_grid.search(f) + if result: + f_time = "".join(result.group(1).rsplit("-", 1)) + f_time = pd.to_datetime(f_time, format="%Y-%m-%dT%H-%M-%S-%f%z") #2023-11-23T19-05-29-585438+00-00 + + ff = os.path.join(grid_dir, f) + p_grid = load_single_feat(ff) + pd.reset_option('^display.', silent=True) + + pg = PandaPowerGridModel() + pg.from_primitive_dict(p_grid) + + pp=pg.to_external() + #print(pp) + return pp + logger.error(f"no pp grid found in {grid_dir}") + return None + + + +def interpolate_sim_time(ts, target): + target = target.timestamp() + if target in ts.index: + return ts[target] + ts1 = ts.sort_index() + b = (ts1.index > target).argmax() # index of first entry after target + s = ts1.iloc[b-1:b+1] + # Insert empty value at target time. + s = s.reindex(list(s.index.values) + [target]) + return s.interpolate(method='index', limit_direction="both").loc[target] + +def get_grids_json(DIR, wall_sim_map, start_time=None): + grid_dir = os.path.join(DIR,"power_grid_exports/") + grid_files = [f for f in os.listdir(grid_dir) if os.path.isfile(os.path.join(grid_dir, f))] + + pps=[] + for f in tqdm(grid_files[:], desc="get_grids_json"): + #power_grid_2024-02-03T19-04-14-008259+00-00.json + regex_grid = re.compile(r"^power_grid_([\-\dT\+]+)\.json$") + result = regex_grid.search(f) + if result: + f_time = "".join(result.group(1).rsplit("-", 1)) + f_time = pd.to_datetime(f_time, format="%Y-%m-%dT%H-%M-%S-%f%z") #2023-11-23T19-05-29-585438+00-00 + sim_time=interpolate_sim_time(wall_sim_map, pd.to_datetime(f_time)) + if start_time is not None: + if np.isnan(sim_time): + continue + s_t = datetime.utcfromtimestamp(sim_time) + #print(f"{start_time} {s_t}") + if s_t < start_time.tz_localize(None): + continue + + ff = os.path.join(grid_dir, f) + + #p_grid = load_single_feat(ff) + with open(ff, 'r') as pg_f: + p_grid = json.load(pg_f) + + p_grid["simtime"] = sim_time + pd.reset_option('^display.', silent=True) + pps.append(p_grid) + #yield p_grid + return pps + + +def get_elems_attatched_to_bus(pp, group_n): + ret=[] + lines=pp["line"] + try: + lines=lines[lines["from_bus"] < lines["to_bus"]] + if (lines["to_bus"] == group_n).any(): + line = "line."+str(lines[lines["to_bus"] == group_n].sort_values(by="from_bus", ascending=True).index[0]) + ret.append(line) + except Exception as e: + logger.error(group_n) + logger.error(pp["line"]) + logger.error(lines) + raise e + + elems = ["load", "sgen", "storage"] #, "storage" only for non CP storages...; loads not included in default wattson + for e in elems: + try: + el=pp[e] + if e == "storage": + el = el[el["type"] == "CP"] + # print(el) + # exit() + if (el["bus"] == group_n).any(): + for el_idx in el[el["bus"] == group_n].index: + el = e+"."+str(el_idx) + ret.append(el) + except Exception as e: + logger.error(group_n) + logger.error(el) + raise e + return ret + + +def get_ts_sim_map(DIR): + grid_dir = 
os.path.join(DIR,"controller-export/power-grid/") + grid_files = [f for f in os.listdir(grid_dir) if os.path.isfile(os.path.join(grid_dir, f))] + + lst=[] + ind=[] + for f in grid_files[:]: + regex_grid = re.compile(r"^WALL\-([\-\dT\+]+)__SIM\-([\-\dT\+]+)\.powerowl\.p\.gz$") + #WALL-2023-12-27T15-50-38-049544+00-00__SIM-2023-11-03T02-28-11-789274+00-00.powerowl.p.gz + result = regex_grid.search(f) + if result: + wall_time = "".join(result.group(1).rsplit("-", 1)) + wall_time = pd.to_datetime(wall_time, format="%Y-%m-%dT%H-%M-%S-%f%z").timestamp() #2023-11-23T19-05-29-585438+00-00 + + sim_time = "".join(result.group(2).rsplit("-", 1)) + sim_time = pd.to_datetime(sim_time, format="%Y-%m-%dT%H-%M-%S-%f%z").timestamp() #2023-11-23T19-05-29-585438+00-00 + + lst.append(sim_time) + ind.append(wall_time) + + #print(pp) + x = pd.Series(lst, index = ind) + return x + +def get_grid_measurements_from_export(DIR, pp, bus_ns=[], target_elem="load", target_var=".MEASUREMENT.active_power", start_time=None, time_delta_m=60): + plot_l_dict=defaultdict(lambda:list()) + if not bus_ns: + if "bus" in pp[target_elem]: + bus_ns = sorted(set(i for i in pp[target_elem]["bus"].values)) + else: + bus_ns = sorted(set(i for i in pp["load"]["bus"].values)) + logger.debug(f"no bus list provided, using all: {bus_ns}") + + wall_sim_map=get_ts_sim_map(DIR).sort_index() + pps=get_grids_json(DIR, wall_sim_map, start_time=start_time) + for bus_n in bus_ns: + + elems = get_elems_attatched_to_bus(pp, bus_n) + if target_elem == "bus": + elems = ["bus."+str(bus_n)] + # print(elems) + for elem in [e for e in elems if target_elem in e]: + # print(elem) + # exit() + + plot_t=[] + #plot_b=[] + plot_l=[] + for grid in pps: + if not np.isnan(grid["simtime"]): + plot_t.append(datetime.utcfromtimestamp(grid["simtime"]) + pd.Timedelta(minutes=time_delta_m)) + #plot_b.append(grid["values"]["bus."+str(bus_n)".MEASUREMENT.active_power"]) + plot_l.append(grid["values"][elem+target_var]) + else: + logger.debug(f"nan simtime for {grid['timestamp']}") + #plot_b_s=pd.Series(plot_b, index = plot_t).sort_index() + plot_l_s=pd.Series(plot_l, index = plot_t, name="grid_expo_"+elem+target_var).sort_index() + plot_l_dict[bus_n].append(plot_l_s) + return plot_l_dict + + +def save_feats(feat, OutDataDIR, t_name): + Path(OutDataDIR+"/feats/"+t_name+"/").mkdir(parents=True, exist_ok=True) + + groups=[k for k in feat.keys()] + for group in tqdm(groups, desc="save_feats", disable=(len(groups) == 1)): + fdf_dict = feat[group] + logger.debug(fdf_dict["cp_g_df"].columns) + with gzip.open(OutDataDIR+"/feats/"+t_name+"/"+group+".gz", "wb") as f: + pickle.dump(fdf_dict, f) + del feat[group] + gc.collect() + +def load_feats_len(OutDataDIR, t_name): + x_path = OutDataDIR+"/feats/"+t_name+"/" + onlyfiles = [f for f in os.listdir(x_path) if os.path.isfile(os.path.join(x_path, f))] + return len(onlyfiles) + +# def load_feats2(OutDataDIR, t_name): +# x_path = OutDataDIR+"/feats/"+t_name+"/" +# onlyfiles = [f for f in os.listdir(x_path) if os.path.isfile(os.path.join(x_path, f))] +# for fi in onlyfiles: +# fdf_dict = load_single_feat(x_path+fi) +# yield (fi.replace(".gz", ""), fdf_dict) + + + + +def load_feats(OutDataDIR, t_name): + for fi, fi_p in iter_feats(OutDataDIR, t_name): + fdf_dict = load_single_feat(fi_p) + yield (fi, fdf_dict) + +def iter_feats(OutDataDIR, t_name): + x_path = OutDataDIR+"/feats/"+t_name+"/" + onlyfiles = [f for f in os.listdir(x_path) if os.path.isfile(os.path.join(x_path, f))] + for fi in onlyfiles: + yield (fi.replace(".gz", ""), x_path+fi) 
+ +def load_single_feat(file_p): + with gzip.open(file_p, "rb") as f: + fdf_dict = pickle.load(f) + return fdf_dict + +def load_feats_CPg(OutDataDIR, t_name, cp_g): + fi = OutDataDIR+"/feats/"+t_name+"/"+cp_g+".gz" + if not os.path.isfile(os.path.join(fi)): + logger.error(f"file {fi} not found") + return None + + with gzip.open(fi, "rb") as f: + fdf_dict = pickle.load(f) + return fdf_dict + +def prune_feats(df_dict, feat_cases=None): + if feat_cases is None or "all" in feat_cases: + return + + feat_cols = df_dict["feat_cols"] + + + add_expo = [feat_case for feat_case in feat_cases if "add_grid_load_expo_" in feat_case] + if add_expo: + if len(add_expo) > 1: + logger.warning(f"multiple add_grid_load_expo_ sets provided: {add_expo}; using first") + add_expo = add_expo[0] + sub_case="static" + if "expo_rnd" in add_expo: + sub_case="rnd" + add_expo_fac_min = int(add_expo.split("_")[-1]) + add_expo_fac_max = min(add_expo_fac_min+10, 100) + #print(add_expo, add_expo_fac_min, add_expo_fac_max) + expo_cols = [c for c in df_dict["cp_g_df"].columns if "grid_expo_" in c and sub_case+"_fac."+str(add_expo_fac_min) in c] + #print(expo_cols) + #print(feat_cols) + feat_cols = feat_cols + expo_cols + #print(feat_cols) + #exit() + + if "add_bus_relations" in feat_cases: + for add_c in [c for c in df_dict["cp_g_df"].columns if ".bus_and_sgen." in c or ".no_ev_load." in c]: + feat_cols.append(add_c) + + for feat_case in feat_cases: + if feat_case == "no_cps": + feat_cols = [f for f in feat_cols if "CP_" not in f] + elif feat_case == "no_cps_but_speed": + feat_cols = [f for f in feat_cols if "CP_" not in f or "charge_speed" in f] + + elif feat_case == "no_grid": + #feat_cols = [f for f in feat_cols if "bus." not in f] + feat_cols = [f for f in feat_cols if "grid_est_" not in f] + feat_cols = [f for f in feat_cols if "grid_meas_" not in f] + elif feat_case == "no_grid_storage": + #feat_cols = [f for f in feat_cols if "bus." not in f] + feat_cols = [f for f in feat_cols if not ("grid_est_" in f and ".storage." in f)] + feat_cols = [f for f in feat_cols if not ("grid_meas_" in f and ".storage." in f)] + elif feat_case == "no_grid_line": + #feat_cols = [f for f in feat_cols if "bus." not in f] + feat_cols = [f for f in feat_cols if not ("grid_est_" in f and ".line." in f)] + feat_cols = [f for f in feat_cols if not ("grid_meas_" in f and ".line." in f)] + elif feat_case == "no_grid_sgen": + #feat_cols = [f for f in feat_cols if "bus." not in f] + feat_cols = [f for f in feat_cols if not ("grid_est_" in f and ".sgen." in f)] + feat_cols = [f for f in feat_cols if not ("grid_meas_" in f and ".sgen." in f)] + elif feat_case == "no_grid_bus_rel_to_pred": + pred_col = df_dict["pred_col"] + feat_cols = [f for f in feat_cols if not ("grid_meas_" in f and ".bus." 
in f and "_relation_to_"+pred_col.replace("_lag_0", "") in f)] + + elif feat_case == "no_est": + feat_cols = [f for f in feat_cols if "grid_est_" not in f] + elif feat_case == "no_meas": + feat_cols = [f for f in feat_cols if "grid_meas_" not in f] + + elif feat_case == "no_time_diff": + feat_cols = [f for f in feat_cols if "time_diff_" not in f] + + elif feat_case == "only_pred_lag": + pred_col = df_dict["pred_col"] + feat_cols = [f for f in feat_cols if "_lag_0" in f or f.startswith(pred_col.replace("_lag_0", "_lag_"))] + + elif feat_case == "only_norm": + feat_cols = [f for f in feat_cols if "_norm" in f] + elif feat_case == "only_norm_but_pred": + pred_col = df_dict["pred_col"] + feat_cols = [f for f in feat_cols if "_norm" in f or f.startswith(pred_col.replace("_lag_0", "_lag_"))] + elif feat_case == "no_norm": + feat_cols = [f for f in feat_cols if "_norm" not in f] + + elif feat_case == "no_norm1": + feat_cols = [f for f in feat_cols if "_cp_g_norm" not in f and "_cp_norm" not in f] + elif feat_case == "no_norm2": + feat_cols = [f for f in feat_cols if "_group_norm" not in f] + + elif feat_case == "no_date1": + feat_cols = [f for f in feat_cols if "date_exog_cp_g_" not in f and "date_exog_cp_" not in f] + elif feat_case == "no_date2": + feat_cols = [f for f in feat_cols if "date_exog_group_" not in f] + + elif feat_case == "no_hour_date": + feat_cols = [f for f in feat_cols if not ("date_exog_" in f and "hour" in f and "discretize" not in f)] + elif feat_case == "no_hour_only_date": + feat_cols = [f for f in feat_cols if not ("date_exog_" in f and "discretize_hour_only" in f)] + elif feat_case == "no_hour_day_date": + feat_cols = [f for f in feat_cols if not ("date_exog_" in f and "discretize_hour_day" in f)] + elif feat_case == "no_day_of_week_date": + feat_cols = [f for f in feat_cols if not ("date_exog_" in f and "dayofweek" in f)] + elif feat_case == "no_day_is_work_date": + feat_cols = [f for f in feat_cols if not ("date_exog_" in f and "discretize_day_is_work" in f)] + elif feat_case == "no_hour_balancing_date": + feat_cols = [f for f in feat_cols if not ("date_exog_" in f and "discretize_hour_balancing" in f)] + + + + df_dict["feat_cols"] = feat_cols diff --git a/ids/features_clf.py b/ids/features_clf.py new file mode 100644 index 0000000000000000000000000000000000000000..c979a29c373db54ef323467b96fd1b1be31416ed --- /dev/null +++ b/ids/features_clf.py @@ -0,0 +1,1357 @@ + +import ast +from collections import defaultdict +from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor +from copy import deepcopy +from datetime import datetime +import gc +import gzip +import hashlib +import json +import multiprocessing +import os +from pathlib import Path +import pickle +import sys +import threading +import time +import warnings +import pandas as pd +from sklearn.covariance import EllipticEnvelope +from sklearn.discriminant_analysis import StandardScaler +from sklearn.ensemble import IsolationForest +from sklearn.exceptions import ConvergenceWarning, UndefinedMetricWarning +from sklearn.metrics import accuracy_score +from sklearn.calibration import cross_val_predict +from sklearn.metrics import confusion_matrix, mean_squared_error, precision_recall_fscore_support +from sklearn.model_selection import ParameterGrid +from sklearn.neighbors import LocalOutlierFactor +from sklearn.pipeline import make_pipeline +from sklearn.svm import OneClassSVM +from tqdm import tqdm + + +from joblib import parallel_backend +from threadpoolctl import threadpool_limits + +import re + +import 
logging + +from features_aux import add_lags, get_date_exog, load_feats, load_feats_CPg, load_feats_len, normalize_cols, add_all_lags +from regression import get_cp_group_eval_dicts + +logger = logging.getLogger("WATTSON_EV_IDS.FeaturesClf") + +m_dict = multiprocessing.Manager() +CLF_CACHE=m_dict.dict() +#CLF_CACHE=dict() +with open("ids.conf", 'r') as f: + conf = json.load(f) + NUM_THREADS = conf["NUM_THREADS"] + + +feat_warn1=0 +feat_warn2=0 +def save_clf_data(feat, out_d, out_f): + #cp_g_dict = {'reg': 'LinearSVR', 'shifts': '5', 'file': 'data/year/predictions/DSO.all/CP_1_LinearSVR_5.csv.gz', 'features': ['all']} + Path(out_d).mkdir(parents=True, exist_ok=True) + + logger.debug(feat.columns) + feat.to_csv(out_f) + del feat + gc.collect() + +def add_grid_meas_relations(clf_df, cp_g_df, pred_col, speed_preds): + get_rel_expo_cols = [c for c in cp_g_df.columns if ".load." in c and "_norm" not in c and "grid_expo_" in c and ".active_power" in c and "_relation_to_" not in c] # and "10_" in c + get_rel_expo_cols_base = [c for c in cp_g_df.columns if ".load." in c and "_norm" not in c and "grid_expo_" in c and ".active_power_lag_0" in c and "_relation_to_" not in c]#. + get_rel_expo_cols_map=defaultdict(lambda:list()) + + if len(get_rel_expo_cols_base) > 1: #'grid_expo_0.0.load.14.active_power_lag_0', 'grid_expo_0.0.load.6.active_power_lag_0', + load_ns = [c.split(".")[3] for c in get_rel_expo_cols_base] + for rel_expo in get_rel_expo_cols: + for load_n in load_ns: + if ".load."+load_n+"." in rel_expo: + rel_expo_base = rel_expo.replace(".load."+load_n+".",".load.total.") + get_rel_expo_cols_map[rel_expo_base].append(rel_expo) + for rel_expo_base,rel_expos in get_rel_expo_cols_map.items(): + cp_g_df[rel_expo_base]=0 + get_rel_expo_cols.append(rel_expo_base) + for rel_expo in rel_expos: + cp_g_df[rel_expo_base]+=cp_g_df[rel_expo] + + + # print([c for c in clf_df.columns]) + # print([c for c in cp_g_df.columns if "_norm" not in c and "grid_" in c and "sgen" in c]) + # print([c for c in cp_g_df.columns if "_norm" not in c and "grid_meas_" in c and "sgen" in c and ".active_power_lag_0" in c and "_relation_to_" not in c]) + # print([c for c in cp_g_df.columns if "_norm" not in c and "grid_meas_" in c and "bus" in c and ".active_power_lag_0" in c and "_relation_to_" not in c]) + # print(get_rel_expo_cols) + #print(cp_g_df) + + new_cols=[] + + #max ev power = bus + sgen - load(s) + bus_cols = [c for c in cp_g_df.columns if ".bus." in c and ".active_power" in c and "_norm" not in c and "_lag_0" in c and "_relation_to_" not in c] + bus_col = bus_cols[0] + sgen_cols = [c for c in cp_g_df.columns if ".sgen." 
in c and ".active_power" in c and "_norm" not in c and "_lag_0" in c and "_relation_to_" not in c] + + name_b_total = bus_col.replace("active_power","total_power") + clf_df[name_b_total] = cp_g_df[bus_col] + for sgen_col in sgen_cols: + cp_g_df_sgen_col = cp_g_df[sgen_col] + clf_df[name_b_total] += cp_g_df_sgen_col #grid_meas_585.10030.bus.11.active_power_lag_0 + new_cols.append(name_b_total) + + non_ev_power = bus_col.replace("active_power","non_ev_power") + clf_df[non_ev_power] = clf_df[name_b_total] - cp_g_df[pred_col] # (bus + sgen) - ev load = loads + new_cols.append(non_ev_power) + + non_ev_power_relation_to_bus = bus_col.replace("active_power","non_ev_power_relation_to_bus") + clf_df[non_ev_power_relation_to_bus] = clf_df[non_ev_power] / clf_df[name_b_total] + new_cols.append(non_ev_power_relation_to_bus) + + ev_power_relation_to_bus = bus_col.replace("active_power","only_ev_power_relation_to_bus") + clf_df[ev_power_relation_to_bus] = cp_g_df[pred_col] / clf_df[name_b_total] + new_cols.append(ev_power_relation_to_bus) + + for rec in get_rel_expo_cols: + name = rec.replace("_lag_0","")+"_max_ev_power"+"_lag_0" + clf_df[name] = clf_df[name_b_total] - cp_g_df[rec] + new_cols.append(name) + + name2 = rec.replace("_lag_0","")+"_max_ev_power_relation_to_bus"+"_lag_0" + clf_df[name2] = clf_df[name] / clf_df[name_b_total] + new_cols.append(name2) + + name2 = rec.replace("_lag_0","")+"_non_ev_power_relation_to_expo"+"_lag_0" + clf_df[name2] = cp_g_df[rec] / clf_df[non_ev_power] + new_cols.append(name2) + + + name = rec.replace("_lag_0","")+"_relation_to_"+pred_col.replace("_lag_0","")+"_lag_0" #relation: grid load measurement to received charge speed + clf_df.loc[(cp_g_df[rec] != 0), name] = cp_g_df[pred_col] / cp_g_df[rec] + clf_df.loc[(cp_g_df[rec] == 0), name] = 0 + new_cols.append(name) + + for speed_pred in speed_preds: + name = rec.replace("_lag_0","")+"_relation_to_"+speed_pred.replace("_lag_0","")+"_lag_0" #relation: grid load measurement to predicted charge speeds + clf_df.loc[(cp_g_df[rec] != 0), name] = clf_df[speed_pred] / cp_g_df[rec] + clf_df.loc[(cp_g_df[rec] == 0), name] = 0 + new_cols.append(name) + + if len(bus_cols) >= 1: + if len(bus_cols) > 1: + logger.warning(f"len(bus_cols) > 1: {bus_cols}") + bus_col = bus_cols[0] + name = bus_col.replace("_lag_0","")+"_relation_to_"+rec.replace("_lag_0","")+"_lag_0" #relation: grid load measurement to bus measurement + clf_df.loc[(cp_g_df[rec] != 0), name] = cp_g_df[bus_col] / cp_g_df[rec] + clf_df.loc[(cp_g_df[rec] == 0), name] = 0 + new_cols.append(name) + else: + logger.warning(f"bus_cols empty: {[c for c in cp_g_df.columns if 'bus' in c]}") + + + clf_df = clf_df.copy() + + # print(clf_df) + # #print(new_cols) + # exit() + + if False: + new_cols_df = clf_df[new_cols] + normalize_cols(new_cols_df, "_prune_norm") + for c in new_cols_df.columns: + if "_norm" in c and c not in cp_g_df.columns: + new_c = c.replace("_lag_0_prune_norm", "_prune_norm_lag_0") + clf_df[new_c] = new_cols_df[c] + clf_df = clf_df.copy() + return clf_df + + +def get_clf_feat_dfs(cp_g_pred, cp_g_df, pred_col): + with warnings.catch_warnings(): + warnings.simplefilter('ignore', pd.errors.PerformanceWarning) + pred_cols = [c for c in cp_g_pred.columns if "prediction_" in c] + clf_df = cp_g_pred[pred_cols].copy() + clf_df["prediction_mean"] = cp_g_pred[[c for c in cp_g_pred.columns if "prediction_" in c]].mean(axis=1) + clf_df["prediction_min"] = cp_g_pred[[c for c in cp_g_pred.columns if "prediction_" in c and "_mean" not in c]].min(axis=1) + 
clf_df["prediction_max"] = cp_g_pred[[c for c in cp_g_pred.columns if "prediction_" in c and "_mean" not in c]].max(axis=1) + if len(pred_cols) >= 2: + clf_df["prediction_mean_0_1"] = cp_g_pred[[c for c in cp_g_pred.columns if "prediction_0" in c or "prediction_1" in c]].mean(axis=1) + if len(pred_cols) >= 3: + clf_df["prediction_mean_0_1_2"] = cp_g_pred[[c for c in cp_g_pred.columns if "prediction_0" in c or "prediction_1" in c or "prediction_2" in c]].mean(axis=1) + clf_df["prediction_mean_1_2"] = cp_g_pred[[c for c in cp_g_pred.columns if "prediction_1" in c or "prediction_2" in c]].mean(axis=1) + if len(pred_cols) >= 4: + clf_df["prediction_mean_2_3"] = cp_g_pred[[c for c in cp_g_pred.columns if "prediction_2" in c or "prediction_3" in c]].mean(axis=1) + if len(pred_cols) >= 5: + clf_df["prediction_mean_3_4"] = cp_g_pred[[c for c in cp_g_pred.columns if "prediction_3" in c or "prediction_4" in c]].mean(axis=1) + # print(clf_df) + # exit() + pred_col_df = cp_g_df[pred_col].loc[clf_df.index] + # if len(clf_df) != len(pred_col_df): + # print(clf_df) + # print(pred_col_df) + for c in clf_df.columns: + if "_mean" in c or "_min" in c or "_max" in c: + clf_df[c+"_diff"] = clf_df[c] - pred_col_df + clf_df[c+"_diff_abs"] = (clf_df[c] - pred_col_df).abs() + #clf_df[c+"_diff_rmse"] = mean_squared_error(clf_df[c], pred_col_df, squared=False) + clf_df = clf_df.copy() + + clf_df = add_grid_meas_relations(clf_df, cp_g_df, pred_col, ["prediction_mean", "prediction_min", "prediction_max"]) #pred_cols+ + # print(clf_df) + # for c in clf_df.columns: + # print(c) + # exit() + + normalize_cols(clf_df, "_clf_norm") + #print(clf_df) + return clf_df + + +def get_clf_feat_file_dicts(OutDataDIR): + regex = re.compile(r"^(CP_\d+)_(\w+)_([\d\w]+).csv.gz$") + + ret_d=defaultdict(lambda:defaultdict(lambda:list())) + DIR=OutDataDIR+"/clf_feats/" + sub_folders = [name for name in os.listdir(DIR) if os.path.isdir(os.path.join(DIR, name))] + for actor in sub_folders: + if ".bak" in actor: + logger.warning(f'skiping {OutDataDIR+"/clf_feats/"+actor}') + continue + if len(actor.split(".")) > 1: + features = actor.split(".")[1:] + else: + features=["all"] + logger.warning(f"assuming all features for {actor}") + x_path=OutDataDIR+"/clf_feats/"+actor + onlyfiles = [f for f in os.listdir(x_path) if os.path.isfile(os.path.join(x_path, f))] + for f in onlyfiles: + result = regex.search(f) + if result: + result.group(1) + ret_d[actor][result.group(1)].append({"reg":result.group(2),"shifts":result.group(3),"file":os.path.join(x_path, f),"features":features}) + else: + logger.error(f"unk file {f}") + continue + return ret_d + + +def get_clf_is_atk_dfs(OutDataDIR, group): + is_atk_cols = ['meter_value_sampled_value_lag_0', 'custom_data_meter_no_atk_lag_0', + 'custom_data_energy_interval_lag_0', 'custom_data_original_energy_interval_lag_0', + 'custom_data_average_power_lag_0', 'custom_data_original_average_power_lag_0', + 'meter_diff_lag_0', 'custom_data_meter_diff_lag_0', + 'charge_speed_lag_0', 'custom_data_charge_speed_lag_0','is_attack_lag_0'] + + is_atk_dfs = dict() + + cpos = [f for f in os.listdir(OutDataDIR+"/feats/") if os.path.isdir(os.path.join(OutDataDIR+"/feats/", f)) and f.startswith("CPO_")] + for actor_prefix in sorted(cpos): + act_feat = load_feats_CPg(OutDataDIR, actor_prefix, group) #generator + df = act_feat["cp_g_df"] + is_atk_dfs[actor_prefix] = df[is_atk_cols].copy() + + return is_atk_dfs + + +def _interp(s, targets, col="charge_speed_lag_0", method="time"): + o_idx = list(s.index.values) + n_idx = 
list(targets.values) # if v not in o_idx + f_idx=set(o_idx + n_idx) + s2 = s.reindex(pd.to_datetime(list(f_idx))) + if col is None: + return s2.interpolate(method=method).loc[targets] + else: + return s2[col].interpolate(method=method).loc[targets] + +def _interp1(ts, target, col="charge_speed_lag_0"): + if target in ts.index: + return ts[col].loc[target] + if target < ts.index.min() or target > ts.index.max(): #before or after table + return 0.0 + ts1 = ts.sort_index() + b = (ts1.index > target).argmax() # index of first entry after target + s = ts1.iloc[b-1:b+1] + if len(s)==0: #before or after table + return 0.0 + # Insert empty value at target time. + s = s.reindex(pd.to_datetime(list(s.index.values) + [target])) + return s[col].interpolate(method='time').loc[target] + +def get_clf_is_atk_grid(cp_group_grid_power, is_atk_dfs, target, col="charge_speed_lag_0", col_s=None): + val = 0 + val_s = 0 + if col_s is None: + col_s = "custom_data_" + col + + val += _interp(cp_group_grid_power, target, col=None) + for actor, is_atk_df in is_atk_dfs.items(): + val_s += _interp(is_atk_df, target, col=col_s) + + return (val - val_s).rename(cp_group_grid_power.name+"_diff_to_"+col_s) + +def get_clf_is_atk(is_atk_dfs, target, col="charge_speed_lag_0", col_s=None): + val = 0 + val_s = 0 + if col_s is None: + col_s = "custom_data_" + col + for actor, is_atk_df in is_atk_dfs.items(): + val += _interp(is_atk_df, target, col=col) + if col != "is_attack_lag_0": + val_s += _interp(is_atk_df, target, col=col_s) + return (val - val_s).rename(col+"_diff") + +def get_param_grid_len(clf_type): + return len(ParameterGrid(get_param_grid_from_string(clf_type))) + +def get_param_grid_from_string(clf_type, train_contamination=False): + if clf_type == "LocalOutlierFactor": + if train_contamination == True: + param_grid = {'algorithm': ['auto'], 'leaf_size': [30,20,40], 'metric': ['minkowski'], 'metric_params': [None], 'n_jobs': [NUM_THREADS], 'n_neighbors': [20,10,30], 'novelty': [True], 'p': [1,2], 'contamination': ['auto']} + else: + param_grid = {'algorithm': ['auto'], 'leaf_size': [30,20,40], 'metric': ['minkowski'], 'metric_params': [None], 'n_jobs': [NUM_THREADS], 'n_neighbors': [20,10,30], 'novelty': [True], 'p': [1,2], 'contamination': [1e-323]} + elif clf_type == "OneClassSVM": + param_grid = {'cache_size': [7000], 'kernel': ['rbf', 'sigmoid'], 'max_iter': [200], 'nu': [0.5,0.2,0.7], 'shrinking': [True], 'tol': [0.001], 'verbose': [False]} + # param_grid1 = {'cache_size': [7000], 'coef0': [0.0, 0.5, 2.0], 'gamma': ['scale'], 'kernel': ['poly', 'sigmoid'], 'max_iter': [-1], 'nu': [0.5,0.2,0.7], 'shrinking': [True,False], 'tol': [0.001,0.01,0.0001], 'verbose': [False]} + # param_grid2 = {'cache_size': [7000], 'degree': [3,6,9], 'gamma': ['scale'], 'kernel': ['poly'], 'max_iter': [-1], 'nu': [0.5,0.2,0.7], 'shrinking': [True,False], 'tol': [0.001,0.01,0.0001], 'verbose': [False]} + # param_grid3 = {'cache_size': [7000], 'kernel': ['rbf', 'linear'], 'max_iter': [-1], 'nu': [0.5,0.2,0.7], 'shrinking': [True,False], 'tol': [0.001,0.01,0.0001], 'verbose': [False]} + # param_grid = [param_grid1, param_grid2, param_grid3] + elif clf_type == "EllipticEnvelope": + # param_grid = {'assume_centered': [False,True], 'contamination': [1e-323], 'random_state': [12345], 'store_precision': [True], 'support_fraction': [None,0.1,0.5,0.8,0.9,0.99], "random_state": [12345]} #The covariance matrix of the support data is equal to 0, try to increase support_fractio + if train_contamination == True: + param_grid = {'assume_centered': 
[False,True], 'contamination': [0.1], 'random_state': [12345], 'store_precision': [True], 'support_fraction': [0.7,0.8,0.9,0.99]} + else: + param_grid = {'assume_centered': [False,True], 'contamination': [1e-323], 'random_state': [12345], 'store_precision': [True], 'support_fraction': [0.7,0.8,0.9,0.99]} + elif clf_type == "IsolationForest": + if train_contamination == True: + param_grid = {'bootstrap': [False,True], 'contamination': ['auto'], 'max_features': [1.0,0.9], 'max_samples': ['auto',0.9,0.8], 'n_estimators': [50,100,200], 'n_jobs': [NUM_THREADS], 'random_state': [12345], 'verbose': [0], 'warm_start': [False]} + else: + param_grid = {'bootstrap': [False,True], 'contamination': [1e-323], 'max_features': [1.0,0.9], 'max_samples': ['auto',0.9,0.8], 'n_estimators': [50,100,200], 'n_jobs': [NUM_THREADS], 'random_state': [12345], 'verbose': [0], 'warm_start': [False]} + else: + logger.error(f"unk clf_type {clf_type}") + param_grid = {} + return param_grid + +def get_clfs_from_string(clf_type, config=None, train_contamination=False): + param_grid = get_param_grid_from_string(clf_type, train_contamination=train_contamination) + if config is None: + pg = ParameterGrid(param_grid) + else: + pg = [config] + for p in pg: + if clf_type == "LocalOutlierFactor": + yield LocalOutlierFactor(**p) + elif clf_type == "OneClassSVM": + yield OneClassSVM(**p) + elif clf_type == "EllipticEnvelope": + yield EllipticEnvelope(**p) + elif clf_type == "IsolationForest": + yield IsolationForest(**p) + else: + logger.error(f"unk clf_type {clf_type}") + yield None + + + +def get_clf_result_output_conc(best_d_ag, cp_g_dict, atk_cp_g_dict, train_df, atk_df, TRAIN_START_DATE, ATK_START_DATE, atk_labels_df, pbar=None, allow_diff_shifts=True): + res_l=[] + th_i=0 + # data_lock=threading.Lock() + # data_lock2=threading.Lock() + # pbar2=None + # with ThreadPoolExecutor(NUM_THREADS) as pool: + m = multiprocessing.Manager() + data_lock=m.Lock() + data_lock2=m.Lock() + pbar2=pbar + pbar=None + with ProcessPoolExecutor(NUM_THREADS) as pool: #multiprocessing + results=[] + for clf, best_dict in best_d_ag.items(): + th_i+=1 + if th_i % 2 == 0: + with data_lock: + results.append( pool.submit(get_clf_result_output, clf, best_dict, cp_g_dict, atk_cp_g_dict, train_df, atk_df, TRAIN_START_DATE, ATK_START_DATE, atk_labels_df, pbar, allow_diff_shifts, data_lock) ) + else: + with data_lock2: + results.append( pool.submit(get_clf_result_output, clf, best_dict, cp_g_dict, atk_cp_g_dict, train_df, atk_df, TRAIN_START_DATE, ATK_START_DATE, atk_labels_df, pbar, allow_diff_shifts, data_lock2) ) + time.sleep(0.05) + + if th_i % NUM_THREADS == 0 or th_i == 2 or th_i == 4: + for r in results: + timestamp1 = time.time() + clf_results = r.result() + timestamp2 = time.time() + logger.debug("r.result() took %.2f seconds" % (timestamp2 - timestamp1)) + if pbar2 is not None: + pbar2.update(1) + if clf_results is not None: + res_l.append(clf_results) + results=[] + gc.collect() + + + for r in results: + clf_results = r.result() + if pbar2 is not None: + pbar2.update(1) + if clf_results is not None: + res_l.append(clf_results) + + return res_l + +def do_get_clf_is_atk_conc(pool, atk_df, ATK_START_DATE, is_atk_dfs, cp_group_grid_power, pbar=None): + return pool.submit(do_get_clf_is_atk_wrap, atk_df, ATK_START_DATE, is_atk_dfs, cp_group_grid_power, pbar=pbar) + +def do_get_clf_is_atk_wrap(atk_df, ATK_START_DATE, is_atk_dfs, cp_group_grid_power, pbar=None): + end_d = max([v.index.max() for v in is_atk_dfs.values()]) + atk_df = 
atk_df.loc[ATK_START_DATE:end_d] + # print(atk_df) + # print([k for k in is_atk_dfs.keys()]) + # print(is_atk_dfs["CPO_0"]) + clf_is_atk = do_get_clf_is_atk(is_atk_dfs, atk_df, cp_group_grid_power) + if pbar is not None: + pbar.update(1) + return clf_is_atk + +def do_get_clf_is_atk(is_atk_dfs, atk_full_clf_feat, cp_group_grid_power): + clf_is_atk1 = get_clf_is_atk(is_atk_dfs, atk_full_clf_feat.index) + clf_is_atk2 = get_clf_is_atk(is_atk_dfs, atk_full_clf_feat.index, col="is_attack_lag_0") + # print(cp_group_grid_power) + # print(is_atk_dfs) + clf_is_atk3 = get_clf_is_atk_grid(cp_group_grid_power, is_atk_dfs, atk_full_clf_feat.index) + clf_is_atk4 = get_clf_is_atk_grid(cp_group_grid_power, is_atk_dfs, atk_full_clf_feat.index, col_s="charge_speed_lag_0") + + #clf_is_atk = clf_is_atk1.to_frame().join(clf_is_atk2) + #print(clf_is_atk) + clf_is_atk = pd.concat([clf_is_atk1,clf_is_atk2,clf_is_atk3,clf_is_atk4 ],axis=1) + #print(clf_is_atk) + #exit() + clf_is_atk = clf_is_atk.loc[atk_full_clf_feat.index] + clf_is_atk["is_attack1"] = 1 + clf_is_atk["is_attack2"] = 1 + clf_is_atk["is_attack3"] = 1 + clf_is_atk["is_attack4"] = 1 + clf_is_atk.loc[clf_is_atk[clf_is_atk1.name] != 0, "is_attack1"] = -1 #"charge_speed_lag_0_diff" + clf_is_atk.loc[clf_is_atk[clf_is_atk2.name] != 0, "is_attack2"] = -1 #"is_attack_lag_0_diff" + clf_is_atk.loc[clf_is_atk[clf_is_atk3.name] != 0, "is_attack3"] = -1 #grid vs should speed + clf_is_atk.loc[clf_is_atk[clf_is_atk4.name] != 0, "is_attack4"] = -1 #grid vs reported speed + # print(clf_is_atk) + # print(clf_is_atk['is_attack1'].equals(clf_is_atk['is_attack2'])) + + # clf_is_atk["is_attack_d"] = 1 + # clf_is_atk.loc[clf_is_atk["is_attack1"] != clf_is_atk["is_attack2"], "is_attack_d"] = -1 + + ##print(clf_is_atk[clf_is_atk["is_attack_d"] == 1]) + # print(clf_is_atk[clf_is_atk["is_attack_d"] == -1]) #TODO: why the diff + + # print(atk_full_clf_feat) + # print(clf_is_atk) + # print(y_p) + return clf_is_atk + + +def get_atk_full_clf_feat(atk_clf_feat_file, clf_f, atk_df, atk_labels_df, ATK_START_DATE, allow_diff_shifts=True): + atk_cp_g_clf_feat = pd.read_csv(atk_clf_feat_file, index_col=0, parse_dates=[0]) + atk_df = atk_df.loc[atk_cp_g_clf_feat.index.min():atk_cp_g_clf_feat.index.max()] + + if clf_f is None: + clf_f=list(atk_cp_g_clf_feat.columns)+list(atk_df.columns) + if allow_diff_shifts: + clf_f_bak = clf_f + clf_f_l = len(clf_f) + atk_full_clf_feat_columns=list(atk_cp_g_clf_feat.columns)+list(atk_df.columns) + clf_f = [c for c in clf_f if c in atk_full_clf_feat_columns] + if len(clf_f) != clf_f_l: + # logger.info(f"train and test used diff features! {clf_f_bak=}") + # logger.info(f"train and test used diff features! {clf_f=}") + # logger.info(f"train and test used diff features! {[f for f in clf_f_bak if f not in clf_f]}") + # exit() + + global feat_warn1 + if feat_warn1 <=1: + feat_warn1+=1 + logger.warning(f"train and test used diff features! 
{allow_diff_shifts=}") + + atk_cp_g_clf_feat = atk_cp_g_clf_feat[[c for c in clf_f if c in atk_cp_g_clf_feat.columns]] + atk_df = atk_df[[c for c in clf_f if c in atk_df.columns]] + + #end_d = max([v.index.max() for v in is_atk_dfs.values()]) + end_d = atk_labels_df.index.max() + atk_full_clf_feat = pd.concat([atk_cp_g_clf_feat, atk_df], axis=1).loc[ATK_START_DATE:end_d][clf_f] + del atk_cp_g_clf_feat + del atk_df + gc.collect() + return atk_full_clf_feat, clf_f + +def get_clf_result_output(clf, best_dict, cp_g_dict, atk_cp_g_dict, train_df, atk_df, TRAIN_START_DATE, ATK_START_DATE, atk_labels_df, pbar=None, allow_diff_shifts=True, data_lock=None): + #print(clf,best_dict) + + if data_lock is not None: + data_lock.acquire() + + clf_feat_file = get_clf_feat_for_conf(cp_g_dict, best_dict) + atk_clf_feat_file = get_clf_feat_for_conf(atk_cp_g_dict, best_dict, allow_diff_shifts=allow_diff_shifts) + # print(cp_g_dict) + # print(atk_cp_g_dict) + # print(best_dict) + if atk_clf_feat_file is None: + logger.warning(f"no atk_clf_feat_file found for {clf=} in {atk_cp_g_dict=}") + if pbar is not None: + pbar.update(1) + if data_lock is not None: + data_lock.release() + return None + #print(clf_feat_file) + #print(atk_clf_feat_file) + clf_n = clf.split(".")[0] + clf_c = best_dict["conf"] + clf_c = ast.literal_eval(clf_c) + _clf_f = best_dict["feat"] + clf_o = best_dict["offset"] + clf_r = best_dict["reg"] + #print(f"TEST: {clf=} {clf_c=} {_clf_f=} {clf_o=} {clf_r=} {[k for k in best_dict.keys()]}") + + clf_only = clf.replace("."+clf_r, "") + clf_features_string = clf_only.split(".")[1:] + do_scale_pipeline = False + if "only_norm" in clf_features_string: + do_scale_pipeline = True + + #print(f"TEST: {clf_only=} {clf_r=} {clf_features_string=} {do_scale_pipeline=}") + + #print(atk_clf_feat_file) + atk_full_clf_feat, clf_f = get_atk_full_clf_feat(atk_clf_feat_file, _clf_f, atk_df, atk_labels_df, ATK_START_DATE, allow_diff_shifts) + + + cp_g_clf_feat = pd.read_csv(clf_feat_file, index_col=0, parse_dates=[0]) + train_df = train_df.loc[cp_g_clf_feat.index.min():cp_g_clf_feat.index.max()] + + cp_g_clf_feat = cp_g_clf_feat[[c for c in clf_f if c in cp_g_clf_feat.columns]] + train_df = train_df[[c for c in clf_f if c in train_df.columns]] + + + full_clf_feat = pd.concat([cp_g_clf_feat, train_df], axis=1).loc[TRAIN_START_DATE:ATK_START_DATE][clf_f] + del cp_g_clf_feat + del train_df + gc.collect() + + + # full_clf_feat = full_clf_feat[clf_f] + # atk_full_clf_feat = atk_full_clf_feat[clf_f] + # gc.collect() + + if data_lock is not None: + data_lock.release() + + # print(clf_n, type(clf_n)) + # print(clf_c, type(clf_c)) + # print(clf_f, type(clf_f)) + # print(clf_o, type(clf_o)) + # exit() + + #is_atk_pred = do_clf(clf_n, clf_c, full_clf_feat, atk_full_clf_feat, offset_=0, pbar=pbar) + try: + y_p = do_clf(clf_n, clf_c, full_clf_feat, atk_full_clf_feat, pbar=pbar, overwrite_jobs=2, scale_pipeline=do_scale_pipeline) + except Exception as e: #TODO: handle "ValueError: The covariance matrix of the support data is equal to 0, try to increase support_fraction" + clf_n=clf_n+"_"+str(e) + y_p = [1 for x in atk_full_clf_feat.values] + + #is_atk_pred_0 = [1 if p >= 0 else -1 for p in y_p] + is_atk_pred = [1 if p >= clf_o else -1 for p in y_p] + #print(is_atk_pred) + + # print(full_clf_feat) + # print(atk_full_clf_feat) + # print(is_atk_dfs["CPO_0"].loc[ATK_START_DATE:]) + timestamp1 = time.time() + #clf_is_atk = do_get_clf_is_atk(is_atk_dfs, atk_full_clf_feat) + clf_is_atk = atk_labels_df + timestamp2 = time.time() + 
logger.debug("do_get_clf_is_atk() took %.2f seconds" % (timestamp2 - timestamp1)) + + clf_results = get_clf_eval(y=clf_is_atk['is_attack1'], y_pred=is_atk_pred) + #clf_results = get_clf_eval(y=clf_is_atk['is_attack2'], y_pred=is_atk_pred) + clf_results["clf"] = clf + clf_results["clf_n"] = clf_n + clf_results["clf_f"] = clf_f + clf_results["clf_o"] = clf_o + clf_results["clf_r"] = clf_r + #print(clf_results) + clf_results["decision_function"] = list(y_p) + return clf_results + + + +def get_clf_id(clf_n, estimator, df, feat_cols, conf): + h = hashlib.new('sha256') + h.update(repr(conf).encode("utf-8")) + h.update(pd.util.hash_pandas_object(df, index=True).values) + h.update(repr(sorted(feat_cols)).encode("utf-8")) + h.update(repr(estimator).encode("utf-8")) + ha = h.hexdigest() + clf_id = (clf_n, ha) + return clf_id + +def do_clf(clf_type, config, full_clf_feat, atk_full_clf_feat, offset_=None, pbar=None, scale_pipeline=False, overwrite_jobs=None): + # print([c for c in full_clf_feat.columns]) + # exit() + X = full_clf_feat.values + atk_X = atk_full_clf_feat.values + + if overwrite_jobs is not None: + if "n_jobs" in config: + config["n_jobs"] = overwrite_jobs + + estimators = [e for e in get_clfs_from_string(clf_type, config=config)] # + if len(estimators) > 1: + raise ValueError(f"len(estimators) > 1: {estimators=}") + + + if not sys.warnoptions and (clf_type == "OneClassSVM" or clf_type == "EllipticEnvelope"): + warnings.simplefilter("ignore") + os.environ["PYTHONWARNINGS"] = "ignore" # Also affect subprocesses + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=ConvergenceWarning) + for estimator in estimators: #should be only one since config... + if scale_pipeline: + estimator = make_pipeline(StandardScaler(),estimator) + + timestamp1 = time.time() + clf_id = get_clf_id(clf_type, estimator, full_clf_feat, [c for c in full_clf_feat.columns], config) + if clf_id in CLF_CACHE: + logger.debug(f"hit {len(CLF_CACHE)}") + estimator = deepcopy(CLF_CACHE[clf_id]) + else: + logger.debug(f"miss {len(CLF_CACHE)}") + estimator.fit(X=X) + CLF_CACHE[clf_id] = estimator + + timestamp2 = time.time() + logger.debug("fit() took %.2f seconds" % (timestamp2 - timestamp1)) + + + y_p = estimator.decision_function(X=atk_X) + timestamp3 = time.time() + logger.debug("decision_function() took %.2f seconds" % (timestamp3 - timestamp2)) + #y_pred = estimator.predict(X=atk_X) + if pbar is not None: + pbar.update(1) + return y_p + +def get_clf_eval(y, y_pred): + with warnings.catch_warnings(): + warnings.simplefilter('ignore', UndefinedMetricWarning) + #precision0, recall0, fscore0, support0 = precision_recall_fscore_support(y, y_pred) + precision1, recall1, fscore1, support1 = precision_recall_fscore_support(y, y_pred, average="binary", labels=[1, -1]) + #precision2, recall2, fscore2, support2 = precision_recall_fscore_support(y, y_pred, average="weighted") + tn, fp, fn, tp = confusion_matrix(y, y_pred, labels=[1, -1]).ravel() + accuracy = accuracy_score(y, y_pred) + + if (tp+fn) != 0: + tpr = tp/(tp+fn) + else: + tpr=0 + if (fp+tn) != 0: + fpr = fp/(fp+tn) + else: + fpr=0 + ret = { + "tpr": tpr, + "fpr": fpr, + "precision": precision1, + "recall": recall1, + "fscore": fscore1, + "support": support1, + "tn": tn, + "fp": fp, + "fn": fn, + "tp": tp, + "accuracy": accuracy, + } + return ret + +def cross_val_clf(clf_type, full_clf_feat, feat_cols, clf_feat_cols, pred_col=None, only_clf_feat=False, pbar=None, scale_pipeline=False, train_contamination=False): + full_feat_cols = sorted(feat_cols 
+ clf_feat_cols) + if only_clf_feat: + full_feat_cols = sorted(clf_feat_cols) + # print(full_clf_feat) + # print(full_feat_cols) + # # print([c for c in full_clf_feat.columns if "grid" in c]) + # exit() + X = full_clf_feat[full_feat_cols].values + + if pred_col is None: + y = [1] * len(X) + else: + y = full_clf_feat[pred_col].values + + ret_l = [] + estimators = get_clfs_from_string(clf_type, train_contamination=train_contamination) #get list w/ hyper params... + # LOF: decision_function: The shift offset allows a zero threshold for being an outlier. + offset_ = 0 + + # IF: When the contamination parameter is set to "auto", the offset is equal to -0.5 as the scores of inliers are close to 0 and the scores of outliers are close to -1. + # LOF: inliers score around -1 (the higher, the less abnormal). + # LOF: The offset is set to -1.5 (inliers score around -1), except when a + # contamination parameter different than "auto" is provided. + # if clf_type== "IsolationForest": + # offset_ = -0.5 + # elif clf_type== "LocalOutlierFactor": + # offset_ = -1.5 + + + for estimator in estimators: + clf_params = estimator.get_params() + if scale_pipeline: + estimator = make_pipeline(StandardScaler(),estimator) + if "make_pipeline(StandardScaler," not in clf_type: + clf_type = f"make_pipeline(StandardScaler,{clf_type})" + now = datetime.now() + try: + if not sys.warnoptions and ("OneClassSVM" in clf_type or "EllipticEnvelope" in clf_type): + warnings.simplefilter("ignore") + os.environ["PYTHONWARNINGS"] = "ignore" # Also affect subprocesses + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=ConvergenceWarning) + + nt=NUM_THREADS + if "LocalOutlierFactor" in clf_type or "IsolationForest" in clf_type: #native multi-threading + nt=1 #doesnt seem to work... + # if clf_type == "EllipticEnvelope" and not only_clf_feat: + # nt=1 + #if not only_clf_feat: + # nt=round(nt/2) + pre_dispatch=nt + #logger.info(f"{pre_dispatch=}, {nt=}") + with parallel_backend("threading", n_jobs=nt): + with threadpool_limits(limits=nt, user_api='blas'): + os.environ["OMP_NUM_THREADS"] = str(nt) + os.environ["MKL_NUM_THREADS"] = str(nt) + os.environ["OPENBLAS_NUM_THREADS"] = str(nt) + os.environ["BLIS_NUM_THREADS"] = str(nt) + y_p = cross_val_predict(estimator, X, y=None, cv=5, n_jobs=nt, method="decision_function", pre_dispatch=pre_dispatch) #Returns -1 for anomalies/outliers and 1 for inliers. + + #print(y_p[:100]) + y_pred = [1 if p >= offset_ else -1 for p in y_p] + # y_pred = cross_val_predict(estimator, X, y=None, cv=5, n_jobs=NUM_THREADS) #Returns -1 for anomalies/outliers and 1 for inliers. 
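+                        # Note: the decision_function scores are thresholded at offset_ (0 here),
+                        # i.e. scores >= 0 are treated as inliers (1) and scores < 0 as outliers (-1);
+                        # the sklearn-internal offsets discussed above are intentionally not applied.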
+ except Exception as e: + logger.error(e) + logger.error(str(estimator)) + # logger.error(full_clf_feat[full_feat_cols]) + # logger.error(full_clf_feat[[c for c in full_clf_feat.columns if "pred"in c]]) + ret = { + "clf_type": clf_type, + "clf_params": clf_params, + "full_feat_cols": full_feat_cols, + "y_p": str(e), + } + if pbar is not None: + pbar.update(1) + ret_l.append(ret) + #raise e + continue + + with warnings.catch_warnings(): + warnings.simplefilter('ignore', UndefinedMetricWarning) + #precision0, recall0, fscore0, support0 = precision_recall_fscore_support(y, y_pred) + precision1, recall1, fscore1, support1 = precision_recall_fscore_support(y, y_pred, average="binary", labels=[1, -1]) + #precision2, recall2, fscore2, support2 = precision_recall_fscore_support(y, y_pred, average="weighted") + tn, fp, fn, tp = confusion_matrix(y, y_pred, labels=[1, -1]).ravel() + accuracy = accuracy_score(y, y_pred) + + # print(y[:100]) + # print(y_pred[:100]) + + # print(precision1, recall1, fscore1, support1) + # print(tn, fp, fn, tp) + # print(accuracy) + + later = datetime.now() + difference = (later - now).total_seconds() + ret = { + "clf_type": clf_type, + #"clf": str(estimator), + "clf_params": clf_params, + "full_feat_cols": full_feat_cols, + #"y_true": y, + "y_p": list(y_p), + #"y_pred": list(y_pred), + #"offset_":offset_, + "precision": precision1, + "recall": recall1, + "fscore": fscore1, + "tn": tn, + "fp": fp, + "fn": fn, + "tp": tp, + "accuracy": accuracy, + "t_diff": difference, + } + if pbar is not None: + pbar.update(1) + ret_l.append(ret) + return ret_l + + + + +def prune_clf_feats(df, feat_cases=None, df_dict=None): + feat_cols = list(df.columns) + if feat_cases is None or "all" in feat_cases: + return feat_cols + + # print([c for c in df_dict["cp_g_df"].columns if "_non_ev_power_" in c]) + # print([c for c in feat_cols if "_non_ev_power_" in c]) + # exit() + + + add_expo = [feat_case for feat_case in feat_cases if "add_grid_load_expo_" in feat_case] + if add_expo: + if df_dict is None: + logger.error(f"add_grid_load_expo_ provided: {add_expo}; but no df_dict") + exit() + if len(add_expo) > 1: + logger.warning(f"multiple add_grid_load_expo_ sets provided: {add_expo}; using first") + add_expo = add_expo[0] + sub_case="static" + if "expo_rnd" in add_expo: + sub_case="rnd" + add_expo_fac_min = int(add_expo.split("_")[-1]) + #add_expo_fac_max = min(add_expo_fac_min+10, 100) + #print(add_expo, add_expo_fac_min, add_expo_fac_max) + expo_cols = [c for c in df_dict["cp_g_df"].columns if "grid_expo_" in c and sub_case+"_fac."+str(add_expo_fac_min) in c and "_relation_to_" not in c] + feat_cols = [c for c in feat_cols if "grid_expo_" not in c or (sub_case+"_fac."+str(add_expo_fac_min) in c)] + # print(expo_cols) + # print(feat_cols) + feat_cols = feat_cols + expo_cols + #print(feat_cols) + else: + feat_cols = [f for f in feat_cols if "grid_expo_"not in f] + + # if "add_rel_to_cols" in feat_cases: + # add_rel_cols= [c for c in df_dict["cp_g_df"].columns if "_relation_to" in c] + # get_rel_expo_cols = [c for c in feat_cols if "_norm" not in c and "grid_expo_" in c] + + # #print(df_dict["cp_g_df"]) + # new_cols=[] + # for rec in get_rel_expo_cols: + # name = rec.replace("_lag_0","")+"_relation_to_"+df_dict["pred_col"].replace("_lag_0","")+"_lag_0" + # df_dict["cp_g_df"].loc[(df_dict["cp_g_df"][rec] != 0), name] = df_dict["cp_g_df"][df_dict["pred_col"]] / df_dict["cp_g_df"][rec] + # df_dict["cp_g_df"].loc[(df_dict["cp_g_df"][rec] == 0), name] = 0 + # new_cols.append(name) + + # bus_col = [c 
for c in df_dict["cp_g_df"].columns if "bus" in c and ".active_power" in c and "_norm" not in c and "_lag_0" in c and "_relation_to_" not in c][0] + # name = bus_col.replace("_lag_0","")+"_relation_to_"+rec.replace("_lag_0","")+"_lag_0" + # df_dict["cp_g_df"].loc[(df_dict["cp_g_df"][rec] != 0), name] = df_dict["cp_g_df"][bus_col] / df_dict["cp_g_df"][rec] + # df_dict["cp_g_df"].loc[(df_dict["cp_g_df"][rec] == 0), name] = 0 + # new_cols.append(name) + + # new_cols_df = df_dict["cp_g_df"][new_cols] + # normalize_cols(new_cols_df, "_prune_norm") + # for c in new_cols_df.columns: + # if "_norm" in c and c not in df_dict["cp_g_df"].columns: + # new_c = c.replace("_lag_0_prune_norm", "_prune_norm_lag_0") + # df_dict["cp_g_df"][new_c] = new_cols_df[c] + # new_cols.append(new_c) + + # add_rel_cols = add_rel_cols + new_cols + # #print(df_dict["pred_col"].replace("_lag_0","")) + # feat_cols = feat_cols + add_rel_cols + + if feat_cases is not None and "set_1" in feat_cases: + set_1 = ['prediction_0_diff', 'prediction_mean_diff', ] + set_1 +=[f for f in feat_cols if "grid_meas_"in f and ".bus." in f and "non_ev_power_relation_to_bus_lag_0" in f and "_norm" not in f] #'grid_meas_0.0.bus.11.non_ev_power_relation_to_bus_lag_0', + set_1 +=[f for f in feat_cols if "grid_expo_"in f and ".load." in f and ".active_power_static_fac." in f and + ("_max_ev_power_relation_to_bus_lag_0" in f or # 'grid_expo_0.0.load.7.active_power_static_fac.100_max_ev_power_relation_to_bus_lag_0', + "_non_ev_power_relation_to_expo_lag_0" in f or # 'grid_expo_0.0.load.7.active_power_static_fac.100_non_ev_power_relation_to_expo_lag_0', + "_relation_to_charge_speed_lag_0" in f or # 'grid_expo_0.0.load.7.active_power_static_fac.100_relation_to_charge_speed_lag_0', + "_relation_to_prediction_mean_lag_0" in f) and # 'grid_expo_0.0.load.7.active_power_static_fac.100_relation_to_prediction_mean_lag_0', + "_norm" not in f] + + feat_cols = [f for f in feat_cols if f in set_1] + # print(set_1) + # print(feat_cols) + # exit() + return feat_cols + elif feat_cases is not None and "set_2" in feat_cases: + set_2 = ['prediction_0_diff', 'prediction_mean_diff', ] + feat_cols = [f for f in feat_cols if f in set_2] + return feat_cols + elif feat_cases is not None and "set_3" in feat_cases: + set_x = ['prediction_0_diff', 'prediction_mean_diff', ] + set_x +=[f for f in feat_cols if "grid_meas_"in f and ".bus." in f and ".non_ev_power_relation_to_bus_lag_0" in f and "_norm" not in f] #grid_meas_0.0.bus.11.non_ev_power_relation_to_bus_lag_0 + set_x +=[f for f in feat_cols if "grid_meas_"in f and ".bus." 
in f and ".only_ev_power_relation_to_bus_lag_0" in f and "_norm" not in f] #grid_meas_0.0.bus.11.only_ev_power_relation_to_bus_lag_0 + feat_cols = [f for f in feat_cols if f in set_x] + # print(set_x) + # print(feat_cols) + # exit() + return feat_cols + elif feat_cases is not None and "set_4" in feat_cases: + set_x = ['prediction_0_diff','prediction_1_diff','prediction_2_diff', 'prediction_mean_diff', ] + h_set = [c for c in df_dict["cp_g_df"].columns if "date_exog_group_hour_" in c and "_norm" not in c] + set_x+=h_set + feat_cols+=h_set + feat_cols = [f for f in feat_cols if f in set_x] + # print(set_x) + # print(feat_cols) + # exit() + return feat_cols + elif feat_cases is not None and "set_5" in feat_cases: + set_x = [f for f in feat_cols if 'prediction_' in f and "_norm" not in f and "_rmse" not in f] + feat_cols = [f for f in feat_cols if f in set_x] + return feat_cols + elif feat_cases is not None and "set_6" in feat_cases: + set_x = [f for f in feat_cols if 'prediction_' in f and "_norm" not in f] + h_set = [c for c in df_dict["cp_g_df"].columns if "date_exog_group_hour_" in c and "_norm" not in c] + h_set +=[df_dict["pred_col"]] + set_x+=h_set + feat_cols+=h_set + feat_cols = [f for f in feat_cols if f in set_x] + return feat_cols + elif feat_cases is not None and "set_7" in feat_cases: + set_x = [f for f in feat_cols if 'prediction_' in f and "_norm" in f] + h_set = [c for c in df_dict["cp_g_df"].columns if "date_exog_group_hour_" in c and "_norm" in c] + h_set +=["charge_speed_group_norm_lag_0"] + set_x+=h_set + feat_cols+=h_set + feat_cols = [f for f in feat_cols if f in set_x] + return feat_cols + elif feat_cases is not None and "set_8" in feat_cases: + set_x = [f for f in feat_cols if 'prediction_' in f and "_norm" not in f] + h_set = [c for c in df_dict["cp_g_df"].columns if "date_exog_group_hour_" in c and "_norm" not in c] + set_x+=h_set + feat_cols+=h_set + feat_cols = [f for f in feat_cols if f in set_x] + return feat_cols + elif feat_cases is not None and "set_9" in feat_cases: + set_x = [f for f in feat_cols if 'prediction_' in f and "_norm" in f] + h_set =["charge_speed_lag_0"] + set_x+=h_set + feat_cols+=h_set + feat_cols = [f for f in feat_cols if f in set_x] + return feat_cols + elif feat_cases is not None and "set_10" in feat_cases: + set_x = ['prediction_mean', 'prediction_min', 'prediction_max', 'prediction_mean_diff', 'prediction_min_diff', 'prediction_max_diff'] + h_set =["charge_speed_lag_0"] + set_x+=h_set + feat_cols+=h_set + feat_cols = [f for f in feat_cols if f in set_x] + # print(set_x) + # print(feat_cols) + # exit() + return feat_cols + elif feat_cases is not None and "set_11" in feat_cases: + set_x = ['prediction_mean', 'prediction_min', 'prediction_max', 'prediction_mean_diff', 'prediction_min_diff', 'prediction_max_diff'] + feat_cols = [f for f in feat_cols if f in set_x] + # print(set_x) + # print(feat_cols) + # exit() + return feat_cols + elif feat_cases is not None and "set_12" in feat_cases: + set_x = ['prediction_mean_diff', 'prediction_min_diff', 'prediction_max_diff'] + feat_cols = [f for f in feat_cols if f in set_x] + # print(set_x) + # print(feat_cols) + # exit() + return feat_cols + elif feat_cases is not None and "set_13" in feat_cases: + set_x = [f for f in feat_cols if 'prediction_' in f and "_norm" not in f and "_rmse" not in f and "_diff" in f and "_abs" not in f and "_mean" in f] + feat_cols = [f for f in feat_cols if f in set_x] + # print(set_x) + # print(feat_cols) + # exit() + return feat_cols + + elif feat_cases is not None and 
"set_31" in feat_cases: + set_x = ['prediction_mean', 'prediction_min', 'prediction_max', 'prediction_mean_diff', 'prediction_min_diff', 'prediction_max_diff'] + set_x +=[f for f in feat_cols if "grid_meas_"in f and ".bus." in f and ".non_ev_power_relation_to_bus_lag_0" in f and "_norm" not in f] #grid_meas_0.0.bus.11.non_ev_power_relation_to_bus_lag_0 + set_x +=[f for f in feat_cols if "grid_meas_"in f and ".bus." in f and ".only_ev_power_relation_to_bus_lag_0" in f and "_norm" not in f] #grid_meas_0.0.bus.11.only_ev_power_relation_to_bus_lag_0 + feat_cols = [f for f in feat_cols if f in set_x] + return feat_cols + elif feat_cases is not None and "set_32" in feat_cases: + set_x = ['prediction_mean', 'prediction_min', 'prediction_max', 'prediction_mean_diff', 'prediction_min_diff', 'prediction_max_diff'] + set_x +=[f for f in feat_cols if "grid_meas_"in f and ".bus." in f and ".non_ev_power_relation_to_bus_lag_0" in f and "_norm" not in f] #grid_meas_0.0.bus.11.non_ev_power_relation_to_bus_lag_0 + set_x +=[f for f in feat_cols if "grid_meas_"in f and ".bus." in f and ".only_ev_power_relation_to_bus_lag_0" in f and "_norm" not in f] #grid_meas_0.0.bus.11.only_ev_power_relation_to_bus_lag_0 + h_set = [c for c in df_dict["cp_g_df"].columns if "date_exog_group_hour_" in c and "_norm" not in c] + h_set +=["charge_speed_lag_0"] + set_x += h_set + feat_cols+=h_set + feat_cols = [f for f in feat_cols if f in set_x] + return feat_cols + elif feat_cases is not None and "set_33" in feat_cases: + set_x = ['prediction_mean_diff', 'prediction_min_diff', 'prediction_max_diff'] + set_x +=[f for f in feat_cols if "grid_meas_"in f and ".bus." in f and ".non_ev_power_relation_to_bus_lag_0" in f and "_norm" not in f] #grid_meas_0.0.bus.11.non_ev_power_relation_to_bus_lag_0 + set_x +=[f for f in feat_cols if "grid_meas_"in f and ".bus." in f and ".only_ev_power_relation_to_bus_lag_0" in f and "_norm" not in f] #grid_meas_0.0.bus.11.only_ev_power_relation_to_bus_lag_0 + h_set = [c for c in df_dict["cp_g_df"].columns if "date_exog_group_hour_" in c and "_norm" not in c] + set_x += h_set + feat_cols+=h_set + feat_cols = [f for f in feat_cols if f in set_x] + return feat_cols + elif feat_cases is not None and "set_34" in feat_cases: + set_x = ['prediction_mean', 'prediction_min', 'prediction_max', 'prediction_mean_diff', 'prediction_min_diff', 'prediction_max_diff'] + set_x +=[f for f in feat_cols if "grid_meas_"in f and ".bus." in f and ".non_ev_power_relation_to_bus_lag_0" in f and "_norm" not in f] #grid_meas_0.0.bus.11.non_ev_power_relation_to_bus_lag_0 + set_x +=[f for f in feat_cols if "grid_meas_"in f and ".bus." in f and ".only_ev_power_relation_to_bus_lag_0" in f and "_norm" not in f] #grid_meas_0.0.bus.11.only_ev_power_relation_to_bus_lag_0 + h_set =["charge_speed_lag_0"] + set_x += h_set + feat_cols+=h_set + feat_cols = [f for f in feat_cols if f in set_x] + # print(set_x) + # print(feat_cols) + # exit() + return feat_cols + + elif feat_cases is not None and "set_35" in feat_cases: + set_x = ['prediction_mean_diff', ] + set_x +=[f for f in feat_cols if "grid_meas_"in f and ".bus." in f and ".non_ev_power_relation_to_bus_lag_0" in f and "_norm" not in f] #grid_meas_0.0.bus.11.non_ev_power_relation_to_bus_lag_0 + set_x +=[f for f in feat_cols if "grid_meas_"in f and ".bus." in f and ".only_ev_power_relation_to_bus_lag_0" in f and "_norm" not in f] #grid_meas_0.0.bus.11.only_ev_power_relation_to_bus_lag_0 + set_x +=[f for f in feat_cols if "grid_expo_"in f and ".load." in f and ".active_power_static_fac." 
in f and + ("_max_ev_power_relation_to_bus_lag_0" in f or # 'grid_expo_0.0.load.7.active_power_static_fac.100_max_ev_power_relation_to_bus_lag_0', + "_non_ev_power_relation_to_expo_lag_0" in f or # 'grid_expo_0.0.load.7.active_power_static_fac.100_non_ev_power_relation_to_expo_lag_0', + "_relation_to_charge_speed_lag_0" in f or # 'grid_expo_0.0.load.7.active_power_static_fac.100_relation_to_charge_speed_lag_0', + "_relation_to_prediction_mean_lag_0" in f) and # 'grid_expo_0.0.load.7.active_power_static_fac.100_relation_to_prediction_mean_lag_0', + "_norm" not in f] + feat_cols = [f for f in feat_cols if f in set_x] + # print(set_x) + # print(feat_cols) + # exit() + return feat_cols + elif feat_cases is not None and "set_352" in feat_cases: + set_x = ['prediction_mean_diff', ] + set_x +=[f for f in feat_cols if "grid_meas_"in f and ".bus." in f and ".non_ev_power_relation_to_bus_lag_0" in f and "_norm" not in f] #grid_meas_0.0.bus.11.non_ev_power_relation_to_bus_lag_0 + set_x +=[f for f in feat_cols if "grid_meas_"in f and ".bus." in f and ".only_ev_power_relation_to_bus_lag_0" in f and "_norm" not in f] #grid_meas_0.0.bus.11.only_ev_power_relation_to_bus_lag_0 + set_x +=[f for f in feat_cols if "grid_expo_"in f and ".load." in f and ".active_power_" in f and "_fac." in f and #add_grid_load_expo_rnd_90 + ("_max_ev_power_relation_to_bus_lag_0" in f or # 'grid_expo_0.0.load.7.active_power_static_fac.100_max_ev_power_relation_to_bus_lag_0', + "_non_ev_power_relation_to_expo_lag_0" in f or # 'grid_expo_0.0.load.7.active_power_static_fac.100_non_ev_power_relation_to_expo_lag_0', + "_relation_to_charge_speed_lag_0" in f or # 'grid_expo_0.0.load.7.active_power_static_fac.100_relation_to_charge_speed_lag_0', + "_relation_to_prediction_mean_lag_0" in f) and # 'grid_expo_0.0.load.7.active_power_static_fac.100_relation_to_prediction_mean_lag_0', + "_norm" not in f] + feat_cols = [f for f in feat_cols if f in set_x] + # print(set_x) + # print(feat_cols) + # exit() + return feat_cols + + for feat_case in feat_cases: + if feat_case == "only_norm": + feat_cols = [f for f in feat_cols if "_norm" in f] + elif feat_case == "no_norm": + feat_cols = [f for f in feat_cols if "_norm" not in f] + elif feat_case == "only_diff": + feat_cols = [f for f in feat_cols if "_diff" in f or "_relation_to" in f] + + return feat_cols + + +def get_best_clf_conf_offset(y_p, max_fpr): + if y_p["y_p"] is None: + return None, None + offset = 0 + y_p = sorted(y_p["y_p"]) + best_y_p = 0 + p_y_p = len(y_p) * max_fpr + # print(y_p) + for min_y in y_p: + _best_y_p = len([y for y in y_p if y < min_y]) #len of false positives + if _best_y_p > p_y_p: #more false postivies than allowed + break + offset = min_y + best_y_p = _best_y_p + # print(offset, best_y_p, p_y_p, len(y_p)) + # exit() + return offset, best_y_p + +def get_best_clf_conf2(dfr, max_fpr=0.005, ignore_fscore=False): # >= offset_ -> inlier=1 + #print(dfr) + res = dfr.apply(lambda x: get_best_clf_conf_offset(x, max_fpr=max_fpr), axis=1) + dfr["offset"], dfr["false_positives"]= zip(*res) + if ignore_fscore: + df_best = dfr + else: + df_best = dfr[dfr["fscore"] == dfr["fscore"].max()] + df_best_o = df_best[df_best["offset"] == df_best["offset"].max()] + df_best_f = df_best_o[df_best_o["t_diff"] == df_best_o["t_diff"].min()].iloc[0] + # print(df_best_f, df_best_f["offset"]) + # exit() + return df_best_f, df_best_f["offset"] + +def get_best_clf_conf(dfr, max_fpr=0.01): + df_best = dfr[dfr["fscore"] == dfr["fscore"].max()] + df_best_f = df_best[df_best["t_diff"] == 
df_best["t_diff"].min()].iloc[0] + return df_best_f, 0 + +def to_float_list(val): + val = val.replace("[", "").replace("]", "") + try: + ret = [float(v) for v in val.split(", ")] + except Exception as e: + logger.info(f"{e} with {val}") + return None + return ret + +def _eval_tuning_clf(dfr): + df_best_f, offset = get_best_clf_conf2(dfr.copy()) + # print(group,clf,reg,df_best_f["clf_params"],df_best_f["fscore"], offset) + # exit() + return {"conf":df_best_f["clf_params"], "eval":df_best_f["fscore"], "feat":ast.literal_eval(df_best_f["full_feat_cols"]), "offset": offset} + +def eval_tuning_clf(x_path, tqdm_n=0, full=False): + onlyfiles = [f for f in os.listdir(x_path) if os.path.isfile(os.path.join(x_path, f))] + + #CP_4_LocalOutlierFactor.no_reg.only_norm.csv.gz + regex = re.compile(r"^(CP_\d+)_([\w\.\_]+)\.csv\.gz$") + eval_tuning_files=defaultdict(lambda: dict()) + for f in tqdm(onlyfiles, desc=f"eval_tuning {x_path} get eval_tuning_files", disable=tqdm_n>0): + #cpo_ocpp_data_CP_1_0bc01518df38656accbde55bfb0e38591.csv.gz + result = regex.search(f) + if result: # + f_df = pd.read_csv(os.path.join(x_path, f), index_col=0) # , converters={"clf_params":ast.literal_eval, "full_feat_cols":ast.literal_eval, "y_p":ast.literal_eval} , "features":ast.literal_eval, "clf_features":ast.literal_eval + # f_df['clf_params'] = f_df['clf_params'].apply(lambda x: ast.literal_eval(x)) + # f_df['full_feat_cols'] = f_df['full_feat_cols'].apply(lambda x: ast.literal_eval(x)) + f_df['y_p'] = f_df['y_p'].apply(lambda x: to_float_list(x)) + # print(f_df) + # print(f_df.columns) + eval_tuning_files[result.group(1)][result.group(2)] = f_df + #break + else: + logger.info(f"unk file {os.path.join(x_path, f)}") + continue + + if full: + return eval_tuning_files + + ret_dict=defaultdict(lambda: dict()) + results=[] + #with ThreadPoolExecutor(NUM_THREADS) as pool: + with ProcessPoolExecutor(NUM_THREADS) as pool: + for group,group_d in tqdm(eval_tuning_files.items(), desc=f"eval_tuning {x_path} get ret_dict", disable=True): + for clf, df in group_d.items(): + for index, rsf in df[["reg", 'shifts', 'features']].drop_duplicates().iterrows(): + # print(rsf["reg"]) + dfr = df[(df["reg"] == rsf["reg"]) & (df["shifts"] == rsf["shifts"]) & (df["features"] == rsf["features"])] + reg = dfr["reg"].iloc[0] +"."+ str(dfr["shifts"].iloc[0]) +"."+ ".".join(ast.literal_eval(dfr["features"].iloc[0])) + + results.append( (group,clf,reg, pool.submit(_eval_tuning_clf, dfr)) ) + #ret_dict[group][(clf,reg)] = _eval_tuning_clf(dfr) + + for group,clf,reg,r in tqdm(results, desc=f"eval_tuning {x_path} get ret_dict", disable=tqdm_n>0): + ret_dict[group][(clf,reg)] = r.result() + return ret_dict + + +def get_eval_dicts_clf(OutDataDIR): + ret_d=dict() + DIR=OutDataDIR+"/clf_eval/" + + sub_folders = [name for name in os.listdir(DIR) if os.path.isdir(os.path.join(DIR, name)) if name.startswith("CPO_")] + for cpo in sub_folders: + ret_dict=eval_tuning_clf(OutDataDIR+"/clf_eval/"+cpo) + ret_d[cpo] = ret_dict + + sub_folders = [name for name in os.listdir(DIR) if os.path.isdir(os.path.join(DIR, name)) if name.startswith("DSO")] + for dso in sub_folders: + ret_dict=eval_tuning_clf(OutDataDIR+"/clf_eval/"+dso) + ret_d[dso] = ret_dict + + return ret_d + + +def save_eval_dicts_clf(best_d, OutDataDIR): + for actor in sorted(best_d.keys()): + ret_dict = best_d[actor] + Path(OutDataDIR+"/clf_eval_dicts/"+actor+"/").mkdir(parents=True, exist_ok=True) + for group,ret_d2 in tqdm(ret_dict.items(), desc=f"saving {actor}"): + with 
gzip.open(OutDataDIR+"/clf_eval_dicts/"+actor+"/"+group+".gz", "wb") as f: + pickle.dump(ret_d2, f) + + +def load_eval_dicts_clf(OutDataDIR): + DIR = OutDataDIR+"/clf_eval_dicts/" + sub_folders = [name for name in os.listdir(DIR) if os.path.isdir(os.path.join(DIR, name))] + + best_d=defaultdict(lambda: defaultdict(lambda: dict())) + for _x_path in sub_folders: + x_path = os.path.join(DIR,_x_path) + onlyfiles = [f for f in os.listdir(x_path) if os.path.isfile(os.path.join(x_path, f))] + for fi in onlyfiles: + with gzip.open(os.path.join(x_path,fi), "rb") as f: + fdf_dict = pickle.load(f) + best_d[_x_path][fi.replace(".gz", "")] = fdf_dict + return best_d + +def get_clf_feat_for_conf(cp_g_dict, best_dict, allow_diff_shifts=False): + #print(cp_g_dict) #[{'reg': 'GradientBoostingRegressor', 'shifts': '5', 'file': 'data/elaadnl/clf_feats/DSO.all/CP_11_GradientBoostingRegressor_5.csv.gz', 'features': ['all']}, ... + #print(best_dict["reg"].split(".")) + best_dict_reg = best_dict["reg"].split(".") + for cp_g_d in cp_g_dict: + if cp_g_d["reg"] == best_dict_reg[0] and cp_g_d["shifts"] == best_dict_reg[1] and cp_g_d["features"] == best_dict_reg[2:]: + return cp_g_d["file"] + for cp_g_d in cp_g_dict: + if cp_g_d["reg"] == best_dict_reg[0] and allow_diff_shifts and cp_g_d["features"] == best_dict_reg[2:]: + + global feat_warn2 + if feat_warn2 <=1: + feat_warn2+=1 + logger.warning(f"train and test used diff shifts for features! {allow_diff_shifts=}") + return cp_g_d["file"] + return None + +def get_atks_old(l_buffer=2): #based on EVAtksScript; TODO: auto gen... + atk_l=[] + step=0 + times = [ #Timedelta from sim start time [select attack day]; (hour, minute) to start attack [select attack time] + (pd.Timedelta(days=1), 0, 20, None), #atk 0 getSeed + (pd.Timedelta(days=1), 0, 40, None), #atk 1 getCPs + ] + atk_cycles=27 + atk_cycles_actual=0 + for x in range(0, atk_cycles, 5): #[0, 5, 10, 15, 20, 25] + times.append((pd.Timedelta(days=0+x), 8, 00, "atk 2 MAD inc")) #atk 2 MAD inc + times.append((pd.Timedelta(days=0+x), 13, 00, "atk 3 FDI red")) #atk 3 FDI red + times.append((pd.Timedelta(days=1+x), 8, 00, "atk 4 FDI inc (no limit)")) #atk 4 FDI inc (no limit) + times.append((pd.Timedelta(days=1+x), 18, 00, "atk 5 MAD inc and FDI same")) #atk 5 MAD inc and FDI same + times.append((pd.Timedelta(days=2+x), 8, 00, "atk 6 MAD red")) #atk 6 MAD red + times.append((pd.Timedelta(days=2+x), 18, 00, "atk 7 FDI inc")) #atk 7 FDI inc + times.append((pd.Timedelta(days=3+x), 8, 00, "atk 8 MAD inc x2")) #atk 8 MAD inc x2 + times.append((pd.Timedelta(days=3+x), 18, 00, "atk 9 MAD red and FDI inc")) #atk 9 MAD red and FDI inc + times.append((pd.Timedelta(days=4+x), 8, 00, "atk 10 MAD inc and FDI red")) #atk 10 MAD inc and FDI red + times.append((pd.Timedelta(days=4+x), 18, 00, "atk 11 MAD inc x1.5")) #atk 11 MAD inc x1.5 + atk_cycles_actual+=1 + steps_per_cycle=10 + + times.append((pd.Timedelta(days=31), 23, 59, None)) #Shutdown + + offset = pd.Timedelta(days=2) + + setup_times = [0,1] + times = [(x[0]+offset, x[1], x[2], x[3]) if i not in setup_times else (x[0], x[1], x[2], x[3]) for i,x in enumerate(times)] + + start_sim_clock_time = pd.to_datetime("2023-11-29 00:00:00") + + for x in setup_times: + atk_time = times[step] + atk_time_sim = start_sim_clock_time + atk_time[0] + atk_time_h = atk_time[1] + atk_time_m = atk_time[2] + atk_time_sim = atk_time_sim.replace(hour=atk_time_h, minute=atk_time_m) + + atk_l.append({"type":atk_time[3], "start":atk_time_sim, "end":atk_time_sim}) + step += 1 + + for x in 
range(atk_cycles_actual): + for i in range(steps_per_cycle): + atk_dur_h = x+1 #*6 *365 + atk_time = times[step] + if atk_time[3] is None: + break + + atk_time_sim = start_sim_clock_time + atk_time[0] + + atk_time_h = atk_time[1] -1 #rnd times + atk_time_m = atk_time[2] + # atk_time_h = atk_time[1] + self.rand.randint(low=-1,high=2) + # atk_time_m = atk_time[2] + self.rand.randint(low= 0,high=60-atk_time[2]) + atk_time_sim = atk_time_sim.replace(hour=atk_time_h, minute=atk_time_m) + + length=15 * 4 * atk_dur_h + #l_buffer=2#10 + length+=60*(1+3 +l_buffer) #1+3 rnd times + end_diff = pd.Timedelta(minutes=length) + atk_time_sim_end = atk_time_sim+end_diff + atk_l.append({"type":atk_time[3], "start":atk_time_sim, "end":atk_time_sim_end}) + step += 1 + + + for x in ["shutdown"]: + atk_time = times[step] + atk_time_sim = start_sim_clock_time + atk_time[0] + atk_time_h = atk_time[1] + atk_time_m = atk_time[2] + atk_time_sim = atk_time_sim.replace(hour=atk_time_h, minute=atk_time_m) + + atk_l.append({"type":atk_time[3], "start":atk_time_sim, "end":atk_time_sim}) + step += 1 + + return atk_l + + +def get_atks(l_buffer=2, start_sim_offset_h=0): #based on EVAtksScript; TODO: auto gen... + atk_l=[] + step=0 + times = [ #Timedelta from sim start time [select attack day]; (hour, minute) to start attack [select attack time] + (pd.Timedelta(days=1), 0, 20, None), #atk 0 getSeed + (pd.Timedelta(days=1), 0, 40, None), #atk 1 getCPs + ] + atk_cycles=27 + atk_cycles_actual=0 + for x in range(0, atk_cycles, 10): #[0, 10, 20,] + times.append((pd.Timedelta(days=0+x), 7, 00, "atk 2 MAD inc")) #atk 2 MAD inc + times.append((pd.Timedelta(days=1+x), 14, 00, "atk 3 FDI red")) #atk 3 FDI red + times.append((pd.Timedelta(days=2+x), 8, 00, "atk 4 FDI inc (no limit)")) #atk 4 FDI inc (no limit) + times.append((pd.Timedelta(days=3+x), 18, 00, "atk 5 MAD inc and FDI same")) #atk 5 MAD inc and FDI same + times.append((pd.Timedelta(days=4+x), 8, 00, "atk 6 MAD red")) #atk 6 MAD red + times.append((pd.Timedelta(days=5+x), 18, 00, "atk 7 FDI inc")) #atk 7 FDI inc + times.append((pd.Timedelta(days=6+x), 8, 00, "atk 8 MAD inc x2")) #atk 8 MAD inc x2 + times.append((pd.Timedelta(days=7+x), 18, 00, "atk 9 MAD red and FDI inc")) #atk 9 MAD red and FDI inc + times.append((pd.Timedelta(days=8+x), 8, 00, "atk 10 MAD inc and FDI red")) #atk 10 MAD inc and FDI red + times.append((pd.Timedelta(days=9+x), 18, 00, "atk 11 MAD inc x2 (no limit) and FDI same")) #atk 11 MAD inc x2 (no limit) and FDI same + atk_cycles_actual+=1 + steps_per_cycle=10 + + times.append((pd.Timedelta(days=31), 23, 59, None)) #Shutdown + + offset = pd.Timedelta(days=2) + + setup_times = [0,1] + times = [(x[0]+offset, x[1], x[2], x[3]) if i not in setup_times else (x[0], x[1], x[2], x[3]) for i,x in enumerate(times)] + + start_sim_clock_time = pd.to_datetime("2023-11-29 00:00:00") + + for x in setup_times: + atk_time = times[step] + atk_time_sim = start_sim_clock_time + atk_time[0] + atk_time_h = atk_time[1] + atk_time_m = atk_time[2] + atk_time_sim = atk_time_sim.replace(hour=atk_time_h, minute=atk_time_m) + + atk_l.append({"type":atk_time[3], "start":atk_time_sim, "end":atk_time_sim}) + step += 1 + + for x in range(atk_cycles_actual): + for i in range(steps_per_cycle): + atk_dur_h = x+1 #*6 *365 + atk_dur_h = min(atk_dur_h, 4) #max 4 hours... 
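+                # Descriptive note: each labelled attack window starts 1 h before the scheduled hour; its length is 15*4*atk_dur_h min of attack traffic plus 60*(1+3+l_buffer) min of slack for the randomised start/stop times (see the "rnd times" comments below).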
+ atk_time = times[step] + if atk_time[3] is None: + break + + atk_time_sim = start_sim_clock_time + atk_time[0] + + atk_time_h = atk_time[1] - 1 #rnd times + atk_time_h += start_sim_offset_h + atk_time_m = atk_time[2] + # atk_time_h = atk_time[1] + self.rand.randint(low=-1,high=2) + # atk_time_m = atk_time[2] + self.rand.randint(low= 0,high=60-atk_time[2]) + atk_time_sim = atk_time_sim.replace(hour=atk_time_h, minute=atk_time_m) + + length=15 * 4 * atk_dur_h + #l_buffer=2#10 + length+=60*(1+3 +l_buffer) #1+3 rnd times + end_diff = pd.Timedelta(minutes=length) + atk_time_sim_end = atk_time_sim+end_diff + atk_l.append({"type":atk_time[3], "start":atk_time_sim, "end":atk_time_sim_end}) + step += 1 + + + for x in ["shutdown"]: + atk_time = times[step] + atk_time_sim = start_sim_clock_time + atk_time[0] + atk_time_h = atk_time[1] + atk_time_m = atk_time[2] + atk_time_sim = atk_time_sim.replace(hour=atk_time_h, minute=atk_time_m) + + atk_l.append({"type":atk_time[3], "start":atk_time_sim, "end":atk_time_sim}) + step += 1 + + return atk_l \ No newline at end of file diff --git a/ids/features_cpo.py b/ids/features_cpo.py new file mode 100644 index 0000000000000000000000000000000000000000..425375413fcf8f73b426796f754e74338be8cd92 --- /dev/null +++ b/ids/features_cpo.py @@ -0,0 +1,409 @@ + +from collections import defaultdict +import os +from matplotlib import pyplot as plt +import pandas as pd +from tqdm import tqdm + + +import re + +import logging + +from features_aux import add_lags, get_date_exog, normalize_cols, add_all_lags + +logger = logging.getLogger("WATTSON_EV_IDS.FeaturesCPO") + +ROUND_TO=6 + +def get_cpo_files(DIR): + ret={} + sub_folders = [name for name in os.listdir(DIR) if os.path.isdir(os.path.join(DIR, name))] + for x in [f for f in sub_folders if f.startswith("CPO_")]: + x_path = os.path.join(DIR, x) + onlyfiles = [f for f in os.listdir(x_path) if os.path.isfile(os.path.join(x_path, f))] + onlyfiles = [f for f in onlyfiles if f.startswith("cpo_ocpp_data_") and f.endswith(".csv.gz")] + ret[x] = onlyfiles + return ret + +def read_cpo_files(dir, files): + # print(dir, files) + regex = re.compile(r"^cpo_ocpp_data_(CP_\d+)_([\w\d]+).csv.gz$") + cpo_files=defaultdict(lambda: dict()) + for f in tqdm(files, desc="loading cpo_files"): + #cpo_ocpp_data_CP_1_0bc01518df38656accbde55bfb0e38591.csv.gz + result = regex.search(f) + if result: + f_df = pd.read_csv(os.path.join(dir, f), index_col="meter_value_ts", parse_dates=["meter_value_ts"]) + if result.group(1) != f_df.iloc[0]["cp_group"]: + logger.error(f"wrong cp_group {result.group(1)} != {f_df.iloc[0]['cp_group']}") + # f_df = f_df[[c for c in f_df.columns if not c.startswith("custom_data") and c != "cp_c_id" and c != "cp_group"]] + cpo_files[result.group(1)][result.group(2)] = f_df + # break + else: + logger.error(f"unk file {f}") + continue + return cpo_files + + + +def get_features_cpo(df, per_cp_normalize, per_cp_date_exog): + keep_cols = ["charge_speed", "meter_value_sampled_value", "meter_diff", "time_diff", "cp_c_id"] + f_df = df[[c for c in df.columns if c != "cp_group"]] # c != "cp_c_id" and not c.startswith("custom_data") and + f_df = pd.concat([f_df, pd.get_dummies(f_df["event"], prefix="event")], axis=1) + # f_df["meter_value_p"] = (f_df["meter_value_sampled_value"] - f_df["meter_value_sampled_value"].min()) / (f_df["meter_value_sampled_value"].max() - f_df["meter_value_sampled_value"].min()) + # f_df["meter_value_p_diff"] = f_df["meter_value_p"] - f_df["meter_value_p"].shift(1) + # f_df["meter_value_p_diff"] = 
f_df["meter_value_p_diff"].fillna(0) + f_df["time_diff"] = f_df["time_diff"].fillna(0) + + f_df["is_attack"] = 0 + f_df.loc[f_df[f_df["custom_data_atk_type"] != "None"].index, "is_attack"] = 1 + f_df = pd.concat([f_df, pd.get_dummies(f_df["custom_data_atk_type"], prefix="custom_data_atk_type")], axis=1) + + if per_cp_date_exog: + c1 = f_df.columns + get_date_exog(f_df, prefix="date_exog_cp_") + c_del = [c for c in f_df.columns if c not in c1] + for c in f_df.columns: + if c.startswith("date_exog_cp_"): + f_df = pd.concat([f_df, pd.get_dummies(f_df[c], prefix=c)], axis=1) + f_df = f_df[[c for c in f_df.columns if c not in c_del]] #del non dummy date exog + keep_cols+= [c for c in f_df.columns if c.startswith("date_exog_cp_")] + + keep_cols+= [c for c in f_df.columns if c.startswith("custom_data") and "custom_data_vendor_id" not in c and "custom_data_atk_type" not in c] + keep_cols+= ["is_attack"] + f_df = f_df[[c for c in f_df.columns if c.startswith("event_") or c in keep_cols]] + + # if len(f_df["is_attack"].drop_duplicates()) > 0: + # print(f_df) + # exit() + # if len(f_df["is_attack"].drop_duplicates()) > 0: + # print(f_df) + # exit() + + if per_cp_normalize: + normalize_cols(f_df, "_cp_norm") + return f_df.copy() + +def resample_feats3(df, t_min, t_max, freq='15Min'): + oidx = df.index + t_min2=df.index.min() + t_max2=df.index.max() + #nidx = pd.date_range(oidx.min(), oidx.max(), freq='5Min') + nidx = pd.date_range(t_min2, t_max2, freq=freq) #, normalize=True + res = df.reindex(oidx.union(nidx)).interpolate('time', limit_direction="both").reindex(nidx) + res[[c for c in df.columns if "event_Ended" in c]] = df[[c for c in df.columns if "event_Ended" in c]].reindex(oidx.union(nidx)).interpolate('time', limit_direction="backward").reindex(nidx) + res[[c for c in df.columns if "event_Started" in c]] = df[[c for c in df.columns if "event_Started" in c]].reindex(oidx.union(nidx)).interpolate('time', limit_direction="forward").reindex(nidx) + res[[c for c in df.columns if "event_Ended" in c]] = res[[c for c in df.columns if "event_Ended" in c]].fillna(0) + res[[c for c in df.columns if "event_Started" in c]] = res[[c for c in df.columns if "event_Started" in c]].fillna(0) + res = resample_feats(res, t_min, t_max, freq) + return res + +def resample_feats2(df, t_min, t_max, freq='15Min'): + oidx = df.index + #nidx = pd.date_range(oidx.min(), oidx.max(), freq='5Min') + nidx = pd.date_range(t_min, t_max, freq=freq) + res = df.reindex(oidx.union(nidx)).interpolate('time', limit_direction="both").reindex(nidx) + res[[c for c in df.columns if "event_Ended" in c]] = df[[c for c in df.columns if "event_Ended" in c]].reindex(oidx.union(nidx)).interpolate('time', limit_direction="backward").reindex(nidx) + res[[c for c in df.columns if "event_Started" in c]] = df[[c for c in df.columns if "event_Started" in c]].reindex(oidx.union(nidx)).interpolate('time', limit_direction="forward").reindex(nidx) + res[[c for c in df.columns if "event_Ended" in c]] = res[[c for c in df.columns if "event_Ended" in c]].fillna(0) + res[[c for c in df.columns if "event_Started" in c]] = res[[c for c in df.columns if "event_Started" in c]].fillna(0) + + for fward_c in ["meter_value_sampled_value", "meter_diff", "charge_speed", "meter_no_atk"]: + fward_cs = [c for c in df.columns if fward_c in c] + res[fward_cs] = df[fward_cs].reindex(oidx.union(nidx)).ffill().reindex(nidx) + for fward_c in ["meter_value_sampled_value", "meter_no_atk"]: + fward_cs = [c for c in df.columns if fward_c in c] + res[fward_cs] = 
res[fward_cs].fillna(res[fward_cs].min()) + for fward_c in ["meter_diff", "charge_speed"]: + fward_cs = [c for c in df.columns if fward_c in c] + res[fward_cs] = res[fward_cs].fillna(0) + return res + +def resample_feats(df, t_min, t_max, freq='15Min'): + oidx = df.index + #nidx = pd.date_range(oidx.min(), oidx.max(), freq='5Min') + nidx = pd.date_range(t_min, t_max, freq=freq) + res = df.reindex(oidx.union(nidx)).interpolate('time', limit_direction="both").reindex(nidx) + res[[c for c in df.columns if "event_Ended" in c]] = df[[c for c in df.columns if "event_Ended" in c]].reindex(oidx.union(nidx)).interpolate('time', limit_direction="backward").reindex(nidx) + res[[c for c in df.columns if "event_Started" in c]] = df[[c for c in df.columns if "event_Started" in c]].reindex(oidx.union(nidx)).interpolate('time', limit_direction="forward").reindex(nidx) + res[[c for c in df.columns if "event_Ended" in c]] = res[[c for c in df.columns if "event_Ended" in c]].fillna(0) + res[[c for c in df.columns if "event_Started" in c]] = res[[c for c in df.columns if "event_Started" in c]].fillna(0) + return res + + +def get_cpo_dfs(cpo_files, per_cp_normalize, per_cp_date_exog): + cpo_dfs=dict() + t_min=dict() + t_max=dict() + for group, cps in tqdm(cpo_files.items(), desc="loading cpo_dfs"): + cp_dfs=[] + for cp, cp_df in cps.items(): + cp_dff = get_features_cpo(cp_df, per_cp_normalize=per_cp_normalize, per_cp_date_exog=per_cp_date_exog) + if group not in t_min: + t_min[group] = cp_dff.index.min() + t_max[group] = cp_dff.index.max() + else: + t_min[group] = min(t_min[group], cp_dff.index.min()) + t_max[group] = max(t_max[group], cp_dff.index.max()) + + cp_dfs.append(cp_dff) + # print(group, cp, cp_dff) + # exit() + cpo_dfs[group] = cp_dfs + return cpo_dfs, t_min, t_max + + + # ['meter_value_sampled_value', 'custom_data_meter_no_atk', + # 'custom_data_scaling', 'custom_data_energy_interval', + # 'custom_data_original_energy_interval', 'custom_data_average_power', + # 'custom_data_original_average_power', 'meter_diff', 'time_diff', + # 'charge_speed', 'custom_data_meter_diff', 'custom_data_charge_speed', + # 'event_Ended', 'event_Started', 'event_Updated', 'is_attack', + # 'date_exog_cp_dayofweek_1', 'date_exog_cp_dayofweek_2', + # 'date_exog_cp_dayofweek_6', 'date_exog_cp_hour_7', + # 'date_exog_cp_hour_8', 'date_exog_cp_hour_9', 'date_exog_cp_hour_10', + # 'date_exog_cp_hour_11', 'date_exog_cp_hour_12', 'date_exog_cp_hour_13', + # 'date_exog_cp_hour_14', 'date_exog_cp_hour_15', 'date_exog_cp_hour_16', + # 'date_exog_cp_discretize_hour_only_Play', + # 'date_exog_cp_discretize_hour_only_Work', + # 'date_exog_cp_discretize_hour_day_High-Home', + # 'date_exog_cp_discretize_hour_day_High-Work', + # 'date_exog_cp_discretize_day_is_work_False', + # 'date_exog_cp_discretize_day_is_work_True', + # 'date_exog_cp_discretize_hour_balancing_low', + # 'date_exog_cp_discretize_hour_balancing_peak'], +def _add_individual_cp_features(cp_dfs): + for cp_df in cp_dfs: + do_cols = [c for c in cp_df.columns if "date_exog_" not in c and "custom_data_" not in c and c != "cp_c_id"] + cp_id = cp_df["cp_c_id"].drop_duplicates().iloc[0] + cp_df2 = cp_df[[c for c in do_cols]].copy() + cp_df2.rename(columns={c:"CP_"+str(cp_id)+"_"+c for c in do_cols}, inplace=True) + cp_df = pd.concat([cp_df, cp_df2], axis=1) + cp_df.drop(columns="cp_c_id", inplace=True) + yield cp_df + + +def insert_sess_ends(cp_dff): + #print([c for c in cp_dff.columns]) + #print(cp_dff[['time_diff', 'charge_speed','event_Ended', 'event_Started', 'event_Updated', 
"is_attack",]]) + #print(cp_dff[cp_dff['event_Ended'] == 1]) + if "event_Ended" not in cp_dff.columns: + cp_dff["event_Ended"] = 0 + cp_dff.iloc[-1,cp_dff.columns.get_loc("event_Ended")] = 1 + cp_dff.iloc[-1,cp_dff.columns.get_loc("event_Updated")] = 0 + cp_dff.iloc[-1,cp_dff.columns.get_loc("event_Started")] = 0 + if cp_dff.iloc[-1]["event_Ended"] != 1: + logger.debug('cp_dff.iloc[-1]["event_Ended"] != 1') + #logger.error(cp_dff[['time_diff', 'charge_speed','event_Started', 'event_Updated', "event_Ended", "is_attack",]]) + cp_dff.iloc[-1,cp_dff.columns.get_loc("event_Ended")] = 1 + cp_dff.iloc[-1,cp_dff.columns.get_loc("event_Updated")] = 0 + cp_dff.iloc[-1,cp_dff.columns.get_loc("event_Started")] = 0 + #exit() + for idx in cp_dff[cp_dff['event_Ended'] == 1].index: + idx_l = cp_dff.index.get_loc(idx) + idx_c = cp_dff.index[idx_l] + idx_max=len(cp_dff)-1 + idx_n = cp_dff.index[min(idx_max,idx_l+1)] + new_idx=None + time_diff=None + for x in [10,1,0.1]: + #print(idx_c+pd.Timedelta(seconds=x),idx_n) + if idx_c+pd.Timedelta(seconds=x) < idx_n or idx_n == cp_dff.index[idx_max]: + new_idx = idx_c+pd.Timedelta(seconds=x) + time_diff=(x/60)/60 # s to h + break + if new_idx is None: + raise Exception(f"no new_idx found for {idx_c=}, {idx_n=} in:\n {cp_dff}") + #print(new_idx) + cp_dff.loc[new_idx] = cp_dff.loc[idx_c] + cp_dff.loc[new_idx, "time_diff"] = time_diff + zero_list = ['energy_interval', 'average_power', 'meter_diff', "charge_speed", "is_attack"] + for z in zero_list: + for c in cp_dff.columns: + if z in c: + cp_dff.loc[new_idx, c] = 0 + cp_dff = cp_dff.sort_index() + #print(cp_dff[['time_diff', 'charge_speed','event_Ended', 'event_Started', 'event_Updated', "is_attack",]]) + return cp_dff + + +def get_cp_df_interp2(cp_dfs, group, t_min, t_max, add_individual_cp_features=True): + base_cols=sorted(set([c for cp_df in cp_dfs for c in cp_df.columns if c != "cp_c_id"])) + if add_individual_cp_features: + cp_dfs = _add_individual_cp_features(cp_dfs) + + cp_df_interp2=None + for cp_dff in cp_dfs: + cp_dff = insert_sess_ends(cp_dff) + cp_dff = resample_feats(cp_dff, t_min[group], t_max[group]) + # fig, ax = plt.subplots(figsize=(16,9)) + # ax.plot(cp_dff.index, cp_dff["charge_speed"], label="charge_speed") + # plt.legend() + # plt.show() + # plt.close() + # print(row) + #exit() + if cp_df_interp2 is None: + cp_df_interp2=cp_dff + else: + add_cols=[c for c in cp_dff.columns if c in base_cols and c in cp_df_interp2.columns] + concat_cols=[c for c in cp_dff.columns if c not in add_cols] + cp_df_interp2[add_cols]+=cp_dff[add_cols] + # cp_df_interp2 = pd.concat([cp_df_interp2, cp_dff[add_cols]]) + # cp_df_interp2 = cp_df_interp2.groupby(cp_df_interp2.index).sum() + cp_df_interp2 = pd.concat([cp_df_interp2, cp_dff[concat_cols]], axis=1) + return cp_df_interp2.round(ROUND_TO) + +def get_cp_df_interp(cp_dfs, group, t_min, t_max, add_individual_cp_features=True): + cp_dfs_interp=[] + + if add_individual_cp_features: + cp_dfs = _add_individual_cp_features(cp_dfs) + + for cp_dff in cp_dfs: + cp_dff = resample_feats(cp_dff, t_min[group], t_max[group]) + cp_dfs_interp.append(cp_dff) + # print(cp_dff["meter_value_sampled_value"].min()) + # print(group, cp_dff) + + cp_df_interp = pd.concat(cp_dfs_interp) + cp_df_interp = cp_df_interp.groupby(cp_df_interp.index).sum() + return cp_df_interp.round(ROUND_TO) + +def resample_cpo_dfs(cpo_dfs, t_min, t_max, per_cp_group_normalize, per_cp_group_date_exog, add_individual_cp_features=True): + for group, cp_dfs in tqdm(cpo_dfs.items(), desc="loading cp_df_interp"): + + 
cp_df_interp = get_cp_df_interp2(cp_dfs, group, t_min, t_max, add_individual_cp_features) + # cp_df_interp = get_cp_df_interp(cp_dfs, group, t_min, t_max, add_individual_cp_features) + #print(cp_df_interp["charge_speed"]) + # print(cp_df_interp.columns) + # print(cp_df_interp2) + # print(cp_df_interp2.columns) + # print(cp_df_interp.equals(cp_df_interp2)) + #exit() + #cp_df_interp.loc[:, "num_current_sessions"] = cp_df_interp[[c for c in cp_df_interp if "event_Updated" in c]].sum(axis=1) # not needed since event_Updated already summed up... + + if per_cp_group_date_exog: + c1 = cp_df_interp.columns + get_date_exog(cp_df_interp, prefix="date_exog_group_") + c_del = [c for c in cp_df_interp.columns if c not in c1] + for c in cp_df_interp.columns: + if c.startswith("date_exog_group_"): + cp_df_interp = pd.concat([cp_df_interp, pd.get_dummies(cp_df_interp[c], prefix=c)], axis=1) + cp_df_interp = cp_df_interp[[c for c in cp_df_interp.columns if c not in c_del]] + + if per_cp_group_normalize: + normalize_cols(cp_df_interp, "_group_norm") + + # if len(cp_df_interp.loc[cp_df_interp.isna().any(axis=1), cp_df_interp.isna().any()]) > 1: + # print("1",cp_df_interp.loc[cp_df_interp.isna().any(axis=1), cp_df_interp.isna().any()]) + # exit() + # print(group, cp_df_interp) + # break + yield group, cp_df_interp + + +def _get_cp_feat_cols(cp_df_interp, group, cpo_feat_cols, keep_only_normalized=False, keep_all_normalized=False, pred_col = "charge_speed_lag_0"): + keep_lag_0_CPs = ["charge_speed"] + keep_lag_0_n = ["time_diff", 'event_Ended', 'event_Started', 'event_Updated', "date_exog_cp_", "date_exog_group_"] + if keep_all_normalized: + keep_lag_0_n+=["_norm"] + keep_lag_1_n = ["meter_value_sampled_value", "meter_diff", "charge_speed"] + + + for col in cp_df_interp.columns: + + for s in keep_lag_0_CPs: + if (s in col and "_lag_" in col and "CP_" in col) and ("custom_data_" not in col and "is_attack" not in col): + if keep_only_normalized and "_norm" not in col: + continue + cpo_feat_cols[group]["keep_lag_0_n_cols"].add(col) + + for s in keep_lag_0_n: + if (s in col and "_lag_" in col) and ("custom_data_" not in col and "is_attack" not in col): + if keep_only_normalized and "_norm" not in col: + continue + cpo_feat_cols[group]["keep_lag_0_n_cols"].add(col) + for s in keep_lag_1_n: + if (s in col and "_lag_0" not in col) and ("custom_data_" not in col and "is_attack" not in col): + if keep_only_normalized and "_norm" not in col: + continue + if col in cpo_feat_cols[group]["keep_lag_0_n_cols"]: + continue + cpo_feat_cols[group]["keep_lag_1_n_cols"].add(col) + + cpo_feat_cols[group]["pred_col"].add(pred_col) + +def _get_cpo_feat_cols(cpo_df_interp_l, keep_only_normalized=False): + cpo_feat_cols = defaultdict(lambda: defaultdict(lambda: set())) + for group, cp_df_interp in cpo_df_interp_l.items(): + _get_cp_feat_cols(cp_df_interp, group, cpo_feat_cols, keep_only_normalized) + return cpo_feat_cols + +def get_cp_feat_cols(g,t,cpo_group_feat_cols): + for col_t,cols in t.items(): + if col_t in ["keep_lag_0_n_cols", "keep_lag_1_n_cols"]: + cpo_group_feat_cols[g]["feat_cols"].extend(sorted(cols)) + elif col_t in ["pred_col"]: + cpo_group_feat_cols[g]["pred_col"].extend(sorted(cols)) + +def get_cpo_feat_cols(cpo_df_interp_l): + cpo_feat_cols = _get_cpo_feat_cols(cpo_df_interp_l) + cpo_group_feat_cols = defaultdict(lambda: defaultdict(lambda: list())) + for g,t in cpo_feat_cols.items(): + get_cp_feat_cols(g,t,cpo_group_feat_cols) + return cpo_group_feat_cols + +def get_cpo_group_feat_dfs(cpo_files, 
per_cp_normalize=False, per_cp_date_exog=True, per_cp_group_normalize=True, per_cp_group_date_exog=True, num_lags=4): + cpo_dfs, t_min, t_max = get_cpo_dfs(cpo_files, per_cp_normalize=per_cp_normalize, per_cp_date_exog=per_cp_date_exog) + # cpo_df_interp = {group: cp_df_interp for group, cp_df_interp in resample_cpo_dfs(cpo_dfs, t_min, t_max, per_cp_group_normalize=per_cp_group_normalize, per_cp_group_date_exog=per_cp_group_date_exog)} + # cpo_df_interp_l = {group: cp_df_interp_l for group, cp_df_interp_l in add_all_lags(cpo_df_interp, num_lags=num_lags)} + # cpo_feat_cols = get_cpo_feat_cols(cpo_df_interp_l) + pred_col_init = "charge_speed" + pred_col = pred_col_init+"_lag_0" + + + for group, cp_df_interp in resample_cpo_dfs(cpo_dfs, t_min, t_max, per_cp_group_normalize=per_cp_group_normalize, per_cp_group_date_exog=per_cp_group_date_exog): + cp_df_interp_l = add_lags(cp_df_interp, num_lags=num_lags, only_do_col=pred_col_init) + _cpo_feat_cols = defaultdict(lambda: defaultdict(lambda: set())) + _get_cp_feat_cols(cp_df_interp_l, group, _cpo_feat_cols, pred_col=pred_col) + cpo_feat_cols = defaultdict(lambda: defaultdict(lambda: list())) + for g,t in _cpo_feat_cols.items(): + get_cp_feat_cols(g,t,cpo_feat_cols) + yield {group: {"cp_g_df": cp_df_interp_l, + "feat_cols": cpo_feat_cols[group]["feat_cols"], + "pred_col": cpo_feat_cols[group]["pred_col"][0]}} + + + + # cpo_df_interp_l_feat = {group: {"cp_g_df": cp_df_interp_l, + # "feat_cols": cpo_feat_cols[group]["feat_cols"], + # "pred_col": cpo_feat_cols[group]["pred_col"][0]} for group, cp_df_interp_l in cpo_df_interp_l.items()} + # return cpo_df_interp_l_feat + # for group, cp_df_interp_l in cpo_df_interp_l.items(): + # print(group, cp_df_interp_l) + # print([c for c in cp_df_interp_l.columns if "_norm" not in c and "_lag_0" in c]) + # """['meter_value_sampled_value', 'custom_data_meter_no_atk', 'custom_data_scaling', 'custom_data_energy_interval', 'custom_data_original_energy_interval', + # 'custom_data_average_power', 'custom_data_original_average_power', 'meter_diff', 'time_diff', 'charge_speed', 'custom_data_meter_diff', 'custom_data_charge_speed', + # 'event_Ended', 'event_Started', 'event_Updated', 'is_attack', + # 'date_exog_cp_dayofweek', 'date_exog_cp_hour', 'date_exog_cp_dayofweek_4', 'date_exog_cp_hour_8', + # 'date_exog_cp_hour_9', 'date_exog_cp_hour_10', 'date_exog_cp_hour_11', 'date_exog_cp_hour_12', 'date_exog_cp_hour_13', 'date_exog_cp_hour_14', + # 'date_exog_cp_discretize_hour_only_Play', 'date_exog_cp_discretize_hour_only_Work', 'date_exog_cp_discretize_hour_day_High-Work', 'date_exog_cp_discretize_day_is_work_True', + # 'date_exog_cp_discretize_hour_balancing_low', 'date_exog_cp_discretize_hour_balancing_peak', 'date_exog_cp_dayofweek_3', 'date_exog_cp_hour_15', 'date_exog_cp_dayofweek_0', + # 'date_exog_cp_hour_16', 'date_exog_cp_dayofweek_5', 'date_exog_cp_hour_0', 'date_exog_cp_hour_1', 'date_exog_cp_hour_2', 'date_exog_cp_hour_3', 'date_exog_cp_hour_4', + # 'date_exog_cp_hour_5', 'date_exog_cp_hour_6', 'date_exog_cp_hour_18', 'date_exog_cp_hour_19', 'date_exog_cp_hour_20', 'date_exog_cp_hour_21', 'date_exog_cp_hour_22', + # 'date_exog_cp_hour_23', 'date_exog_cp_discretize_hour_only_Sleep', 'date_exog_cp_discretize_hour_day_High-Home', 'date_exog_cp_discretize_day_is_work_False', 'date_exog_cp_hour_7', + # 'date_exog_cp_hour_17', 'date_exog_cp_dayofweek_1', + # 'date_exog_group_dayofweek_0', 'date_exog_group_dayofweek_1', 'date_exog_group_dayofweek_3', 'date_exog_group_dayofweek_4', + # 'date_exog_group_dayofweek_5', 
'date_exog_group_dayofweek_6', 'date_exog_group_hour_0', 'date_exog_group_hour_1', 'date_exog_group_hour_2', 'date_exog_group_hour_3', + # 'date_exog_group_hour_4', 'date_exog_group_hour_5', 'date_exog_group_hour_6', 'date_exog_group_hour_7', 'date_exog_group_hour_8', 'date_exog_group_hour_9', 'date_exog_group_hour_10', + # 'date_exog_group_hour_11', 'date_exog_group_hour_12', 'date_exog_group_hour_13', 'date_exog_group_hour_14', 'date_exog_group_hour_15', 'date_exog_group_hour_16', 'date_exog_group_hour_17', + # 'date_exog_group_hour_18', 'date_exog_group_hour_19', 'date_exog_group_hour_20', 'date_exog_group_hour_21', 'date_exog_group_hour_22', 'date_exog_group_hour_23', + # 'date_exog_group_discretize_hour_only_Play', 'date_exog_group_discretize_hour_only_Sleep', 'date_exog_group_discretize_hour_only_Work', 'date_exog_group_discretize_hour_day_High-Home', + # 'date_exog_group_discretize_hour_day_High-Work', 'date_exog_group_discretize_day_is_work_False', 'date_exog_group_discretize_day_is_work_True', 'date_exog_group_discretize_hour_balancing_low', + # 'date_exog_group_discretize_hour_balancing_peak']""" + # break #TODO no + # exit() + #TODO dont cheat on features, eg only prev speed but not current... + \ No newline at end of file diff --git a/ids/features_dso.py b/ids/features_dso.py new file mode 100644 index 0000000000000000000000000000000000000000..370fa2413104ecbcecb6b8e41f39cc241f52145a --- /dev/null +++ b/ids/features_dso.py @@ -0,0 +1,348 @@ + +from collections import defaultdict +from datetime import datetime +import os +import warnings +from matplotlib import pyplot as plt +import pandas as pd +from tqdm import tqdm + +import numpy as np + +import re + +import logging + +from features_aux import get_date_exog, get_date_exog_col, get_elems_attatched_to_bus, get_grid_pp, load_single_feat, normalize_cols, add_all_lags +from features_clf import _interp + + +logger = logging.getLogger("WATTSON_EV_IDS.FeaturesDSO") + +ROUND_TO=6 + +def get_dso_files(x_path): + onlyfiles = [f for f in os.listdir(x_path) if os.path.isfile(os.path.join(x_path, f))] + onlyfiles = [f for f in onlyfiles if f.startswith("dso_oscp_data_") and f.endswith(".csv.gz")] + return onlyfiles + +def read_dso_files(dir, files): + # print(dir, files) + regex = re.compile(r"^dso_oscp_data_(CP_\d+).csv.gz$") + dso_files=dict() + for f in tqdm(files, desc="loading read_dso_files"): + #dso_oscp_data_CP_1.csv.gz + #sim_time oscp_interval_h group_id measurements_value measurements_phase measurements_unit measurements_energy_type measurements_direction measurements_measure_time measurements_initial_measure_time meter_diff time_diff charge_speed + result = regex.search(f) + if result: + f_df = pd.read_csv(os.path.join(dir, f), index_col="sim_time", parse_dates=["sim_time", "measurements_measure_time", "measurements_initial_measure_time"]) + if result.group(1) != f_df.iloc[0]["group_id"]: + logger.error(f"wrong cp_group {result.group(1)} != {f_df.iloc[0]['group_id']}") + # f_df = f_df[[c for c in f_df.columns if not c.startswith("custom_data") and c != "cp_c_id" and c != "cp_group"]] + dso_files[result.group(1)] = f_df + # print(f_df) + # break + else: + logger.error(f"unk file {f}") + continue + return dso_files + +def read_dso_estimations(file): + m_df = pd.read_csv(file, parse_dates=["Unnamed: 0"]) + m_df.rename(columns={"Unnamed: 0":"sim_time"}, inplace=True) + m_df["sim_time"] = m_df["sim_time"].dt.tz_localize(None) + m_df = m_df.set_index("sim_time", drop=True) + return m_df + +def read_dso_measurements(file): + try: + 
m_df = pd.read_csv(file, parse_dates=["sim_time"]) + except ValueError as e: + logger.info(f"{e} in read_dso_measurements... trying Unnamed: 0") + m_df = pd.read_csv(file, parse_dates=["Unnamed: 0"]) + m_df.rename(columns={"Unnamed: 0":"sim_time"}, inplace=True) + logger.info(f"Unnamed: 0 worked.") + m_df["sim_time"] = m_df["sim_time"].dt.tz_localize(None) + m_df = m_df.set_index("sim_time", drop=True) + return m_df + + +def get_features_dso(df, per_cp_group_normalize, per_cp_group_date_exog): + # ['oscp_interval_h', 'group_id', 'measurements_value', + # 'measurements_phase', 'measurements_unit', 'measurements_energy_type', + # 'measurements_direction', 'measurements_measure_time', + # 'measurements_initial_measure_time', 'meter_diff', 'time_diff', + # 'charge_speed'] + #print(df) + # exit() + keep_cols = ["charge_speed", "measurements_value", "meter_diff", "time_diff"] + f_df = df[[c for c in df.columns if c != "oscp_interval_h" and c != "group_id" and c != "measurements_phase" and c != "measurements_unit" and c != "measurements_energy_type" and c != "measurements_direction"]].copy() + #f_df = pd.concat([f_df, pd.get_dummies(f_df["event"], prefix="event")], axis=1) + f_df["time_diff"] = f_df["time_diff"].fillna(0) + + if per_cp_group_date_exog: + c1 = f_df.columns + get_date_exog(f_df, prefix="date_exog_cp_g_") + get_date_exog_col(f_df, col="measurements_measure_time", prefix="date_exog_cp_g_") + get_date_exog_col(f_df, col="measurements_initial_measure_time", prefix="date_exog_cp_g_") + c_del = [c for c in f_df.columns if c not in c1] + for c in f_df.columns: + if c.startswith("date_exog_cp_g_"): + f_df = pd.concat([f_df, pd.get_dummies(f_df[c], prefix=c)], axis=1) + f_df = f_df[[c for c in f_df.columns if c not in c_del]] #del non dummy date exog + keep_cols+= [c for c in f_df.columns if c.startswith("date_exog_cp_g_")] + + f_df = f_df[[c for c in f_df.columns if c in keep_cols]] + + if per_cp_group_normalize: + normalize_cols(f_df, "_cp_g_norm") + return f_df.copy() + +def resample_feats(df, t_min, t_max, freq='15Min'): + oidx = df.index + #nidx = pd.date_range(oidx.min(), oidx.max(), freq='5Min') + nidx = pd.date_range(t_min, t_max, freq=freq) + res = df.reindex(oidx.union(nidx)).interpolate('time', limit_direction="both").reindex(nidx) + return res + + +def get_dso_dfs(dso_files, per_cp_group_normalize, per_cp_group_date_exog): + dso_dfs=dict() + t_min=dict() + t_max=dict() + for group, cpg_df in tqdm(dso_files.items(), desc="loading dso_dfs"): + cp_dff = get_features_dso(cpg_df, per_cp_group_normalize=per_cp_group_normalize, per_cp_group_date_exog=per_cp_group_date_exog) + if group not in t_min: + t_min[group] = cp_dff.index.min() + t_max[group] = cp_dff.index.max() + else: + t_min[group] = min(t_min[group], cp_dff.index.min()) + t_max[group] = max(t_max[group], cp_dff.index.max()) + + dso_dfs[group] = cp_dff + return dso_dfs, t_min, t_max + + +def get_group_meas_est_cols(pp, group, cols): + g_regex = re.compile(r"^CP_(\d+)$") + res = g_regex.search(group) + col_n = "bus."+res.group(1)+"." + group_meas_cols = [c for c in cols if col_n in c] + + if pp is not None: + conn_cols = get_elems_attatched_to_bus(pp, int(res.group(1))) + logger.debug(f"get_group_meas_est_cols adding: {conn_cols} for {col_n}") + for col_c in conn_cols: + col_c=col_c+"." 
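+            # Keep measurement columns of elements attached to this bus; for attached lines, only the to-side columns are kept (names containing "from_" are skipped), all columns are kept for other element types.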
+ group_meas_cols += [c for c in cols if col_c in c and (("line" in col_c and not "from_" in c) or ("line" not in col_c))] + group_meas_cols = sorted(set(group_meas_cols)) + logger.debug(f"get_group_meas_est_cols new: {group_meas_cols}") + else: + logger.warning(f"empty pp grid in get_group_meas_est_cols") + return group_meas_cols + +def resample_dso_dfs(pp, dso_dfs, dso_meas_df, dso_est_df, t_min, t_max, post_interp_normalize, post_interp_date_exog, add_meas_data=True, add_est_data=False, further_meas_export_data=None, pred_col="charge_speed"): + g_regex = re.compile(r"^CP_(\d+)$") + for group, cp_dff in tqdm(dso_dfs.items(), desc="loading cp_df_interp"): + cp_df_interp = resample_feats(cp_dff, t_min[group], t_max[group]) + + if add_meas_data: + group_meas_cols = get_group_meas_est_cols(pp, group, dso_meas_df.columns) + dso_meas_df_g = dso_meas_df[group_meas_cols].copy() + dso_meas_df_g.rename(columns={c:"grid_meas_"+c for c in group_meas_cols}, inplace=True) + dso_meas_df_interp = resample_feats(dso_meas_df_g, t_min[group], t_max[group]) + # print([c for c in dso_meas_df_interp.columns]) + # print(dso_meas_df_interp["grid_meas_849.10030.bus.8.active_power"]) + # print(cp_df_interp[pred_col]) + cp_df_interp = pd.concat([cp_df_interp, dso_meas_df_interp], axis=1) + for c in dso_meas_df_interp.columns: + if ".bus." in c and ".active_power" in c: #should be only one + cp_df_interp.loc[:,c+"_relation_to_"+pred_col] = cp_df_interp[pred_col]/cp_df_interp[c] + + cp_df_interp.loc[:, c.replace("bus.", "bus_and_sgen.")] = cp_df_interp[c] + for c2 in dso_meas_df_interp.columns: + if ".sgen." in c2 and ".active_power" in c2: + cp_df_interp.loc[:, c.replace("bus.", "bus_and_sgen.")] += cp_df_interp[c2] + cp_df_interp.loc[:, c.replace("grid_meas_", "grid_expo_").replace("bus.", "no_ev_load.")] = cp_df_interp.loc[:, c.replace("bus.", "bus_and_sgen.")] - cp_df_interp[pred_col] + cp_df_interp.loc[:, c.replace("grid_meas_", "grid_expo_").replace("bus.", "no_ev_load.") +"_relation_to_"+ pred_col] = cp_df_interp[pred_col] / cp_df_interp.loc[:, c.replace("grid_meas_", "grid_expo_").replace("bus.", "no_ev_load.")] + cp_df_interp.loc[:, c.replace("grid_meas_", "grid_expo_").replace("bus.", "no_ev_load.") +"_relation_to_bus"] = cp_df_interp.loc[:, c.replace("grid_meas_", "grid_expo_").replace("bus.", "no_ev_load.")] / cp_df_interp[c] + + + if further_meas_export_data is not None: + with warnings.catch_warnings(): + warnings.simplefilter('ignore', pd.errors.PerformanceWarning) + group_int = int(g_regex.search(group).group(1)) + for exp_data in further_meas_export_data[group_int]: + exp_data_interp = resample_feats(exp_data, t_min[group], t_max[group]) + rng = np.random.default_rng(seed=group_int) + for x in range(10,110,10): + rnd_fac = rng.integers(low=x, high=min(x+11, 101), size=None) + rnd_fac_s = rnd_fac * 0.01 + #print(rnd_fac_s) + exp_data_interp_fac=exp_data_interp*rnd_fac_s + exp_data_interp_fac.name = exp_data_interp.name+"_static_fac."+str(x) #same random fac over all measurements of a load (diff for diff loads) + cp_df_interp = pd.concat([cp_df_interp, exp_data_interp_fac], axis=1) + for x in range(10,110,10): + rnd_fac = rng.integers(low=x, high=min(x+11, 101), size=exp_data_interp.size) + rnd_fac_s = pd.Series(rnd_fac, index = exp_data_interp.index) + rnd_fac_s = rnd_fac_s * 0.01 + exp_data_interp_fac=exp_data_interp*rnd_fac_s + exp_data_interp_fac.name = exp_data_interp.name+"_rnd_fac."+str(x) #diff random fac over all measurements of same load + cp_df_interp = pd.concat([cp_df_interp, 
exp_data_interp_fac], axis=1) + for c in [col for col in cp_df_interp.columns]: #grid_expo_load.5.MEASUREMENT.active_power_static_fac.80 + if ".load." in c and ".active_power" in c and "_relation_to_" not in c: #should be only one + cp_df_interp.loc[:,c+"_relation_to_"+pred_col] = cp_df_interp[pred_col]/cp_df_interp[c] + for c2 in dso_meas_df_interp.columns: + if ".bus." in c2 and ".active_power" in c2 and "_relation_to_" not in c2: #should be only one + cp_df_interp.loc[:,c+"_relation_to_"+c2] = cp_df_interp[c2]/cp_df_interp[c] + cp_df_interp = cp_df_interp.copy() + else: + group_int = int(g_regex.search(group).group(1)) + rng = np.random.default_rng(seed=group_int) + load_cols=[] + for c in [col for col in cp_df_interp.columns]: #grid_meas_668.10340.load.11.active_power + if ".load." in c and ".active_power" in c and "_relation_to_" not in c: #should be only one or two + load_cols.append(c) + load_cols_new = {l:l.replace("grid_meas_", "grid_expo_") for l in load_cols} + cp_df_interp = cp_df_interp.rename(columns=load_cols_new) + load_cols = [v for v in load_cols_new.values()] + for lc in load_cols: + exp_data_interp = cp_df_interp[lc] + #for x in range(10,110,10): + for x in [100]: + rnd_fac = rng.integers(low=x, high=min(x+11, 101), size=None) + rnd_fac_s = rnd_fac * 0.01 + #print(rnd_fac_s) + exp_data_interp_fac=exp_data_interp*rnd_fac_s + exp_data_interp_fac.name = exp_data_interp.name+"_static_fac."+str(x) #same random fac over all measurements of a load (diff for diff loads) + cp_df_interp = pd.concat([cp_df_interp, exp_data_interp_fac], axis=1) + for x in range(2,21,2): + index_4 = exp_data_interp.index.values[::4] + rnd_fac4 = rng.uniform(low=-1*x, high=x, size=len(index_4)) + rnd_fac_s4 = pd.Series(rnd_fac4, index = index_4) + rnd_fac_s1 = _interp(rnd_fac_s4, exp_data_interp.index, col=None) + rnd_fac_s = rnd_fac_s1 * 0.01 + exp_data_interp_fac=exp_data_interp*(1+rnd_fac_s) + exp_data_interp_fac.name = exp_data_interp.name+"_rnd_fac."+str(x) #diff random fac over all measurements of same load + cp_df_interp = pd.concat([cp_df_interp, exp_data_interp_fac], axis=1) + # print(cp_df_interp) + # exit() + + for x in [90]: #TODO: deprecated... delete + rnd_fac = rng.integers(low=x, high=min(x+11, 101), size=exp_data_interp.size) + rnd_fac_s = pd.Series(rnd_fac, index = exp_data_interp.index) + rnd_fac_s = rnd_fac_s * 0.01 + exp_data_interp_fac=exp_data_interp*rnd_fac_s + exp_data_interp_fac.name = exp_data_interp.name+"_rnd_fac."+str(x) #diff random fac over all measurements of same load + cp_df_interp = pd.concat([cp_df_interp, exp_data_interp_fac], axis=1) + + for c in [col for col in cp_df_interp.columns]: #grid_expo_load.5.MEASUREMENT.active_power_static_fac.80 + if ".load." in c and ".active_power" in c and "_relation_to_" not in c and ("_static_fac." in c or "_rnd_fac." in c): #should be only one + cp_df_interp.loc[:,c+"_relation_to_"+pred_col] = cp_df_interp[pred_col]/cp_df_interp[c] + for c2 in dso_meas_df_interp.columns: + if ".bus." 
in c2 and ".active_power" in c2 and "_relation_to_" not in c2: #should be only one + cp_df_interp.loc[:,c+"_relation_to_"+c2] = cp_df_interp[c2]/cp_df_interp[c] + cp_df_interp = cp_df_interp.copy() + + if add_est_data and dso_est_df is not None: + group_est_cols = get_group_meas_est_cols(pp, group, dso_est_df.columns) + dso_est_df_g = dso_est_df[group_est_cols].copy() + dso_est_df_g.rename(columns={c:"grid_est_"+c for c in group_est_cols}, inplace=True) + dso_est_df_interp = resample_feats(dso_est_df_g, t_min[group], t_max[group]) + cp_df_interp = pd.concat([cp_df_interp, dso_est_df_interp], axis=1) + + + # print(group, [c for c in cp_df_interp.columns if "bus" in c]) + cp_df_interp = cp_df_interp.groupby(cp_df_interp.index).sum() + cp_df_interp = cp_df_interp.round(ROUND_TO) + if post_interp_date_exog: + c1 = cp_df_interp.columns + get_date_exog(cp_df_interp, prefix="date_exog_group_") + c_del = [c for c in cp_df_interp.columns if c not in c1] + for c in cp_df_interp.columns: + if c.startswith("date_exog_group_"): + cp_df_interp = pd.concat([cp_df_interp, pd.get_dummies(cp_df_interp[c], prefix=c)], axis=1) + cp_df_interp = cp_df_interp[[c for c in cp_df_interp.columns if c not in c_del]] + + if post_interp_normalize: + normalize_cols(cp_df_interp, "_group_norm") + + # if len(cp_df_interp.loc[cp_df_interp.isna().any(axis=1), cp_df_interp.isna().any()]) > 1: + # print("1",cp_df_interp.loc[cp_df_interp.isna().any(axis=1), cp_df_interp.isna().any()]) + # exit() + # print(group, [c for c in cp_df_interp.columns if "bus" in c]) + # exit() + # break + yield group, cp_df_interp + +def oscp_speed_workaround(cpo_group_speed, oscp_group_speed, pred_col = "charge_speed_lag_0"): + oidx = cpo_group_speed.index + #nidx = pd.date_range(oidx.min(), oidx.max(), freq='5Min') + nidx = oscp_group_speed.index + res = cpo_group_speed[pred_col].reindex(oidx.union(nidx)).interpolate('time', limit_direction="both").reindex(nidx) + return res + +# ['measurements_value_lag_0', 'meter_diff_lag_0', 'time_diff_lag_0', 'charge_speed_lag_0', +# 'date_exog_cp_g_...', +# 'date_exog_group_...'] +def _get_dso_feat_cols(pp,dso_df_interp_l, keep_only_normalized=False, pred_col = "charge_speed_lag_0"): + # print(dso_df_interp_l["CP_5"]) + # print([c for c in dso_df_interp_l["CP_5"].columns if "_norm" not in c and "_lag_0" in c]) + # exit() + keep_lag_0_n = ["time_diff", "date_exog_cp_g_", "date_exog_group_"] + keep_lag_1_n = ["measurements_value", "meter_diff", "charge_speed"] + + + dso_feat_cols = defaultdict(lambda: defaultdict(lambda: set())) + for group, cp_df_interp in dso_df_interp_l.items(): + keep_lag_0_n += [c.replace("_lag_0","") for c in get_group_meas_est_cols(pp, group, cp_df_interp.columns) if ("grid_meas_" in c or "grid_est_" in c) and "grid_expo_" not in c and "load." 
not in c] #exclude grid_expo_ (for load active_power) by default + # print([c.replace("_lag_0","") for c in get_group_meas_est_cols(pp, group, cp_df_interp.columns) if "grid_meas_" in c or "grid_est_" in c]) + # print(get_group_meas_est_cols(pp, group, cp_df_interp.columns)) + # exit() + for col in cp_df_interp.columns: + for s in keep_lag_0_n: + if (s in col and "_lag_" in col) and ("custom_data_" not in col and "is_attack" not in col and "grid_expo_" not in col): + if keep_only_normalized and "_norm" not in col: + continue + dso_feat_cols[group]["keep_lag_0_n_cols"].add(col) + for s in keep_lag_1_n: + if (s in col and "_lag_0" not in col) and ("custom_data_" not in col and "is_attack" not in col and "grid_expo_" not in col): + if keep_only_normalized and "_norm" not in col: + continue + dso_feat_cols[group]["keep_lag_1_n_cols"].add(col) + + dso_feat_cols[group]["pred_col"].add(pred_col) + return dso_feat_cols + +def get_dso_feat_cols(pp,dso_df_interp_l, pred_col = "charge_speed_lag_0"): + dso_feat_cols = _get_dso_feat_cols(pp,dso_df_interp_l, pred_col = pred_col) + dso_group_feat_cols = defaultdict(lambda: defaultdict(lambda: list())) + for g,t in dso_feat_cols.items(): + for col_t,cols in t.items(): + if col_t in ["keep_lag_0_n_cols", "keep_lag_1_n_cols"]: + dso_group_feat_cols[g]["feat_cols"].extend(sorted(cols)) + elif col_t in ["pred_col"]: + dso_group_feat_cols[g]["pred_col"].extend(sorted(cols)) + return dso_group_feat_cols + +def get_dso_group_feat_dfs(DIR, dso_files, dso_meas_df, dso_est_df, per_cp_group_normalize=False, per_cp_group_date_exog=True, post_interp_normalize=True, post_interp_date_exog=True, num_lags=4): + pp = get_grid_pp(DIR) + pred_col_init = "charge_speed" + pred_col = pred_col_init+"_lag_0" + + if False: + further_meas_export_data = get_grid_measurements_from_export(DIR, pp, bus_ns=[], target_elem="load", target_var=".MEASUREMENT.active_power" ) + else: + further_meas_export_data=None + + dso_dfs, t_min, t_max = get_dso_dfs(dso_files, per_cp_group_normalize=per_cp_group_normalize, per_cp_group_date_exog=per_cp_group_date_exog) + dso_df_interp = {group: cp_df_interp for group, cp_df_interp in resample_dso_dfs(pp, dso_dfs, dso_meas_df, dso_est_df, t_min, t_max, post_interp_normalize=post_interp_normalize, further_meas_export_data=further_meas_export_data, post_interp_date_exog=post_interp_date_exog)} + dso_df_interp_l = {group: cp_df_interp_l for group, cp_df_interp_l in add_all_lags(dso_df_interp, num_lags=num_lags, only_do_col=pred_col_init)} + dso_feat_cols = get_dso_feat_cols(pp,dso_df_interp_l, pred_col=pred_col) + + dso_df_interp_l_feat = {group: {"cp_g_df": cp_df_interp_l, + "feat_cols": dso_feat_cols[group]["feat_cols"], + "pred_col": dso_feat_cols[group]["pred_col"][0]} for group, cp_df_interp_l in dso_df_interp_l.items()} + return dso_df_interp_l_feat + + \ No newline at end of file diff --git a/ids/ids.conf b/ids/ids.conf new file mode 100644 index 0000000000000000000000000000000000000000..4879b01515fdec38e36dd800a958fb191e7e844e --- /dev/null +++ b/ids/ids.conf @@ -0,0 +1,9 @@ +{ + "configs": { + "latest": {"DIR": "../../wattson-artifacts/latest"}, + "test": {"DIR": "../../wattson-artifacts/elaadnl/test_powerowl_example", "TRAIN_START_DATE": "2023-11-01", "VALIDATION_START_DATE": "2023-12-01", "ATK_START_DATE": "2023-12-31"}, + "elaadnl": {"DIR": "../../wattson-artifacts/elaadnl/year_powerowl_example", "TRAIN_START_DATE": "2023-01-01", "VALIDATION_START_DATE": "2023-11-01", "ATK_START_DATE": "2023-12-01"}, + "elaadnl_atk": {"DIR": 
"../../wattson-artifacts/elaadnl/", "BASE": "elaadnl"} + }, + "NUM_THREADS": 8 +} diff --git a/ids/ids.py b/ids/ids.py new file mode 100644 index 0000000000000000000000000000000000000000..4017eaf24b6db3f09ee21729656004b0c7a26e72 --- /dev/null +++ b/ids/ids.py @@ -0,0 +1,3426 @@ + +from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor +from copy import deepcopy +from datetime import datetime +import json +import os +import re +from subprocess import PIPE, Popen +import time +from cycler import cycler +import matplotlib +from matplotlib.dates import DateFormatter +from numpy import mean +import numpy as np + +from sklearn.metrics import RocCurveDisplay, auc, confusion_matrix, roc_auc_score, roc_curve +import yaml + + +with open("ids.conf", 'r') as f: + conf = json.load(f) + configs = conf["configs"] + # print(conf) + # exit() + NUM_THREADS = conf["NUM_THREADS"] + if "affinity_mask" in conf: + affinity_mask = conf["affinity_mask"] + pid = os.getpid() + os.sched_setaffinity(pid, affinity_mask) + + +os.environ["OMP_NUM_THREADS"] = str(NUM_THREADS) +os.environ["MKL_NUM_THREADS"] = str(NUM_THREADS) +os.environ["OPENBLAS_NUM_THREADS"] = str(NUM_THREADS) +os.environ["BLIS_NUM_THREADS"] = str(NUM_THREADS) + +import argparse +import ast +from collections import defaultdict +import gzip +import pickle + + +from pathlib import Path +import pandas as pd +import matplotlib.pyplot as plt + +import logging + +from tqdm import tqdm +from load_data import clean_dataset_l, get_dataset_l, get_measurements_expo, print_data_df, get_cpo_ocpp_data, get_dso_oscp_data, get_file_dfs, get_measurements, get_estimations, plot_cpo_ocpp_data, plot_dso_oscp_data +from features_cpo import get_cpo_files, read_cpo_files, get_cpo_group_feat_dfs +from features_dso import get_dso_files, get_elems_attatched_to_bus, get_grid_pp, oscp_speed_workaround, read_dso_files, get_dso_group_feat_dfs, read_dso_measurements, read_dso_estimations +from features_aux import iter_feats, load_feats_CPg, load_feats_len, load_single_feat, prune_feats, save_feats, load_feats, get_grids_json, get_ts_sim_map, interpolate_sim_time, get_grid_measurements_from_export +from regression import eval_tuning, get_cp_group_eval_dicts, get_eval_dicts, get_prediction_dicts, get_regression_pred, get_regression_pred_conc, get_regression_pred_conc_kfold, optimize_regression +from features_clf import _interp, cross_val_clf, do_clf, do_get_clf_is_atk_conc, get_atks, get_clf_eval, get_clf_feat_dfs, get_clf_feat_file_dicts, get_clf_feat_for_conf, get_clf_is_atk, get_clf_is_atk_dfs, get_clf_result_output, get_clf_result_output_conc, get_eval_dicts_clf, get_param_grid_len, load_eval_dicts_clf, prune_clf_feats, save_clf_data, save_eval_dicts_clf, to_float_list + +logger = logging.getLogger("WATTSON_EV_IDS") + +# os.environ["QT_QPA_PLATFORM"] = "wayland" +#os.environ["QT_QPA_PLATFORM"] = "xcb" + +# py ids.py -c=load_data -v -d test +# py ids.py -c=get_features_dso -l 4 -v -d test +# py ids.py -c=get_features_cpo -l 4 -v -d test +# py ids.py -c=train_reg_dso -r test -v -d test +# py ids.py -c=train_reg_cpo -r test -f no_cps no_grid -v -d test +# py ids.py -c=eval_tuning -v -d elaadnl +# py ids.py -c=do_pred_dso -s 5 -v -d elaadnl +# py ids.py -c=do_pred_cpo -s 5 -v -d elaadnl + +# py ids.py -c=get_features_clf -v -d elaadnl + +# py ids.py -c=optimize_clf -C LocalOutlierFactor -F no_reg only_norm -v -d elaadnl +# py ids.py -c=optimize_clf -C LocalOutlierFactor -F no_reg no_norm -v -d elaadnl +# py ids.py -c=optimize_clf -C LocalOutlierFactor -F only_norm -v -d 
elaadnl +# py ids.py -c=optimize_clf -C LocalOutlierFactor -F no_norm -v -d elaadnl + + +# py ids.py -c=do_clf_dso -v -d elaadnl_atk + + +for k in configs.keys(): + configs[k]["DataPointMapsDIR"] = "../scenarios/powerowl_example/data-points" + configs[k]["OutDataDIR"] = "data/"+k + if "atk" in k: + sub_folders = [os.path.join(configs[k]["DIR"], name) for name in os.listdir(configs[k]["DIR"]) if os.path.isdir(os.path.join(configs[k]["DIR"], name)) if name.startswith("atk_")] + configs[k]["DIR"] = sub_folders + +for k in configs.keys(): + if "TRAIN_START_DATE" in configs[k]: + configs[k]["TRAIN_START_DATE"] += " 00:00:00.0" + configs[k]["VALIDATION_START_DATE"] += " 00:00:00.0" + configs[k]["ATK_START_DATE"] += " 00:00:00.0" + +dataset_choices = get_dataset_l(configs) + +#missing measurements in /home/dk/git/wattson-artifacts/elaadnl/atk_4_1.0_powerowl_example; eg for data points 818 ie bus.7 + +pd.reset_option('^display.', silent=True) +DO_OSCP_WORKAROUND=True +USE_TEX=True + +if __name__ == "__main__": + parser = argparse.ArgumentParser(prog='ids.py', usage='%(prog)s [options]') + parser.add_argument('-d', '--dataset', nargs='+', + choices=dataset_choices, + help='dataset selection') # + parser.add_argument('-c', '--case', + choices=["test", 'load_data', "get_features_dso", "get_features_cpo", + 'plot_atks', 'plot_atks2', 'get_load_yml', "plot_base", + "train_reg_dso",'train_reg_cpo', + "eval_tuning", "do_pred_dso", "do_pred_cpo", + "get_features_clf", "get_features_clf_cpo", "get_features_clf_dso", + "optimize_clf", "optimize_clf_cpo", "optimize_clf_dso", + "eval_tuning_clf", "print_eval_tuning_clf", + "get_is_atk_dfs", "get_is_atk_dfs_cpo", "get_is_atk_dfs_dso", + "do_clf", "do_clf_cpo", "do_clf_dso", + "eval_clf_results1", "eval_clf_results2", "eval_clf_results3", "eval_clf_results4"], + help='evaluation case selection') # + parser.add_argument('-f', '--features', + help="feature set selection", + default=["all"], nargs='+', + choices=["all", "no_cps", "no_cps_but_speed", + "no_grid", "no_grid_storage", "no_grid_line", "no_grid_sgen", "no_grid_bus_rel_to_pred", "add_bus_relations", ]+ + ["add_grid_load_expo_static_"+str(x) for x in range(10,110,10)]+ + ["add_grid_load_expo_rnd_"+str(x) for x in range(2,21,2)]+ + ["no_est", "no_meas", "only_pred_lag", "no_time_diff", + "only_norm", "only_norm_but_pred", "no_norm", "no_norm1", "no_norm2", "no_date1", "no_date2", + "no_hour_date", "no_hour_only_date", "no_hour_day_date", "no_day_of_week_date", "no_day_is_work_date", "no_hour_balancing_date"],) + parser.add_argument('-r', '--reg_type', + choices=['RandomForestRegressor', "DecisionTreeRegressor",'GradientBoostingRegressor', "HistGradientBoostingRegressor", "LinearSVR", "MLPRegressor", "test"], + help='regression algorithm selection (only for train_reg_dso/cpo)', default=None) # + parser.add_argument('-s', '--shifts', + help="amount of (shifted) preditctions to generate (>=1) (only for do_pred_dso/cpo)", + type=int, default=1) + parser.add_argument('-l', '--lags', + help="Amount of lags to add during feature extraction (only for get_features_dso/cpo)", + type=int, default=4) + parser.add_argument('-C', '--clf_type', + choices=['LocalOutlierFactor', "OneClassSVM", "EllipticEnvelope", "IsolationForest", "test"], + help='classification/novelty detection algorithm selection', default=None) # + parser.add_argument('-F', '--clf_features', + help="novelty detection feature set selection", + default=["all"], nargs='+', + choices=["all", "only_norm", "no_norm", "copy_reg", "no_reg", "reg_no_cps", 
"train_conta", "only_diff"]+ + ["set_1", "set_2", "set_3", "set_4", "set_5", "set_6", "set_7", "set_8", "set_9", "set_10", "set_11", "set_12", "set_13"]+ + ["set_31", "set_32", "set_33", "set_34", "set_35", "set_352"]+ + ["add_grid_load_expo_static_"+str(x) for x in range(10,110,10)]+ + ["add_grid_load_expo_rnd_"+str(x) for x in range(2,21,2)],) + + parser.add_argument('-A', '--atk_subset', + help="set considered attack subset", + choices=["mad", "fdi", "combo"],) + + parser.add_argument('-a', '--affinity_mask', + help="set CPU affinity_mask", + default=[], nargs='+',) + + parser.add_argument('-e', '--eval_fac', + help="Factor for evaluation attack size", + type=str) + + parser.add_argument('-n', '--noise', + help="Add noise to grid measurements for testing", + type=int) + + parser.add_argument( + '-U', '--use_cpo_pred', + help="Use CPO predictions at DSO.", + action="store_const", dest="use_cpo_pred", const=True, + default=False, + ) + parser.add_argument( + '-R', '--recursive_regression', + help="Use recursive_regression instead of training multiple regression models per prediction distance.", + action="store_const", dest="recursive_regression", const=True, + default=False, + ) + parser.add_argument( + '-O', '--overwrite', + help="Overwrite existing files.", + action="store_const", dest="overwrite", const=True, + default=False, + ) + parser.add_argument( + '-D', '--debug', + help="Print lots of debugging statements", + action="store_const", dest="loglevel", const=logging.DEBUG, + default=logging.WARNING, + ) + parser.add_argument( + '-v', '--verbose', + help="Be verbose", + action="store_const", dest="loglevel", const=logging.INFO, + ) + # parser.print_help() + + args = parser.parse_args() + NUM_LAGS=args.lags + logging.basicConfig(level=args.loglevel) + args.dataset = clean_dataset_l(args.dataset, configs) # + logger.info(f"{args.dataset=}, {args.case=}, {args.loglevel=}, {args.features=}, {args.reg_type=}, {args.shifts=}, {args.lags=}, {args.clf_type=}, {args.clf_features=}, {args.affinity_mask=}, {args.overwrite=}") + + + if args.affinity_mask: + pid = os.getpid() + os.sched_setaffinity(pid, [int(a) for a in args.affinity_mask]) + + file_dfs = get_file_dfs(args.dataset, configs) + + + if args.case == "load_data": + for dataset in args.dataset: + print_data_df(file_dfs[dataset]) + file_df = file_dfs[dataset] + DataPointMapsDIR = configs[dataset]["DataPointMapsDIR"] + OutDataDIR = configs[dataset]["OutDataDIR"] + + cpo_tx_df_cps = get_cpo_ocpp_data(file_df) + dso_tx_df_cp_dict = get_dso_oscp_data(file_df) + + #TODO: fix oscp data... 
+ # cp_dff = dso_tx_df_cp_dict["CP_11"] + # fig, ax = plt.subplots(figsize=(16,9)) + # ax.plot(cp_dff.index, cp_dff["charge_speed"], label="charge_speed_lag_0") + # ax.plot(cpp_dff.index, cpp_dff["charge_speed"], label="charge_speed_lag_0 cpo") + # plt.legend() + # plt.show() + # plt.close() + # exit() + + + if logger.isEnabledFor(logging.DEBUG): + for k,v in cpo_tx_df_cps.items(): + plot_cpo_ocpp_data(v) + plot_dso_oscp_data(dso_tx_df_cp_dict) + plt.show() + plt.close() + + clean_meas_df = get_measurements_expo(file_df, configs[dataset]["DIR"]) + # print(clean_meas_df) + # clean_meas_df = get_measurements(file_df, DataPointMapsDIR) + # print(clean_meas_df) + # exit() + est_df=None + if False: + est_df = get_estimations(file_df, drop_dups=False) + + Path(OutDataDIR).mkdir(parents=True, exist_ok=True) + logger.info(f"Writing results to {OutDataDIR=} ...") + for k,v in cpo_tx_df_cps.items(): + Path(OutDataDIR+"/"+k).mkdir(parents=False, exist_ok=True) + for k2,v2 in v.items(): + for cp_id, cp_df in v2: + cp_df.to_csv(OutDataDIR+"/"+k+"/cpo_ocpp_data_"+k2+"_"+cp_id+".csv.gz") + Path(OutDataDIR+"/DSO").mkdir(parents=False, exist_ok=True) + for k,v in dso_tx_df_cp_dict.items(): + v.to_csv(OutDataDIR+"/DSO/dso_oscp_data_"+k+".csv.gz") + clean_meas_df.to_csv(OutDataDIR+"/measurements.csv.gz") + if est_df is not None: + est_df.to_csv(OutDataDIR+"/estimations.csv.gz") + logger.info(f"Done. Results in {OutDataDIR=}") + + elif args.case == "get_load_yml": + for dataset in args.dataset: + DataPointMapsDIR = configs[dataset]["DataPointMapsDIR"] + regex_dp = re.compile(r"^(\d+)\-data\-points\.yml$") + # DataPointMapsDIR = configs[dataset]["DataPointMapsDIR"] + onlyfiles = [f for f in os.listdir(DataPointMapsDIR) if os.path.isfile(os.path.join(DataPointMapsDIR, f))] + logger.info(onlyfiles) + + dp_dict = defaultdict(lambda: []) + for f in tqdm(onlyfiles, desc="Loading Data Point Maps"): + result = regex_dp.search(f) + if result: + with open(os.path.join(DataPointMapsDIR, f), "rb") as f: + dp_data = yaml.load(f, Loader=yaml.FullLoader) + dp_dict[result.group(1)].append(dp_data) + else: + logger.error(f"unk file {f}") + + #[{'401': [{'identifier': '401.10010', 'protocol': '60870-5-104', 'protocol_data': {'coa': 401, 'cot': 1, 'direction': 'monitoring', 'ioa': 10010, 'type_id': 13}, 'providers': {'sources': [{'domain': 'source', 'provider_data': {'attribute': 'voltage', 'context': 'MEASUREMENT', 'grid_element': 'bus.0', 'type': 'float'}, 'provider_type': 'POWER_GRID'}]}, 'value': None}, {'identifier': '401.10020', 'protocol': '60870-5-104', 'protocol_data': {'coa': 401, 'cot': 1, .... 
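+ # Rough shape of the mappings built next (identifiers illustrative, following the
+ # example entry above):
+ #   bus_map  = {"bus.<n>": "<coa>", ...}   # grid bus -> COA of its data-point file
+ #   load_map = {"bus.<n>": pp["load"] rows attached to that bus, ...}
+ # For every load attached to a mapped bus, a new monitoring data point with
+ # attribute 'active_power' (type_id 13, next free IOA in steps of 10) is appended
+ # and the corresponding <coa>-data-points.yml file is rewritten.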
+ bus_map = dict() + for k,v in dp_dict.items(): + #print(k,v) + for x in v[0][k]: + if "providers" not in x: + continue + for ts, prov in x["providers"].items(): + ge = prov[0]["provider_data"]["grid_element"] + if "bus" in ge: + #print(k, ge) + bus_map[ge] = k + pp=get_grid_pp(configs[dataset]["DIR"]) + #print(pp["load"]) + load_map=dict() + for bus, dp in bus_map.items(): + bus_i = int(bus.split(".")[1]) + # if bus_i == 8: + # print(dp) #849 + # exit() + bus_l = pp["load"][pp["load"]["bus"] == bus_i] + if len(bus_l) > 0: + load_map[bus] = bus_l + #print(bus_l) + #exit() + + for bus, bus_l in tqdm(load_map.items()): + dp = bus_map[bus] + max_ioa=0 + existing=0 + out_dict = deepcopy(dp_dict[dp][0]) + out_dict[dp] = list() + for x in dp_dict[dp][0][dp]: + #print(x) + max_ioa=max(max_ioa,x["protocol_data"]["ioa"]) + for st, vs in x["providers"].items(): + for v in vs: + if v["provider_data"]["attribute"] == "active_power" or v["provider_data"]["attribute"] == "active_power_to": #"bus" in v["provider_data"]["grid_element"] and + out_dict[dp].append(x) + for idx,bus_load in bus_l.iterrows(): + if v["provider_data"]["grid_element"] == 'load.'+str(idx): + existing+=0 + logger.warning(f"load.{idx} already existing in {dp} {existing=}") + break + if existing > 0: + break + if existing > 0: + break + # if existing > 0: + # break + if existing > 0: + continue + for idx,bus_load in bus_l.iterrows(): + max_ioa=max_ioa+10 + new_val={'identifier': str(dp)+'.'+str(max_ioa), 'protocol': '60870-5-104', + 'protocol_data': {'coa': int(dp), 'cot': 1, 'direction': 'monitoring', 'ioa': int(max_ioa), 'type_id': 13}, + 'providers': {'sources': [{'domain': 'source', 'provider_data': {'attribute': 'active_power', 'context': 'MEASUREMENT', 'grid_element': 'load.'+str(idx), 'type': 'float'}, 'provider_type': 'POWER_GRID'}]}, + 'value': None} + dp_dict[dp][0][dp].append(new_val) + out_dict[dp].append(new_val) + if False: + max_ioa=max_ioa+10 + new_val={'identifier': str(dp)+'.'+str(max_ioa), 'protocol': '60870-5-104', + 'protocol_data': {'coa': int(dp), 'cot': 1, 'direction': 'monitoring', 'ioa': int(max_ioa), 'type_id': 13}, + 'providers': {'sources': [{'domain': 'source', 'provider_data': {'attribute': 'reactive_power', 'context': 'MEASUREMENT', 'grid_element': 'load.'+str(idx), 'type': 'float'}, 'provider_type': 'POWER_GRID'}]}, + 'value': None} + #print() + #print(new_val) + dp_dict[dp][0][dp].append(new_val) + out_dict[dp].append(new_val) + f_name=f"{dp}-data-points.yml" + out_f = os.path.join(DataPointMapsDIR,f_name) + #print(out_f) + if False: + with open(out_f, "w") as f: + yaml.dump(dp_dict[dp][0], f) + with open(out_f, "w") as f: + yaml.dump(out_dict, f) + #exit() + + + elif args.case == "get_features_dso": + reg_type=args.reg_type + atk_subset_out_name=None + + for dataset in args.dataset: + logger.info(f"{args.case} {dataset=}") + OutDataDIR = configs[dataset]["OutDataDIR"] + OutDataDIR_DSO = os.path.join(OutDataDIR, "DSO") + + + dso_files_l = get_dso_files(OutDataDIR_DSO) + dso_files = read_dso_files(OutDataDIR_DSO, dso_files_l) + + if DO_OSCP_WORKAROUND: + logger.warning("using oscp workaround. TODO: fix oscp calc by CPO in simulation...") + cpos = [f for f in os.listdir(OutDataDIR+"/feats/") if os.path.isdir(os.path.join(OutDataDIR+"/feats/", f)) and f.startswith("CPO_")] + cpo = cpos[0] #TODO: iterate cpos + cpo_feat = load_feats(OutDataDIR, cpo) #TODO: tqdm... 
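+ # Summary of the workaround applied below (see oscp_speed_workaround in
+ # features_dso): the CPO-side charge_speed_lag_0 is reindexed onto the union of
+ # the CPO and DSO timestamps, time-interpolated in both directions, and restricted
+ # to the DSO (OSCP) index, replacing the charge_speed originally reported via OSCP.
+ # With --atk_subset, the same interpolation is applied to custom_data_charge_speed
+ # and is_attack so that only the selected attack type (mad/fdi/combo) remains as
+ # manipulated data.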
+ for group, df_dict in cpo_feat: + new_speed = oscp_speed_workaround(df_dict["cp_g_df"], dso_files[group]) + dso_files[group]["charge_speed"] = new_speed + + if args.atk_subset: + new_speed_should = oscp_speed_workaround(df_dict["cp_g_df"], dso_files[group], pred_col="custom_data_charge_speed_lag_0") + new_is_atk = oscp_speed_workaround(df_dict["cp_g_df"], dso_files[group], pred_col="is_attack_lag_0") + + dso_files[group]["custom_data_charge_speed"] = new_speed_should + dso_files[group]["is_attack"] = new_is_atk + + dso_files[group]["custom_data_considered_atk"] = 0 + atk_subset_out_name=args.atk_subset + atk_l = get_atks(l_buffer=6, start_sim_offset_h=-1) + #print(atk_l) + for atk in atk_l: + if atk["type"] == None: + pass + elif args.atk_subset=="combo" and "FDI" in atk["type"] and "MAD" in atk["type"]: + dso_files[group].loc[atk["start"]:atk["end"], "custom_data_considered_atk"] = 1 + elif args.atk_subset=="fdi" and "FDI" in atk["type"]: + dso_files[group].loc[atk["start"]:atk["end"], "custom_data_considered_atk"] = 1 + elif args.atk_subset=="mad" and "MAD" in atk["type"]: + dso_files[group].loc[atk["start"]:atk["end"], "custom_data_considered_atk"] = 1 + #print(dso_files[group][dso_files[group]["is_attack"] != 0]) + dso_files[group].loc[(dso_files[group]["custom_data_considered_atk"] == 0), "charge_speed"] = dso_files[group].loc[(dso_files[group]["custom_data_considered_atk"] == 0), "custom_data_charge_speed"] + dso_files[group].loc[(dso_files[group]["custom_data_considered_atk"] == 0), "is_attack"] = 0 + #print(dso_files[group][dso_files[group]["is_attack"] != 0]) + dso_files[group] = dso_files[group].drop(columns=['custom_data_considered_atk']) + dso_files[group] = dso_files[group].drop(columns=['is_attack']) + dso_files[group] = dso_files[group].drop(columns=['custom_data_charge_speed']) + #print(dso_files[group][dso_files[group]["is_attack"] != 0]) + #exit() + + dso_est_df=None + if False: + dso_est_df = read_dso_estimations(OutDataDIR+"/estimations.csv.gz") + dso_meas_df = read_dso_measurements(OutDataDIR+"/measurements.csv.gz") + + if False: + print([c for c in dso_meas_df.columns if ".active_power" in c and "bus" in c]) + print([c for c in dso_meas_df.columns if ".active_power" in c and "sgen" in c]) + bus_ns = sorted(set([int(c.split(".")[3]) for c in dso_meas_df.columns if ".active_power" in c and "bus" in c])) + pp=get_grid_pp(configs[dataset]["DIR"]) + + _plot_l_s = get_grid_measurements_from_export(configs[dataset]["DIR"], pp, bus_ns=bus_ns, target_elem="bus", target_var=".MEASUREMENT.active_power" ) + _plot_l_s2 = get_grid_measurements_from_export(configs[dataset]["DIR"], pp, bus_ns=[], target_elem="sgen", target_var=".MEASUREMENT.active_power" ) + + def get_intert_expo(bus_meas,dso_meas_df): + interp_map=[] + bus_i = int(bus_meas.split(".")[3]) + bus_expo = _plot_l_s[bus_i][0] + bus_expo_interp = _interp(bus_expo.to_frame(), dso_meas_df.index, col=None) + #dso_meas_df[bus_meas] = bus_expo_interp + interp_map.append((bus_meas,bus_expo_interp)) + + for meas in [c for c in dso_meas_df.columns if ".active_power" in c and "sgen" in c]: + meas_i = int(meas.split(".")[3]) + for meas_expo in _plot_l_s2[bus_i]: + if "sgen."+str(meas_i)+"." 
in meas_expo.name: + meas_expo_interp = _interp(meas_expo.to_frame(), dso_meas_df.index, col=None) + #dso_meas_df[meas] = meas_expo_interp + interp_map.append((meas,meas_expo_interp)) + return interp_map + with ProcessPoolExecutor(NUM_THREADS) as pool: + results=[] + for bus_meas in tqdm([c for c in dso_meas_df.columns if ".active_power" in c and "bus" in c]): + results.append(pool.submit(get_intert_expo, bus_meas,dso_meas_df)) + for res in tqdm(results): + interp_map = res.result() + for meas,meas_expo_interp in interp_map: + dso_meas_df[meas] = meas_expo_interp + #exit() + #print([c for c in dso_meas_df if "bus." in c]) + DIR = configs[dataset]["DIR"] + dso_feat = get_dso_group_feat_dfs(DIR, dso_files, dso_meas_df, dso_est_df, num_lags=NUM_LAGS) + + # print(dso_feat["CP_11"]["cp_g_df"].columns) + # print(dso_feat["CP_11"]["cp_g_df"]) + # cp_g_df= dso_feat["CP_11"]["cp_g_df"] + # #exit() + # fig, ax = plt.subplots(figsize=(16,9)) + # ax.plot(cp_g_df.index, cp_g_df["charge_speed_lag_0"], label="charge_speed_lag_0") + # plt.legend() + # plt.show() + # plt.close() + # # print(row) + # exit() + + # logger.info(f'Writing Results to {OutDataDIR+"/feats/dso.gz"} ...') + if atk_subset_out_name is None: + save_feats(dso_feat, OutDataDIR, "DSO") + else: + save_feats(dso_feat, OutDataDIR, "DSO_"+atk_subset_out_name) + + elif args.case == "plot_base": + for dataset in args.dataset: + logger.info(dataset) + OutDataDIR = configs[dataset]["OutDataDIR"] + #TRAIN_START_DATE=configs[dataset]["TRAIN_START_DATE"] + #VALIDATION_START_DATE=configs[dataset]["VALIDATION_START_DATE"] + #ATK_START_DATE=configs[dataset]["ATK_START_DATE"] + + cpo_feat = load_feats(OutDataDIR, "CPO_0") + cpo_files=dict() + for group, df_dict in tqdm(cpo_feat, total=load_feats_len(OutDataDIR, "CPO_0"), desc="eval cpo regression "): + prune_feats(df_dict, args.features) + #print(df_dict["cp_g_df"]) + cpo_files[group] = df_dict["cp_g_df"] + #break + + print(OutDataDIR) + OutDataDIR_DSO = os.path.join(OutDataDIR, "DSO") + dso_files_l = get_dso_files(OutDataDIR_DSO) + print(dso_files_l) + dso_files = read_dso_files(OutDataDIR_DSO, dso_files_l) + for cp_g,df in dso_files.items(): + print(cp_g,df) + break + + + for cp_g in dso_files.keys(): + fig, ax = plt.subplots() + dso_df = dso_files[cp_g] + cpo_df = cpo_files[cp_g] + ax.plot(dso_df["measurements_measure_time"], dso_df["charge_speed"], label="dso "+cp_g) + #ax.plot(dso_df["measurements_measure_time"], dso_df["measurements_value"], label="dso "+cp_g) + #ax.plot(dso_df.index, dso_df["measurements_value"], label="dso "+cp_g) + ax.plot(cpo_df.index, cpo_df["charge_speed_lag_0"], label="cpo "+cp_g) + + plt.legend(loc="upper left") + plt.show() + plt.close() + exit() + + #datasetb = configs[dataset]["BASE"] + #base_dir = configs[datasetb]["OutDataDIR"] + + # base_dfs=dict() + # base_feat = load_feats(base_dir,"DSO") + # for group, df_dict in tqdm(base_feat, total=load_feats_len(base_dir, "DSO"), desc="base_dir "): + # base_dfs[group] = df_dict + + do_bus=8 + dso_feat = load_feats(OutDataDIR, ("DSO_"+args.atk_subset) if args.atk_subset else "DSO") + for group, df_dict in tqdm(dso_feat, total=load_feats_len(OutDataDIR, "DSO"), desc="eval dso regression "): + if "_"+str(do_bus) not in group: + continue + prune_feats(df_dict, args.features) + print(df_dict["cp_g_df"].loc["2023-12-01 14:30:10":]) + break + + + + elif args.case == "plot_atks": + for dataset in args.dataset: + logger.info(dataset) + OutDataDIR = configs[dataset]["OutDataDIR"] + #TRAIN_START_DATE=configs[dataset]["TRAIN_START_DATE"] 
+ #VALIDATION_START_DATE=configs[dataset]["VALIDATION_START_DATE"] + #ATK_START_DATE=configs[dataset]["ATK_START_DATE"] + + cp_g_atk_types = defaultdict(lambda: list()) + cpo_files_d = get_cpo_files(OutDataDIR) + for cpo, cps in cpo_files_d.items(): + cpo_files = read_cpo_files(os.path.join(OutDataDIR, cpo), cps) + for cp_g,v in cpo_files.items(): + for cp,df in v.items(): + # print([c for c in df.columns]) + # print(df["custom_data_atk_type"]) + df1 = df["custom_data_atk_type"].loc[df["custom_data_atk_type"].shift(-1) != df["custom_data_atk_type"]] + df2 = df["custom_data_atk_type"].loc[df["custom_data_atk_type"].shift(1) != df["custom_data_atk_type"]] + df3 = pd.concat([df1,df2]).sort_index() + df3 = df3[~df3.index.duplicated(keep='first')] + cp_g_atk_types[cp_g].append(df3) #cahnges in atk type (start and end) + # print(df3) + # exit() + + datasetb = configs[dataset]["BASE"] + base_dir = configs[datasetb]["OutDataDIR"] + + # base_dfs=dict() + # base_feat = load_feats(base_dir,"DSO") + # for group, df_dict in tqdm(base_feat, total=load_feats_len(base_dir, "DSO"), desc="base_dir "): + # base_dfs[group] = df_dict + + do_bus=8 + dso_feat = load_feats(OutDataDIR, ("DSO_"+args.atk_subset) if args.atk_subset else "DSO") + for group, df_dict in tqdm(dso_feat, total=load_feats_len(OutDataDIR, "DSO"), desc="eval dso regression "): + if "_"+str(do_bus) not in group: + continue + prune_feats(df_dict, args.features) + print(df_dict["cp_g_df"].loc["2023-12-01 14:30:10":]) + # print([c for c in df_dict["cp_g_df"].columns if "grid_expo" in c]) #grid_expo_load.7.MEASUREMENT.active_power_lag_0 + # print([c for c in df_dict["feat_cols"] if "grid_expo" in c]) + # print([c for c in df_dict["cp_g_df"].columns if "grid_meas" in c]) + # print([c for c in df_dict["feat_cols"] if "grid_meas" in c]) + #exit() + print(group) + # print(base_dfs[group]["cp_g_df"]["charge_speed_lag_0"]) + # min_speed = base_dfs[group]["cp_g_df"]["charge_speed_lag_0"].min() + # max_speed = base_dfs[group]["cp_g_df"]["charge_speed_lag_0"].max() + #exit() + # print([c for c in df_dict["cp_g_df"].columns if "grid_expo_" in c and ".active_power_" in c and "_relation_to_" not in c]) + # exit() + if False: + fig, ax = plt.subplots() + print([c for c in df_dict["cp_g_df"].columns if "grid_meas" in c and "_lag_0" in c and "bus." in c and "_relation_to_" not in c and "norm_" not in c]) + bus_vals=[c for c in df_dict["cp_g_df"].columns if "grid_meas" in c and "_lag_0" in c and "bus." in c and "_relation_to_" not in c and "norm_" not in c] + load_vals=[c for c in df_dict["cp_g_df"].columns if "grid_expo_" in c and ".active_power_static_fac.100_lag_0" in c and "load." in c and "_relation_to_" not in c and "norm_" not in c] + sgen_vals=[c for c in df_dict["cp_g_df"].columns if "grid_meas" in c and "_lag_0" in c and "sgen." in c and "_relation_to_" not in c and "norm_" not in c] + for b in bus_vals: + ax.plot(df_dict["cp_g_df"].index, df_dict["cp_g_df"][b], label=b ) + for b in load_vals: + ax.plot(df_dict["cp_g_df"].index, df_dict["cp_g_df"][b], label=b ) + for b in sgen_vals: + ax.plot(df_dict["cp_g_df"].index, df_dict["cp_g_df"][b], label=b ) + plt.legend(loc="upper left") + plt.show() + exit() + grid_meas = [c for c in df_dict["cp_g_df"].columns if "grid_meas" in c and ".active_power_lag_0" in c and ".bus." 
in c and "_relation_to_" not in c][0] + print(df_dict["cp_g_df"][grid_meas]) + # print([f for f in df_dict["cp_g_df"].columns if "lag_0" in f and "norm" not in f and "date" not in f]) + # exit() + print(df_dict["cp_g_df"]["charge_speed_lag_0"]) + print() + print([f for f in df_dict["cp_g_df"].columns if "grid_est" in f and "lag_0" in f and "norm" not in f]) + fig, ax = plt.subplots() + print([f for f in df_dict["cp_g_df"].columns if "grid_expo_" in f and "lag_0" in f and "norm" not in f]) + #exit() + + # print([c for c in df_dict["cp_g_df"].columns if "grid_meas" in c and ".active_power" in c and "load." in c and "grid_expo_" not in c]) + # exit() + sgen_meas = [c for c in df_dict["cp_g_df"].columns if "grid_meas" in c and ".active_power_lag_0" in c and ".sgen." in c][0] + df_dict["cp_g_df"]["grid_bus."+str(do_bus)+"_load_total"]=0 + df_dict["cp_g_df"]["grid_bus."+str(do_bus)+"_load_total"]+=df_dict["cp_g_df"][grid_meas] + for sgen_meas in[c for c in df_dict["cp_g_df"].columns if "grid_meas" in c and ".active_power_lag_0" in c and ".sgen." in c]: + df_dict["cp_g_df"]["grid_bus."+str(do_bus)+"_load_total"]+=df_dict["cp_g_df"][sgen_meas] + ax.plot(df_dict["cp_g_df"].index, df_dict["cp_g_df"]["grid_bus."+str(do_bus)+"_load_total"], label="grid_bus."+str(do_bus)+"_load_and_sgen_total" ) + + if False: + shift_bus=df_dict["cp_g_df"]["grid_bus."+str(do_bus)+"_load_total"].to_frame() + for grid_expo_load in [c for c in df_dict["cp_g_df"].columns if "grid_expo_" in c and ".load." in c and ".active_power_static_fac.100_lag_0" in c]: #grid_meas_668.10320.load.1.active_power_static_fac.10_lag_0 + shift_bus[grid_expo_load] =df_dict["cp_g_df"][grid_expo_load] + shift_bus["idx"]=shift_bus.index + shift_bus["idx_diff"] = shift_bus["idx"] - shift_bus["idx"].iloc[0] + shift_bus["idx_diff_fix"] = shift_bus["idx_diff"] * 0.96 + shift_bus["idx_fix"] = shift_bus["idx"].iloc[0] + shift_bus["idx_diff_fix"] + shift_bus = shift_bus.set_index(shift_bus["idx_fix"]) + shift_bus2=_interp(shift_bus["grid_bus."+str(do_bus)+"_load_total"], df_dict["cp_g_df"].index, col=None) + print(shift_bus2) + df_dict["cp_g_df"]["grid_bus."+str(do_bus)+"_load_total"] = shift_bus2 + for grid_expo_load in [c for c in df_dict["cp_g_df"].columns if "grid_expo_" in c and ".load." 
in c and ".active_power_static_fac.100_lag_0" in c]: + shift_bus2=_interp(shift_bus[grid_expo_load], df_dict["cp_g_df"].index, col=None) + print(shift_bus2) + df_dict["cp_g_df"][grid_expo_load] = shift_bus2 + ax.plot(df_dict["cp_g_df"].index, df_dict["cp_g_df"]["grid_bus."+str(do_bus)+"_load_total"], label="grid_bus."+str(do_bus)+"_load_and_sgen_total" ) + #exit() + + #ax.plot(df_dict["cp_g_df"].index, df_dict["cp_g_df"][grid_meas], label=grid_meas ) + #ax.plot(df_dict["cp_g_df"].index, df_dict["cp_g_df"]["grid_expo_load.7.MEASUREMENT.active_power_lag_0"], label="grid_expo_load.7.MEASUREMENT.active_power_lag_0" ) + + + df_dict["cp_g_df"]["grid_expo_load_total"]=0 + for grid_expo_load in [c for c in df_dict["cp_g_df"].columns if "grid_expo_load" in c and ".MEASUREMENT.active_power_static_fac.100_lag_0" in c]: + df_dict["cp_g_df"]["grid_expo_load_total"] +=df_dict["cp_g_df"][grid_expo_load] + #ax.plot(df_dict["cp_g_df"].index, df_dict["cp_g_df"]["grid_expo_load_total"], label="grid_expo_load_total" ) + + df_dict["cp_g_df"]["grid_vs_expo_remaining_load_total"]= df_dict["cp_g_df"]["grid_bus."+str(do_bus)+"_load_total"]-df_dict["cp_g_df"]["grid_expo_load_total"] + # ax.plot(df_dict["cp_g_df"].index, df_dict["cp_g_df"]["grid_vs_expo_remaining_load_total"], label="grid_vs_expo_remaining_load_total" ) + + + # df_dict["cp_g_df"]["grid_meas_load_total80"]=0 + # for grid_expo_load in [c for c in df_dict["cp_g_df"].columns if "grid_meas_" in c and ".load." in c and ".active_power_static_fac.80_lag_0" in c]: #grid_meas_668.10320.load.1.active_power_static_fac.10_lag_0 + # df_dict["cp_g_df"]["grid_meas_load_total80"] +=df_dict["cp_g_df"][grid_expo_load] + # ax.plot(df_dict["cp_g_df"].index, df_dict["cp_g_df"]["grid_meas_load_total80"], label="grid_meas_load_total80" ) + + df_dict["cp_g_df"]["grid_meas_load_total"]=0 + for grid_expo_load in [c for c in df_dict["cp_g_df"].columns if "grid_expo_" in c and ".load." 
in c and ".active_power_static_fac.100_lag_0" in c]: #grid_meas_668.10320.load.1.active_power_static_fac.10_lag_0 + df_dict["cp_g_df"]["grid_meas_load_total"] +=df_dict["cp_g_df"][grid_expo_load] + ax.plot(df_dict["cp_g_df"].index, df_dict["cp_g_df"]["grid_meas_load_total"], label="grid_meas_load_total" ) + + + # pp=get_grid_pp(configs[dataset]["DIR"]) + # _plot_l_s = get_grid_measurements_from_export(configs[dataset]["DIR"], pp, bus_ns=[int(group.split("_")[1])], target_elem="storage", target_var=".MEASUREMENT.active_power" ) + # for cp_group_grid_power in _plot_l_s[int(group.split("_")[1])]: + # print(f"{cp_group_grid_power=}") + # cp_group_grid_power_interp=_interp(cp_group_grid_power, df_dict["cp_g_df"].index, col=None) + # df_dict["cp_g_df"]["grid_meas_load_total"] += cp_group_grid_power_interp + # ax.plot(df_dict["cp_g_df"].index, df_dict["cp_g_df"]["grid_meas_load_total"], label="grid_meas_load_total+storage" ) + + + df_dict["cp_g_df"]["grid_vs_meas_remaining_load_total"]= df_dict["cp_g_df"]["grid_bus."+str(do_bus)+"_load_total"]-df_dict["cp_g_df"]["grid_meas_load_total"] + # out_vals= df_dict["cp_g_df"][(df_dict["cp_g_df"]["grid_vs_meas_remaining_load_total"] > max_speed) | (df_dict["cp_g_df"]["grid_vs_meas_remaining_load_total"] < min_speed)] + # df_dict["cp_g_df"].loc[out_vals.index, "grid_vs_meas_remaining_load_total"] = df_dict["cp_g_df"].loc[out_vals.index]["charge_speed_lag_0"] + ax.plot(df_dict["cp_g_df"].index, df_dict["cp_g_df"]["grid_vs_meas_remaining_load_total"], label="grid_vs_meas_remaining_load_total" ) + + + + ax.plot(df_dict["cp_g_df"].index, df_dict["cp_g_df"]["charge_speed_lag_0"], label="charge_speed_lag_0") + #ax.plot(df_dict["cp_g_df"].index, df_dict["cp_g_df"]["grid_est_bus.11.p_mw_lag_0"]*1000*1000*1000, label="grid_est_bus.11.p_mw_lag_0") + ax.plot(df_dict["cp_g_df"].index, [160000]*len(df_dict["cp_g_df"].index), label="base") + + file_df = file_dfs[dataset] + DataPointMapsDIR = configs[dataset]["DataPointMapsDIR"] + # clean_meas_df = get_measurements(file_df, DataPointMapsDIR) + # print(clean_meas_df) + # print(clean_meas_df.columns) + # ax.plot(clean_meas_df.index, clean_meas_df["585.10030.bus.11.active_power"] ) + #clean_meas_df.to_csv("test.csv.gz") + + if False: + pp=get_grid_pp(configs[dataset]["DIR"]) + print(pp) + print(pp["bus"]) + print(pp["load"]) + print(pp["sgen"]) + print(pp["storage"]) + elems = get_elems_attatched_to_bus(pp, do_bus) + print(elems) + + + _plot_l_s = get_grid_measurements_from_export(configs[dataset]["DIR"], pp, bus_ns=[int(group.split("_")[1])], target_elem="storage", target_var=".MEASUREMENT.active_power" ) + cp_group_grid_power = deepcopy(_plot_l_s[int(group.split("_")[1])])[0] + ax.plot(cp_group_grid_power.index, cp_group_grid_power.values, label=cp_group_grid_power.name) #grid values (w/mad attacks) + + _plot_l_s = get_grid_measurements_from_export(configs[dataset]["DIR"], pp, bus_ns=[int(group.split("_")[1])], target_elem="bus", target_var=".MEASUREMENT.active_power" ) + bus_group_grid_power = deepcopy(_plot_l_s[int(group.split("_")[1])])[0] + #ax.plot(bus_group_grid_power.index, bus_group_grid_power.values, label=bus_group_grid_power.name) #grid values (w/mad attacks) + + _plot_l_s2 = get_grid_measurements_from_export(configs[dataset]["DIR"], pp, bus_ns=[int(group.split("_")[1])], target_elem="load", target_var=".MEASUREMENT.active_power" ) + bus_group_grid_power = bus_group_grid_power + df_dict["cp_g_df"][sgen_meas].mean() + for p_lad in _plot_l_s2[int(group.split("_")[1])]: + bus_group_grid_power = 
bus_group_grid_power - p_lad + ax.plot(bus_group_grid_power.index, bus_group_grid_power.values, label="expo_bus_rem") #grid values (w/mad attacks) + + + plt.legend(loc="upper left") + plt.show() + exit() + break + + pp=get_grid_pp(configs[dataset]["DIR"]) + print(pp) + lines=pp["line"] + lines=lines[lines["from_bus"] < lines["to_bus"]] + print(lines[lines["to_bus"] == do_bus]) + print("line."+str(lines[lines["to_bus"] == do_bus].index[0])) + + ret = [] + elems = ["load", "sgen", "storage"] #, "storage"? + for e in elems: + el=pp[e] + print(el) + el = e+"."+str(el[el["bus"] == do_bus].index[0]) + ret.append(el) + print(ret) + print() + + wall_sim_map=get_ts_sim_map(configs[dataset]["DIR"]).sort_index() + print(wall_sim_map) + print(interpolate_sim_time(wall_sim_map, pd.to_datetime('2024-01-05 16:30:02.00+00:00'))) + x = pd.Series([datetime.utcfromtimestamp(sim) for sim in wall_sim_map.values], index = [datetime.utcfromtimestamp(wall) for wall in wall_sim_map.index]) + print(x.sort_index()) + #print(datetime.utcfromtimestamp(interpolate_sim_time(wall_sim_map, pd.to_datetime('2024-01-05 16:30:02.00+00:00')))) + #exit() + + elems = get_elems_attatched_to_bus(pp, do_bus) + print(elems) + elem = [e for e in elems if "load" in e][0] + + plot_t=[] + plot_b=[] + plot_l=[] + pps=get_grids_json(configs[dataset]["DIR"], wall_sim_map) + for grid in pps: + #print(grid) + if not np.isnan(grid["simtime"]): + plot_t.append(datetime.utcfromtimestamp(grid["simtime"])) + plot_b.append(grid["values"]["bus."+str(do_bus)+".MEASUREMENT.active_power"]) + plot_l.append(grid["values"][elem+".MEASUREMENT.active_power"]) + else: + logger.error(f"nan simtime for {grid['timestamp']}") + # print(datetime.utcfromtimestamp(grid["timestamp"]), datetime.utcfromtimestamp(grid["simtime"]), grid["values"]["bus.11.MEASUREMENT.active_power"]) + # print(elem, grid["values"][elem+".MEASUREMENT.active_power"]) + #exit() + plot_b_s=pd.Series(plot_b, index = plot_t).sort_index() + plot_l_s=pd.Series(plot_l, index = plot_t).sort_index() + print(plot_b_s) + print(plot_l_s) + _plot_l_s = get_grid_measurements_from_export(configs[dataset]["DIR"], pp, bus_ns=[], target_elem="load", target_var=".MEASUREMENT.active_power" ) + print(_plot_l_s[do_bus]) + print(_plot_l_s[3]) + ax.plot(plot_b_s.index, plot_b_s.values, label="bus."+str(do_bus)+".MEASUREMENT.active_power2") #bus power + ax.plot(plot_l_s.index, plot_l_s.values, label=elem+".MEASUREMENT.active_power2") #load power + + + _plot_l_s = get_grid_measurements_from_export(configs[dataset]["DIR"], pp, bus_ns=[int(group.split("_")[1])], target_elem="storage", target_var=".MEASUREMENT.active_power" ) + cp_group_grid_power = deepcopy(_plot_l_s[int(group.split("_")[1])])[0] + is_atk_dfs = get_clf_is_atk_dfs(OutDataDIR, group) + _is_atk_dfs = is_atk_dfs["CPO_0"] + fig, ax = plt.subplots(figsize=(16,9)) + ax.plot(cp_group_grid_power.index, cp_group_grid_power.values, label=cp_group_grid_power.name) #grid values (w/mad attacks) + ax.plot(_is_atk_dfs.index, _is_atk_dfs["charge_speed_lag_0"], label="charge_speed_lag_0") #based on EnergyInterval (w/ mad and fdi atks) + ax.plot(_is_atk_dfs.index, _is_atk_dfs["custom_data_charge_speed_lag_0"], label="custom_data_charge_speed_lag_0") #based on OriginalEnergyInterval (w/o attacks) + #ax.plot(atk_df.index, atk_df["charge_speed_lag_0"], label="charge_speed_lag_0 from "+actor_prefix) # + ax.plot(_is_atk_dfs.index, _is_atk_dfs["is_attack_lag_0"]*500, label="is_attack_lag_0") #based on OriginalEnergyInterval (w/o attacks) + # print(_is_atk_dfs) # + # exit() + 
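+ # cp_g_atk_types[group] (built from the CPO files above) holds, per charge point,
+ # the rows where custom_data_atk_type changes, i.e. the start and end of each
+ # attack window. Below these windows are painted onto _is_atk_dfs as per-type
+ # indicator columns (cp_atk_dfs_MAD / _FDI / _FDI_post_MAD) so the attack
+ # intervals can be marked in the plot.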
+ cp_atk_dfs = cp_g_atk_types[group] + _is_atk_dfs["cp_atk_dfs"+"_None"] = -1 + _is_atk_dfs["cp_atk_dfs"+"_Unk"] = -1 + _is_atk_dfs["cp_atk_dfs"+"_MAD"] = -1 + _is_atk_dfs["cp_atk_dfs"+"_FDI"] = -1 + _is_atk_dfs["cp_atk_dfs"+"_FDI_post_MAD"] = -1 + for cp_atk_df in cp_atk_dfs: + print(cp_atk_df.drop_duplicates()) + for i in range(len(cp_atk_df)-1): + s=cp_atk_df.index[i] + e=cp_atk_df.index[i+1] + atk_val = "Unk" + if cp_atk_df.loc[s] == "FDI" and cp_atk_df.loc[e] == "FDI": + atk_val = "FDI" + elif cp_atk_df.loc[s] == "MAD" and cp_atk_df.loc[e] == "MAD": + atk_val = "MAD" + elif cp_atk_df.loc[s] == "FDI_post_MAD" and cp_atk_df.loc[e] == "FDI_post_MAD": + atk_val = "FDI_post_MAD" + elif cp_atk_df.loc[s] == "None" and cp_atk_df.loc[e] == "None": + atk_val = "None" + __is_atk_dfs = _is_atk_dfs.loc[s:e] + #__is_atk_dfs = __is_atk_dfs[__is_atk_dfs["cp_atk_dfs"].isna()] + _is_atk_dfs.loc[__is_atk_dfs.index, "cp_atk_dfs_"+atk_val] += 1000 + # print(_is_atk_dfs) + # exit() + + # ax.plot(_is_atk_dfs.index, _is_atk_dfs["cp_atk_dfs"+"_None"], label="cp_atk_dfs"+"_None") # + # ax.plot(_is_atk_dfs.index, _is_atk_dfs["cp_atk_dfs"+"_Unk"], label="cp_atk_dfs"+"_Unk") # + ax.plot(_is_atk_dfs.index, _is_atk_dfs["cp_atk_dfs"+"_MAD"], label="cp_atk_dfs"+"_MAD") # + ax.plot(_is_atk_dfs.index, _is_atk_dfs["cp_atk_dfs"+"_FDI"], label="cp_atk_dfs"+"_FDI") # + ax.plot(_is_atk_dfs.index, _is_atk_dfs["cp_atk_dfs"+"_FDI_post_MAD"], label="cp_atk_dfs"+"_FDI_post_MAD") # + + atk_l = get_atks(l_buffer=4, start_sim_offset_h=-2) + #print(atk_l) + _is_atk_dfs["atkt"] = None + for atk in atk_l: + __is_atk_dfs = _is_atk_dfs.loc[atk["start"]:atk["end"]] + __is_atk_dfs = __is_atk_dfs[__is_atk_dfs["atkt"].isna()] + if False: + if atk["type"] == None: + pass + elif "FDI" in atk["type"] and "MAD" in atk["type"]: + #__is_atk_dfs = __is_atk_dfs[(__is_atk_dfs["cp_atk_dfs"+"_MAD"] > 0) | (__is_atk_dfs["cp_atk_dfs"+"_FDI"] > 0) | (__is_atk_dfs["cp_atk_dfs"+"_FDI_post_MAD"] > 0)] + __is_atk_dfs = __is_atk_dfs[(__is_atk_dfs["cp_atk_dfs"+"_FDI_post_MAD"] > 0)] + elif "MAD" in atk["type"]: + __is_atk_dfs = __is_atk_dfs[__is_atk_dfs["cp_atk_dfs"+"_MAD"] > 0] + elif "FDI" in atk["type"]: + __is_atk_dfs = __is_atk_dfs[__is_atk_dfs["cp_atk_dfs"+"_FDI"] > 0] + #__is_atk_dfs = __is_atk_dfs[__is_atk_dfs["atkt"].isna()] + _is_atk_dfs.loc[__is_atk_dfs.index, "atkt"] = atk["type"] + + for atkt in _is_atk_dfs["atkt"].drop_duplicates(): + _is_atkt_dfs = _is_atk_dfs[_is_atk_dfs["atkt"] == atkt] + ax.plot(_is_atkt_dfs.index, _is_atkt_dfs["charge_speed_lag_0"], 'o', label="charge_speed_lag_0 "+str(atkt)) #based on EnergyInterval (w/ mad and fdi atks) + + #ax.plot(df_dict["cp_g_df"]["charge_speed_lag_0"].index, df_dict["cp_g_df"]["charge_speed_lag_0"], label="charge_speed_lag_0 dso") # + + + # grid_df = pd.DataFrame(grid_l) + # grid_df = grid_df.set_index("time").sort_index() + # print(grid_df) + # #exit() + # ax.plot(grid_df.index, grid_df["active_power"], label="active_power") + plt.legend(loc="upper left") + plt.show() + exit() + + #/home/dk/git/wattson-artifacts/elaadnl/year_powerowl_example/controller-export/power-grid/WALL-2024-01-04T22-58-06-981491+00-00__SIM-2023-01-01T22-38-53-577418+00-00.powerowl.p.gz + + elif args.case == "plot_atks2": + plt.rc('text', usetex=True) + plt.rc('font', family='serif') + + font_size=22 + + plt.rc('xtick',labelsize=font_size) + plt.rc('ytick',labelsize=font_size) + + plt.rc('legend',fontsize=font_size) + for dataset in args.dataset: + logger.info(dataset) + OutDataDIR = configs[dataset]["OutDataDIR"] + 
#TRAIN_START_DATE=configs[dataset]["TRAIN_START_DATE"] + #VALIDATION_START_DATE=configs[dataset]["VALIDATION_START_DATE"] + #ATK_START_DATE=configs[dataset]["ATK_START_DATE"] + + datasetb = configs[dataset]["BASE"] + base_dir = configs[datasetb]["OutDataDIR"] + + # base_dfs=dict() + # base_feat = load_feats(base_dir,"DSO") + # for group, df_dict in tqdm(base_feat, total=load_feats_len(base_dir, "DSO"), desc="base_dir "): + # base_dfs[group] = df_dict + + pp=get_grid_pp(configs[dataset]["DIR"]) + #print(pp) + lines=pp["line"] + lines=lines[lines["from_bus"] < lines["to_bus"]] + #print(lines[lines["to_bus"] == do_bus]) + #print("line."+str(lines[lines["to_bus"] == do_bus].index[0])) + + _plot_l_s = get_grid_measurements_from_export(configs[dataset]["DIR"], pp, bus_ns=[], target_elem="storage", target_var=".MEASUREMENT.active_power" ) + + _is_atk_dfs_all_cps=dict() + dso_feat = load_feats(OutDataDIR, ("DSO_"+args.atk_subset) if args.atk_subset else "DSO") + for group, df_dict in tqdm(dso_feat, total=load_feats_len(OutDataDIR, "DSO"), desc="eval dso regression "): + do_bus = int(group.split("_")[1]) + if "_"+str(do_bus) not in group: + continue + prune_feats(df_dict, args.features) + + cp_group_grid_power = _plot_l_s[int(group.split("_")[1])][0] + is_atk_dfs = get_clf_is_atk_dfs(OutDataDIR, group) + _is_atk_dfs = is_atk_dfs["CPO_0"] + _is_atk_dfs_all_cps[group] = _interp(_is_atk_dfs, cp_group_grid_power.index, col=None) + + + _is_atk_dfs_all_cps["all"] = _is_atk_dfs_all_cps["CP_11"].copy() + _is_atk_dfs_all_cps["all_grid"] = _plot_l_s[int("CP_11".split("_")[1])][0].copy() + for group, _is_atk_dfs in _is_atk_dfs_all_cps.items(): + if group in ["all", "all_grid", "CP_11"]: + continue + _is_atk_dfs_all_cps["all"]+=_is_atk_dfs + cp_group_grid_power = _plot_l_s[int(group.split("_")[1])][0] + _is_atk_dfs_all_cps["all_grid"]+=cp_group_grid_power + + cp_group_grid_power = _is_atk_dfs_all_cps["all_grid"] + cp_group_grid_power = cp_group_grid_power.loc["2023-12-22 06:00:00":"2023-12-26 02:00:00"] + _is_atk_dfs = _is_atk_dfs_all_cps["all"] + _is_atk_dfs = _is_atk_dfs.loc["2023-12-22 06:00:00":"2023-12-26 02:00:00"] + fig, ax = plt.subplots(figsize=(9,7)) + + # cmap = matplotlib.cm.get_cmap('Set1') + # cmap2 = matplotlib.cm.get_cmap('tab20c') + # cmap3 = matplotlib.cm.get_cmap('Dark2') + # df['charge_speed'].plot(ax=ax5, label='Attack Charge Speed',color=cmap(0.1)) + # df['charge_speed_should'].plot(ax=ax5, label='Normal Charge Speed', linestyle='--',color=cmap3(0.1)) + # df["pred"].plot(ax=ax5, label='Forecast', linestyle='-.',color=cmap(0.2)) + + ax.plot(cp_group_grid_power.index, cp_group_grid_power.values, label="EV Charging Grid Load", linestyle='--') #grid values (w/mad attacks) + ax.plot(_is_atk_dfs.index, _is_atk_dfs["charge_speed_lag_0"], label="Reported Load", linestyle='-.') #based on EnergyInterval (w/ mad and fdi atks) + ax.plot(_is_atk_dfs.index, _is_atk_dfs["custom_data_charge_speed_lag_0"], label="Original Load") #based on OriginalEnergyInterval (w/o attacks) + #ax.plot(atk_df.index, atk_df["charge_speed_lag_0"], label="charge_speed_lag_0 from "+actor_prefix) # + #ax.plot(_is_atk_dfs.index, _is_atk_dfs["is_attack_lag_0"]*500, label="is_attack_lag_0") #based on OriginalEnergyInterval (w/o attacks) + # print(_is_atk_dfs) # + + print( _is_atk_dfs[_is_atk_dfs["is_attack_lag_0"] != 0]) + delta=pd.Timedelta(minutes=30) + plt.axvspan(pd.to_datetime("2023-12-22 13:43:26")-delta, pd.to_datetime("2023-12-22 16:43:06")+delta, facecolor='0.55', alpha=0.2, label="Attacks") + 
plt.axvspan(pd.to_datetime("2023-12-23 06:24:34")-delta, pd.to_datetime("2023-12-23 09:23:35")+delta, facecolor='0.55', alpha=0.2) + plt.axvspan(pd.to_datetime("2023-12-24 16:13:29")-delta, pd.to_datetime("2023-12-24 20:33:38")+delta, facecolor='0.55', alpha=0.2) + plt.axvspan(pd.to_datetime("2023-12-25 09:03:59")-delta, pd.to_datetime("2023-12-25 12:34:09")+delta, facecolor='0.55', alpha=0.2) + + ax.xaxis.set_major_formatter( DateFormatter('%H:%M') ) + + plt.legend(fontsize=font_size) + #plt.legend(loc = "upper right", bbox_to_anchor=(1.16,1.085)) + plt.legend(loc = "upper right", bbox_to_anchor=(1.0,1.105)) + + #plt.tick_params(labelsize=18) + plt.gca().get_yaxis().get_major_formatter().set_useOffset(False) + plt.gca().get_yaxis().get_major_formatter().set_scientific(False) + + plt.xlim([cp_group_grid_power.index[0], cp_group_grid_power.index[-1]]) + plt.ylim([0, cp_group_grid_power.values.max()*1.1]) + + plt.xlabel(r'\textbf{Time}', fontsize=font_size+2) + plt.ylabel(r'\textbf{Power in W}', fontsize=font_size+2) + + plt.tight_layout() + + #plt.title(group) + #plt.legend(loc="upper right") + #plt.show() + Path(OutDataDIR+"/figs").mkdir(parents=True, exist_ok=True) + plt.savefig(OutDataDIR+"/figs/"+"atk.pdf") + print(cp_group_grid_power.index) + print(_is_atk_dfs.index) + continue + break + exit() + + + elif args.case == "train_reg_dso": + reg_type=args.reg_type + + for dataset in args.dataset: + OutDataDIR = configs[dataset]["OutDataDIR"] + TRAIN_START_DATE=configs[dataset]["TRAIN_START_DATE"] + VALIDATION_START_DATE=configs[dataset]["VALIDATION_START_DATE"] + ATK_START_DATE=configs[dataset]["ATK_START_DATE"] + Path(OutDataDIR+"/results/DSO."+".".join(args.features)).mkdir(parents=True, exist_ok=True) + + dso_feat = load_feats(OutDataDIR, ("DSO_"+args.atk_subset) if args.atk_subset else "DSO") + + for group, df_dict in tqdm(dso_feat, total=load_feats_len(OutDataDIR, "DSO"), desc="eval dso regression "+reg_type): + out_f_name=OutDataDIR+"/results/DSO."+".".join(args.features)+"/"+group+"_"+reg_type+".csv.gz" + if os.path.isfile(out_f_name): + if args.overwrite: + logger.debug(f"overwriting existing file {out_f_name}") + else: + logger.info(f"skipping existing file {out_f_name}") + continue + #print([f for f in df_dict["feat_cols"] if "grid_" in f and "lag_0" in f and "norm" not in f]) + #print([f for f in df_dict["feat_cols"] if "relation" in f]) + prune_feats(df_dict, args.features) + # for x in [f for f in df_dict["feat_cols"] if "grid_" in f and "lag_0" in f and "norm" not in f]: + # print(x) + # exit() + # df_dict["pred_col"] = "custom_data_charge_speed_lag_0" + ret_df = optimize_regression(df_dict, TRAIN_START_DATE, VALIDATION_START_DATE, ATK_START_DATE, reg_type, group) + ret_df.to_csv(out_f_name) + logger.info(f"Done. 
Results in {OutDataDIR+'/results/'}") + + + elif args.case == "get_features_cpo": + reg_type=args.reg_type + + for dataset in args.dataset: + logger.info(f"{args.case} {dataset=}") + OutDataDIR = configs[dataset]["OutDataDIR"] + + cpo_files_d = get_cpo_files(OutDataDIR) + for cpo, cps in cpo_files_d.items(): + cpo_files = read_cpo_files(os.path.join(OutDataDIR, cpo), cps) + for cpo_feat in get_cpo_group_feat_dfs(cpo_files, num_lags=NUM_LAGS): + + # logger.info(f'Writing Results to {OutDataDIR+"/feats/"+cpo+".gz"} ...') + save_feats(cpo_feat, OutDataDIR, cpo) + + elif args.case == "train_reg_cpo": + reg_type=args.reg_type + + for dataset in args.dataset: + OutDataDIR = configs[dataset]["OutDataDIR"] + TRAIN_START_DATE=configs[dataset]["TRAIN_START_DATE"] + VALIDATION_START_DATE=configs[dataset]["VALIDATION_START_DATE"] + ATK_START_DATE=configs[dataset]["ATK_START_DATE"] + Path(OutDataDIR+"/results/").mkdir(parents=False, exist_ok=True) + + cpos = [f for f in os.listdir(OutDataDIR+"/feats/") if os.path.isdir(os.path.join(OutDataDIR+"/feats/", f)) and f.startswith("CPO_")] + for cpo in cpos: + Path(OutDataDIR+"/results/"+cpo+"."+".".join(args.features)).mkdir(parents=False, exist_ok=True) + + cpo_feat = load_feats(OutDataDIR, cpo) + + for group, df_dict in tqdm(cpo_feat, total=load_feats_len(OutDataDIR, cpo), desc="eval cpo regression "+reg_type): + out_f_name=OutDataDIR+"/results/"+cpo+"."+".".join(args.features)+"/"+group+"_"+reg_type+".csv.gz" + if os.path.isfile(out_f_name): + if args.overwrite: + logger.debug(f"overwriting existing file {out_f_name}") + else: + logger.info(f"skipping existing file {out_f_name}") + continue + #df_dict["pred_col"] = "custom_data_charge_speed_lag_0" + prune_feats(df_dict, args.features) + ret_df = optimize_regression(df_dict, TRAIN_START_DATE, VALIDATION_START_DATE, ATK_START_DATE, reg_type, group) + ret_df.to_csv(out_f_name) + logger.info(f"Done. 
Results in {OutDataDIR+'/results/'}") + + + elif args.case == "eval_tuning": + for dataset in args.dataset: + OutDataDIR = configs[dataset]["OutDataDIR"] + ret_d = get_eval_dicts(OutDataDIR) + log_out="" + for actor in sorted(ret_d.keys()): + ret_dict = ret_d[actor] + for reg,conf_eval in ret_dict.items(): + log_out+= f"\n{actor}:\n\t{reg}\n\t{conf_eval['conf']}\n\t{conf_eval['eval']['rmse']}" + logger.info(log_out) + + + + elif args.case == "do_pred" or args.case == "do_pred_dso" or args.case == "do_pred_cpo": + num_shifts=args.shifts #py ids.py -c=do_pred_dso -s 2 -v -d elaadnl + #kfold_splits=[None, 5] #None=pred for test data; int=kfold splits for training data + + filter_features = args.features + if "all" in args.features: + filter_features=[] + + if args.case == "do_pred_dso": + prefix="DSO" + elif args.case == "do_pred_cpo": + prefix="CPO" + + for dataset in args.dataset: + is_atk=False + kfold_splits=[5] #None=pred for test data; int=kfold splits for training data + if "atk" in dataset: + is_atk=True + atk_dataset = dataset + atk_OutDataDIR = configs[atk_dataset]["OutDataDIR"] + dataset = configs[dataset]["BASE"] + kfold_splits=[None] + OutDataDIR = configs[dataset]["OutDataDIR"] + TRAIN_START_DATE=configs[dataset]["TRAIN_START_DATE"] + VALIDATION_START_DATE=configs[dataset]["VALIDATION_START_DATE"] + ATK_START_DATE=configs[dataset]["ATK_START_DATE"] + + kfold_splits_len = sum([k if k is not None else 1 for k in kfold_splits]) + + + + ret_d = get_cp_group_eval_dicts(OutDataDIR) #hyper param optimized confs + for actor in sorted(ret_d.keys()): + if not actor.startswith(prefix): + continue + ret_dict = ret_d[actor] + + if len(actor.split(".")) > 1: + features = actor.split(".")[1:] + else: + features=["all"] + logger.warning(f"assuming all features for {actor}") + actor_prefix = actor.split(".")[0] + actor_prefix = (actor_prefix+"_"+args.atk_subset) if args.atk_subset else actor_prefix + if is_atk: + Path(atk_OutDataDIR+"/predictions/"+actor_prefix+"."+".".join(features)).mkdir(parents=True, exist_ok=True) + else: + Path(OutDataDIR+"/predictions/"+actor_prefix+"."+".".join(features)).mkdir(parents=True, exist_ok=True) + + logger.debug(f"filtering {features=} based on {filter_features=}") + skip_ff=False + for ff in filter_features: + if ff not in features: + skip_ff=True + break + if skip_ff: + logger.info(f"skipping based on missing filter_feature: {ff} not in {actor_prefix} w/ {features=}") + continue + + for reg,group_d in ret_dict.items(): + # if "SVR" not in reg: + # continue + # if "RandomForestRegressor" not in reg : + # continue + # if actor_prefix=="DSO" and not ("MLPReg" in reg and actor == "DSO.only_norm_but_pred.no_cps_but_speed.no_norm1.no_date1.only_pred_lag"): + # continue + # if "CPO" in actor_prefix and ("DecisionTreeRegressor" in reg and actor == "CPO_0.only_norm.no_norm1.no_date1"): + # continue + # if "MLPReg" not in reg: + # continue + if args.reg_type is not None: + if reg != args.reg_type: + logger.info(f"skipping wrong reg_type {reg} != {args.reg_type}") + continue + + #print() + act_feat = iter_feats(OutDataDIR, actor_prefix) #generator + total=load_feats_len(OutDataDIR, actor_prefix) + g_i=0 + with tqdm(total=total*num_shifts*kfold_splits_len, desc=f"do_pred_{actor_prefix} {actor} {reg}", smoothing=0.0001) as pbar: + for group, df_dict_path in act_feat: + if is_atk: + out_f = atk_OutDataDIR+"/predictions/"+actor_prefix+"."+".".join(features)+"/"+group+"_"+reg+"_"+str(num_shifts)+".csv.gz" + else: + out_f = 
OutDataDIR+"/predictions/"+actor_prefix+"."+".".join(features)+"/"+group+"_"+reg+"_"+str(num_shifts)+".csv.gz" + + g_i+=1 + if os.path.isfile(out_f): + if args.overwrite: + logger.debug(f"overwriting existing file {out_f}") + else: + logger.info(f"skipping existing file {out_f}") + pbar.update(num_shifts*kfold_splits_len) + continue + + pbar.set_description(f"do_pred_{actor_prefix} {actor} {reg} {group} ({g_i}/{total})") + if group not in group_d: + logger.warning(f"skipping {group} not in {actor, reg} param opts, ie /results/") + pbar.update(num_shifts*kfold_splits_len) + continue + + df_dict = load_single_feat(df_dict_path) + + opt_df = group_d[group] + #print(actor, reg, group, opt_df) + # df_dict = act_feat[group] + # print([c for c in df_dict["feat_cols"] if c.startswith("charge_s")]) + prune_feats(df_dict, features) + + best_df = opt_df[opt_df["rmse"] == opt_df["rmse"].min()].iloc[0] + conf = ast.literal_eval(best_df["conf"]) + # if "MLPReg" in reg and actor == "DSO.only_norm_but_pred.no_cps_but_speed.no_norm1.no_date1.only_pred_lag": + # conf = [round(c/10) if c == 20000 else c for c in conf] + # if "GradientBoostingRegressor" in reg and actor == "CPO_0.only_norm.no_norm1.no_date1": + # conf = list(conf) + [10] #n_iter_no_change + # "CPO" in actor_prefix and not ("GradientBoostingRegressor" in reg and actor == "CPO_0.only_norm.no_norm1.no_date1") + + atk_df=None + if is_atk: + atk_df_dict = load_feats_CPg(atk_OutDataDIR, actor_prefix, group) + # cp_g_df = df_dict["cp_g_df"].loc[TRAIN_START_DATE:ATK_START_DATE] + # atk_cp_g_df = atk_df_dict["cp_g_df"].loc[ATK_START_DATE:] + # df_dict["cp_g_df"] = pd.concat([cp_g_df, atk_cp_g_df]) + atk_df = atk_df_dict["cp_g_df"] + # print(df_dict["cp_g_df"]["is_attack_lag_0"].value_counts()) + # print(atk_df_dict["cp_g_df"]["is_attack_lag_0"].value_counts()) + + + if False: + fig, ax = plt.subplots(figsize=(16,9)) + ax.plot(atk_df.index, atk_df["charge_speed_lag_0"], label="charge_speed_lag_0") #'o', + plt.legend() + plt.show() + plt.close() + exit() + + y_pred_all = get_regression_pred_conc_kfold(df_dict, TRAIN_START_DATE, ATK_START_DATE, reg, conf, actor, features, num_shifts=num_shifts, pbar=pbar, kfold_splits=kfold_splits, atk_df=atk_df, recursive=args.recursive_regression) + y_pred_all.to_csv(out_f) + + if is_atk: + logger.info(f"Done. predictions in {atk_OutDataDIR+'/predictions/'}") + else: + logger.info(f"Done. 
predictions in {OutDataDIR+'/predictions/'}") + + + elif args.case == "get_features_clf" or args.case == "get_features_clf_cpo" or args.case == "get_features_clf_dso": + for dataset in args.dataset: + logger.info(f"{dataset=}") + OutDataDIR = configs[dataset]["OutDataDIR"] + ret_d = get_prediction_dicts(OutDataDIR) + + for actor in sorted(ret_d.keys()): + if args.case == "get_features_clf_cpo" and not actor.startswith("CPO"): + continue + if args.case == "get_features_clf_dso" and not actor.startswith("DSO"): + continue + + actor_new = actor + if args.use_cpo_pred: + actor_new = actor.replace("DSO", "CPO_0") + actor_new+=".no_cps_but_speed" + actor_prefix = actor.split(".")[0] + + act_feat = iter_feats(OutDataDIR, actor_prefix) #generator + #act_feat = load_feats(OutDataDIR, actor_prefix) #generator + total=load_feats_len(OutDataDIR, actor_prefix) + # print(actor, actor_prefix) #DSO.all DSO + #for group in sorted(ret_d[actor].keys()): + #for group, df_dict in act_feat: + for group, df_dict_path in tqdm(act_feat, total=total, desc="get clf features "+actor): + df_dict=None + for cp_g_dict in ret_d[actor_new][group]: + + out_d = OutDataDIR+"/clf_feats/"+actor+"/" + if args.use_cpo_pred: + out_f = out_d + group+"_"+cp_g_dict["reg"]+"_"+cp_g_dict["shifts"]+"CPO0.csv.gz" + else: + out_f = out_d + group+"_"+cp_g_dict["reg"]+"_"+cp_g_dict["shifts"]+".csv.gz" + if os.path.isfile(out_f): + if args.overwrite: + logger.debug(f"overwriting existing file {out_f}") + else: + verify_file_integrity=False + logger.info(f"skipping existing file {out_f} {verify_file_integrity=}") + if verify_file_integrity: + p = Popen(["gzip", "-t", out_f], stdin=PIPE, stdout=PIPE, stderr=PIPE) + output, err = p.communicate() + rc = p.returncode + if rc == 0: #no error + continue + else: #rc == 1 + logger.warning(f"NOT skipping existing file {out_f}") + logger.warning(f"File broken: {rc=} {output=} {err=}") + try: + _test_if_file_broken = pd.read_csv(out_f) + logger.error(f"File broken but working read?: {out_f} {_test_if_file_broken}") + except Exception as e: + logger.warning(f"File broken: {e}") + else: + continue + + if df_dict is None: + df_dict = load_single_feat(df_dict_path) + + # print(group, cp_g_dict) #CP_1 {'reg': 'LinearSVR', 'shifts': '5', 'file': 'data/elaadnl/predictions/DSO.all/CP_1_LinearSVR_5.csv.gz', 'features': ['all']} + cp_g_pred = pd.read_csv(cp_g_dict["file"], index_col=0, parse_dates=[0]) + cp_g_pred = cp_g_pred.fillna(method='bfill') + if args.use_cpo_pred: + #print("interp") + cp_g_pred2 = _interp(cp_g_pred, df_dict["cp_g_df"].index, col=None) + for x in ["conf", "features" , "reg", "actor"]: + cp_g_pred2[x]= cp_g_pred[x].iloc[0] + cp_g_pred = cp_g_pred2.fillna(0) + #print("interp done") + # conf = cp_g_pred["conf"].iloc[0] + # conf = ast.literal_eval(conf) #regressor config + # print(conf) + # print(cp_g_pred) + # print(group, df_dict["cp_g_df"][[c for c in df_dict["cp_g_df"].columns if df_dict["pred_col"].replace("_lag_0","_lag_") in c]]) + + clf_df = get_clf_feat_dfs(cp_g_pred, df_dict["cp_g_df"], df_dict["pred_col"]) + # print(clf_df) + # exit() + save_clf_data(clf_df, out_d, out_f) + + elif args.case == "optimize_clf" or args.case == "optimize_clf_cpo" or args.case == "optimize_clf_dso": + clf_type=args.clf_type + clf_features=args.clf_features + + only_clf_feat=False + if "no_reg" in clf_features: + only_clf_feat=True + + train_contamination=False + if "train_conta" in clf_features: # + train_contamination=True + + for dataset in args.dataset: + OutDataDIR = configs[dataset]["OutDataDIR"] + 
ret_d = get_clf_feat_file_dicts(OutDataDIR) + + + #is_atk_dfs = get_clf_is_atk_dfs(OutDataDIR) #TODO only for atk dfs... + + for actor in sorted(ret_d.keys()): + if args.case == "optimize_clf_cpo" and not actor.startswith("CPO"): + continue + if args.case == "optimize_clf_dso" and not actor.startswith("DSO"): + continue + + if len(actor.split(".")) > 1: + features = actor.split(".")[1:] + else: + features=["all"] + logger.warning(f"assuming all features for {actor}") + + if "reg_no_cps" in clf_features: + if "no_cps" not in features: + features.append("no_cps") + if "copy_reg" in clf_features: # + clf_features.extend(features) + + actor_prefix = actor.split(".")[0] + act_feat = iter_feats(OutDataDIR, actor_prefix) #generator + #act_feat = load_feats(OutDataDIR, actor_prefix) #generator + total=load_feats_len(OutDataDIR, actor_prefix) + #print(actor, actor_prefix) #DSO.all DSO + #for group in sorted(ret_d[actor].keys()): + #for group, df_dict in act_feat: + param_len = get_param_grid_len(clf_type) + reg_len=len(ret_d[actor][list(ret_d[actor].keys())[0]]) + + with tqdm(total=total*reg_len*param_len, desc=f"optimize_clf {actor} {clf_type}") as pbar: + g_i=0 + for group, df_dict_path in act_feat: + g_i+=1 + #clf_is_atk = get_clf_is_atk(is_atk_dfs, group, df_dict["cp_g_df"].index) + + ret_evals_l=[] + out_d = OutDataDIR+"/clf_eval/"+actor+"/" + out_f = out_d + group+"_"+clf_type+"."+".".join(clf_features)+".csv.gz" + if os.path.isfile(out_f): + if args.overwrite: + logger.debug(f"overwriting existing file {out_f}") + else: + logger.info(f"skipping existing file {out_f}") + pbar.update(reg_len*param_len) + continue + df_dict = load_single_feat(df_dict_path) + r_i=0 + for cp_g_dict in ret_d[actor][group]: + r_i+=1 + if args.reg_type is not None: + if args.reg_type != cp_g_dict["reg"]: + logger.info(f"skipping {cp_g_dict['reg']} != {args.reg_type}") + pbar.update(param_len) + continue + + #logger.info(f'\nCurrent group: {group}/ reg: {cp_g_dict["reg"]}.{cp_g_dict["shifts"]}') + pbar.set_description(f'optimize_clf {actor} {clf_type} {group} ({g_i}/{total}) {cp_g_dict["reg"]}.{cp_g_dict["shifts"]} ({r_i}/{reg_len})') + # print(group, cp_g_dict) #CP_1 {'reg': 'LinearSVR', 'shifts': '5', 'file': 'data/elaadnl/clf_feats/DSO.all/CP_1_LinearSVR_5.csv.gz', 'features': ['all']} + #print(cp_g_dict["file"]) + cp_g_clf_feat = pd.read_csv(cp_g_dict["file"], index_col=0, parse_dates=[0]) + # print(cp_g_clf_feat) + # print(df_dict["cp_g_df"]) + #print(full_clf_feat) + #exit() + #cp_g_dict["features"] == features ? 
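+                                # Sketch of the merge performed below (the description is an inference,
+                                # the calls are taken from the code): prune_feats()/prune_clf_feats()
+                                # restrict both frames to the configured feature sets, the raw group
+                                # features are cut to the time range of the clf features, both are
+                                # concatenated column-wise (axis=1), and cross_val_clf() then runs the
+                                # hyper-parameter search for the anomaly detector on the joined frame.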
+ prune_feats(df_dict, features) + clf_feat_cols = prune_clf_feats(cp_g_clf_feat, clf_features, df_dict=df_dict) + + # print(clf_feat_cols) + # exit() + + full_feat_cols = sorted(df_dict["feat_cols"] + clf_feat_cols) + cp_g_clf_feat = cp_g_clf_feat[[c for c in full_feat_cols if c in cp_g_clf_feat.columns]] + cp_g_df = df_dict["cp_g_df"].loc[cp_g_clf_feat.index.min():cp_g_clf_feat.index.max()] + # print([c for c in cp_g_df.columns if "grid" in c]) + # exit() + cp_g_df = cp_g_df[[c for c in full_feat_cols if c in cp_g_df.columns]] + full_clf_feat = pd.concat([cp_g_clf_feat, cp_g_df], axis=1) + # print(full_clf_feat) + # print(full_clf_feat[full_clf_feat["prediction_0"].isna()]) + # print(full_clf_feat.loc[:, full_clf_feat.isna().any()]) + # exit() + + do_scale_pipeline = False + if "only_norm" in clf_features: + do_scale_pipeline = True + # print(f"{clf_features=} {do_scale_pipeline=}") + # exit() + ret_evals = cross_val_clf(clf_type, full_clf_feat, df_dict["feat_cols"], clf_feat_cols, pred_col = None, only_clf_feat=only_clf_feat, pbar=pbar, scale_pipeline=do_scale_pipeline, train_contamination=train_contamination) + for ret_eval in ret_evals: + ret_eval["reg"] = cp_g_dict["reg"] + ret_eval["shifts"] = cp_g_dict["shifts"] + ret_eval["features"] = cp_g_dict["features"] + ret_eval["clf_features"] = clf_features + ret_evals_l.extend(ret_evals) + #train_clf(cp_g_dict["reg"], cp_g_clf_feat, df_dict["cp_g_df"], df_dict["feat_cols"], df_dict["pred_col"]) + if len(ret_evals_l) > 0: + ret_eval_df = pd.DataFrame(ret_evals_l) + save_clf_data(ret_eval_df, out_d, out_f) + + + elif args.case == "eval_tuning_clf": + #TODO: eval flags... + drop_clf_feat_string=False + add_clf_reg_string=True + + for dataset in args.dataset: + OutDataDIR = configs[dataset]["OutDataDIR"] + ret_d = get_eval_dicts_clf(OutDataDIR) + + best_d=defaultdict(lambda: defaultdict(lambda: dict())) + for actor in sorted(ret_d.keys()): + ret_dict = ret_d[actor] + for group,ret_d2 in ret_dict.items(): + for clf_reg,conf_eval in ret_d2.items(): + conf_eval["reg"] = clf_reg[1] + + if args.reg_type is not None: + if args.reg_type not in conf_eval["reg"]: + logger.info(f"skipping {conf_eval['reg']} != {args.reg_type}") + continue + + clf = clf_reg[0] + + if drop_clf_feat_string: + clf = clf.split(".")[0] + if add_clf_reg_string: + clf = clf+"."+clf_reg[1] + + if clf not in best_d[actor][group]: + best_d[actor][group][clf] = conf_eval + else: + if conf_eval["eval"] >= best_d[actor][group][clf]["eval"]: + if conf_eval["offset"] > best_d[actor][group][clf]["offset"]: + best_d[actor][group][clf] = conf_eval + + save_eval_dicts_clf(best_d, OutDataDIR) + + elif args.case == "print_eval_tuning_clf": + for dataset in args.dataset: + OutDataDIR = configs[dataset]["OutDataDIR"] + log_out="" + + best_d = load_eval_dicts_clf(OutDataDIR) + for actor in sorted(best_d.keys()): + ret_dict = best_d[actor] + for group in sorted(ret_dict.keys()): + ret_d2 = ret_dict[group] + for clf in sorted(ret_d2.keys()): + conf_eval = ret_d2[clf] + reg = conf_eval["reg"] + log_out+= f"\n{actor}:\n\t{group}\n\t\t{clf, reg}\n\t\t{conf_eval['conf']}\n\t\t{conf_eval['eval'], conf_eval['offset']}" + logger.info(log_out) + + elif args.case == "get_is_atk_dfs" or args.case == "get_is_atk_dfs_cpo" or args.case == "get_is_atk_dfs_dso": + grid_data = None + for dataset in args.dataset: + logger.info(f"{dataset=}") + is_atk=False + if "atk" in dataset: + is_atk=True + atk_dataset = dataset + atk_OutDataDIR = configs[atk_dataset]["OutDataDIR"] + dataset = configs[dataset]["BASE"] + 
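+                # Convention note (as used throughout this script): attack data sets carry "atk"
+                # in their name and reference their attack-free base run via configs[<atk_dataset>]["BASE"];
+                # the TRAIN/VALIDATION/ATK start dates below are always read from that base configuration.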
TRAIN_START_DATE=configs[dataset]["TRAIN_START_DATE"] + VALIDATION_START_DATE=configs[dataset]["VALIDATION_START_DATE"] + ATK_START_DATE=configs[dataset]["ATK_START_DATE"] + else: + raise ValueError(f"not an attack data set: {dataset}") + + OutDataDIR = configs[dataset]["OutDataDIR"] + ret_d = get_clf_feat_file_dicts(OutDataDIR) + atk_ret_d = get_clf_feat_file_dicts(atk_OutDataDIR) + + best_d = load_eval_dicts_clf(OutDataDIR) + + for actor in sorted(ret_d.keys()): + if args.case == "get_is_atk_dfs_cpo" and not actor.startswith("CPO"): + continue + if args.case == "get_is_atk_dfs_dso" and not actor.startswith("DSO"): + continue + + if len(actor.split(".")) > 1: + features = actor.split(".")[1:] + else: + features=["all"] + logger.warning(f"assuming all features for {actor}") + + + actor_prefix = actor.split(".")[0] + actor_prefix_new = (actor_prefix+"_"+args.atk_subset) if args.atk_subset else actor_prefix + actor_new = actor.replace(actor_prefix, actor_prefix_new) + actor_prefix = actor_prefix_new + #act_feat = load_feats(OutDataDIR, actor_prefix) #generator + act_feat = iter_feats(OutDataDIR, actor_prefix) #generator + total=load_feats_len(OutDataDIR, actor_prefix) + #print(actor, actor_prefix) #DSO.all DSO + + + #best_d_len = len([k2 for k,v in best_d[actor].items() for k2,v2 in best_d[actor][k].items()]) + + with tqdm(total=total, desc="get_is_atk_dfs "+actor_new) as pbar: + with ProcessPoolExecutor(NUM_THREADS) as pool: #multiprocessing + g_i=0 + results=[] + for group, _ in act_feat: #df_dict_path + out_d = atk_OutDataDIR+"/clf_is_atk_labels/"+actor_new+"/" + out_f = out_d + group+".csv.gz" + + if os.path.isfile(out_f): + if args.overwrite: + logger.debug(f"overwriting existing file {out_f}") + else: + logger.info(f"skipping existing file {out_f}") + pbar.update(1) + continue + + is_atk_dfs = get_clf_is_atk_dfs(atk_OutDataDIR, group) + + atk_df_dict = load_feats_CPg(atk_OutDataDIR, actor_prefix, group) + atk_df = atk_df_dict["cp_g_df"] + + if grid_data is None: + DIR = configs[dataset]["DIR"] + pp = get_grid_pp(DIR) + #start_time=min(v.index.min() for v in is_atk_dfs.values())-pd.Timedelta(days=1) + start_time=None + grid_data = get_grid_measurements_from_export(DIR, pp, bus_ns=[], target_elem="storage", target_var=".MEASUREMENT.active_power", start_time=start_time) + # for k,v in grid_data.items(): + # print(k,len(v)) + cp_group_grid_power = deepcopy(grid_data[int(group.split("_")[1])])[0] + #print(cp_group_grid_power[min(v.index.min() for v in is_atk_dfs.values()):max(v.index.max() for v in is_atk_dfs.values())]) + #print(is_atk_dfs) + + # print(atk_df) #charge_speed_lag_0 + # exit() + + + if False: + _is_atk_dfs = is_atk_dfs["CPO_0"] + fig, ax = plt.subplots(figsize=(16,9)) + ax.plot(cp_group_grid_power.index, cp_group_grid_power.values, label=cp_group_grid_power.name) #grid values (w/mad attacks) + ax.plot(_is_atk_dfs.index, _is_atk_dfs["charge_speed_lag_0"], label="charge_speed_lag_0") #based on EnergyInterval (w/ mad and fdi atks) + ax.plot(_is_atk_dfs.index, _is_atk_dfs["custom_data_charge_speed_lag_0"], label="custom_data_charge_speed_lag_0") #based on OriginalEnergyInterval (w/o attacks) + ax.plot(atk_df.index, atk_df["charge_speed_lag_0"], label="charge_speed_lag_0 from "+actor_prefix) # + + atk_l = get_atks(5, start_sim_offset_h=-2) + #print(atk_l) + _is_atk_dfs["atkt"] = None + for atk in atk_l: + _is_atk_dfs.loc[atk["start"]:atk["end"], "atkt"] = atk["type"] + + for atkt in _is_atk_dfs["atkt"].drop_duplicates(): + _is_atkt_dfs = _is_atk_dfs[_is_atk_dfs["atkt"] == atkt] + 
ax.plot(_is_atkt_dfs.index, _is_atkt_dfs["charge_speed_lag_0"], 'o', label="charge_speed_lag_0 "+str(atkt)) #based on EnergyInterval (w/ mad and fdi atks) + + plt.legend() + plt.show() + plt.close() + # print(row) + + exit() + + output_l = do_get_clf_is_atk_conc(pool, atk_df, ATK_START_DATE, is_atk_dfs, cp_group_grid_power, pbar=None) + results.append((out_d, out_f, output_l)) + #print(actor,best_d[actor]) + #print(best_d_ag) + + + for out_d, out_f, r in results: + g_i+=1 + pbar.set_description(f'load clf features {actor_new} {group} ({g_i}/{total})') + #timestamp1 = time.time() + output_df = r.result() + pbar.update(1) + #print(out_f, output_df) + #exit() + #timestamp2 = time.time() + #print("do_get_clf_is_atk_conc took %.2f seconds" % (timestamp2 - timestamp1)) + save_clf_data(output_df, out_d, out_f) + + + + elif args.case == "do_clf" or args.case == "do_clf_cpo" or args.case == "do_clf_dso": + allow_diff_shifts=True + filter_features = args.features + if "all" in args.features: + filter_features=[] + + for dataset in args.dataset: + logger.info(f"{dataset=}") + is_atk=False + if "atk" in dataset: + is_atk=True + atk_dataset = dataset + atk_OutDataDIR = configs[atk_dataset]["OutDataDIR"] + dataset = configs[dataset]["BASE"] + TRAIN_START_DATE=configs[dataset]["TRAIN_START_DATE"] + VALIDATION_START_DATE=configs[dataset]["VALIDATION_START_DATE"] + ATK_START_DATE=configs[dataset]["ATK_START_DATE"] + else: + raise ValueError(f"not an attack data set: {dataset}") + + OutDataDIR = configs[dataset]["OutDataDIR"] + ret_d = get_clf_feat_file_dicts(OutDataDIR) + atk_ret_d = get_clf_feat_file_dicts(atk_OutDataDIR) + + best_d = load_eval_dicts_clf(OutDataDIR) + + for actor in sorted(ret_d.keys()): + if args.case == "do_clf_cpo" and not actor.startswith("CPO"): + continue + if args.case == "do_clf_dso" and not actor.startswith("DSO"): + continue + + if len(actor.split(".")) > 1: + features = actor.split(".")[1:] + else: + features=["all"] + logger.warning(f"assuming all features for {actor}") + + actor_prefix = actor.split(".")[0] + actor_prefix_new = (actor_prefix+"_"+args.atk_subset) if args.atk_subset else actor_prefix + actor_new = actor.replace(actor_prefix, actor_prefix_new) + actor_prefix = actor_prefix_new + + logger.debug(f"filtering {features=} based on {filter_features=}") + skip_ff=False + for ff in filter_features: + if ff not in features: + skip_ff=True + break + if skip_ff: + logger.info(f"skipping based on missing filter_feature: {ff} not in {actor_prefix} w/ {features=}") + continue + + #act_feat = load_feats(OutDataDIR, actor_prefix) #generator + act_feat = iter_feats(OutDataDIR, actor_prefix) #generator + total=load_feats_len(OutDataDIR, actor_prefix) + #print(actor, actor_prefix) #DSO.all DSO + + + best_d_len = len([k2 for k,v in best_d[actor].items() for k2,v2 in best_d[actor][k].items()]) + + with tqdm(total=best_d_len, desc="do_clf "+actor) as pbar: + g_i=0 + for group, df_dict_path in act_feat: + g_i+=1 + out_d = atk_OutDataDIR+"/clf_results/"+actor_new+"/" + out_f = out_d + group+".csv.gz" + + if os.path.isfile(out_f): + if args.overwrite: + logger.debug(f"overwriting existing file {out_f}") + else: + logger.info(f"skipping existing file {out_f}") + best_d_ag = best_d[actor][group] + pbar.update(len(best_d_ag.keys())) + continue + + + #print(group) + cp_g_dict = ret_d[actor][group] + atk_cp_g_dict = atk_ret_d[actor_new][group] + best_d_ag = best_d[actor][group] + # for clf, best_dict in best_d_ag.items(): + # print(clf) + + _best_d_ag_orig_len = len(best_d_ag.keys()) + if 
args.reg_type is not None: # + best_d_ag2 = dict() + for clf, best_dict in best_d_ag.items(): + if args.reg_type in clf: + best_d_ag2[clf]=best_dict + best_d_ag = best_d_ag2 + if args.clf_type is not None: #clf_type + best_d_ag2 = dict() + for clf, best_dict in best_d_ag.items(): + if args.clf_type in clf: + best_d_ag2[clf]=best_dict + best_d_ag = best_d_ag2 + + if len(best_d_ag.keys()) == 0: + logger.info(f"skipping empty best_d_ag file {out_f}") + pbar.update(_best_d_ag_orig_len) + continue + # print() + # for clf, best_dict in best_d_ag.items(): + # if "add_grid_load_expo_" in clf.replace(best_dict["reg"],""): + # print(clf) + # print(best_dict) + # exit() + + + atk_labels_d = atk_OutDataDIR+"/clf_is_atk_labels/"+actor_new+"/" + atk_labels_f = atk_labels_d + group+".csv.gz" + atk_labels_df = pd.read_csv(atk_labels_f, index_col=0, parse_dates=[0]) + + #is_atk_dfs = get_clf_is_atk_dfs(atk_OutDataDIR, group) + + df_dict = load_single_feat(df_dict_path) + atk_df_dict = load_feats_CPg(atk_OutDataDIR, actor_prefix, group) + train_df = df_dict["cp_g_df"] + atk_df = atk_df_dict["cp_g_df"] + + + + pbar.set_description(f'do_clf {actor_new} {group} ({g_i}/{total})') + timestamp1 = time.time() + output_l = get_clf_result_output_conc(best_d_ag, cp_g_dict, atk_cp_g_dict, train_df, atk_df, TRAIN_START_DATE, ATK_START_DATE, atk_labels_df, pbar=pbar, allow_diff_shifts=allow_diff_shifts) + timestamp2 = time.time() + logger.debug("get_clf_result_output_conc() took %.2f seconds" % (timestamp2 - timestamp1)) + # output_l=[] + # for clf, best_dict in best_d_ag.items(): + # clf_results = get_clf_result_output(clf, best_dict, cp_g_dict, atk_cp_g_dict TRAIN_START_DATE, ATK_START_DATE, is_atk_dfs, pbar=pbar, allow_diff_shifts=allow_diff_shifts) + # if clf_results is not None: + # output_l.append(clf_results) + + if len(output_l) > 0: + output_df = pd.DataFrame(output_l) + save_clf_data(output_df, out_d, out_f) + else: + logger.warning(f"nothing to save for {dataset,actor,group}") + + elif args.case == "eval_clf_results1" or args.case == "eval_clf_results1_cpo" or args.case == "eval_clf_results1_dso": #clf/reg comparision + allow_diff_shifts=True + FIX_atk_labels_based_on_CPO_data=True + _totals_dict=defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: {"fpr":[],"tpr":[]})))))) + out_dict=defaultdict(lambda: deepcopy(_totals_dict)) + for dataset in args.dataset: + logger.info(f"{dataset=}") + is_atk=False + if "atk" in dataset: + is_atk=True + atk_dataset = dataset + atk_OutDataDIR = configs[atk_dataset]["OutDataDIR"] + + #fig_out_dir = atk_OutDataDIR.replace(atk_dataset,configs[dataset]["BASE"]+"_"+"all_atks") + + ad = atk_dataset.split("_") + if "new" in atk_dataset: + at_p=ad[4] + at_p="99" + _fig_out_dir = atk_OutDataDIR.replace(atk_dataset,ad[0]+"_"+ad[1]+"_"+ad[2]+"_all_"+at_p) + else: + at_p=ad[3] + at_p="99" + _fig_out_dir = atk_OutDataDIR.replace(atk_dataset,ad[0]+"_"+ad[1]+"_all_"+at_p) + totals_dict=out_dict[_fig_out_dir] + + dataset = configs[dataset]["BASE"] + TRAIN_START_DATE=configs[dataset]["TRAIN_START_DATE"] + VALIDATION_START_DATE=configs[dataset]["VALIDATION_START_DATE"] + ATK_START_DATE=configs[dataset]["ATK_START_DATE"] + else: + raise ValueError(f"not an attack data set: {dataset}") + + OutDataDIR = configs[dataset]["OutDataDIR"] + ret_d = get_clf_feat_file_dicts(OutDataDIR) + atk_ret_d = get_clf_feat_file_dicts(atk_OutDataDIR) + + best_d = load_eval_dicts_clf(OutDataDIR) + + + for actor in sorted(ret_d.keys()): + if 
args.case == "eval_clf_results1_cpo" and not actor.startswith("CPO"): + continue + if args.case == "eval_clf_results1_dso" and not actor.startswith("DSO"): + continue + + if len(actor.split(".")) > 1: + features = actor.split(".")[1:] + else: + features=["all"] + logger.warning(f"assuming all features for {actor}") + + + actor_prefix = actor.split(".")[0] + actor_prefix_new = (actor_prefix+"_"+args.atk_subset) if args.atk_subset else actor_prefix + actor_new = actor.replace(actor_prefix, actor_prefix_new) + actor_prefix = actor_prefix_new + + atk_act_feat = load_feats(atk_OutDataDIR, actor_prefix) + atk_df_dict = {group: df_dict for group, df_dict in atk_act_feat} + #act_feat = load_feats(OutDataDIR, actor_prefix) #generator + act_feat = iter_feats(OutDataDIR, actor_prefix) #generator + total=load_feats_len(OutDataDIR, actor_prefix) + #print(actor, actor_prefix) #DSO.all DSO + + + best_d_len = len([k2 for k,v in best_d[actor].items() for k2,v2 in best_d[actor][k].items()]) + + subcase=("is_attack1", "1", ) #any speed diff + subcase=("is_attack2", "1", ) #is_attack flag was set + # subcase=("is_attack_th", "1", "0.5") + # subcase=("is_attack_th", "3", "0.05") + subcase=("is_attack_th", "0", "0.18") + # subcase=("is_attack_th", "1", "0.2") + # subcase=("is_attack_abs", "1", "500") + + cut_off=40 + if args.eval_fac is not None: + if "." in args.eval_fac: + cut_off=int(args.eval_fac.split(".")[1]) + subcase=("is_attack_abs", "1", args.eval_fac.split(".")[0]) + + with tqdm(total=best_d_len, desc="eval_clf_results "+actor_new) as pbar: + g_i=0 + for group, df_dict_path in act_feat: + g_i+=1 + best_d_ag = best_d[actor][group] + pbar.set_description(f'eval_clf_results {actor_new} {group} ({g_i}/{total})') + out_d = atk_OutDataDIR+"/clf_results/"+actor_new+"/" + out_f = out_d + group+".csv.gz" + + if not os.path.isfile(out_f): + logger.warning(f"skipping missing file {out_f}") + pbar.update(len(best_d_ag.keys())) + continue + + atk_labels_d = atk_OutDataDIR+"/clf_is_atk_labels/"+actor_new+"/" + atk_labels_f = atk_labels_d + group+".csv.gz" + if not os.path.isfile(atk_labels_f): + logger.warning(f"skipping missing atk label file {atk_labels_f}") + pbar.update(len(best_d_ag.keys())) + continue + + + atk_labels_df = pd.read_csv(atk_labels_f, index_col=0, parse_dates=[0]) + + out_df = pd.read_csv(out_f, index_col=0, parse_dates=[0]) + out_df["decision_function"] = out_df["decision_function"].apply(lambda x: to_float_list(x)) + #print(out_df) + #print(atk_labels_df) + if "is_attack_th" in subcase: + th_fac = float(subcase[2]) + atk_labels_df["is_attack_th"] = 1 + atk_labels_df["is_attack_th_type"] = "None" + atk_labels_df.loc[atk_labels_df["charge_speed_lag_0_diff"].abs() > th_fac*atk_labels_df["charge_speed_lag_0_diff"].abs().max(), "is_attack_th"] = -1 #diff between reported and should > threshold; captures FDI+/-, MAD+/-, but not MAD+/- with FDI== + atk_labels_df.loc[atk_labels_df["charge_speed_lag_0_diff"].abs() > th_fac*atk_labels_df["charge_speed_lag_0_diff"].abs().max(), "is_attack_th_type"] += "FDI" + atk_labels_df.loc[atk_labels_df["charge_speed_lag_0_diff"] > th_fac*atk_labels_df["charge_speed_lag_0_diff"].abs().max(), "is_attack_th_type"] += "OCPP_inc" + atk_labels_df.loc[atk_labels_df["charge_speed_lag_0_diff"] < -1*th_fac*atk_labels_df["charge_speed_lag_0_diff"].abs().max(), "is_attack_th_type"] += "OCPP_dec" + + + + if False: + grid_fac = th_fac + grid_col = [c for c in atk_labels_df.columns if "grid_expo_storage." 
in c and "_diff_to_custom_data_charge_speed_lag_0" in c] + if len(grid_col)>1: + logger.error(f"too many cols: {grid_col=}") + grid_col = grid_col[0] + atk_labels_df.loc[(atk_labels_df["is_attack_th"] == -1) & (atk_labels_df[grid_col].abs() > grid_fac*atk_labels_df[grid_col].abs().max()), "is_attack_th_type"] += "Combo" + atk_labels_df.loc[(atk_labels_df["is_attack_th"] == -1) & (atk_labels_df[grid_col] > grid_fac*atk_labels_df[grid_col].abs().max()), "is_attack_th_type"] += "Grid_inc" + atk_labels_df.loc[(atk_labels_df["is_attack_th"] == -1) & (atk_labels_df[grid_col] < -1*grid_fac*atk_labels_df[grid_col].abs().max()), "is_attack_th_type"] += "Grid_dec" + + + if True: + atk_labels_df.loc[(atk_labels_df["is_attack2"] == 1), "is_attack_th"] = 1 #clean up based on original is_attack flag + + # _fac_2 = 0.5 + # atk_labels_df.loc[atk_labels_df["charge_speed_lag_0_diff"].abs() > _fac_2*atk_labels_df["charge_speed_lag_0_diff"].abs().max(), "is_attack_th"] = -1 + # atk_labels_df.loc[atk_labels_df[grid_col].abs() > _fac_2*atk_labels_df[grid_col].abs().max(), "is_attack_th"] = -1 + # print(atk_labels_df) + # exit() + elif "is_attack_abs" in subcase: + th_fac = float(subcase[2]) + atk_labels_df["is_attack_abs"] = 1 + atk_labels_df["is_attack_th_type"] = "None" + atk_labels_df.loc[atk_labels_df["charge_speed_lag_0_diff"].abs() > th_fac, "is_attack_abs"] = -1 #diff between reported and should > threshold; captures FDI+/-, MAD+/-, but not MAD+/- with FDI== + atk_labels_df.loc[atk_labels_df["charge_speed_lag_0_diff"].abs() > th_fac, "is_attack_th_type"] += "FDI" + atk_labels_df.loc[atk_labels_df["charge_speed_lag_0_diff"] > th_fac, "is_attack_th_type"] += "OCPP_inc" + atk_labels_df.loc[atk_labels_df["charge_speed_lag_0_diff"] < -1*th_fac, "is_attack_th_type"] += "OCPP_dec" + + # grid_fac = th_fac + # grid_col = [c for c in atk_labels_df.columns if "grid_expo_storage." 
in c and "_diff_to_custom_data_charge_speed_lag_0" in c] + # if len(grid_col)>1: + # logger.error(f"too many cols: {grid_col=}") + # grid_col = grid_col[0] + # atk_labels_df.loc[(atk_labels_df[grid_col].abs() > grid_fac), "is_attack_abs"] = -1 + # atk_labels_df.loc[(atk_labels_df[grid_col].abs() > grid_fac), "is_attack_th_type"] += "MAD" + # atk_labels_df.loc[(atk_labels_df[grid_col] > grid_fac), "is_attack_th_type"] += "Grid_inc" + # atk_labels_df.loc[(atk_labels_df[grid_col] < -1*grid_fac), "is_attack_th_type"] += "Grid_dec" + + if False: + atk_labels_df.loc[(atk_labels_df["is_attack2"] == 1), "is_attack_abs"] = 1 #clean up based on original is_attack flag + + else: + atk_labels_df["is_attack_th_type"] = "Unk" + atk_labels_df.loc[(atk_labels_df["charge_speed_lag_0_diff"] > 0), "is_attack_th_type"] ="Inc" + atk_labels_df.loc[(atk_labels_df["charge_speed_lag_0_diff"] < 0), "is_attack_th_type"] ="Dec" + atk_labels_df.loc[(atk_labels_df["charge_speed_lag_0_diff"] == 0) & (atk_labels_df[subcase[0]] == 1), "is_attack_th_type"] ="None" + + + #TODO: try diff definitions + atk_labels_df["is_attack"] = atk_labels_df[subcase[0]] + #atk_labels_df["is_attack"] *= -1 + # print(atk_labels_df[atk_labels_df["is_attack"] == -1]) + #atk_labels_df["decision_function"] = atk_labels_df["decision_function"].fillna(1) + # exit() + + if False: + print(atk_labels_df) + print(atk_labels_df.columns) + + fig, ax = plt.subplots(figsize=(22,8)) + ax.plot( + atk_labels_df.index, + atk_labels_df["charge_speed_lag_0_diff"], + #color="b", + label=r"test", # {len(d4), len(aucs)} + lw=2, + alpha=0.8, + ) + plt.show() + plt.close() + exit() + + + #exit() + #fig, ax = plt.subplots() + for atk_type in atk_labels_df["is_attack_th_type"].drop_duplicates(): + if atk_type == "None": + for idx,row in out_df.iterrows(): + atk_type="AllAtk" + atk_labels_df_type = atk_labels_df.copy() + atk_labels_df_type["decision_function"] = row["decision_function"] + + if len(atk_labels_df_type[atk_labels_df_type["is_attack"] == -1]) == 0: + continue + + clf_name = row["clf"].split(".")[0] + clf_full = row["clf"].replace("."+row["clf_r"],"") + clf_r_name = row["clf_r"].split(".")[0] + clf_n_params=".".join(clf_full.split(".")[1:]) + clf_r_params=".".join(row["clf_r"].split(".")[1:]) + + fpr, tpr, thresholds = roc_curve(atk_labels_df_type["is_attack"], atk_labels_df_type["decision_function"], pos_label=-1) + auc_v = auc(fpr, tpr) + if auc_v < 0.4: + fpr, tpr, thresholds = roc_curve(atk_labels_df_type["is_attack"], atk_labels_df_type["decision_function"]*-1, pos_label=-1) + if not np.isnan(auc_v): + totals_dict[actor_new][clf_r_params][clf_n_params][atk_type][clf_name+" "+clf_r_name][group]["fpr"].append(fpr) + totals_dict[actor_new][clf_r_params][clf_n_params][atk_type][clf_name+" "+clf_r_name][group]["tpr"].append(tpr) + else: + for idx,row in out_df.iterrows(): + atk_labels_df_c = atk_labels_df.copy() + atk_labels_df_c["decision_function"] = row["decision_function"] + atk_labels_df_type = atk_labels_df_c[((atk_labels_df_c["is_attack_th_type"] == atk_type) & (atk_labels_df_c["is_attack"] == -1)) | ((atk_labels_df_c["is_attack_th_type"] == "None") & (atk_labels_df_c["is_attack"] == 1))] + # print(atk_labels_df_type[atk_labels_df_type["is_attack"] == -1]) + # exit() + + if len(atk_labels_df_type[atk_labels_df_type["is_attack"] == -1]) == 0: + continue + + if False:#drop concecutive repeats to avoid inflation of TPR wrt. 
ROC of imprecise classifiers + cs_typedf = atk_df_dict[group]["cp_g_df"].loc[atk_labels_df_type.index]["charge_speed_lag_0"].to_frame() + cs_typedf["is_attack"] = atk_labels_df_type["is_attack"] + labels = (cs_typedf["charge_speed_lag_0"] != cs_typedf["charge_speed_lag_0"].shift()).cumsum() + cs_typedf['flag'] = (labels.map(labels.value_counts()) >= cut_off).astype(int) + cs_typedf_not_same = cs_typedf[(cs_typedf["flag"] == 0) | (cs_typedf["is_attack"] == -1)] #deflate but keep all atks + atk_labels_df_type = atk_labels_df_type.loc[cs_typedf_not_same.index] + + + #RocCurveDisplay.from_predictions(atk_labels_df_type["is_attack"], atk_labels_df_type["decision_function"], pos_label=-1, ax=ax, name=row["clf"]) + # totals_dict[row["clf"]+"_"+atk_type]["is_attack"].extend(atk_labels_df_type["is_attack"]) + # totals_dict[row["clf"]+"_"+atk_type]["decision_function"].extend(atk_labels_df_type["decision_function"]) + # print(row["clf"]) + # exit() + clf_name = row["clf"].split(".")[0] + clf_full = row["clf"].replace("."+row["clf_r"],"") + clf_r_name = row["clf_r"].split(".")[0] + clf_n_params=".".join(clf_full.split(".")[1:]) + clf_r_params=".".join(row["clf_r"].split(".")[1:]) + + # totals_dict[actor_new][clf_name][row["clf_r"]][row["clf"]][atk_type]["is_attack"].extend(atk_labels_df_type["is_attack"]) + # totals_dict[actor_new][clf_name][row["clf_r"]][row["clf"]][atk_type]["decision_function"].extend(atk_labels_df_type["decision_function"]) + + + fpr, tpr, thresholds = roc_curve(atk_labels_df_type["is_attack"], atk_labels_df_type["decision_function"], pos_label=-1) + auc_v = auc(fpr, tpr) + if auc_v < 0.4: + fpr, tpr, thresholds = roc_curve(atk_labels_df_type["is_attack"], atk_labels_df_type["decision_function"]*-1, pos_label=-1) + if not np.isnan(auc_v): + totals_dict[actor_new][clf_r_params][clf_n_params][atk_type][clf_name+" "+clf_r_name][group]["fpr"].append(fpr) + totals_dict[actor_new][clf_r_params][clf_n_params][atk_type][clf_name+" "+clf_r_name][group]["tpr"].append(tpr) + + #plt.show() + #exit() + pbar.update(len(best_d_ag.keys())) + + + plt.rc('text', usetex=USE_TEX) + plt.rc('font', family='serif') + font_size=22 + plt.rc('xtick',labelsize=font_size) + plt.rc('ytick',labelsize=font_size) + plt.rc('legend',fontsize=font_size) + + + + for fig_out_dir,totals_dict_ac in out_dict.items(): + fig_out_dir += "_comp" + if FIX_atk_labels_based_on_CPO_data: + fig_out_dir += "_fix" + for actor_new,totals_dict in totals_dict_ac.items(): + if len(totals_dict)>0: + out_fig_d = fig_out_dir+("/clf_result_figs_"+args.case.replace("eval_clf_results","")+"/")+actor_new+"/"+"-".join(subcase)+"/" + Path(out_fig_d).mkdir(parents=True, exist_ok=True) + fig_dict=defaultdict(lambda:defaultdict(lambda:dict())) + for clf_r_params, d in tqdm(totals_dict.items(), desc=f"saving eval_clf_results2 {actor_new} {fig_out_dir}"): + #for clf_n, d in totals_dict.items(): + for clf_n_params, d2 in d.items(): + for atk_type, d3 in d2.items(): + fig, ax = plt.subplots(figsize=(22,8)) + #fig, ax = plt.subplots(figsize=(22,14)) + plt.rc('axes', prop_cycle=( + cycler('linestyle', ['-', '--', ':', '-.']) * + cycler('color', [u'#1f77b4', u'#ff7f0e', u'#2ca02c', u'#d62728', u'#9467bd',]) #[u'#1f77b4', u'#ff7f0e', u'#2ca02c', u'#d62728', u'#9467bd', u'#8c564b', u'#e377c2', u'#7f7f7f', u'#bcbd22', u'#17becf'] + )) + max_auc=0 + #for clf_name, d4 in d3.items(): + for clf_name in sorted(d3.keys()): + # if "HistGradientBoostingReg" not in clf_name: #TODO + # continue + d4 = d3[clf_name] + mean_fpr = np.linspace(0, 1, 100) #based on 
https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc_crossval.html + tprs = [] + aucs = [] + for group, data in d4.items(): + for fpr, tpr in zip(data["fpr"], data["tpr"]): + auc_v = auc(fpr, tpr) + interp_tpr = np.interp(mean_fpr, fpr, tpr) + interp_tpr[0] = 0.0 + tprs.append(interp_tpr) + aucs.append(auc_v) + # ax.plot( + # fpr, + # tpr, + # color="b", + # label=f"ROC {group} {clf_name}", + # lw=2, + # alpha=0.3, + # ) + mean_tpr = np.mean(tprs, axis=0) + mean_tpr[-1] = 1.0 + mean_auc = auc(mean_fpr, mean_tpr) + std_auc = np.std(aucs) + max_auc=max(max_auc, mean_auc) + ax.plot( + mean_fpr, + mean_tpr, + #color="b", + label=f"{clf_name.replace(' ', ' w/ ').replace('Regressor', 'Reg.')} (AUC = {round(mean_auc, 2)} " + r"$\pm$" +f" {round(std_auc, 2)})", # {len(d4), len(aucs)} + #label=f"{clf_name.replace(' ', ' w/ ')}", # {len(d4), len(aucs)} + #label=r"$\pm$", # {len(d4), len(aucs)} + lw=2, + alpha=0.8, + ) + out_fig_f=out_fig_d+f"R_{clf_r_params}_N_{clf_n_params}_{atk_type}_{max_auc}_44.pdf" + #fig.show() + + box = ax.get_position() + ax.set_position([box.x0, box.y0, box.width * (8/22), box.height]) + + ax.legend(loc='right', fontsize=font_size, bbox_to_anchor=(2.85,0.505)) + + plt.gca().get_yaxis().get_major_formatter().set_useOffset(False) + plt.gca().get_yaxis().get_major_formatter().set_scientific(False) + plt.xlim([0,1]) + plt.ylim([0,1]) + plt.xlabel(r'\textbf{False Positive Rate (FPR)}', fontsize=font_size+2) + plt.ylabel(r'\textbf{True Positive Rate (TPR)}', fontsize=font_size+2) + #plt.tight_layout() + + fig.savefig(out_fig_f) + plt.close() + # exit() + + + elif args.case == "eval_clf_results2" or args.case == "eval_clf_results2_cpo" or args.case == "eval_clf_results2_dso": + allow_diff_shifts=True + FIX_atk_labels_based_on_CPO_data=True + _totals_dict=defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: {"fpr":[],"tpr":[]})))))) + out_dict=defaultdict(lambda: deepcopy(_totals_dict)) + for dataset in args.dataset: + logger.info(f"{dataset=}") + is_atk=False + if "atk" in dataset: + is_atk=True + atk_dataset = dataset + atk_OutDataDIR = configs[atk_dataset]["OutDataDIR"] + + #fig_out_dir = atk_OutDataDIR.replace(atk_dataset,configs[dataset]["BASE"]+"_"+"all_atks") + + ad = atk_dataset.split("_") + if "new" in atk_dataset: + at_p=ad[4] + at_p="99" + _fig_out_dir = atk_OutDataDIR.replace(atk_dataset,ad[0]+"_"+ad[1]+"_"+ad[2]+"_all_"+at_p) + else: + at_p=ad[3] + at_p="99" + _fig_out_dir = atk_OutDataDIR.replace(atk_dataset,ad[0]+"_"+ad[1]+"_all_"+at_p) + totals_dict=out_dict[_fig_out_dir] + + + dataset = configs[dataset]["BASE"] + TRAIN_START_DATE=configs[dataset]["TRAIN_START_DATE"] + VALIDATION_START_DATE=configs[dataset]["VALIDATION_START_DATE"] + ATK_START_DATE=configs[dataset]["ATK_START_DATE"] + else: + raise ValueError(f"not an attack data set: {dataset}") + + atk_OutDataDIR_full=atk_OutDataDIR + OutDataDIR = configs[dataset]["OutDataDIR"] + if not os.path.isdir(OutDataDIR): + newOutDataDIR = OutDataDIR.replace("data", "data_full") + logger.info(f"changing {OutDataDIR=} to {newOutDataDIR}") + OutDataDIR = newOutDataDIR + atk_OutDataDIR_full = atk_OutDataDIR.replace("data", "data_full") + + ret_d = get_clf_feat_file_dicts(OutDataDIR) + atk_ret_d = get_clf_feat_file_dicts(atk_OutDataDIR_full) + + best_d = load_eval_dicts_clf(OutDataDIR) + + x_path=atk_OutDataDIR+"/clf_results/" + full_actors = [f for f in os.listdir(x_path) if os.path.isdir(os.path.join(x_path, f))] + #for actor in 
sorted(ret_d.keys()): + for actor in sorted(full_actors): + if args.case == "eval_clf_results2_cpo" and not actor.startswith("CPO"): + continue + if args.case == "eval_clf_results2_dso" and not actor.startswith("DSO"): + continue + + if len(actor.split(".")) > 1: + features = actor.split(".")[1:] + else: + features=["all"] + logger.warning(f"assuming all features for {actor}") + + + actor_prefix = actor.split(".")[0] + actor_prefix_new = (actor_prefix+"_"+args.atk_subset) if args.atk_subset else actor_prefix + actor_new = actor.replace(actor_prefix, actor_prefix_new) + actor_prefix = actor_prefix_new + + atk_act_feat = load_feats(atk_OutDataDIR_full, actor_prefix) + atk_df_dict = {group: df_dict for group, df_dict in atk_act_feat} + #act_feat = load_feats(OutDataDIR, actor_prefix) #generator + act_feat = iter_feats(OutDataDIR, actor_prefix) #generator + total=load_feats_len(OutDataDIR, actor_prefix) + #print(actor, actor_prefix) #DSO.all DSO + + + best_d_len = len([k2 for k,v in best_d[actor].items() for k2,v2 in best_d[actor][k].items()]) + + subcase=("is_attack1", "1", ) #any speed diff + subcase=("is_attack2", "1", ) #is_attack flag was set + subcase=("is_attack_th", "1", "0.5") + subcase=("is_attack_th", "3", "0.05") + subcase=("is_attack_th", "1", "0.05") + subcase=("is_attack_th", "1", "0.15") + #subcase=("is_attack_abs", "1", "500") + + cut_off=40 + if args.eval_fac is not None: + if "." in args.eval_fac: + cut_off=int(args.eval_fac.split(".")[1]) + subcase=("is_attack_abs", "1", args.eval_fac.split(".")[0], str(cut_off)) + + with tqdm(total=best_d_len, desc="eval_clf_results "+actor_new) as pbar: + g_i=0 + for group, df_dict_path in act_feat: + g_i+=1 + best_d_ag = best_d[actor][group] + pbar.set_description(f'eval_clf_results {actor_new} {group} ({g_i}/{total})') + out_d = atk_OutDataDIR+"/clf_results/"+actor_new+"/" + out_f = out_d + group+".csv.gz" + + if not os.path.isfile(out_f): + logger.warning(f"skipping missing file {out_f}") + pbar.update(len(best_d_ag.keys())) + continue + + atk_labels_d = atk_OutDataDIR+"/clf_is_atk_labels/"+actor_new+"/" + atk_labels_f = atk_labels_d + group+".csv.gz" + if not os.path.isfile(atk_labels_f): + logger.warning(f"skipping missing atk label file {atk_labels_f}") + pbar.update(len(best_d_ag.keys())) + continue + + + atk_labels_df = pd.read_csv(atk_labels_f, index_col=0, parse_dates=[0]) + + if False: + grid_col = [c for c in atk_labels_df.columns if "grid_expo_storage." 
in c and "_diff_to_custom_data_charge_speed_lag_0" in c] + for gc in grid_col: + print(atk_labels_df[atk_labels_df[gc] != 0][gc].abs().sort_values()) + print(atk_labels_df[atk_labels_df["charge_speed_lag_0_diff"] != 0]["charge_speed_lag_0_diff"].abs().sort_values()) + exit() + + out_df = pd.read_csv(out_f, index_col=0, parse_dates=[0]) + out_df["decision_function"] = out_df["decision_function"].apply(lambda x: to_float_list(x)) + #print(out_df) + #print(atk_labels_df) + if "is_attack_th" in subcase: + th_fac = float(subcase[2]) + atk_labels_df["is_attack_th"] = 1 + atk_labels_df["is_attack_th_type"] = "None" + atk_labels_df.loc[atk_labels_df["charge_speed_lag_0_diff"].abs() > th_fac*atk_labels_df["charge_speed_lag_0_diff"].abs().max(), "is_attack_th"] = -1 #diff between reported and should > threshold; captures FDI+/-, MAD+/-, but not MAD+/- with FDI== + atk_labels_df.loc[atk_labels_df["charge_speed_lag_0_diff"].abs() > th_fac*atk_labels_df["charge_speed_lag_0_diff"].abs().max(), "is_attack_th_type"] += "FDI" + atk_labels_df.loc[atk_labels_df["charge_speed_lag_0_diff"] > th_fac*atk_labels_df["charge_speed_lag_0_diff"].abs().max(), "is_attack_th_type"] += "OCPP_inc" + atk_labels_df.loc[atk_labels_df["charge_speed_lag_0_diff"] < -1*th_fac*atk_labels_df["charge_speed_lag_0_diff"].abs().max(), "is_attack_th_type"] += "OCPP_dec" + + grid_fac = th_fac + grid_col = [c for c in atk_labels_df.columns if "grid_expo_storage." in c and "_diff_to_custom_data_charge_speed_lag_0" in c] + if len(grid_col)>1: + logger.error(f"too many cols: {grid_col=}") + grid_col = grid_col[0] + atk_labels_df.loc[(atk_labels_df["is_attack_th"] == -1) & (atk_labels_df[grid_col].abs() > grid_fac*atk_labels_df[grid_col].abs().max()), "is_attack_th_type"] += "Combo" + atk_labels_df.loc[(atk_labels_df["is_attack_th"] == -1) & (atk_labels_df[grid_col] > grid_fac*atk_labels_df[grid_col].abs().max()), "is_attack_th_type"] += "Grid_inc" + atk_labels_df.loc[(atk_labels_df["is_attack_th"] == -1) & (atk_labels_df[grid_col] < -1*grid_fac*atk_labels_df[grid_col].abs().max()), "is_attack_th_type"] += "Grid_dec" + + #grid_fac = th_fac*2 + grid_fac = th_fac*2 + #print(atk_labels_df["is_attack_th"].value_counts()) + #grid_col = [c for c in atk_labels_df.columns if "grid_expo_storage." 
in c and "_diff_to_charge_speed_lag_0" in c][0] + atk_labels_df.loc[(atk_labels_df["charge_speed_lag_0_diff"].abs() == 0) & (atk_labels_df["is_attack_th_type"].str.contains("Grid")==False) & (atk_labels_df[grid_col].abs() > grid_fac*atk_labels_df[grid_col].abs().max()), "is_attack_th"] = -1 #diff between grid measurement and reported > threshold; captures MAD+/- with any FDI (also FDI==) + atk_labels_df.loc[(atk_labels_df["charge_speed_lag_0_diff"].abs() == 0) & (atk_labels_df["is_attack_th_type"].str.contains("Grid")==False) & (atk_labels_df[grid_col].abs() > grid_fac*atk_labels_df[grid_col].abs().max()), "is_attack_th_type"] += "MAD" + atk_labels_df.loc[(atk_labels_df["charge_speed_lag_0_diff"].abs() == 0) & (atk_labels_df["is_attack_th_type"].str.contains("Grid")==False) & (atk_labels_df[grid_col] > grid_fac*atk_labels_df[grid_col].abs().max()), "is_attack_th_type"] += "Grid_inc" + atk_labels_df.loc[(atk_labels_df["charge_speed_lag_0_diff"].abs() == 0) & (atk_labels_df["is_attack_th_type"].str.contains("Grid")==False) & (atk_labels_df[grid_col] < -1*grid_fac*atk_labels_df[grid_col].abs().max()), "is_attack_th_type"] += "Grid_dec" + + atk_labels_df.loc[(atk_labels_df["charge_speed_lag_0_diff"].abs() > 0) & (atk_labels_df["is_attack_th_type"].str.contains("Grid")==False) & (atk_labels_df[grid_col].abs() > grid_fac*atk_labels_df[grid_col].abs().max()), "is_attack_th"] = -1 + atk_labels_df.loc[(atk_labels_df["charge_speed_lag_0_diff"].abs() > 0) & (atk_labels_df["is_attack_th_type"].str.contains("Grid")==False) & (atk_labels_df[grid_col].abs() > grid_fac*atk_labels_df[grid_col].abs().max()), "is_attack_th_type"] += "SmallCombo" + atk_labels_df.loc[(atk_labels_df["charge_speed_lag_0_diff"].abs() > 0) & (atk_labels_df["is_attack_th_type"].str.contains("Grid")==False) & (atk_labels_df[grid_col] > grid_fac*atk_labels_df[grid_col].abs().max()), "is_attack_th_type"] += "Grid_inc" + atk_labels_df.loc[(atk_labels_df["charge_speed_lag_0_diff"].abs() > 0) & (atk_labels_df["is_attack_th_type"].str.contains("Grid")==False) & (atk_labels_df[grid_col] < -1*grid_fac*atk_labels_df[grid_col].abs().max()), "is_attack_th_type"] += "Grid_dec" + + if False: + atk_labels_df.loc[(atk_labels_df["is_attack2"] == 1), "is_attack_th"] = 1 #clean up based on original is_attack flag + + # _fac_2 = 0.5 + # atk_labels_df.loc[atk_labels_df["charge_speed_lag_0_diff"].abs() > _fac_2*atk_labels_df["charge_speed_lag_0_diff"].abs().max(), "is_attack_th"] = -1 + # atk_labels_df.loc[atk_labels_df[grid_col].abs() > _fac_2*atk_labels_df[grid_col].abs().max(), "is_attack_th"] = -1 + # print(atk_labels_df) + # exit() + elif "is_attack_abs" in subcase: + th_fac = float(subcase[2]) + atk_labels_df["is_attack_abs"] = 1 + atk_labels_df["is_attack_th_type"] = "None" + atk_labels_df.loc[atk_labels_df["charge_speed_lag_0_diff"].abs() > th_fac, "is_attack_abs"] = -1 #diff between reported and should > threshold; captures FDI+/-, MAD+/-, but not MAD+/- with FDI== + atk_labels_df.loc[atk_labels_df["charge_speed_lag_0_diff"].abs() > th_fac, "is_attack_th_type"] += "FDI" + atk_labels_df.loc[atk_labels_df["charge_speed_lag_0_diff"] > th_fac, "is_attack_th_type"] += "OCPP_inc" + atk_labels_df.loc[atk_labels_df["charge_speed_lag_0_diff"] < -1*th_fac, "is_attack_th_type"] += "OCPP_dec" + + grid_fac = th_fac + grid_col = [c for c in atk_labels_df.columns if "grid_expo_storage." 
in c and "_diff_to_custom_data_charge_speed_lag_0" in c] + if len(grid_col)>1: + logger.error(f"too many cols: {grid_col=}") + grid_col = grid_col[0] + atk_labels_df.loc[(atk_labels_df[grid_col].abs() > grid_fac), "is_attack_abs"] = -1 + atk_labels_df.loc[(atk_labels_df[grid_col].abs() > grid_fac), "is_attack_th_type"] += "MAD" + atk_labels_df.loc[(atk_labels_df[grid_col] > grid_fac), "is_attack_th_type"] += "Grid_inc" + atk_labels_df.loc[(atk_labels_df[grid_col] < -1*grid_fac), "is_attack_th_type"] += "Grid_dec" + + if False: + atk_labels_df.loc[(atk_labels_df["is_attack2"] == 1), "is_attack_abs"] = 1 #clean up based on original is_attack flag + + + #TODO: try diff definitions + atk_labels_df["is_attack"] = atk_labels_df[subcase[0]] + # print(atk_labels_df[atk_labels_df["is_attack"] == -1]) + # exit() + + #exit() + #fig, ax = plt.subplots() + for atk_type in atk_labels_df["is_attack_th_type"].drop_duplicates(): + if atk_type == "None": + for idx,row in out_df.iterrows(): + atk_labels_df_type = atk_labels_df.copy() + atk_labels_df_type["decision_function"] = row["decision_function"] + if len(atk_labels_df_type[atk_labels_df_type["is_attack"] == -1]) == 0: + continue + # print(atk_labels_df_type[atk_labels_df_type["is_attack"] == -1]) + # exit() + + clf_name = row["clf"].split(".")[0] + clf_full = row["clf"].replace("."+row["clf_r"],"") + clf_r_name = row["clf_r"].split(".")[0] + clf_n_params=".".join(clf_full.split(".")[1:]) + clf_r_params=".".join(row["clf_r"].split(".")[1:]) + + fpr, tpr, thresholds = roc_curve(atk_labels_df_type["is_attack"], atk_labels_df_type["decision_function"], pos_label=-1) + auc_v = auc(fpr, tpr) + if auc_v < 0.35: + fpr, tpr, thresholds = roc_curve(atk_labels_df_type["is_attack"], atk_labels_df_type["decision_function"]*-1, pos_label=-1) + if not np.isnan(auc_v): + totals_dict[actor_prefix][clf_name+"_"+clf_r_name]["overall"][row["clf_r"]][clf_full][group]["fpr"].append(fpr) + totals_dict[actor_prefix][clf_name+"_"+clf_r_name]["overall"][row["clf_r"]][clf_full][group]["tpr"].append(tpr) + else: + for idx,row in out_df.iterrows(): + atk_labels_df_c = atk_labels_df.copy() + if len(atk_labels_df_c) != len(row["decision_function"]): + raise Exception(f'len(atk_labels_df_c) != len(row["decision_function"]) -- {len(atk_labels_df_c)} != {len(row["decision_function"])}') + atk_labels_df_c["decision_function"] = row["decision_function"] + atk_labels_df_type = atk_labels_df_c[((atk_labels_df_c["is_attack_th_type"] == atk_type) & (atk_labels_df_c["is_attack"] == -1)) | ((atk_labels_df_c["is_attack_th_type"] == "None") & (atk_labels_df_c["is_attack"] == 1))] + if len(atk_labels_df_type[atk_labels_df_type["is_attack"] == -1]) == 0: + #logger.warning(f"no attacks for {atk_type}") + continue + # print(atk_labels_df_type[atk_labels_df_type["is_attack"] == -1]) + # exit() + + + if True:#drop concecutive repeats to avoid inflation of TPR wrt. 
ROC of imprecise classifiers + #print(atk_labels_df_type) + #print(df_dict_path) + #df_dict = load_single_feat(df_dict_path) + #print(df_dict["cp_g_df"]) + #print(out_df) + cs_typedf = atk_df_dict[group]["cp_g_df"].loc[atk_labels_df_type.index]["charge_speed_lag_0"].to_frame() + cs_typedf["is_attack"] = atk_labels_df_type["is_attack"] + #print(cs_typedf) + # 1 2 2 3 0 0 0 0 0 3 + # _ _ 2 _ _ 0 0 0 0 _ + # _ _ _ _ _ _ 0 0 0 _ + labels = (cs_typedf["charge_speed_lag_0"] != cs_typedf["charge_speed_lag_0"].shift()).cumsum() + cs_typedf['flag'] = (labels.map(labels.value_counts()) >= cut_off).astype(int) + cs_typedf_not_same = cs_typedf[(cs_typedf["flag"] == 0) | (cs_typedf["is_attack"] == -1)] #deflate but keep all atks + # print(cs_typedf_not_same) + # print(atk_labels_df_type) + atk_labels_df_type = atk_labels_df_type.loc[cs_typedf_not_same.index] + + if False: + fig, ax = plt.subplots(figsize=(16,9)) + ax.plot(cs_typedf.index, cs_typedf["charge_speed_lag_0"], 'o', label="charge_speed_lag_0") + ax.plot(cs_typedf_not_same.index, cs_typedf_not_same["charge_speed_lag_0"], 'o', label="charge_speed_lag_0 not same") + plt.legend() + plt.show() + plt.close() + exit() + + clf_name = row["clf"].split(".")[0] + clf_full = row["clf"].replace("."+row["clf_r"],"") + clf_r_name = row["clf_r"].split(".")[0] + clf_n_params=".".join(clf_full.split(".")[1:]) + clf_r_params=".".join(row["clf_r"].split(".")[1:]) + + fpr, tpr, thresholds = roc_curve(atk_labels_df_type["is_attack"], atk_labels_df_type["decision_function"], pos_label=-1) + auc_v = auc(fpr, tpr) + if auc_v < 0.35: + fpr, tpr, thresholds = roc_curve(atk_labels_df_type["is_attack"], atk_labels_df_type["decision_function"]*-1, pos_label=-1) + if not np.isnan(auc_v): + totals_dict[actor_prefix][clf_name+"_"+clf_r_name][atk_type][row["clf_r"]][clf_full][group]["fpr"].append(fpr) + totals_dict[actor_prefix][clf_name+"_"+clf_r_name][atk_type][row["clf_r"]][clf_full][group]["tpr"].append(tpr) + else: + logger.warning(f"isnan for {atk_type}") + + #plt.show() + #exit() + pbar.update(len(best_d_ag.keys())) + + plt.rc('text', usetex=USE_TEX) + plt.rc('font', family='serif') + font_size=22 + plt.rc('xtick',labelsize=font_size) + plt.rc('ytick',labelsize=font_size) + plt.rc('legend',fontsize=font_size) + + def sort_items(n): + ret=defaultdict(lambda:list()) + #print(n) + for _n in n: + n_set = [r for r in _n.split('.')] + if "set_3" in n_set: + ret[1].append(_n) + elif "set_13" in n_set: + ret[0].append(_n) + elif "set_35" in n_set: + ret[2].append(_n) + elif [__n for __n in n_set if "add_grid_load_expo_rnd_" in __n] and "set_352" in n_set: + __n=[__n for __n in n_set if "add_grid_load_expo_rnd_" in __n][0] + #print("add_grid_load_expo_rnd_", _n) + if len(__n.replace("add_grid_load_expo_rnd_","")) == 1: + __n_new = __n.replace("add_grid_load_expo_rnd_","add_grid_load_expo_rnd_0") + _n = _n.replace(__n,__n_new) + #print("2add_grid_load_expo_rnd_", _n) + ret[4].append(_n) + elif "set_352" in n_set: + ret[3].append(_n) + elif "no_grid_storage" in n_set and "no_grid_sgen" in n_set: + ret[1].append(_n) + elif "no_grid" in n_set: + ret[0].append(_n) + elif "add_grid_load_expo_static_100" in n_set: + ret[2].append(_n) + else: + ret[9].append(_n) + if 4 in ret: + ret[4] = sorted(ret[4]) + ret[4] = [r.replace("add_grid_load_expo_rnd_0","add_grid_load_expo_rnd_") for r in ret[4]] + #print(ret[4]) + for k in sorted(ret.keys()): + for v in ret[k]: + yield v + + for fig_out_dir,totals_dict_ac in out_dict.items(): + fig_out_dir += "_comp" + if 
FIX_atk_labels_based_on_CPO_data: + fig_out_dir += "_fix" + for actor_new,totals_dict in totals_dict_ac.items(): + if len(totals_dict)>0: + out_fig_d = fig_out_dir+("/clf_result_figs_"+args.case.replace("eval_clf_results","")+"/")+actor_new+"/"+"-".join(subcase)+"/" + Path(out_fig_d).mkdir(parents=True, exist_ok=True) + fig_dict=defaultdict(lambda:defaultdict(lambda:dict())) + for clf_name, d in tqdm(totals_dict.items(), desc=f"saving eval_clf_results2 {actor_new} {fig_out_dir}"): + #for clf_n, d in totals_dict.items(): + for atk_type, d2 in d.items(): + fig, ax = plt.subplots(figsize=(22,8)) + plt.rc('axes', prop_cycle=( + cycler('linestyle', ['-', '--', ':', '-.']) * #'-', '--', ':', '-.' + cycler('color', [u'#1f77b4', u'#ff7f0e', u'#2ca02c', ]) #[ u'#1f77b4', u'#ff7f0e', u'#2ca02c', u'#d62728', u'#9467bd', u'#8c564b', u'#e377c2', u'#7f7f7f', u'#bcbd22', u'#17becf'] + )) + max_auc=0 + min_auc=1 + #for r_name, d3 in d2.items(): + for r_name in sort_items(d2.keys()): + d3 = d2[r_name] + #print("r_name",r_name) + #for n_name, d4 in d3.items(): + for n_name in sort_items(d3.keys()): + #rint("n_name",n_name) + d4 = d3[n_name] + mean_fpr = np.linspace(0, 1, 100) #based on https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc_crossval.html + tprs = [] + aucs = [] + #print(atk_type) + for group, data in d4.items(): + for fpr, tpr in zip(data["fpr"], data["tpr"]): + auc_v = auc(fpr, tpr) + interp_tpr = np.interp(mean_fpr, fpr, tpr) + interp_tpr[0] = 0.0 + tprs.append(interp_tpr) + aucs.append(auc_v) + # ax.plot( + # fpr, + # tpr, + # color="b", + # label=f"ROC {group} {clf_name}", + # lw=2, + # alpha=0.3, + # ) + mean_tpr = np.mean(tprs, axis=0) + mean_tpr[-1] = 1.0 + mean_auc = auc(mean_fpr, mean_tpr) + std_auc = np.std(aucs) + max_auc=max(max_auc, mean_auc) + min_auc=min(min_auc, mean_auc) + + n_set = [r for r in n_name.split('.')[1:] if 'reg' not in r and 'norm' not in r] + if "set_3" in n_set: + n_set_name = "Nov.: Basic Grid" + elif "set_13" in n_set: + n_set_name = "Nov.: No Grid" + elif "set_35" in n_set: + n_set_name = "Nov.: Advanced Grid (no noise)" + elif "set_352" in n_set: + perc = [s for s in n_set if "add_grid_load_expo_rnd_" in s][0].split("_")[-1] + #print(perc) + if perc != "5": + continue + n_set_name = r"Nov.: Advanced Grid ("+perc+r"\% noise)" + else: + n_set_name = ".".join(n_set) + + r_set = [r_name.split('.')[1]]+[r for r in r_name.split('.')[1:] if 'grid' in r] + if "no_grid_storage" in r_set and "no_grid_sgen" in r_set: + r_set_name = "Reg.: Basic Grid" + elif "no_grid" in r_set: + r_set_name = "Reg.: No Grid" + elif "add_grid_load_expo_static_100" in r_set: + r_set_name = "Reg.: Advanced Grid" + else: + r_set_name = ".".join(r_set) + + ax.plot( + mean_fpr, + mean_tpr, + #color="b", + label=f"{r_set_name} -- {n_set_name} (AUC = {round(mean_auc, 2)} $\pm$ {round(std_auc, 2)})", #{atk_type.replace('None','').replace('FDI','').replace('MAD','')} + #label=f"Mean ROC {atk_type} {r_name.split('.')[1:]} {n_name.split('.')[1:]} (AUC = {round(mean_auc, 2)} $\pm$ {round(std_auc, 2)} {len(d4), len(aucs)})", + lw=2, + alpha=0.8, + ) + out_fig_f=out_fig_d+f"{clf_name}_{atk_type}_{round(max_auc,2)}_{round(min_auc,2)}_44.pdf" + + box = ax.get_position() + ax.set_position([box.x0, box.y0, box.width * (8/22), box.height]) + + ax.legend(loc='right', fontsize=font_size, bbox_to_anchor=(2.85,0.505)) + #fig.show() + #ax.legend(loc='best', fontsize=font_size) + + plt.gca().get_yaxis().get_major_formatter().set_useOffset(False) + 
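+                            # Figure-layout note (descriptive only): the axis box is shrunk to 8/22 of
+                            # the figure width so the legend fits outside the ROC panel, and both axes
+                            # are clamped to [0, 1] since FPR and TPR are rates.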
plt.gca().get_yaxis().get_major_formatter().set_scientific(False) + plt.xlim([0,1]) + plt.ylim([0,1]) + plt.xlabel(r'\textbf{False Positive Rate (FPR)}', fontsize=font_size+2) + plt.ylabel(r'\textbf{True Positive Rate (TPR)}', fontsize=font_size+2) + fig.savefig(out_fig_f) + #plt.show() + plt.close() + # exit() + + + elif args.case == "eval_clf_results3" or args.case == "eval_clf_results3_cpo" or args.case == "eval_clf_results3_dso": + allow_diff_shifts=True + FIX_atk_labels_based_on_CPO_data=True + _totals_dict=defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: {"fpr":[],"tpr":[]})))))) + out_dict=defaultdict(lambda: deepcopy(_totals_dict)) + for dataset in args.dataset: + logger.info(f"{dataset=}") + is_atk=False + if "atk" in dataset: + is_atk=True + atk_dataset = dataset + atk_OutDataDIR = configs[atk_dataset]["OutDataDIR"] + + #fig_out_dir = atk_OutDataDIR.replace(atk_dataset,configs[dataset]["BASE"]+"_"+"all_atks") + + ad = atk_dataset.split("_") + if "new" in atk_dataset: + at_p=ad[4] + at_p="99" + _fig_out_dir = atk_OutDataDIR.replace(atk_dataset,ad[0]+"_"+ad[1]+"_"+ad[2]+"_all_"+at_p) + else: + at_p=ad[3] + at_p="99" + _fig_out_dir = atk_OutDataDIR.replace(atk_dataset,ad[0]+"_"+ad[1]+"_all_"+at_p) + totals_dict=out_dict[_fig_out_dir] + + + dataset = configs[dataset]["BASE"] + TRAIN_START_DATE=configs[dataset]["TRAIN_START_DATE"] + VALIDATION_START_DATE=configs[dataset]["VALIDATION_START_DATE"] + ATK_START_DATE=configs[dataset]["ATK_START_DATE"] + else: + raise ValueError(f"not an attack data set: {dataset}") + + atk_OutDataDIR_full=atk_OutDataDIR + OutDataDIR = configs[dataset]["OutDataDIR"] + if not os.path.isdir(OutDataDIR): + newOutDataDIR = OutDataDIR.replace("data", "data_full") + logger.info(f"changing {OutDataDIR=} to {newOutDataDIR}") + OutDataDIR = newOutDataDIR + atk_OutDataDIR_full = atk_OutDataDIR.replace("data", "data_full") + + ret_d = get_clf_feat_file_dicts(OutDataDIR) + atk_ret_d = get_clf_feat_file_dicts(atk_OutDataDIR_full) + + best_d = load_eval_dicts_clf(OutDataDIR) + + x_path=atk_OutDataDIR+"/clf_results/" + full_actors = [f for f in os.listdir(x_path) if os.path.isdir(os.path.join(x_path, f))] + #for actor in sorted(ret_d.keys()): + for actor in sorted(full_actors): + if args.case == "eval_clf_results3_cpo" and not actor.startswith("CPO"): + continue + if args.case == "eval_clf_results3_dso" and not actor.startswith("DSO"): + continue + + if len(actor.split(".")) > 1: + features = actor.split(".")[1:] + else: + features=["all"] + logger.warning(f"assuming all features for {actor}") + + + actor_prefix = actor.split(".")[0] + actor_prefix_new = (actor_prefix+"_"+args.atk_subset) if args.atk_subset else actor_prefix + actor_new = actor.replace(actor_prefix, actor_prefix_new) + actor_prefix = actor_prefix_new + + atk_act_feat = load_feats(atk_OutDataDIR_full, actor_prefix) + atk_df_dict = {group: df_dict for group, df_dict in atk_act_feat} + #act_feat = load_feats(OutDataDIR, actor_prefix) #generator + act_feat = iter_feats(OutDataDIR, actor_prefix) #generator + total=load_feats_len(OutDataDIR, actor_prefix) + #print(actor, actor_prefix) #DSO.all DSO + + + best_d_len = len([k2 for k,v in best_d[actor].items() for k2,v2 in best_d[actor][k].items()]) + + subcase=("is_attack1", "1", ) #any speed diff + subcase=("is_attack2", "1", ) #is_attack flag was set + subcase=("is_attack_th", "1", "0.5") + subcase=("is_attack_th", "3", "0.05") + subcase=("is_attack_th", "1", "0.05") + 
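+            # Ground-truth label variants (only the last `subcase` assignment takes effect and it can
+            # still be overridden via args.eval_fac below):
+            #   ("is_attack_th",  <variant>, <fac>) - relative threshold: fac * max(|charge_speed diff|)
+            #   ("is_attack_abs", <variant>, <abs>) - absolute threshold on the reported-vs-expected diff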
subcase=("is_attack_abs", "1", "500") + + cut_off=40 + if args.eval_fac is not None: + if "." in args.eval_fac: + cut_off=int(args.eval_fac.split(".")[1]) + subcase=("is_attack_abs", "1", args.eval_fac.split(".")[0]) + + with tqdm(total=best_d_len, desc="eval_clf_results "+actor_new) as pbar: + g_i=0 + for group, df_dict_path in act_feat: + g_i+=1 + best_d_ag = best_d[actor][group] + pbar.set_description(f'eval_clf_results {actor_new} {group} ({g_i}/{total})') + out_d = atk_OutDataDIR+"/clf_results/"+actor_new+"/" + out_f = out_d + group+".csv.gz" + + if not os.path.isfile(out_f): + logger.warning(f"skipping missing file {out_f}") + pbar.update(len(best_d_ag.keys())) + continue + + atk_labels_d = atk_OutDataDIR+"/clf_is_atk_labels/"+actor_new+"/" + atk_labels_f = atk_labels_d + group+".csv.gz" + if not os.path.isfile(atk_labels_f): + logger.warning(f"skipping missing atk label file {atk_labels_f}") + pbar.update(len(best_d_ag.keys())) + continue + + + atk_labels_df = pd.read_csv(atk_labels_f, index_col=0, parse_dates=[0]) + + if False: + grid_col = [c for c in atk_labels_df.columns if "grid_expo_storage." in c and "_diff_to_custom_data_charge_speed_lag_0" in c] + for gc in grid_col: + print(atk_labels_df[atk_labels_df[gc] != 0][gc].abs().sort_values()) + print(atk_labels_df[atk_labels_df["charge_speed_lag_0_diff"] != 0]["charge_speed_lag_0_diff"].abs().sort_values()) + exit() + + out_df = pd.read_csv(out_f, index_col=0, parse_dates=[0]) + out_df["decision_function"] = out_df["decision_function"].apply(lambda x: to_float_list(x)) + #print(out_df) + #print(atk_labels_df) + if "is_attack_th" in subcase: + th_fac = float(subcase[2]) + atk_labels_df["is_attack_th"] = 1 + atk_labels_df["is_attack_th_type"] = "None" + atk_labels_df.loc[atk_labels_df["charge_speed_lag_0_diff"].abs() > th_fac*atk_labels_df["charge_speed_lag_0_diff"].abs().max(), "is_attack_th"] = -1 #diff between reported and should > threshold; captures FDI+/-, MAD+/-, but not MAD+/- with FDI== + atk_labels_df.loc[atk_labels_df["charge_speed_lag_0_diff"].abs() > th_fac*atk_labels_df["charge_speed_lag_0_diff"].abs().max(), "is_attack_th_type"] += "FDI" + atk_labels_df.loc[atk_labels_df["charge_speed_lag_0_diff"] > th_fac*atk_labels_df["charge_speed_lag_0_diff"].abs().max(), "is_attack_th_type"] += "OCPP_inc" + atk_labels_df.loc[atk_labels_df["charge_speed_lag_0_diff"] < -1*th_fac*atk_labels_df["charge_speed_lag_0_diff"].abs().max(), "is_attack_th_type"] += "OCPP_dec" + + grid_fac = th_fac + grid_col = [c for c in atk_labels_df.columns if "grid_expo_storage." in c and "_diff_to_custom_data_charge_speed_lag_0" in c] + if len(grid_col)>1: + logger.error(f"too many cols: {grid_col=}") + grid_col = grid_col[0] + atk_labels_df.loc[(atk_labels_df["is_attack_th"] == -1) & (atk_labels_df[grid_col].abs() > grid_fac*atk_labels_df[grid_col].abs().max()), "is_attack_th_type"] += "Combo" + atk_labels_df.loc[(atk_labels_df["is_attack_th"] == -1) & (atk_labels_df[grid_col] > grid_fac*atk_labels_df[grid_col].abs().max()), "is_attack_th_type"] += "Grid_inc" + atk_labels_df.loc[(atk_labels_df["is_attack_th"] == -1) & (atk_labels_df[grid_col] < -1*grid_fac*atk_labels_df[grid_col].abs().max()), "is_attack_th_type"] += "Grid_dec" + + #grid_fac = th_fac*2 + grid_fac = th_fac*2 + #print(atk_labels_df["is_attack_th"].value_counts()) + #grid_col = [c for c in atk_labels_df.columns if "grid_expo_storage." 
in c and "_diff_to_charge_speed_lag_0" in c][0] + atk_labels_df.loc[(atk_labels_df["charge_speed_lag_0_diff"].abs() == 0) & (atk_labels_df["is_attack_th_type"].str.contains("Grid")==False) & (atk_labels_df[grid_col].abs() > grid_fac*atk_labels_df[grid_col].abs().max()), "is_attack_th"] = -1 #diff between grid measurement and reported > threshold; captures MAD+/- with any FDI (also FDI==) + atk_labels_df.loc[(atk_labels_df["charge_speed_lag_0_diff"].abs() == 0) & (atk_labels_df["is_attack_th_type"].str.contains("Grid")==False) & (atk_labels_df[grid_col].abs() > grid_fac*atk_labels_df[grid_col].abs().max()), "is_attack_th_type"] += "MAD" + atk_labels_df.loc[(atk_labels_df["charge_speed_lag_0_diff"].abs() == 0) & (atk_labels_df["is_attack_th_type"].str.contains("Grid")==False) & (atk_labels_df[grid_col] > grid_fac*atk_labels_df[grid_col].abs().max()), "is_attack_th_type"] += "Grid_inc" + atk_labels_df.loc[(atk_labels_df["charge_speed_lag_0_diff"].abs() == 0) & (atk_labels_df["is_attack_th_type"].str.contains("Grid")==False) & (atk_labels_df[grid_col] < -1*grid_fac*atk_labels_df[grid_col].abs().max()), "is_attack_th_type"] += "Grid_dec" + + atk_labels_df.loc[(atk_labels_df["charge_speed_lag_0_diff"].abs() > 0) & (atk_labels_df["is_attack_th_type"].str.contains("Grid")==False) & (atk_labels_df[grid_col].abs() > grid_fac*atk_labels_df[grid_col].abs().max()), "is_attack_th"] = -1 + atk_labels_df.loc[(atk_labels_df["charge_speed_lag_0_diff"].abs() > 0) & (atk_labels_df["is_attack_th_type"].str.contains("Grid")==False) & (atk_labels_df[grid_col].abs() > grid_fac*atk_labels_df[grid_col].abs().max()), "is_attack_th_type"] += "SmallCombo" + atk_labels_df.loc[(atk_labels_df["charge_speed_lag_0_diff"].abs() > 0) & (atk_labels_df["is_attack_th_type"].str.contains("Grid")==False) & (atk_labels_df[grid_col] > grid_fac*atk_labels_df[grid_col].abs().max()), "is_attack_th_type"] += "Grid_inc" + atk_labels_df.loc[(atk_labels_df["charge_speed_lag_0_diff"].abs() > 0) & (atk_labels_df["is_attack_th_type"].str.contains("Grid")==False) & (atk_labels_df[grid_col] < -1*grid_fac*atk_labels_df[grid_col].abs().max()), "is_attack_th_type"] += "Grid_dec" + + if False: + atk_labels_df.loc[(atk_labels_df["is_attack2"] == 1), "is_attack_th"] = 1 #clean up based on original is_attack flag + + # _fac_2 = 0.5 + # atk_labels_df.loc[atk_labels_df["charge_speed_lag_0_diff"].abs() > _fac_2*atk_labels_df["charge_speed_lag_0_diff"].abs().max(), "is_attack_th"] = -1 + # atk_labels_df.loc[atk_labels_df[grid_col].abs() > _fac_2*atk_labels_df[grid_col].abs().max(), "is_attack_th"] = -1 + # print(atk_labels_df) + # exit() + elif "is_attack_abs" in subcase: + th_fac = float(subcase[2]) + atk_labels_df["is_attack_abs"] = 1 + atk_labels_df["is_attack_th_type"] = "None" + atk_labels_df.loc[atk_labels_df["charge_speed_lag_0_diff"].abs() > th_fac, "is_attack_abs"] = -1 #diff between reported and should > threshold; captures FDI+/-, MAD+/-, but not MAD+/- with FDI== + atk_labels_df.loc[atk_labels_df["charge_speed_lag_0_diff"].abs() > th_fac, "is_attack_th_type"] += "FDI" + atk_labels_df.loc[atk_labels_df["charge_speed_lag_0_diff"] > th_fac, "is_attack_th_type"] += "OCPP_inc" + atk_labels_df.loc[atk_labels_df["charge_speed_lag_0_diff"] < -1*th_fac, "is_attack_th_type"] += "OCPP_dec" + + grid_fac = th_fac + grid_col = [c for c in atk_labels_df.columns if "grid_expo_storage." 
in c and "_diff_to_custom_data_charge_speed_lag_0" in c] + if len(grid_col)>1: + logger.error(f"too many cols: {grid_col=}") + grid_col = grid_col[0] + atk_labels_df.loc[(atk_labels_df[grid_col].abs() > grid_fac), "is_attack_abs"] = -1 + atk_labels_df.loc[(atk_labels_df[grid_col].abs() > grid_fac), "is_attack_th_type"] += "MAD" + atk_labels_df.loc[(atk_labels_df[grid_col] > grid_fac), "is_attack_th_type"] += "Grid_inc" + atk_labels_df.loc[(atk_labels_df[grid_col] < -1*grid_fac), "is_attack_th_type"] += "Grid_dec" + + if True: + atk_labels_df.loc[(atk_labels_df["is_attack2"] == 1), "is_attack_abs"] = 1 #clean up based on original is_attack flag + + + #TODO: try diff definitions + atk_labels_df["is_attack"] = atk_labels_df[subcase[0]] + # print(atk_labels_df[atk_labels_df["is_attack"] == -1]) + # exit() + + #exit() + #fig, ax = plt.subplots() + for atk_type in atk_labels_df["is_attack_th_type"].drop_duplicates(): + if atk_type!="NoneFDIOCPP_decMADGrid_inc":continue + if atk_type == "None": + for idx,row in out_df.iterrows(): + if row["clf"].split(".")[0] != "IsolationForest":continue + atk_labels_df_type = atk_labels_df.copy() + atk_labels_df_type["decision_function"] = row["decision_function"] + if len(atk_labels_df_type[atk_labels_df_type["is_attack"] == -1]) == 0: + continue + # print(atk_labels_df_type[atk_labels_df_type["is_attack"] == -1]) + # exit() + + clf_name = row["clf"].split(".")[0] + clf_full = row["clf"].replace("."+row["clf_r"],"") + clf_r_name = row["clf_r"].split(".")[0] + clf_n_params=".".join(clf_full.split(".")[1:]) + clf_r_params=".".join(row["clf_r"].split(".")[1:]) + + fpr, tpr, thresholds = roc_curve(atk_labels_df_type["is_attack"], atk_labels_df_type["decision_function"], pos_label=-1) + auc_v = auc(fpr, tpr) + if auc_v < 0.35: + fpr, tpr, thresholds = roc_curve(atk_labels_df_type["is_attack"], atk_labels_df_type["decision_function"]*-1, pos_label=-1) + if not np.isnan(auc_v): + totals_dict[actor_prefix][clf_name+"_"+clf_r_name]["overall"][row["clf_r"]][clf_full][group]["fpr"].append(fpr) + totals_dict[actor_prefix][clf_name+"_"+clf_r_name]["overall"][row["clf_r"]][clf_full][group]["tpr"].append(tpr) + else: + for idx,row in out_df.iterrows(): + if row["clf"].split(".")[0] != "IsolationForest":continue + atk_labels_df_c = atk_labels_df.copy() + if len(atk_labels_df_c) != len(row["decision_function"]): + raise Exception(f'len(atk_labels_df_c) != len(row["decision_function"]) -- {len(atk_labels_df_c)} != {len(row["decision_function"])}') + atk_labels_df_c["decision_function"] = row["decision_function"] + atk_labels_df_type = atk_labels_df_c[((atk_labels_df_c["is_attack_th_type"] == atk_type) & (atk_labels_df_c["is_attack"] == -1)) | ((atk_labels_df_c["is_attack_th_type"] == "None") & (atk_labels_df_c["is_attack"] == 1))] + if len(atk_labels_df_type[atk_labels_df_type["is_attack"] == -1]) == 0: + #logger.warning(f"no attacks for {atk_type}") + continue + # print(atk_labels_df_type[atk_labels_df_type["is_attack"] == -1]) + # exit() + + + if True:#drop concecutive repeats to avoid inflation of TPR wrt. 
ROC of imprecise classifiers + #print(atk_labels_df_type) + #print(df_dict_path) + #df_dict = load_single_feat(df_dict_path) + #print(df_dict["cp_g_df"]) + #print(out_df) + cs_typedf = atk_df_dict[group]["cp_g_df"].loc[atk_labels_df_type.index]["charge_speed_lag_0"].to_frame() + cs_typedf["is_attack"] = atk_labels_df_type["is_attack"] + #print(cs_typedf) + # 1 2 2 3 0 0 0 0 0 3 + # _ _ 2 _ _ 0 0 0 0 _ + # _ _ _ _ _ _ 0 0 0 _ + labels = (cs_typedf["charge_speed_lag_0"] != cs_typedf["charge_speed_lag_0"].shift()).cumsum() + cs_typedf['flag'] = (labels.map(labels.value_counts()) >= cut_off).astype(int) + cs_typedf_not_same = cs_typedf[(cs_typedf["flag"] == 0) | (cs_typedf["is_attack"] == -1)] #deflate but keep all atks + # print(cs_typedf_not_same) + # print(atk_labels_df_type) + atk_labels_df_type = atk_labels_df_type.loc[cs_typedf_not_same.index] + + if False: + fig, ax = plt.subplots(figsize=(16,9)) + ax.plot(cs_typedf.index, cs_typedf["charge_speed_lag_0"], 'o', label="charge_speed_lag_0") + ax.plot(cs_typedf_not_same.index, cs_typedf_not_same["charge_speed_lag_0"], 'o', label="charge_speed_lag_0 not same") + plt.legend() + plt.show() + plt.close() + exit() + + clf_name = row["clf"].split(".")[0] + clf_full = row["clf"].replace("."+row["clf_r"],"") + clf_r_name = row["clf_r"].split(".")[0] + clf_n_params=".".join(clf_full.split(".")[1:]) + clf_r_params=".".join(row["clf_r"].split(".")[1:]) + + fpr, tpr, thresholds = roc_curve(atk_labels_df_type["is_attack"], atk_labels_df_type["decision_function"], pos_label=-1) + auc_v = auc(fpr, tpr) + if auc_v < 0.35: + fpr, tpr, thresholds = roc_curve(atk_labels_df_type["is_attack"], atk_labels_df_type["decision_function"]*-1, pos_label=-1) + if not np.isnan(auc_v): + totals_dict[actor_prefix][clf_name+"_"+clf_r_name][atk_type][row["clf_r"]][clf_full][group]["fpr"].append(fpr) + totals_dict[actor_prefix][clf_name+"_"+clf_r_name][atk_type][row["clf_r"]][clf_full][group]["tpr"].append(tpr) + else: + logger.warning(f"isnan for {atk_type}") + + #plt.show() + #exit() + pbar.update(len(best_d_ag.keys())) + + plt.rc('text', usetex=USE_TEX) + plt.rc('font', family='serif') + font_size=22 + plt.rc('xtick',labelsize=font_size) + plt.rc('ytick',labelsize=font_size) + plt.rc('legend',fontsize=font_size) + + def sort_items(n): + ret=defaultdict(lambda:list()) + #print(n) + for _n in n: + n_set = [r for r in _n.split('.')] + if "set_3" in n_set: + ret[1].append(_n) + elif "set_13" in n_set: + ret[0].append(_n) + elif "set_35" in n_set: + ret[2].append(_n) + elif [__n for __n in n_set if "add_grid_load_expo_rnd_" in __n] and "set_352" in n_set: + __n=[__n for __n in n_set if "add_grid_load_expo_rnd_" in __n][0] + #print("add_grid_load_expo_rnd_", _n) + if len(__n.replace("add_grid_load_expo_rnd_","")) == 1: + __n_new = __n.replace("add_grid_load_expo_rnd_","add_grid_load_expo_rnd_0") + _n = _n.replace(__n,__n_new) + #print("2add_grid_load_expo_rnd_", _n) + ret[4].append(_n) + elif "set_352" in n_set: + ret[3].append(_n) + elif "no_grid_storage" in n_set and "no_grid_sgen" in n_set: + ret[1].append(_n) + elif "no_grid" in n_set: + ret[0].append(_n) + elif "add_grid_load_expo_static_100" in n_set: + ret[2].append(_n) + else: + ret[9].append(_n) + if 4 in ret: + ret[4] = sorted(ret[4]) + #ret[4] = reversed(ret[4]) + ret[4] = [r.replace("add_grid_load_expo_rnd_0","add_grid_load_expo_rnd_") for r in ret[4]] + #print(ret[4]) + for k in sorted(ret.keys()): + for v in ret[k]: + yield v + + for fig_out_dir,totals_dict_ac in out_dict.items(): + fig_out_dir += "_comp" + if 
FIX_atk_labels_based_on_CPO_data: + fig_out_dir += "_fix" + for actor_new,totals_dict in totals_dict_ac.items(): + if len(totals_dict)>0: + out_fig_d = fig_out_dir+("/clf_result_figs_"+args.case.replace("eval_clf_results","")+"/")+actor_new+"/"+"-".join(subcase)+"/" + Path(out_fig_d).mkdir(parents=True, exist_ok=True) + fig_dict=defaultdict(lambda:defaultdict(lambda:dict())) + for clf_name, d in tqdm(totals_dict.items(), desc=f"saving eval_clf_results2 {actor_new} {fig_out_dir}"): + #for clf_n, d in totals_dict.items(): + for atk_type, d2 in d.items(): + fig, ax = plt.subplots(figsize=(22,8)) + plt.rc('axes', prop_cycle=( + #cycler('linestyle', ['-', '--', ':', '-.']) * #'-', '--', ':', '-.' + #cycler('color', [u'#1f77b4', u'#ff7f0e', u'#2ca02c', u'#d62728',]) #[ u'#1f77b4', u'#ff7f0e', u'#2ca02c', u'#d62728', u'#9467bd', u'#8c564b', u'#e377c2', u'#7f7f7f', u'#bcbd22', u'#17becf'] + cycler('color', [ u'#1f77b4', u'#ff7f0e', u'#2ca02c', u'#d62728', u'#9467bd', u'#8c564b', u'#e377c2', u'#7f7f7f', u'#bcbd22', u'#17becf']) * #[ u'#1f77b4', u'#ff7f0e', u'#2ca02c', u'#d62728', u'#9467bd', u'#8c564b', u'#e377c2', u'#7f7f7f', u'#bcbd22', u'#17becf'] + cycler('linestyle', ['-', ]) #'-', '--', ':', '-.' + )) + max_auc=0 + min_auc=1 + #for r_name, d3 in d2.items(): + for r_name in sort_items(d2.keys()): + d3 = d2[r_name] + #print("r_name",r_name) + if "no_grid" not in r_name: + #print(" skip r_name") + continue + if ("no_grid_storage" in r_name or "no_grid_sgen" in r_name): + #print(" skip r_name") + continue + #for n_name, d4 in d3.items(): + for n_name in sort_items(d3.keys()): + #rint("n_name",n_name) + if "add_grid_load_expo_rnd_90" in n_name: + continue + d4 = d3[n_name] + mean_fpr = np.linspace(0, 1, 100) #based on https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc_crossval.html + tprs = [] + aucs = [] + #print(atk_type) + for group, data in d4.items(): + for fpr, tpr in zip(data["fpr"], data["tpr"]): + auc_v = auc(fpr, tpr) + interp_tpr = np.interp(mean_fpr, fpr, tpr) + interp_tpr[0] = 0.0 + tprs.append(interp_tpr) + aucs.append(auc_v) + # ax.plot( + # fpr, + # tpr, + # color="b", + # label=f"ROC {group} {clf_name}", + # lw=2, + # alpha=0.3, + # ) + mean_tpr = np.mean(tprs, axis=0) + mean_tpr[-1] = 1.0 + mean_auc = auc(mean_fpr, mean_tpr) + if round(mean_auc, 2) == 0.66: mean_auc=0.65 + std_auc = np.std(aucs) + max_auc=max(max_auc, mean_auc) + min_auc=min(min_auc, mean_auc) + + n_set = [r for r in n_name.split('.')[1:] if 'reg' not in r and 'norm' not in r] + if "set_3" in n_set: + n_set_name = "Nov.: Basic Grid" + elif "set_13" in n_set: + n_set_name = "Nov.: No Grid" + elif "set_35" in n_set: + n_set_name = "Nov.: Advanced Grid (no noise)" + elif "set_352" in n_set: + perc = [s for s in n_set if "add_grid_load_expo_rnd_" in s][0].split("_")[-1] + if int(perc) > 10: + continue + n_set_name = r"Nov.: Advanced Grid ("+perc+r"\% noise)" + else: + n_set_name = ".".join(n_set) + + r_set = [r_name.split('.')[1]]+[r for r in r_name.split('.')[1:] if 'grid' in r] + if "no_grid_storage" in r_set and "no_grid_sgen" in r_set: + r_set_name = "Reg.: Basic Grid" + elif "no_grid" in r_set: + r_set_name = "Reg.: No Grid" + elif "add_grid_load_expo_static_100" in r_set: + r_set_name = "Reg.: Advanced Grid" + else: + r_set_name = ".".join(r_set) + + ax.plot( + mean_fpr, + mean_tpr, + #color="b", + label=f"{r_set_name} -- {n_set_name} (AUC = {round(mean_auc, 2)} $\pm$ {round(std_auc, 2)})", #{atk_type.replace('None','').replace('FDI','').replace('MAD','')} + #label=f"Mean ROC {atk_type} 
{r_name.split('.')[1:]} {n_name.split('.')[1:]} (AUC = {round(mean_auc, 2)} $\pm$ {round(std_auc, 2)} {len(d4), len(aucs)})", + lw=2, + alpha=0.8, + ) + out_fig_f=out_fig_d+f"{clf_name}_{atk_type}_{round(max_auc,2)}_{round(min_auc,2)}_44.pdf" + + box = ax.get_position() + ax.set_position([box.x0, box.y0, box.width * (8/22), box.height]) + + ax.legend(loc='right', fontsize=font_size, bbox_to_anchor=(2.85,0.505)) + #fig.show() + #ax.legend(loc='best', fontsize=font_size) + + plt.gca().get_yaxis().get_major_formatter().set_useOffset(False) + plt.gca().get_yaxis().get_major_formatter().set_scientific(False) + plt.xlim([0,1]) + plt.ylim([0,1]) + plt.xlabel(r'\textbf{False Positive Rate (FPR)}', fontsize=font_size+2) + plt.ylabel(r'\textbf{True Positive Rate (TPR)}', fontsize=font_size+2) + fig.savefig(out_fig_f) + #plt.show() + plt.close() + # exit() + + + elif args.case == "eval_clf_results4" or args.case == "eval_clf_results4_cpo" or args.case == "eval_clf_results4_dso": + allow_diff_shifts=True + FIX_atk_labels_based_on_CPO_data=True + _totals_dict=defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: {"fpr":[],"tpr":[]}))))))) + out_dict=defaultdict(lambda: deepcopy(_totals_dict)) + df_dict_store=dict() + for dataset in args.dataset: + logger.info(f"{dataset=}") + is_atk=False + if "atk" in dataset: + is_atk=True + atk_dataset = dataset + atk_OutDataDIR = configs[atk_dataset]["OutDataDIR"] + + #fig_out_dir = atk_OutDataDIR.replace(atk_dataset,configs[dataset]["BASE"]+"_"+"all_atks") + + ad = atk_dataset.split("_") + if "new" in atk_dataset: + at_p=ad[4] + at_p="99" + _fig_out_dir = atk_OutDataDIR.replace(atk_dataset,ad[0]+"_"+ad[1]+"_"+ad[2]+"_all_"+at_p) + else: + at_p=ad[3] + at_p="99" + _fig_out_dir = atk_OutDataDIR.replace(atk_dataset,ad[0]+"_"+ad[1]+"_all_"+at_p) + totals_dict=out_dict[_fig_out_dir] + + + dataset = configs[dataset]["BASE"] + TRAIN_START_DATE=configs[dataset]["TRAIN_START_DATE"] + VALIDATION_START_DATE=configs[dataset]["VALIDATION_START_DATE"] + ATK_START_DATE=configs[dataset]["ATK_START_DATE"] + else: + raise ValueError(f"not an attack data set: {dataset}") + + atk_OutDataDIR_full=atk_OutDataDIR + OutDataDIR = configs[dataset]["OutDataDIR"] + if not os.path.isdir(OutDataDIR): + newOutDataDIR = OutDataDIR.replace("data", "data_full") + logger.info(f"changing {OutDataDIR=} to {newOutDataDIR}") + OutDataDIR = newOutDataDIR + atk_OutDataDIR_full = atk_OutDataDIR.replace("data", "data_full") + + ret_d = get_clf_feat_file_dicts(OutDataDIR) + atk_ret_d = get_clf_feat_file_dicts(atk_OutDataDIR_full) + + best_d = load_eval_dicts_clf(OutDataDIR) + + x_path=atk_OutDataDIR+"/clf_results/" + full_actors = [f for f in os.listdir(x_path) if os.path.isdir(os.path.join(x_path, f))] + #for actor in sorted(ret_d.keys()): + for actor in sorted(full_actors): + if args.case == "eval_clf_results4_cpo" and not actor.startswith("CPO"): + continue + if args.case == "eval_clf_results4_dso" and not actor.startswith("DSO"): + continue + + if len(actor.split(".")) > 1: + features = actor.split(".")[1:] + else: + features=["all"] + logger.warning(f"assuming all features for {actor}") + + + actor_prefix = actor.split(".")[0] + actor_prefix_new = (actor_prefix+"_"+args.atk_subset) if args.atk_subset else actor_prefix + actor_new = actor.replace(actor_prefix, actor_prefix_new) + actor_prefix = actor_prefix_new + + atk_act_feat = load_feats(atk_OutDataDIR_full, actor_prefix) + atk_df_dict = {group: df_dict for group, 
df_dict in atk_act_feat} + #act_feat = load_feats(OutDataDIR, actor_prefix) #generator + act_feat = iter_feats(OutDataDIR, actor_prefix) #generator + total=load_feats_len(OutDataDIR, actor_prefix) + #print(actor, actor_prefix) #DSO.all DSO + + + best_d_len = len([k2 for k,v in best_d[actor].items() for k2,v2 in best_d[actor][k].items()]) + + subcase=("is_attack1", "1", ) #any speed diff + subcase=("is_attack2", "1", ) #is_attack flag was set + subcase=("is_attack_th", "1", "0.5") + subcase=("is_attack_th", "3", "0.05") + subcase=("is_attack_th", "1", "0.05") + subcase=("is_attack_abs", "1", "500") + + cut_off=40 + if args.eval_fac is not None: + if "." in args.eval_fac: + cut_off=int(args.eval_fac.split(".")[1]) + subcase=("is_attack_abs", "1", args.eval_fac.split(".")[0]) + + with tqdm(total=best_d_len, desc="eval_clf_results "+actor_new) as pbar: + g_i=0 + for group, df_dict_path in act_feat: + g_i+=1 + best_d_ag = best_d[actor][group] + pbar.set_description(f'eval_clf_results {actor_new} {group} ({g_i}/{total})') + out_d = atk_OutDataDIR+"/clf_results/"+actor_new+"/" + out_f = out_d + group+".csv.gz" + + if not os.path.isfile(out_f): + logger.warning(f"skipping missing file {out_f}") + pbar.update(len(best_d_ag.keys())) + continue + + atk_labels_d = atk_OutDataDIR+"/clf_is_atk_labels/"+actor_new+"/" + atk_labels_f = atk_labels_d + group+".csv.gz" + if not os.path.isfile(atk_labels_f): + logger.warning(f"skipping missing atk label file {atk_labels_f}") + pbar.update(len(best_d_ag.keys())) + continue + + #print(best_d_ag) + + + atk_labels_df = pd.read_csv(atk_labels_f, index_col=0, parse_dates=[0]) + + if False: + grid_col = [c for c in atk_labels_df.columns if "grid_expo_storage." in c and "_diff_to_custom_data_charge_speed_lag_0" in c] + for gc in grid_col: + print(atk_labels_df[atk_labels_df[gc] != 0][gc].abs().sort_values()) + print(atk_labels_df[atk_labels_df["charge_speed_lag_0_diff"] != 0]["charge_speed_lag_0_diff"].abs().sort_values()) + exit() + + out_df = pd.read_csv(out_f, index_col=0, parse_dates=[0]) + out_df["decision_function"] = out_df["decision_function"].apply(lambda x: to_float_list(x)) + #print(out_df) + #print(atk_labels_df) + if "is_attack_th" in subcase: + th_fac = float(subcase[2]) + atk_labels_df["is_attack_th"] = 1 + atk_labels_df["is_attack_th_type"] = "None" + atk_labels_df.loc[atk_labels_df["charge_speed_lag_0_diff"].abs() > th_fac*atk_labels_df["charge_speed_lag_0_diff"].abs().max(), "is_attack_th"] = -1 #diff between reported and should > threshold; captures FDI+/-, MAD+/-, but not MAD+/- with FDI== + atk_labels_df.loc[atk_labels_df["charge_speed_lag_0_diff"].abs() > th_fac*atk_labels_df["charge_speed_lag_0_diff"].abs().max(), "is_attack_th_type"] += "FDI" + atk_labels_df.loc[atk_labels_df["charge_speed_lag_0_diff"] > th_fac*atk_labels_df["charge_speed_lag_0_diff"].abs().max(), "is_attack_th_type"] += "OCPP_inc" + atk_labels_df.loc[atk_labels_df["charge_speed_lag_0_diff"] < -1*th_fac*atk_labels_df["charge_speed_lag_0_diff"].abs().max(), "is_attack_th_type"] += "OCPP_dec" + + grid_fac = th_fac + grid_col = [c for c in atk_labels_df.columns if "grid_expo_storage." 
in c and "_diff_to_custom_data_charge_speed_lag_0" in c] + if len(grid_col)>1: + logger.error(f"too many cols: {grid_col=}") + grid_col = grid_col[0] + atk_labels_df.loc[(atk_labels_df["is_attack_th"] == -1) & (atk_labels_df[grid_col].abs() > grid_fac*atk_labels_df[grid_col].abs().max()), "is_attack_th_type"] += "Combo" + atk_labels_df.loc[(atk_labels_df["is_attack_th"] == -1) & (atk_labels_df[grid_col] > grid_fac*atk_labels_df[grid_col].abs().max()), "is_attack_th_type"] += "Grid_inc" + atk_labels_df.loc[(atk_labels_df["is_attack_th"] == -1) & (atk_labels_df[grid_col] < -1*grid_fac*atk_labels_df[grid_col].abs().max()), "is_attack_th_type"] += "Grid_dec" + + #grid_fac = th_fac*2 + grid_fac = th_fac*2 + #print(atk_labels_df["is_attack_th"].value_counts()) + #grid_col = [c for c in atk_labels_df.columns if "grid_expo_storage." in c and "_diff_to_charge_speed_lag_0" in c][0] + atk_labels_df.loc[(atk_labels_df["charge_speed_lag_0_diff"].abs() == 0) & (atk_labels_df["is_attack_th_type"].str.contains("Grid")==False) & (atk_labels_df[grid_col].abs() > grid_fac*atk_labels_df[grid_col].abs().max()), "is_attack_th"] = -1 #diff between grid measurement and reported > threshold; captures MAD+/- with any FDI (also FDI==) + atk_labels_df.loc[(atk_labels_df["charge_speed_lag_0_diff"].abs() == 0) & (atk_labels_df["is_attack_th_type"].str.contains("Grid")==False) & (atk_labels_df[grid_col].abs() > grid_fac*atk_labels_df[grid_col].abs().max()), "is_attack_th_type"] += "MAD" + atk_labels_df.loc[(atk_labels_df["charge_speed_lag_0_diff"].abs() == 0) & (atk_labels_df["is_attack_th_type"].str.contains("Grid")==False) & (atk_labels_df[grid_col] > grid_fac*atk_labels_df[grid_col].abs().max()), "is_attack_th_type"] += "Grid_inc" + atk_labels_df.loc[(atk_labels_df["charge_speed_lag_0_diff"].abs() == 0) & (atk_labels_df["is_attack_th_type"].str.contains("Grid")==False) & (atk_labels_df[grid_col] < -1*grid_fac*atk_labels_df[grid_col].abs().max()), "is_attack_th_type"] += "Grid_dec" + + atk_labels_df.loc[(atk_labels_df["charge_speed_lag_0_diff"].abs() > 0) & (atk_labels_df["is_attack_th_type"].str.contains("Grid")==False) & (atk_labels_df[grid_col].abs() > grid_fac*atk_labels_df[grid_col].abs().max()), "is_attack_th"] = -1 + atk_labels_df.loc[(atk_labels_df["charge_speed_lag_0_diff"].abs() > 0) & (atk_labels_df["is_attack_th_type"].str.contains("Grid")==False) & (atk_labels_df[grid_col].abs() > grid_fac*atk_labels_df[grid_col].abs().max()), "is_attack_th_type"] += "SmallCombo" + atk_labels_df.loc[(atk_labels_df["charge_speed_lag_0_diff"].abs() > 0) & (atk_labels_df["is_attack_th_type"].str.contains("Grid")==False) & (atk_labels_df[grid_col] > grid_fac*atk_labels_df[grid_col].abs().max()), "is_attack_th_type"] += "Grid_inc" + atk_labels_df.loc[(atk_labels_df["charge_speed_lag_0_diff"].abs() > 0) & (atk_labels_df["is_attack_th_type"].str.contains("Grid")==False) & (atk_labels_df[grid_col] < -1*grid_fac*atk_labels_df[grid_col].abs().max()), "is_attack_th_type"] += "Grid_dec" + + if False: + atk_labels_df.loc[(atk_labels_df["is_attack2"] == 1), "is_attack_th"] = 1 #clean up based on original is_attack flag + + # _fac_2 = 0.5 + # atk_labels_df.loc[atk_labels_df["charge_speed_lag_0_diff"].abs() > _fac_2*atk_labels_df["charge_speed_lag_0_diff"].abs().max(), "is_attack_th"] = -1 + # atk_labels_df.loc[atk_labels_df[grid_col].abs() > _fac_2*atk_labels_df[grid_col].abs().max(), "is_attack_th"] = -1 + # print(atk_labels_df) + # exit() + elif "is_attack_abs" in subcase: + th_fac = float(subcase[2]) + atk_labels_df["is_attack_abs"] 
= 1 + atk_labels_df["is_attack_th_type"] = "None" + atk_labels_df.loc[atk_labels_df["charge_speed_lag_0_diff"].abs() > th_fac, "is_attack_abs"] = -1 #diff between reported and should > threshold; captures FDI+/-, MAD+/-, but not MAD+/- with FDI== + atk_labels_df.loc[atk_labels_df["charge_speed_lag_0_diff"].abs() > th_fac, "is_attack_th_type"] += "FDI" + atk_labels_df.loc[atk_labels_df["charge_speed_lag_0_diff"] > th_fac, "is_attack_th_type"] += "OCPP_inc" + atk_labels_df.loc[atk_labels_df["charge_speed_lag_0_diff"] < -1*th_fac, "is_attack_th_type"] += "OCPP_dec" + + grid_fac = th_fac + grid_col = [c for c in atk_labels_df.columns if "grid_expo_storage." in c and "_diff_to_custom_data_charge_speed_lag_0" in c] + if len(grid_col)>1: + logger.error(f"too many cols: {grid_col=}") + grid_col = grid_col[0] + atk_labels_df.loc[(atk_labels_df[grid_col].abs() > grid_fac), "is_attack_abs"] = -1 + atk_labels_df.loc[(atk_labels_df[grid_col].abs() > grid_fac), "is_attack_th_type"] += "MAD" + atk_labels_df.loc[(atk_labels_df[grid_col] > grid_fac), "is_attack_th_type"] += "Grid_inc" + atk_labels_df.loc[(atk_labels_df[grid_col] < -1*grid_fac), "is_attack_th_type"] += "Grid_dec" + + if True: + atk_labels_df.loc[(atk_labels_df["is_attack2"] == 1), "is_attack_abs"] = 1 #clean up based on original is_attack flag + + + #TODO: try diff definitions + atk_labels_df["is_attack"] = atk_labels_df[subcase[0]] + atk_labels_df["ocpp_diff"] = atk_labels_df["charge_speed_lag_0_diff"] + atk_labels_df["grid_diff"] = atk_labels_df[grid_col] + # print(atk_labels_df[atk_labels_df["is_attack"] == -1]) + # exit() + if (actor_prefix,group) not in df_dict_store: + df_dict = load_single_feat(df_dict_path) + df_dict_store[(actor_prefix,group)] = df_dict + g_max_speed = df_dict_store[(actor_prefix,group)]["cp_g_df"]["charge_speed_lag_0"].max() + atk_labels_df["ocpp_diff_rel"] = atk_labels_df["ocpp_diff"] / g_max_speed + atk_labels_df["grid_diff_rel"] = atk_labels_df["grid_diff"] / g_max_speed + atk_labels_df["ocpp_diff_rel"] *= 100 + atk_labels_df["grid_diff_rel"] *= 100 + #print(atk_labels_df) + + #exit() + #fig, ax = plt.subplots() + for atk_type in atk_labels_df["is_attack_th_type"].drop_duplicates(): + #if atk_type!="NoneFDIOCPP_decMADGrid_inc":continue + if atk_type == "None": + for idx,row in out_df.iterrows(): + if row["clf"].split(".")[0] != "IsolationForest":continue + if row["clf_r"].split(".")[0] != "LinearSVR":continue + for diff_t in ["ocpp_diff_rel", "grid_diff_rel"]: + for atk_size in range(-50,51,2): + atk_size_l = diff_t+"."+str(atk_size) + atk_labels_df_type = atk_labels_df.copy() + atk_labels_df_type["decision_function"] = row["decision_function"] + + if atk_size < -2: + atk_labels_df_type = atk_labels_df_type[atk_labels_df_type[diff_t] <= atk_size] + elif atk_size > 2: + atk_labels_df_type = atk_labels_df_type[atk_labels_df_type[diff_t] >= atk_size] + else: + continue + + if len(atk_labels_df_type[atk_labels_df_type["is_attack"] == -1]) == 0: + continue + # print(atk_labels_df_type[atk_labels_df_type["is_attack"] == -1]) + # exit() + + clf_name = row["clf"].split(".")[0] + clf_full = row["clf"].replace("."+row["clf_r"],"") + clf_r_name = row["clf_r"].split(".")[0] + clf_n_params=".".join(clf_full.split(".")[1:]) + clf_r_params=".".join(row["clf_r"].split(".")[1:]) + + y = atk_labels_df_type["is_attack"].values + y_pred = [1 if x >= round(row["clf_o"],2) else -1 for x in atk_labels_df_type["decision_function"].values] + tn, fp, fn, tp = confusion_matrix(y, y_pred, labels=[1, -1]).ravel() + #accuracy = 
accuracy_score(y, y_pred) + if (tp+fn) != 0: tpr = tp/(tp+fn) + else: tpr=0 + if (fp+tn) != 0: fpr = fp/(fp+tn) + else: fpr=0 + + totals_dict[actor_prefix][clf_name+"_"+clf_r_name]["overall"][row["clf_r"]][clf_full][group][atk_size_l]["fpr"].append(fpr) + totals_dict[actor_prefix][clf_name+"_"+clf_r_name]["overall"][row["clf_r"]][clf_full][group][atk_size_l]["tpr"].append(tpr) + elif False: + for idx,row in out_df.iterrows(): + if row["clf"].split(".")[0] != "IsolationForest":continue + if row["clf_r"].split(".")[0] != "LinearSVR":continue + for diff_t in ["ocpp_diff_rel", "grid_diff_rel"]: + for atk_size in range(-50,51,2): + atk_size_l = diff_t+"."+str(atk_size) + # print(row) + # print(row["clf_o"]) + # print(best_d_ag[row["clf"]]["offset"]) + # exit() + atk_labels_df_c = atk_labels_df.copy() + if len(atk_labels_df_c) != len(row["decision_function"]): + raise Exception(f'len(atk_labels_df_c) != len(row["decision_function"]) -- {len(atk_labels_df_c)} != {len(row["decision_function"])}') + atk_labels_df_c["decision_function"] = row["decision_function"] + atk_labels_df_type = atk_labels_df_c[((atk_labels_df_c["is_attack_th_type"] == atk_type) & (atk_labels_df_c["is_attack"] == -1)) | ((atk_labels_df_c["is_attack_th_type"] == "None") & (atk_labels_df_c["is_attack"] == 1))] + + if atk_size < -2: + atk_labels_df_type = atk_labels_df_type[atk_labels_df_type[diff_t] <= atk_size] + elif atk_size > 2: + atk_labels_df_type = atk_labels_df_type[atk_labels_df_type[diff_t] >= atk_size] + else: + continue + + if len(atk_labels_df_type[atk_labels_df_type["is_attack"] == -1]) == 0: + #logger.warning(f"no attacks for {atk_type}") + continue + # print(atk_labels_df_type[atk_labels_df_type["is_attack"] == -1]) + # exit() + + + if True:#drop concecutive repeats to avoid inflation of TPR wrt. 
ROC of imprecise classifiers + #print(atk_labels_df_type) + #print(df_dict_path) + #df_dict = load_single_feat(df_dict_path) + #print(df_dict["cp_g_df"]) + #print(out_df) + cs_typedf = atk_df_dict[group]["cp_g_df"].loc[atk_labels_df_type.index]["charge_speed_lag_0"].to_frame() + cs_typedf["is_attack"] = atk_labels_df_type["is_attack"] + #print(cs_typedf) + # 1 2 2 3 0 0 0 0 0 3 + # _ _ 2 _ _ 0 0 0 0 _ + # _ _ _ _ _ _ 0 0 0 _ + labels = (cs_typedf["charge_speed_lag_0"] != cs_typedf["charge_speed_lag_0"].shift()).cumsum() + cs_typedf['flag'] = (labels.map(labels.value_counts()) >= cut_off).astype(int) + cs_typedf_not_same = cs_typedf[(cs_typedf["flag"] == 0) | (cs_typedf["is_attack"] == -1)] #deflate but keep all atks + # print(cs_typedf_not_same) + # print(atk_labels_df_type) + atk_labels_df_type = atk_labels_df_type.loc[cs_typedf_not_same.index] + + if False: + fig, ax = plt.subplots(figsize=(16,9)) + ax.plot(cs_typedf.index, cs_typedf["charge_speed_lag_0"], 'o', label="charge_speed_lag_0") + ax.plot(cs_typedf_not_same.index, cs_typedf_not_same["charge_speed_lag_0"], 'o', label="charge_speed_lag_0 not same") + plt.legend() + plt.show() + plt.close() + exit() + + clf_name = row["clf"].split(".")[0] + clf_full = row["clf"].replace("."+row["clf_r"],"") + clf_r_name = row["clf_r"].split(".")[0] + clf_n_params=".".join(clf_full.split(".")[1:]) + clf_r_params=".".join(row["clf_r"].split(".")[1:]) + + y = atk_labels_df_type["is_attack"].values + y_pred = [1 if x >= row["clf_o"] else -1 for x in atk_labels_df_type["decision_function"].values] + #y_pred = [1 if x >= 0 else -1 for x in atk_labels_df_type["decision_function"].values] + tn, fp, fn, tp = confusion_matrix(y, y_pred, labels=[1, -1]).ravel() + #accuracy = accuracy_score(y, y_pred) + if (tp+fn) != 0: tpr = tp/(tp+fn) + else: tpr=0 + if (fp+tn) != 0: fpr = fp/(fp+tn) + else: fpr=0 + + totals_dict[actor_prefix][clf_name+"_"+clf_r_name][atk_type][row["clf_r"]][clf_full][group][atk_size_l]["fpr"].append(fpr) + totals_dict[actor_prefix][clf_name+"_"+clf_r_name][atk_type][row["clf_r"]][clf_full][group][atk_size_l]["tpr"].append(tpr) + + #plt.show() + #exit() + pbar.update(len(best_d_ag.keys())) + + plt.rc('text', usetex=USE_TEX) + plt.rc('font', family='serif') + font_size=22 + plt.rc('xtick',labelsize=font_size) + plt.rc('ytick',labelsize=font_size) + plt.rc('legend',fontsize=font_size) + + + for fig_out_dir,totals_dict_ac in out_dict.items(): + fig_out_dir += "_comp" + if FIX_atk_labels_based_on_CPO_data: + fig_out_dir += "_fix" + for actor_new,totals_dict in totals_dict_ac.items(): + if len(totals_dict)>0: + out_fig_d = fig_out_dir+("/clf_result_figs_"+args.case.replace("eval_clf_results","")+"/")+actor_new+"/"+"-".join(subcase)+"/" + Path(out_fig_d).mkdir(parents=True, exist_ok=True) + fig_dict=defaultdict(lambda:defaultdict(lambda:dict())) + for clf_name, d in tqdm(totals_dict.items(), desc=f"saving eval_clf_results2 {actor_new} {fig_out_dir}"): + #for clf_n, d in totals_dict.items(): + for atk_type, d2 in d.items(): + #for r_name, d3 in d2.items(): + for r_name in d2.keys(): + d3 = d2[r_name] + #fig_out_dir+="_"+r_name + for n_name in d3.keys(): + #rint("n_name",n_name) + #fig_out_dir+="_"+n_name + fig, ax = plt.subplots(figsize=(22,8)) + plt.rc('axes', prop_cycle=( + #cycler('linestyle', ['-', '--', ':', '-.']) * #'-', '--', ':', '-.' 
+ #cycler('color', [u'#1f77b4', u'#ff7f0e', u'#2ca02c', u'#d62728',]) #[ u'#1f77b4', u'#ff7f0e', u'#2ca02c', u'#d62728', u'#9467bd', u'#8c564b', u'#e377c2', u'#7f7f7f', u'#bcbd22', u'#17becf'] + cycler('color', [ u'#1f77b4', u'#ff7f0e', u'#2ca02c', u'#d62728', u'#9467bd', u'#8c564b', u'#e377c2', u'#7f7f7f', u'#bcbd22', u'#17becf']) * #[ u'#1f77b4', u'#ff7f0e', u'#2ca02c', u'#d62728', u'#9467bd', u'#8c564b', u'#e377c2', u'#7f7f7f', u'#bcbd22', u'#17becf'] + cycler('linestyle', ['-', ]) #'-', '--', ':', '-.' + )) + + d4 = d3[n_name] + tpr_dict=defaultdict(lambda:list()) + fpr_dict=defaultdict(lambda:list()) + for group, data in d4.items(): + for atk_size_l, data2 in data.items(): + #print(atk_size_l, data2) + for fpr, tpr in zip(data2["fpr"], data2["tpr"]): + fpr_dict[atk_size_l].append(fpr) + tpr_dict[atk_size_l].append(tpr) + + # print(fpr_dict) + # print(tpr_dict) + plot_fpr=defaultdict(lambda:list()) + plot_tpr=defaultdict(lambda:list()) + total_fpr=list() + for diff_t in ["ocpp_diff_rel", "grid_diff_rel"]: + for atk_size in range(-50,51,2): + atk_size_l = diff_t+"."+str(atk_size) + if atk_size_l in fpr_dict: + plot_fpr[diff_t].append(mean(fpr_dict[atk_size_l])) + total_fpr.append(mean(fpr_dict[atk_size_l])) + else: + plot_fpr[diff_t].append(0) + if atk_size_l in tpr_dict: + plot_tpr[diff_t].append(mean(tpr_dict[atk_size_l])) + else: + plot_tpr[diff_t].append(0) + plot_atk_size=[atk_size for atk_size in range(-50,51,2)] + plot_mean_fpr=[mean(total_fpr) for atk_size in range(-50,51,2)] + # print(plot_fpr) + # print(plot_tpr) + # print(plot_atk_size) + + for diff_t in ["ocpp_diff_rel", "grid_diff_rel"]: + if diff_t == "ocpp_diff_rel": + diff_t_label="Change in Reported Load" + elif diff_t == "grid_diff_rel": + diff_t_label="Change in Grid Load" + + ax.plot( + plot_atk_size, + plot_tpr[diff_t], + #color="b", + label=f"TPR over {diff_t_label}", #{atk_type.replace('None','').replace('FDI','').replace('MAD','')} + #label=f"TPR over {diff_t_label} (mean = {round(mean([v for v in plot_tpr[diff_t] if v != 0]), 2)})", #{atk_type.replace('None','').replace('FDI','').replace('MAD','')} + lw=2, + alpha=0.8, + ) + # ax.plot( + # plot_atk_size, + # plot_fpr[diff_t], + # #color="b", + # label=f"FPR over {diff_t_label} (mean = {round(mean(plot_fpr[diff_t]), 2)})", #{atk_type.replace('None','').replace('FDI','').replace('MAD','')} + # lw=2, + # alpha=0.8, + # ) + ax.plot( + plot_atk_size, + plot_mean_fpr, + #color="b", + label=f"FPR (mean = {round(mean(plot_mean_fpr), 2)})", #{atk_type.replace('None','').replace('FDI','').replace('MAD','')} + lw=2, + alpha=0.8, + ) + out_fig_f=out_fig_d+f"{r_name}_{n_name}_{atk_type}_44.pdf" + # print(r_name,n_name) + # print(out_fig_f) + + box = ax.get_position() + ax.set_position([box.x0, box.y0, box.width * (8/22)*0.8, box.height*0.8]) + + ax.legend(loc='right', fontsize=font_size, bbox_to_anchor=(2.95,0.505)) + #fig.show() + #ax.legend(loc='best', fontsize=font_size) + plt.xticks([-50,-25,0,25,50]) + + plt.gca().get_yaxis().get_major_formatter().set_useOffset(False) + plt.gca().get_yaxis().get_major_formatter().set_scientific(False) + plt.ylim([0,1]) + plt.xlim([-50,50]) + plt.xlabel(r'\textbf{Min. 
Attack Magnitude}', fontsize=font_size+2) + plt.ylabel(r'\textbf{TPR/FPR}', fontsize=font_size+2) + if os.path.isfile(out_fig_f): + logger.warning(f"overwriting {out_fig_f=}") + fig.savefig(out_fig_f) + #plt.show() + plt.close() + # exit() \ No newline at end of file diff --git a/ids/load_data.py b/ids/load_data.py new file mode 100644 index 0000000000000000000000000000000000000000..321d73c501a0b89de28d2edef84fd3fb0d4030bc --- /dev/null +++ b/ids/load_data.py @@ -0,0 +1,603 @@ + +from collections import defaultdict +from copy import deepcopy +import gzip +import json +import pickle +import os +from typing import Dict, List, Tuple +import pandas as pd +from tqdm import tqdm +import yaml +import matplotlib.pyplot as plt + +import ast +import re + +import logging + +from features_aux import get_grid_measurements_from_export, get_grid_pp + +logger = logging.getLogger("WATTSON_EV_IDS.Load_Data") + +# os.environ["QT_QPA_PLATFORM"] = "wayland" +#os.environ["QT_QPA_PLATFORM"] = "xcb" +# py load_data.py -c=print_data -v test + + + +START_DATE="2023-11-01 22:13:00" + + +def get_dataset_l(configs): + #atk_1_0.2_powerowl_example + atk_regex = re.compile(r"^.*?\/(atk_[\d\.]+_[\d\.]+)_[\w\d_]+$") + + clean_dsets=[] + for dataset in list(configs.keys()): + if "atk" in dataset: + clean_dsets.append(dataset) + for subdir in configs[dataset]["DIR"]: + result = atk_regex.search(subdir) + if result: + new_dset=dataset.replace("_atk","")+"_"+result.group(1) + clean_dsets.append(new_dset) + configs[new_dset] = deepcopy(configs[dataset]) + configs[new_dset]["DIR"]=subdir + configs[new_dset]["OutDataDIR"] = "data/"+new_dset + else: + logger.error(f"get_dataset_l unk atk dir {subdir}") + else: + clean_dsets.append(dataset) + return clean_dsets + + +def clean_dataset_l(datasets, configs, update_config=False): + #atk_1_0.2_powerowl_example + atk_regex = re.compile(r"^.*?\/(atk_[\d\.]+_[\d\.]+)_[\w\d_]+$") + + clean_dsets=[] + for dataset in datasets: + if dataset.endswith("_atk"): + for subdir in configs[dataset]["DIR"]: + result = atk_regex.search(subdir) + if result: + new_dset=dataset.replace("_atk","")+"_"+result.group(1) + clean_dsets.append(new_dset) + if update_config: + configs[new_dset] = deepcopy(configs[dataset]) + configs[new_dset]["DIR"]=subdir + configs[new_dset]["OutDataDIR"] = "data/"+new_dset + else: + logger.error(f"clean_dataset_l unk atk dir {subdir}") + else: + clean_dsets.append(dataset) + return clean_dsets + +def print_data_dict(file_dict): + for d_type, x_path_dict in file_dict.items(): + logger.info(d_type) + for x_path, f_list in x_path_dict.items(): + logger.info("\t"+x_path) + logger.info(f"\t\t{len(f_list)}") + +def print_data_df(file_df): + logger.info(file_df) + + +def get_file_dfs(datasets, configs) -> Dict[str, pd.DataFrame]: + file_dict = {} + file_dfs = {} + for dataset in datasets: + DIR = configs[dataset]["DIR"] + file_dict[dataset] = [] + # file_dict[dataset]["controller-export"] = {} + # file_dict[dataset]["logs"] = {} + + for x in ["estimation", "measurements", "power-grid"]: + x_path = os.path.join(DIR, "controller-export", x) + onlyfiles = [f for f in os.listdir(x_path) if os.path.isfile(os.path.join(x_path, f))] + #file_dict[dataset]["controller-export"][x] = onlyfiles + for f in onlyfiles: + file_dict[dataset].append({"type":"controller-export", "sub_type":x, "x_path":x_path, "file":f}) + + node_regex = re.compile(r"^n\d+$") + sub_folders = [name for name in os.listdir(DIR) if os.path.isdir(os.path.join(DIR, name))] + for x in [f for f in sub_folders if 
f.startswith("CPO_") or f.startswith("NodeCP_") or node_regex.match(f)]: + x_path = os.path.join(DIR, x) + #/home/dk/git/wattson-artifacts/test_powerowl_example_2023-12-22-18-22-51/CPO_0/CPO_0-service-34.log + onlyfiles = [f for f in os.listdir(x_path) if os.path.isfile(os.path.join(x_path, f))] + #file_dict[dataset]["logs"][x] = onlyfiles + for f in onlyfiles: + file_dict[dataset].append({"type":"logs", "sub_type":x, "x_path":x_path, "file":f}) + + file_dfs[dataset] = pd.DataFrame().from_dict(file_dict[dataset]) + return file_dfs + + +def get_cpo_ocpp_data(file_df) -> Dict[str, Dict[str, List[Tuple[str, pd.DataFrame]]]]: + regex = re.compile(r"^\d{1,4}-\d{1,2}-\d{1,2} \d{1,2}:\d{1,2}:\d{1,2},\d{1,3} - CPOService - INFO - [\d\.]+ - on_tx (.*?)$") + cpo_tx_df_cps={} + for cpo_id in [s for s in file_df["sub_type"].drop_duplicates() if s.startswith("CPO_")]: + cpo_log = file_df[(file_df["type"] == "logs") & (file_df["sub_type"] == cpo_id) & (file_df["file"] == "CPO_0-service-34.log")][["x_path", "file"]] + cpo_log_file = os.path.join(*cpo_log.iloc[0].values) + logger.info(cpo_log_file) + + num_lines = sum(1 for line in open(cpo_log_file,'r')) + tx_data = [] + with open(cpo_log_file) as f: + for line in tqdm(f, total=num_lines, desc=f"Loading {cpo_id}"): + if "ERROR" in line or "Traceback" in line: + logger.error(f"found error in {cpo_log_file}:\t{line}") + result = regex.search(line) + #2023-12-23 01:39:29,777 - CPOService - INFO - 140672877031424.35739 - on_tx ('8ada5dedf167ddcb685bcc4e679570651', 'CP_14', '2023-01-01T22:57:07.401781', '2023-01-01T23:56:54.334702', [{'timestamp': '2023-01-01T23:45:00', 'sampled_value': [{'value': 659575}]}], {'vendor_id': 'wattson_v2g', 'atk_type': 'None'}) + if result: + try: + #('8ada5dedf167ddcb685bcc4e679570651', 'CP_14', '2023-01-01T22:57:07.401781', '2023-01-01T23:56:54.334702', [{'timestamp': '2023-01-01T23:45:00', 'sampled_value': [{'value': 659575}]}], {'vendor_id': 'wattson_v2g', 'atk_type': 'None'}) + #('1e9ce1f6770ed662e8810c827bf8f2611', 'CP_11', '2023-11-09T18:01:58.949633', '2023-11-09T18:00:02.158313', 'Updated', [{'timestamp': '2023-11-09T18:00:00', 'sampled_value': [{'value': 8905865}]}], {'vendor_id': 'wattson_v2g', 'atk_type': 'None'}) + g = result.group(1) + g_lit = ast.literal_eval(g) + except ValueError as e: + if "nan" in g or "inf" in g: + # g = g.replace("nan", "0") + # g_lit = ast.literal_eval(g) + # else: + logger.warning(f"nan or inf in {result.group(0)}") + continue + else: raise e + + tx_msg = { + "cp_c_id": g_lit[0], + "cp_group": g_lit[1], + "sim_time": g_lit[2], + "timestamp": g_lit[3], #tx msg timestamp + "event": g_lit[4], # + "meter_value_ts": g_lit[5][0]["timestamp"], #meter read timestamp + "meter_value_sampled_value": g_lit[5][0]["sampled_value"][0]["value"], + # "custom_data_vendor_id": g_lit[6]['vendor_id'], + # "custom_data_atk_type": g_lit[6]['atk_type'], + } + for k,v in g_lit[6].items(): + #{'vendor_id': 'wattson_v2g', 'atk_type': 'MAD', 'meter_no_atk': 7285447, 'energy_interval': 2.539881115023535, 'original_energy_interval': 2.43, 'average_power': 10.137, 'original_average_power': 9.69845} + tx_msg["custom_data_"+k] = v + tx_data.append(tx_msg) + + tx_df = pd.DataFrame(tx_data) + logger.info(tx_df.iloc[0:]) + tx_df["sim_time"] = pd.to_datetime(tx_df["sim_time"]) + tx_df["timestamp"] = pd.to_datetime(tx_df["timestamp"]) + tx_df["meter_value_ts"] = pd.to_datetime(tx_df["meter_value_ts"]) + #with pd.option_context('display.max_rows', 100, 'display.max_columns', 9): + logger.info(tx_df.iloc[0:]) + 
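+ # The per-CP loop below converts the cumulative meter readings into power values: charge_speed [W] = (meter_value[t] - meter_value[t-1]) [Wh] / elapsed_time [h]; the same computation on custom_data_meter_no_atk yields the attack-free reference speed custom_data_charge_speed.
+ # Illustrative example (made-up numbers): a meter going from 659575 Wh to 664575 Wh within 30 minutes corresponds to (664575 - 659575) / 0.5 = 10000 W.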
logger.debug(tx_df.columns) + + tx_df_cps=defaultdict(lambda: []) + for cp in tqdm(tx_df["cp_c_id"].drop_duplicates(), desc="Loading cp_c_id"): + # logger.debug(cp) + tx_df_cp = tx_df[tx_df["cp_c_id"] == cp].copy() + + tx_df_cp = tx_df_cp.sort_values(by="meter_value_ts", ascending=True) + tx_df_cp["meter_diff"] = tx_df_cp["meter_value_sampled_value"] - tx_df_cp["meter_value_sampled_value"].shift(1) + tx_df_cp["meter_diff"].fillna(0, inplace=True) + tx_df_cp["time_diff"] = (tx_df_cp["meter_value_ts"] - tx_df_cp["meter_value_ts"].shift(1)).apply(lambda x: x.total_seconds()) + tx_df_cp["time_diff"] = tx_df_cp["time_diff"] / (60*60) #s to h + tx_df_cp["charge_speed"] = tx_df_cp["meter_diff"] / tx_df_cp["time_diff"] #wh in w + tx_df_cp["charge_speed"].fillna(0, inplace=True) + + tx_df_cp["custom_data_meter_diff"] = tx_df_cp["custom_data_meter_no_atk"] - tx_df_cp["custom_data_meter_no_atk"].shift(1) + tx_df_cp["custom_data_meter_diff"].fillna(0, inplace=True) + tx_df_cp["custom_data_charge_speed"] = tx_df_cp["custom_data_meter_diff"] / tx_df_cp["time_diff"] #wh in w + tx_df_cp["custom_data_charge_speed"].fillna(0, inplace=True) + for c in ["custom_data_energy_interval", "custom_data_original_energy_interval", "custom_data_average_power", "custom_data_original_average_power"]: + tx_df_cp[c].fillna(0, inplace=True) + + # logger.debug(tx_df_cp) + # logger.debug(tx_df_cp[tx_df_cp["charge_speed"] < 0]) + # exit() + + tx_df_cp = tx_df_cp.set_index("meter_value_ts") + tx_df_cp = tx_df_cp.sort_index(ascending=True) + if len(tx_df_cp[tx_df_cp["charge_speed"] < 0]) > 0: + logger.warning(f"tx_df_cp {cp} w/ <0 speed") + logger.warning(tx_df_cp) + logger.warning(tx_df_cp[tx_df_cp["charge_speed"] < 0]) + logger.warning(tx_df_cp.index[0]) + exit() + + if False: #TODO: START_DATE? + tx_df_cp = tx_df_cp[["cp_group", "charge_speed"]] + tx_df_cp.loc[tx_df_cp.index[0] - pd.Timedelta(seconds=1)] = {"cp_group":tx_df_cp["cp_group"].iloc[0],"charge_speed":0} + tx_df_cp.loc[pd.to_datetime(START_DATE)] = {"cp_group":tx_df_cp["cp_group"].iloc[0],"charge_speed":0} + + tx_df_cp = tx_df_cp.sort_index(ascending=True) + + #tx_df_cp = tx_df_cp.asfreq(freq='5Min', method='bfill') + if False: #TODO: resample? 
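+ # (disabled) If enabled, the resampling below would backfill each CP series onto a fixed 5-minute grid, so the per-CP frames share common timestamps and can later be summed per charging group; the raw OCPP meter timestamps are otherwise irregular.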
+ tx_df_cp = tx_df_cp.resample('5Min', offset="0s").bfill() + # logger.debug(tx_df_cp.iloc[0:]) + tx_df_cps[tx_df_cp["cp_group"].iloc[0]].append((cp, tx_df_cp)) + cpo_tx_df_cps[cpo_id] = tx_df_cps + return cpo_tx_df_cps + +def plot_cpo_ocpp_data(tx_df_cps): + fig0, axes0 = plt.subplots() + for k,v in sorted(tx_df_cps.items()): + logger.debug(k,len(v)) + tx_df_group = pd.concat([_v[1] for _v in v]) + tx_df_group = tx_df_group.reset_index(drop=False) + # logger.debug(tx_df_group.iloc[0:]) + tx_df_group = tx_df_group[["meter_value_ts", "charge_speed"]] + + tx_df_group_sum = tx_df_group.groupby("meter_value_ts").sum() + # logger.debug(tx_df_group_sum.iloc[0:]) + # logger.debug(tx_df_group_sum.columns) + # break + tx_df_group_sum.plot(ax=axes0, y='charge_speed', label=k) + # for _v in v: + # #tx_df[tx_df["cp_c_id"] == _v[0]].plot(ax=axes0, x="meter_value_ts", y='charge_speed') + # _v[1].plot(ax=axes0, y='charge_speed') + #break + # plt.show() + # plt.close() + + +def get_dso_oscp_data(file_df, ts_string = "sim_time", meas_string = "charge_speed", time_offset_in_h=0) -> Dict[str, pd.DataFrame]: + #/home/dk/git/wattson-artifacts/test_powerowl_example_2023-12-22-18-22-51/n375/n375-service-35.log + dso_oscp = file_df[(file_df["type"] == "logs") & (file_df["sub_type"] == "n375")][["x_path", "file"]] + dso_oscp_log = dso_oscp[(dso_oscp["file"] == "n375-service-35.log")] + dso_oscp_log_file = os.path.join(*dso_oscp_log.iloc[0].values) + + regex_dso = re.compile("^\d{1,4}-\d{1,2}-\d{1,2} \d{1,2}:\d{1,2}:\d{1,2},\d{1,3} - DSO_OSCP - INFO - [\d\.]+ - handleUpdateGroupMeasurements (.*?)$") + num_lines = sum(1 for line in open(dso_oscp_log_file,'r')) + dso_tx_data = [] + with open(dso_oscp_log_file) as f: + for line in tqdm(f, total=num_lines, desc=f"Loading {dso_oscp_log['file'].iloc[0]}"): + if "ERROR" in line or "Traceback" in line: + logger.error(f"found error in {dso_oscp_log_file}:\t{line}") + result = regex_dso.search(line) + if result: + try: + #({'group_id': 'CP_4', 'measurements': [{'value': 9, 'phase': 'ALL', 'unit': 'WH', 'energy_type': 'FLEXIBLE', 'direction': 'NET', 'measure_time': '2023-01-02T14:05:00', 'initial_measure_time': '2023-01-01T17:34:05'}]}, 'wjm1neFbwSEVkns5IYstFsheEWR3tsueIWJbJvVFCtE') + g = result.group(1) + #('2023-11-03T09:24:52.617660', 0.25, {'group_id': 'CP_9', 'measurements': [{'value': 2533, 'phase': 'ALL', 'unit': 'WH', 'energy_type': 'FLEXIBLE', 'direction': 'NET', 'measure_time': '2023-11-03T10:12:14', 'initial_measure_time': '2023-11-02T21:42:00'}]}, 'Xng5QTKs7KgjYyJL-ygXl259BVYjUdYROq96ojOj9eI') + g_lit = ast.literal_eval(g) + except ValueError as e: + if "nan" in g or "inf" in g: + # g = g.replace("nan", "0") + # g_lit = ast.literal_eval(g) + # else: + logger.error(f"ValueError {e=} in {result.group(0)}") + continue + else: raise e + sim_time = g_lit[0] + oscp_interval_h = g_lit[1] + g_lit_d = g_lit[2] + tx_msg = { + "sim_time": sim_time, + "oscp_interval_h": oscp_interval_h, + "group_id": g_lit_d["group_id"], + # "token": g_lit[3], + } + for k,v in g_lit_d["measurements"][0].items(): + tx_msg["measurements_"+k] = v + dso_tx_data.append(tx_msg) + + dso_tx_df = pd.DataFrame(dso_tx_data) + dso_tx_df["sim_time"] = pd.to_datetime(dso_tx_df["sim_time"]) + dso_tx_df["measurements_measure_time"] = pd.to_datetime(dso_tx_df["measurements_measure_time"]) - pd.Timedelta(hours=time_offset_in_h) + dso_tx_df["measurements_initial_measure_time"] = pd.to_datetime(dso_tx_df["measurements_initial_measure_time"]) + + + # dso_tx_df["measurements_value"] = 
dso_tx_df["measurements_value"] / dso_tx_df["oscp_interval_h"] + oscp_interval = dso_tx_df["oscp_interval_h"].mean() * 60 + logger.info(dso_tx_df.iloc[0:]) + logger.debug(dso_tx_df.columns) + + dso_tx_df_cps = dso_tx_df["group_id"].drop_duplicates() + #fig, axes = plt.subplots(nrows=len(dso_tx_df_cps), ncols=1) + + # ts_strings = ["measurements_measure_time", "sim_time"] + # ts_string = "measurements_measure_time" + # ts_string = "sim_time" + # meas_string = "charge_speed" + # meas_string = "measurements_value" + # fig, axes = plt.subplots() + # for i,cp_g in enumerate(dso_tx_df_cps): + # dso_tx_df_cp = dso_tx_df[dso_tx_df["group_id"] == cp_g] + # dso_tx_df_cp = dso_tx_df_cp.sort_values(by=ts_string, ascending=True) + # dso_tx_df_cp.plot(ax=axes, x='measurements_measure_time', y='measurements_value', label=cp_g) + + # plt.show() + # fig.close() + + # for ts_string in ts_strings: + # fig2, axes2 = plt.subplots() + + dso_tx_df_cp_dict = {} + for i,cp_g in enumerate(sorted(dso_tx_df_cps)): + try: + dso_tx_df_cp = dso_tx_df[dso_tx_df["group_id"] == cp_g] + # dso_tx_df_cp = dso_tx_df_cp[::4] + + dso_tx_df_cp = dso_tx_df_cp.sort_values(by=ts_string, ascending=True) + if len(dso_tx_df_cp.loc[dso_tx_df_cp[ts_string].duplicated(keep="first")]) > 0: + logger.info(dso_tx_df_cp[dso_tx_df_cp[ts_string].duplicated(keep=False)]) + logger.info(dso_tx_df_cp[dso_tx_df_cp[ts_string].duplicated(keep="last")].index[0]) + logger.info(dso_tx_df_cp[dso_tx_df_cp[ts_string].duplicated(keep="first")].index[0]) + dso_tx_df_cp_t_i1 = dso_tx_df_cp[dso_tx_df_cp[ts_string].duplicated(keep="first")].index + dso_tx_df_cp_t_i2 = dso_tx_df_cp[dso_tx_df_cp[ts_string] > max(dso_tx_df_cp.loc[dso_tx_df_cp_t_i1][ts_string])].index + dso_tx_df_cp.loc[dso_tx_df_cp_t_i1 , ts_string] += pd.Timedelta(hours=1) + dso_tx_df_cp.loc[dso_tx_df_cp_t_i2 , ts_string] += pd.Timedelta(hours=1) + + #dso_tx_df_cp = dso_tx_df_cp.iloc[0::8] + + # print(dso_tx_df_cp) + if True: + OSCP_INTERVAL_M=15 + dso_tx_df_cp["meter_diff"] = dso_tx_df_cp["measurements_value"] - dso_tx_df_cp["measurements_value"].shift(1) + dso_tx_df_cp["meter_diff"].fillna(0, inplace=True) + dso_tx_df_cp["time_diff"] = OSCP_INTERVAL_M / 60 + dso_tx_df_cp["charge_speed"] = dso_tx_df_cp["meter_diff"] / dso_tx_df_cp["time_diff"] #wh in w + dso_tx_df_cp["charge_speed"].fillna(0, inplace=True) + if False: + dso_tx_df_cp["meter_diff"] = dso_tx_df_cp["measurements_value"] - dso_tx_df_cp["measurements_value"].shift(1) + dso_tx_df_cp["meter_diff"].fillna(0, inplace=True) + dso_tx_df_cp["time_diff"] = (dso_tx_df_cp[ts_string] - dso_tx_df_cp[ts_string].shift(1)).apply(lambda x: x.total_seconds()) + dso_tx_df_cp["time_diff"] = dso_tx_df_cp["time_diff"] / (60*60) #s to h + dso_tx_df_cp["charge_speed"] = dso_tx_df_cp["meter_diff"] / dso_tx_df_cp["time_diff"] #wh in w + dso_tx_df_cp["charge_speed"].fillna(0, inplace=True) + if False: + dso_tx_df_cp["charge_speed"] = dso_tx_df_cp["measurements_value"] + dso_tx_df_cp["time_diff"] = (dso_tx_df_cp[ts_string] - dso_tx_df_cp[ts_string].shift(1)).apply(lambda x: x.total_seconds()) + dso_tx_df_cp["time_diff"] = dso_tx_df_cp["time_diff"] / (60*60) #s to h + dso_tx_df_cp["meter_diff"] = dso_tx_df_cp["charge_speed"] * dso_tx_df_cp["time_diff"] + dso_tx_df_cp["meter_diff"].fillna(0, inplace=True) + dso_tx_df_cp["measurements_value"] = 0 + dso_tx_df_cp["measurements_value"] = dso_tx_df_cp["meter_diff"].cumsum() + # print(dso_tx_df_cp) + # exit() + # if cp_g == "CP_11": + # with pd.option_context('display.max_rows', 100, 'display.max_columns', 10): + # 
print(dso_tx_df_cp.head(100)[["sim_time", "meter_diff", "time_diff" , "charge_speed", "measurements_value"]]) + # exit() + + + dso_tx_df_cp = dso_tx_df_cp.set_index(ts_string) + dso_tx_df_cp = dso_tx_df_cp.sort_index(ascending=True) + # dso_tx_df_cp = dso_tx_df_cp.asfreq(freq='5Min', method='bfill') + + if False: #TODO: START_DATE= + dso_tx_df_cp = dso_tx_df_cp[["group_id", meas_string]] + # print(dso_tx_df_cp) + dso_tx_df_cp.loc[dso_tx_df_cp.index[0] - pd.Timedelta(seconds=1)] = {"group_id":dso_tx_df_cp["group_id"].iloc[0],meas_string:0} + dso_tx_df_cp.loc[pd.to_datetime(START_DATE)] = {"group_id":dso_tx_df_cp["group_id"].iloc[0],meas_string:0} + dso_tx_df_cp = dso_tx_df_cp.sort_index(ascending=True) + # print(dso_tx_df_cp) + # exit() + + #dso_tx_df_cp = dso_tx_df_cp.resample(str(oscp_interval)+'Min', offset="0s").bfill() + if False: #TODO: resample? + dso_tx_df_cp = dso_tx_df_cp.resample('5Min', offset="0s").bfill() + dso_tx_df_cp_dict[cp_g] = dso_tx_df_cp + # dso_tx_df_cp.plot(ax=axes2, y=meas_string, label=cp_g) + except Exception as e: + logger.error(e) + logger.error(dso_tx_df_cp.index) + logger.error(len(dso_tx_df_cp.index)) + logger.error(len(dso_tx_df_cp.index.drop_duplicates())) + logger.error(dso_tx_df_cp[dso_tx_df_cp.index.duplicated(keep=False)]) + raise e + # plt.show() + # plt.close() + return dso_tx_df_cp_dict + +def plot_dso_oscp_data(dso_tx_df_cp_dict, meas_string = "charge_speed"): + fig2, axes2 = plt.subplots() + for cp_g,dso_tx_df_cp in dso_tx_df_cp_dict.items(): + dso_tx_df_cp.plot(ax=axes2, y=meas_string, label=cp_g) + # plt.show() + # plt.close() + +def get_measurements_expo(file_df, DIR) -> pd.DataFrame: + pp=get_grid_pp(DIR) + + sgen = get_grid_measurements_from_export(DIR, pp, bus_ns=[], target_elem="sgen", target_var=".MEASUREMENT.active_power" ) + bus = get_grid_measurements_from_export(DIR, pp, bus_ns=[], target_elem="bus", target_var=".MEASUREMENT.active_power" ) + load = get_grid_measurements_from_export(DIR, pp, bus_ns=[], target_elem="load", target_var=".MEASUREMENT.active_power" ) + #storage = get_grid_measurements_from_export(DIR, pp, bus_ns=[], target_elem="storage", target_var=".MEASUREMENT.active_power" ) + elems=[sgen, bus, load]#, storage + + out_df = None + for e in elems: + for group,vals in e.items(): + for val in vals: + if out_df is None: + out_df = val.to_frame() + else: + out_df[val.name] = val + out_df[val.name] = val + new_name = val.name.replace(".MEASUREMENT","").replace("grid_expo_","0.0.") + out_df = out_df.rename(columns={val.name: new_name}).copy() + return out_df + + +def get_measurements(file_df, DataPointMapsDIR) -> pd.DataFrame: + regex_meas = re.compile(r"^measurements\-(\d+)\.jsonl$") + measurements = file_df[(file_df["type"] == "controller-export") & (file_df["sub_type"] == "measurements")][["x_path", "file"]] + logger.debug(measurements) + measurement_dict = defaultdict(lambda: []) + for index, row in tqdm(measurements.iterrows(), desc="Loading measurements"): + result = regex_meas.search(row["file"]) + if result: + with open(os.path.join(row["x_path"], row["file"]), "rb") as f: + for line in f: + measurement_data = json.loads(line) + measurement_dict[result.group(1)].append(measurement_data) + else: + logger.error(f"unk file {row['file']}") + + for k,v in list(measurement_dict.items()): + logger.debug(k, len(v)) + for v in measurement_dict["401"][:10]: + #{'coa': 401, 'value_map': {'10010': 1.0299999713897705, '10020': 0.0, '10030': -8289678.0, '10040': -10461183.0, '10050': 8289678.0, '10060': 10461183.0}, 'sim-time': 
1698972846.4926767, 'clock-time': 1703443759.360438} + #{'coa': 401, 'value_map': {'10010': 1.0299999713897705, '10020': 0.0}, 'sim-time': 1698982975.2340364, 'clock-time': 1703443770.6145942} + logger.debug(v) + + regex_dp = re.compile(r"^(\d+)\-data\-points\.yml$") + # DataPointMapsDIR = configs[dataset]["DataPointMapsDIR"] + onlyfiles = [f for f in os.listdir(DataPointMapsDIR) if os.path.isfile(os.path.join(DataPointMapsDIR, f))] + logger.debug(onlyfiles) + + dp_dict = defaultdict(lambda: []) + for f in tqdm(onlyfiles, desc="Loading Data Point Maps"): + result = regex_dp.search(f) + if result: + with open(os.path.join(DataPointMapsDIR, f), "rb") as f: + dp_data = yaml.load(f, Loader=yaml.FullLoader) + dp_dict[result.group(1)].append(dp_data) + else: + logger.error(f"unk file {f}") + + #[{'401': [{'identifier': '401.10010', 'protocol': '60870-5-104', 'protocol_data': {'coa': 401, 'cot': 1, 'direction': 'monitoring', 'ioa': 10010, 'type_id': 13}, 'providers': {'sources': [{'domain': 'source', 'provider_data': {'attribute': 'voltage', 'context': 'MEASUREMENT', 'grid_element': 'bus.0', 'type': 'float'}, 'provider_type': 'POWER_GRID'}]}, 'value': None}, {'identifier': '401.10020', 'protocol': '60870-5-104', 'protocol_data': {'coa': 401, 'cot': 1, .... + logger.debug(dp_dict["401"]) + + meas_data = [] + for coa,meas_s in tqdm(list(measurement_dict.items()), desc="Loading meas_data"): + for meas in meas_s: + if str(meas["coa"]) != coa: + logger.error(f"unk meas coa {coa, meas}") + dps = dp_dict[coa] + for dp in dps: + coa_dps = dp[coa] + for coa_dp in coa_dps: + ioa = coa_dp['protocol_data']["ioa"] #10010 + if str(ioa) not in meas["value_map"]: + continue #removes control IOAs... + + if str(coa_dp['protocol_data']["coa"]) != coa: + logger.error(f"unk dp coa {coa, coa_dp}") + coa_ioa = coa_dp["identifier"] #'401.10010' + + direction = coa_dp['protocol_data']["direction"] #control or monitoring + direction_s = "sources" if direction == "monitoring" else "targets" + + if direction_s not in coa_dp['providers']: + logger.error(f"no {direction_s=} in {coa, coa_dp}") + if len(coa_dp['providers'][direction_s]) > 1: + logger.error(f"unk {direction_s=} len>1 {coa, coa_dp}") + domain = coa_dp['providers'][direction_s][0]["domain"] #source or target + attribute = coa_dp['providers'][direction_s][0]["provider_data"]["attribute"] #voltage + grid_element = coa_dp['providers'][direction_s][0]["provider_data"]["grid_element"] #bus.0 + + meas_value = meas["value_map"][str(ioa)] + sim_time = meas["sim-time"] + #clock_time = meas["clock-time"] + + meas_data.append({"coa_ioa":coa_ioa,"coa":coa,"ioa":ioa,"direction":direction,"domain":domain,"attribute":attribute,"grid_element":grid_element,"meas_value":meas_value,"sim_time":sim_time})#,"clock_time":clock_time}) + + meas_df = pd.DataFrame(meas_data) + meas_df["sim_time"] = pd.to_datetime(meas_df["sim_time"], unit="s") + #meas_df["clock_time"] = pd.to_datetime(meas_df["clock_time"], unit="s") + meas_df.sort_values(by="sim_time", ascending=True) + logger.debug(meas_df) + meas_df2 = meas_df[(meas_df["direction"] == "monitoring") & (meas_df["domain"] == "source")] #does nothing + # logger.debug(meas_df) + + dups = defaultdict(lambda:[]) #dups for two sides of lines + + clean_meas_data = [] + for coa_ioa in tqdm(meas_df["coa_ioa"].drop_duplicates(), desc="Loading clean_meas_data"): + meas_df_i = meas_df[meas_df["coa_ioa"] == coa_ioa].copy() + # logger.debug(meas_df_i) + if len(meas_df_i["attribute"].drop_duplicates()) != 1 or 
len(meas_df_i["grid_element"].drop_duplicates()) != 1: + logger.error(f'error w dups in {meas_df_i["attribute"].drop_duplicates(), meas_df_i["grid_element"].drop_duplicates()}') + + att = meas_df_i["attribute"].drop_duplicates().iloc[0] + ge = meas_df_i["grid_element"].drop_duplicates().iloc[0] + + meas_df_i = meas_df_i.set_index("sim_time") + meas_df_i = meas_df_i.sort_index(ascending=True) + + def resample_meas(df, freq='15Min'): + oidx = df.index + nidx = pd.date_range(oidx.min(), oidx.max(), freq=freq, normalize=True) + res = df.reindex(oidx.union(nidx)).interpolate('time', limit_direction="both").reindex(nidx) + return res + + if True: #TODO: resample? -> yes in order to concat diff freqs... + #meas_df_i = meas_df_i.resample('5Min', offset="0s").bfill() + #meas_df_i = meas_df_i.resample('5Min', offset="0s").interpolate('time') + meas_df_i = resample_meas(meas_df_i, freq="5Min") + + # logger.debug(meas_df_i) + meas_df_i = meas_df_i[["meas_value"]] + meas_df_i = meas_df_i.rename(columns={"meas_value": coa_ioa+"."+ge+"."+att}) #duplicates lines witout coa_ioa... (dups for two sides of lines) + dups[ge+"."+att].append(coa_ioa) + # logger.debug(meas_df_i) + + clean_meas_data.append(meas_df_i) + + clean_meas_df = pd.concat(clean_meas_data, axis=1) + # logger.debug(clean_meas_df) + clean_meas_df = clean_meas_df.fillna(clean_meas_df.mean(skipna=True)) + logger.info(clean_meas_df) + + # for k,v in dups.items(): + # if len(v) > 1: + # logger.debug("dup",k,v) + return clean_meas_df + + + +def get_estimations(file_df, drop_dups=False) -> pd.DataFrame: + est = file_df[(file_df["type"] == "controller-export") & (file_df["sub_type"] == "estimation")][["x_path", "file"]] + logger.info(est) + est_dict = defaultdict(lambda: list()) + for index, row, in tqdm(est.iterrows(), total = len(est), desc="Loading estimates"): + + regex_est = re.compile(r"^WALL\-[\-\dT\+]+__SIM\-([\-\dT\+]+)\.powerowl\.p\.gz$") + #WALL-2023-12-27T15-50-38-049544+00-00__SIM-2023-11-03T02-28-11-789274+00-00.powerowl.p.gz + result = regex_est.search(row["file"]) + if result: + with gzip.open(os.path.join(row["x_path"], row["file"]), "rb") as f: + est_data = pickle.load(f) + pd.reset_option('^display.', silent=True) + # logger.info(est_data) + # logger.info(est_data["res_line_est"]) + # logger.info(est_data["line"]) + time = "".join(result.group(1).rsplit("-", 1)) + time = pd.to_datetime(time, format="%Y-%m-%dT%H-%M-%S-%f%z") #2023-11-23T19-05-29-585438+00-00 + + for c in est_data.keys(): + if c.startswith("res_") and c.endswith("_est"): + c_name = c.replace("res_", "").replace("_est", "") + est_df = est_data[c] + if est_df.empty: continue + df_out = est_df.unstack().to_frame().T.sort_index(level=0) + df_out.columns = [f'{c_name}.{j}.{i}' for i, j in df_out.columns] + df_out.index = [time] + est_dict[c_name].append(df_out) + else: + logger.error(f"unk file {row['file']}") + est_l=[] + for k,v in est_dict.items(): + k_df = pd.concat(v) + est_l.append(k_df) + est_df = pd.concat(est_l, axis=1) + + if drop_dups: + logger.warning("----dropping dups-----") + logger.warning(est_df) + # est_df = est_df.round(5) + logger.warning(est_df.std()) + est_df = est_df.drop(est_df.std()[(est_df.std() == 0)].index, axis=1) + logger.warning(est_df) + logger.warning("----dropping dups-----") + + est_df = est_df.sort_index(ascending=True) + if False: #TODO: resample? 
+ est_df = est_df.resample('5Min', offset="0s").bfill() + logger.info(est_df) + return est_df + + \ No newline at end of file diff --git a/ids/regression.py b/ids/regression.py new file mode 100644 index 0000000000000000000000000000000000000000..051e54f3c660c1444e84080329a372cec18f25a7 --- /dev/null +++ b/ids/regression.py @@ -0,0 +1,877 @@ + +from collections import defaultdict +from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor +from copy import deepcopy +import gc +import hashlib +import itertools +import json +import multiprocessing +from multiprocessing.pool import ThreadPool +import os +import warnings +import pandas as pd +from sklearn.exceptions import ConvergenceWarning +from sklearn.metrics import mean_squared_error +from sklearn.model_selection import KFold +from sklearn.neural_network import MLPRegressor +from sklearn.svm import LinearSVR +from sklearn.tree import DecisionTreeRegressor +from tqdm import tqdm + +from sklearn.ensemble import GradientBoostingRegressor, HistGradientBoostingRegressor, RandomForestRegressor + +import re + +import logging + +logger = logging.getLogger("WATTSON_EV_IDS.Regression") + + +m_dict = multiprocessing.Manager() +REG_CACHE=m_dict.dict() +#REG_CACHE=dict() +with open("ids.conf", 'r') as f: + conf = json.load(f) + NUM_THREADS = conf["NUM_THREADS"] +#NUM_THREADS=3 + +IGNORE_MISS_COLS=True + +def get_regression_pred_conc_kfold(df_dict, TRAIN_START_DATE, ATK_START_DATE, reg_type, conf, actor, features, num_shifts=1, pbar=None, kfold_splits=[None], atk_df=None, recursive=False): + y_pred_folds=[] + for kfold in kfold_splits: + if recursive: + y_preds = get_regression_pred_rec(df_dict, TRAIN_START_DATE, ATK_START_DATE, reg_type, conf, num_shifts=num_shifts, pbar=pbar, kfold=kfold, atk_df=atk_df) + else: + y_preds = get_regression_pred_conc(df_dict, TRAIN_START_DATE, ATK_START_DATE, reg_type, conf, num_shifts=num_shifts, pbar=pbar, kfold=kfold, atk_df=atk_df) + + #pbar.update(len(y_preds)) + y_pred = pd.concat(y_preds, axis=1) + # print(y_pred) + # exit() + y_pred["conf"] = [conf] * len(y_pred) + y_pred["features"] = [features] * len(y_pred) + y_pred["reg"] = reg_type + y_pred["actor"] = actor + # print(y_pred) + # exit() + y_pred_folds.append(y_pred) + + y_pred_all = pd.concat(y_pred_folds) + y_pred_all.sort_index(ascending=True, inplace=True) + return y_pred_all + + +def get_regression_pred_rec(df_dict, TRAIN_START_DATE, ATK_START_DATE, reg_type, conf, num_shifts=1, pbar=None, kfold=None, atk_df=None): + if kfold is None: + y_preds = get_regression_pred_rec_pred(df_dict, TRAIN_START_DATE, ATK_START_DATE, reg_type, conf, num_shifts=num_shifts, pbar=pbar, kfold=kfold, atk_df=atk_df) + else: + y_preds = get_regression_pred_rec_kfold(df_dict, TRAIN_START_DATE, ATK_START_DATE, reg_type, conf, num_shifts=num_shifts, pbar=pbar, kfold=kfold, atk_df=atk_df) + return y_preds + +def get_regression_pred_rec_pred(df_dict, TRAIN_START_DATE, ATK_START_DATE, reg_type, conf, num_shifts=1, pbar=None, kfold=None, atk_df=None): + y_preds = [] + atk_df_do = atk_df.copy() + atk_df_s = atk_df.copy() + + for x in range(num_shifts): + # index, y_pred = get_regression_pred(df_dict, TRAIN_START_DATE, ATK_START_DATE, reg_type, conf, shift=x) + index, y_pred, used_shift = get_regression_pred(df_dict, TRAIN_START_DATE, ATK_START_DATE, reg_type, conf, 0, atk_df=atk_df_do) + y_pred = pd.DataFrame(y_pred, index=index) + y_pred[0] = y_pred[0].shift(used_shift) + y_pred.rename(columns={0: "prediction_"+str(x)}, inplace=True) + y_preds.append(y_pred) + + # 
print(atk_df.loc[ATK_START_DATE:][[c for c in atk_df.columns if c.startswith("charge_speed_lag_")]]) + # print(y_pred.shift(1)) + y_pred_s = y_pred.shift(x+1) + atk_df_s = atk_df_do.copy() + atk_df_s["charge_speed_lag_"+str(x+1)] = y_pred_s + atk_df_s[atk_df_s["charge_speed_lag_"+str(x+1)].isna()] = atk_df[atk_df_s["charge_speed_lag_"+str(x+1)].isna()] + atk_df_do=atk_df_s + if pbar is not None: + pbar.update(1) + # print(atk_df_s.loc[ATK_START_DATE:][[c for c in atk_df.columns if c.startswith("charge_speed_lag_")]]) + # exit() + + return y_preds + + +def get_regression_pred_rec_kfold(df_dict, TRAIN_START_DATE, ATK_START_DATE, reg_type, conf, num_shifts=1, pbar=None, kfold=None, atk_df=None): + y_preds = [] + + df_all = df_dict["cp_g_df"].loc[TRAIN_START_DATE:ATK_START_DATE] #.copy() + + feat_cols = df_dict["feat_cols"] + pred_col = df_dict["pred_col"] + + y_preds = dict() + y_preds_fold = defaultdict(lambda:list()) + kf = KFold(n_splits=kfold, random_state=None, shuffle=False) + for i, (train_index, test_index) in enumerate(kf.split(df_all)): + df = df_all.iloc[train_index].copy() + tbd_df = df_all.iloc[test_index].copy() + + for x in range(num_shifts): + try: + index, y_pred, used_shift = _do_get_regression_pred(df, tbd_df, pred_col, feat_cols, reg_type, conf, shift=0) + + y_pred = pd.DataFrame(y_pred, index=index) + y_pred.rename(columns={0: "prediction_"+str(x)}, inplace=True) + y_preds_fold[x+1].append(y_pred) + + # print(tbd_df.loc[TRAIN_START_DATE:ATK_START_DATE][[c for c in tbd_df.columns if c.startswith("charge_speed_lag_")]]) + # print(y_pred.shift(0)) + y_pred_s = y_pred.shift(x+1) + tbd_df_s = tbd_df.copy() + tbd_df_s["charge_speed_lag_"+str(x+1)] = y_pred_s + tbd_df_s[tbd_df_s["charge_speed_lag_"+str(x+1)].isna()] = tbd_df[tbd_df_s["charge_speed_lag_"+str(x+1)].isna()] + tbd_df=tbd_df_s + # print(tbd_df_s.loc[TRAIN_START_DATE:ATK_START_DATE][[c for c in tbd_df_s.columns if c.startswith("charge_speed_lag_")]]) + # exit() + if pbar is not None: + pbar.update(1) + + except ValueError as e: + logger.error(df) + logger.error(tbd_df) + logger.error(pred_col) + logger.error(feat_cols) + logger.error(e) + raise e + + for k,v in y_preds_fold.items(): + y_pred = pd.concat(v) + y_pred.sort_index(ascending=True, inplace=True) + y_preds[k] = y_pred + ret_y_pred = [v for v in y_preds.values()] + return ret_y_pred + +def get_regression_pred_conc(df_dict, TRAIN_START_DATE, ATK_START_DATE, reg_type, conf, num_shifts=1, pbar=None, kfold=None, atk_df=None): + nt=NUM_THREADS + if reg_type == "RandomForestRegressor" or reg_type == "MLPRegressor": + nt=1 + y_preds = [] + results=[] + + + with ThreadPoolExecutor(nt) as pool: + #with ProcessPoolExecutor(nt) as pool: + for x in range(num_shifts): + # index, y_pred = get_regression_pred(df_dict, TRAIN_START_DATE, ATK_START_DATE, reg_type, conf, shift=x) + if kfold is None: #no kfold for atk data sets + if nt == 1: + results.append(get_regression_pred(df_dict, TRAIN_START_DATE, ATK_START_DATE, reg_type, conf, x, atk_df=atk_df)) + else: + results.append(pool.submit(get_regression_pred, df_dict, TRAIN_START_DATE, ATK_START_DATE, reg_type, conf, x, atk_df)) + else: # yes kfold for training data sets + if nt == 1: + results.append(get_regression_kfold(df_dict, TRAIN_START_DATE, ATK_START_DATE, reg_type, conf, x, kfold, pbar)) + else: + results.append(pool.submit(get_regression_kfold, df_dict, TRAIN_START_DATE, ATK_START_DATE, reg_type, conf, x, kfold, pbar)) + + for r in results: + if nt == 1: + index, y_pred, used_shift = r + else: + index, y_pred, 
used_shift = r.result() + if pbar is not None and kfold is None: + pbar.update(1) + y_pred = pd.DataFrame(y_pred, index=index) + y_pred[0] = y_pred[0].shift(used_shift) + y_pred.rename(columns={0: "prediction_"+str(used_shift)}, inplace=True) + y_preds.append(y_pred) + + return y_preds + +def get_regression_kfold(df_dict, TRAIN_START_DATE, ATK_START_DATE, reg_type, conf, shift=0, nfolds=5, pbar=None): + df_all = df_dict["cp_g_df"].loc[TRAIN_START_DATE:ATK_START_DATE] #.copy() + + feat_cols = df_dict["feat_cols"] + pred_col = df_dict["pred_col"] + + y_preds = [] + kf = KFold(n_splits=nfolds, random_state=None, shuffle=False) + for i, (train_index, test_index) in enumerate(kf.split(df_all)): + df = df_all.iloc[train_index].copy() + tbd_df = df_all.iloc[test_index].copy() + + try: + index, y_pred, used_shift = _do_get_regression_pred(df, tbd_df, pred_col, feat_cols, reg_type, conf, shift=shift) + except ValueError as e: + logger.error(df) + logger.error(tbd_df) + logger.error(pred_col) + logger.error(feat_cols) + logger.error(e) + raise e + del df + del tbd_df + gc.collect() + y_pred = pd.DataFrame(y_pred, index=index) + y_preds.append(y_pred) + if pbar is not None: + pbar.update(1) + y_pred = pd.concat(y_preds) + y_pred.sort_index(ascending=True, inplace=True) + return y_pred.index, y_pred[0], shift + + +def get_regression_pred(df_dict, TRAIN_START_DATE, ATK_START_DATE, reg_type, conf, shift=0, atk_df=None): + df = df_dict["cp_g_df"].loc[TRAIN_START_DATE:ATK_START_DATE].copy() + feat_cols = df_dict["feat_cols"] + pred_col = df_dict["pred_col"] + if atk_df is None: + tbd_df = df_dict["cp_g_df"].loc[ATK_START_DATE:].copy() + else: + tbd_df = atk_df.loc[ATK_START_DATE:].copy() + mis_col = [c for c in df.columns if c not in tbd_df.columns and "date_exog" in c] + tbd_df[mis_col] = 0 + + miss_cols = [c for c in feat_cols if c not in tbd_df.columns] + if len(miss_cols) > 0: + if IGNORE_MISS_COLS: + cp_miss = set() + other_miss = list() + for c in miss_cols: + if c.startswith("CP_"): + cp_miss.add("_".join(c.split("_")[0:2])) + else: + other_miss.append(c) + if len(cp_miss)>1: + logger.warning(f"ignoring missing columns in atk datafrtame, cp_miss: {len(cp_miss)}") + logger.debug(f"ignoring missing columns in atk datafrtame, cp_miss: {cp_miss}") + if len(other_miss)>1: + logger.warning(f"ignoring missing columns in atk datafrtame, other_miss: {len(other_miss),other_miss}") + feat_cols = [c for c in feat_cols if c in tbd_df.columns] + # print(feat_cols) + # exit() + # print([c for c in tbd_df.columns]) + #test = tbd_df[feat_cols] + return _do_get_regression_pred(df, tbd_df, pred_col, feat_cols, reg_type, conf, shift=shift) + + # df[pred_col] = df[pred_col].shift(shift*-1) + # df.dropna(subset=[pred_col], inplace=True) + + # tbd_df[pred_col] = tbd_df[pred_col].shift(shift*-1) + # tbd_df.dropna(subset=[pred_col], inplace=True) + + # return tbd_df.index, _do_regression(df, feat_cols, pred_col, tbd_df, conf, reg_type=reg_type), shift + +def _do_get_regression_pred(df, tbd_df, pred_col, feat_cols, reg_type, conf, shift=0): + df[pred_col] = df[pred_col].shift(shift*-1) + df.dropna(subset=[pred_col], inplace=True) + + tbd_df[pred_col] = tbd_df[pred_col].shift(shift*-1) + tbd_df#.dropna(subset=[pred_col], inplace=True) + + return tbd_df.index, _do_regression(df, feat_cols, pred_col, tbd_df, conf, reg_type=reg_type), shift + + +def _do_regression(df, feat_cols, pred_col, tbd_df, conf, reg_type=""): + # print(df) + # print(feat_cols) + # exit() + if reg_type == "RandomForestRegressor": + 
#_do_RandomForestRegressor(df, feat_cols, pred_col, tbd_df, conf) + return _do_RandomForestRegressor(df, feat_cols, pred_col, tbd_df, conf) + elif reg_type == "DecisionTreeRegressor" or reg_type == "DecisionTreeRegressor2": + return _do_DecisionTreeRegressor(df, feat_cols, pred_col, tbd_df, conf) + elif reg_type == "GradientBoostingRegressor": + return _do_GradientBoostingRegressor(df, feat_cols, pred_col, tbd_df, conf) + elif reg_type == "HistGradientBoostingRegressor": + return _do_HistGradientBoostingRegressor(df, feat_cols, pred_col, tbd_df, conf) + elif reg_type == "LinearSVR": + return _do_LinearSVR(df, feat_cols, pred_col, tbd_df, conf) + elif reg_type == "MLPRegressor" or reg_type == "MLPRegressor2": # or reg_type == "test": + return _do_MLPRegressor(df, feat_cols, pred_col, tbd_df, conf) + else: + logger.error(f"unk reg_type {reg_type}") + return [] + +def optimize_regression(df_dict, TRAIN_START_DATE, VALIDATION_START_DATE, ATK_START_DATE, reg_type, group=""): + df = df_dict["cp_g_df"].loc[TRAIN_START_DATE:VALIDATION_START_DATE] + feat_cols = df_dict["feat_cols"] + # print([c for c in df.columns]) + # print(feat_cols) + # exit() + pred_col = df_dict["pred_col"] + tbd_df = df_dict["cp_g_df"].loc[VALIDATION_START_DATE:ATK_START_DATE] + + df = df[feat_cols+[pred_col]].copy() + tbd_df = tbd_df[feat_cols+[pred_col]].copy() + if len(tbd_df) > 0: + ret_ds = _optimize_regression(df, feat_cols, pred_col, tbd_df, group=group, reg_type=reg_type) + else: + ret_ds=[] + logger.warning(f"Empty ATK df. Skipping {group=}") + ret_df = pd.DataFrame(ret_ds) + if len(ret_df) > 0: + ret_df = ret_df.sort_values(by="mse", ascending=True) + return ret_df + +def _optimize_regression(df, feat_cols, pred_col, tbd_df, group="", reg_type=""): + if reg_type == "RandomForestRegressor": + return optimize_RandomForestRegressor(df, feat_cols, pred_col, tbd_df, group) + elif reg_type == "DecisionTreeRegressor" or reg_type == "DecisionTreeRegressor2": + return optimize_DecisionTreeRegressor(df, feat_cols, pred_col, tbd_df, group) + elif reg_type == "GradientBoostingRegressor": + return optimize_GradientBoostingRegressor(df, feat_cols, pred_col, tbd_df, group) + elif reg_type == "HistGradientBoostingRegressor": + return optimize_HistGradientBoostingRegressor(df, feat_cols, pred_col, tbd_df, group) + elif reg_type == "LinearSVR": + return optimize_LinearSVR(df, feat_cols, pred_col, tbd_df, group) + elif reg_type == "MLPRegressor" or reg_type == "MLPRegressor2" or reg_type == "test": + return optimize_MLPRegressor(df, feat_cols, pred_col, tbd_df, group) + else: + logger.error(f"unk reg_type {reg_type}") + return [] + + +def _do_DecisionTreeRegressor(df, feat_cols, pred_col, tbd_df, conf): + lags = conf[6] + #forecaster_reg = DecisionTreeRegressor(**{'criterion': conf[0], 'max_depth': conf[1], 'min_samples_split': conf[2], 'min_samples_leaf': conf[3], 'max_leaf_nodes': conf[4], 'max_features': conf[5], + # 'ccp_alpha': conf[7], 'splitter': conf[8], 'min_weight_fraction_leaf': conf[9], 'min_impurity_decrease': conf[10], 'random_state': 1234}) # + reg_id = get_reg_id("DecisionTreeRegressor", df, feat_cols, pred_col, conf) + if reg_id in REG_CACHE: + logger.debug(f"hit {len(REG_CACHE)}") + forecaster_reg = deepcopy(REG_CACHE[reg_id]) + else: + logger.debug(f"miss {len(REG_CACHE)}") + forecaster_reg = DecisionTreeRegressor(**{'criterion': conf[0], 'max_depth': conf[1], 'min_samples_split': conf[2], 'min_samples_leaf': conf[3], 'max_leaf_nodes': conf[4], 'max_features': conf[5], + 'ccp_alpha': conf[7], 'splitter': conf[8], 
'min_weight_fraction_leaf': conf[9], 'min_impurity_decrease': conf[10], 'random_state': 1234}) # + forecaster_reg.fit(X = df[feat_cols], y = df[pred_col]) + REG_CACHE[reg_id] = forecaster_reg + + # forecaster_reg.n_estimators += n_estimators + #forecaster_reg.fit(X = df[feat_cols], y = df[pred_col]) + y_pred = forecaster_reg.predict(tbd_df[feat_cols]) + return y_pred + +def _optimize_DecisionTreeRegressor(_df, feat_cols, pred_col, _tbd_df, conf): + df = _df#.copy() + tbd_df = _tbd_df#.copy() + y_pred = _do_DecisionTreeRegressor(df, feat_cols, pred_col, tbd_df, conf) + y_true = tbd_df[pred_col] + mse = mean_squared_error(y_true, y_pred) + rmse = mean_squared_error(y_true, y_pred, squared=False) + #print(f"{group=} {mse=} {rmse=} {conf=}") + return {"conf":conf,"mse":mse,"rmse":rmse} + + +def optimize_DecisionTreeRegressor(df, feat_cols, pred_col, tbd_df, group=""): + #criterion = ["squared_error", "absolute_error", "friedman_mse", "poisson"] + criterion = ["squared_error", "absolute_error", "friedman_mse"] + max_depth = [14, None] + min_samples_split = [2,3] + min_samples_leaf = [1,2] + max_leaf_nodes = [None, 50] + max_features = ["sqrt", None] + lags=[16,32] + ccp_alpha = [0.0] + splitter = ["best", "random"] + min_weight_fraction_leaf = [0.0] + min_impurity_decrease = [0.0] + + criterion = ["absolute_error", "friedman_mse"] + # max_depth = [None] + min_samples_split = [2,5] + min_samples_leaf = [1,3] + max_leaf_nodes = [None] + max_features = [None] + #ccp_alpha = [0.0, 0.5] + splitter = ["best"] + # min_weight_fraction_leaf = [0.0, 0.1, 0.2] + #min_impurity_decrease = [0.0, 1.0] + lags=[None] + + + c = list(itertools.product(criterion, max_depth, min_samples_split, min_samples_leaf, max_leaf_nodes, max_features, lags, ccp_alpha, splitter, min_weight_fraction_leaf, min_impurity_decrease)) + #c = [('absolute_error', 14, 2, 1, None, None, None, 0.0, 'random', 0.0, 0.0), ] + #c = [('friedman_mse', None, 5, 3, None, None, None, 0.5, 'random', 0.0, 1.0), ] + + # ret_ds = [] + # for conf in tqdm(c, desc="conf "+group, disable=False): + # ret_ds.append(_optimize_DecisionTreeRegressor(df, feat_cols, pred_col, tbd_df, conf)) + # return ret_ds + + ret_ds = [] + results=[] + print() + with ThreadPoolExecutor(NUM_THREADS) as pool: + # with ProcessPoolExecutor(NUM_THREADS) as pool: + for conf in tqdm(c, desc="conf "+group, disable=True): + results.append(pool.submit(_optimize_DecisionTreeRegressor, df, feat_cols, pred_col, tbd_df, conf)) + #_optimize_DecisionTreeRegressor(df, feat_cols, pred_col, tbd_df,conf, ret_ds) + for r in tqdm(results, desc="conf "+group): + ret_ds.append(r.result()) + return ret_ds + + +def _do_RandomForestRegressor(df, feat_cols, pred_col, tbd_df, conf): + lags = conf[6] + n_estimators=conf[10] + # forecaster_reg = RandomForestRegressor(**{'criterion': conf[0], 'max_depth': conf[1], 'min_samples_split': conf[2], 'min_samples_leaf': conf[3], 'max_leaf_nodes': conf[4], 'max_features': conf[5], + # 'oob_score': conf[7], 'ccp_alpha': conf[8], 'max_samples': conf[9], 'n_estimators': n_estimators, 'n_jobs': min(NUM_THREADS, n_estimators), 'random_state': 1234, + # 'warm_start': False}) # + reg_id = get_reg_id("RandomForestRegressor", df, feat_cols, pred_col, conf) + if reg_id in REG_CACHE: + logger.debug(f"hit {len(REG_CACHE)}") + forecaster_reg = deepcopy(REG_CACHE[reg_id]) + else: + logger.debug(f"miss {len(REG_CACHE)}") + forecaster_reg = RandomForestRegressor(**{'criterion': conf[0], 'max_depth': conf[1], 'min_samples_split': conf[2], 'min_samples_leaf': conf[3], 
'max_leaf_nodes': conf[4], 'max_features': conf[5], + 'oob_score': conf[7], 'ccp_alpha': conf[8], 'max_samples': conf[9], 'n_estimators': n_estimators, 'n_jobs': min(NUM_THREADS, n_estimators), 'random_state': 1234, + 'warm_start': False}) # + forecaster_reg.fit(X = df[feat_cols], y = df[pred_col]) + REG_CACHE[reg_id] = forecaster_reg + + # forecaster_reg.n_estimators += n_estimators + #forecaster_reg.fit(X = df[feat_cols], y = df[pred_col]) + + y_pred = forecaster_reg.predict(tbd_df[feat_cols]) + return y_pred + +def _optimize_RandomForestRegressor(_df, feat_cols, pred_col, _tbd_df, conf): + df = _df#.copy() + tbd_df = _tbd_df#.copy() + y_pred = _do_RandomForestRegressor(df, feat_cols, pred_col, tbd_df, conf) + y_true = tbd_df[pred_col] + mse = mean_squared_error(y_true, y_pred) + rmse = mean_squared_error(y_true, y_pred, squared=False) + #print(f"{group=} {mse=} {rmse=} {conf=}") + return {"conf":conf,"mse":mse,"rmse":rmse} + +def optimize_RandomForestRegressor(df, feat_cols, pred_col, tbd_df, group=""): + #criterion = ["squared_error", "absolute_error", "friedman_mse", "poisson"] + criterion = ["squared_error", "absolute_error"] + #max_depth = [None, 14] + max_depth = [None, 12, 14] + min_samples_split = [2,3] + min_samples_leaf = [1,2] + max_leaf_nodes = [None, 5] + max_features = ["sqrt", None] + lags=[16,32] + oob_score = [False] + ccp_alpha = [0.0] + max_samples = [None, 0.8] + _n_estimators=[20,50,100,150] + + criterion = ["squared_error"] + max_depth = [None, 12] + # min_samples_split = [3] + min_samples_split = [2] + min_samples_leaf = [1] + max_leaf_nodes = [None] + max_features = [None] + lags=[None] + # max_samples = [None] + # _n_estimators=[40,50,60] + # _n_estimators=[5,10,15,20,30] + _n_estimators=[50,100,150] + + + c = list(itertools.product(criterion, max_depth, min_samples_split, min_samples_leaf, max_leaf_nodes, max_features, lags, oob_score, ccp_alpha, max_samples, _n_estimators)) + #c = [('squared_error', 12, 3, 2, None, None, None, False, 0.0, None, 20), ] + + ret_ds = [] + print() + for conf in tqdm(c, desc="conf "+group, disable=(len(c) == 1)): + res = _optimize_RandomForestRegressor(df, feat_cols, pred_col, tbd_df, conf) + #print(f"{group=} {mse=} {rmse=} {conf=}") + ret_ds.append(res) + return ret_ds + + +def _do_HistGradientBoostingRegressor(df, feat_cols, pred_col, tbd_df, conf): + #c = list(itertools.product(loss, learning_rate, max_iter, min_samples_leaf, max_depth, max_leaf_nodes)) + + reg_id = get_reg_id("HistGradientBoostingRegressor", df, feat_cols, pred_col, conf) + if reg_id in REG_CACHE: + logger.debug(f"hit {len(REG_CACHE)}") + forecaster_reg = deepcopy(REG_CACHE[reg_id]) + else: + logger.debug(f"miss {len(REG_CACHE)}") + forecaster_reg = HistGradientBoostingRegressor(**{'loss': conf[0], 'learning_rate': conf[1], 'max_iter': conf[2], 'min_samples_leaf': conf[3], 'max_depth': conf[4], 'max_leaf_nodes': conf[5], + 'random_state': 1234}) # + forecaster_reg.fit(X = df[feat_cols], y = df[pred_col]) + REG_CACHE[reg_id] = forecaster_reg + + # forecaster_reg.n_estimators += n_estimators + #forecaster_reg.fit(X = df[feat_cols], y = df[pred_col]) + + y_pred = forecaster_reg.predict(tbd_df[feat_cols]) + return y_pred + +def _optimize_HistGradientBoostingRegressor(_df, feat_cols, pred_col, _tbd_df, conf): + df = _df#.copy() + tbd_df = _tbd_df#.copy() + y_pred = _do_HistGradientBoostingRegressor(df, feat_cols, pred_col, tbd_df, conf) + y_true = tbd_df[pred_col] + mse = mean_squared_error(y_true, y_pred) + rmse = mean_squared_error(y_true, y_pred, squared=False) + 
#print(f"{group=} {mse=} {rmse=} {conf=}") + return{"conf":conf,"mse":mse,"rmse":rmse} + + +def optimize_HistGradientBoostingRegressor(df, feat_cols, pred_col, tbd_df, group=""): + loss=['absolute_error'] + learning_rate=[0.1, 0.5] + max_iter = [100] + max_leaf_nodes = [50, 100, 150] + max_depth = [None, 15] + min_samples_leaf = [10,10,30] + + + c = list(itertools.product(loss, learning_rate, max_iter, min_samples_leaf, max_depth, max_leaf_nodes)) + + ret_ds = [] + results=[] + print() + with ThreadPoolExecutor(NUM_THREADS) as pool: + # with ProcessPoolExecutor(NUM_THREADS) as pool: + for conf in tqdm(c, desc="conf "+group, disable=True): + results.append(pool.submit(_optimize_HistGradientBoostingRegressor, df, feat_cols, pred_col, tbd_df, conf)) + #_optimize_DecisionTreeRegressor(df, feat_cols, pred_col, tbd_df,conf, ret_ds) + for r in tqdm(results, desc="conf "+group): + ret_ds.append(r.result()) + return ret_ds + + +def _do_GradientBoostingRegressor(df, feat_cols, pred_col, tbd_df, conf): + n_estimators=conf[2] + + reg_id = get_reg_id("GradientBoostingRegressor", df, feat_cols, pred_col, conf) + if reg_id in REG_CACHE: + logger.debug(f"hit {len(REG_CACHE)}") + forecaster_reg = deepcopy(REG_CACHE[reg_id]) + else: + logger.debug(f"miss {len(REG_CACHE)}") + if len(conf) >14: + forecaster_reg = GradientBoostingRegressor(**{'loss': conf[0], 'learning_rate': conf[1], 'subsample': conf[3], 'criterion': conf[4], 'min_samples_split': conf[5], 'min_samples_leaf': conf[6], + 'min_weight_fraction_leaf': conf[7], 'max_depth': conf[8], 'min_impurity_decrease': conf[9], 'max_features': conf[10], 'alpha': conf[11], + 'max_leaf_nodes': conf[12], 'ccp_alpha': conf[13], 'n_iter_no_change': conf[14], + 'n_estimators': n_estimators, 'random_state': 1234}) # + else: + forecaster_reg = GradientBoostingRegressor(**{'loss': conf[0], 'learning_rate': conf[1], 'subsample': conf[3], 'criterion': conf[4], 'min_samples_split': conf[5], 'min_samples_leaf': conf[6], + 'min_weight_fraction_leaf': conf[7], 'max_depth': conf[8], 'min_impurity_decrease': conf[9], 'max_features': conf[10], 'alpha': conf[11], + 'max_leaf_nodes': conf[12], 'ccp_alpha': conf[13], + 'n_estimators': n_estimators, 'random_state': 1234}) # + forecaster_reg.fit(X = df[feat_cols], y = df[pred_col]) + REG_CACHE[reg_id] = forecaster_reg + + # forecaster_reg.n_estimators += n_estimators + #forecaster_reg.fit(X = df[feat_cols], y = df[pred_col]) + + y_pred = forecaster_reg.predict(tbd_df[feat_cols]) + return y_pred + +def _optimize_GradientBoostingRegressor(_df, feat_cols, pred_col, _tbd_df, conf): + df = _df#.copy() + tbd_df = _tbd_df#.copy() + y_pred = _do_GradientBoostingRegressor(df, feat_cols, pred_col, tbd_df, conf) + y_true = tbd_df[pred_col] + mse = mean_squared_error(y_true, y_pred) + rmse = mean_squared_error(y_true, y_pred, squared=False) + #print(f"{group=} {mse=} {rmse=} {conf=}") + return{"conf":conf,"mse":mse,"rmse":rmse} + + +def optimize_GradientBoostingRegressor(df, feat_cols, pred_col, tbd_df, group=""): + loss=['squared_error', 'absolute_error', 'huber', 'quantile'] + learning_rate=[0.1, 0.2] + n_estimators=[25, 50, 75, 100] + subsample=[0.9, 1.0] + criterion = ["squared_error", "friedman_mse"] + min_samples_split = [2,8] + min_samples_leaf = [1,4] + min_weight_fraction_leaf = [0.0] + max_depth = [None, 14] + min_impurity_decrease = [0.0] + max_features = ["sqrt", None] + alpha=[0.9, 0.95] + max_leaf_nodes = [None, 10, 15] + ccp_alpha = [0.0] + + loss=['absolute_error'] + learning_rate=[0.1] + n_estimators=[50, 100, 150] + 
subsample=[1.0] + criterion = ["friedman_mse"] + max_depth = [None] + max_features = [None] + max_leaf_nodes = [None, 10] + alpha=[0.9] + + + c = list(itertools.product(loss, learning_rate, n_estimators, subsample, criterion, min_samples_split, min_samples_leaf, min_weight_fraction_leaf, max_depth, min_impurity_decrease, max_features, alpha, max_leaf_nodes, ccp_alpha)) + #c = [('absolute_error', 0.1, 100, 1.0, 'friedman_mse', 8, 4, 0.0, None, 0.0, None, 0.9, 10, 0.0), ] + + ret_ds = [] + results=[] + print() + with ThreadPoolExecutor(NUM_THREADS) as pool: + # with ProcessPoolExecutor(NUM_THREADS) as pool: + for conf in tqdm(c, desc="conf "+group, disable=True): + results.append(pool.submit(_optimize_GradientBoostingRegressor, df, feat_cols, pred_col, tbd_df, conf)) + #_optimize_DecisionTreeRegressor(df, feat_cols, pred_col, tbd_df,conf, ret_ds) + for r in tqdm(results, desc="conf "+group): + ret_ds.append(r.result()) + return ret_ds + + +def _do_LinearSVR(df, feat_cols, pred_col, tbd_df, conf): + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=ConvergenceWarning) + #forecaster_reg = LinearSVR(**{'epsilon': conf[0], 'tol': conf[1], 'C': conf[2], 'loss': conf[3], 'fit_intercept': conf[4], 'intercept_scaling': conf[5], 'dual': conf[6], 'max_iter': conf[7], 'random_state': 1234}) # + + # print(feat_cols, pred_col) + # exit() + reg_id = get_reg_id("LinearSVR", df, feat_cols, pred_col, conf) + if reg_id in REG_CACHE: + logger.debug(f"hit {len(REG_CACHE)}") + forecaster_reg = deepcopy(REG_CACHE[reg_id]) + else: + logger.debug(f"miss {len(REG_CACHE)}") + forecaster_reg = LinearSVR(**{'epsilon': conf[0], 'tol': conf[1], 'C': conf[2], 'loss': conf[3], 'fit_intercept': conf[4], 'intercept_scaling': conf[5], 'dual': conf[6], 'max_iter': conf[7], 'random_state': 1234}) # + forecaster_reg.fit(X = df[feat_cols], y = df[pred_col]) + REG_CACHE[reg_id] = forecaster_reg + + # forecaster_reg.n_estimators += n_estimators + #forecaster_reg.fit(X = df[feat_cols], y = df[pred_col]) + + y_pred = forecaster_reg.predict(tbd_df[feat_cols]) + return y_pred + +def _optimize_LinearSVR(_df, feat_cols, pred_col, _tbd_df, conf): + df = _df#.copy() + tbd_df = _tbd_df#.copy() + y_pred = _do_LinearSVR(df, feat_cols, pred_col, tbd_df, conf) + y_true = tbd_df[pred_col] + mse = mean_squared_error(y_true, y_pred) + rmse = mean_squared_error(y_true, y_pred, squared=False) + #print(f"{group=} {mse=} {rmse=} {conf=}") + return {"conf":conf,"mse":mse,"rmse":rmse} + + +def optimize_LinearSVR(df, feat_cols, pred_col, tbd_df, group=""): + epsilon=[0.0, 100, 1000] + tol=[0.0001, 0.01, 0.3] + C=[1.0, 0.1, 2.0] + loss=["epsilon_insensitive", "squared_epsilon_insensitive"] + fit_intercept=[True, False] + intercept_scaling=[1.0, 2.0] + dual=[True, False] + max_iter=[1000, 1200] + + C=[1.0, 2.0] + loss=["squared_epsilon_insensitive"] + + c = list(itertools.product(epsilon, tol, C, loss, fit_intercept, intercept_scaling, dual, max_iter)) + #c = [(1000, 0.0001, 2.0, 'squared_epsilon_insensitive', False, 1.0, False, 1200), ] + + ret_ds = [] + results=[] + print() + # for conf in tqdm(c, desc="conf "+group, disable=False): + # if conf[4] == False and conf[5] != 1.0: #not fit_intercept and yes scaling + # continue + # if conf[3] == 'epsilon_insensitive' and conf[6] == False: #Parameters: penalty='l2', loss='epsilon_insensitive', dual=False + # continue + # try: + # ret_v = _optimize_LinearSVR(df, feat_cols, pred_col, tbd_df, conf) + # ret_ds.append(ret_v) + # except Exception as e: + # print(e) + + with 
ThreadPoolExecutor(NUM_THREADS) as pool: + # with ProcessPoolExecutor(NUM_THREADS) as pool: + for conf in tqdm(c, desc="conf "+group, disable=True): + if conf[4] == False and conf[5] != 1.0: #not fit_intercept and yes scaling + continue + results.append(pool.submit(_optimize_LinearSVR, df, feat_cols, pred_col, tbd_df, conf)) + #_optimize_DecisionTreeRegressor(df, feat_cols, pred_col, tbd_df,conf, ret_ds) + for r in tqdm(results, desc="conf "+group): + ret_ds.append(r.result()) + return ret_ds + + +def _do_MLPRegressor(df, feat_cols, pred_col, tbd_df, conf): + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=ConvergenceWarning) + + reg_id = get_reg_id("MLPRegressor", df, feat_cols, pred_col, conf) + if reg_id in REG_CACHE: + logger.debug(f"hit {len(REG_CACHE)}") + forecaster_reg = deepcopy(REG_CACHE[reg_id]) + else: + logger.debug(f"miss {len(REG_CACHE)}") + forecaster_reg = MLPRegressor(**{'hidden_layer_sizes': conf[0], 'solver': conf[1], 'alpha': conf[2], 'max_iter': conf[3], 'activation': conf[4], 'random_state': 1234}) # + forecaster_reg.fit(X = df[feat_cols], y = df[pred_col]) + REG_CACHE[reg_id] = forecaster_reg + + y_pred = forecaster_reg.predict(tbd_df[feat_cols])# + return y_pred + +def _optimize_MLPRegressor(_df, feat_cols, pred_col, _tbd_df, conf): + df = _df#.copy() + tbd_df = _tbd_df#.copy() + y_pred = _do_MLPRegressor(df, feat_cols, pred_col, tbd_df, conf) + y_true = tbd_df[pred_col] + mse = mean_squared_error(y_true, y_pred) + rmse = mean_squared_error(y_true, y_pred, squared=False) + #print(f"{group=} {mse=} {rmse=} {conf=}") + return {"conf":conf,"mse":mse,"rmse":rmse} + +def optimize_MLPRegressor(df, feat_cols, pred_col, tbd_df, group=""): + hidden_layer_sizes = [(10, 10),(10, 10, 10),(100, ),(10, ),] #,(10, 10, 10, 10), (100, 100, 100), (140, ), (100, 100), + solver = ["lbfgs", 'adam'] + alpha = [0.0001, 0.00005, 0.0005, 0.00001] # + max_iter = [20000, 200000] + activation=["relu", 'logistic'] + + # hidden_layer_sizes = [(100, ),(90, ),(80, ),(110, ),(120, ),] #,(10, 10, 10, 10) + solver = ["lbfgs"] + # alpha = [0.00001,0.000015,0.000005] # + max_iter = [20000] + activation=["relu"] + + # hidden_layer_sizes = [(120, ),(140, ),(160, ),(180, ),(200, ),] #,(10, 10, 10, 10) + # alpha = [0.00001,] # + + # hidden_layer_sizes = [(140, ),] #,(10, 10, 10, 10) + # alpha = [0.00001,0.000011, 0.000009,] # + + c = list(itertools.product(hidden_layer_sizes, solver, alpha, max_iter, activation)) + #c = [((100,), 'lbfgs', 1e-05, 20000, 'relu'), ] 4.363318e+04 + #c = [((140,), 'lbfgs', 1e-05, 20000, 'relu'), ] 4.255844e+04 + + results = [] + ret_ds = [] + print() + +# + if True: + for conf in tqdm(c, desc="conf "+group, disable=(len(c) == 1)): + res = _optimize_MLPRegressor(df, feat_cols, pred_col, tbd_df, conf) + ret_ds.append(res) + else: #slower + logger.warning(f"mlp w/ {NUM_THREADS}") + #with ThreadPoolExecutor(NUM_THREADS) as pool: + with ProcessPoolExecutor(NUM_THREADS) as pool: + for conf in tqdm(c, desc="conf "+group, disable=True): + results.append(pool.submit(_optimize_MLPRegressor, df, feat_cols, pred_col, tbd_df, conf)) + #_optimize_DecisionTreeRegressor(df, feat_cols, pred_col, tbd_df,conf, ret_ds) + for r in tqdm(results, desc="conf "+group): + ret_ds.append(r.result()) + + + return ret_ds + + +def eval_tuning(x_path, full=False): + onlyfiles = [f for f in os.listdir(x_path) if os.path.isfile(os.path.join(x_path, f))] + + regex = re.compile(r"^(CP_\d+)_(\w+)\.csv\.gz$") + eval_tuning_files=defaultdict(lambda: dict()) + for f in tqdm(onlyfiles, 
desc="eval_tuning", disable=True): + #cpo_ocpp_data_CP_1_0bc01518df38656accbde55bfb0e38591.csv.gz + result = regex.search(f) + if result: + f_df = pd.read_csv(os.path.join(x_path, f), index_col=0) + eval_tuning_files[result.group(2)][result.group(1)] = f_df + else: + logger.info(f"unk file {os.path.join(x_path, f)}") + continue + + if full: + return eval_tuning_files + + ret_dict=dict() + for reg,group_d in eval_tuning_files.items(): + for group, df in group_d.items(): + df["rmse_n"] = (df["rmse"] - df["rmse"].min()) / (df["rmse"].max() - df["rmse"].min()) + #print(reg,group,df[df["rmse"] == df["rmse"].min()]) + sum_df = pd.concat([df for group, df in group_d.items()]) + sum_df = sum_df.groupby(by="conf").sum() / len(group_d) + best_df = sum_df[sum_df["rmse_n"] == sum_df["rmse_n"].min()] + best_df = sum_df[sum_df["rmse"] == sum_df["rmse"].min()] + # print(reg,best_df) + for i,r in best_df.iterrows(): + ret_dict[reg] = {"conf":i, "eval":r} + + return ret_dict + + +def get_eval_dicts(OutDataDIR): + ret_d=dict() + DIR=OutDataDIR+"/results/" + sub_folders = [name for name in os.listdir(DIR) if os.path.isdir(os.path.join(DIR, name)) if name.startswith("CPO_")] + for cpo in sub_folders: + ret_dict=eval_tuning(OutDataDIR+"/results/"+cpo) + ret_d[cpo] = ret_dict + + sub_folders = [name for name in os.listdir(DIR) if os.path.isdir(os.path.join(DIR, name)) if name.startswith("DSO")] + for dso in sub_folders: + ret_dict=eval_tuning(OutDataDIR+"/results/"+dso) + ret_d[dso] = ret_dict + + return ret_d + +def get_cp_group_eval_dicts(OutDataDIR): + ret_d=dict() + DIR=OutDataDIR+"/results/" + sub_folders = [name for name in os.listdir(DIR) if os.path.isdir(os.path.join(DIR, name)) if name.startswith("CPO_")] + for cpo in sub_folders: + ret_dict=eval_tuning(OutDataDIR+"/results/"+cpo, full=True) + ret_d[cpo] = ret_dict + + sub_folders = [name for name in os.listdir(DIR) if os.path.isdir(os.path.join(DIR, name)) if name.startswith("DSO")] + for dso in sub_folders: + ret_dict=eval_tuning(OutDataDIR+"/results/"+dso, full=True) + ret_d[dso] = ret_dict + + return ret_d + + +def get_prediction_dicts(OutDataDIR): + regex = re.compile(r"^(CP_\d+)_(\w+)_(\d+).csv.gz$") + + ret_d=defaultdict(lambda:defaultdict(lambda:list())) + DIR=OutDataDIR+"/predictions/" + sub_folders = [name for name in os.listdir(DIR) if os.path.isdir(os.path.join(DIR, name))] + for actor in sub_folders: + if ".bak" in actor: # or ".all" in actor: + logger.warning(f'skiping {OutDataDIR+"/predictions/"+actor}') + continue + if len(actor.split(".")) > 1: + features = actor.split(".")[1:] + else: + features=["all"] + logger.warning(f"assuming all features for {actor}") + x_path=OutDataDIR+"/predictions/"+actor + onlyfiles = [f for f in os.listdir(x_path) if os.path.isfile(os.path.join(x_path, f))] + for f in onlyfiles: + if ".bak" in f or ".test" in f: # or ".all" in actor: + logger.warning(f'skiping {OutDataDIR+"/predictions/"+actor} {f}') + continue + result = regex.search(f) + if result: + result.group(1) + ret_d[actor][result.group(1)].append({"reg":result.group(2),"shifts":result.group(3),"file":os.path.join(x_path, f),"features":features}) + else: + logger.error(f"unk file {f}") + continue + return ret_d + + +def get_reg_id(reg_n, df, feat_cols, pred_col, conf): + h = hashlib.new('sha256') + h.update(repr(conf).encode("utf-8")) + h.update(pd.util.hash_pandas_object(df, index=True).values) + h.update(repr(sorted(feat_cols)).encode("utf-8")) + h.update(repr(pred_col).encode("utf-8")) + ha = h.hexdigest() + reg_id = (reg_n, ha) + return reg_id 
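
For reference, `get_reg_id` above keys the shared `REG_CACHE` by a SHA-256 digest over the regressor name, the hyperparameter tuple, the hashed training frame, the sorted feature columns, and the prediction column, so repeated evaluations of an identical configuration during tuning reuse the already fitted model (via `deepcopy`) instead of refitting. A minimal self-contained sketch of that pattern follows; it uses a plain dict instead of the module's `multiprocessing.Manager` dict, and `cache_key`, `fit_or_reuse`, the two-element `conf` tuple and the synthetic data are illustrative names, not taken from the repository:

```
from copy import deepcopy
import hashlib

import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeRegressor

REG_CACHE = {}  # plain dict here; the module shares a multiprocessing.Manager dict instead


def cache_key(reg_name, df, feat_cols, pred_col, conf):
    # Digest everything that influences the fitted model.
    h = hashlib.sha256()
    h.update(repr(conf).encode("utf-8"))
    h.update(pd.util.hash_pandas_object(df, index=True).values)
    h.update(repr(sorted(feat_cols)).encode("utf-8"))
    h.update(repr(pred_col).encode("utf-8"))
    return (reg_name, h.hexdigest())


def fit_or_reuse(df, feat_cols, pred_col, conf):
    key = cache_key("DecisionTreeRegressor", df, feat_cols, pred_col, conf)
    if key in REG_CACHE:
        return deepcopy(REG_CACHE[key])  # cache hit: no refit
    reg = DecisionTreeRegressor(max_depth=conf[0], min_samples_leaf=conf[1], random_state=1234)
    reg.fit(df[feat_cols], df[pred_col])
    REG_CACHE[key] = reg
    return reg


# Illustrative usage with synthetic data:
rng = np.random.default_rng(0)
demo = pd.DataFrame({"x1": rng.normal(size=200), "x2": rng.normal(size=200)})
demo["y"] = 2 * demo["x1"] - demo["x2"]
model = fit_or_reuse(demo, ["x1", "x2"], "y", (5, 1))  # fits and caches
model = fit_or_reuse(demo, ["x1", "x2"], "y", (5, 1))  # reuses the cached fit
```
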
diff --git a/ids/run_ids.sh b/ids/run_ids.sh new file mode 100644 index 0000000000000000000000000000000000000000..f618064ac592d8747e0a5156cde5dc14a5840012 --- /dev/null +++ b/ids/run_ids.sh @@ -0,0 +1,108 @@ +#!/bin/bash + + +# nohup bash run_dso3.sh >> rlog_dso3.log 2>&1 & #wat +# tail -f rlog_dso3.log + +#TRAIN: + +python3 ids.py -c=load_data -v -d elaadnl +python3 ids.py -c=get_features_cpo -v -d elaadnl #only for OSCP fix +python3 ids.py -c=get_features_dso -l 96 -v -d elaadnl +#python3 ids.py -c=plot_atks -v -d elaadnl + + + +#Regression without grid features: +#RandomForestRegressor DecisionTreeRegressor GradientBoostingRegressor LinearSVR MLPRegressor HistGradientBoostingRegressor +python3 ids.py -c=train_reg_dso -r LinearSVR -f only_pred_lag no_norm no_date1 no_grid -v -d elaadnl # +python3 ids.py -c=train_reg_dso -r RandomForestRegressor -f only_pred_lag no_norm no_date1 no_grid -v -d elaadnl # +python3 ids.py -c=train_reg_dso -r HistGradientBoostingRegressor -f only_pred_lag no_norm no_date1 no_grid -v -d elaadnl # +#python3 ids.py -c=train_reg_dso -r GradientBoostingRegressor -f only_pred_lag no_norm no_date1 no_grid -v -d elaadnl # +python3 ids.py -c=train_reg_dso -r DecisionTreeRegressor -f only_pred_lag no_norm no_date1 no_grid -v -d elaadnl # +python3 ids.py -c=train_reg_dso -r MLPRegressor -f only_pred_lag no_norm no_date1 no_grid -v -d elaadnl # + +python3 ids.py -c=train_reg_dso -r LinearSVR -f only_pred_lag only_norm no_norm1 no_date1 no_grid -v -d elaadnl # +python3 ids.py -c=train_reg_dso -r RandomForestRegressor -f only_pred_lag only_norm no_norm1 no_date1 no_grid -v -d elaadnl # +python3 ids.py -c=train_reg_dso -r HistGradientBoostingRegressor -f only_pred_lag only_norm no_norm1 no_date1 no_grid -v -d elaadnl # +#python3 ids.py -c=train_reg_dso -r GradientBoostingRegressor -f only_pred_lag only_norm no_norm1 no_date1 no_grid -v -d elaadnl # +python3 ids.py -c=train_reg_dso -r DecisionTreeRegressor -f only_pred_lag only_norm no_norm1 no_date1 no_grid -v -d elaadnl # +python3 ids.py -c=train_reg_dso -r MLPRegressor -f only_pred_lag only_norm no_norm1 no_date1 no_grid -v -d elaadnl # + + +#Regression with grid features: +python3 ids.py -c=train_reg_dso -r LinearSVR -f only_pred_lag no_norm no_date1 no_grid_storage no_grid_sgen add_bus_relations -v -d elaadnl # +python3 ids.py -c=train_reg_dso -r RandomForestRegressor -f only_pred_lag no_norm no_date1 no_grid_storage no_grid_sgen add_bus_relations -v -d elaadnl # +python3 ids.py -c=train_reg_dso -r HistGradientBoostingRegressor -f only_pred_lag no_norm no_date1 no_grid_storage no_grid_sgen add_bus_relations -v -d elaadnl # +#python3 ids.py -c=train_reg_dso -r GradientBoostingRegressor -f only_pred_lag no_norm no_date1 no_grid_storage no_grid_sgen add_bus_relations -v -d elaadnl # +python3 ids.py -c=train_reg_dso -r DecisionTreeRegressor -f only_pred_lag no_norm no_date1 no_grid_storage no_grid_sgen add_bus_relations -v -d elaadnl # +python3 ids.py -c=train_reg_dso -r MLPRegressor -f only_pred_lag no_norm no_date1 no_grid_storage no_grid_sgen add_bus_relations -v -d elaadnl # + +python3 ids.py -c=train_reg_dso -r LinearSVR -f only_pred_lag only_norm no_norm1 no_date1 no_grid_storage no_grid_sgen add_bus_relations -v -d elaadnl # +python3 ids.py -c=train_reg_dso -r RandomForestRegressor -f only_pred_lag only_norm no_norm1 no_date1 no_grid_storage no_grid_sgen add_bus_relations -v -d elaadnl # +python3 ids.py -c=train_reg_dso -r HistGradientBoostingRegressor -f only_pred_lag only_norm no_norm1 no_date1 no_grid_storage 
no_grid_sgen add_bus_relations -v -d elaadnl # +#python3 ids.py -c=train_reg_dso -r GradientBoostingRegressor -f only_pred_lag only_norm no_norm1 no_date1 no_grid_storage no_grid_sgen add_bus_relations -v -d elaadnl # +python3 ids.py -c=train_reg_dso -r DecisionTreeRegressor -f only_pred_lag only_norm no_norm1 no_date1 no_grid_storage no_grid_sgen add_bus_relations -v -d elaadnl # +python3 ids.py -c=train_reg_dso -r MLPRegressor -f only_pred_lag only_norm no_norm1 no_date1 no_grid_storage no_grid_sgen add_bus_relations -v -d elaadnl # + + +#exit 0 + +python3 ids.py -c=do_pred_dso -s 48 -R -v -d elaadnl + +python3 ids.py -c=get_features_clf_dso -v -d elaadnl + + + +#Novelty Detection without grid features: +# LocalOutlierFactor OneClassSVM EllipticEnvelope IsolationForest +python3 ids.py -c=optimize_clf_dso -C IsolationForest -F no_reg only_norm set_13 -v -d elaadnl -O # +python3 ids.py -c=optimize_clf_dso -C LocalOutlierFactor -F no_reg only_norm set_13 -v -d elaadnl -O # +#python3 ids.py -c=optimize_clf_dso -C LocalOutlierFactor -F no_reg only_norm train_conta set_13 -v -d elaadnl -O # +python3 ids.py -c=optimize_clf_dso -C OneClassSVM -F no_reg only_norm set_13 -v -d elaadnl -O # +python3 ids.py -c=optimize_clf_dso -C EllipticEnvelope -F no_reg only_norm set_13 -v -d elaadnl -O # + +#Novelty Detection with basic grid features: +python3 ids.py -c=optimize_clf_dso -C IsolationForest -F no_reg only_norm set_3 -v -d elaadnl -O # +python3 ids.py -c=optimize_clf_dso -C LocalOutlierFactor -F no_reg only_norm set_3 -v -d elaadnl -O # +# python3 ids.py -c=optimize_clf_dso -C LocalOutlierFactor -F no_reg only_norm train_conta set_3 -v -d elaadnl -O # +python3 ids.py -c=optimize_clf_dso -C OneClassSVM -F no_reg only_norm set_3 -v -d elaadnl -O # +python3 ids.py -c=optimize_clf_dso -C EllipticEnvelope -F no_reg only_norm set_3 -v -d elaadnl -O # + +#Novelty Detection with advanced grid features: +python3 ids.py -c=optimize_clf_dso -C IsolationForest -F no_reg only_norm add_grid_load_expo_static_100 set_35 -v -d elaadnl -O # +python3 ids.py -c=optimize_clf_dso -C LocalOutlierFactor -F no_reg only_norm add_grid_load_expo_static_100 set_35 -v -d elaadnl -O # +#python3 ids.py -c=optimize_clf_dso -C LocalOutlierFactor -F no_reg only_norm train_conta add_grid_load_expo_static_100 set_35 -v -d elaadnl -O # +python3 ids.py -c=optimize_clf_dso -C OneClassSVM -F no_reg only_norm add_grid_load_expo_static_100 set_35 -v -d elaadnl -O # +python3 ids.py -c=optimize_clf_dso -C EllipticEnvelope -F no_reg only_norm add_grid_load_expo_static_100 set_35 -v -d elaadnl -O # + +python3 ids.py -c=optimize_clf_dso -C IsolationForest -F no_reg only_norm add_grid_load_expo_rnd_2 set_352 -v -d elaadnl -O +python3 ids.py -c=optimize_clf_dso -C IsolationForest -F no_reg only_norm add_grid_load_expo_rnd_4 set_352 -v -d elaadnl -O +python3 ids.py -c=optimize_clf_dso -C IsolationForest -F no_reg only_norm add_grid_load_expo_rnd_6 set_352 -v -d elaadnl -O +python3 ids.py -c=optimize_clf_dso -C IsolationForest -F no_reg only_norm add_grid_load_expo_rnd_8 set_352 -v -d elaadnl -O +python3 ids.py -c=optimize_clf_dso -C IsolationForest -F no_reg only_norm add_grid_load_expo_rnd_10 set_352 -v -d elaadnl -O + + +python3 ids.py -c=eval_tuning_clf -v -d elaadnl -O #1 2 +#python3 ids.py -c=print_eval_tuning_clf -v -d elaadnl + +#TEST +python3 ids.py -c=load_data -v -d elaadnl_atk #1 2 +python3 ids.py -c=get_features_cpo -v -d elaadnl_atk #only for OSCP fix +python3 ids.py -c=get_features_dso -l 96 -v -d elaadnl_atk #1 2 + +python3 ids.py 
-c=do_pred_dso -s 48 -R -v -d elaadnl_atk
+#python3 ids.py -c=do_pred_dso -s 48 -R -v -d elaadnl_atk
+
+python3 ids.py -c=get_features_clf_dso -v -d elaadnl_atk #1
+python3 ids.py -c=get_is_atk_dfs_dso -v -d elaadnl_atk #1
+
+python3 ids.py -c=do_clf_dso -v -d elaadnl_atk -O
+
+#Eval Figures:
+# python3 ids.py -c=eval_clf_results4 -v -d elaadnl_atk #Figure 6
+# python3 ids.py -c=eval_clf_results62 -v -d elaadnl_atk -e 0.40 #Figure 7 & 8
+# python3 ids.py -c=eval_clf_results63 -v -d elaadnl_atk -e 0 #Figure 8
+# python3 ids.py -c=eval_clf_results7 -v -d elaadnl_atk -e 0 #Figure 10
+
+exit 0
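
The `optimize_clf_dso` / `do_clf_dso` stages in `run_ids.sh` train novelty detectors (IsolationForest, LocalOutlierFactor, OneClassSVM, EllipticEnvelope) on features from the benign run (`elaadnl`) and then score the attack run (`elaadnl_atk`). A minimal sketch of that novelty-detection setting with scikit-learn's IsolationForest is shown below; the feature matrix is synthetic and only stands in for the repository's actual feature sets (e.g. `set_13`, `set_3`, `set_35`), and the column names are illustrative:

```
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest

# Synthetic stand-in features: observed charge speed plus a regression residual.
rng = np.random.default_rng(1234)
train = pd.DataFrame({"charge_speed": rng.normal(50.0, 5.0, 500),
                      "prediction_error": rng.normal(0.0, 1.0, 500)})
test = pd.DataFrame({"charge_speed": rng.normal(50.0, 5.0, 200),
                     "prediction_error": rng.normal(0.0, 1.0, 200)})
test.loc[150:, "prediction_error"] += 8.0  # injected deviation mimicking an attack window

clf = IsolationForest(n_estimators=100, contamination="auto", random_state=1234)
clf.fit(train)                # fit on benign data only (novelty setting)
labels = clf.predict(test)    # +1 = inlier, -1 = flagged anomaly
print(f"{(labels == -1).sum()} of {len(test)} windows flagged")
```
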