diff --git a/README.md b/README.md index 198badd737e5a44f0b019257f898a09633f55d3c..9efa3a793ae664c6d943fc4d84d79b3c070a2aac 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ -## Run Simulation +## Install Simulation Extension ``` @@ -43,6 +43,7 @@ wget "https://platform.elaad.io/download-data/filedownload.php?file=elaadnl_open cp -r ./wattson-ev/wattson ./wattson/ cp -r ./wattson-ev/scenarios ./wattson/ +cp ./wattson-ev/run_wattson.sh ./wattson/ @@ -50,6 +51,33 @@ sudo python3 wattson/setup.py wattson sudo python3 -m pip install -e ./wattson sudo pip install ujson pyoscp websockets ocpp flask CherryPy cheroot openpyxl +``` + + +## Run Simulation +``` +#Single simulation run with e.g.: sudo python3 -m wattson wattson/scenarios/powerowl_example --no-cli --seed 5 + +#Full run (incl. attack and normal runs) with: +cd wattson +sudo bash run_wattson.sh ``` + + +## IDS Dependencies and Execution: + +``` +#Install IDS dependencies with: +sudo pip install scikit-learn==1.3.2 + +#Execute full IDS run (training and testing) with: +cd wattson/ids +bash run_ids.sh +``` + + + + + diff --git a/ids/features_aux.py b/ids/features_aux.py new file mode 100644 index 0000000000000000000000000000000000000000..4530bc681e046ba9c0f4c74abe3e3e8d12320887 --- /dev/null +++ b/ids/features_aux.py @@ -0,0 +1,512 @@ + +from collections import defaultdict +from datetime import datetime +import gc +import gzip +import os +from pathlib import Path +import pickle +import re +import warnings +import numpy as np +import pandas as pd +from tqdm import tqdm + +import json +from powerowl.simulators.pandapower import PandaPowerGridModel + + +import logging + +logger = logging.getLogger("WATTSON_EV_IDS.FeaturesAUX") + + + + +def discretize_hour_balancing(ts): + hour = ts.hour + if hour in [7, 8, 9]: + return "peak" + elif hour in [16, 17, 18, 19, 20, 21]: + return "peak" + elif hour in [22, 23, 24, 0, 1, 2, 3, 4, 5, 6]: + return "low" + elif hour in [10, 11, 12, 13, 14, 15]: + return "low" +def discretize_hour2(hour): + if hour in [5, 6, 7, 8, 9, 10, 11]: + return "Morning" + elif hour in [12, 13, 14, 15, 16]: + return "Afternoon" + elif hour in [17, 18, 19, 20, 21]: + return "Evening" + elif hour in [21, 22, 23, 0, 1, 2, 3, 4]: + return "Night" +def discretize_hour_only(hour): + if hour in [9, 10, 11, 12, 13, 14, 15, 16]: + return "Work" + elif hour in [22, 23, 0, 1, 2, 3, 4, 5]: + return "Sleep" + elif hour in [17, 18, 19, 20, 21, 6, 7, 8]: + return "Play" +def discretize_hour_ts(ts): + hour = ts.hour + return discretize_hour_only(hour) +def discretize_day_is_work(ts): + day = ts.weekday() + if day in [5, 6]: # weekend + return False + else: + return True +def discretize_hour_day(ts): + hour = ts.hour + day = ts.weekday() + if day in [5, 6]: # weekend + if hour in [22, 23, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]: + return "High-Home" + elif hour in [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]: + return "High-Home" + return "High-Leisure" + else: + if hour in [22, 23, 0, 1, 2, 3, 4, 5, 6, 7]: + return "High-Home" + elif hour in [8, 9, 10, 11, 12, 13, 14, 15, 16]: + return "High-Work" + elif hour in [17, 18, 19, 20, 21]: + return "High-Home" + return "High-Leisure" + else: + raise Exception("discretize_hour_only", hour) + +def get_date_exog(sum_df, prefix): + with warnings.catch_warnings(): + warnings.simplefilter('ignore', pd.errors.PerformanceWarning) + # sum_df['day_name'] = sum_df.index.to_series().dt.day_name() + sum_df[prefix+'dayofweek'] = sum_df.index.to_series().dt.dayofweek + sum_df[prefix+'hour'] = 
sum_df.index.to_series().dt.hour + sum_df[prefix+'discretize_hour_only'] = sum_df.index.to_series().apply( + discretize_hour_ts) + sum_df[prefix+'discretize_hour_day'] = sum_df.index.to_series().apply( + discretize_hour_day) + sum_df[prefix+'discretize_day_is_work'] = sum_df.index.to_series().apply( + discretize_day_is_work) + sum_df[prefix+'discretize_hour_balancing'] = sum_df.index.to_series().apply( + discretize_hour_balancing) + sum_df = sum_df.copy() + + +def get_date_exog_col(sum_df, col, prefix): + with warnings.catch_warnings(): + warnings.simplefilter('ignore', pd.errors.PerformanceWarning) + # sum_df['day_name'] = sum_df[[col]].to_series().dt.day_name() + sum_df[prefix+col+'_dayofweek'] = sum_df[col].dt.dayofweek + sum_df[prefix+col+'_hour'] = sum_df[col].dt.hour + sum_df[prefix+col+'_discretize_hour_only'] = sum_df[col].apply( + discretize_hour_ts) + sum_df[prefix+col+'_discretize_hour_day'] = sum_df[col].apply( + discretize_hour_day) + sum_df[prefix+col+'_discretize_day_is_work'] = sum_df[col].apply( + discretize_day_is_work) + sum_df[prefix+col+'_discretize_hour_balancing'] = sum_df[col].apply( + discretize_hour_balancing) + sum_df = sum_df.copy() + + +def normalize_cols(df, suffix, ntype="std", skip_norm="_norm"): + with warnings.catch_warnings(): + warnings.simplefilter('ignore', pd.errors.PerformanceWarning) + if ntype == "minmax": + for c in df.columns: + if skip_norm is not None and skip_norm in c: + continue + div = (df[c].max() - df[c].min()) + if div != 0: + df[c+suffix] = (df[c] - df[c].min()) / div + else: + df[c+suffix] = 0 + elif ntype == "std": + for c in df.columns: + if skip_norm is not None and skip_norm in c: + continue + div = df[c].std() + if div != 0: + df[c+suffix] = (df[c] - df[c].mean()) / div + else: + df[c+suffix] = 0 + else: raise ValueError(f"unk {ntype=}") + df = df.copy() + + + +def add_lags(df, num_lags=5, do_date=False, fillna="bfill", only_do_col=None): + init_cols = df.columns + do_cols = [c for c in init_cols if "is_attack" not in c] + if not do_date: + do_cols = [c for c in do_cols if "date_exog_" not in c] + if only_do_col is not None: + do_cols = [only_do_col] + new_cols=[] + with warnings.catch_warnings(): + warnings.simplefilter('ignore', pd.errors.PerformanceWarning) + for l in range(1, num_lags+1): + _new_cols = [(c+"_lag_"+str(l), c) for c in do_cols] + df[[c[0] for c in _new_cols]] = df[[c for c in do_cols]].shift(l) + new_cols.extend(_new_cols) + #df = df.copy() + for new_col, c in new_cols: + if fillna == "mean": + df[new_col].fillna(df[c].mean(), inplace=True) + elif fillna == "bfill": + df[new_col].fillna(method="bfill", inplace=True) + df.rename(columns={c: c+"_lag_0" for c in init_cols}, inplace=True) + return df + +def add_lags2(df, num_lags=5, do_date=False, fillna="bfill"): + init_cols = df.columns + do_cols = [c for c in init_cols if "is_attack" not in c] + if not do_date: + do_cols = [c for c in do_cols if "date_exog_" not in c] + new_cols=[] + with warnings.catch_warnings(): + warnings.simplefilter('ignore', pd.errors.PerformanceWarning) + for l in range(1, num_lags+1): + for c in do_cols: + new_col=c+"_lag_"+str(l) + new_cols.append((new_col, c)) + df[new_col] = df[c].shift(l) + #df = df.copy() + for new_col, c in new_cols: + if fillna == "mean": + df[new_col].fillna(df[c].mean(), inplace=True) + elif fillna == "bfill": + df[new_col].fillna(method="bfill", inplace=True) + df.rename(columns={c: c+"_lag_0" for c in init_cols}, inplace=True) + return df + +def add_all_lags(cpo_df_interp, num_lags=5, do_date=False, 
fillna="bfill", only_do_col=None): + for group, cp_df_interp in tqdm(cpo_df_interp.items(), desc="add_all_lags"): + cp_df_interp_l = add_lags(cp_df_interp, num_lags=num_lags, do_date=do_date, fillna=fillna, only_do_col=only_do_col) + yield group, cp_df_interp_l + + + +def get_grid_pp(DIR): + grid_dir = os.path.join(DIR,"controller-export/power-grid/") + grid_files = [f for f in os.listdir(grid_dir) if os.path.isfile(os.path.join(grid_dir, f))] + + for f in grid_files[:10]: + regex_grid = re.compile(r"^WALL\-[\-\dT\+]+__SIM\-([\-\dT\+]+)\.powerowl\.p\.gz$") + #WALL-2023-12-27T15-50-38-049544+00-00__SIM-2023-11-03T02-28-11-789274+00-00.powerowl.p.gz + result = regex_grid.search(f) + if result: + f_time = "".join(result.group(1).rsplit("-", 1)) + f_time = pd.to_datetime(f_time, format="%Y-%m-%dT%H-%M-%S-%f%z") #2023-11-23T19-05-29-585438+00-00 + + ff = os.path.join(grid_dir, f) + p_grid = load_single_feat(ff) + pd.reset_option('^display.', silent=True) + + pg = PandaPowerGridModel() + pg.from_primitive_dict(p_grid) + + pp=pg.to_external() + #print(pp) + return pp + logger.error(f"no pp grid found in {grid_dir}") + return None + + + +def interpolate_sim_time(ts, target): + target = target.timestamp() + if target in ts.index: + return ts[target] + ts1 = ts.sort_index() + b = (ts1.index > target).argmax() # index of first entry after target + s = ts1.iloc[b-1:b+1] + # Insert empty value at target time. + s = s.reindex(list(s.index.values) + [target]) + return s.interpolate(method='index', limit_direction="both").loc[target] + +def get_grids_json(DIR, wall_sim_map, start_time=None): + grid_dir = os.path.join(DIR,"power_grid_exports/") + grid_files = [f for f in os.listdir(grid_dir) if os.path.isfile(os.path.join(grid_dir, f))] + + pps=[] + for f in tqdm(grid_files[:], desc="get_grids_json"): + #power_grid_2024-02-03T19-04-14-008259+00-00.json + regex_grid = re.compile(r"^power_grid_([\-\dT\+]+)\.json$") + result = regex_grid.search(f) + if result: + f_time = "".join(result.group(1).rsplit("-", 1)) + f_time = pd.to_datetime(f_time, format="%Y-%m-%dT%H-%M-%S-%f%z") #2023-11-23T19-05-29-585438+00-00 + sim_time=interpolate_sim_time(wall_sim_map, pd.to_datetime(f_time)) + if start_time is not None: + if np.isnan(sim_time): + continue + s_t = datetime.utcfromtimestamp(sim_time) + #print(f"{start_time} {s_t}") + if s_t < start_time.tz_localize(None): + continue + + ff = os.path.join(grid_dir, f) + + #p_grid = load_single_feat(ff) + with open(ff, 'r') as pg_f: + p_grid = json.load(pg_f) + + p_grid["simtime"] = sim_time + pd.reset_option('^display.', silent=True) + pps.append(p_grid) + #yield p_grid + return pps + + +def get_elems_attatched_to_bus(pp, group_n): + ret=[] + lines=pp["line"] + try: + lines=lines[lines["from_bus"] < lines["to_bus"]] + if (lines["to_bus"] == group_n).any(): + line = "line."+str(lines[lines["to_bus"] == group_n].sort_values(by="from_bus", ascending=True).index[0]) + ret.append(line) + except Exception as e: + logger.error(group_n) + logger.error(pp["line"]) + logger.error(lines) + raise e + + elems = ["load", "sgen", "storage"] #, "storage" only for non CP storages...; loads not included in default wattson + for e in elems: + try: + el=pp[e] + if e == "storage": + el = el[el["type"] == "CP"] + # print(el) + # exit() + if (el["bus"] == group_n).any(): + for el_idx in el[el["bus"] == group_n].index: + el = e+"."+str(el_idx) + ret.append(el) + except Exception as e: + logger.error(group_n) + logger.error(el) + raise e + return ret + + +def get_ts_sim_map(DIR): + grid_dir = 
os.path.join(DIR,"controller-export/power-grid/") + grid_files = [f for f in os.listdir(grid_dir) if os.path.isfile(os.path.join(grid_dir, f))] + + lst=[] + ind=[] + for f in grid_files[:]: + regex_grid = re.compile(r"^WALL\-([\-\dT\+]+)__SIM\-([\-\dT\+]+)\.powerowl\.p\.gz$") + #WALL-2023-12-27T15-50-38-049544+00-00__SIM-2023-11-03T02-28-11-789274+00-00.powerowl.p.gz + result = regex_grid.search(f) + if result: + wall_time = "".join(result.group(1).rsplit("-", 1)) + wall_time = pd.to_datetime(wall_time, format="%Y-%m-%dT%H-%M-%S-%f%z").timestamp() #2023-11-23T19-05-29-585438+00-00 + + sim_time = "".join(result.group(2).rsplit("-", 1)) + sim_time = pd.to_datetime(sim_time, format="%Y-%m-%dT%H-%M-%S-%f%z").timestamp() #2023-11-23T19-05-29-585438+00-00 + + lst.append(sim_time) + ind.append(wall_time) + + #print(pp) + x = pd.Series(lst, index = ind) + return x + +def get_grid_measurements_from_export(DIR, pp, bus_ns=[], target_elem="load", target_var=".MEASUREMENT.active_power", start_time=None, time_delta_m=60): + plot_l_dict=defaultdict(lambda:list()) + if not bus_ns: + if "bus" in pp[target_elem]: + bus_ns = sorted(set(i for i in pp[target_elem]["bus"].values)) + else: + bus_ns = sorted(set(i for i in pp["load"]["bus"].values)) + logger.debug(f"no bus list provided, using all: {bus_ns}") + + wall_sim_map=get_ts_sim_map(DIR).sort_index() + pps=get_grids_json(DIR, wall_sim_map, start_time=start_time) + for bus_n in bus_ns: + + elems = get_elems_attatched_to_bus(pp, bus_n) + if target_elem == "bus": + elems = ["bus."+str(bus_n)] + # print(elems) + for elem in [e for e in elems if target_elem in e]: + # print(elem) + # exit() + + plot_t=[] + #plot_b=[] + plot_l=[] + for grid in pps: + if not np.isnan(grid["simtime"]): + plot_t.append(datetime.utcfromtimestamp(grid["simtime"]) + pd.Timedelta(minutes=time_delta_m)) + #plot_b.append(grid["values"]["bus."+str(bus_n)".MEASUREMENT.active_power"]) + plot_l.append(grid["values"][elem+target_var]) + else: + logger.debug(f"nan simtime for {grid['timestamp']}") + #plot_b_s=pd.Series(plot_b, index = plot_t).sort_index() + plot_l_s=pd.Series(plot_l, index = plot_t, name="grid_expo_"+elem+target_var).sort_index() + plot_l_dict[bus_n].append(plot_l_s) + return plot_l_dict + + +def save_feats(feat, OutDataDIR, t_name): + Path(OutDataDIR+"/feats/"+t_name+"/").mkdir(parents=True, exist_ok=True) + + groups=[k for k in feat.keys()] + for group in tqdm(groups, desc="save_feats", disable=(len(groups) == 1)): + fdf_dict = feat[group] + logger.debug(fdf_dict["cp_g_df"].columns) + with gzip.open(OutDataDIR+"/feats/"+t_name+"/"+group+".gz", "wb") as f: + pickle.dump(fdf_dict, f) + del feat[group] + gc.collect() + +def load_feats_len(OutDataDIR, t_name): + x_path = OutDataDIR+"/feats/"+t_name+"/" + onlyfiles = [f for f in os.listdir(x_path) if os.path.isfile(os.path.join(x_path, f))] + return len(onlyfiles) + +# def load_feats2(OutDataDIR, t_name): +# x_path = OutDataDIR+"/feats/"+t_name+"/" +# onlyfiles = [f for f in os.listdir(x_path) if os.path.isfile(os.path.join(x_path, f))] +# for fi in onlyfiles: +# fdf_dict = load_single_feat(x_path+fi) +# yield (fi.replace(".gz", ""), fdf_dict) + + + + +def load_feats(OutDataDIR, t_name): + for fi, fi_p in iter_feats(OutDataDIR, t_name): + fdf_dict = load_single_feat(fi_p) + yield (fi, fdf_dict) + +def iter_feats(OutDataDIR, t_name): + x_path = OutDataDIR+"/feats/"+t_name+"/" + onlyfiles = [f for f in os.listdir(x_path) if os.path.isfile(os.path.join(x_path, f))] + for fi in onlyfiles: + yield (fi.replace(".gz", ""), x_path+fi) 
+ +def load_single_feat(file_p): + with gzip.open(file_p, "rb") as f: + fdf_dict = pickle.load(f) + return fdf_dict + +def load_feats_CPg(OutDataDIR, t_name, cp_g): + fi = OutDataDIR+"/feats/"+t_name+"/"+cp_g+".gz" + if not os.path.isfile(os.path.join(fi)): + logger.error(f"file {fi} not found") + return None + + with gzip.open(fi, "rb") as f: + fdf_dict = pickle.load(f) + return fdf_dict + +def prune_feats(df_dict, feat_cases=None): + if feat_cases is None or "all" in feat_cases: + return + + feat_cols = df_dict["feat_cols"] + + + add_expo = [feat_case for feat_case in feat_cases if "add_grid_load_expo_" in feat_case] + if add_expo: + if len(add_expo) > 1: + logger.warning(f"multiple add_grid_load_expo_ sets provided: {add_expo}; using first") + add_expo = add_expo[0] + sub_case="static" + if "expo_rnd" in add_expo: + sub_case="rnd" + add_expo_fac_min = int(add_expo.split("_")[-1]) + add_expo_fac_max = min(add_expo_fac_min+10, 100) + #print(add_expo, add_expo_fac_min, add_expo_fac_max) + expo_cols = [c for c in df_dict["cp_g_df"].columns if "grid_expo_" in c and sub_case+"_fac."+str(add_expo_fac_min) in c] + #print(expo_cols) + #print(feat_cols) + feat_cols = feat_cols + expo_cols + #print(feat_cols) + #exit() + + if "add_bus_relations" in feat_cases: + for add_c in [c for c in df_dict["cp_g_df"].columns if ".bus_and_sgen." in c or ".no_ev_load." in c]: + feat_cols.append(add_c) + + for feat_case in feat_cases: + if feat_case == "no_cps": + feat_cols = [f for f in feat_cols if "CP_" not in f] + elif feat_case == "no_cps_but_speed": + feat_cols = [f for f in feat_cols if "CP_" not in f or "charge_speed" in f] + + elif feat_case == "no_grid": + #feat_cols = [f for f in feat_cols if "bus." not in f] + feat_cols = [f for f in feat_cols if "grid_est_" not in f] + feat_cols = [f for f in feat_cols if "grid_meas_" not in f] + elif feat_case == "no_grid_storage": + #feat_cols = [f for f in feat_cols if "bus." not in f] + feat_cols = [f for f in feat_cols if not ("grid_est_" in f and ".storage." in f)] + feat_cols = [f for f in feat_cols if not ("grid_meas_" in f and ".storage." in f)] + elif feat_case == "no_grid_line": + #feat_cols = [f for f in feat_cols if "bus." not in f] + feat_cols = [f for f in feat_cols if not ("grid_est_" in f and ".line." in f)] + feat_cols = [f for f in feat_cols if not ("grid_meas_" in f and ".line." in f)] + elif feat_case == "no_grid_sgen": + #feat_cols = [f for f in feat_cols if "bus." not in f] + feat_cols = [f for f in feat_cols if not ("grid_est_" in f and ".sgen." in f)] + feat_cols = [f for f in feat_cols if not ("grid_meas_" in f and ".sgen." in f)] + elif feat_case == "no_grid_bus_rel_to_pred": + pred_col = df_dict["pred_col"] + feat_cols = [f for f in feat_cols if not ("grid_meas_" in f and ".bus." 
in f and "_relation_to_"+pred_col.replace("_lag_0", "") in f)] + + elif feat_case == "no_est": + feat_cols = [f for f in feat_cols if "grid_est_" not in f] + elif feat_case == "no_meas": + feat_cols = [f for f in feat_cols if "grid_meas_" not in f] + + elif feat_case == "no_time_diff": + feat_cols = [f for f in feat_cols if "time_diff_" not in f] + + elif feat_case == "only_pred_lag": + pred_col = df_dict["pred_col"] + feat_cols = [f for f in feat_cols if "_lag_0" in f or f.startswith(pred_col.replace("_lag_0", "_lag_"))] + + elif feat_case == "only_norm": + feat_cols = [f for f in feat_cols if "_norm" in f] + elif feat_case == "only_norm_but_pred": + pred_col = df_dict["pred_col"] + feat_cols = [f for f in feat_cols if "_norm" in f or f.startswith(pred_col.replace("_lag_0", "_lag_"))] + elif feat_case == "no_norm": + feat_cols = [f for f in feat_cols if "_norm" not in f] + + elif feat_case == "no_norm1": + feat_cols = [f for f in feat_cols if "_cp_g_norm" not in f and "_cp_norm" not in f] + elif feat_case == "no_norm2": + feat_cols = [f for f in feat_cols if "_group_norm" not in f] + + elif feat_case == "no_date1": + feat_cols = [f for f in feat_cols if "date_exog_cp_g_" not in f and "date_exog_cp_" not in f] + elif feat_case == "no_date2": + feat_cols = [f for f in feat_cols if "date_exog_group_" not in f] + + elif feat_case == "no_hour_date": + feat_cols = [f for f in feat_cols if not ("date_exog_" in f and "hour" in f and "discretize" not in f)] + elif feat_case == "no_hour_only_date": + feat_cols = [f for f in feat_cols if not ("date_exog_" in f and "discretize_hour_only" in f)] + elif feat_case == "no_hour_day_date": + feat_cols = [f for f in feat_cols if not ("date_exog_" in f and "discretize_hour_day" in f)] + elif feat_case == "no_day_of_week_date": + feat_cols = [f for f in feat_cols if not ("date_exog_" in f and "dayofweek" in f)] + elif feat_case == "no_day_is_work_date": + feat_cols = [f for f in feat_cols if not ("date_exog_" in f and "discretize_day_is_work" in f)] + elif feat_case == "no_hour_balancing_date": + feat_cols = [f for f in feat_cols if not ("date_exog_" in f and "discretize_hour_balancing" in f)] + + + + df_dict["feat_cols"] = feat_cols diff --git a/ids/features_clf.py b/ids/features_clf.py new file mode 100644 index 0000000000000000000000000000000000000000..c979a29c373db54ef323467b96fd1b1be31416ed --- /dev/null +++ b/ids/features_clf.py @@ -0,0 +1,1357 @@ + +import ast +from collections import defaultdict +from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor +from copy import deepcopy +from datetime import datetime +import gc +import gzip +import hashlib +import json +import multiprocessing +import os +from pathlib import Path +import pickle +import sys +import threading +import time +import warnings +import pandas as pd +from sklearn.covariance import EllipticEnvelope +from sklearn.discriminant_analysis import StandardScaler +from sklearn.ensemble import IsolationForest +from sklearn.exceptions import ConvergenceWarning, UndefinedMetricWarning +from sklearn.metrics import accuracy_score +from sklearn.calibration import cross_val_predict +from sklearn.metrics import confusion_matrix, mean_squared_error, precision_recall_fscore_support +from sklearn.model_selection import ParameterGrid +from sklearn.neighbors import LocalOutlierFactor +from sklearn.pipeline import make_pipeline +from sklearn.svm import OneClassSVM +from tqdm import tqdm + + +from joblib import parallel_backend +from threadpoolctl import threadpool_limits + +import re + +import 
logging + +from features_aux import add_lags, get_date_exog, load_feats, load_feats_CPg, load_feats_len, normalize_cols, add_all_lags +from regression import get_cp_group_eval_dicts + +logger = logging.getLogger("WATTSON_EV_IDS.FeaturesClf") + +m_dict = multiprocessing.Manager() +CLF_CACHE=m_dict.dict() +#CLF_CACHE=dict() +with open("ids.conf", 'r') as f: + conf = json.load(f) + NUM_THREADS = conf["NUM_THREADS"] + + +feat_warn1=0 +feat_warn2=0 +def save_clf_data(feat, out_d, out_f): + #cp_g_dict = {'reg': 'LinearSVR', 'shifts': '5', 'file': 'data/year/predictions/DSO.all/CP_1_LinearSVR_5.csv.gz', 'features': ['all']} + Path(out_d).mkdir(parents=True, exist_ok=True) + + logger.debug(feat.columns) + feat.to_csv(out_f) + del feat + gc.collect() + +def add_grid_meas_relations(clf_df, cp_g_df, pred_col, speed_preds): + get_rel_expo_cols = [c for c in cp_g_df.columns if ".load." in c and "_norm" not in c and "grid_expo_" in c and ".active_power" in c and "_relation_to_" not in c] # and "10_" in c + get_rel_expo_cols_base = [c for c in cp_g_df.columns if ".load." in c and "_norm" not in c and "grid_expo_" in c and ".active_power_lag_0" in c and "_relation_to_" not in c]#. + get_rel_expo_cols_map=defaultdict(lambda:list()) + + if len(get_rel_expo_cols_base) > 1: #'grid_expo_0.0.load.14.active_power_lag_0', 'grid_expo_0.0.load.6.active_power_lag_0', + load_ns = [c.split(".")[3] for c in get_rel_expo_cols_base] + for rel_expo in get_rel_expo_cols: + for load_n in load_ns: + if ".load."+load_n+"." in rel_expo: + rel_expo_base = rel_expo.replace(".load."+load_n+".",".load.total.") + get_rel_expo_cols_map[rel_expo_base].append(rel_expo) + for rel_expo_base,rel_expos in get_rel_expo_cols_map.items(): + cp_g_df[rel_expo_base]=0 + get_rel_expo_cols.append(rel_expo_base) + for rel_expo in rel_expos: + cp_g_df[rel_expo_base]+=cp_g_df[rel_expo] + + + # print([c for c in clf_df.columns]) + # print([c for c in cp_g_df.columns if "_norm" not in c and "grid_" in c and "sgen" in c]) + # print([c for c in cp_g_df.columns if "_norm" not in c and "grid_meas_" in c and "sgen" in c and ".active_power_lag_0" in c and "_relation_to_" not in c]) + # print([c for c in cp_g_df.columns if "_norm" not in c and "grid_meas_" in c and "bus" in c and ".active_power_lag_0" in c and "_relation_to_" not in c]) + # print(get_rel_expo_cols) + #print(cp_g_df) + + new_cols=[] + + #max ev power = bus + sgen - load(s) + bus_cols = [c for c in cp_g_df.columns if ".bus." in c and ".active_power" in c and "_norm" not in c and "_lag_0" in c and "_relation_to_" not in c] + bus_col = bus_cols[0] + sgen_cols = [c for c in cp_g_df.columns if ".sgen." 
in c and ".active_power" in c and "_norm" not in c and "_lag_0" in c and "_relation_to_" not in c] + + name_b_total = bus_col.replace("active_power","total_power") + clf_df[name_b_total] = cp_g_df[bus_col] + for sgen_col in sgen_cols: + cp_g_df_sgen_col = cp_g_df[sgen_col] + clf_df[name_b_total] += cp_g_df_sgen_col #grid_meas_585.10030.bus.11.active_power_lag_0 + new_cols.append(name_b_total) + + non_ev_power = bus_col.replace("active_power","non_ev_power") + clf_df[non_ev_power] = clf_df[name_b_total] - cp_g_df[pred_col] # (bus + sgen) - ev load = loads + new_cols.append(non_ev_power) + + non_ev_power_relation_to_bus = bus_col.replace("active_power","non_ev_power_relation_to_bus") + clf_df[non_ev_power_relation_to_bus] = clf_df[non_ev_power] / clf_df[name_b_total] + new_cols.append(non_ev_power_relation_to_bus) + + ev_power_relation_to_bus = bus_col.replace("active_power","only_ev_power_relation_to_bus") + clf_df[ev_power_relation_to_bus] = cp_g_df[pred_col] / clf_df[name_b_total] + new_cols.append(ev_power_relation_to_bus) + + for rec in get_rel_expo_cols: + name = rec.replace("_lag_0","")+"_max_ev_power"+"_lag_0" + clf_df[name] = clf_df[name_b_total] - cp_g_df[rec] + new_cols.append(name) + + name2 = rec.replace("_lag_0","")+"_max_ev_power_relation_to_bus"+"_lag_0" + clf_df[name2] = clf_df[name] / clf_df[name_b_total] + new_cols.append(name2) + + name2 = rec.replace("_lag_0","")+"_non_ev_power_relation_to_expo"+"_lag_0" + clf_df[name2] = cp_g_df[rec] / clf_df[non_ev_power] + new_cols.append(name2) + + + name = rec.replace("_lag_0","")+"_relation_to_"+pred_col.replace("_lag_0","")+"_lag_0" #relation: grid load measurement to received charge speed + clf_df.loc[(cp_g_df[rec] != 0), name] = cp_g_df[pred_col] / cp_g_df[rec] + clf_df.loc[(cp_g_df[rec] == 0), name] = 0 + new_cols.append(name) + + for speed_pred in speed_preds: + name = rec.replace("_lag_0","")+"_relation_to_"+speed_pred.replace("_lag_0","")+"_lag_0" #relation: grid load measurement to predicted charge speeds + clf_df.loc[(cp_g_df[rec] != 0), name] = clf_df[speed_pred] / cp_g_df[rec] + clf_df.loc[(cp_g_df[rec] == 0), name] = 0 + new_cols.append(name) + + if len(bus_cols) >= 1: + if len(bus_cols) > 1: + logger.warning(f"len(bus_cols) > 1: {bus_cols}") + bus_col = bus_cols[0] + name = bus_col.replace("_lag_0","")+"_relation_to_"+rec.replace("_lag_0","")+"_lag_0" #relation: grid load measurement to bus measurement + clf_df.loc[(cp_g_df[rec] != 0), name] = cp_g_df[bus_col] / cp_g_df[rec] + clf_df.loc[(cp_g_df[rec] == 0), name] = 0 + new_cols.append(name) + else: + logger.warning(f"bus_cols empty: {[c for c in cp_g_df.columns if 'bus' in c]}") + + + clf_df = clf_df.copy() + + # print(clf_df) + # #print(new_cols) + # exit() + + if False: + new_cols_df = clf_df[new_cols] + normalize_cols(new_cols_df, "_prune_norm") + for c in new_cols_df.columns: + if "_norm" in c and c not in cp_g_df.columns: + new_c = c.replace("_lag_0_prune_norm", "_prune_norm_lag_0") + clf_df[new_c] = new_cols_df[c] + clf_df = clf_df.copy() + return clf_df + + +def get_clf_feat_dfs(cp_g_pred, cp_g_df, pred_col): + with warnings.catch_warnings(): + warnings.simplefilter('ignore', pd.errors.PerformanceWarning) + pred_cols = [c for c in cp_g_pred.columns if "prediction_" in c] + clf_df = cp_g_pred[pred_cols].copy() + clf_df["prediction_mean"] = cp_g_pred[[c for c in cp_g_pred.columns if "prediction_" in c]].mean(axis=1) + clf_df["prediction_min"] = cp_g_pred[[c for c in cp_g_pred.columns if "prediction_" in c and "_mean" not in c]].min(axis=1) + 
clf_df["prediction_max"] = cp_g_pred[[c for c in cp_g_pred.columns if "prediction_" in c and "_mean" not in c]].max(axis=1) + if len(pred_cols) >= 2: + clf_df["prediction_mean_0_1"] = cp_g_pred[[c for c in cp_g_pred.columns if "prediction_0" in c or "prediction_1" in c]].mean(axis=1) + if len(pred_cols) >= 3: + clf_df["prediction_mean_0_1_2"] = cp_g_pred[[c for c in cp_g_pred.columns if "prediction_0" in c or "prediction_1" in c or "prediction_2" in c]].mean(axis=1) + clf_df["prediction_mean_1_2"] = cp_g_pred[[c for c in cp_g_pred.columns if "prediction_1" in c or "prediction_2" in c]].mean(axis=1) + if len(pred_cols) >= 4: + clf_df["prediction_mean_2_3"] = cp_g_pred[[c for c in cp_g_pred.columns if "prediction_2" in c or "prediction_3" in c]].mean(axis=1) + if len(pred_cols) >= 5: + clf_df["prediction_mean_3_4"] = cp_g_pred[[c for c in cp_g_pred.columns if "prediction_3" in c or "prediction_4" in c]].mean(axis=1) + # print(clf_df) + # exit() + pred_col_df = cp_g_df[pred_col].loc[clf_df.index] + # if len(clf_df) != len(pred_col_df): + # print(clf_df) + # print(pred_col_df) + for c in clf_df.columns: + if "_mean" in c or "_min" in c or "_max" in c: + clf_df[c+"_diff"] = clf_df[c] - pred_col_df + clf_df[c+"_diff_abs"] = (clf_df[c] - pred_col_df).abs() + #clf_df[c+"_diff_rmse"] = mean_squared_error(clf_df[c], pred_col_df, squared=False) + clf_df = clf_df.copy() + + clf_df = add_grid_meas_relations(clf_df, cp_g_df, pred_col, ["prediction_mean", "prediction_min", "prediction_max"]) #pred_cols+ + # print(clf_df) + # for c in clf_df.columns: + # print(c) + # exit() + + normalize_cols(clf_df, "_clf_norm") + #print(clf_df) + return clf_df + + +def get_clf_feat_file_dicts(OutDataDIR): + regex = re.compile(r"^(CP_\d+)_(\w+)_([\d\w]+).csv.gz$") + + ret_d=defaultdict(lambda:defaultdict(lambda:list())) + DIR=OutDataDIR+"/clf_feats/" + sub_folders = [name for name in os.listdir(DIR) if os.path.isdir(os.path.join(DIR, name))] + for actor in sub_folders: + if ".bak" in actor: + logger.warning(f'skiping {OutDataDIR+"/clf_feats/"+actor}') + continue + if len(actor.split(".")) > 1: + features = actor.split(".")[1:] + else: + features=["all"] + logger.warning(f"assuming all features for {actor}") + x_path=OutDataDIR+"/clf_feats/"+actor + onlyfiles = [f for f in os.listdir(x_path) if os.path.isfile(os.path.join(x_path, f))] + for f in onlyfiles: + result = regex.search(f) + if result: + result.group(1) + ret_d[actor][result.group(1)].append({"reg":result.group(2),"shifts":result.group(3),"file":os.path.join(x_path, f),"features":features}) + else: + logger.error(f"unk file {f}") + continue + return ret_d + + +def get_clf_is_atk_dfs(OutDataDIR, group): + is_atk_cols = ['meter_value_sampled_value_lag_0', 'custom_data_meter_no_atk_lag_0', + 'custom_data_energy_interval_lag_0', 'custom_data_original_energy_interval_lag_0', + 'custom_data_average_power_lag_0', 'custom_data_original_average_power_lag_0', + 'meter_diff_lag_0', 'custom_data_meter_diff_lag_0', + 'charge_speed_lag_0', 'custom_data_charge_speed_lag_0','is_attack_lag_0'] + + is_atk_dfs = dict() + + cpos = [f for f in os.listdir(OutDataDIR+"/feats/") if os.path.isdir(os.path.join(OutDataDIR+"/feats/", f)) and f.startswith("CPO_")] + for actor_prefix in sorted(cpos): + act_feat = load_feats_CPg(OutDataDIR, actor_prefix, group) #generator + df = act_feat["cp_g_df"] + is_atk_dfs[actor_prefix] = df[is_atk_cols].copy() + + return is_atk_dfs + + +def _interp(s, targets, col="charge_speed_lag_0", method="time"): + o_idx = list(s.index.values) + n_idx = 
list(targets.values) # if v not in o_idx + f_idx=set(o_idx + n_idx) + s2 = s.reindex(pd.to_datetime(list(f_idx))) + if col is None: + return s2.interpolate(method=method).loc[targets] + else: + return s2[col].interpolate(method=method).loc[targets] + +def _interp1(ts, target, col="charge_speed_lag_0"): + if target in ts.index: + return ts[col].loc[target] + if target < ts.index.min() or target > ts.index.max(): #before or after table + return 0.0 + ts1 = ts.sort_index() + b = (ts1.index > target).argmax() # index of first entry after target + s = ts1.iloc[b-1:b+1] + if len(s)==0: #before or after table + return 0.0 + # Insert empty value at target time. + s = s.reindex(pd.to_datetime(list(s.index.values) + [target])) + return s[col].interpolate(method='time').loc[target] + +def get_clf_is_atk_grid(cp_group_grid_power, is_atk_dfs, target, col="charge_speed_lag_0", col_s=None): + val = 0 + val_s = 0 + if col_s is None: + col_s = "custom_data_" + col + + val += _interp(cp_group_grid_power, target, col=None) + for actor, is_atk_df in is_atk_dfs.items(): + val_s += _interp(is_atk_df, target, col=col_s) + + return (val - val_s).rename(cp_group_grid_power.name+"_diff_to_"+col_s) + +def get_clf_is_atk(is_atk_dfs, target, col="charge_speed_lag_0", col_s=None): + val = 0 + val_s = 0 + if col_s is None: + col_s = "custom_data_" + col + for actor, is_atk_df in is_atk_dfs.items(): + val += _interp(is_atk_df, target, col=col) + if col != "is_attack_lag_0": + val_s += _interp(is_atk_df, target, col=col_s) + return (val - val_s).rename(col+"_diff") + +def get_param_grid_len(clf_type): + return len(ParameterGrid(get_param_grid_from_string(clf_type))) + +def get_param_grid_from_string(clf_type, train_contamination=False): + if clf_type == "LocalOutlierFactor": + if train_contamination == True: + param_grid = {'algorithm': ['auto'], 'leaf_size': [30,20,40], 'metric': ['minkowski'], 'metric_params': [None], 'n_jobs': [NUM_THREADS], 'n_neighbors': [20,10,30], 'novelty': [True], 'p': [1,2], 'contamination': ['auto']} + else: + param_grid = {'algorithm': ['auto'], 'leaf_size': [30,20,40], 'metric': ['minkowski'], 'metric_params': [None], 'n_jobs': [NUM_THREADS], 'n_neighbors': [20,10,30], 'novelty': [True], 'p': [1,2], 'contamination': [1e-323]} + elif clf_type == "OneClassSVM": + param_grid = {'cache_size': [7000], 'kernel': ['rbf', 'sigmoid'], 'max_iter': [200], 'nu': [0.5,0.2,0.7], 'shrinking': [True], 'tol': [0.001], 'verbose': [False]} + # param_grid1 = {'cache_size': [7000], 'coef0': [0.0, 0.5, 2.0], 'gamma': ['scale'], 'kernel': ['poly', 'sigmoid'], 'max_iter': [-1], 'nu': [0.5,0.2,0.7], 'shrinking': [True,False], 'tol': [0.001,0.01,0.0001], 'verbose': [False]} + # param_grid2 = {'cache_size': [7000], 'degree': [3,6,9], 'gamma': ['scale'], 'kernel': ['poly'], 'max_iter': [-1], 'nu': [0.5,0.2,0.7], 'shrinking': [True,False], 'tol': [0.001,0.01,0.0001], 'verbose': [False]} + # param_grid3 = {'cache_size': [7000], 'kernel': ['rbf', 'linear'], 'max_iter': [-1], 'nu': [0.5,0.2,0.7], 'shrinking': [True,False], 'tol': [0.001,0.01,0.0001], 'verbose': [False]} + # param_grid = [param_grid1, param_grid2, param_grid3] + elif clf_type == "EllipticEnvelope": + # param_grid = {'assume_centered': [False,True], 'contamination': [1e-323], 'random_state': [12345], 'store_precision': [True], 'support_fraction': [None,0.1,0.5,0.8,0.9,0.99], "random_state": [12345]} #The covariance matrix of the support data is equal to 0, try to increase support_fractio + if train_contamination == True: + param_grid = {'assume_centered': 
[False,True], 'contamination': [0.1], 'random_state': [12345], 'store_precision': [True], 'support_fraction': [0.7,0.8,0.9,0.99]} + else: + param_grid = {'assume_centered': [False,True], 'contamination': [1e-323], 'random_state': [12345], 'store_precision': [True], 'support_fraction': [0.7,0.8,0.9,0.99]} + elif clf_type == "IsolationForest": + if train_contamination == True: + param_grid = {'bootstrap': [False,True], 'contamination': ['auto'], 'max_features': [1.0,0.9], 'max_samples': ['auto',0.9,0.8], 'n_estimators': [50,100,200], 'n_jobs': [NUM_THREADS], 'random_state': [12345], 'verbose': [0], 'warm_start': [False]} + else: + param_grid = {'bootstrap': [False,True], 'contamination': [1e-323], 'max_features': [1.0,0.9], 'max_samples': ['auto',0.9,0.8], 'n_estimators': [50,100,200], 'n_jobs': [NUM_THREADS], 'random_state': [12345], 'verbose': [0], 'warm_start': [False]} + else: + logger.error(f"unk clf_type {clf_type}") + param_grid = {} + return param_grid + +def get_clfs_from_string(clf_type, config=None, train_contamination=False): + param_grid = get_param_grid_from_string(clf_type, train_contamination=train_contamination) + if config is None: + pg = ParameterGrid(param_grid) + else: + pg = [config] + for p in pg: + if clf_type == "LocalOutlierFactor": + yield LocalOutlierFactor(**p) + elif clf_type == "OneClassSVM": + yield OneClassSVM(**p) + elif clf_type == "EllipticEnvelope": + yield EllipticEnvelope(**p) + elif clf_type == "IsolationForest": + yield IsolationForest(**p) + else: + logger.error(f"unk clf_type {clf_type}") + yield None + + + +def get_clf_result_output_conc(best_d_ag, cp_g_dict, atk_cp_g_dict, train_df, atk_df, TRAIN_START_DATE, ATK_START_DATE, atk_labels_df, pbar=None, allow_diff_shifts=True): + res_l=[] + th_i=0 + # data_lock=threading.Lock() + # data_lock2=threading.Lock() + # pbar2=None + # with ThreadPoolExecutor(NUM_THREADS) as pool: + m = multiprocessing.Manager() + data_lock=m.Lock() + data_lock2=m.Lock() + pbar2=pbar + pbar=None + with ProcessPoolExecutor(NUM_THREADS) as pool: #multiprocessing + results=[] + for clf, best_dict in best_d_ag.items(): + th_i+=1 + if th_i % 2 == 0: + with data_lock: + results.append( pool.submit(get_clf_result_output, clf, best_dict, cp_g_dict, atk_cp_g_dict, train_df, atk_df, TRAIN_START_DATE, ATK_START_DATE, atk_labels_df, pbar, allow_diff_shifts, data_lock) ) + else: + with data_lock2: + results.append( pool.submit(get_clf_result_output, clf, best_dict, cp_g_dict, atk_cp_g_dict, train_df, atk_df, TRAIN_START_DATE, ATK_START_DATE, atk_labels_df, pbar, allow_diff_shifts, data_lock2) ) + time.sleep(0.05) + + if th_i % NUM_THREADS == 0 or th_i == 2 or th_i == 4: + for r in results: + timestamp1 = time.time() + clf_results = r.result() + timestamp2 = time.time() + logger.debug("r.result() took %.2f seconds" % (timestamp2 - timestamp1)) + if pbar2 is not None: + pbar2.update(1) + if clf_results is not None: + res_l.append(clf_results) + results=[] + gc.collect() + + + for r in results: + clf_results = r.result() + if pbar2 is not None: + pbar2.update(1) + if clf_results is not None: + res_l.append(clf_results) + + return res_l + +def do_get_clf_is_atk_conc(pool, atk_df, ATK_START_DATE, is_atk_dfs, cp_group_grid_power, pbar=None): + return pool.submit(do_get_clf_is_atk_wrap, atk_df, ATK_START_DATE, is_atk_dfs, cp_group_grid_power, pbar=pbar) + +def do_get_clf_is_atk_wrap(atk_df, ATK_START_DATE, is_atk_dfs, cp_group_grid_power, pbar=None): + end_d = max([v.index.max() for v in is_atk_dfs.values()]) + atk_df = 
atk_df.loc[ATK_START_DATE:end_d] + # print(atk_df) + # print([k for k in is_atk_dfs.keys()]) + # print(is_atk_dfs["CPO_0"]) + clf_is_atk = do_get_clf_is_atk(is_atk_dfs, atk_df, cp_group_grid_power) + if pbar is not None: + pbar.update(1) + return clf_is_atk + +def do_get_clf_is_atk(is_atk_dfs, atk_full_clf_feat, cp_group_grid_power): + clf_is_atk1 = get_clf_is_atk(is_atk_dfs, atk_full_clf_feat.index) + clf_is_atk2 = get_clf_is_atk(is_atk_dfs, atk_full_clf_feat.index, col="is_attack_lag_0") + # print(cp_group_grid_power) + # print(is_atk_dfs) + clf_is_atk3 = get_clf_is_atk_grid(cp_group_grid_power, is_atk_dfs, atk_full_clf_feat.index) + clf_is_atk4 = get_clf_is_atk_grid(cp_group_grid_power, is_atk_dfs, atk_full_clf_feat.index, col_s="charge_speed_lag_0") + + #clf_is_atk = clf_is_atk1.to_frame().join(clf_is_atk2) + #print(clf_is_atk) + clf_is_atk = pd.concat([clf_is_atk1,clf_is_atk2,clf_is_atk3,clf_is_atk4 ],axis=1) + #print(clf_is_atk) + #exit() + clf_is_atk = clf_is_atk.loc[atk_full_clf_feat.index] + clf_is_atk["is_attack1"] = 1 + clf_is_atk["is_attack2"] = 1 + clf_is_atk["is_attack3"] = 1 + clf_is_atk["is_attack4"] = 1 + clf_is_atk.loc[clf_is_atk[clf_is_atk1.name] != 0, "is_attack1"] = -1 #"charge_speed_lag_0_diff" + clf_is_atk.loc[clf_is_atk[clf_is_atk2.name] != 0, "is_attack2"] = -1 #"is_attack_lag_0_diff" + clf_is_atk.loc[clf_is_atk[clf_is_atk3.name] != 0, "is_attack3"] = -1 #grid vs should speed + clf_is_atk.loc[clf_is_atk[clf_is_atk4.name] != 0, "is_attack4"] = -1 #grid vs reported speed + # print(clf_is_atk) + # print(clf_is_atk['is_attack1'].equals(clf_is_atk['is_attack2'])) + + # clf_is_atk["is_attack_d"] = 1 + # clf_is_atk.loc[clf_is_atk["is_attack1"] != clf_is_atk["is_attack2"], "is_attack_d"] = -1 + + ##print(clf_is_atk[clf_is_atk["is_attack_d"] == 1]) + # print(clf_is_atk[clf_is_atk["is_attack_d"] == -1]) #TODO: why the diff + + # print(atk_full_clf_feat) + # print(clf_is_atk) + # print(y_p) + return clf_is_atk + + +def get_atk_full_clf_feat(atk_clf_feat_file, clf_f, atk_df, atk_labels_df, ATK_START_DATE, allow_diff_shifts=True): + atk_cp_g_clf_feat = pd.read_csv(atk_clf_feat_file, index_col=0, parse_dates=[0]) + atk_df = atk_df.loc[atk_cp_g_clf_feat.index.min():atk_cp_g_clf_feat.index.max()] + + if clf_f is None: + clf_f=list(atk_cp_g_clf_feat.columns)+list(atk_df.columns) + if allow_diff_shifts: + clf_f_bak = clf_f + clf_f_l = len(clf_f) + atk_full_clf_feat_columns=list(atk_cp_g_clf_feat.columns)+list(atk_df.columns) + clf_f = [c for c in clf_f if c in atk_full_clf_feat_columns] + if len(clf_f) != clf_f_l: + # logger.info(f"train and test used diff features! {clf_f_bak=}") + # logger.info(f"train and test used diff features! {clf_f=}") + # logger.info(f"train and test used diff features! {[f for f in clf_f_bak if f not in clf_f]}") + # exit() + + global feat_warn1 + if feat_warn1 <=1: + feat_warn1+=1 + logger.warning(f"train and test used diff features! 
{allow_diff_shifts=}") + + atk_cp_g_clf_feat = atk_cp_g_clf_feat[[c for c in clf_f if c in atk_cp_g_clf_feat.columns]] + atk_df = atk_df[[c for c in clf_f if c in atk_df.columns]] + + #end_d = max([v.index.max() for v in is_atk_dfs.values()]) + end_d = atk_labels_df.index.max() + atk_full_clf_feat = pd.concat([atk_cp_g_clf_feat, atk_df], axis=1).loc[ATK_START_DATE:end_d][clf_f] + del atk_cp_g_clf_feat + del atk_df + gc.collect() + return atk_full_clf_feat, clf_f + +def get_clf_result_output(clf, best_dict, cp_g_dict, atk_cp_g_dict, train_df, atk_df, TRAIN_START_DATE, ATK_START_DATE, atk_labels_df, pbar=None, allow_diff_shifts=True, data_lock=None): + #print(clf,best_dict) + + if data_lock is not None: + data_lock.acquire() + + clf_feat_file = get_clf_feat_for_conf(cp_g_dict, best_dict) + atk_clf_feat_file = get_clf_feat_for_conf(atk_cp_g_dict, best_dict, allow_diff_shifts=allow_diff_shifts) + # print(cp_g_dict) + # print(atk_cp_g_dict) + # print(best_dict) + if atk_clf_feat_file is None: + logger.warning(f"no atk_clf_feat_file found for {clf=} in {atk_cp_g_dict=}") + if pbar is not None: + pbar.update(1) + if data_lock is not None: + data_lock.release() + return None + #print(clf_feat_file) + #print(atk_clf_feat_file) + clf_n = clf.split(".")[0] + clf_c = best_dict["conf"] + clf_c = ast.literal_eval(clf_c) + _clf_f = best_dict["feat"] + clf_o = best_dict["offset"] + clf_r = best_dict["reg"] + #print(f"TEST: {clf=} {clf_c=} {_clf_f=} {clf_o=} {clf_r=} {[k for k in best_dict.keys()]}") + + clf_only = clf.replace("."+clf_r, "") + clf_features_string = clf_only.split(".")[1:] + do_scale_pipeline = False + if "only_norm" in clf_features_string: + do_scale_pipeline = True + + #print(f"TEST: {clf_only=} {clf_r=} {clf_features_string=} {do_scale_pipeline=}") + + #print(atk_clf_feat_file) + atk_full_clf_feat, clf_f = get_atk_full_clf_feat(atk_clf_feat_file, _clf_f, atk_df, atk_labels_df, ATK_START_DATE, allow_diff_shifts) + + + cp_g_clf_feat = pd.read_csv(clf_feat_file, index_col=0, parse_dates=[0]) + train_df = train_df.loc[cp_g_clf_feat.index.min():cp_g_clf_feat.index.max()] + + cp_g_clf_feat = cp_g_clf_feat[[c for c in clf_f if c in cp_g_clf_feat.columns]] + train_df = train_df[[c for c in clf_f if c in train_df.columns]] + + + full_clf_feat = pd.concat([cp_g_clf_feat, train_df], axis=1).loc[TRAIN_START_DATE:ATK_START_DATE][clf_f] + del cp_g_clf_feat + del train_df + gc.collect() + + + # full_clf_feat = full_clf_feat[clf_f] + # atk_full_clf_feat = atk_full_clf_feat[clf_f] + # gc.collect() + + if data_lock is not None: + data_lock.release() + + # print(clf_n, type(clf_n)) + # print(clf_c, type(clf_c)) + # print(clf_f, type(clf_f)) + # print(clf_o, type(clf_o)) + # exit() + + #is_atk_pred = do_clf(clf_n, clf_c, full_clf_feat, atk_full_clf_feat, offset_=0, pbar=pbar) + try: + y_p = do_clf(clf_n, clf_c, full_clf_feat, atk_full_clf_feat, pbar=pbar, overwrite_jobs=2, scale_pipeline=do_scale_pipeline) + except Exception as e: #TODO: handle "ValueError: The covariance matrix of the support data is equal to 0, try to increase support_fraction" + clf_n=clf_n+"_"+str(e) + y_p = [1 for x in atk_full_clf_feat.values] + + #is_atk_pred_0 = [1 if p >= 0 else -1 for p in y_p] + is_atk_pred = [1 if p >= clf_o else -1 for p in y_p] + #print(is_atk_pred) + + # print(full_clf_feat) + # print(atk_full_clf_feat) + # print(is_atk_dfs["CPO_0"].loc[ATK_START_DATE:]) + timestamp1 = time.time() + #clf_is_atk = do_get_clf_is_atk(is_atk_dfs, atk_full_clf_feat) + clf_is_atk = atk_labels_df + timestamp2 = time.time() + 
logger.debug("do_get_clf_is_atk() took %.2f seconds" % (timestamp2 - timestamp1)) + + clf_results = get_clf_eval(y=clf_is_atk['is_attack1'], y_pred=is_atk_pred) + #clf_results = get_clf_eval(y=clf_is_atk['is_attack2'], y_pred=is_atk_pred) + clf_results["clf"] = clf + clf_results["clf_n"] = clf_n + clf_results["clf_f"] = clf_f + clf_results["clf_o"] = clf_o + clf_results["clf_r"] = clf_r + #print(clf_results) + clf_results["decision_function"] = list(y_p) + return clf_results + + + +def get_clf_id(clf_n, estimator, df, feat_cols, conf): + h = hashlib.new('sha256') + h.update(repr(conf).encode("utf-8")) + h.update(pd.util.hash_pandas_object(df, index=True).values) + h.update(repr(sorted(feat_cols)).encode("utf-8")) + h.update(repr(estimator).encode("utf-8")) + ha = h.hexdigest() + clf_id = (clf_n, ha) + return clf_id + +def do_clf(clf_type, config, full_clf_feat, atk_full_clf_feat, offset_=None, pbar=None, scale_pipeline=False, overwrite_jobs=None): + # print([c for c in full_clf_feat.columns]) + # exit() + X = full_clf_feat.values + atk_X = atk_full_clf_feat.values + + if overwrite_jobs is not None: + if "n_jobs" in config: + config["n_jobs"] = overwrite_jobs + + estimators = [e for e in get_clfs_from_string(clf_type, config=config)] # + if len(estimators) > 1: + raise ValueError(f"len(estimators) > 1: {estimators=}") + + + if not sys.warnoptions and (clf_type == "OneClassSVM" or clf_type == "EllipticEnvelope"): + warnings.simplefilter("ignore") + os.environ["PYTHONWARNINGS"] = "ignore" # Also affect subprocesses + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=ConvergenceWarning) + for estimator in estimators: #should be only one since config... + if scale_pipeline: + estimator = make_pipeline(StandardScaler(),estimator) + + timestamp1 = time.time() + clf_id = get_clf_id(clf_type, estimator, full_clf_feat, [c for c in full_clf_feat.columns], config) + if clf_id in CLF_CACHE: + logger.debug(f"hit {len(CLF_CACHE)}") + estimator = deepcopy(CLF_CACHE[clf_id]) + else: + logger.debug(f"miss {len(CLF_CACHE)}") + estimator.fit(X=X) + CLF_CACHE[clf_id] = estimator + + timestamp2 = time.time() + logger.debug("fit() took %.2f seconds" % (timestamp2 - timestamp1)) + + + y_p = estimator.decision_function(X=atk_X) + timestamp3 = time.time() + logger.debug("decision_function() took %.2f seconds" % (timestamp3 - timestamp2)) + #y_pred = estimator.predict(X=atk_X) + if pbar is not None: + pbar.update(1) + return y_p + +def get_clf_eval(y, y_pred): + with warnings.catch_warnings(): + warnings.simplefilter('ignore', UndefinedMetricWarning) + #precision0, recall0, fscore0, support0 = precision_recall_fscore_support(y, y_pred) + precision1, recall1, fscore1, support1 = precision_recall_fscore_support(y, y_pred, average="binary", labels=[1, -1]) + #precision2, recall2, fscore2, support2 = precision_recall_fscore_support(y, y_pred, average="weighted") + tn, fp, fn, tp = confusion_matrix(y, y_pred, labels=[1, -1]).ravel() + accuracy = accuracy_score(y, y_pred) + + if (tp+fn) != 0: + tpr = tp/(tp+fn) + else: + tpr=0 + if (fp+tn) != 0: + fpr = fp/(fp+tn) + else: + fpr=0 + ret = { + "tpr": tpr, + "fpr": fpr, + "precision": precision1, + "recall": recall1, + "fscore": fscore1, + "support": support1, + "tn": tn, + "fp": fp, + "fn": fn, + "tp": tp, + "accuracy": accuracy, + } + return ret + +def cross_val_clf(clf_type, full_clf_feat, feat_cols, clf_feat_cols, pred_col=None, only_clf_feat=False, pbar=None, scale_pipeline=False, train_contamination=False): + full_feat_cols = sorted(feat_cols 
+ clf_feat_cols) + if only_clf_feat: + full_feat_cols = sorted(clf_feat_cols) + # print(full_clf_feat) + # print(full_feat_cols) + # # print([c for c in full_clf_feat.columns if "grid" in c]) + # exit() + X = full_clf_feat[full_feat_cols].values + + if pred_col is None: + y = [1] * len(X) + else: + y = full_clf_feat[pred_col].values + + ret_l = [] + estimators = get_clfs_from_string(clf_type, train_contamination=train_contamination) #get list w/ hyper params... + # LOF: decision_function: The shift offset allows a zero threshold for being an outlier. + offset_ = 0 + + # IF: When the contamination parameter is set to "auto", the offset is equal to -0.5 as the scores of inliers are close to 0 and the scores of outliers are close to -1. + # LOF: inliers score around -1 (the higher, the less abnormal). + # LOF: The offset is set to -1.5 (inliers score around -1), except when a + # contamination parameter different than "auto" is provided. + # if clf_type== "IsolationForest": + # offset_ = -0.5 + # elif clf_type== "LocalOutlierFactor": + # offset_ = -1.5 + + + for estimator in estimators: + clf_params = estimator.get_params() + if scale_pipeline: + estimator = make_pipeline(StandardScaler(),estimator) + if "make_pipeline(StandardScaler," not in clf_type: + clf_type = f"make_pipeline(StandardScaler,{clf_type})" + now = datetime.now() + try: + if not sys.warnoptions and ("OneClassSVM" in clf_type or "EllipticEnvelope" in clf_type): + warnings.simplefilter("ignore") + os.environ["PYTHONWARNINGS"] = "ignore" # Also affect subprocesses + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=ConvergenceWarning) + + nt=NUM_THREADS + if "LocalOutlierFactor" in clf_type or "IsolationForest" in clf_type: #native multi-threading + nt=1 #doesnt seem to work... + # if clf_type == "EllipticEnvelope" and not only_clf_feat: + # nt=1 + #if not only_clf_feat: + # nt=round(nt/2) + pre_dispatch=nt + #logger.info(f"{pre_dispatch=}, {nt=}") + with parallel_backend("threading", n_jobs=nt): + with threadpool_limits(limits=nt, user_api='blas'): + os.environ["OMP_NUM_THREADS"] = str(nt) + os.environ["MKL_NUM_THREADS"] = str(nt) + os.environ["OPENBLAS_NUM_THREADS"] = str(nt) + os.environ["BLIS_NUM_THREADS"] = str(nt) + y_p = cross_val_predict(estimator, X, y=None, cv=5, n_jobs=nt, method="decision_function", pre_dispatch=pre_dispatch) #Returns -1 for anomalies/outliers and 1 for inliers. + + #print(y_p[:100]) + y_pred = [1 if p >= offset_ else -1 for p in y_p] + # y_pred = cross_val_predict(estimator, X, y=None, cv=5, n_jobs=NUM_THREADS) #Returns -1 for anomalies/outliers and 1 for inliers. 
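+                        # Note: the decision_function scores are thresholded at offset_ (0 here),
+                        # i.e. scores >= 0 are treated as inliers (1) and scores < 0 as outliers (-1);
+                        # the sklearn-internal offsets discussed above are intentionally not applied.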
+ except Exception as e: + logger.error(e) + logger.error(str(estimator)) + # logger.error(full_clf_feat[full_feat_cols]) + # logger.error(full_clf_feat[[c for c in full_clf_feat.columns if "pred"in c]]) + ret = { + "clf_type": clf_type, + "clf_params": clf_params, + "full_feat_cols": full_feat_cols, + "y_p": str(e), + } + if pbar is not None: + pbar.update(1) + ret_l.append(ret) + #raise e + continue + + with warnings.catch_warnings(): + warnings.simplefilter('ignore', UndefinedMetricWarning) + #precision0, recall0, fscore0, support0 = precision_recall_fscore_support(y, y_pred) + precision1, recall1, fscore1, support1 = precision_recall_fscore_support(y, y_pred, average="binary", labels=[1, -1]) + #precision2, recall2, fscore2, support2 = precision_recall_fscore_support(y, y_pred, average="weighted") + tn, fp, fn, tp = confusion_matrix(y, y_pred, labels=[1, -1]).ravel() + accuracy = accuracy_score(y, y_pred) + + # print(y[:100]) + # print(y_pred[:100]) + + # print(precision1, recall1, fscore1, support1) + # print(tn, fp, fn, tp) + # print(accuracy) + + later = datetime.now() + difference = (later - now).total_seconds() + ret = { + "clf_type": clf_type, + #"clf": str(estimator), + "clf_params": clf_params, + "full_feat_cols": full_feat_cols, + #"y_true": y, + "y_p": list(y_p), + #"y_pred": list(y_pred), + #"offset_":offset_, + "precision": precision1, + "recall": recall1, + "fscore": fscore1, + "tn": tn, + "fp": fp, + "fn": fn, + "tp": tp, + "accuracy": accuracy, + "t_diff": difference, + } + if pbar is not None: + pbar.update(1) + ret_l.append(ret) + return ret_l + + + + +def prune_clf_feats(df, feat_cases=None, df_dict=None): + feat_cols = list(df.columns) + if feat_cases is None or "all" in feat_cases: + return feat_cols + + # print([c for c in df_dict["cp_g_df"].columns if "_non_ev_power_" in c]) + # print([c for c in feat_cols if "_non_ev_power_" in c]) + # exit() + + + add_expo = [feat_case for feat_case in feat_cases if "add_grid_load_expo_" in feat_case] + if add_expo: + if df_dict is None: + logger.error(f"add_grid_load_expo_ provided: {add_expo}; but no df_dict") + exit() + if len(add_expo) > 1: + logger.warning(f"multiple add_grid_load_expo_ sets provided: {add_expo}; using first") + add_expo = add_expo[0] + sub_case="static" + if "expo_rnd" in add_expo: + sub_case="rnd" + add_expo_fac_min = int(add_expo.split("_")[-1]) + #add_expo_fac_max = min(add_expo_fac_min+10, 100) + #print(add_expo, add_expo_fac_min, add_expo_fac_max) + expo_cols = [c for c in df_dict["cp_g_df"].columns if "grid_expo_" in c and sub_case+"_fac."+str(add_expo_fac_min) in c and "_relation_to_" not in c] + feat_cols = [c for c in feat_cols if "grid_expo_" not in c or (sub_case+"_fac."+str(add_expo_fac_min) in c)] + # print(expo_cols) + # print(feat_cols) + feat_cols = feat_cols + expo_cols + #print(feat_cols) + else: + feat_cols = [f for f in feat_cols if "grid_expo_"not in f] + + # if "add_rel_to_cols" in feat_cases: + # add_rel_cols= [c for c in df_dict["cp_g_df"].columns if "_relation_to" in c] + # get_rel_expo_cols = [c for c in feat_cols if "_norm" not in c and "grid_expo_" in c] + + # #print(df_dict["cp_g_df"]) + # new_cols=[] + # for rec in get_rel_expo_cols: + # name = rec.replace("_lag_0","")+"_relation_to_"+df_dict["pred_col"].replace("_lag_0","")+"_lag_0" + # df_dict["cp_g_df"].loc[(df_dict["cp_g_df"][rec] != 0), name] = df_dict["cp_g_df"][df_dict["pred_col"]] / df_dict["cp_g_df"][rec] + # df_dict["cp_g_df"].loc[(df_dict["cp_g_df"][rec] == 0), name] = 0 + # new_cols.append(name) + + # bus_col = [c 
for c in df_dict["cp_g_df"].columns if "bus" in c and ".active_power" in c and "_norm" not in c and "_lag_0" in c and "_relation_to_" not in c][0] + # name = bus_col.replace("_lag_0","")+"_relation_to_"+rec.replace("_lag_0","")+"_lag_0" + # df_dict["cp_g_df"].loc[(df_dict["cp_g_df"][rec] != 0), name] = df_dict["cp_g_df"][bus_col] / df_dict["cp_g_df"][rec] + # df_dict["cp_g_df"].loc[(df_dict["cp_g_df"][rec] == 0), name] = 0 + # new_cols.append(name) + + # new_cols_df = df_dict["cp_g_df"][new_cols] + # normalize_cols(new_cols_df, "_prune_norm") + # for c in new_cols_df.columns: + # if "_norm" in c and c not in df_dict["cp_g_df"].columns: + # new_c = c.replace("_lag_0_prune_norm", "_prune_norm_lag_0") + # df_dict["cp_g_df"][new_c] = new_cols_df[c] + # new_cols.append(new_c) + + # add_rel_cols = add_rel_cols + new_cols + # #print(df_dict["pred_col"].replace("_lag_0","")) + # feat_cols = feat_cols + add_rel_cols + + if feat_cases is not None and "set_1" in feat_cases: + set_1 = ['prediction_0_diff', 'prediction_mean_diff', ] + set_1 +=[f for f in feat_cols if "grid_meas_"in f and ".bus." in f and "non_ev_power_relation_to_bus_lag_0" in f and "_norm" not in f] #'grid_meas_0.0.bus.11.non_ev_power_relation_to_bus_lag_0', + set_1 +=[f for f in feat_cols if "grid_expo_"in f and ".load." in f and ".active_power_static_fac." in f and + ("_max_ev_power_relation_to_bus_lag_0" in f or # 'grid_expo_0.0.load.7.active_power_static_fac.100_max_ev_power_relation_to_bus_lag_0', + "_non_ev_power_relation_to_expo_lag_0" in f or # 'grid_expo_0.0.load.7.active_power_static_fac.100_non_ev_power_relation_to_expo_lag_0', + "_relation_to_charge_speed_lag_0" in f or # 'grid_expo_0.0.load.7.active_power_static_fac.100_relation_to_charge_speed_lag_0', + "_relation_to_prediction_mean_lag_0" in f) and # 'grid_expo_0.0.load.7.active_power_static_fac.100_relation_to_prediction_mean_lag_0', + "_norm" not in f] + + feat_cols = [f for f in feat_cols if f in set_1] + # print(set_1) + # print(feat_cols) + # exit() + return feat_cols + elif feat_cases is not None and "set_2" in feat_cases: + set_2 = ['prediction_0_diff', 'prediction_mean_diff', ] + feat_cols = [f for f in feat_cols if f in set_2] + return feat_cols + elif feat_cases is not None and "set_3" in feat_cases: + set_x = ['prediction_0_diff', 'prediction_mean_diff', ] + set_x +=[f for f in feat_cols if "grid_meas_"in f and ".bus." in f and ".non_ev_power_relation_to_bus_lag_0" in f and "_norm" not in f] #grid_meas_0.0.bus.11.non_ev_power_relation_to_bus_lag_0 + set_x +=[f for f in feat_cols if "grid_meas_"in f and ".bus." 
in f and ".only_ev_power_relation_to_bus_lag_0" in f and "_norm" not in f] #grid_meas_0.0.bus.11.only_ev_power_relation_to_bus_lag_0 + feat_cols = [f for f in feat_cols if f in set_x] + # print(set_x) + # print(feat_cols) + # exit() + return feat_cols + elif feat_cases is not None and "set_4" in feat_cases: + set_x = ['prediction_0_diff','prediction_1_diff','prediction_2_diff', 'prediction_mean_diff', ] + h_set = [c for c in df_dict["cp_g_df"].columns if "date_exog_group_hour_" in c and "_norm" not in c] + set_x+=h_set + feat_cols+=h_set + feat_cols = [f for f in feat_cols if f in set_x] + # print(set_x) + # print(feat_cols) + # exit() + return feat_cols + elif feat_cases is not None and "set_5" in feat_cases: + set_x = [f for f in feat_cols if 'prediction_' in f and "_norm" not in f and "_rmse" not in f] + feat_cols = [f for f in feat_cols if f in set_x] + return feat_cols + elif feat_cases is not None and "set_6" in feat_cases: + set_x = [f for f in feat_cols if 'prediction_' in f and "_norm" not in f] + h_set = [c for c in df_dict["cp_g_df"].columns if "date_exog_group_hour_" in c and "_norm" not in c] + h_set +=[df_dict["pred_col"]] + set_x+=h_set + feat_cols+=h_set + feat_cols = [f for f in feat_cols if f in set_x] + return feat_cols + elif feat_cases is not None and "set_7" in feat_cases: + set_x = [f for f in feat_cols if 'prediction_' in f and "_norm" in f] + h_set = [c for c in df_dict["cp_g_df"].columns if "date_exog_group_hour_" in c and "_norm" in c] + h_set +=["charge_speed_group_norm_lag_0"] + set_x+=h_set + feat_cols+=h_set + feat_cols = [f for f in feat_cols if f in set_x] + return feat_cols + elif feat_cases is not None and "set_8" in feat_cases: + set_x = [f for f in feat_cols if 'prediction_' in f and "_norm" not in f] + h_set = [c for c in df_dict["cp_g_df"].columns if "date_exog_group_hour_" in c and "_norm" not in c] + set_x+=h_set + feat_cols+=h_set + feat_cols = [f for f in feat_cols if f in set_x] + return feat_cols + elif feat_cases is not None and "set_9" in feat_cases: + set_x = [f for f in feat_cols if 'prediction_' in f and "_norm" in f] + h_set =["charge_speed_lag_0"] + set_x+=h_set + feat_cols+=h_set + feat_cols = [f for f in feat_cols if f in set_x] + return feat_cols + elif feat_cases is not None and "set_10" in feat_cases: + set_x = ['prediction_mean', 'prediction_min', 'prediction_max', 'prediction_mean_diff', 'prediction_min_diff', 'prediction_max_diff'] + h_set =["charge_speed_lag_0"] + set_x+=h_set + feat_cols+=h_set + feat_cols = [f for f in feat_cols if f in set_x] + # print(set_x) + # print(feat_cols) + # exit() + return feat_cols + elif feat_cases is not None and "set_11" in feat_cases: + set_x = ['prediction_mean', 'prediction_min', 'prediction_max', 'prediction_mean_diff', 'prediction_min_diff', 'prediction_max_diff'] + feat_cols = [f for f in feat_cols if f in set_x] + # print(set_x) + # print(feat_cols) + # exit() + return feat_cols + elif feat_cases is not None and "set_12" in feat_cases: + set_x = ['prediction_mean_diff', 'prediction_min_diff', 'prediction_max_diff'] + feat_cols = [f for f in feat_cols if f in set_x] + # print(set_x) + # print(feat_cols) + # exit() + return feat_cols + elif feat_cases is not None and "set_13" in feat_cases: + set_x = [f for f in feat_cols if 'prediction_' in f and "_norm" not in f and "_rmse" not in f and "_diff" in f and "_abs" not in f and "_mean" in f] + feat_cols = [f for f in feat_cols if f in set_x] + # print(set_x) + # print(feat_cols) + # exit() + return feat_cols + + elif feat_cases is not None and 
"set_31" in feat_cases: + set_x = ['prediction_mean', 'prediction_min', 'prediction_max', 'prediction_mean_diff', 'prediction_min_diff', 'prediction_max_diff'] + set_x +=[f for f in feat_cols if "grid_meas_"in f and ".bus." in f and ".non_ev_power_relation_to_bus_lag_0" in f and "_norm" not in f] #grid_meas_0.0.bus.11.non_ev_power_relation_to_bus_lag_0 + set_x +=[f for f in feat_cols if "grid_meas_"in f and ".bus." in f and ".only_ev_power_relation_to_bus_lag_0" in f and "_norm" not in f] #grid_meas_0.0.bus.11.only_ev_power_relation_to_bus_lag_0 + feat_cols = [f for f in feat_cols if f in set_x] + return feat_cols + elif feat_cases is not None and "set_32" in feat_cases: + set_x = ['prediction_mean', 'prediction_min', 'prediction_max', 'prediction_mean_diff', 'prediction_min_diff', 'prediction_max_diff'] + set_x +=[f for f in feat_cols if "grid_meas_"in f and ".bus." in f and ".non_ev_power_relation_to_bus_lag_0" in f and "_norm" not in f] #grid_meas_0.0.bus.11.non_ev_power_relation_to_bus_lag_0 + set_x +=[f for f in feat_cols if "grid_meas_"in f and ".bus." in f and ".only_ev_power_relation_to_bus_lag_0" in f and "_norm" not in f] #grid_meas_0.0.bus.11.only_ev_power_relation_to_bus_lag_0 + h_set = [c for c in df_dict["cp_g_df"].columns if "date_exog_group_hour_" in c and "_norm" not in c] + h_set +=["charge_speed_lag_0"] + set_x += h_set + feat_cols+=h_set + feat_cols = [f for f in feat_cols if f in set_x] + return feat_cols + elif feat_cases is not None and "set_33" in feat_cases: + set_x = ['prediction_mean_diff', 'prediction_min_diff', 'prediction_max_diff'] + set_x +=[f for f in feat_cols if "grid_meas_"in f and ".bus." in f and ".non_ev_power_relation_to_bus_lag_0" in f and "_norm" not in f] #grid_meas_0.0.bus.11.non_ev_power_relation_to_bus_lag_0 + set_x +=[f for f in feat_cols if "grid_meas_"in f and ".bus." in f and ".only_ev_power_relation_to_bus_lag_0" in f and "_norm" not in f] #grid_meas_0.0.bus.11.only_ev_power_relation_to_bus_lag_0 + h_set = [c for c in df_dict["cp_g_df"].columns if "date_exog_group_hour_" in c and "_norm" not in c] + set_x += h_set + feat_cols+=h_set + feat_cols = [f for f in feat_cols if f in set_x] + return feat_cols + elif feat_cases is not None and "set_34" in feat_cases: + set_x = ['prediction_mean', 'prediction_min', 'prediction_max', 'prediction_mean_diff', 'prediction_min_diff', 'prediction_max_diff'] + set_x +=[f for f in feat_cols if "grid_meas_"in f and ".bus." in f and ".non_ev_power_relation_to_bus_lag_0" in f and "_norm" not in f] #grid_meas_0.0.bus.11.non_ev_power_relation_to_bus_lag_0 + set_x +=[f for f in feat_cols if "grid_meas_"in f and ".bus." in f and ".only_ev_power_relation_to_bus_lag_0" in f and "_norm" not in f] #grid_meas_0.0.bus.11.only_ev_power_relation_to_bus_lag_0 + h_set =["charge_speed_lag_0"] + set_x += h_set + feat_cols+=h_set + feat_cols = [f for f in feat_cols if f in set_x] + # print(set_x) + # print(feat_cols) + # exit() + return feat_cols + + elif feat_cases is not None and "set_35" in feat_cases: + set_x = ['prediction_mean_diff', ] + set_x +=[f for f in feat_cols if "grid_meas_"in f and ".bus." in f and ".non_ev_power_relation_to_bus_lag_0" in f and "_norm" not in f] #grid_meas_0.0.bus.11.non_ev_power_relation_to_bus_lag_0 + set_x +=[f for f in feat_cols if "grid_meas_"in f and ".bus." in f and ".only_ev_power_relation_to_bus_lag_0" in f and "_norm" not in f] #grid_meas_0.0.bus.11.only_ev_power_relation_to_bus_lag_0 + set_x +=[f for f in feat_cols if "grid_expo_"in f and ".load." in f and ".active_power_static_fac." 
in f and + ("_max_ev_power_relation_to_bus_lag_0" in f or # 'grid_expo_0.0.load.7.active_power_static_fac.100_max_ev_power_relation_to_bus_lag_0', + "_non_ev_power_relation_to_expo_lag_0" in f or # 'grid_expo_0.0.load.7.active_power_static_fac.100_non_ev_power_relation_to_expo_lag_0', + "_relation_to_charge_speed_lag_0" in f or # 'grid_expo_0.0.load.7.active_power_static_fac.100_relation_to_charge_speed_lag_0', + "_relation_to_prediction_mean_lag_0" in f) and # 'grid_expo_0.0.load.7.active_power_static_fac.100_relation_to_prediction_mean_lag_0', + "_norm" not in f] + feat_cols = [f for f in feat_cols if f in set_x] + # print(set_x) + # print(feat_cols) + # exit() + return feat_cols + elif feat_cases is not None and "set_352" in feat_cases: + set_x = ['prediction_mean_diff', ] + set_x +=[f for f in feat_cols if "grid_meas_"in f and ".bus." in f and ".non_ev_power_relation_to_bus_lag_0" in f and "_norm" not in f] #grid_meas_0.0.bus.11.non_ev_power_relation_to_bus_lag_0 + set_x +=[f for f in feat_cols if "grid_meas_"in f and ".bus." in f and ".only_ev_power_relation_to_bus_lag_0" in f and "_norm" not in f] #grid_meas_0.0.bus.11.only_ev_power_relation_to_bus_lag_0 + set_x +=[f for f in feat_cols if "grid_expo_"in f and ".load." in f and ".active_power_" in f and "_fac." in f and #add_grid_load_expo_rnd_90 + ("_max_ev_power_relation_to_bus_lag_0" in f or # 'grid_expo_0.0.load.7.active_power_static_fac.100_max_ev_power_relation_to_bus_lag_0', + "_non_ev_power_relation_to_expo_lag_0" in f or # 'grid_expo_0.0.load.7.active_power_static_fac.100_non_ev_power_relation_to_expo_lag_0', + "_relation_to_charge_speed_lag_0" in f or # 'grid_expo_0.0.load.7.active_power_static_fac.100_relation_to_charge_speed_lag_0', + "_relation_to_prediction_mean_lag_0" in f) and # 'grid_expo_0.0.load.7.active_power_static_fac.100_relation_to_prediction_mean_lag_0', + "_norm" not in f] + feat_cols = [f for f in feat_cols if f in set_x] + # print(set_x) + # print(feat_cols) + # exit() + return feat_cols + + for feat_case in feat_cases: + if feat_case == "only_norm": + feat_cols = [f for f in feat_cols if "_norm" in f] + elif feat_case == "no_norm": + feat_cols = [f for f in feat_cols if "_norm" not in f] + elif feat_case == "only_diff": + feat_cols = [f for f in feat_cols if "_diff" in f or "_relation_to" in f] + + return feat_cols + + +def get_best_clf_conf_offset(y_p, max_fpr): + if y_p["y_p"] is None: + return None, None + offset = 0 + y_p = sorted(y_p["y_p"]) + best_y_p = 0 + p_y_p = len(y_p) * max_fpr + # print(y_p) + for min_y in y_p: + _best_y_p = len([y for y in y_p if y < min_y]) #len of false positives + if _best_y_p > p_y_p: #more false postivies than allowed + break + offset = min_y + best_y_p = _best_y_p + # print(offset, best_y_p, p_y_p, len(y_p)) + # exit() + return offset, best_y_p + +def get_best_clf_conf2(dfr, max_fpr=0.005, ignore_fscore=False): # >= offset_ -> inlier=1 + #print(dfr) + res = dfr.apply(lambda x: get_best_clf_conf_offset(x, max_fpr=max_fpr), axis=1) + dfr["offset"], dfr["false_positives"]= zip(*res) + if ignore_fscore: + df_best = dfr + else: + df_best = dfr[dfr["fscore"] == dfr["fscore"].max()] + df_best_o = df_best[df_best["offset"] == df_best["offset"].max()] + df_best_f = df_best_o[df_best_o["t_diff"] == df_best_o["t_diff"].min()].iloc[0] + # print(df_best_f, df_best_f["offset"]) + # exit() + return df_best_f, df_best_f["offset"] + +def get_best_clf_conf(dfr, max_fpr=0.01): + df_best = dfr[dfr["fscore"] == dfr["fscore"].max()] + df_best_f = df_best[df_best["t_diff"] == 
df_best["t_diff"].min()].iloc[0] + return df_best_f, 0 + +def to_float_list(val): + val = val.replace("[", "").replace("]", "") + try: + ret = [float(v) for v in val.split(", ")] + except Exception as e: + logger.info(f"{e} with {val}") + return None + return ret + +def _eval_tuning_clf(dfr): + df_best_f, offset = get_best_clf_conf2(dfr.copy()) + # print(group,clf,reg,df_best_f["clf_params"],df_best_f["fscore"], offset) + # exit() + return {"conf":df_best_f["clf_params"], "eval":df_best_f["fscore"], "feat":ast.literal_eval(df_best_f["full_feat_cols"]), "offset": offset} + +def eval_tuning_clf(x_path, tqdm_n=0, full=False): + onlyfiles = [f for f in os.listdir(x_path) if os.path.isfile(os.path.join(x_path, f))] + + #CP_4_LocalOutlierFactor.no_reg.only_norm.csv.gz + regex = re.compile(r"^(CP_\d+)_([\w\.\_]+)\.csv\.gz$") + eval_tuning_files=defaultdict(lambda: dict()) + for f in tqdm(onlyfiles, desc=f"eval_tuning {x_path} get eval_tuning_files", disable=tqdm_n>0): + #cpo_ocpp_data_CP_1_0bc01518df38656accbde55bfb0e38591.csv.gz + result = regex.search(f) + if result: # + f_df = pd.read_csv(os.path.join(x_path, f), index_col=0) # , converters={"clf_params":ast.literal_eval, "full_feat_cols":ast.literal_eval, "y_p":ast.literal_eval} , "features":ast.literal_eval, "clf_features":ast.literal_eval + # f_df['clf_params'] = f_df['clf_params'].apply(lambda x: ast.literal_eval(x)) + # f_df['full_feat_cols'] = f_df['full_feat_cols'].apply(lambda x: ast.literal_eval(x)) + f_df['y_p'] = f_df['y_p'].apply(lambda x: to_float_list(x)) + # print(f_df) + # print(f_df.columns) + eval_tuning_files[result.group(1)][result.group(2)] = f_df + #break + else: + logger.info(f"unk file {os.path.join(x_path, f)}") + continue + + if full: + return eval_tuning_files + + ret_dict=defaultdict(lambda: dict()) + results=[] + #with ThreadPoolExecutor(NUM_THREADS) as pool: + with ProcessPoolExecutor(NUM_THREADS) as pool: + for group,group_d in tqdm(eval_tuning_files.items(), desc=f"eval_tuning {x_path} get ret_dict", disable=True): + for clf, df in group_d.items(): + for index, rsf in df[["reg", 'shifts', 'features']].drop_duplicates().iterrows(): + # print(rsf["reg"]) + dfr = df[(df["reg"] == rsf["reg"]) & (df["shifts"] == rsf["shifts"]) & (df["features"] == rsf["features"])] + reg = dfr["reg"].iloc[0] +"."+ str(dfr["shifts"].iloc[0]) +"."+ ".".join(ast.literal_eval(dfr["features"].iloc[0])) + + results.append( (group,clf,reg, pool.submit(_eval_tuning_clf, dfr)) ) + #ret_dict[group][(clf,reg)] = _eval_tuning_clf(dfr) + + for group,clf,reg,r in tqdm(results, desc=f"eval_tuning {x_path} get ret_dict", disable=tqdm_n>0): + ret_dict[group][(clf,reg)] = r.result() + return ret_dict + + +def get_eval_dicts_clf(OutDataDIR): + ret_d=dict() + DIR=OutDataDIR+"/clf_eval/" + + sub_folders = [name for name in os.listdir(DIR) if os.path.isdir(os.path.join(DIR, name)) if name.startswith("CPO_")] + for cpo in sub_folders: + ret_dict=eval_tuning_clf(OutDataDIR+"/clf_eval/"+cpo) + ret_d[cpo] = ret_dict + + sub_folders = [name for name in os.listdir(DIR) if os.path.isdir(os.path.join(DIR, name)) if name.startswith("DSO")] + for dso in sub_folders: + ret_dict=eval_tuning_clf(OutDataDIR+"/clf_eval/"+dso) + ret_d[dso] = ret_dict + + return ret_d + + +def save_eval_dicts_clf(best_d, OutDataDIR): + for actor in sorted(best_d.keys()): + ret_dict = best_d[actor] + Path(OutDataDIR+"/clf_eval_dicts/"+actor+"/").mkdir(parents=True, exist_ok=True) + for group,ret_d2 in tqdm(ret_dict.items(), desc=f"saving {actor}"): + with 
gzip.open(OutDataDIR+"/clf_eval_dicts/"+actor+"/"+group+".gz", "wb") as f: + pickle.dump(ret_d2, f) + + +def load_eval_dicts_clf(OutDataDIR): + DIR = OutDataDIR+"/clf_eval_dicts/" + sub_folders = [name for name in os.listdir(DIR) if os.path.isdir(os.path.join(DIR, name))] + + best_d=defaultdict(lambda: defaultdict(lambda: dict())) + for _x_path in sub_folders: + x_path = os.path.join(DIR,_x_path) + onlyfiles = [f for f in os.listdir(x_path) if os.path.isfile(os.path.join(x_path, f))] + for fi in onlyfiles: + with gzip.open(os.path.join(x_path,fi), "rb") as f: + fdf_dict = pickle.load(f) + best_d[_x_path][fi.replace(".gz", "")] = fdf_dict + return best_d + +def get_clf_feat_for_conf(cp_g_dict, best_dict, allow_diff_shifts=False): + #print(cp_g_dict) #[{'reg': 'GradientBoostingRegressor', 'shifts': '5', 'file': 'data/elaadnl/clf_feats/DSO.all/CP_11_GradientBoostingRegressor_5.csv.gz', 'features': ['all']}, ... + #print(best_dict["reg"].split(".")) + best_dict_reg = best_dict["reg"].split(".") + for cp_g_d in cp_g_dict: + if cp_g_d["reg"] == best_dict_reg[0] and cp_g_d["shifts"] == best_dict_reg[1] and cp_g_d["features"] == best_dict_reg[2:]: + return cp_g_d["file"] + for cp_g_d in cp_g_dict: + if cp_g_d["reg"] == best_dict_reg[0] and allow_diff_shifts and cp_g_d["features"] == best_dict_reg[2:]: + + global feat_warn2 + if feat_warn2 <=1: + feat_warn2+=1 + logger.warning(f"train and test used diff shifts for features! {allow_diff_shifts=}") + return cp_g_d["file"] + return None + +def get_atks_old(l_buffer=2): #based on EVAtksScript; TODO: auto gen... + atk_l=[] + step=0 + times = [ #Timedelta from sim start time [select attack day]; (hour, minute) to start attack [select attack time] + (pd.Timedelta(days=1), 0, 20, None), #atk 0 getSeed + (pd.Timedelta(days=1), 0, 40, None), #atk 1 getCPs + ] + atk_cycles=27 + atk_cycles_actual=0 + for x in range(0, atk_cycles, 5): #[0, 5, 10, 15, 20, 25] + times.append((pd.Timedelta(days=0+x), 8, 00, "atk 2 MAD inc")) #atk 2 MAD inc + times.append((pd.Timedelta(days=0+x), 13, 00, "atk 3 FDI red")) #atk 3 FDI red + times.append((pd.Timedelta(days=1+x), 8, 00, "atk 4 FDI inc (no limit)")) #atk 4 FDI inc (no limit) + times.append((pd.Timedelta(days=1+x), 18, 00, "atk 5 MAD inc and FDI same")) #atk 5 MAD inc and FDI same + times.append((pd.Timedelta(days=2+x), 8, 00, "atk 6 MAD red")) #atk 6 MAD red + times.append((pd.Timedelta(days=2+x), 18, 00, "atk 7 FDI inc")) #atk 7 FDI inc + times.append((pd.Timedelta(days=3+x), 8, 00, "atk 8 MAD inc x2")) #atk 8 MAD inc x2 + times.append((pd.Timedelta(days=3+x), 18, 00, "atk 9 MAD red and FDI inc")) #atk 9 MAD red and FDI inc + times.append((pd.Timedelta(days=4+x), 8, 00, "atk 10 MAD inc and FDI red")) #atk 10 MAD inc and FDI red + times.append((pd.Timedelta(days=4+x), 18, 00, "atk 11 MAD inc x1.5")) #atk 11 MAD inc x1.5 + atk_cycles_actual+=1 + steps_per_cycle=10 + + times.append((pd.Timedelta(days=31), 23, 59, None)) #Shutdown + + offset = pd.Timedelta(days=2) + + setup_times = [0,1] + times = [(x[0]+offset, x[1], x[2], x[3]) if i not in setup_times else (x[0], x[1], x[2], x[3]) for i,x in enumerate(times)] + + start_sim_clock_time = pd.to_datetime("2023-11-29 00:00:00") + + for x in setup_times: + atk_time = times[step] + atk_time_sim = start_sim_clock_time + atk_time[0] + atk_time_h = atk_time[1] + atk_time_m = atk_time[2] + atk_time_sim = atk_time_sim.replace(hour=atk_time_h, minute=atk_time_m) + + atk_l.append({"type":atk_time[3], "start":atk_time_sim, "end":atk_time_sim}) + step += 1 + + for x in 
range(atk_cycles_actual): + for i in range(steps_per_cycle): + atk_dur_h = x+1 #*6 *365 + atk_time = times[step] + if atk_time[3] is None: + break + + atk_time_sim = start_sim_clock_time + atk_time[0] + + atk_time_h = atk_time[1] -1 #rnd times + atk_time_m = atk_time[2] + # atk_time_h = atk_time[1] + self.rand.randint(low=-1,high=2) + # atk_time_m = atk_time[2] + self.rand.randint(low= 0,high=60-atk_time[2]) + atk_time_sim = atk_time_sim.replace(hour=atk_time_h, minute=atk_time_m) + + length=15 * 4 * atk_dur_h + #l_buffer=2#10 + length+=60*(1+3 +l_buffer) #1+3 rnd times + end_diff = pd.Timedelta(minutes=length) + atk_time_sim_end = atk_time_sim+end_diff + atk_l.append({"type":atk_time[3], "start":atk_time_sim, "end":atk_time_sim_end}) + step += 1 + + + for x in ["shutdown"]: + atk_time = times[step] + atk_time_sim = start_sim_clock_time + atk_time[0] + atk_time_h = atk_time[1] + atk_time_m = atk_time[2] + atk_time_sim = atk_time_sim.replace(hour=atk_time_h, minute=atk_time_m) + + atk_l.append({"type":atk_time[3], "start":atk_time_sim, "end":atk_time_sim}) + step += 1 + + return atk_l + + +def get_atks(l_buffer=2, start_sim_offset_h=0): #based on EVAtksScript; TODO: auto gen... + atk_l=[] + step=0 + times = [ #Timedelta from sim start time [select attack day]; (hour, minute) to start attack [select attack time] + (pd.Timedelta(days=1), 0, 20, None), #atk 0 getSeed + (pd.Timedelta(days=1), 0, 40, None), #atk 1 getCPs + ] + atk_cycles=27 + atk_cycles_actual=0 + for x in range(0, atk_cycles, 10): #[0, 10, 20,] + times.append((pd.Timedelta(days=0+x), 7, 00, "atk 2 MAD inc")) #atk 2 MAD inc + times.append((pd.Timedelta(days=1+x), 14, 00, "atk 3 FDI red")) #atk 3 FDI red + times.append((pd.Timedelta(days=2+x), 8, 00, "atk 4 FDI inc (no limit)")) #atk 4 FDI inc (no limit) + times.append((pd.Timedelta(days=3+x), 18, 00, "atk 5 MAD inc and FDI same")) #atk 5 MAD inc and FDI same + times.append((pd.Timedelta(days=4+x), 8, 00, "atk 6 MAD red")) #atk 6 MAD red + times.append((pd.Timedelta(days=5+x), 18, 00, "atk 7 FDI inc")) #atk 7 FDI inc + times.append((pd.Timedelta(days=6+x), 8, 00, "atk 8 MAD inc x2")) #atk 8 MAD inc x2 + times.append((pd.Timedelta(days=7+x), 18, 00, "atk 9 MAD red and FDI inc")) #atk 9 MAD red and FDI inc + times.append((pd.Timedelta(days=8+x), 8, 00, "atk 10 MAD inc and FDI red")) #atk 10 MAD inc and FDI red + times.append((pd.Timedelta(days=9+x), 18, 00, "atk 11 MAD inc x2 (no limit) and FDI same")) #atk 11 MAD inc x2 (no limit) and FDI same + atk_cycles_actual+=1 + steps_per_cycle=10 + + times.append((pd.Timedelta(days=31), 23, 59, None)) #Shutdown + + offset = pd.Timedelta(days=2) + + setup_times = [0,1] + times = [(x[0]+offset, x[1], x[2], x[3]) if i not in setup_times else (x[0], x[1], x[2], x[3]) for i,x in enumerate(times)] + + start_sim_clock_time = pd.to_datetime("2023-11-29 00:00:00") + + for x in setup_times: + atk_time = times[step] + atk_time_sim = start_sim_clock_time + atk_time[0] + atk_time_h = atk_time[1] + atk_time_m = atk_time[2] + atk_time_sim = atk_time_sim.replace(hour=atk_time_h, minute=atk_time_m) + + atk_l.append({"type":atk_time[3], "start":atk_time_sim, "end":atk_time_sim}) + step += 1 + + for x in range(atk_cycles_actual): + for i in range(steps_per_cycle): + atk_dur_h = x+1 #*6 *365 + atk_dur_h = min(atk_dur_h, 4) #max 4 hours... 
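+                # Descriptive note: each labelled attack window starts 1 h before the scheduled hour; its length is 15*4*atk_dur_h min of attack traffic plus 60*(1+3+l_buffer) min of slack for the randomised start/stop times (see the "rnd times" comments below).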
+ atk_time = times[step] + if atk_time[3] is None: + break + + atk_time_sim = start_sim_clock_time + atk_time[0] + + atk_time_h = atk_time[1] - 1 #rnd times + atk_time_h += start_sim_offset_h + atk_time_m = atk_time[2] + # atk_time_h = atk_time[1] + self.rand.randint(low=-1,high=2) + # atk_time_m = atk_time[2] + self.rand.randint(low= 0,high=60-atk_time[2]) + atk_time_sim = atk_time_sim.replace(hour=atk_time_h, minute=atk_time_m) + + length=15 * 4 * atk_dur_h + #l_buffer=2#10 + length+=60*(1+3 +l_buffer) #1+3 rnd times + end_diff = pd.Timedelta(minutes=length) + atk_time_sim_end = atk_time_sim+end_diff + atk_l.append({"type":atk_time[3], "start":atk_time_sim, "end":atk_time_sim_end}) + step += 1 + + + for x in ["shutdown"]: + atk_time = times[step] + atk_time_sim = start_sim_clock_time + atk_time[0] + atk_time_h = atk_time[1] + atk_time_m = atk_time[2] + atk_time_sim = atk_time_sim.replace(hour=atk_time_h, minute=atk_time_m) + + atk_l.append({"type":atk_time[3], "start":atk_time_sim, "end":atk_time_sim}) + step += 1 + + return atk_l \ No newline at end of file diff --git a/ids/features_cpo.py b/ids/features_cpo.py new file mode 100644 index 0000000000000000000000000000000000000000..425375413fcf8f73b426796f754e74338be8cd92 --- /dev/null +++ b/ids/features_cpo.py @@ -0,0 +1,409 @@ + +from collections import defaultdict +import os +from matplotlib import pyplot as plt +import pandas as pd +from tqdm import tqdm + + +import re + +import logging + +from features_aux import add_lags, get_date_exog, normalize_cols, add_all_lags + +logger = logging.getLogger("WATTSON_EV_IDS.FeaturesCPO") + +ROUND_TO=6 + +def get_cpo_files(DIR): + ret={} + sub_folders = [name for name in os.listdir(DIR) if os.path.isdir(os.path.join(DIR, name))] + for x in [f for f in sub_folders if f.startswith("CPO_")]: + x_path = os.path.join(DIR, x) + onlyfiles = [f for f in os.listdir(x_path) if os.path.isfile(os.path.join(x_path, f))] + onlyfiles = [f for f in onlyfiles if f.startswith("cpo_ocpp_data_") and f.endswith(".csv.gz")] + ret[x] = onlyfiles + return ret + +def read_cpo_files(dir, files): + # print(dir, files) + regex = re.compile(r"^cpo_ocpp_data_(CP_\d+)_([\w\d]+).csv.gz$") + cpo_files=defaultdict(lambda: dict()) + for f in tqdm(files, desc="loading cpo_files"): + #cpo_ocpp_data_CP_1_0bc01518df38656accbde55bfb0e38591.csv.gz + result = regex.search(f) + if result: + f_df = pd.read_csv(os.path.join(dir, f), index_col="meter_value_ts", parse_dates=["meter_value_ts"]) + if result.group(1) != f_df.iloc[0]["cp_group"]: + logger.error(f"wrong cp_group {result.group(1)} != {f_df.iloc[0]['cp_group']}") + # f_df = f_df[[c for c in f_df.columns if not c.startswith("custom_data") and c != "cp_c_id" and c != "cp_group"]] + cpo_files[result.group(1)][result.group(2)] = f_df + # break + else: + logger.error(f"unk file {f}") + continue + return cpo_files + + + +def get_features_cpo(df, per_cp_normalize, per_cp_date_exog): + keep_cols = ["charge_speed", "meter_value_sampled_value", "meter_diff", "time_diff", "cp_c_id"] + f_df = df[[c for c in df.columns if c != "cp_group"]] # c != "cp_c_id" and not c.startswith("custom_data") and + f_df = pd.concat([f_df, pd.get_dummies(f_df["event"], prefix="event")], axis=1) + # f_df["meter_value_p"] = (f_df["meter_value_sampled_value"] - f_df["meter_value_sampled_value"].min()) / (f_df["meter_value_sampled_value"].max() - f_df["meter_value_sampled_value"].min()) + # f_df["meter_value_p_diff"] = f_df["meter_value_p"] - f_df["meter_value_p"].shift(1) + # f_df["meter_value_p_diff"] = 
f_df["meter_value_p_diff"].fillna(0) + f_df["time_diff"] = f_df["time_diff"].fillna(0) + + f_df["is_attack"] = 0 + f_df.loc[f_df[f_df["custom_data_atk_type"] != "None"].index, "is_attack"] = 1 + f_df = pd.concat([f_df, pd.get_dummies(f_df["custom_data_atk_type"], prefix="custom_data_atk_type")], axis=1) + + if per_cp_date_exog: + c1 = f_df.columns + get_date_exog(f_df, prefix="date_exog_cp_") + c_del = [c for c in f_df.columns if c not in c1] + for c in f_df.columns: + if c.startswith("date_exog_cp_"): + f_df = pd.concat([f_df, pd.get_dummies(f_df[c], prefix=c)], axis=1) + f_df = f_df[[c for c in f_df.columns if c not in c_del]] #del non dummy date exog + keep_cols+= [c for c in f_df.columns if c.startswith("date_exog_cp_")] + + keep_cols+= [c for c in f_df.columns if c.startswith("custom_data") and "custom_data_vendor_id" not in c and "custom_data_atk_type" not in c] + keep_cols+= ["is_attack"] + f_df = f_df[[c for c in f_df.columns if c.startswith("event_") or c in keep_cols]] + + # if len(f_df["is_attack"].drop_duplicates()) > 0: + # print(f_df) + # exit() + # if len(f_df["is_attack"].drop_duplicates()) > 0: + # print(f_df) + # exit() + + if per_cp_normalize: + normalize_cols(f_df, "_cp_norm") + return f_df.copy() + +def resample_feats3(df, t_min, t_max, freq='15Min'): + oidx = df.index + t_min2=df.index.min() + t_max2=df.index.max() + #nidx = pd.date_range(oidx.min(), oidx.max(), freq='5Min') + nidx = pd.date_range(t_min2, t_max2, freq=freq) #, normalize=True + res = df.reindex(oidx.union(nidx)).interpolate('time', limit_direction="both").reindex(nidx) + res[[c for c in df.columns if "event_Ended" in c]] = df[[c for c in df.columns if "event_Ended" in c]].reindex(oidx.union(nidx)).interpolate('time', limit_direction="backward").reindex(nidx) + res[[c for c in df.columns if "event_Started" in c]] = df[[c for c in df.columns if "event_Started" in c]].reindex(oidx.union(nidx)).interpolate('time', limit_direction="forward").reindex(nidx) + res[[c for c in df.columns if "event_Ended" in c]] = res[[c for c in df.columns if "event_Ended" in c]].fillna(0) + res[[c for c in df.columns if "event_Started" in c]] = res[[c for c in df.columns if "event_Started" in c]].fillna(0) + res = resample_feats(res, t_min, t_max, freq) + return res + +def resample_feats2(df, t_min, t_max, freq='15Min'): + oidx = df.index + #nidx = pd.date_range(oidx.min(), oidx.max(), freq='5Min') + nidx = pd.date_range(t_min, t_max, freq=freq) + res = df.reindex(oidx.union(nidx)).interpolate('time', limit_direction="both").reindex(nidx) + res[[c for c in df.columns if "event_Ended" in c]] = df[[c for c in df.columns if "event_Ended" in c]].reindex(oidx.union(nidx)).interpolate('time', limit_direction="backward").reindex(nidx) + res[[c for c in df.columns if "event_Started" in c]] = df[[c for c in df.columns if "event_Started" in c]].reindex(oidx.union(nidx)).interpolate('time', limit_direction="forward").reindex(nidx) + res[[c for c in df.columns if "event_Ended" in c]] = res[[c for c in df.columns if "event_Ended" in c]].fillna(0) + res[[c for c in df.columns if "event_Started" in c]] = res[[c for c in df.columns if "event_Started" in c]].fillna(0) + + for fward_c in ["meter_value_sampled_value", "meter_diff", "charge_speed", "meter_no_atk"]: + fward_cs = [c for c in df.columns if fward_c in c] + res[fward_cs] = df[fward_cs].reindex(oidx.union(nidx)).ffill().reindex(nidx) + for fward_c in ["meter_value_sampled_value", "meter_no_atk"]: + fward_cs = [c for c in df.columns if fward_c in c] + res[fward_cs] = 
res[fward_cs].fillna(res[fward_cs].min()) + for fward_c in ["meter_diff", "charge_speed"]: + fward_cs = [c for c in df.columns if fward_c in c] + res[fward_cs] = res[fward_cs].fillna(0) + return res + +def resample_feats(df, t_min, t_max, freq='15Min'): + oidx = df.index + #nidx = pd.date_range(oidx.min(), oidx.max(), freq='5Min') + nidx = pd.date_range(t_min, t_max, freq=freq) + res = df.reindex(oidx.union(nidx)).interpolate('time', limit_direction="both").reindex(nidx) + res[[c for c in df.columns if "event_Ended" in c]] = df[[c for c in df.columns if "event_Ended" in c]].reindex(oidx.union(nidx)).interpolate('time', limit_direction="backward").reindex(nidx) + res[[c for c in df.columns if "event_Started" in c]] = df[[c for c in df.columns if "event_Started" in c]].reindex(oidx.union(nidx)).interpolate('time', limit_direction="forward").reindex(nidx) + res[[c for c in df.columns if "event_Ended" in c]] = res[[c for c in df.columns if "event_Ended" in c]].fillna(0) + res[[c for c in df.columns if "event_Started" in c]] = res[[c for c in df.columns if "event_Started" in c]].fillna(0) + return res + + +def get_cpo_dfs(cpo_files, per_cp_normalize, per_cp_date_exog): + cpo_dfs=dict() + t_min=dict() + t_max=dict() + for group, cps in tqdm(cpo_files.items(), desc="loading cpo_dfs"): + cp_dfs=[] + for cp, cp_df in cps.items(): + cp_dff = get_features_cpo(cp_df, per_cp_normalize=per_cp_normalize, per_cp_date_exog=per_cp_date_exog) + if group not in t_min: + t_min[group] = cp_dff.index.min() + t_max[group] = cp_dff.index.max() + else: + t_min[group] = min(t_min[group], cp_dff.index.min()) + t_max[group] = max(t_max[group], cp_dff.index.max()) + + cp_dfs.append(cp_dff) + # print(group, cp, cp_dff) + # exit() + cpo_dfs[group] = cp_dfs + return cpo_dfs, t_min, t_max + + + # ['meter_value_sampled_value', 'custom_data_meter_no_atk', + # 'custom_data_scaling', 'custom_data_energy_interval', + # 'custom_data_original_energy_interval', 'custom_data_average_power', + # 'custom_data_original_average_power', 'meter_diff', 'time_diff', + # 'charge_speed', 'custom_data_meter_diff', 'custom_data_charge_speed', + # 'event_Ended', 'event_Started', 'event_Updated', 'is_attack', + # 'date_exog_cp_dayofweek_1', 'date_exog_cp_dayofweek_2', + # 'date_exog_cp_dayofweek_6', 'date_exog_cp_hour_7', + # 'date_exog_cp_hour_8', 'date_exog_cp_hour_9', 'date_exog_cp_hour_10', + # 'date_exog_cp_hour_11', 'date_exog_cp_hour_12', 'date_exog_cp_hour_13', + # 'date_exog_cp_hour_14', 'date_exog_cp_hour_15', 'date_exog_cp_hour_16', + # 'date_exog_cp_discretize_hour_only_Play', + # 'date_exog_cp_discretize_hour_only_Work', + # 'date_exog_cp_discretize_hour_day_High-Home', + # 'date_exog_cp_discretize_hour_day_High-Work', + # 'date_exog_cp_discretize_day_is_work_False', + # 'date_exog_cp_discretize_day_is_work_True', + # 'date_exog_cp_discretize_hour_balancing_low', + # 'date_exog_cp_discretize_hour_balancing_peak'], +def _add_individual_cp_features(cp_dfs): + for cp_df in cp_dfs: + do_cols = [c for c in cp_df.columns if "date_exog_" not in c and "custom_data_" not in c and c != "cp_c_id"] + cp_id = cp_df["cp_c_id"].drop_duplicates().iloc[0] + cp_df2 = cp_df[[c for c in do_cols]].copy() + cp_df2.rename(columns={c:"CP_"+str(cp_id)+"_"+c for c in do_cols}, inplace=True) + cp_df = pd.concat([cp_df, cp_df2], axis=1) + cp_df.drop(columns="cp_c_id", inplace=True) + yield cp_df + + +def insert_sess_ends(cp_dff): + #print([c for c in cp_dff.columns]) + #print(cp_dff[['time_diff', 'charge_speed','event_Ended', 'event_Started', 'event_Updated', 
"is_attack",]]) + #print(cp_dff[cp_dff['event_Ended'] == 1]) + if "event_Ended" not in cp_dff.columns: + cp_dff["event_Ended"] = 0 + cp_dff.iloc[-1,cp_dff.columns.get_loc("event_Ended")] = 1 + cp_dff.iloc[-1,cp_dff.columns.get_loc("event_Updated")] = 0 + cp_dff.iloc[-1,cp_dff.columns.get_loc("event_Started")] = 0 + if cp_dff.iloc[-1]["event_Ended"] != 1: + logger.debug('cp_dff.iloc[-1]["event_Ended"] != 1') + #logger.error(cp_dff[['time_diff', 'charge_speed','event_Started', 'event_Updated', "event_Ended", "is_attack",]]) + cp_dff.iloc[-1,cp_dff.columns.get_loc("event_Ended")] = 1 + cp_dff.iloc[-1,cp_dff.columns.get_loc("event_Updated")] = 0 + cp_dff.iloc[-1,cp_dff.columns.get_loc("event_Started")] = 0 + #exit() + for idx in cp_dff[cp_dff['event_Ended'] == 1].index: + idx_l = cp_dff.index.get_loc(idx) + idx_c = cp_dff.index[idx_l] + idx_max=len(cp_dff)-1 + idx_n = cp_dff.index[min(idx_max,idx_l+1)] + new_idx=None + time_diff=None + for x in [10,1,0.1]: + #print(idx_c+pd.Timedelta(seconds=x),idx_n) + if idx_c+pd.Timedelta(seconds=x) < idx_n or idx_n == cp_dff.index[idx_max]: + new_idx = idx_c+pd.Timedelta(seconds=x) + time_diff=(x/60)/60 # s to h + break + if new_idx is None: + raise Exception(f"no new_idx found for {idx_c=}, {idx_n=} in:\n {cp_dff}") + #print(new_idx) + cp_dff.loc[new_idx] = cp_dff.loc[idx_c] + cp_dff.loc[new_idx, "time_diff"] = time_diff + zero_list = ['energy_interval', 'average_power', 'meter_diff', "charge_speed", "is_attack"] + for z in zero_list: + for c in cp_dff.columns: + if z in c: + cp_dff.loc[new_idx, c] = 0 + cp_dff = cp_dff.sort_index() + #print(cp_dff[['time_diff', 'charge_speed','event_Ended', 'event_Started', 'event_Updated', "is_attack",]]) + return cp_dff + + +def get_cp_df_interp2(cp_dfs, group, t_min, t_max, add_individual_cp_features=True): + base_cols=sorted(set([c for cp_df in cp_dfs for c in cp_df.columns if c != "cp_c_id"])) + if add_individual_cp_features: + cp_dfs = _add_individual_cp_features(cp_dfs) + + cp_df_interp2=None + for cp_dff in cp_dfs: + cp_dff = insert_sess_ends(cp_dff) + cp_dff = resample_feats(cp_dff, t_min[group], t_max[group]) + # fig, ax = plt.subplots(figsize=(16,9)) + # ax.plot(cp_dff.index, cp_dff["charge_speed"], label="charge_speed") + # plt.legend() + # plt.show() + # plt.close() + # print(row) + #exit() + if cp_df_interp2 is None: + cp_df_interp2=cp_dff + else: + add_cols=[c for c in cp_dff.columns if c in base_cols and c in cp_df_interp2.columns] + concat_cols=[c for c in cp_dff.columns if c not in add_cols] + cp_df_interp2[add_cols]+=cp_dff[add_cols] + # cp_df_interp2 = pd.concat([cp_df_interp2, cp_dff[add_cols]]) + # cp_df_interp2 = cp_df_interp2.groupby(cp_df_interp2.index).sum() + cp_df_interp2 = pd.concat([cp_df_interp2, cp_dff[concat_cols]], axis=1) + return cp_df_interp2.round(ROUND_TO) + +def get_cp_df_interp(cp_dfs, group, t_min, t_max, add_individual_cp_features=True): + cp_dfs_interp=[] + + if add_individual_cp_features: + cp_dfs = _add_individual_cp_features(cp_dfs) + + for cp_dff in cp_dfs: + cp_dff = resample_feats(cp_dff, t_min[group], t_max[group]) + cp_dfs_interp.append(cp_dff) + # print(cp_dff["meter_value_sampled_value"].min()) + # print(group, cp_dff) + + cp_df_interp = pd.concat(cp_dfs_interp) + cp_df_interp = cp_df_interp.groupby(cp_df_interp.index).sum() + return cp_df_interp.round(ROUND_TO) + +def resample_cpo_dfs(cpo_dfs, t_min, t_max, per_cp_group_normalize, per_cp_group_date_exog, add_individual_cp_features=True): + for group, cp_dfs in tqdm(cpo_dfs.items(), desc="loading cp_df_interp"): + + 
cp_df_interp = get_cp_df_interp2(cp_dfs, group, t_min, t_max, add_individual_cp_features) + # cp_df_interp = get_cp_df_interp(cp_dfs, group, t_min, t_max, add_individual_cp_features) + #print(cp_df_interp["charge_speed"]) + # print(cp_df_interp.columns) + # print(cp_df_interp2) + # print(cp_df_interp2.columns) + # print(cp_df_interp.equals(cp_df_interp2)) + #exit() + #cp_df_interp.loc[:, "num_current_sessions"] = cp_df_interp[[c for c in cp_df_interp if "event_Updated" in c]].sum(axis=1) # not needed since event_Updated already summed up... + + if per_cp_group_date_exog: + c1 = cp_df_interp.columns + get_date_exog(cp_df_interp, prefix="date_exog_group_") + c_del = [c for c in cp_df_interp.columns if c not in c1] + for c in cp_df_interp.columns: + if c.startswith("date_exog_group_"): + cp_df_interp = pd.concat([cp_df_interp, pd.get_dummies(cp_df_interp[c], prefix=c)], axis=1) + cp_df_interp = cp_df_interp[[c for c in cp_df_interp.columns if c not in c_del]] + + if per_cp_group_normalize: + normalize_cols(cp_df_interp, "_group_norm") + + # if len(cp_df_interp.loc[cp_df_interp.isna().any(axis=1), cp_df_interp.isna().any()]) > 1: + # print("1",cp_df_interp.loc[cp_df_interp.isna().any(axis=1), cp_df_interp.isna().any()]) + # exit() + # print(group, cp_df_interp) + # break + yield group, cp_df_interp + + +def _get_cp_feat_cols(cp_df_interp, group, cpo_feat_cols, keep_only_normalized=False, keep_all_normalized=False, pred_col = "charge_speed_lag_0"): + keep_lag_0_CPs = ["charge_speed"] + keep_lag_0_n = ["time_diff", 'event_Ended', 'event_Started', 'event_Updated', "date_exog_cp_", "date_exog_group_"] + if keep_all_normalized: + keep_lag_0_n+=["_norm"] + keep_lag_1_n = ["meter_value_sampled_value", "meter_diff", "charge_speed"] + + + for col in cp_df_interp.columns: + + for s in keep_lag_0_CPs: + if (s in col and "_lag_" in col and "CP_" in col) and ("custom_data_" not in col and "is_attack" not in col): + if keep_only_normalized and "_norm" not in col: + continue + cpo_feat_cols[group]["keep_lag_0_n_cols"].add(col) + + for s in keep_lag_0_n: + if (s in col and "_lag_" in col) and ("custom_data_" not in col and "is_attack" not in col): + if keep_only_normalized and "_norm" not in col: + continue + cpo_feat_cols[group]["keep_lag_0_n_cols"].add(col) + for s in keep_lag_1_n: + if (s in col and "_lag_0" not in col) and ("custom_data_" not in col and "is_attack" not in col): + if keep_only_normalized and "_norm" not in col: + continue + if col in cpo_feat_cols[group]["keep_lag_0_n_cols"]: + continue + cpo_feat_cols[group]["keep_lag_1_n_cols"].add(col) + + cpo_feat_cols[group]["pred_col"].add(pred_col) + +def _get_cpo_feat_cols(cpo_df_interp_l, keep_only_normalized=False): + cpo_feat_cols = defaultdict(lambda: defaultdict(lambda: set())) + for group, cp_df_interp in cpo_df_interp_l.items(): + _get_cp_feat_cols(cp_df_interp, group, cpo_feat_cols, keep_only_normalized) + return cpo_feat_cols + +def get_cp_feat_cols(g,t,cpo_group_feat_cols): + for col_t,cols in t.items(): + if col_t in ["keep_lag_0_n_cols", "keep_lag_1_n_cols"]: + cpo_group_feat_cols[g]["feat_cols"].extend(sorted(cols)) + elif col_t in ["pred_col"]: + cpo_group_feat_cols[g]["pred_col"].extend(sorted(cols)) + +def get_cpo_feat_cols(cpo_df_interp_l): + cpo_feat_cols = _get_cpo_feat_cols(cpo_df_interp_l) + cpo_group_feat_cols = defaultdict(lambda: defaultdict(lambda: list())) + for g,t in cpo_feat_cols.items(): + get_cp_feat_cols(g,t,cpo_group_feat_cols) + return cpo_group_feat_cols + +def get_cpo_group_feat_dfs(cpo_files, 
per_cp_normalize=False, per_cp_date_exog=True, per_cp_group_normalize=True, per_cp_group_date_exog=True, num_lags=4): + cpo_dfs, t_min, t_max = get_cpo_dfs(cpo_files, per_cp_normalize=per_cp_normalize, per_cp_date_exog=per_cp_date_exog) + # cpo_df_interp = {group: cp_df_interp for group, cp_df_interp in resample_cpo_dfs(cpo_dfs, t_min, t_max, per_cp_group_normalize=per_cp_group_normalize, per_cp_group_date_exog=per_cp_group_date_exog)} + # cpo_df_interp_l = {group: cp_df_interp_l for group, cp_df_interp_l in add_all_lags(cpo_df_interp, num_lags=num_lags)} + # cpo_feat_cols = get_cpo_feat_cols(cpo_df_interp_l) + pred_col_init = "charge_speed" + pred_col = pred_col_init+"_lag_0" + + + for group, cp_df_interp in resample_cpo_dfs(cpo_dfs, t_min, t_max, per_cp_group_normalize=per_cp_group_normalize, per_cp_group_date_exog=per_cp_group_date_exog): + cp_df_interp_l = add_lags(cp_df_interp, num_lags=num_lags, only_do_col=pred_col_init) + _cpo_feat_cols = defaultdict(lambda: defaultdict(lambda: set())) + _get_cp_feat_cols(cp_df_interp_l, group, _cpo_feat_cols, pred_col=pred_col) + cpo_feat_cols = defaultdict(lambda: defaultdict(lambda: list())) + for g,t in _cpo_feat_cols.items(): + get_cp_feat_cols(g,t,cpo_feat_cols) + yield {group: {"cp_g_df": cp_df_interp_l, + "feat_cols": cpo_feat_cols[group]["feat_cols"], + "pred_col": cpo_feat_cols[group]["pred_col"][0]}} + + + + # cpo_df_interp_l_feat = {group: {"cp_g_df": cp_df_interp_l, + # "feat_cols": cpo_feat_cols[group]["feat_cols"], + # "pred_col": cpo_feat_cols[group]["pred_col"][0]} for group, cp_df_interp_l in cpo_df_interp_l.items()} + # return cpo_df_interp_l_feat + # for group, cp_df_interp_l in cpo_df_interp_l.items(): + # print(group, cp_df_interp_l) + # print([c for c in cp_df_interp_l.columns if "_norm" not in c and "_lag_0" in c]) + # """['meter_value_sampled_value', 'custom_data_meter_no_atk', 'custom_data_scaling', 'custom_data_energy_interval', 'custom_data_original_energy_interval', + # 'custom_data_average_power', 'custom_data_original_average_power', 'meter_diff', 'time_diff', 'charge_speed', 'custom_data_meter_diff', 'custom_data_charge_speed', + # 'event_Ended', 'event_Started', 'event_Updated', 'is_attack', + # 'date_exog_cp_dayofweek', 'date_exog_cp_hour', 'date_exog_cp_dayofweek_4', 'date_exog_cp_hour_8', + # 'date_exog_cp_hour_9', 'date_exog_cp_hour_10', 'date_exog_cp_hour_11', 'date_exog_cp_hour_12', 'date_exog_cp_hour_13', 'date_exog_cp_hour_14', + # 'date_exog_cp_discretize_hour_only_Play', 'date_exog_cp_discretize_hour_only_Work', 'date_exog_cp_discretize_hour_day_High-Work', 'date_exog_cp_discretize_day_is_work_True', + # 'date_exog_cp_discretize_hour_balancing_low', 'date_exog_cp_discretize_hour_balancing_peak', 'date_exog_cp_dayofweek_3', 'date_exog_cp_hour_15', 'date_exog_cp_dayofweek_0', + # 'date_exog_cp_hour_16', 'date_exog_cp_dayofweek_5', 'date_exog_cp_hour_0', 'date_exog_cp_hour_1', 'date_exog_cp_hour_2', 'date_exog_cp_hour_3', 'date_exog_cp_hour_4', + # 'date_exog_cp_hour_5', 'date_exog_cp_hour_6', 'date_exog_cp_hour_18', 'date_exog_cp_hour_19', 'date_exog_cp_hour_20', 'date_exog_cp_hour_21', 'date_exog_cp_hour_22', + # 'date_exog_cp_hour_23', 'date_exog_cp_discretize_hour_only_Sleep', 'date_exog_cp_discretize_hour_day_High-Home', 'date_exog_cp_discretize_day_is_work_False', 'date_exog_cp_hour_7', + # 'date_exog_cp_hour_17', 'date_exog_cp_dayofweek_1', + # 'date_exog_group_dayofweek_0', 'date_exog_group_dayofweek_1', 'date_exog_group_dayofweek_3', 'date_exog_group_dayofweek_4', + # 'date_exog_group_dayofweek_5', 
'date_exog_group_dayofweek_6', 'date_exog_group_hour_0', 'date_exog_group_hour_1', 'date_exog_group_hour_2', 'date_exog_group_hour_3', + # 'date_exog_group_hour_4', 'date_exog_group_hour_5', 'date_exog_group_hour_6', 'date_exog_group_hour_7', 'date_exog_group_hour_8', 'date_exog_group_hour_9', 'date_exog_group_hour_10', + # 'date_exog_group_hour_11', 'date_exog_group_hour_12', 'date_exog_group_hour_13', 'date_exog_group_hour_14', 'date_exog_group_hour_15', 'date_exog_group_hour_16', 'date_exog_group_hour_17', + # 'date_exog_group_hour_18', 'date_exog_group_hour_19', 'date_exog_group_hour_20', 'date_exog_group_hour_21', 'date_exog_group_hour_22', 'date_exog_group_hour_23', + # 'date_exog_group_discretize_hour_only_Play', 'date_exog_group_discretize_hour_only_Sleep', 'date_exog_group_discretize_hour_only_Work', 'date_exog_group_discretize_hour_day_High-Home', + # 'date_exog_group_discretize_hour_day_High-Work', 'date_exog_group_discretize_day_is_work_False', 'date_exog_group_discretize_day_is_work_True', 'date_exog_group_discretize_hour_balancing_low', + # 'date_exog_group_discretize_hour_balancing_peak']""" + # break #TODO no + # exit() + #TODO dont cheat on features, eg only prev speed but not current... + \ No newline at end of file diff --git a/ids/features_dso.py b/ids/features_dso.py new file mode 100644 index 0000000000000000000000000000000000000000..370fa2413104ecbcecb6b8e41f39cc241f52145a --- /dev/null +++ b/ids/features_dso.py @@ -0,0 +1,348 @@ + +from collections import defaultdict +from datetime import datetime +import os +import warnings +from matplotlib import pyplot as plt +import pandas as pd +from tqdm import tqdm + +import numpy as np + +import re + +import logging + +from features_aux import get_date_exog, get_date_exog_col, get_elems_attatched_to_bus, get_grid_pp, load_single_feat, normalize_cols, add_all_lags +from features_clf import _interp + + +logger = logging.getLogger("WATTSON_EV_IDS.FeaturesDSO") + +ROUND_TO=6 + +def get_dso_files(x_path): + onlyfiles = [f for f in os.listdir(x_path) if os.path.isfile(os.path.join(x_path, f))] + onlyfiles = [f for f in onlyfiles if f.startswith("dso_oscp_data_") and f.endswith(".csv.gz")] + return onlyfiles + +def read_dso_files(dir, files): + # print(dir, files) + regex = re.compile(r"^dso_oscp_data_(CP_\d+).csv.gz$") + dso_files=dict() + for f in tqdm(files, desc="loading read_dso_files"): + #dso_oscp_data_CP_1.csv.gz + #sim_time oscp_interval_h group_id measurements_value measurements_phase measurements_unit measurements_energy_type measurements_direction measurements_measure_time measurements_initial_measure_time meter_diff time_diff charge_speed + result = regex.search(f) + if result: + f_df = pd.read_csv(os.path.join(dir, f), index_col="sim_time", parse_dates=["sim_time", "measurements_measure_time", "measurements_initial_measure_time"]) + if result.group(1) != f_df.iloc[0]["group_id"]: + logger.error(f"wrong cp_group {result.group(1)} != {f_df.iloc[0]['group_id']}") + # f_df = f_df[[c for c in f_df.columns if not c.startswith("custom_data") and c != "cp_c_id" and c != "cp_group"]] + dso_files[result.group(1)] = f_df + # print(f_df) + # break + else: + logger.error(f"unk file {f}") + continue + return dso_files + +def read_dso_estimations(file): + m_df = pd.read_csv(file, parse_dates=["Unnamed: 0"]) + m_df.rename(columns={"Unnamed: 0":"sim_time"}, inplace=True) + m_df["sim_time"] = m_df["sim_time"].dt.tz_localize(None) + m_df = m_df.set_index("sim_time", drop=True) + return m_df + +def read_dso_measurements(file): + try: + 
m_df = pd.read_csv(file, parse_dates=["sim_time"]) + except ValueError as e: + logger.info(f"{e} in read_dso_measurements... trying Unnamed: 0") + m_df = pd.read_csv(file, parse_dates=["Unnamed: 0"]) + m_df.rename(columns={"Unnamed: 0":"sim_time"}, inplace=True) + logger.info(f"Unnamed: 0 worked.") + m_df["sim_time"] = m_df["sim_time"].dt.tz_localize(None) + m_df = m_df.set_index("sim_time", drop=True) + return m_df + + +def get_features_dso(df, per_cp_group_normalize, per_cp_group_date_exog): + # ['oscp_interval_h', 'group_id', 'measurements_value', + # 'measurements_phase', 'measurements_unit', 'measurements_energy_type', + # 'measurements_direction', 'measurements_measure_time', + # 'measurements_initial_measure_time', 'meter_diff', 'time_diff', + # 'charge_speed'] + #print(df) + # exit() + keep_cols = ["charge_speed", "measurements_value", "meter_diff", "time_diff"] + f_df = df[[c for c in df.columns if c != "oscp_interval_h" and c != "group_id" and c != "measurements_phase" and c != "measurements_unit" and c != "measurements_energy_type" and c != "measurements_direction"]].copy() + #f_df = pd.concat([f_df, pd.get_dummies(f_df["event"], prefix="event")], axis=1) + f_df["time_diff"] = f_df["time_diff"].fillna(0) + + if per_cp_group_date_exog: + c1 = f_df.columns + get_date_exog(f_df, prefix="date_exog_cp_g_") + get_date_exog_col(f_df, col="measurements_measure_time", prefix="date_exog_cp_g_") + get_date_exog_col(f_df, col="measurements_initial_measure_time", prefix="date_exog_cp_g_") + c_del = [c for c in f_df.columns if c not in c1] + for c in f_df.columns: + if c.startswith("date_exog_cp_g_"): + f_df = pd.concat([f_df, pd.get_dummies(f_df[c], prefix=c)], axis=1) + f_df = f_df[[c for c in f_df.columns if c not in c_del]] #del non dummy date exog + keep_cols+= [c for c in f_df.columns if c.startswith("date_exog_cp_g_")] + + f_df = f_df[[c for c in f_df.columns if c in keep_cols]] + + if per_cp_group_normalize: + normalize_cols(f_df, "_cp_g_norm") + return f_df.copy() + +def resample_feats(df, t_min, t_max, freq='15Min'): + oidx = df.index + #nidx = pd.date_range(oidx.min(), oidx.max(), freq='5Min') + nidx = pd.date_range(t_min, t_max, freq=freq) + res = df.reindex(oidx.union(nidx)).interpolate('time', limit_direction="both").reindex(nidx) + return res + + +def get_dso_dfs(dso_files, per_cp_group_normalize, per_cp_group_date_exog): + dso_dfs=dict() + t_min=dict() + t_max=dict() + for group, cpg_df in tqdm(dso_files.items(), desc="loading dso_dfs"): + cp_dff = get_features_dso(cpg_df, per_cp_group_normalize=per_cp_group_normalize, per_cp_group_date_exog=per_cp_group_date_exog) + if group not in t_min: + t_min[group] = cp_dff.index.min() + t_max[group] = cp_dff.index.max() + else: + t_min[group] = min(t_min[group], cp_dff.index.min()) + t_max[group] = max(t_max[group], cp_dff.index.max()) + + dso_dfs[group] = cp_dff + return dso_dfs, t_min, t_max + + +def get_group_meas_est_cols(pp, group, cols): + g_regex = re.compile(r"^CP_(\d+)$") + res = g_regex.search(group) + col_n = "bus."+res.group(1)+"." + group_meas_cols = [c for c in cols if col_n in c] + + if pp is not None: + conn_cols = get_elems_attatched_to_bus(pp, int(res.group(1))) + logger.debug(f"get_group_meas_est_cols adding: {conn_cols} for {col_n}") + for col_c in conn_cols: + col_c=col_c+"." 
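+            # Keep measurement columns of elements attached to this bus; for attached lines, only the to-side columns are kept (names containing "from_" are skipped), all columns are kept for other element types.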
+ group_meas_cols += [c for c in cols if col_c in c and (("line" in col_c and not "from_" in c) or ("line" not in col_c))] + group_meas_cols = sorted(set(group_meas_cols)) + logger.debug(f"get_group_meas_est_cols new: {group_meas_cols}") + else: + logger.warning(f"empty pp grid in get_group_meas_est_cols") + return group_meas_cols + +def resample_dso_dfs(pp, dso_dfs, dso_meas_df, dso_est_df, t_min, t_max, post_interp_normalize, post_interp_date_exog, add_meas_data=True, add_est_data=False, further_meas_export_data=None, pred_col="charge_speed"): + g_regex = re.compile(r"^CP_(\d+)$") + for group, cp_dff in tqdm(dso_dfs.items(), desc="loading cp_df_interp"): + cp_df_interp = resample_feats(cp_dff, t_min[group], t_max[group]) + + if add_meas_data: + group_meas_cols = get_group_meas_est_cols(pp, group, dso_meas_df.columns) + dso_meas_df_g = dso_meas_df[group_meas_cols].copy() + dso_meas_df_g.rename(columns={c:"grid_meas_"+c for c in group_meas_cols}, inplace=True) + dso_meas_df_interp = resample_feats(dso_meas_df_g, t_min[group], t_max[group]) + # print([c for c in dso_meas_df_interp.columns]) + # print(dso_meas_df_interp["grid_meas_849.10030.bus.8.active_power"]) + # print(cp_df_interp[pred_col]) + cp_df_interp = pd.concat([cp_df_interp, dso_meas_df_interp], axis=1) + for c in dso_meas_df_interp.columns: + if ".bus." in c and ".active_power" in c: #should be only one + cp_df_interp.loc[:,c+"_relation_to_"+pred_col] = cp_df_interp[pred_col]/cp_df_interp[c] + + cp_df_interp.loc[:, c.replace("bus.", "bus_and_sgen.")] = cp_df_interp[c] + for c2 in dso_meas_df_interp.columns: + if ".sgen." in c2 and ".active_power" in c2: + cp_df_interp.loc[:, c.replace("bus.", "bus_and_sgen.")] += cp_df_interp[c2] + cp_df_interp.loc[:, c.replace("grid_meas_", "grid_expo_").replace("bus.", "no_ev_load.")] = cp_df_interp.loc[:, c.replace("bus.", "bus_and_sgen.")] - cp_df_interp[pred_col] + cp_df_interp.loc[:, c.replace("grid_meas_", "grid_expo_").replace("bus.", "no_ev_load.") +"_relation_to_"+ pred_col] = cp_df_interp[pred_col] / cp_df_interp.loc[:, c.replace("grid_meas_", "grid_expo_").replace("bus.", "no_ev_load.")] + cp_df_interp.loc[:, c.replace("grid_meas_", "grid_expo_").replace("bus.", "no_ev_load.") +"_relation_to_bus"] = cp_df_interp.loc[:, c.replace("grid_meas_", "grid_expo_").replace("bus.", "no_ev_load.")] / cp_df_interp[c] + + + if further_meas_export_data is not None: + with warnings.catch_warnings(): + warnings.simplefilter('ignore', pd.errors.PerformanceWarning) + group_int = int(g_regex.search(group).group(1)) + for exp_data in further_meas_export_data[group_int]: + exp_data_interp = resample_feats(exp_data, t_min[group], t_max[group]) + rng = np.random.default_rng(seed=group_int) + for x in range(10,110,10): + rnd_fac = rng.integers(low=x, high=min(x+11, 101), size=None) + rnd_fac_s = rnd_fac * 0.01 + #print(rnd_fac_s) + exp_data_interp_fac=exp_data_interp*rnd_fac_s + exp_data_interp_fac.name = exp_data_interp.name+"_static_fac."+str(x) #same random fac over all measurements of a load (diff for diff loads) + cp_df_interp = pd.concat([cp_df_interp, exp_data_interp_fac], axis=1) + for x in range(10,110,10): + rnd_fac = rng.integers(low=x, high=min(x+11, 101), size=exp_data_interp.size) + rnd_fac_s = pd.Series(rnd_fac, index = exp_data_interp.index) + rnd_fac_s = rnd_fac_s * 0.01 + exp_data_interp_fac=exp_data_interp*rnd_fac_s + exp_data_interp_fac.name = exp_data_interp.name+"_rnd_fac."+str(x) #diff random fac over all measurements of same load + cp_df_interp = pd.concat([cp_df_interp, 
exp_data_interp_fac], axis=1) + for c in [col for col in cp_df_interp.columns]: #grid_expo_load.5.MEASUREMENT.active_power_static_fac.80 + if ".load." in c and ".active_power" in c and "_relation_to_" not in c: #should be only one + cp_df_interp.loc[:,c+"_relation_to_"+pred_col] = cp_df_interp[pred_col]/cp_df_interp[c] + for c2 in dso_meas_df_interp.columns: + if ".bus." in c2 and ".active_power" in c2 and "_relation_to_" not in c2: #should be only one + cp_df_interp.loc[:,c+"_relation_to_"+c2] = cp_df_interp[c2]/cp_df_interp[c] + cp_df_interp = cp_df_interp.copy() + else: + group_int = int(g_regex.search(group).group(1)) + rng = np.random.default_rng(seed=group_int) + load_cols=[] + for c in [col for col in cp_df_interp.columns]: #grid_meas_668.10340.load.11.active_power + if ".load." in c and ".active_power" in c and "_relation_to_" not in c: #should be only one or two + load_cols.append(c) + load_cols_new = {l:l.replace("grid_meas_", "grid_expo_") for l in load_cols} + cp_df_interp = cp_df_interp.rename(columns=load_cols_new) + load_cols = [v for v in load_cols_new.values()] + for lc in load_cols: + exp_data_interp = cp_df_interp[lc] + #for x in range(10,110,10): + for x in [100]: + rnd_fac = rng.integers(low=x, high=min(x+11, 101), size=None) + rnd_fac_s = rnd_fac * 0.01 + #print(rnd_fac_s) + exp_data_interp_fac=exp_data_interp*rnd_fac_s + exp_data_interp_fac.name = exp_data_interp.name+"_static_fac."+str(x) #same random fac over all measurements of a load (diff for diff loads) + cp_df_interp = pd.concat([cp_df_interp, exp_data_interp_fac], axis=1) + for x in range(2,21,2): + index_4 = exp_data_interp.index.values[::4] + rnd_fac4 = rng.uniform(low=-1*x, high=x, size=len(index_4)) + rnd_fac_s4 = pd.Series(rnd_fac4, index = index_4) + rnd_fac_s1 = _interp(rnd_fac_s4, exp_data_interp.index, col=None) + rnd_fac_s = rnd_fac_s1 * 0.01 + exp_data_interp_fac=exp_data_interp*(1+rnd_fac_s) + exp_data_interp_fac.name = exp_data_interp.name+"_rnd_fac."+str(x) #diff random fac over all measurements of same load + cp_df_interp = pd.concat([cp_df_interp, exp_data_interp_fac], axis=1) + # print(cp_df_interp) + # exit() + + for x in [90]: #TODO: deprecated... delete + rnd_fac = rng.integers(low=x, high=min(x+11, 101), size=exp_data_interp.size) + rnd_fac_s = pd.Series(rnd_fac, index = exp_data_interp.index) + rnd_fac_s = rnd_fac_s * 0.01 + exp_data_interp_fac=exp_data_interp*rnd_fac_s + exp_data_interp_fac.name = exp_data_interp.name+"_rnd_fac."+str(x) #diff random fac over all measurements of same load + cp_df_interp = pd.concat([cp_df_interp, exp_data_interp_fac], axis=1) + + for c in [col for col in cp_df_interp.columns]: #grid_expo_load.5.MEASUREMENT.active_power_static_fac.80 + if ".load." in c and ".active_power" in c and "_relation_to_" not in c and ("_static_fac." in c or "_rnd_fac." in c): #should be only one + cp_df_interp.loc[:,c+"_relation_to_"+pred_col] = cp_df_interp[pred_col]/cp_df_interp[c] + for c2 in dso_meas_df_interp.columns: + if ".bus." 
in c2 and ".active_power" in c2 and "_relation_to_" not in c2: #should be only one + cp_df_interp.loc[:,c+"_relation_to_"+c2] = cp_df_interp[c2]/cp_df_interp[c] + cp_df_interp = cp_df_interp.copy() + + if add_est_data and dso_est_df is not None: + group_est_cols = get_group_meas_est_cols(pp, group, dso_est_df.columns) + dso_est_df_g = dso_est_df[group_est_cols].copy() + dso_est_df_g.rename(columns={c:"grid_est_"+c for c in group_est_cols}, inplace=True) + dso_est_df_interp = resample_feats(dso_est_df_g, t_min[group], t_max[group]) + cp_df_interp = pd.concat([cp_df_interp, dso_est_df_interp], axis=1) + + + # print(group, [c for c in cp_df_interp.columns if "bus" in c]) + cp_df_interp = cp_df_interp.groupby(cp_df_interp.index).sum() + cp_df_interp = cp_df_interp.round(ROUND_TO) + if post_interp_date_exog: + c1 = cp_df_interp.columns + get_date_exog(cp_df_interp, prefix="date_exog_group_") + c_del = [c for c in cp_df_interp.columns if c not in c1] + for c in cp_df_interp.columns: + if c.startswith("date_exog_group_"): + cp_df_interp = pd.concat([cp_df_interp, pd.get_dummies(cp_df_interp[c], prefix=c)], axis=1) + cp_df_interp = cp_df_interp[[c for c in cp_df_interp.columns if c not in c_del]] + + if post_interp_normalize: + normalize_cols(cp_df_interp, "_group_norm") + + # if len(cp_df_interp.loc[cp_df_interp.isna().any(axis=1), cp_df_interp.isna().any()]) > 1: + # print("1",cp_df_interp.loc[cp_df_interp.isna().any(axis=1), cp_df_interp.isna().any()]) + # exit() + # print(group, [c for c in cp_df_interp.columns if "bus" in c]) + # exit() + # break + yield group, cp_df_interp + +def oscp_speed_workaround(cpo_group_speed, oscp_group_speed, pred_col = "charge_speed_lag_0"): + oidx = cpo_group_speed.index + #nidx = pd.date_range(oidx.min(), oidx.max(), freq='5Min') + nidx = oscp_group_speed.index + res = cpo_group_speed[pred_col].reindex(oidx.union(nidx)).interpolate('time', limit_direction="both").reindex(nidx) + return res + +# ['measurements_value_lag_0', 'meter_diff_lag_0', 'time_diff_lag_0', 'charge_speed_lag_0', +# 'date_exog_cp_g_...', +# 'date_exog_group_...'] +def _get_dso_feat_cols(pp,dso_df_interp_l, keep_only_normalized=False, pred_col = "charge_speed_lag_0"): + # print(dso_df_interp_l["CP_5"]) + # print([c for c in dso_df_interp_l["CP_5"].columns if "_norm" not in c and "_lag_0" in c]) + # exit() + keep_lag_0_n = ["time_diff", "date_exog_cp_g_", "date_exog_group_"] + keep_lag_1_n = ["measurements_value", "meter_diff", "charge_speed"] + + + dso_feat_cols = defaultdict(lambda: defaultdict(lambda: set())) + for group, cp_df_interp in dso_df_interp_l.items(): + keep_lag_0_n += [c.replace("_lag_0","") for c in get_group_meas_est_cols(pp, group, cp_df_interp.columns) if ("grid_meas_" in c or "grid_est_" in c) and "grid_expo_" not in c and "load." 
not in c] #exclude grid_expo_ (for load active_power) by default + # print([c.replace("_lag_0","") for c in get_group_meas_est_cols(pp, group, cp_df_interp.columns) if "grid_meas_" in c or "grid_est_" in c]) + # print(get_group_meas_est_cols(pp, group, cp_df_interp.columns)) + # exit() + for col in cp_df_interp.columns: + for s in keep_lag_0_n: + if (s in col and "_lag_" in col) and ("custom_data_" not in col and "is_attack" not in col and "grid_expo_" not in col): + if keep_only_normalized and "_norm" not in col: + continue + dso_feat_cols[group]["keep_lag_0_n_cols"].add(col) + for s in keep_lag_1_n: + if (s in col and "_lag_0" not in col) and ("custom_data_" not in col and "is_attack" not in col and "grid_expo_" not in col): + if keep_only_normalized and "_norm" not in col: + continue + dso_feat_cols[group]["keep_lag_1_n_cols"].add(col) + + dso_feat_cols[group]["pred_col"].add(pred_col) + return dso_feat_cols + +def get_dso_feat_cols(pp,dso_df_interp_l, pred_col = "charge_speed_lag_0"): + dso_feat_cols = _get_dso_feat_cols(pp,dso_df_interp_l, pred_col = pred_col) + dso_group_feat_cols = defaultdict(lambda: defaultdict(lambda: list())) + for g,t in dso_feat_cols.items(): + for col_t,cols in t.items(): + if col_t in ["keep_lag_0_n_cols", "keep_lag_1_n_cols"]: + dso_group_feat_cols[g]["feat_cols"].extend(sorted(cols)) + elif col_t in ["pred_col"]: + dso_group_feat_cols[g]["pred_col"].extend(sorted(cols)) + return dso_group_feat_cols + +def get_dso_group_feat_dfs(DIR, dso_files, dso_meas_df, dso_est_df, per_cp_group_normalize=False, per_cp_group_date_exog=True, post_interp_normalize=True, post_interp_date_exog=True, num_lags=4): + pp = get_grid_pp(DIR) + pred_col_init = "charge_speed" + pred_col = pred_col_init+"_lag_0" + + if False: + further_meas_export_data = get_grid_measurements_from_export(DIR, pp, bus_ns=[], target_elem="load", target_var=".MEASUREMENT.active_power" ) + else: + further_meas_export_data=None + + dso_dfs, t_min, t_max = get_dso_dfs(dso_files, per_cp_group_normalize=per_cp_group_normalize, per_cp_group_date_exog=per_cp_group_date_exog) + dso_df_interp = {group: cp_df_interp for group, cp_df_interp in resample_dso_dfs(pp, dso_dfs, dso_meas_df, dso_est_df, t_min, t_max, post_interp_normalize=post_interp_normalize, further_meas_export_data=further_meas_export_data, post_interp_date_exog=post_interp_date_exog)} + dso_df_interp_l = {group: cp_df_interp_l for group, cp_df_interp_l in add_all_lags(dso_df_interp, num_lags=num_lags, only_do_col=pred_col_init)} + dso_feat_cols = get_dso_feat_cols(pp,dso_df_interp_l, pred_col=pred_col) + + dso_df_interp_l_feat = {group: {"cp_g_df": cp_df_interp_l, + "feat_cols": dso_feat_cols[group]["feat_cols"], + "pred_col": dso_feat_cols[group]["pred_col"][0]} for group, cp_df_interp_l in dso_df_interp_l.items()} + return dso_df_interp_l_feat + + \ No newline at end of file diff --git a/ids/ids.conf b/ids/ids.conf new file mode 100644 index 0000000000000000000000000000000000000000..4879b01515fdec38e36dd800a958fb191e7e844e --- /dev/null +++ b/ids/ids.conf @@ -0,0 +1,9 @@ +{ + "configs": { + "latest": {"DIR": "../../wattson-artifacts/latest"}, + "test": {"DIR": "../../wattson-artifacts/elaadnl/test_powerowl_example", "TRAIN_START_DATE": "2023-11-01", "VALIDATION_START_DATE": "2023-12-01", "ATK_START_DATE": "2023-12-31"}, + "elaadnl": {"DIR": "../../wattson-artifacts/elaadnl/year_powerowl_example", "TRAIN_START_DATE": "2023-01-01", "VALIDATION_START_DATE": "2023-11-01", "ATK_START_DATE": "2023-12-01"}, + "elaadnl_atk": {"DIR": 
"../../wattson-artifacts/elaadnl/", "BASE": "elaadnl"} + }, + "NUM_THREADS": 8 +} diff --git a/ids/ids.py b/ids/ids.py new file mode 100644 index 0000000000000000000000000000000000000000..4017eaf24b6db3f09ee21729656004b0c7a26e72 --- /dev/null +++ b/ids/ids.py @@ -0,0 +1,3426 @@ + +from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor +from copy import deepcopy +from datetime import datetime +import json +import os +import re +from subprocess import PIPE, Popen +import time +from cycler import cycler +import matplotlib +from matplotlib.dates import DateFormatter +from numpy import mean +import numpy as np + +from sklearn.metrics import RocCurveDisplay, auc, confusion_matrix, roc_auc_score, roc_curve +import yaml + + +with open("ids.conf", 'r') as f: + conf = json.load(f) + configs = conf["configs"] + # print(conf) + # exit() + NUM_THREADS = conf["NUM_THREADS"] + if "affinity_mask" in conf: + affinity_mask = conf["affinity_mask"] + pid = os.getpid() + os.sched_setaffinity(pid, affinity_mask) + + +os.environ["OMP_NUM_THREADS"] = str(NUM_THREADS) +os.environ["MKL_NUM_THREADS"] = str(NUM_THREADS) +os.environ["OPENBLAS_NUM_THREADS"] = str(NUM_THREADS) +os.environ["BLIS_NUM_THREADS"] = str(NUM_THREADS) + +import argparse +import ast +from collections import defaultdict +import gzip +import pickle + + +from pathlib import Path +import pandas as pd +import matplotlib.pyplot as plt + +import logging + +from tqdm import tqdm +from load_data import clean_dataset_l, get_dataset_l, get_measurements_expo, print_data_df, get_cpo_ocpp_data, get_dso_oscp_data, get_file_dfs, get_measurements, get_estimations, plot_cpo_ocpp_data, plot_dso_oscp_data +from features_cpo import get_cpo_files, read_cpo_files, get_cpo_group_feat_dfs +from features_dso import get_dso_files, get_elems_attatched_to_bus, get_grid_pp, oscp_speed_workaround, read_dso_files, get_dso_group_feat_dfs, read_dso_measurements, read_dso_estimations +from features_aux import iter_feats, load_feats_CPg, load_feats_len, load_single_feat, prune_feats, save_feats, load_feats, get_grids_json, get_ts_sim_map, interpolate_sim_time, get_grid_measurements_from_export +from regression import eval_tuning, get_cp_group_eval_dicts, get_eval_dicts, get_prediction_dicts, get_regression_pred, get_regression_pred_conc, get_regression_pred_conc_kfold, optimize_regression +from features_clf import _interp, cross_val_clf, do_clf, do_get_clf_is_atk_conc, get_atks, get_clf_eval, get_clf_feat_dfs, get_clf_feat_file_dicts, get_clf_feat_for_conf, get_clf_is_atk, get_clf_is_atk_dfs, get_clf_result_output, get_clf_result_output_conc, get_eval_dicts_clf, get_param_grid_len, load_eval_dicts_clf, prune_clf_feats, save_clf_data, save_eval_dicts_clf, to_float_list + +logger = logging.getLogger("WATTSON_EV_IDS") + +# os.environ["QT_QPA_PLATFORM"] = "wayland" +#os.environ["QT_QPA_PLATFORM"] = "xcb" + +# py ids.py -c=load_data -v -d test +# py ids.py -c=get_features_dso -l 4 -v -d test +# py ids.py -c=get_features_cpo -l 4 -v -d test +# py ids.py -c=train_reg_dso -r test -v -d test +# py ids.py -c=train_reg_cpo -r test -f no_cps no_grid -v -d test +# py ids.py -c=eval_tuning -v -d elaadnl +# py ids.py -c=do_pred_dso -s 5 -v -d elaadnl +# py ids.py -c=do_pred_cpo -s 5 -v -d elaadnl + +# py ids.py -c=get_features_clf -v -d elaadnl + +# py ids.py -c=optimize_clf -C LocalOutlierFactor -F no_reg only_norm -v -d elaadnl +# py ids.py -c=optimize_clf -C LocalOutlierFactor -F no_reg no_norm -v -d elaadnl +# py ids.py -c=optimize_clf -C LocalOutlierFactor -F only_norm -v -d 
elaadnl +# py ids.py -c=optimize_clf -C LocalOutlierFactor -F no_norm -v -d elaadnl + + +# py ids.py -c=do_clf_dso -v -d elaadnl_atk + + +for k in configs.keys(): + configs[k]["DataPointMapsDIR"] = "../scenarios/powerowl_example/data-points" + configs[k]["OutDataDIR"] = "data/"+k + if "atk" in k: + sub_folders = [os.path.join(configs[k]["DIR"], name) for name in os.listdir(configs[k]["DIR"]) if os.path.isdir(os.path.join(configs[k]["DIR"], name)) if name.startswith("atk_")] + configs[k]["DIR"] = sub_folders + +for k in configs.keys(): + if "TRAIN_START_DATE" in configs[k]: + configs[k]["TRAIN_START_DATE"] += " 00:00:00.0" + configs[k]["VALIDATION_START_DATE"] += " 00:00:00.0" + configs[k]["ATK_START_DATE"] += " 00:00:00.0" + +dataset_choices = get_dataset_l(configs) + +#missing measurements in /home/dk/git/wattson-artifacts/elaadnl/atk_4_1.0_powerowl_example; eg for data points 818 ie bus.7 + +pd.reset_option('^display.', silent=True) +DO_OSCP_WORKAROUND=True +USE_TEX=True + +if __name__ == "__main__": + parser = argparse.ArgumentParser(prog='ids.py', usage='%(prog)s [options]') + parser.add_argument('-d', '--dataset', nargs='+', + choices=dataset_choices, + help='dataset selection') # + parser.add_argument('-c', '--case', + choices=["test", 'load_data', "get_features_dso", "get_features_cpo", + 'plot_atks', 'plot_atks2', 'get_load_yml', "plot_base", + "train_reg_dso",'train_reg_cpo', + "eval_tuning", "do_pred_dso", "do_pred_cpo", + "get_features_clf", "get_features_clf_cpo", "get_features_clf_dso", + "optimize_clf", "optimize_clf_cpo", "optimize_clf_dso", + "eval_tuning_clf", "print_eval_tuning_clf", + "get_is_atk_dfs", "get_is_atk_dfs_cpo", "get_is_atk_dfs_dso", + "do_clf", "do_clf_cpo", "do_clf_dso", + "eval_clf_results1", "eval_clf_results2", "eval_clf_results3", "eval_clf_results4"], + help='evaluation case selection') # + parser.add_argument('-f', '--features', + help="feature set selection", + default=["all"], nargs='+', + choices=["all", "no_cps", "no_cps_but_speed", + "no_grid", "no_grid_storage", "no_grid_line", "no_grid_sgen", "no_grid_bus_rel_to_pred", "add_bus_relations", ]+ + ["add_grid_load_expo_static_"+str(x) for x in range(10,110,10)]+ + ["add_grid_load_expo_rnd_"+str(x) for x in range(2,21,2)]+ + ["no_est", "no_meas", "only_pred_lag", "no_time_diff", + "only_norm", "only_norm_but_pred", "no_norm", "no_norm1", "no_norm2", "no_date1", "no_date2", + "no_hour_date", "no_hour_only_date", "no_hour_day_date", "no_day_of_week_date", "no_day_is_work_date", "no_hour_balancing_date"],) + parser.add_argument('-r', '--reg_type', + choices=['RandomForestRegressor', "DecisionTreeRegressor",'GradientBoostingRegressor', "HistGradientBoostingRegressor", "LinearSVR", "MLPRegressor", "test"], + help='regression algorithm selection (only for train_reg_dso/cpo)', default=None) # + parser.add_argument('-s', '--shifts', + help="amount of (shifted) preditctions to generate (>=1) (only for do_pred_dso/cpo)", + type=int, default=1) + parser.add_argument('-l', '--lags', + help="Amount of lags to add during feature extraction (only for get_features_dso/cpo)", + type=int, default=4) + parser.add_argument('-C', '--clf_type', + choices=['LocalOutlierFactor', "OneClassSVM", "EllipticEnvelope", "IsolationForest", "test"], + help='classification/novelty detection algorithm selection', default=None) # + parser.add_argument('-F', '--clf_features', + help="novelty detection feature set selection", + default=["all"], nargs='+', + choices=["all", "only_norm", "no_norm", "copy_reg", "no_reg", "reg_no_cps", 
"train_conta", "only_diff"]+ + ["set_1", "set_2", "set_3", "set_4", "set_5", "set_6", "set_7", "set_8", "set_9", "set_10", "set_11", "set_12", "set_13"]+ + ["set_31", "set_32", "set_33", "set_34", "set_35", "set_352"]+ + ["add_grid_load_expo_static_"+str(x) for x in range(10,110,10)]+ + ["add_grid_load_expo_rnd_"+str(x) for x in range(2,21,2)],) + + parser.add_argument('-A', '--atk_subset', + help="set considered attack subset", + choices=["mad", "fdi", "combo"],) + + parser.add_argument('-a', '--affinity_mask', + help="set CPU affinity_mask", + default=[], nargs='+',) + + parser.add_argument('-e', '--eval_fac', + help="Factor for evaluation attack size", + type=str) + + parser.add_argument('-n', '--noise', + help="Add noise to grid measurements for testing", + type=int) + + parser.add_argument( + '-U', '--use_cpo_pred', + help="Use CPO predictions at DSO.", + action="store_const", dest="use_cpo_pred", const=True, + default=False, + ) + parser.add_argument( + '-R', '--recursive_regression', + help="Use recursive_regression instead of training multiple regression models per prediction distance.", + action="store_const", dest="recursive_regression", const=True, + default=False, + ) + parser.add_argument( + '-O', '--overwrite', + help="Overwrite existing files.", + action="store_const", dest="overwrite", const=True, + default=False, + ) + parser.add_argument( + '-D', '--debug', + help="Print lots of debugging statements", + action="store_const", dest="loglevel", const=logging.DEBUG, + default=logging.WARNING, + ) + parser.add_argument( + '-v', '--verbose', + help="Be verbose", + action="store_const", dest="loglevel", const=logging.INFO, + ) + # parser.print_help() + + args = parser.parse_args() + NUM_LAGS=args.lags + logging.basicConfig(level=args.loglevel) + args.dataset = clean_dataset_l(args.dataset, configs) # + logger.info(f"{args.dataset=}, {args.case=}, {args.loglevel=}, {args.features=}, {args.reg_type=}, {args.shifts=}, {args.lags=}, {args.clf_type=}, {args.clf_features=}, {args.affinity_mask=}, {args.overwrite=}") + + + if args.affinity_mask: + pid = os.getpid() + os.sched_setaffinity(pid, [int(a) for a in args.affinity_mask]) + + file_dfs = get_file_dfs(args.dataset, configs) + + + if args.case == "load_data": + for dataset in args.dataset: + print_data_df(file_dfs[dataset]) + file_df = file_dfs[dataset] + DataPointMapsDIR = configs[dataset]["DataPointMapsDIR"] + OutDataDIR = configs[dataset]["OutDataDIR"] + + cpo_tx_df_cps = get_cpo_ocpp_data(file_df) + dso_tx_df_cp_dict = get_dso_oscp_data(file_df) + + #TODO: fix oscp data... 
+ # cp_dff = dso_tx_df_cp_dict["CP_11"] + # fig, ax = plt.subplots(figsize=(16,9)) + # ax.plot(cp_dff.index, cp_dff["charge_speed"], label="charge_speed_lag_0") + # ax.plot(cpp_dff.index, cpp_dff["charge_speed"], label="charge_speed_lag_0 cpo") + # plt.legend() + # plt.show() + # plt.close() + # exit() + + + if logger.isEnabledFor(logging.DEBUG): + for k,v in cpo_tx_df_cps.items(): + plot_cpo_ocpp_data(v) + plot_dso_oscp_data(dso_tx_df_cp_dict) + plt.show() + plt.close() + + clean_meas_df = get_measurements_expo(file_df, configs[dataset]["DIR"]) + # print(clean_meas_df) + # clean_meas_df = get_measurements(file_df, DataPointMapsDIR) + # print(clean_meas_df) + # exit() + est_df=None + if False: + est_df = get_estimations(file_df, drop_dups=False) + + Path(OutDataDIR).mkdir(parents=True, exist_ok=True) + logger.info(f"Writing results to {OutDataDIR=} ...") + for k,v in cpo_tx_df_cps.items(): + Path(OutDataDIR+"/"+k).mkdir(parents=False, exist_ok=True) + for k2,v2 in v.items(): + for cp_id, cp_df in v2: + cp_df.to_csv(OutDataDIR+"/"+k+"/cpo_ocpp_data_"+k2+"_"+cp_id+".csv.gz") + Path(OutDataDIR+"/DSO").mkdir(parents=False, exist_ok=True) + for k,v in dso_tx_df_cp_dict.items(): + v.to_csv(OutDataDIR+"/DSO/dso_oscp_data_"+k+".csv.gz") + clean_meas_df.to_csv(OutDataDIR+"/measurements.csv.gz") + if est_df is not None: + est_df.to_csv(OutDataDIR+"/estimations.csv.gz") + logger.info(f"Done. Results in {OutDataDIR=}") + + elif args.case == "get_load_yml": + for dataset in args.dataset: + DataPointMapsDIR = configs[dataset]["DataPointMapsDIR"] + regex_dp = re.compile(r"^(\d+)\-data\-points\.yml$") + # DataPointMapsDIR = configs[dataset]["DataPointMapsDIR"] + onlyfiles = [f for f in os.listdir(DataPointMapsDIR) if os.path.isfile(os.path.join(DataPointMapsDIR, f))] + logger.info(onlyfiles) + + dp_dict = defaultdict(lambda: []) + for f in tqdm(onlyfiles, desc="Loading Data Point Maps"): + result = regex_dp.search(f) + if result: + with open(os.path.join(DataPointMapsDIR, f), "rb") as f: + dp_data = yaml.load(f, Loader=yaml.FullLoader) + dp_dict[result.group(1)].append(dp_data) + else: + logger.error(f"unk file {f}") + + #[{'401': [{'identifier': '401.10010', 'protocol': '60870-5-104', 'protocol_data': {'coa': 401, 'cot': 1, 'direction': 'monitoring', 'ioa': 10010, 'type_id': 13}, 'providers': {'sources': [{'domain': 'source', 'provider_data': {'attribute': 'voltage', 'context': 'MEASUREMENT', 'grid_element': 'bus.0', 'type': 'float'}, 'provider_type': 'POWER_GRID'}]}, 'value': None}, {'identifier': '401.10020', 'protocol': '60870-5-104', 'protocol_data': {'coa': 401, 'cot': 1, .... 
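+ # Rough shape of the mappings built next (identifiers illustrative, following the
+ # example entry above):
+ #   bus_map  = {"bus.<n>": "<coa>", ...}   # grid bus -> COA of its data-point file
+ #   load_map = {"bus.<n>": pp["load"] rows attached to that bus, ...}
+ # For every load attached to a mapped bus, a new monitoring data point with
+ # attribute 'active_power' (type_id 13, next free IOA in steps of 10) is appended
+ # and the corresponding <coa>-data-points.yml file is rewritten.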
+ bus_map = dict() + for k,v in dp_dict.items(): + #print(k,v) + for x in v[0][k]: + if "providers" not in x: + continue + for ts, prov in x["providers"].items(): + ge = prov[0]["provider_data"]["grid_element"] + if "bus" in ge: + #print(k, ge) + bus_map[ge] = k + pp=get_grid_pp(configs[dataset]["DIR"]) + #print(pp["load"]) + load_map=dict() + for bus, dp in bus_map.items(): + bus_i = int(bus.split(".")[1]) + # if bus_i == 8: + # print(dp) #849 + # exit() + bus_l = pp["load"][pp["load"]["bus"] == bus_i] + if len(bus_l) > 0: + load_map[bus] = bus_l + #print(bus_l) + #exit() + + for bus, bus_l in tqdm(load_map.items()): + dp = bus_map[bus] + max_ioa=0 + existing=0 + out_dict = deepcopy(dp_dict[dp][0]) + out_dict[dp] = list() + for x in dp_dict[dp][0][dp]: + #print(x) + max_ioa=max(max_ioa,x["protocol_data"]["ioa"]) + for st, vs in x["providers"].items(): + for v in vs: + if v["provider_data"]["attribute"] == "active_power" or v["provider_data"]["attribute"] == "active_power_to": #"bus" in v["provider_data"]["grid_element"] and + out_dict[dp].append(x) + for idx,bus_load in bus_l.iterrows(): + if v["provider_data"]["grid_element"] == 'load.'+str(idx): + existing+=0 + logger.warning(f"load.{idx} already existing in {dp} {existing=}") + break + if existing > 0: + break + if existing > 0: + break + # if existing > 0: + # break + if existing > 0: + continue + for idx,bus_load in bus_l.iterrows(): + max_ioa=max_ioa+10 + new_val={'identifier': str(dp)+'.'+str(max_ioa), 'protocol': '60870-5-104', + 'protocol_data': {'coa': int(dp), 'cot': 1, 'direction': 'monitoring', 'ioa': int(max_ioa), 'type_id': 13}, + 'providers': {'sources': [{'domain': 'source', 'provider_data': {'attribute': 'active_power', 'context': 'MEASUREMENT', 'grid_element': 'load.'+str(idx), 'type': 'float'}, 'provider_type': 'POWER_GRID'}]}, + 'value': None} + dp_dict[dp][0][dp].append(new_val) + out_dict[dp].append(new_val) + if False: + max_ioa=max_ioa+10 + new_val={'identifier': str(dp)+'.'+str(max_ioa), 'protocol': '60870-5-104', + 'protocol_data': {'coa': int(dp), 'cot': 1, 'direction': 'monitoring', 'ioa': int(max_ioa), 'type_id': 13}, + 'providers': {'sources': [{'domain': 'source', 'provider_data': {'attribute': 'reactive_power', 'context': 'MEASUREMENT', 'grid_element': 'load.'+str(idx), 'type': 'float'}, 'provider_type': 'POWER_GRID'}]}, + 'value': None} + #print() + #print(new_val) + dp_dict[dp][0][dp].append(new_val) + out_dict[dp].append(new_val) + f_name=f"{dp}-data-points.yml" + out_f = os.path.join(DataPointMapsDIR,f_name) + #print(out_f) + if False: + with open(out_f, "w") as f: + yaml.dump(dp_dict[dp][0], f) + with open(out_f, "w") as f: + yaml.dump(out_dict, f) + #exit() + + + elif args.case == "get_features_dso": + reg_type=args.reg_type + atk_subset_out_name=None + + for dataset in args.dataset: + logger.info(f"{args.case} {dataset=}") + OutDataDIR = configs[dataset]["OutDataDIR"] + OutDataDIR_DSO = os.path.join(OutDataDIR, "DSO") + + + dso_files_l = get_dso_files(OutDataDIR_DSO) + dso_files = read_dso_files(OutDataDIR_DSO, dso_files_l) + + if DO_OSCP_WORKAROUND: + logger.warning("using oscp workaround. TODO: fix oscp calc by CPO in simulation...") + cpos = [f for f in os.listdir(OutDataDIR+"/feats/") if os.path.isdir(os.path.join(OutDataDIR+"/feats/", f)) and f.startswith("CPO_")] + cpo = cpos[0] #TODO: iterate cpos + cpo_feat = load_feats(OutDataDIR, cpo) #TODO: tqdm... 
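+ # Summary of the workaround applied below (see oscp_speed_workaround in
+ # features_dso): the CPO-side charge_speed_lag_0 is reindexed onto the union of
+ # the CPO and DSO timestamps, time-interpolated in both directions, and restricted
+ # to the DSO (OSCP) index, replacing the charge_speed originally reported via OSCP.
+ # With --atk_subset, the same interpolation is applied to custom_data_charge_speed
+ # and is_attack so that only the selected attack type (mad/fdi/combo) remains as
+ # manipulated data.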
+ for group, df_dict in cpo_feat: + new_speed = oscp_speed_workaround(df_dict["cp_g_df"], dso_files[group]) + dso_files[group]["charge_speed"] = new_speed + + if args.atk_subset: + new_speed_should = oscp_speed_workaround(df_dict["cp_g_df"], dso_files[group], pred_col="custom_data_charge_speed_lag_0") + new_is_atk = oscp_speed_workaround(df_dict["cp_g_df"], dso_files[group], pred_col="is_attack_lag_0") + + dso_files[group]["custom_data_charge_speed"] = new_speed_should + dso_files[group]["is_attack"] = new_is_atk + + dso_files[group]["custom_data_considered_atk"] = 0 + atk_subset_out_name=args.atk_subset + atk_l = get_atks(l_buffer=6, start_sim_offset_h=-1) + #print(atk_l) + for atk in atk_l: + if atk["type"] == None: + pass + elif args.atk_subset=="combo" and "FDI" in atk["type"] and "MAD" in atk["type"]: + dso_files[group].loc[atk["start"]:atk["end"], "custom_data_considered_atk"] = 1 + elif args.atk_subset=="fdi" and "FDI" in atk["type"]: + dso_files[group].loc[atk["start"]:atk["end"], "custom_data_considered_atk"] = 1 + elif args.atk_subset=="mad" and "MAD" in atk["type"]: + dso_files[group].loc[atk["start"]:atk["end"], "custom_data_considered_atk"] = 1 + #print(dso_files[group][dso_files[group]["is_attack"] != 0]) + dso_files[group].loc[(dso_files[group]["custom_data_considered_atk"] == 0), "charge_speed"] = dso_files[group].loc[(dso_files[group]["custom_data_considered_atk"] == 0), "custom_data_charge_speed"] + dso_files[group].loc[(dso_files[group]["custom_data_considered_atk"] == 0), "is_attack"] = 0 + #print(dso_files[group][dso_files[group]["is_attack"] != 0]) + dso_files[group] = dso_files[group].drop(columns=['custom_data_considered_atk']) + dso_files[group] = dso_files[group].drop(columns=['is_attack']) + dso_files[group] = dso_files[group].drop(columns=['custom_data_charge_speed']) + #print(dso_files[group][dso_files[group]["is_attack"] != 0]) + #exit() + + dso_est_df=None + if False: + dso_est_df = read_dso_estimations(OutDataDIR+"/estimations.csv.gz") + dso_meas_df = read_dso_measurements(OutDataDIR+"/measurements.csv.gz") + + if False: + print([c for c in dso_meas_df.columns if ".active_power" in c and "bus" in c]) + print([c for c in dso_meas_df.columns if ".active_power" in c and "sgen" in c]) + bus_ns = sorted(set([int(c.split(".")[3]) for c in dso_meas_df.columns if ".active_power" in c and "bus" in c])) + pp=get_grid_pp(configs[dataset]["DIR"]) + + _plot_l_s = get_grid_measurements_from_export(configs[dataset]["DIR"], pp, bus_ns=bus_ns, target_elem="bus", target_var=".MEASUREMENT.active_power" ) + _plot_l_s2 = get_grid_measurements_from_export(configs[dataset]["DIR"], pp, bus_ns=[], target_elem="sgen", target_var=".MEASUREMENT.active_power" ) + + def get_intert_expo(bus_meas,dso_meas_df): + interp_map=[] + bus_i = int(bus_meas.split(".")[3]) + bus_expo = _plot_l_s[bus_i][0] + bus_expo_interp = _interp(bus_expo.to_frame(), dso_meas_df.index, col=None) + #dso_meas_df[bus_meas] = bus_expo_interp + interp_map.append((bus_meas,bus_expo_interp)) + + for meas in [c for c in dso_meas_df.columns if ".active_power" in c and "sgen" in c]: + meas_i = int(meas.split(".")[3]) + for meas_expo in _plot_l_s2[bus_i]: + if "sgen."+str(meas_i)+"." 
in meas_expo.name: + meas_expo_interp = _interp(meas_expo.to_frame(), dso_meas_df.index, col=None) + #dso_meas_df[meas] = meas_expo_interp + interp_map.append((meas,meas_expo_interp)) + return interp_map + with ProcessPoolExecutor(NUM_THREADS) as pool: + results=[] + for bus_meas in tqdm([c for c in dso_meas_df.columns if ".active_power" in c and "bus" in c]): + results.append(pool.submit(get_intert_expo, bus_meas,dso_meas_df)) + for res in tqdm(results): + interp_map = res.result() + for meas,meas_expo_interp in interp_map: + dso_meas_df[meas] = meas_expo_interp + #exit() + #print([c for c in dso_meas_df if "bus." in c]) + DIR = configs[dataset]["DIR"] + dso_feat = get_dso_group_feat_dfs(DIR, dso_files, dso_meas_df, dso_est_df, num_lags=NUM_LAGS) + + # print(dso_feat["CP_11"]["cp_g_df"].columns) + # print(dso_feat["CP_11"]["cp_g_df"]) + # cp_g_df= dso_feat["CP_11"]["cp_g_df"] + # #exit() + # fig, ax = plt.subplots(figsize=(16,9)) + # ax.plot(cp_g_df.index, cp_g_df["charge_speed_lag_0"], label="charge_speed_lag_0") + # plt.legend() + # plt.show() + # plt.close() + # # print(row) + # exit() + + # logger.info(f'Writing Results to {OutDataDIR+"/feats/dso.gz"} ...') + if atk_subset_out_name is None: + save_feats(dso_feat, OutDataDIR, "DSO") + else: + save_feats(dso_feat, OutDataDIR, "DSO_"+atk_subset_out_name) + + elif args.case == "plot_base": + for dataset in args.dataset: + logger.info(dataset) + OutDataDIR = configs[dataset]["OutDataDIR"] + #TRAIN_START_DATE=configs[dataset]["TRAIN_START_DATE"] + #VALIDATION_START_DATE=configs[dataset]["VALIDATION_START_DATE"] + #ATK_START_DATE=configs[dataset]["ATK_START_DATE"] + + cpo_feat = load_feats(OutDataDIR, "CPO_0") + cpo_files=dict() + for group, df_dict in tqdm(cpo_feat, total=load_feats_len(OutDataDIR, "CPO_0"), desc="eval cpo regression "): + prune_feats(df_dict, args.features) + #print(df_dict["cp_g_df"]) + cpo_files[group] = df_dict["cp_g_df"] + #break + + print(OutDataDIR) + OutDataDIR_DSO = os.path.join(OutDataDIR, "DSO") + dso_files_l = get_dso_files(OutDataDIR_DSO) + print(dso_files_l) + dso_files = read_dso_files(OutDataDIR_DSO, dso_files_l) + for cp_g,df in dso_files.items(): + print(cp_g,df) + break + + + for cp_g in dso_files.keys(): + fig, ax = plt.subplots() + dso_df = dso_files[cp_g] + cpo_df = cpo_files[cp_g] + ax.plot(dso_df["measurements_measure_time"], dso_df["charge_speed"], label="dso "+cp_g) + #ax.plot(dso_df["measurements_measure_time"], dso_df["measurements_value"], label="dso "+cp_g) + #ax.plot(dso_df.index, dso_df["measurements_value"], label="dso "+cp_g) + ax.plot(cpo_df.index, cpo_df["charge_speed_lag_0"], label="cpo "+cp_g) + + plt.legend(loc="upper left") + plt.show() + plt.close() + exit() + + #datasetb = configs[dataset]["BASE"] + #base_dir = configs[datasetb]["OutDataDIR"] + + # base_dfs=dict() + # base_feat = load_feats(base_dir,"DSO") + # for group, df_dict in tqdm(base_feat, total=load_feats_len(base_dir, "DSO"), desc="base_dir "): + # base_dfs[group] = df_dict + + do_bus=8 + dso_feat = load_feats(OutDataDIR, ("DSO_"+args.atk_subset) if args.atk_subset else "DSO") + for group, df_dict in tqdm(dso_feat, total=load_feats_len(OutDataDIR, "DSO"), desc="eval dso regression "): + if "_"+str(do_bus) not in group: + continue + prune_feats(df_dict, args.features) + print(df_dict["cp_g_df"].loc["2023-12-01 14:30:10":]) + break + + + + elif args.case == "plot_atks": + for dataset in args.dataset: + logger.info(dataset) + OutDataDIR = configs[dataset]["OutDataDIR"] + #TRAIN_START_DATE=configs[dataset]["TRAIN_START_DATE"] 
+ #VALIDATION_START_DATE=configs[dataset]["VALIDATION_START_DATE"] + #ATK_START_DATE=configs[dataset]["ATK_START_DATE"] + + cp_g_atk_types = defaultdict(lambda: list()) + cpo_files_d = get_cpo_files(OutDataDIR) + for cpo, cps in cpo_files_d.items(): + cpo_files = read_cpo_files(os.path.join(OutDataDIR, cpo), cps) + for cp_g,v in cpo_files.items(): + for cp,df in v.items(): + # print([c for c in df.columns]) + # print(df["custom_data_atk_type"]) + df1 = df["custom_data_atk_type"].loc[df["custom_data_atk_type"].shift(-1) != df["custom_data_atk_type"]] + df2 = df["custom_data_atk_type"].loc[df["custom_data_atk_type"].shift(1) != df["custom_data_atk_type"]] + df3 = pd.concat([df1,df2]).sort_index() + df3 = df3[~df3.index.duplicated(keep='first')] + cp_g_atk_types[cp_g].append(df3) #cahnges in atk type (start and end) + # print(df3) + # exit() + + datasetb = configs[dataset]["BASE"] + base_dir = configs[datasetb]["OutDataDIR"] + + # base_dfs=dict() + # base_feat = load_feats(base_dir,"DSO") + # for group, df_dict in tqdm(base_feat, total=load_feats_len(base_dir, "DSO"), desc="base_dir "): + # base_dfs[group] = df_dict + + do_bus=8 + dso_feat = load_feats(OutDataDIR, ("DSO_"+args.atk_subset) if args.atk_subset else "DSO") + for group, df_dict in tqdm(dso_feat, total=load_feats_len(OutDataDIR, "DSO"), desc="eval dso regression "): + if "_"+str(do_bus) not in group: + continue + prune_feats(df_dict, args.features) + print(df_dict["cp_g_df"].loc["2023-12-01 14:30:10":]) + # print([c for c in df_dict["cp_g_df"].columns if "grid_expo" in c]) #grid_expo_load.7.MEASUREMENT.active_power_lag_0 + # print([c for c in df_dict["feat_cols"] if "grid_expo" in c]) + # print([c for c in df_dict["cp_g_df"].columns if "grid_meas" in c]) + # print([c for c in df_dict["feat_cols"] if "grid_meas" in c]) + #exit() + print(group) + # print(base_dfs[group]["cp_g_df"]["charge_speed_lag_0"]) + # min_speed = base_dfs[group]["cp_g_df"]["charge_speed_lag_0"].min() + # max_speed = base_dfs[group]["cp_g_df"]["charge_speed_lag_0"].max() + #exit() + # print([c for c in df_dict["cp_g_df"].columns if "grid_expo_" in c and ".active_power_" in c and "_relation_to_" not in c]) + # exit() + if False: + fig, ax = plt.subplots() + print([c for c in df_dict["cp_g_df"].columns if "grid_meas" in c and "_lag_0" in c and "bus." in c and "_relation_to_" not in c and "norm_" not in c]) + bus_vals=[c for c in df_dict["cp_g_df"].columns if "grid_meas" in c and "_lag_0" in c and "bus." in c and "_relation_to_" not in c and "norm_" not in c] + load_vals=[c for c in df_dict["cp_g_df"].columns if "grid_expo_" in c and ".active_power_static_fac.100_lag_0" in c and "load." in c and "_relation_to_" not in c and "norm_" not in c] + sgen_vals=[c for c in df_dict["cp_g_df"].columns if "grid_meas" in c and "_lag_0" in c and "sgen." in c and "_relation_to_" not in c and "norm_" not in c] + for b in bus_vals: + ax.plot(df_dict["cp_g_df"].index, df_dict["cp_g_df"][b], label=b ) + for b in load_vals: + ax.plot(df_dict["cp_g_df"].index, df_dict["cp_g_df"][b], label=b ) + for b in sgen_vals: + ax.plot(df_dict["cp_g_df"].index, df_dict["cp_g_df"][b], label=b ) + plt.legend(loc="upper left") + plt.show() + exit() + grid_meas = [c for c in df_dict["cp_g_df"].columns if "grid_meas" in c and ".active_power_lag_0" in c and ".bus." 
in c and "_relation_to_" not in c][0] + print(df_dict["cp_g_df"][grid_meas]) + # print([f for f in df_dict["cp_g_df"].columns if "lag_0" in f and "norm" not in f and "date" not in f]) + # exit() + print(df_dict["cp_g_df"]["charge_speed_lag_0"]) + print() + print([f for f in df_dict["cp_g_df"].columns if "grid_est" in f and "lag_0" in f and "norm" not in f]) + fig, ax = plt.subplots() + print([f for f in df_dict["cp_g_df"].columns if "grid_expo_" in f and "lag_0" in f and "norm" not in f]) + #exit() + + # print([c for c in df_dict["cp_g_df"].columns if "grid_meas" in c and ".active_power" in c and "load." in c and "grid_expo_" not in c]) + # exit() + sgen_meas = [c for c in df_dict["cp_g_df"].columns if "grid_meas" in c and ".active_power_lag_0" in c and ".sgen." in c][0] + df_dict["cp_g_df"]["grid_bus."+str(do_bus)+"_load_total"]=0 + df_dict["cp_g_df"]["grid_bus."+str(do_bus)+"_load_total"]+=df_dict["cp_g_df"][grid_meas] + for sgen_meas in[c for c in df_dict["cp_g_df"].columns if "grid_meas" in c and ".active_power_lag_0" in c and ".sgen." in c]: + df_dict["cp_g_df"]["grid_bus."+str(do_bus)+"_load_total"]+=df_dict["cp_g_df"][sgen_meas] + ax.plot(df_dict["cp_g_df"].index, df_dict["cp_g_df"]["grid_bus."+str(do_bus)+"_load_total"], label="grid_bus."+str(do_bus)+"_load_and_sgen_total" ) + + if False: + shift_bus=df_dict["cp_g_df"]["grid_bus."+str(do_bus)+"_load_total"].to_frame() + for grid_expo_load in [c for c in df_dict["cp_g_df"].columns if "grid_expo_" in c and ".load." in c and ".active_power_static_fac.100_lag_0" in c]: #grid_meas_668.10320.load.1.active_power_static_fac.10_lag_0 + shift_bus[grid_expo_load] =df_dict["cp_g_df"][grid_expo_load] + shift_bus["idx"]=shift_bus.index + shift_bus["idx_diff"] = shift_bus["idx"] - shift_bus["idx"].iloc[0] + shift_bus["idx_diff_fix"] = shift_bus["idx_diff"] * 0.96 + shift_bus["idx_fix"] = shift_bus["idx"].iloc[0] + shift_bus["idx_diff_fix"] + shift_bus = shift_bus.set_index(shift_bus["idx_fix"]) + shift_bus2=_interp(shift_bus["grid_bus."+str(do_bus)+"_load_total"], df_dict["cp_g_df"].index, col=None) + print(shift_bus2) + df_dict["cp_g_df"]["grid_bus."+str(do_bus)+"_load_total"] = shift_bus2 + for grid_expo_load in [c for c in df_dict["cp_g_df"].columns if "grid_expo_" in c and ".load." 
in c and ".active_power_static_fac.100_lag_0" in c]: + shift_bus2=_interp(shift_bus[grid_expo_load], df_dict["cp_g_df"].index, col=None) + print(shift_bus2) + df_dict["cp_g_df"][grid_expo_load] = shift_bus2 + ax.plot(df_dict["cp_g_df"].index, df_dict["cp_g_df"]["grid_bus."+str(do_bus)+"_load_total"], label="grid_bus."+str(do_bus)+"_load_and_sgen_total" ) + #exit() + + #ax.plot(df_dict["cp_g_df"].index, df_dict["cp_g_df"][grid_meas], label=grid_meas ) + #ax.plot(df_dict["cp_g_df"].index, df_dict["cp_g_df"]["grid_expo_load.7.MEASUREMENT.active_power_lag_0"], label="grid_expo_load.7.MEASUREMENT.active_power_lag_0" ) + + + df_dict["cp_g_df"]["grid_expo_load_total"]=0 + for grid_expo_load in [c for c in df_dict["cp_g_df"].columns if "grid_expo_load" in c and ".MEASUREMENT.active_power_static_fac.100_lag_0" in c]: + df_dict["cp_g_df"]["grid_expo_load_total"] +=df_dict["cp_g_df"][grid_expo_load] + #ax.plot(df_dict["cp_g_df"].index, df_dict["cp_g_df"]["grid_expo_load_total"], label="grid_expo_load_total" ) + + df_dict["cp_g_df"]["grid_vs_expo_remaining_load_total"]= df_dict["cp_g_df"]["grid_bus."+str(do_bus)+"_load_total"]-df_dict["cp_g_df"]["grid_expo_load_total"] + # ax.plot(df_dict["cp_g_df"].index, df_dict["cp_g_df"]["grid_vs_expo_remaining_load_total"], label="grid_vs_expo_remaining_load_total" ) + + + # df_dict["cp_g_df"]["grid_meas_load_total80"]=0 + # for grid_expo_load in [c for c in df_dict["cp_g_df"].columns if "grid_meas_" in c and ".load." in c and ".active_power_static_fac.80_lag_0" in c]: #grid_meas_668.10320.load.1.active_power_static_fac.10_lag_0 + # df_dict["cp_g_df"]["grid_meas_load_total80"] +=df_dict["cp_g_df"][grid_expo_load] + # ax.plot(df_dict["cp_g_df"].index, df_dict["cp_g_df"]["grid_meas_load_total80"], label="grid_meas_load_total80" ) + + df_dict["cp_g_df"]["grid_meas_load_total"]=0 + for grid_expo_load in [c for c in df_dict["cp_g_df"].columns if "grid_expo_" in c and ".load." 
in c and ".active_power_static_fac.100_lag_0" in c]: #grid_meas_668.10320.load.1.active_power_static_fac.10_lag_0 + df_dict["cp_g_df"]["grid_meas_load_total"] +=df_dict["cp_g_df"][grid_expo_load] + ax.plot(df_dict["cp_g_df"].index, df_dict["cp_g_df"]["grid_meas_load_total"], label="grid_meas_load_total" ) + + + # pp=get_grid_pp(configs[dataset]["DIR"]) + # _plot_l_s = get_grid_measurements_from_export(configs[dataset]["DIR"], pp, bus_ns=[int(group.split("_")[1])], target_elem="storage", target_var=".MEASUREMENT.active_power" ) + # for cp_group_grid_power in _plot_l_s[int(group.split("_")[1])]: + # print(f"{cp_group_grid_power=}") + # cp_group_grid_power_interp=_interp(cp_group_grid_power, df_dict["cp_g_df"].index, col=None) + # df_dict["cp_g_df"]["grid_meas_load_total"] += cp_group_grid_power_interp + # ax.plot(df_dict["cp_g_df"].index, df_dict["cp_g_df"]["grid_meas_load_total"], label="grid_meas_load_total+storage" ) + + + df_dict["cp_g_df"]["grid_vs_meas_remaining_load_total"]= df_dict["cp_g_df"]["grid_bus."+str(do_bus)+"_load_total"]-df_dict["cp_g_df"]["grid_meas_load_total"] + # out_vals= df_dict["cp_g_df"][(df_dict["cp_g_df"]["grid_vs_meas_remaining_load_total"] > max_speed) | (df_dict["cp_g_df"]["grid_vs_meas_remaining_load_total"] < min_speed)] + # df_dict["cp_g_df"].loc[out_vals.index, "grid_vs_meas_remaining_load_total"] = df_dict["cp_g_df"].loc[out_vals.index]["charge_speed_lag_0"] + ax.plot(df_dict["cp_g_df"].index, df_dict["cp_g_df"]["grid_vs_meas_remaining_load_total"], label="grid_vs_meas_remaining_load_total" ) + + + + ax.plot(df_dict["cp_g_df"].index, df_dict["cp_g_df"]["charge_speed_lag_0"], label="charge_speed_lag_0") + #ax.plot(df_dict["cp_g_df"].index, df_dict["cp_g_df"]["grid_est_bus.11.p_mw_lag_0"]*1000*1000*1000, label="grid_est_bus.11.p_mw_lag_0") + ax.plot(df_dict["cp_g_df"].index, [160000]*len(df_dict["cp_g_df"].index), label="base") + + file_df = file_dfs[dataset] + DataPointMapsDIR = configs[dataset]["DataPointMapsDIR"] + # clean_meas_df = get_measurements(file_df, DataPointMapsDIR) + # print(clean_meas_df) + # print(clean_meas_df.columns) + # ax.plot(clean_meas_df.index, clean_meas_df["585.10030.bus.11.active_power"] ) + #clean_meas_df.to_csv("test.csv.gz") + + if False: + pp=get_grid_pp(configs[dataset]["DIR"]) + print(pp) + print(pp["bus"]) + print(pp["load"]) + print(pp["sgen"]) + print(pp["storage"]) + elems = get_elems_attatched_to_bus(pp, do_bus) + print(elems) + + + _plot_l_s = get_grid_measurements_from_export(configs[dataset]["DIR"], pp, bus_ns=[int(group.split("_")[1])], target_elem="storage", target_var=".MEASUREMENT.active_power" ) + cp_group_grid_power = deepcopy(_plot_l_s[int(group.split("_")[1])])[0] + ax.plot(cp_group_grid_power.index, cp_group_grid_power.values, label=cp_group_grid_power.name) #grid values (w/mad attacks) + + _plot_l_s = get_grid_measurements_from_export(configs[dataset]["DIR"], pp, bus_ns=[int(group.split("_")[1])], target_elem="bus", target_var=".MEASUREMENT.active_power" ) + bus_group_grid_power = deepcopy(_plot_l_s[int(group.split("_")[1])])[0] + #ax.plot(bus_group_grid_power.index, bus_group_grid_power.values, label=bus_group_grid_power.name) #grid values (w/mad attacks) + + _plot_l_s2 = get_grid_measurements_from_export(configs[dataset]["DIR"], pp, bus_ns=[int(group.split("_")[1])], target_elem="load", target_var=".MEASUREMENT.active_power" ) + bus_group_grid_power = bus_group_grid_power + df_dict["cp_g_df"][sgen_meas].mean() + for p_lad in _plot_l_s2[int(group.split("_")[1])]: + bus_group_grid_power = 
bus_group_grid_power - p_lad + ax.plot(bus_group_grid_power.index, bus_group_grid_power.values, label="expo_bus_rem") #grid values (w/mad attacks) + + + plt.legend(loc="upper left") + plt.show() + exit() + break + + pp=get_grid_pp(configs[dataset]["DIR"]) + print(pp) + lines=pp["line"] + lines=lines[lines["from_bus"] < lines["to_bus"]] + print(lines[lines["to_bus"] == do_bus]) + print("line."+str(lines[lines["to_bus"] == do_bus].index[0])) + + ret = [] + elems = ["load", "sgen", "storage"] #, "storage"? + for e in elems: + el=pp[e] + print(el) + el = e+"."+str(el[el["bus"] == do_bus].index[0]) + ret.append(el) + print(ret) + print() + + wall_sim_map=get_ts_sim_map(configs[dataset]["DIR"]).sort_index() + print(wall_sim_map) + print(interpolate_sim_time(wall_sim_map, pd.to_datetime('2024-01-05 16:30:02.00+00:00'))) + x = pd.Series([datetime.utcfromtimestamp(sim) for sim in wall_sim_map.values], index = [datetime.utcfromtimestamp(wall) for wall in wall_sim_map.index]) + print(x.sort_index()) + #print(datetime.utcfromtimestamp(interpolate_sim_time(wall_sim_map, pd.to_datetime('2024-01-05 16:30:02.00+00:00')))) + #exit() + + elems = get_elems_attatched_to_bus(pp, do_bus) + print(elems) + elem = [e for e in elems if "load" in e][0] + + plot_t=[] + plot_b=[] + plot_l=[] + pps=get_grids_json(configs[dataset]["DIR"], wall_sim_map) + for grid in pps: + #print(grid) + if not np.isnan(grid["simtime"]): + plot_t.append(datetime.utcfromtimestamp(grid["simtime"])) + plot_b.append(grid["values"]["bus."+str(do_bus)+".MEASUREMENT.active_power"]) + plot_l.append(grid["values"][elem+".MEASUREMENT.active_power"]) + else: + logger.error(f"nan simtime for {grid['timestamp']}") + # print(datetime.utcfromtimestamp(grid["timestamp"]), datetime.utcfromtimestamp(grid["simtime"]), grid["values"]["bus.11.MEASUREMENT.active_power"]) + # print(elem, grid["values"][elem+".MEASUREMENT.active_power"]) + #exit() + plot_b_s=pd.Series(plot_b, index = plot_t).sort_index() + plot_l_s=pd.Series(plot_l, index = plot_t).sort_index() + print(plot_b_s) + print(plot_l_s) + _plot_l_s = get_grid_measurements_from_export(configs[dataset]["DIR"], pp, bus_ns=[], target_elem="load", target_var=".MEASUREMENT.active_power" ) + print(_plot_l_s[do_bus]) + print(_plot_l_s[3]) + ax.plot(plot_b_s.index, plot_b_s.values, label="bus."+str(do_bus)+".MEASUREMENT.active_power2") #bus power + ax.plot(plot_l_s.index, plot_l_s.values, label=elem+".MEASUREMENT.active_power2") #load power + + + _plot_l_s = get_grid_measurements_from_export(configs[dataset]["DIR"], pp, bus_ns=[int(group.split("_")[1])], target_elem="storage", target_var=".MEASUREMENT.active_power" ) + cp_group_grid_power = deepcopy(_plot_l_s[int(group.split("_")[1])])[0] + is_atk_dfs = get_clf_is_atk_dfs(OutDataDIR, group) + _is_atk_dfs = is_atk_dfs["CPO_0"] + fig, ax = plt.subplots(figsize=(16,9)) + ax.plot(cp_group_grid_power.index, cp_group_grid_power.values, label=cp_group_grid_power.name) #grid values (w/mad attacks) + ax.plot(_is_atk_dfs.index, _is_atk_dfs["charge_speed_lag_0"], label="charge_speed_lag_0") #based on EnergyInterval (w/ mad and fdi atks) + ax.plot(_is_atk_dfs.index, _is_atk_dfs["custom_data_charge_speed_lag_0"], label="custom_data_charge_speed_lag_0") #based on OriginalEnergyInterval (w/o attacks) + #ax.plot(atk_df.index, atk_df["charge_speed_lag_0"], label="charge_speed_lag_0 from "+actor_prefix) # + ax.plot(_is_atk_dfs.index, _is_atk_dfs["is_attack_lag_0"]*500, label="is_attack_lag_0") #based on OriginalEnergyInterval (w/o attacks) + # print(_is_atk_dfs) # + # exit() + 
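+ # cp_g_atk_types[group] (built from the CPO files above) holds, per charge point,
+ # the rows where custom_data_atk_type changes, i.e. the start and end of each
+ # attack window. Below these windows are painted onto _is_atk_dfs as per-type
+ # indicator columns (cp_atk_dfs_MAD / _FDI / _FDI_post_MAD) so the attack
+ # intervals can be marked in the plot.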
+ cp_atk_dfs = cp_g_atk_types[group] + _is_atk_dfs["cp_atk_dfs"+"_None"] = -1 + _is_atk_dfs["cp_atk_dfs"+"_Unk"] = -1 + _is_atk_dfs["cp_atk_dfs"+"_MAD"] = -1 + _is_atk_dfs["cp_atk_dfs"+"_FDI"] = -1 + _is_atk_dfs["cp_atk_dfs"+"_FDI_post_MAD"] = -1 + for cp_atk_df in cp_atk_dfs: + print(cp_atk_df.drop_duplicates()) + for i in range(len(cp_atk_df)-1): + s=cp_atk_df.index[i] + e=cp_atk_df.index[i+1] + atk_val = "Unk" + if cp_atk_df.loc[s] == "FDI" and cp_atk_df.loc[e] == "FDI": + atk_val = "FDI" + elif cp_atk_df.loc[s] == "MAD" and cp_atk_df.loc[e] == "MAD": + atk_val = "MAD" + elif cp_atk_df.loc[s] == "FDI_post_MAD" and cp_atk_df.loc[e] == "FDI_post_MAD": + atk_val = "FDI_post_MAD" + elif cp_atk_df.loc[s] == "None" and cp_atk_df.loc[e] == "None": + atk_val = "None" + __is_atk_dfs = _is_atk_dfs.loc[s:e] + #__is_atk_dfs = __is_atk_dfs[__is_atk_dfs["cp_atk_dfs"].isna()] + _is_atk_dfs.loc[__is_atk_dfs.index, "cp_atk_dfs_"+atk_val] += 1000 + # print(_is_atk_dfs) + # exit() + + # ax.plot(_is_atk_dfs.index, _is_atk_dfs["cp_atk_dfs"+"_None"], label="cp_atk_dfs"+"_None") # + # ax.plot(_is_atk_dfs.index, _is_atk_dfs["cp_atk_dfs"+"_Unk"], label="cp_atk_dfs"+"_Unk") # + ax.plot(_is_atk_dfs.index, _is_atk_dfs["cp_atk_dfs"+"_MAD"], label="cp_atk_dfs"+"_MAD") # + ax.plot(_is_atk_dfs.index, _is_atk_dfs["cp_atk_dfs"+"_FDI"], label="cp_atk_dfs"+"_FDI") # + ax.plot(_is_atk_dfs.index, _is_atk_dfs["cp_atk_dfs"+"_FDI_post_MAD"], label="cp_atk_dfs"+"_FDI_post_MAD") # + + atk_l = get_atks(l_buffer=4, start_sim_offset_h=-2) + #print(atk_l) + _is_atk_dfs["atkt"] = None + for atk in atk_l: + __is_atk_dfs = _is_atk_dfs.loc[atk["start"]:atk["end"]] + __is_atk_dfs = __is_atk_dfs[__is_atk_dfs["atkt"].isna()] + if False: + if atk["type"] == None: + pass + elif "FDI" in atk["type"] and "MAD" in atk["type"]: + #__is_atk_dfs = __is_atk_dfs[(__is_atk_dfs["cp_atk_dfs"+"_MAD"] > 0) | (__is_atk_dfs["cp_atk_dfs"+"_FDI"] > 0) | (__is_atk_dfs["cp_atk_dfs"+"_FDI_post_MAD"] > 0)] + __is_atk_dfs = __is_atk_dfs[(__is_atk_dfs["cp_atk_dfs"+"_FDI_post_MAD"] > 0)] + elif "MAD" in atk["type"]: + __is_atk_dfs = __is_atk_dfs[__is_atk_dfs["cp_atk_dfs"+"_MAD"] > 0] + elif "FDI" in atk["type"]: + __is_atk_dfs = __is_atk_dfs[__is_atk_dfs["cp_atk_dfs"+"_FDI"] > 0] + #__is_atk_dfs = __is_atk_dfs[__is_atk_dfs["atkt"].isna()] + _is_atk_dfs.loc[__is_atk_dfs.index, "atkt"] = atk["type"] + + for atkt in _is_atk_dfs["atkt"].drop_duplicates(): + _is_atkt_dfs = _is_atk_dfs[_is_atk_dfs["atkt"] == atkt] + ax.plot(_is_atkt_dfs.index, _is_atkt_dfs["charge_speed_lag_0"], 'o', label="charge_speed_lag_0 "+str(atkt)) #based on EnergyInterval (w/ mad and fdi atks) + + #ax.plot(df_dict["cp_g_df"]["charge_speed_lag_0"].index, df_dict["cp_g_df"]["charge_speed_lag_0"], label="charge_speed_lag_0 dso") # + + + # grid_df = pd.DataFrame(grid_l) + # grid_df = grid_df.set_index("time").sort_index() + # print(grid_df) + # #exit() + # ax.plot(grid_df.index, grid_df["active_power"], label="active_power") + plt.legend(loc="upper left") + plt.show() + exit() + + #/home/dk/git/wattson-artifacts/elaadnl/year_powerowl_example/controller-export/power-grid/WALL-2024-01-04T22-58-06-981491+00-00__SIM-2023-01-01T22-38-53-577418+00-00.powerowl.p.gz + + elif args.case == "plot_atks2": + plt.rc('text', usetex=True) + plt.rc('font', family='serif') + + font_size=22 + + plt.rc('xtick',labelsize=font_size) + plt.rc('ytick',labelsize=font_size) + + plt.rc('legend',fontsize=font_size) + for dataset in args.dataset: + logger.info(dataset) + OutDataDIR = configs[dataset]["OutDataDIR"] + 
#TRAIN_START_DATE=configs[dataset]["TRAIN_START_DATE"] + #VALIDATION_START_DATE=configs[dataset]["VALIDATION_START_DATE"] + #ATK_START_DATE=configs[dataset]["ATK_START_DATE"] + + datasetb = configs[dataset]["BASE"] + base_dir = configs[datasetb]["OutDataDIR"] + + # base_dfs=dict() + # base_feat = load_feats(base_dir,"DSO") + # for group, df_dict in tqdm(base_feat, total=load_feats_len(base_dir, "DSO"), desc="base_dir "): + # base_dfs[group] = df_dict + + pp=get_grid_pp(configs[dataset]["DIR"]) + #print(pp) + lines=pp["line"] + lines=lines[lines["from_bus"] < lines["to_bus"]] + #print(lines[lines["to_bus"] == do_bus]) + #print("line."+str(lines[lines["to_bus"] == do_bus].index[0])) + + _plot_l_s = get_grid_measurements_from_export(configs[dataset]["DIR"], pp, bus_ns=[], target_elem="storage", target_var=".MEASUREMENT.active_power" ) + + _is_atk_dfs_all_cps=dict() + dso_feat = load_feats(OutDataDIR, ("DSO_"+args.atk_subset) if args.atk_subset else "DSO") + for group, df_dict in tqdm(dso_feat, total=load_feats_len(OutDataDIR, "DSO"), desc="eval dso regression "): + do_bus = int(group.split("_")[1]) + if "_"+str(do_bus) not in group: + continue + prune_feats(df_dict, args.features) + + cp_group_grid_power = _plot_l_s[int(group.split("_")[1])][0] + is_atk_dfs = get_clf_is_atk_dfs(OutDataDIR, group) + _is_atk_dfs = is_atk_dfs["CPO_0"] + _is_atk_dfs_all_cps[group] = _interp(_is_atk_dfs, cp_group_grid_power.index, col=None) + + + _is_atk_dfs_all_cps["all"] = _is_atk_dfs_all_cps["CP_11"].copy() + _is_atk_dfs_all_cps["all_grid"] = _plot_l_s[int("CP_11".split("_")[1])][0].copy() + for group, _is_atk_dfs in _is_atk_dfs_all_cps.items(): + if group in ["all", "all_grid", "CP_11"]: + continue + _is_atk_dfs_all_cps["all"]+=_is_atk_dfs + cp_group_grid_power = _plot_l_s[int(group.split("_")[1])][0] + _is_atk_dfs_all_cps["all_grid"]+=cp_group_grid_power + + cp_group_grid_power = _is_atk_dfs_all_cps["all_grid"] + cp_group_grid_power = cp_group_grid_power.loc["2023-12-22 06:00:00":"2023-12-26 02:00:00"] + _is_atk_dfs = _is_atk_dfs_all_cps["all"] + _is_atk_dfs = _is_atk_dfs.loc["2023-12-22 06:00:00":"2023-12-26 02:00:00"] + fig, ax = plt.subplots(figsize=(9,7)) + + # cmap = matplotlib.cm.get_cmap('Set1') + # cmap2 = matplotlib.cm.get_cmap('tab20c') + # cmap3 = matplotlib.cm.get_cmap('Dark2') + # df['charge_speed'].plot(ax=ax5, label='Attack Charge Speed',color=cmap(0.1)) + # df['charge_speed_should'].plot(ax=ax5, label='Normal Charge Speed', linestyle='--',color=cmap3(0.1)) + # df["pred"].plot(ax=ax5, label='Forecast', linestyle='-.',color=cmap(0.2)) + + ax.plot(cp_group_grid_power.index, cp_group_grid_power.values, label="EV Charging Grid Load", linestyle='--') #grid values (w/mad attacks) + ax.plot(_is_atk_dfs.index, _is_atk_dfs["charge_speed_lag_0"], label="Reported Load", linestyle='-.') #based on EnergyInterval (w/ mad and fdi atks) + ax.plot(_is_atk_dfs.index, _is_atk_dfs["custom_data_charge_speed_lag_0"], label="Original Load") #based on OriginalEnergyInterval (w/o attacks) + #ax.plot(atk_df.index, atk_df["charge_speed_lag_0"], label="charge_speed_lag_0 from "+actor_prefix) # + #ax.plot(_is_atk_dfs.index, _is_atk_dfs["is_attack_lag_0"]*500, label="is_attack_lag_0") #based on OriginalEnergyInterval (w/o attacks) + # print(_is_atk_dfs) # + + print( _is_atk_dfs[_is_atk_dfs["is_attack_lag_0"] != 0]) + delta=pd.Timedelta(minutes=30) + plt.axvspan(pd.to_datetime("2023-12-22 13:43:26")-delta, pd.to_datetime("2023-12-22 16:43:06")+delta, facecolor='0.55', alpha=0.2, label="Attacks") + 
plt.axvspan(pd.to_datetime("2023-12-23 06:24:34")-delta, pd.to_datetime("2023-12-23 09:23:35")+delta, facecolor='0.55', alpha=0.2) + plt.axvspan(pd.to_datetime("2023-12-24 16:13:29")-delta, pd.to_datetime("2023-12-24 20:33:38")+delta, facecolor='0.55', alpha=0.2) + plt.axvspan(pd.to_datetime("2023-12-25 09:03:59")-delta, pd.to_datetime("2023-12-25 12:34:09")+delta, facecolor='0.55', alpha=0.2) + + ax.xaxis.set_major_formatter( DateFormatter('%H:%M') ) + + plt.legend(fontsize=font_size) + #plt.legend(loc = "upper right", bbox_to_anchor=(1.16,1.085)) + plt.legend(loc = "upper right", bbox_to_anchor=(1.0,1.105)) + + #plt.tick_params(labelsize=18) + plt.gca().get_yaxis().get_major_formatter().set_useOffset(False) + plt.gca().get_yaxis().get_major_formatter().set_scientific(False) + + plt.xlim([cp_group_grid_power.index[0], cp_group_grid_power.index[-1]]) + plt.ylim([0, cp_group_grid_power.values.max()*1.1]) + + plt.xlabel(r'\textbf{Time}', fontsize=font_size+2) + plt.ylabel(r'\textbf{Power in W}', fontsize=font_size+2) + + plt.tight_layout() + + #plt.title(group) + #plt.legend(loc="upper right") + #plt.show() + Path(OutDataDIR+"/figs").mkdir(parents=True, exist_ok=True) + plt.savefig(OutDataDIR+"/figs/"+"atk.pdf") + print(cp_group_grid_power.index) + print(_is_atk_dfs.index) + continue + break + exit() + + + elif args.case == "train_reg_dso": + reg_type=args.reg_type + + for dataset in args.dataset: + OutDataDIR = configs[dataset]["OutDataDIR"] + TRAIN_START_DATE=configs[dataset]["TRAIN_START_DATE"] + VALIDATION_START_DATE=configs[dataset]["VALIDATION_START_DATE"] + ATK_START_DATE=configs[dataset]["ATK_START_DATE"] + Path(OutDataDIR+"/results/DSO."+".".join(args.features)).mkdir(parents=True, exist_ok=True) + + dso_feat = load_feats(OutDataDIR, ("DSO_"+args.atk_subset) if args.atk_subset else "DSO") + + for group, df_dict in tqdm(dso_feat, total=load_feats_len(OutDataDIR, "DSO"), desc="eval dso regression "+reg_type): + out_f_name=OutDataDIR+"/results/DSO."+".".join(args.features)+"/"+group+"_"+reg_type+".csv.gz" + if os.path.isfile(out_f_name): + if args.overwrite: + logger.debug(f"overwriting existing file {out_f_name}") + else: + logger.info(f"skipping existing file {out_f_name}") + continue + #print([f for f in df_dict["feat_cols"] if "grid_" in f and "lag_0" in f and "norm" not in f]) + #print([f for f in df_dict["feat_cols"] if "relation" in f]) + prune_feats(df_dict, args.features) + # for x in [f for f in df_dict["feat_cols"] if "grid_" in f and "lag_0" in f and "norm" not in f]: + # print(x) + # exit() + # df_dict["pred_col"] = "custom_data_charge_speed_lag_0" + ret_df = optimize_regression(df_dict, TRAIN_START_DATE, VALIDATION_START_DATE, ATK_START_DATE, reg_type, group) + ret_df.to_csv(out_f_name) + logger.info(f"Done. 
Results in {OutDataDIR+'/results/'}") + + + elif args.case == "get_features_cpo": + reg_type=args.reg_type + + for dataset in args.dataset: + logger.info(f"{args.case} {dataset=}") + OutDataDIR = configs[dataset]["OutDataDIR"] + + cpo_files_d = get_cpo_files(OutDataDIR) + for cpo, cps in cpo_files_d.items(): + cpo_files = read_cpo_files(os.path.join(OutDataDIR, cpo), cps) + for cpo_feat in get_cpo_group_feat_dfs(cpo_files, num_lags=NUM_LAGS): + + # logger.info(f'Writing Results to {OutDataDIR+"/feats/"+cpo+".gz"} ...') + save_feats(cpo_feat, OutDataDIR, cpo) + + elif args.case == "train_reg_cpo": + reg_type=args.reg_type + + for dataset in args.dataset: + OutDataDIR = configs[dataset]["OutDataDIR"] + TRAIN_START_DATE=configs[dataset]["TRAIN_START_DATE"] + VALIDATION_START_DATE=configs[dataset]["VALIDATION_START_DATE"] + ATK_START_DATE=configs[dataset]["ATK_START_DATE"] + Path(OutDataDIR+"/results/").mkdir(parents=False, exist_ok=True) + + cpos = [f for f in os.listdir(OutDataDIR+"/feats/") if os.path.isdir(os.path.join(OutDataDIR+"/feats/", f)) and f.startswith("CPO_")] + for cpo in cpos: + Path(OutDataDIR+"/results/"+cpo+"."+".".join(args.features)).mkdir(parents=False, exist_ok=True) + + cpo_feat = load_feats(OutDataDIR, cpo) + + for group, df_dict in tqdm(cpo_feat, total=load_feats_len(OutDataDIR, cpo), desc="eval cpo regression "+reg_type): + out_f_name=OutDataDIR+"/results/"+cpo+"."+".".join(args.features)+"/"+group+"_"+reg_type+".csv.gz" + if os.path.isfile(out_f_name): + if args.overwrite: + logger.debug(f"overwriting existing file {out_f_name}") + else: + logger.info(f"skipping existing file {out_f_name}") + continue + #df_dict["pred_col"] = "custom_data_charge_speed_lag_0" + prune_feats(df_dict, args.features) + ret_df = optimize_regression(df_dict, TRAIN_START_DATE, VALIDATION_START_DATE, ATK_START_DATE, reg_type, group) + ret_df.to_csv(out_f_name) + logger.info(f"Done. 
Results in {OutDataDIR+'/results/'}") + + + elif args.case == "eval_tuning": + for dataset in args.dataset: + OutDataDIR = configs[dataset]["OutDataDIR"] + ret_d = get_eval_dicts(OutDataDIR) + log_out="" + for actor in sorted(ret_d.keys()): + ret_dict = ret_d[actor] + for reg,conf_eval in ret_dict.items(): + log_out+= f"\n{actor}:\n\t{reg}\n\t{conf_eval['conf']}\n\t{conf_eval['eval']['rmse']}" + logger.info(log_out) + + + + elif args.case == "do_pred" or args.case == "do_pred_dso" or args.case == "do_pred_cpo": + num_shifts=args.shifts #py ids.py -c=do_pred_dso -s 2 -v -d elaadnl + #kfold_splits=[None, 5] #None=pred for test data; int=kfold splits for training data + + filter_features = args.features + if "all" in args.features: + filter_features=[] + + if args.case == "do_pred_dso": + prefix="DSO" + elif args.case == "do_pred_cpo": + prefix="CPO" + + for dataset in args.dataset: + is_atk=False + kfold_splits=[5] #None=pred for test data; int=kfold splits for training data + if "atk" in dataset: + is_atk=True + atk_dataset = dataset + atk_OutDataDIR = configs[atk_dataset]["OutDataDIR"] + dataset = configs[dataset]["BASE"] + kfold_splits=[None] + OutDataDIR = configs[dataset]["OutDataDIR"] + TRAIN_START_DATE=configs[dataset]["TRAIN_START_DATE"] + VALIDATION_START_DATE=configs[dataset]["VALIDATION_START_DATE"] + ATK_START_DATE=configs[dataset]["ATK_START_DATE"] + + kfold_splits_len = sum([k if k is not None else 1 for k in kfold_splits]) + + + + ret_d = get_cp_group_eval_dicts(OutDataDIR) #hyper param optimized confs + for actor in sorted(ret_d.keys()): + if not actor.startswith(prefix): + continue + ret_dict = ret_d[actor] + + if len(actor.split(".")) > 1: + features = actor.split(".")[1:] + else: + features=["all"] + logger.warning(f"assuming all features for {actor}") + actor_prefix = actor.split(".")[0] + actor_prefix = (actor_prefix+"_"+args.atk_subset) if args.atk_subset else actor_prefix + if is_atk: + Path(atk_OutDataDIR+"/predictions/"+actor_prefix+"."+".".join(features)).mkdir(parents=True, exist_ok=True) + else: + Path(OutDataDIR+"/predictions/"+actor_prefix+"."+".".join(features)).mkdir(parents=True, exist_ok=True) + + logger.debug(f"filtering {features=} based on {filter_features=}") + skip_ff=False + for ff in filter_features: + if ff not in features: + skip_ff=True + break + if skip_ff: + logger.info(f"skipping based on missing filter_feature: {ff} not in {actor_prefix} w/ {features=}") + continue + + for reg,group_d in ret_dict.items(): + # if "SVR" not in reg: + # continue + # if "RandomForestRegressor" not in reg : + # continue + # if actor_prefix=="DSO" and not ("MLPReg" in reg and actor == "DSO.only_norm_but_pred.no_cps_but_speed.no_norm1.no_date1.only_pred_lag"): + # continue + # if "CPO" in actor_prefix and ("DecisionTreeRegressor" in reg and actor == "CPO_0.only_norm.no_norm1.no_date1"): + # continue + # if "MLPReg" not in reg: + # continue + if args.reg_type is not None: + if reg != args.reg_type: + logger.info(f"skipping wrong reg_type {reg} != {args.reg_type}") + continue + + #print() + act_feat = iter_feats(OutDataDIR, actor_prefix) #generator + total=load_feats_len(OutDataDIR, actor_prefix) + g_i=0 + with tqdm(total=total*num_shifts*kfold_splits_len, desc=f"do_pred_{actor_prefix} {actor} {reg}", smoothing=0.0001) as pbar: + for group, df_dict_path in act_feat: + if is_atk: + out_f = atk_OutDataDIR+"/predictions/"+actor_prefix+"."+".".join(features)+"/"+group+"_"+reg+"_"+str(num_shifts)+".csv.gz" + else: + out_f = 
OutDataDIR+"/predictions/"+actor_prefix+"."+".".join(features)+"/"+group+"_"+reg+"_"+str(num_shifts)+".csv.gz" + + g_i+=1 + if os.path.isfile(out_f): + if args.overwrite: + logger.debug(f"overwriting existing file {out_f}") + else: + logger.info(f"skipping existing file {out_f}") + pbar.update(num_shifts*kfold_splits_len) + continue + + pbar.set_description(f"do_pred_{actor_prefix} {actor} {reg} {group} ({g_i}/{total})") + if group not in group_d: + logger.warning(f"skipping {group} not in {actor, reg} param opts, ie /results/") + pbar.update(num_shifts*kfold_splits_len) + continue + + df_dict = load_single_feat(df_dict_path) + + opt_df = group_d[group] + #print(actor, reg, group, opt_df) + # df_dict = act_feat[group] + # print([c for c in df_dict["feat_cols"] if c.startswith("charge_s")]) + prune_feats(df_dict, features) + + best_df = opt_df[opt_df["rmse"] == opt_df["rmse"].min()].iloc[0] + conf = ast.literal_eval(best_df["conf"]) + # if "MLPReg" in reg and actor == "DSO.only_norm_but_pred.no_cps_but_speed.no_norm1.no_date1.only_pred_lag": + # conf = [round(c/10) if c == 20000 else c for c in conf] + # if "GradientBoostingRegressor" in reg and actor == "CPO_0.only_norm.no_norm1.no_date1": + # conf = list(conf) + [10] #n_iter_no_change + # "CPO" in actor_prefix and not ("GradientBoostingRegressor" in reg and actor == "CPO_0.only_norm.no_norm1.no_date1") + + atk_df=None + if is_atk: + atk_df_dict = load_feats_CPg(atk_OutDataDIR, actor_prefix, group) + # cp_g_df = df_dict["cp_g_df"].loc[TRAIN_START_DATE:ATK_START_DATE] + # atk_cp_g_df = atk_df_dict["cp_g_df"].loc[ATK_START_DATE:] + # df_dict["cp_g_df"] = pd.concat([cp_g_df, atk_cp_g_df]) + atk_df = atk_df_dict["cp_g_df"] + # print(df_dict["cp_g_df"]["is_attack_lag_0"].value_counts()) + # print(atk_df_dict["cp_g_df"]["is_attack_lag_0"].value_counts()) + + + if False: + fig, ax = plt.subplots(figsize=(16,9)) + ax.plot(atk_df.index, atk_df["charge_speed_lag_0"], label="charge_speed_lag_0") #'o', + plt.legend() + plt.show() + plt.close() + exit() + + y_pred_all = get_regression_pred_conc_kfold(df_dict, TRAIN_START_DATE, ATK_START_DATE, reg, conf, actor, features, num_shifts=num_shifts, pbar=pbar, kfold_splits=kfold_splits, atk_df=atk_df, recursive=args.recursive_regression) + y_pred_all.to_csv(out_f) + + if is_atk: + logger.info(f"Done. predictions in {atk_OutDataDIR+'/predictions/'}") + else: + logger.info(f"Done. 
predictions in {OutDataDIR+'/predictions/'}") + + + elif args.case == "get_features_clf" or args.case == "get_features_clf_cpo" or args.case == "get_features_clf_dso": + for dataset in args.dataset: + logger.info(f"{dataset=}") + OutDataDIR = configs[dataset]["OutDataDIR"] + ret_d = get_prediction_dicts(OutDataDIR) + + for actor in sorted(ret_d.keys()): + if args.case == "get_features_clf_cpo" and not actor.startswith("CPO"): + continue + if args.case == "get_features_clf_dso" and not actor.startswith("DSO"): + continue + + actor_new = actor + if args.use_cpo_pred: + actor_new = actor.replace("DSO", "CPO_0") + actor_new+=".no_cps_but_speed" + actor_prefix = actor.split(".")[0] + + act_feat = iter_feats(OutDataDIR, actor_prefix) #generator + #act_feat = load_feats(OutDataDIR, actor_prefix) #generator + total=load_feats_len(OutDataDIR, actor_prefix) + # print(actor, actor_prefix) #DSO.all DSO + #for group in sorted(ret_d[actor].keys()): + #for group, df_dict in act_feat: + for group, df_dict_path in tqdm(act_feat, total=total, desc="get clf features "+actor): + df_dict=None + for cp_g_dict in ret_d[actor_new][group]: + + out_d = OutDataDIR+"/clf_feats/"+actor+"/" + if args.use_cpo_pred: + out_f = out_d + group+"_"+cp_g_dict["reg"]+"_"+cp_g_dict["shifts"]+"CPO0.csv.gz" + else: + out_f = out_d + group+"_"+cp_g_dict["reg"]+"_"+cp_g_dict["shifts"]+".csv.gz" + if os.path.isfile(out_f): + if args.overwrite: + logger.debug(f"overwriting existing file {out_f}") + else: + verify_file_integrity=False + logger.info(f"skipping existing file {out_f} {verify_file_integrity=}") + if verify_file_integrity: + p = Popen(["gzip", "-t", out_f], stdin=PIPE, stdout=PIPE, stderr=PIPE) + output, err = p.communicate() + rc = p.returncode + if rc == 0: #no error + continue + else: #rc == 1 + logger.warning(f"NOT skipping existing file {out_f}") + logger.warning(f"File broken: {rc=} {output=} {err=}") + try: + _test_if_file_broken = pd.read_csv(out_f) + logger.error(f"File broken but working read?: {out_f} {_test_if_file_broken}") + except Exception as e: + logger.warning(f"File broken: {e}") + else: + continue + + if df_dict is None: + df_dict = load_single_feat(df_dict_path) + + # print(group, cp_g_dict) #CP_1 {'reg': 'LinearSVR', 'shifts': '5', 'file': 'data/elaadnl/predictions/DSO.all/CP_1_LinearSVR_5.csv.gz', 'features': ['all']} + cp_g_pred = pd.read_csv(cp_g_dict["file"], index_col=0, parse_dates=[0]) + cp_g_pred = cp_g_pred.fillna(method='bfill') + if args.use_cpo_pred: + #print("interp") + cp_g_pred2 = _interp(cp_g_pred, df_dict["cp_g_df"].index, col=None) + for x in ["conf", "features" , "reg", "actor"]: + cp_g_pred2[x]= cp_g_pred[x].iloc[0] + cp_g_pred = cp_g_pred2.fillna(0) + #print("interp done") + # conf = cp_g_pred["conf"].iloc[0] + # conf = ast.literal_eval(conf) #regressor config + # print(conf) + # print(cp_g_pred) + # print(group, df_dict["cp_g_df"][[c for c in df_dict["cp_g_df"].columns if df_dict["pred_col"].replace("_lag_0","_lag_") in c]]) + + clf_df = get_clf_feat_dfs(cp_g_pred, df_dict["cp_g_df"], df_dict["pred_col"]) + # print(clf_df) + # exit() + save_clf_data(clf_df, out_d, out_f) + + elif args.case == "optimize_clf" or args.case == "optimize_clf_cpo" or args.case == "optimize_clf_dso": + clf_type=args.clf_type + clf_features=args.clf_features + + only_clf_feat=False + if "no_reg" in clf_features: + only_clf_feat=True + + train_contamination=False + if "train_conta" in clf_features: # + train_contamination=True + + for dataset in args.dataset: + OutDataDIR = configs[dataset]["OutDataDIR"] + 
ret_d = get_clf_feat_file_dicts(OutDataDIR) + + + #is_atk_dfs = get_clf_is_atk_dfs(OutDataDIR) #TODO only for atk dfs... + + for actor in sorted(ret_d.keys()): + if args.case == "optimize_clf_cpo" and not actor.startswith("CPO"): + continue + if args.case == "optimize_clf_dso" and not actor.startswith("DSO"): + continue + + if len(actor.split(".")) > 1: + features = actor.split(".")[1:] + else: + features=["all"] + logger.warning(f"assuming all features for {actor}") + + if "reg_no_cps" in clf_features: + if "no_cps" not in features: + features.append("no_cps") + if "copy_reg" in clf_features: # + clf_features.extend(features) + + actor_prefix = actor.split(".")[0] + act_feat = iter_feats(OutDataDIR, actor_prefix) #generator + #act_feat = load_feats(OutDataDIR, actor_prefix) #generator + total=load_feats_len(OutDataDIR, actor_prefix) + #print(actor, actor_prefix) #DSO.all DSO + #for group in sorted(ret_d[actor].keys()): + #for group, df_dict in act_feat: + param_len = get_param_grid_len(clf_type) + reg_len=len(ret_d[actor][list(ret_d[actor].keys())[0]]) + + with tqdm(total=total*reg_len*param_len, desc=f"optimize_clf {actor} {clf_type}") as pbar: + g_i=0 + for group, df_dict_path in act_feat: + g_i+=1 + #clf_is_atk = get_clf_is_atk(is_atk_dfs, group, df_dict["cp_g_df"].index) + + ret_evals_l=[] + out_d = OutDataDIR+"/clf_eval/"+actor+"/" + out_f = out_d + group+"_"+clf_type+"."+".".join(clf_features)+".csv.gz" + if os.path.isfile(out_f): + if args.overwrite: + logger.debug(f"overwriting existing file {out_f}") + else: + logger.info(f"skipping existing file {out_f}") + pbar.update(reg_len*param_len) + continue + df_dict = load_single_feat(df_dict_path) + r_i=0 + for cp_g_dict in ret_d[actor][group]: + r_i+=1 + if args.reg_type is not None: + if args.reg_type != cp_g_dict["reg"]: + logger.info(f"skipping {cp_g_dict['reg']} != {args.reg_type}") + pbar.update(param_len) + continue + + #logger.info(f'\nCurrent group: {group}/ reg: {cp_g_dict["reg"]}.{cp_g_dict["shifts"]}') + pbar.set_description(f'optimize_clf {actor} {clf_type} {group} ({g_i}/{total}) {cp_g_dict["reg"]}.{cp_g_dict["shifts"]} ({r_i}/{reg_len})') + # print(group, cp_g_dict) #CP_1 {'reg': 'LinearSVR', 'shifts': '5', 'file': 'data/elaadnl/clf_feats/DSO.all/CP_1_LinearSVR_5.csv.gz', 'features': ['all']} + #print(cp_g_dict["file"]) + cp_g_clf_feat = pd.read_csv(cp_g_dict["file"], index_col=0, parse_dates=[0]) + # print(cp_g_clf_feat) + # print(df_dict["cp_g_df"]) + #print(full_clf_feat) + #exit() + #cp_g_dict["features"] == features ? 
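+                                # Sketch of the merge performed below (the description is an inference,
+                                # the calls are taken from the code): prune_feats()/prune_clf_feats()
+                                # restrict both frames to the configured feature sets, the raw group
+                                # features are cut to the time range of the clf features, both are
+                                # concatenated column-wise (axis=1), and cross_val_clf() then runs the
+                                # hyper-parameter search for the anomaly detector on the joined frame.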
+ prune_feats(df_dict, features) + clf_feat_cols = prune_clf_feats(cp_g_clf_feat, clf_features, df_dict=df_dict) + + # print(clf_feat_cols) + # exit() + + full_feat_cols = sorted(df_dict["feat_cols"] + clf_feat_cols) + cp_g_clf_feat = cp_g_clf_feat[[c for c in full_feat_cols if c in cp_g_clf_feat.columns]] + cp_g_df = df_dict["cp_g_df"].loc[cp_g_clf_feat.index.min():cp_g_clf_feat.index.max()] + # print([c for c in cp_g_df.columns if "grid" in c]) + # exit() + cp_g_df = cp_g_df[[c for c in full_feat_cols if c in cp_g_df.columns]] + full_clf_feat = pd.concat([cp_g_clf_feat, cp_g_df], axis=1) + # print(full_clf_feat) + # print(full_clf_feat[full_clf_feat["prediction_0"].isna()]) + # print(full_clf_feat.loc[:, full_clf_feat.isna().any()]) + # exit() + + do_scale_pipeline = False + if "only_norm" in clf_features: + do_scale_pipeline = True + # print(f"{clf_features=} {do_scale_pipeline=}") + # exit() + ret_evals = cross_val_clf(clf_type, full_clf_feat, df_dict["feat_cols"], clf_feat_cols, pred_col = None, only_clf_feat=only_clf_feat, pbar=pbar, scale_pipeline=do_scale_pipeline, train_contamination=train_contamination) + for ret_eval in ret_evals: + ret_eval["reg"] = cp_g_dict["reg"] + ret_eval["shifts"] = cp_g_dict["shifts"] + ret_eval["features"] = cp_g_dict["features"] + ret_eval["clf_features"] = clf_features + ret_evals_l.extend(ret_evals) + #train_clf(cp_g_dict["reg"], cp_g_clf_feat, df_dict["cp_g_df"], df_dict["feat_cols"], df_dict["pred_col"]) + if len(ret_evals_l) > 0: + ret_eval_df = pd.DataFrame(ret_evals_l) + save_clf_data(ret_eval_df, out_d, out_f) + + + elif args.case == "eval_tuning_clf": + #TODO: eval flags... + drop_clf_feat_string=False + add_clf_reg_string=True + + for dataset in args.dataset: + OutDataDIR = configs[dataset]["OutDataDIR"] + ret_d = get_eval_dicts_clf(OutDataDIR) + + best_d=defaultdict(lambda: defaultdict(lambda: dict())) + for actor in sorted(ret_d.keys()): + ret_dict = ret_d[actor] + for group,ret_d2 in ret_dict.items(): + for clf_reg,conf_eval in ret_d2.items(): + conf_eval["reg"] = clf_reg[1] + + if args.reg_type is not None: + if args.reg_type not in conf_eval["reg"]: + logger.info(f"skipping {conf_eval['reg']} != {args.reg_type}") + continue + + clf = clf_reg[0] + + if drop_clf_feat_string: + clf = clf.split(".")[0] + if add_clf_reg_string: + clf = clf+"."+clf_reg[1] + + if clf not in best_d[actor][group]: + best_d[actor][group][clf] = conf_eval + else: + if conf_eval["eval"] >= best_d[actor][group][clf]["eval"]: + if conf_eval["offset"] > best_d[actor][group][clf]["offset"]: + best_d[actor][group][clf] = conf_eval + + save_eval_dicts_clf(best_d, OutDataDIR) + + elif args.case == "print_eval_tuning_clf": + for dataset in args.dataset: + OutDataDIR = configs[dataset]["OutDataDIR"] + log_out="" + + best_d = load_eval_dicts_clf(OutDataDIR) + for actor in sorted(best_d.keys()): + ret_dict = best_d[actor] + for group in sorted(ret_dict.keys()): + ret_d2 = ret_dict[group] + for clf in sorted(ret_d2.keys()): + conf_eval = ret_d2[clf] + reg = conf_eval["reg"] + log_out+= f"\n{actor}:\n\t{group}\n\t\t{clf, reg}\n\t\t{conf_eval['conf']}\n\t\t{conf_eval['eval'], conf_eval['offset']}" + logger.info(log_out) + + elif args.case == "get_is_atk_dfs" or args.case == "get_is_atk_dfs_cpo" or args.case == "get_is_atk_dfs_dso": + grid_data = None + for dataset in args.dataset: + logger.info(f"{dataset=}") + is_atk=False + if "atk" in dataset: + is_atk=True + atk_dataset = dataset + atk_OutDataDIR = configs[atk_dataset]["OutDataDIR"] + dataset = configs[dataset]["BASE"] + 
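+                # Convention note (as used throughout this script): attack data sets carry "atk"
+                # in their name and reference their attack-free base run via configs[<atk_dataset>]["BASE"];
+                # the TRAIN/VALIDATION/ATK start dates below are always read from that base configuration.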
TRAIN_START_DATE=configs[dataset]["TRAIN_START_DATE"] + VALIDATION_START_DATE=configs[dataset]["VALIDATION_START_DATE"] + ATK_START_DATE=configs[dataset]["ATK_START_DATE"] + else: + raise ValueError(f"not an attack data set: {dataset}") + + OutDataDIR = configs[dataset]["OutDataDIR"] + ret_d = get_clf_feat_file_dicts(OutDataDIR) + atk_ret_d = get_clf_feat_file_dicts(atk_OutDataDIR) + + best_d = load_eval_dicts_clf(OutDataDIR) + + for actor in sorted(ret_d.keys()): + if args.case == "get_is_atk_dfs_cpo" and not actor.startswith("CPO"): + continue + if args.case == "get_is_atk_dfs_dso" and not actor.startswith("DSO"): + continue + + if len(actor.split(".")) > 1: + features = actor.split(".")[1:] + else: + features=["all"] + logger.warning(f"assuming all features for {actor}") + + + actor_prefix = actor.split(".")[0] + actor_prefix_new = (actor_prefix+"_"+args.atk_subset) if args.atk_subset else actor_prefix + actor_new = actor.replace(actor_prefix, actor_prefix_new) + actor_prefix = actor_prefix_new + #act_feat = load_feats(OutDataDIR, actor_prefix) #generator + act_feat = iter_feats(OutDataDIR, actor_prefix) #generator + total=load_feats_len(OutDataDIR, actor_prefix) + #print(actor, actor_prefix) #DSO.all DSO + + + #best_d_len = len([k2 for k,v in best_d[actor].items() for k2,v2 in best_d[actor][k].items()]) + + with tqdm(total=total, desc="get_is_atk_dfs "+actor_new) as pbar: + with ProcessPoolExecutor(NUM_THREADS) as pool: #multiprocessing + g_i=0 + results=[] + for group, _ in act_feat: #df_dict_path + out_d = atk_OutDataDIR+"/clf_is_atk_labels/"+actor_new+"/" + out_f = out_d + group+".csv.gz" + + if os.path.isfile(out_f): + if args.overwrite: + logger.debug(f"overwriting existing file {out_f}") + else: + logger.info(f"skipping existing file {out_f}") + pbar.update(1) + continue + + is_atk_dfs = get_clf_is_atk_dfs(atk_OutDataDIR, group) + + atk_df_dict = load_feats_CPg(atk_OutDataDIR, actor_prefix, group) + atk_df = atk_df_dict["cp_g_df"] + + if grid_data is None: + DIR = configs[dataset]["DIR"] + pp = get_grid_pp(DIR) + #start_time=min(v.index.min() for v in is_atk_dfs.values())-pd.Timedelta(days=1) + start_time=None + grid_data = get_grid_measurements_from_export(DIR, pp, bus_ns=[], target_elem="storage", target_var=".MEASUREMENT.active_power", start_time=start_time) + # for k,v in grid_data.items(): + # print(k,len(v)) + cp_group_grid_power = deepcopy(grid_data[int(group.split("_")[1])])[0] + #print(cp_group_grid_power[min(v.index.min() for v in is_atk_dfs.values()):max(v.index.max() for v in is_atk_dfs.values())]) + #print(is_atk_dfs) + + # print(atk_df) #charge_speed_lag_0 + # exit() + + + if False: + _is_atk_dfs = is_atk_dfs["CPO_0"] + fig, ax = plt.subplots(figsize=(16,9)) + ax.plot(cp_group_grid_power.index, cp_group_grid_power.values, label=cp_group_grid_power.name) #grid values (w/mad attacks) + ax.plot(_is_atk_dfs.index, _is_atk_dfs["charge_speed_lag_0"], label="charge_speed_lag_0") #based on EnergyInterval (w/ mad and fdi atks) + ax.plot(_is_atk_dfs.index, _is_atk_dfs["custom_data_charge_speed_lag_0"], label="custom_data_charge_speed_lag_0") #based on OriginalEnergyInterval (w/o attacks) + ax.plot(atk_df.index, atk_df["charge_speed_lag_0"], label="charge_speed_lag_0 from "+actor_prefix) # + + atk_l = get_atks(5, start_sim_offset_h=-2) + #print(atk_l) + _is_atk_dfs["atkt"] = None + for atk in atk_l: + _is_atk_dfs.loc[atk["start"]:atk["end"], "atkt"] = atk["type"] + + for atkt in _is_atk_dfs["atkt"].drop_duplicates(): + _is_atkt_dfs = _is_atk_dfs[_is_atk_dfs["atkt"] == atkt] + 
ax.plot(_is_atkt_dfs.index, _is_atkt_dfs["charge_speed_lag_0"], 'o', label="charge_speed_lag_0 "+str(atkt)) #based on EnergyInterval (w/ mad and fdi atks) + + plt.legend() + plt.show() + plt.close() + # print(row) + + exit() + + output_l = do_get_clf_is_atk_conc(pool, atk_df, ATK_START_DATE, is_atk_dfs, cp_group_grid_power, pbar=None) + results.append((out_d, out_f, output_l)) + #print(actor,best_d[actor]) + #print(best_d_ag) + + + for out_d, out_f, r in results: + g_i+=1 + pbar.set_description(f'load clf features {actor_new} {group} ({g_i}/{total})') + #timestamp1 = time.time() + output_df = r.result() + pbar.update(1) + #print(out_f, output_df) + #exit() + #timestamp2 = time.time() + #print("do_get_clf_is_atk_conc took %.2f seconds" % (timestamp2 - timestamp1)) + save_clf_data(output_df, out_d, out_f) + + + + elif args.case == "do_clf" or args.case == "do_clf_cpo" or args.case == "do_clf_dso": + allow_diff_shifts=True + filter_features = args.features + if "all" in args.features: + filter_features=[] + + for dataset in args.dataset: + logger.info(f"{dataset=}") + is_atk=False + if "atk" in dataset: + is_atk=True + atk_dataset = dataset + atk_OutDataDIR = configs[atk_dataset]["OutDataDIR"] + dataset = configs[dataset]["BASE"] + TRAIN_START_DATE=configs[dataset]["TRAIN_START_DATE"] + VALIDATION_START_DATE=configs[dataset]["VALIDATION_START_DATE"] + ATK_START_DATE=configs[dataset]["ATK_START_DATE"] + else: + raise ValueError(f"not an attack data set: {dataset}") + + OutDataDIR = configs[dataset]["OutDataDIR"] + ret_d = get_clf_feat_file_dicts(OutDataDIR) + atk_ret_d = get_clf_feat_file_dicts(atk_OutDataDIR) + + best_d = load_eval_dicts_clf(OutDataDIR) + + for actor in sorted(ret_d.keys()): + if args.case == "do_clf_cpo" and not actor.startswith("CPO"): + continue + if args.case == "do_clf_dso" and not actor.startswith("DSO"): + continue + + if len(actor.split(".")) > 1: + features = actor.split(".")[1:] + else: + features=["all"] + logger.warning(f"assuming all features for {actor}") + + actor_prefix = actor.split(".")[0] + actor_prefix_new = (actor_prefix+"_"+args.atk_subset) if args.atk_subset else actor_prefix + actor_new = actor.replace(actor_prefix, actor_prefix_new) + actor_prefix = actor_prefix_new + + logger.debug(f"filtering {features=} based on {filter_features=}") + skip_ff=False + for ff in filter_features: + if ff not in features: + skip_ff=True + break + if skip_ff: + logger.info(f"skipping based on missing filter_feature: {ff} not in {actor_prefix} w/ {features=}") + continue + + #act_feat = load_feats(OutDataDIR, actor_prefix) #generator + act_feat = iter_feats(OutDataDIR, actor_prefix) #generator + total=load_feats_len(OutDataDIR, actor_prefix) + #print(actor, actor_prefix) #DSO.all DSO + + + best_d_len = len([k2 for k,v in best_d[actor].items() for k2,v2 in best_d[actor][k].items()]) + + with tqdm(total=best_d_len, desc="do_clf "+actor) as pbar: + g_i=0 + for group, df_dict_path in act_feat: + g_i+=1 + out_d = atk_OutDataDIR+"/clf_results/"+actor_new+"/" + out_f = out_d + group+".csv.gz" + + if os.path.isfile(out_f): + if args.overwrite: + logger.debug(f"overwriting existing file {out_f}") + else: + logger.info(f"skipping existing file {out_f}") + best_d_ag = best_d[actor][group] + pbar.update(len(best_d_ag.keys())) + continue + + + #print(group) + cp_g_dict = ret_d[actor][group] + atk_cp_g_dict = atk_ret_d[actor_new][group] + best_d_ag = best_d[actor][group] + # for clf, best_dict in best_d_ag.items(): + # print(clf) + + _best_d_ag_orig_len = len(best_d_ag.keys()) + if 
args.reg_type is not None: # + best_d_ag2 = dict() + for clf, best_dict in best_d_ag.items(): + if args.reg_type in clf: + best_d_ag2[clf]=best_dict + best_d_ag = best_d_ag2 + if args.clf_type is not None: #clf_type + best_d_ag2 = dict() + for clf, best_dict in best_d_ag.items(): + if args.clf_type in clf: + best_d_ag2[clf]=best_dict + best_d_ag = best_d_ag2 + + if len(best_d_ag.keys()) == 0: + logger.info(f"skipping empty best_d_ag file {out_f}") + pbar.update(_best_d_ag_orig_len) + continue + # print() + # for clf, best_dict in best_d_ag.items(): + # if "add_grid_load_expo_" in clf.replace(best_dict["reg"],""): + # print(clf) + # print(best_dict) + # exit() + + + atk_labels_d = atk_OutDataDIR+"/clf_is_atk_labels/"+actor_new+"/" + atk_labels_f = atk_labels_d + group+".csv.gz" + atk_labels_df = pd.read_csv(atk_labels_f, index_col=0, parse_dates=[0]) + + #is_atk_dfs = get_clf_is_atk_dfs(atk_OutDataDIR, group) + + df_dict = load_single_feat(df_dict_path) + atk_df_dict = load_feats_CPg(atk_OutDataDIR, actor_prefix, group) + train_df = df_dict["cp_g_df"] + atk_df = atk_df_dict["cp_g_df"] + + + + pbar.set_description(f'do_clf {actor_new} {group} ({g_i}/{total})') + timestamp1 = time.time() + output_l = get_clf_result_output_conc(best_d_ag, cp_g_dict, atk_cp_g_dict, train_df, atk_df, TRAIN_START_DATE, ATK_START_DATE, atk_labels_df, pbar=pbar, allow_diff_shifts=allow_diff_shifts) + timestamp2 = time.time() + logger.debug("get_clf_result_output_conc() took %.2f seconds" % (timestamp2 - timestamp1)) + # output_l=[] + # for clf, best_dict in best_d_ag.items(): + # clf_results = get_clf_result_output(clf, best_dict, cp_g_dict, atk_cp_g_dict TRAIN_START_DATE, ATK_START_DATE, is_atk_dfs, pbar=pbar, allow_diff_shifts=allow_diff_shifts) + # if clf_results is not None: + # output_l.append(clf_results) + + if len(output_l) > 0: + output_df = pd.DataFrame(output_l) + save_clf_data(output_df, out_d, out_f) + else: + logger.warning(f"nothing to save for {dataset,actor,group}") + + elif args.case == "eval_clf_results1" or args.case == "eval_clf_results1_cpo" or args.case == "eval_clf_results1_dso": #clf/reg comparision + allow_diff_shifts=True + FIX_atk_labels_based_on_CPO_data=True + _totals_dict=defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: {"fpr":[],"tpr":[]})))))) + out_dict=defaultdict(lambda: deepcopy(_totals_dict)) + for dataset in args.dataset: + logger.info(f"{dataset=}") + is_atk=False + if "atk" in dataset: + is_atk=True + atk_dataset = dataset + atk_OutDataDIR = configs[atk_dataset]["OutDataDIR"] + + #fig_out_dir = atk_OutDataDIR.replace(atk_dataset,configs[dataset]["BASE"]+"_"+"all_atks") + + ad = atk_dataset.split("_") + if "new" in atk_dataset: + at_p=ad[4] + at_p="99" + _fig_out_dir = atk_OutDataDIR.replace(atk_dataset,ad[0]+"_"+ad[1]+"_"+ad[2]+"_all_"+at_p) + else: + at_p=ad[3] + at_p="99" + _fig_out_dir = atk_OutDataDIR.replace(atk_dataset,ad[0]+"_"+ad[1]+"_all_"+at_p) + totals_dict=out_dict[_fig_out_dir] + + dataset = configs[dataset]["BASE"] + TRAIN_START_DATE=configs[dataset]["TRAIN_START_DATE"] + VALIDATION_START_DATE=configs[dataset]["VALIDATION_START_DATE"] + ATK_START_DATE=configs[dataset]["ATK_START_DATE"] + else: + raise ValueError(f"not an attack data set: {dataset}") + + OutDataDIR = configs[dataset]["OutDataDIR"] + ret_d = get_clf_feat_file_dicts(OutDataDIR) + atk_ret_d = get_clf_feat_file_dicts(atk_OutDataDIR) + + best_d = load_eval_dicts_clf(OutDataDIR) + + + for actor in sorted(ret_d.keys()): + if 
args.case == "eval_clf_results1_cpo" and not actor.startswith("CPO"): + continue + if args.case == "eval_clf_results1_dso" and not actor.startswith("DSO"): + continue + + if len(actor.split(".")) > 1: + features = actor.split(".")[1:] + else: + features=["all"] + logger.warning(f"assuming all features for {actor}") + + + actor_prefix = actor.split(".")[0] + actor_prefix_new = (actor_prefix+"_"+args.atk_subset) if args.atk_subset else actor_prefix + actor_new = actor.replace(actor_prefix, actor_prefix_new) + actor_prefix = actor_prefix_new + + atk_act_feat = load_feats(atk_OutDataDIR, actor_prefix) + atk_df_dict = {group: df_dict for group, df_dict in atk_act_feat} + #act_feat = load_feats(OutDataDIR, actor_prefix) #generator + act_feat = iter_feats(OutDataDIR, actor_prefix) #generator + total=load_feats_len(OutDataDIR, actor_prefix) + #print(actor, actor_prefix) #DSO.all DSO + + + best_d_len = len([k2 for k,v in best_d[actor].items() for k2,v2 in best_d[actor][k].items()]) + + subcase=("is_attack1", "1", ) #any speed diff + subcase=("is_attack2", "1", ) #is_attack flag was set + # subcase=("is_attack_th", "1", "0.5") + # subcase=("is_attack_th", "3", "0.05") + subcase=("is_attack_th", "0", "0.18") + # subcase=("is_attack_th", "1", "0.2") + # subcase=("is_attack_abs", "1", "500") + + cut_off=40 + if args.eval_fac is not None: + if "." in args.eval_fac: + cut_off=int(args.eval_fac.split(".")[1]) + subcase=("is_attack_abs", "1", args.eval_fac.split(".")[0]) + + with tqdm(total=best_d_len, desc="eval_clf_results "+actor_new) as pbar: + g_i=0 + for group, df_dict_path in act_feat: + g_i+=1 + best_d_ag = best_d[actor][group] + pbar.set_description(f'eval_clf_results {actor_new} {group} ({g_i}/{total})') + out_d = atk_OutDataDIR+"/clf_results/"+actor_new+"/" + out_f = out_d + group+".csv.gz" + + if not os.path.isfile(out_f): + logger.warning(f"skipping missing file {out_f}") + pbar.update(len(best_d_ag.keys())) + continue + + atk_labels_d = atk_OutDataDIR+"/clf_is_atk_labels/"+actor_new+"/" + atk_labels_f = atk_labels_d + group+".csv.gz" + if not os.path.isfile(atk_labels_f): + logger.warning(f"skipping missing atk label file {atk_labels_f}") + pbar.update(len(best_d_ag.keys())) + continue + + + atk_labels_df = pd.read_csv(atk_labels_f, index_col=0, parse_dates=[0]) + + out_df = pd.read_csv(out_f, index_col=0, parse_dates=[0]) + out_df["decision_function"] = out_df["decision_function"].apply(lambda x: to_float_list(x)) + #print(out_df) + #print(atk_labels_df) + if "is_attack_th" in subcase: + th_fac = float(subcase[2]) + atk_labels_df["is_attack_th"] = 1 + atk_labels_df["is_attack_th_type"] = "None" + atk_labels_df.loc[atk_labels_df["charge_speed_lag_0_diff"].abs() > th_fac*atk_labels_df["charge_speed_lag_0_diff"].abs().max(), "is_attack_th"] = -1 #diff between reported and should > threshold; captures FDI+/-, MAD+/-, but not MAD+/- with FDI== + atk_labels_df.loc[atk_labels_df["charge_speed_lag_0_diff"].abs() > th_fac*atk_labels_df["charge_speed_lag_0_diff"].abs().max(), "is_attack_th_type"] += "FDI" + atk_labels_df.loc[atk_labels_df["charge_speed_lag_0_diff"] > th_fac*atk_labels_df["charge_speed_lag_0_diff"].abs().max(), "is_attack_th_type"] += "OCPP_inc" + atk_labels_df.loc[atk_labels_df["charge_speed_lag_0_diff"] < -1*th_fac*atk_labels_df["charge_speed_lag_0_diff"].abs().max(), "is_attack_th_type"] += "OCPP_dec" + + + + if False: + grid_fac = th_fac + grid_col = [c for c in atk_labels_df.columns if "grid_expo_storage." 
in c and "_diff_to_custom_data_charge_speed_lag_0" in c] + if len(grid_col)>1: + logger.error(f"too many cols: {grid_col=}") + grid_col = grid_col[0] + atk_labels_df.loc[(atk_labels_df["is_attack_th"] == -1) & (atk_labels_df[grid_col].abs() > grid_fac*atk_labels_df[grid_col].abs().max()), "is_attack_th_type"] += "Combo" + atk_labels_df.loc[(atk_labels_df["is_attack_th"] == -1) & (atk_labels_df[grid_col] > grid_fac*atk_labels_df[grid_col].abs().max()), "is_attack_th_type"] += "Grid_inc" + atk_labels_df.loc[(atk_labels_df["is_attack_th"] == -1) & (atk_labels_df[grid_col] < -1*grid_fac*atk_labels_df[grid_col].abs().max()), "is_attack_th_type"] += "Grid_dec" + + + if True: + atk_labels_df.loc[(atk_labels_df["is_attack2"] == 1), "is_attack_th"] = 1 #clean up based on original is_attack flag + + # _fac_2 = 0.5 + # atk_labels_df.loc[atk_labels_df["charge_speed_lag_0_diff"].abs() > _fac_2*atk_labels_df["charge_speed_lag_0_diff"].abs().max(), "is_attack_th"] = -1 + # atk_labels_df.loc[atk_labels_df[grid_col].abs() > _fac_2*atk_labels_df[grid_col].abs().max(), "is_attack_th"] = -1 + # print(atk_labels_df) + # exit() + elif "is_attack_abs" in subcase: + th_fac = float(subcase[2]) + atk_labels_df["is_attack_abs"] = 1 + atk_labels_df["is_attack_th_type"] = "None" + atk_labels_df.loc[atk_labels_df["charge_speed_lag_0_diff"].abs() > th_fac, "is_attack_abs"] = -1 #diff between reported and should > threshold; captures FDI+/-, MAD+/-, but not MAD+/- with FDI== + atk_labels_df.loc[atk_labels_df["charge_speed_lag_0_diff"].abs() > th_fac, "is_attack_th_type"] += "FDI" + atk_labels_df.loc[atk_labels_df["charge_speed_lag_0_diff"] > th_fac, "is_attack_th_type"] += "OCPP_inc" + atk_labels_df.loc[atk_labels_df["charge_speed_lag_0_diff"] < -1*th_fac, "is_attack_th_type"] += "OCPP_dec" + + # grid_fac = th_fac + # grid_col = [c for c in atk_labels_df.columns if "grid_expo_storage." 
in c and "_diff_to_custom_data_charge_speed_lag_0" in c] + # if len(grid_col)>1: + # logger.error(f"too many cols: {grid_col=}") + # grid_col = grid_col[0] + # atk_labels_df.loc[(atk_labels_df[grid_col].abs() > grid_fac), "is_attack_abs"] = -1 + # atk_labels_df.loc[(atk_labels_df[grid_col].abs() > grid_fac), "is_attack_th_type"] += "MAD" + # atk_labels_df.loc[(atk_labels_df[grid_col] > grid_fac), "is_attack_th_type"] += "Grid_inc" + # atk_labels_df.loc[(atk_labels_df[grid_col] < -1*grid_fac), "is_attack_th_type"] += "Grid_dec" + + if False: + atk_labels_df.loc[(atk_labels_df["is_attack2"] == 1), "is_attack_abs"] = 1 #clean up based on original is_attack flag + + else: + atk_labels_df["is_attack_th_type"] = "Unk" + atk_labels_df.loc[(atk_labels_df["charge_speed_lag_0_diff"] > 0), "is_attack_th_type"] ="Inc" + atk_labels_df.loc[(atk_labels_df["charge_speed_lag_0_diff"] < 0), "is_attack_th_type"] ="Dec" + atk_labels_df.loc[(atk_labels_df["charge_speed_lag_0_diff"] == 0) & (atk_labels_df[subcase[0]] == 1), "is_attack_th_type"] ="None" + + + #TODO: try diff definitions + atk_labels_df["is_attack"] = atk_labels_df[subcase[0]] + #atk_labels_df["is_attack"] *= -1 + # print(atk_labels_df[atk_labels_df["is_attack"] == -1]) + #atk_labels_df["decision_function"] = atk_labels_df["decision_function"].fillna(1) + # exit() + + if False: + print(atk_labels_df) + print(atk_labels_df.columns) + + fig, ax = plt.subplots(figsize=(22,8)) + ax.plot( + atk_labels_df.index, + atk_labels_df["charge_speed_lag_0_diff"], + #color="b", + label=r"test", # {len(d4), len(aucs)} + lw=2, + alpha=0.8, + ) + plt.show() + plt.close() + exit() + + + #exit() + #fig, ax = plt.subplots() + for atk_type in atk_labels_df["is_attack_th_type"].drop_duplicates(): + if atk_type == "None": + for idx,row in out_df.iterrows(): + atk_type="AllAtk" + atk_labels_df_type = atk_labels_df.copy() + atk_labels_df_type["decision_function"] = row["decision_function"] + + if len(atk_labels_df_type[atk_labels_df_type["is_attack"] == -1]) == 0: + continue + + clf_name = row["clf"].split(".")[0] + clf_full = row["clf"].replace("."+row["clf_r"],"") + clf_r_name = row["clf_r"].split(".")[0] + clf_n_params=".".join(clf_full.split(".")[1:]) + clf_r_params=".".join(row["clf_r"].split(".")[1:]) + + fpr, tpr, thresholds = roc_curve(atk_labels_df_type["is_attack"], atk_labels_df_type["decision_function"], pos_label=-1) + auc_v = auc(fpr, tpr) + if auc_v < 0.4: + fpr, tpr, thresholds = roc_curve(atk_labels_df_type["is_attack"], atk_labels_df_type["decision_function"]*-1, pos_label=-1) + if not np.isnan(auc_v): + totals_dict[actor_new][clf_r_params][clf_n_params][atk_type][clf_name+" "+clf_r_name][group]["fpr"].append(fpr) + totals_dict[actor_new][clf_r_params][clf_n_params][atk_type][clf_name+" "+clf_r_name][group]["tpr"].append(tpr) + else: + for idx,row in out_df.iterrows(): + atk_labels_df_c = atk_labels_df.copy() + atk_labels_df_c["decision_function"] = row["decision_function"] + atk_labels_df_type = atk_labels_df_c[((atk_labels_df_c["is_attack_th_type"] == atk_type) & (atk_labels_df_c["is_attack"] == -1)) | ((atk_labels_df_c["is_attack_th_type"] == "None") & (atk_labels_df_c["is_attack"] == 1))] + # print(atk_labels_df_type[atk_labels_df_type["is_attack"] == -1]) + # exit() + + if len(atk_labels_df_type[atk_labels_df_type["is_attack"] == -1]) == 0: + continue + + if False:#drop concecutive repeats to avoid inflation of TPR wrt. 
ROC of imprecise classifiers + cs_typedf = atk_df_dict[group]["cp_g_df"].loc[atk_labels_df_type.index]["charge_speed_lag_0"].to_frame() + cs_typedf["is_attack"] = atk_labels_df_type["is_attack"] + labels = (cs_typedf["charge_speed_lag_0"] != cs_typedf["charge_speed_lag_0"].shift()).cumsum() + cs_typedf['flag'] = (labels.map(labels.value_counts()) >= cut_off).astype(int) + cs_typedf_not_same = cs_typedf[(cs_typedf["flag"] == 0) | (cs_typedf["is_attack"] == -1)] #deflate but keep all atks + atk_labels_df_type = atk_labels_df_type.loc[cs_typedf_not_same.index] + + + #RocCurveDisplay.from_predictions(atk_labels_df_type["is_attack"], atk_labels_df_type["decision_function"], pos_label=-1, ax=ax, name=row["clf"]) + # totals_dict[row["clf"]+"_"+atk_type]["is_attack"].extend(atk_labels_df_type["is_attack"]) + # totals_dict[row["clf"]+"_"+atk_type]["decision_function"].extend(atk_labels_df_type["decision_function"]) + # print(row["clf"]) + # exit() + clf_name = row["clf"].split(".")[0] + clf_full = row["clf"].replace("."+row["clf_r"],"") + clf_r_name = row["clf_r"].split(".")[0] + clf_n_params=".".join(clf_full.split(".")[1:]) + clf_r_params=".".join(row["clf_r"].split(".")[1:]) + + # totals_dict[actor_new][clf_name][row["clf_r"]][row["clf"]][atk_type]["is_attack"].extend(atk_labels_df_type["is_attack"]) + # totals_dict[actor_new][clf_name][row["clf_r"]][row["clf"]][atk_type]["decision_function"].extend(atk_labels_df_type["decision_function"]) + + + fpr, tpr, thresholds = roc_curve(atk_labels_df_type["is_attack"], atk_labels_df_type["decision_function"], pos_label=-1) + auc_v = auc(fpr, tpr) + if auc_v < 0.4: + fpr, tpr, thresholds = roc_curve(atk_labels_df_type["is_attack"], atk_labels_df_type["decision_function"]*-1, pos_label=-1) + if not np.isnan(auc_v): + totals_dict[actor_new][clf_r_params][clf_n_params][atk_type][clf_name+" "+clf_r_name][group]["fpr"].append(fpr) + totals_dict[actor_new][clf_r_params][clf_n_params][atk_type][clf_name+" "+clf_r_name][group]["tpr"].append(tpr) + + #plt.show() + #exit() + pbar.update(len(best_d_ag.keys())) + + + plt.rc('text', usetex=USE_TEX) + plt.rc('font', family='serif') + font_size=22 + plt.rc('xtick',labelsize=font_size) + plt.rc('ytick',labelsize=font_size) + plt.rc('legend',fontsize=font_size) + + + + for fig_out_dir,totals_dict_ac in out_dict.items(): + fig_out_dir += "_comp" + if FIX_atk_labels_based_on_CPO_data: + fig_out_dir += "_fix" + for actor_new,totals_dict in totals_dict_ac.items(): + if len(totals_dict)>0: + out_fig_d = fig_out_dir+("/clf_result_figs_"+args.case.replace("eval_clf_results","")+"/")+actor_new+"/"+"-".join(subcase)+"/" + Path(out_fig_d).mkdir(parents=True, exist_ok=True) + fig_dict=defaultdict(lambda:defaultdict(lambda:dict())) + for clf_r_params, d in tqdm(totals_dict.items(), desc=f"saving eval_clf_results2 {actor_new} {fig_out_dir}"): + #for clf_n, d in totals_dict.items(): + for clf_n_params, d2 in d.items(): + for atk_type, d3 in d2.items(): + fig, ax = plt.subplots(figsize=(22,8)) + #fig, ax = plt.subplots(figsize=(22,14)) + plt.rc('axes', prop_cycle=( + cycler('linestyle', ['-', '--', ':', '-.']) * + cycler('color', [u'#1f77b4', u'#ff7f0e', u'#2ca02c', u'#d62728', u'#9467bd',]) #[u'#1f77b4', u'#ff7f0e', u'#2ca02c', u'#d62728', u'#9467bd', u'#8c564b', u'#e377c2', u'#7f7f7f', u'#bcbd22', u'#17becf'] + )) + max_auc=0 + #for clf_name, d4 in d3.items(): + for clf_name in sorted(d3.keys()): + # if "HistGradientBoostingReg" not in clf_name: #TODO + # continue + d4 = d3[clf_name] + mean_fpr = np.linspace(0, 1, 100) #based on 
https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc_crossval.html + tprs = [] + aucs = [] + for group, data in d4.items(): + for fpr, tpr in zip(data["fpr"], data["tpr"]): + auc_v = auc(fpr, tpr) + interp_tpr = np.interp(mean_fpr, fpr, tpr) + interp_tpr[0] = 0.0 + tprs.append(interp_tpr) + aucs.append(auc_v) + # ax.plot( + # fpr, + # tpr, + # color="b", + # label=f"ROC {group} {clf_name}", + # lw=2, + # alpha=0.3, + # ) + mean_tpr = np.mean(tprs, axis=0) + mean_tpr[-1] = 1.0 + mean_auc = auc(mean_fpr, mean_tpr) + std_auc = np.std(aucs) + max_auc=max(max_auc, mean_auc) + ax.plot( + mean_fpr, + mean_tpr, + #color="b", + label=f"{clf_name.replace(' ', ' w/ ').replace('Regressor', 'Reg.')} (AUC = {round(mean_auc, 2)} " + r"$\pm$" +f" {round(std_auc, 2)})", # {len(d4), len(aucs)} + #label=f"{clf_name.replace(' ', ' w/ ')}", # {len(d4), len(aucs)} + #label=r"$\pm$", # {len(d4), len(aucs)} + lw=2, + alpha=0.8, + ) + out_fig_f=out_fig_d+f"R_{clf_r_params}_N_{clf_n_params}_{atk_type}_{max_auc}_44.pdf" + #fig.show() + + box = ax.get_position() + ax.set_position([box.x0, box.y0, box.width * (8/22), box.height]) + + ax.legend(loc='right', fontsize=font_size, bbox_to_anchor=(2.85,0.505)) + + plt.gca().get_yaxis().get_major_formatter().set_useOffset(False) + plt.gca().get_yaxis().get_major_formatter().set_scientific(False) + plt.xlim([0,1]) + plt.ylim([0,1]) + plt.xlabel(r'\textbf{False Positive Rate (FPR)}', fontsize=font_size+2) + plt.ylabel(r'\textbf{True Positive Rate (TPR)}', fontsize=font_size+2) + #plt.tight_layout() + + fig.savefig(out_fig_f) + plt.close() + # exit() + + + elif args.case == "eval_clf_results2" or args.case == "eval_clf_results2_cpo" or args.case == "eval_clf_results2_dso": + allow_diff_shifts=True + FIX_atk_labels_based_on_CPO_data=True + _totals_dict=defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: {"fpr":[],"tpr":[]})))))) + out_dict=defaultdict(lambda: deepcopy(_totals_dict)) + for dataset in args.dataset: + logger.info(f"{dataset=}") + is_atk=False + if "atk" in dataset: + is_atk=True + atk_dataset = dataset + atk_OutDataDIR = configs[atk_dataset]["OutDataDIR"] + + #fig_out_dir = atk_OutDataDIR.replace(atk_dataset,configs[dataset]["BASE"]+"_"+"all_atks") + + ad = atk_dataset.split("_") + if "new" in atk_dataset: + at_p=ad[4] + at_p="99" + _fig_out_dir = atk_OutDataDIR.replace(atk_dataset,ad[0]+"_"+ad[1]+"_"+ad[2]+"_all_"+at_p) + else: + at_p=ad[3] + at_p="99" + _fig_out_dir = atk_OutDataDIR.replace(atk_dataset,ad[0]+"_"+ad[1]+"_all_"+at_p) + totals_dict=out_dict[_fig_out_dir] + + + dataset = configs[dataset]["BASE"] + TRAIN_START_DATE=configs[dataset]["TRAIN_START_DATE"] + VALIDATION_START_DATE=configs[dataset]["VALIDATION_START_DATE"] + ATK_START_DATE=configs[dataset]["ATK_START_DATE"] + else: + raise ValueError(f"not an attack data set: {dataset}") + + atk_OutDataDIR_full=atk_OutDataDIR + OutDataDIR = configs[dataset]["OutDataDIR"] + if not os.path.isdir(OutDataDIR): + newOutDataDIR = OutDataDIR.replace("data", "data_full") + logger.info(f"changing {OutDataDIR=} to {newOutDataDIR}") + OutDataDIR = newOutDataDIR + atk_OutDataDIR_full = atk_OutDataDIR.replace("data", "data_full") + + ret_d = get_clf_feat_file_dicts(OutDataDIR) + atk_ret_d = get_clf_feat_file_dicts(atk_OutDataDIR_full) + + best_d = load_eval_dicts_clf(OutDataDIR) + + x_path=atk_OutDataDIR+"/clf_results/" + full_actors = [f for f in os.listdir(x_path) if os.path.isdir(os.path.join(x_path, f))] + #for actor in 
sorted(ret_d.keys()): + for actor in sorted(full_actors): + if args.case == "eval_clf_results2_cpo" and not actor.startswith("CPO"): + continue + if args.case == "eval_clf_results2_dso" and not actor.startswith("DSO"): + continue + + if len(actor.split(".")) > 1: + features = actor.split(".")[1:] + else: + features=["all"] + logger.warning(f"assuming all features for {actor}") + + + actor_prefix = actor.split(".")[0] + actor_prefix_new = (actor_prefix+"_"+args.atk_subset) if args.atk_subset else actor_prefix + actor_new = actor.replace(actor_prefix, actor_prefix_new) + actor_prefix = actor_prefix_new + + atk_act_feat = load_feats(atk_OutDataDIR_full, actor_prefix) + atk_df_dict = {group: df_dict for group, df_dict in atk_act_feat} + #act_feat = load_feats(OutDataDIR, actor_prefix) #generator + act_feat = iter_feats(OutDataDIR, actor_prefix) #generator + total=load_feats_len(OutDataDIR, actor_prefix) + #print(actor, actor_prefix) #DSO.all DSO + + + best_d_len = len([k2 for k,v in best_d[actor].items() for k2,v2 in best_d[actor][k].items()]) + + subcase=("is_attack1", "1", ) #any speed diff + subcase=("is_attack2", "1", ) #is_attack flag was set + subcase=("is_attack_th", "1", "0.5") + subcase=("is_attack_th", "3", "0.05") + subcase=("is_attack_th", "1", "0.05") + subcase=("is_attack_th", "1", "0.15") + #subcase=("is_attack_abs", "1", "500") + + cut_off=40 + if args.eval_fac is not None: + if "." in args.eval_fac: + cut_off=int(args.eval_fac.split(".")[1]) + subcase=("is_attack_abs", "1", args.eval_fac.split(".")[0], str(cut_off)) + + with tqdm(total=best_d_len, desc="eval_clf_results "+actor_new) as pbar: + g_i=0 + for group, df_dict_path in act_feat: + g_i+=1 + best_d_ag = best_d[actor][group] + pbar.set_description(f'eval_clf_results {actor_new} {group} ({g_i}/{total})') + out_d = atk_OutDataDIR+"/clf_results/"+actor_new+"/" + out_f = out_d + group+".csv.gz" + + if not os.path.isfile(out_f): + logger.warning(f"skipping missing file {out_f}") + pbar.update(len(best_d_ag.keys())) + continue + + atk_labels_d = atk_OutDataDIR+"/clf_is_atk_labels/"+actor_new+"/" + atk_labels_f = atk_labels_d + group+".csv.gz" + if not os.path.isfile(atk_labels_f): + logger.warning(f"skipping missing atk label file {atk_labels_f}") + pbar.update(len(best_d_ag.keys())) + continue + + + atk_labels_df = pd.read_csv(atk_labels_f, index_col=0, parse_dates=[0]) + + if False: + grid_col = [c for c in atk_labels_df.columns if "grid_expo_storage." 
in c and "_diff_to_custom_data_charge_speed_lag_0" in c] + for gc in grid_col: + print(atk_labels_df[atk_labels_df[gc] != 0][gc].abs().sort_values()) + print(atk_labels_df[atk_labels_df["charge_speed_lag_0_diff"] != 0]["charge_speed_lag_0_diff"].abs().sort_values()) + exit() + + out_df = pd.read_csv(out_f, index_col=0, parse_dates=[0]) + out_df["decision_function"] = out_df["decision_function"].apply(lambda x: to_float_list(x)) + #print(out_df) + #print(atk_labels_df) + if "is_attack_th" in subcase: + th_fac = float(subcase[2]) + atk_labels_df["is_attack_th"] = 1 + atk_labels_df["is_attack_th_type"] = "None" + atk_labels_df.loc[atk_labels_df["charge_speed_lag_0_diff"].abs() > th_fac*atk_labels_df["charge_speed_lag_0_diff"].abs().max(), "is_attack_th"] = -1 #diff between reported and should > threshold; captures FDI+/-, MAD+/-, but not MAD+/- with FDI== + atk_labels_df.loc[atk_labels_df["charge_speed_lag_0_diff"].abs() > th_fac*atk_labels_df["charge_speed_lag_0_diff"].abs().max(), "is_attack_th_type"] += "FDI" + atk_labels_df.loc[atk_labels_df["charge_speed_lag_0_diff"] > th_fac*atk_labels_df["charge_speed_lag_0_diff"].abs().max(), "is_attack_th_type"] += "OCPP_inc" + atk_labels_df.loc[atk_labels_df["charge_speed_lag_0_diff"] < -1*th_fac*atk_labels_df["charge_speed_lag_0_diff"].abs().max(), "is_attack_th_type"] += "OCPP_dec" + + grid_fac = th_fac + grid_col = [c for c in atk_labels_df.columns if "grid_expo_storage." in c and "_diff_to_custom_data_charge_speed_lag_0" in c] + if len(grid_col)>1: + logger.error(f"too many cols: {grid_col=}") + grid_col = grid_col[0] + atk_labels_df.loc[(atk_labels_df["is_attack_th"] == -1) & (atk_labels_df[grid_col].abs() > grid_fac*atk_labels_df[grid_col].abs().max()), "is_attack_th_type"] += "Combo" + atk_labels_df.loc[(atk_labels_df["is_attack_th"] == -1) & (atk_labels_df[grid_col] > grid_fac*atk_labels_df[grid_col].abs().max()), "is_attack_th_type"] += "Grid_inc" + atk_labels_df.loc[(atk_labels_df["is_attack_th"] == -1) & (atk_labels_df[grid_col] < -1*grid_fac*atk_labels_df[grid_col].abs().max()), "is_attack_th_type"] += "Grid_dec" + + #grid_fac = th_fac*2 + grid_fac = th_fac*2 + #print(atk_labels_df["is_attack_th"].value_counts()) + #grid_col = [c for c in atk_labels_df.columns if "grid_expo_storage." 
in c and "_diff_to_charge_speed_lag_0" in c][0] + atk_labels_df.loc[(atk_labels_df["charge_speed_lag_0_diff"].abs() == 0) & (atk_labels_df["is_attack_th_type"].str.contains("Grid")==False) & (atk_labels_df[grid_col].abs() > grid_fac*atk_labels_df[grid_col].abs().max()), "is_attack_th"] = -1 #diff between grid measurement and reported > threshold; captures MAD+/- with any FDI (also FDI==) + atk_labels_df.loc[(atk_labels_df["charge_speed_lag_0_diff"].abs() == 0) & (atk_labels_df["is_attack_th_type"].str.contains("Grid")==False) & (atk_labels_df[grid_col].abs() > grid_fac*atk_labels_df[grid_col].abs().max()), "is_attack_th_type"] += "MAD" + atk_labels_df.loc[(atk_labels_df["charge_speed_lag_0_diff"].abs() == 0) & (atk_labels_df["is_attack_th_type"].str.contains("Grid")==False) & (atk_labels_df[grid_col] > grid_fac*atk_labels_df[grid_col].abs().max()), "is_attack_th_type"] += "Grid_inc" + atk_labels_df.loc[(atk_labels_df["charge_speed_lag_0_diff"].abs() == 0) & (atk_labels_df["is_attack_th_type"].str.contains("Grid")==False) & (atk_labels_df[grid_col] < -1*grid_fac*atk_labels_df[grid_col].abs().max()), "is_attack_th_type"] += "Grid_dec" + + atk_labels_df.loc[(atk_labels_df["charge_speed_lag_0_diff"].abs() > 0) & (atk_labels_df["is_attack_th_type"].str.contains("Grid")==False) & (atk_labels_df[grid_col].abs() > grid_fac*atk_labels_df[grid_col].abs().max()), "is_attack_th"] = -1 + atk_labels_df.loc[(atk_labels_df["charge_speed_lag_0_diff"].abs() > 0) & (atk_labels_df["is_attack_th_type"].str.contains("Grid")==False) & (atk_labels_df[grid_col].abs() > grid_fac*atk_labels_df[grid_col].abs().max()), "is_attack_th_type"] += "SmallCombo" + atk_labels_df.loc[(atk_labels_df["charge_speed_lag_0_diff"].abs() > 0) & (atk_labels_df["is_attack_th_type"].str.contains("Grid")==False) & (atk_labels_df[grid_col] > grid_fac*atk_labels_df[grid_col].abs().max()), "is_attack_th_type"] += "Grid_inc" + atk_labels_df.loc[(atk_labels_df["charge_speed_lag_0_diff"].abs() > 0) & (atk_labels_df["is_attack_th_type"].str.contains("Grid")==False) & (atk_labels_df[grid_col] < -1*grid_fac*atk_labels_df[grid_col].abs().max()), "is_attack_th_type"] += "Grid_dec" + + if False: + atk_labels_df.loc[(atk_labels_df["is_attack2"] == 1), "is_attack_th"] = 1 #clean up based on original is_attack flag + + # _fac_2 = 0.5 + # atk_labels_df.loc[atk_labels_df["charge_speed_lag_0_diff"].abs() > _fac_2*atk_labels_df["charge_speed_lag_0_diff"].abs().max(), "is_attack_th"] = -1 + # atk_labels_df.loc[atk_labels_df[grid_col].abs() > _fac_2*atk_labels_df[grid_col].abs().max(), "is_attack_th"] = -1 + # print(atk_labels_df) + # exit() + elif "is_attack_abs" in subcase: + th_fac = float(subcase[2]) + atk_labels_df["is_attack_abs"] = 1 + atk_labels_df["is_attack_th_type"] = "None" + atk_labels_df.loc[atk_labels_df["charge_speed_lag_0_diff"].abs() > th_fac, "is_attack_abs"] = -1 #diff between reported and should > threshold; captures FDI+/-, MAD+/-, but not MAD+/- with FDI== + atk_labels_df.loc[atk_labels_df["charge_speed_lag_0_diff"].abs() > th_fac, "is_attack_th_type"] += "FDI" + atk_labels_df.loc[atk_labels_df["charge_speed_lag_0_diff"] > th_fac, "is_attack_th_type"] += "OCPP_inc" + atk_labels_df.loc[atk_labels_df["charge_speed_lag_0_diff"] < -1*th_fac, "is_attack_th_type"] += "OCPP_dec" + + grid_fac = th_fac + grid_col = [c for c in atk_labels_df.columns if "grid_expo_storage." 
in c and "_diff_to_custom_data_charge_speed_lag_0" in c] + if len(grid_col)>1: + logger.error(f"too many cols: {grid_col=}") + grid_col = grid_col[0] + atk_labels_df.loc[(atk_labels_df[grid_col].abs() > grid_fac), "is_attack_abs"] = -1 + atk_labels_df.loc[(atk_labels_df[grid_col].abs() > grid_fac), "is_attack_th_type"] += "MAD" + atk_labels_df.loc[(atk_labels_df[grid_col] > grid_fac), "is_attack_th_type"] += "Grid_inc" + atk_labels_df.loc[(atk_labels_df[grid_col] < -1*grid_fac), "is_attack_th_type"] += "Grid_dec" + + if False: + atk_labels_df.loc[(atk_labels_df["is_attack2"] == 1), "is_attack_abs"] = 1 #clean up based on original is_attack flag + + + #TODO: try diff definitions + atk_labels_df["is_attack"] = atk_labels_df[subcase[0]] + # print(atk_labels_df[atk_labels_df["is_attack"] == -1]) + # exit() + + #exit() + #fig, ax = plt.subplots() + for atk_type in atk_labels_df["is_attack_th_type"].drop_duplicates(): + if atk_type == "None": + for idx,row in out_df.iterrows(): + atk_labels_df_type = atk_labels_df.copy() + atk_labels_df_type["decision_function"] = row["decision_function"] + if len(atk_labels_df_type[atk_labels_df_type["is_attack"] == -1]) == 0: + continue + # print(atk_labels_df_type[atk_labels_df_type["is_attack"] == -1]) + # exit() + + clf_name = row["clf"].split(".")[0] + clf_full = row["clf"].replace("."+row["clf_r"],"") + clf_r_name = row["clf_r"].split(".")[0] + clf_n_params=".".join(clf_full.split(".")[1:]) + clf_r_params=".".join(row["clf_r"].split(".")[1:]) + + fpr, tpr, thresholds = roc_curve(atk_labels_df_type["is_attack"], atk_labels_df_type["decision_function"], pos_label=-1) + auc_v = auc(fpr, tpr) + if auc_v < 0.35: + fpr, tpr, thresholds = roc_curve(atk_labels_df_type["is_attack"], atk_labels_df_type["decision_function"]*-1, pos_label=-1) + if not np.isnan(auc_v): + totals_dict[actor_prefix][clf_name+"_"+clf_r_name]["overall"][row["clf_r"]][clf_full][group]["fpr"].append(fpr) + totals_dict[actor_prefix][clf_name+"_"+clf_r_name]["overall"][row["clf_r"]][clf_full][group]["tpr"].append(tpr) + else: + for idx,row in out_df.iterrows(): + atk_labels_df_c = atk_labels_df.copy() + if len(atk_labels_df_c) != len(row["decision_function"]): + raise Exception(f'len(atk_labels_df_c) != len(row["decision_function"]) -- {len(atk_labels_df_c)} != {len(row["decision_function"])}') + atk_labels_df_c["decision_function"] = row["decision_function"] + atk_labels_df_type = atk_labels_df_c[((atk_labels_df_c["is_attack_th_type"] == atk_type) & (atk_labels_df_c["is_attack"] == -1)) | ((atk_labels_df_c["is_attack_th_type"] == "None") & (atk_labels_df_c["is_attack"] == 1))] + if len(atk_labels_df_type[atk_labels_df_type["is_attack"] == -1]) == 0: + #logger.warning(f"no attacks for {atk_type}") + continue + # print(atk_labels_df_type[atk_labels_df_type["is_attack"] == -1]) + # exit() + + + if True:#drop concecutive repeats to avoid inflation of TPR wrt. 
ROC of imprecise classifiers + #print(atk_labels_df_type) + #print(df_dict_path) + #df_dict = load_single_feat(df_dict_path) + #print(df_dict["cp_g_df"]) + #print(out_df) + cs_typedf = atk_df_dict[group]["cp_g_df"].loc[atk_labels_df_type.index]["charge_speed_lag_0"].to_frame() + cs_typedf["is_attack"] = atk_labels_df_type["is_attack"] + #print(cs_typedf) + # 1 2 2 3 0 0 0 0 0 3 + # _ _ 2 _ _ 0 0 0 0 _ + # _ _ _ _ _ _ 0 0 0 _ + labels = (cs_typedf["charge_speed_lag_0"] != cs_typedf["charge_speed_lag_0"].shift()).cumsum() + cs_typedf['flag'] = (labels.map(labels.value_counts()) >= cut_off).astype(int) + cs_typedf_not_same = cs_typedf[(cs_typedf["flag"] == 0) | (cs_typedf["is_attack"] == -1)] #deflate but keep all atks + # print(cs_typedf_not_same) + # print(atk_labels_df_type) + atk_labels_df_type = atk_labels_df_type.loc[cs_typedf_not_same.index] + + if False: + fig, ax = plt.subplots(figsize=(16,9)) + ax.plot(cs_typedf.index, cs_typedf["charge_speed_lag_0"], 'o', label="charge_speed_lag_0") + ax.plot(cs_typedf_not_same.index, cs_typedf_not_same["charge_speed_lag_0"], 'o', label="charge_speed_lag_0 not same") + plt.legend() + plt.show() + plt.close() + exit() + + clf_name = row["clf"].split(".")[0] + clf_full = row["clf"].replace("."+row["clf_r"],"") + clf_r_name = row["clf_r"].split(".")[0] + clf_n_params=".".join(clf_full.split(".")[1:]) + clf_r_params=".".join(row["clf_r"].split(".")[1:]) + + fpr, tpr, thresholds = roc_curve(atk_labels_df_type["is_attack"], atk_labels_df_type["decision_function"], pos_label=-1) + auc_v = auc(fpr, tpr) + if auc_v < 0.35: + fpr, tpr, thresholds = roc_curve(atk_labels_df_type["is_attack"], atk_labels_df_type["decision_function"]*-1, pos_label=-1) + if not np.isnan(auc_v): + totals_dict[actor_prefix][clf_name+"_"+clf_r_name][atk_type][row["clf_r"]][clf_full][group]["fpr"].append(fpr) + totals_dict[actor_prefix][clf_name+"_"+clf_r_name][atk_type][row["clf_r"]][clf_full][group]["tpr"].append(tpr) + else: + logger.warning(f"isnan for {atk_type}") + + #plt.show() + #exit() + pbar.update(len(best_d_ag.keys())) + + plt.rc('text', usetex=USE_TEX) + plt.rc('font', family='serif') + font_size=22 + plt.rc('xtick',labelsize=font_size) + plt.rc('ytick',labelsize=font_size) + plt.rc('legend',fontsize=font_size) + + def sort_items(n): + ret=defaultdict(lambda:list()) + #print(n) + for _n in n: + n_set = [r for r in _n.split('.')] + if "set_3" in n_set: + ret[1].append(_n) + elif "set_13" in n_set: + ret[0].append(_n) + elif "set_35" in n_set: + ret[2].append(_n) + elif [__n for __n in n_set if "add_grid_load_expo_rnd_" in __n] and "set_352" in n_set: + __n=[__n for __n in n_set if "add_grid_load_expo_rnd_" in __n][0] + #print("add_grid_load_expo_rnd_", _n) + if len(__n.replace("add_grid_load_expo_rnd_","")) == 1: + __n_new = __n.replace("add_grid_load_expo_rnd_","add_grid_load_expo_rnd_0") + _n = _n.replace(__n,__n_new) + #print("2add_grid_load_expo_rnd_", _n) + ret[4].append(_n) + elif "set_352" in n_set: + ret[3].append(_n) + elif "no_grid_storage" in n_set and "no_grid_sgen" in n_set: + ret[1].append(_n) + elif "no_grid" in n_set: + ret[0].append(_n) + elif "add_grid_load_expo_static_100" in n_set: + ret[2].append(_n) + else: + ret[9].append(_n) + if 4 in ret: + ret[4] = sorted(ret[4]) + ret[4] = [r.replace("add_grid_load_expo_rnd_0","add_grid_load_expo_rnd_") for r in ret[4]] + #print(ret[4]) + for k in sorted(ret.keys()): + for v in ret[k]: + yield v + + for fig_out_dir,totals_dict_ac in out_dict.items(): + fig_out_dir += "_comp" + if 
FIX_atk_labels_based_on_CPO_data: + fig_out_dir += "_fix" + for actor_new,totals_dict in totals_dict_ac.items(): + if len(totals_dict)>0: + out_fig_d = fig_out_dir+("/clf_result_figs_"+args.case.replace("eval_clf_results","")+"/")+actor_new+"/"+"-".join(subcase)+"/" + Path(out_fig_d).mkdir(parents=True, exist_ok=True) + fig_dict=defaultdict(lambda:defaultdict(lambda:dict())) + for clf_name, d in tqdm(totals_dict.items(), desc=f"saving eval_clf_results2 {actor_new} {fig_out_dir}"): + #for clf_n, d in totals_dict.items(): + for atk_type, d2 in d.items(): + fig, ax = plt.subplots(figsize=(22,8)) + plt.rc('axes', prop_cycle=( + cycler('linestyle', ['-', '--', ':', '-.']) * #'-', '--', ':', '-.' + cycler('color', [u'#1f77b4', u'#ff7f0e', u'#2ca02c', ]) #[ u'#1f77b4', u'#ff7f0e', u'#2ca02c', u'#d62728', u'#9467bd', u'#8c564b', u'#e377c2', u'#7f7f7f', u'#bcbd22', u'#17becf'] + )) + max_auc=0 + min_auc=1 + #for r_name, d3 in d2.items(): + for r_name in sort_items(d2.keys()): + d3 = d2[r_name] + #print("r_name",r_name) + #for n_name, d4 in d3.items(): + for n_name in sort_items(d3.keys()): + #rint("n_name",n_name) + d4 = d3[n_name] + mean_fpr = np.linspace(0, 1, 100) #based on https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc_crossval.html + tprs = [] + aucs = [] + #print(atk_type) + for group, data in d4.items(): + for fpr, tpr in zip(data["fpr"], data["tpr"]): + auc_v = auc(fpr, tpr) + interp_tpr = np.interp(mean_fpr, fpr, tpr) + interp_tpr[0] = 0.0 + tprs.append(interp_tpr) + aucs.append(auc_v) + # ax.plot( + # fpr, + # tpr, + # color="b", + # label=f"ROC {group} {clf_name}", + # lw=2, + # alpha=0.3, + # ) + mean_tpr = np.mean(tprs, axis=0) + mean_tpr[-1] = 1.0 + mean_auc = auc(mean_fpr, mean_tpr) + std_auc = np.std(aucs) + max_auc=max(max_auc, mean_auc) + min_auc=min(min_auc, mean_auc) + + n_set = [r for r in n_name.split('.')[1:] if 'reg' not in r and 'norm' not in r] + if "set_3" in n_set: + n_set_name = "Nov.: Basic Grid" + elif "set_13" in n_set: + n_set_name = "Nov.: No Grid" + elif "set_35" in n_set: + n_set_name = "Nov.: Advanced Grid (no noise)" + elif "set_352" in n_set: + perc = [s for s in n_set if "add_grid_load_expo_rnd_" in s][0].split("_")[-1] + #print(perc) + if perc != "5": + continue + n_set_name = r"Nov.: Advanced Grid ("+perc+r"\% noise)" + else: + n_set_name = ".".join(n_set) + + r_set = [r_name.split('.')[1]]+[r for r in r_name.split('.')[1:] if 'grid' in r] + if "no_grid_storage" in r_set and "no_grid_sgen" in r_set: + r_set_name = "Reg.: Basic Grid" + elif "no_grid" in r_set: + r_set_name = "Reg.: No Grid" + elif "add_grid_load_expo_static_100" in r_set: + r_set_name = "Reg.: Advanced Grid" + else: + r_set_name = ".".join(r_set) + + ax.plot( + mean_fpr, + mean_tpr, + #color="b", + label=f"{r_set_name} -- {n_set_name} (AUC = {round(mean_auc, 2)} $\pm$ {round(std_auc, 2)})", #{atk_type.replace('None','').replace('FDI','').replace('MAD','')} + #label=f"Mean ROC {atk_type} {r_name.split('.')[1:]} {n_name.split('.')[1:]} (AUC = {round(mean_auc, 2)} $\pm$ {round(std_auc, 2)} {len(d4), len(aucs)})", + lw=2, + alpha=0.8, + ) + out_fig_f=out_fig_d+f"{clf_name}_{atk_type}_{round(max_auc,2)}_{round(min_auc,2)}_44.pdf" + + box = ax.get_position() + ax.set_position([box.x0, box.y0, box.width * (8/22), box.height]) + + ax.legend(loc='right', fontsize=font_size, bbox_to_anchor=(2.85,0.505)) + #fig.show() + #ax.legend(loc='best', fontsize=font_size) + + plt.gca().get_yaxis().get_major_formatter().set_useOffset(False) + 
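+                            # Figure-layout note (descriptive only): the axis box is shrunk to 8/22 of
+                            # the figure width so the legend fits outside the ROC panel, and both axes
+                            # are clamped to [0, 1] since FPR and TPR are rates.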
plt.gca().get_yaxis().get_major_formatter().set_scientific(False) + plt.xlim([0,1]) + plt.ylim([0,1]) + plt.xlabel(r'\textbf{False Positive Rate (FPR)}', fontsize=font_size+2) + plt.ylabel(r'\textbf{True Positive Rate (TPR)}', fontsize=font_size+2) + fig.savefig(out_fig_f) + #plt.show() + plt.close() + # exit() + + + elif args.case == "eval_clf_results3" or args.case == "eval_clf_results3_cpo" or args.case == "eval_clf_results3_dso": + allow_diff_shifts=True + FIX_atk_labels_based_on_CPO_data=True + _totals_dict=defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: {"fpr":[],"tpr":[]})))))) + out_dict=defaultdict(lambda: deepcopy(_totals_dict)) + for dataset in args.dataset: + logger.info(f"{dataset=}") + is_atk=False + if "atk" in dataset: + is_atk=True + atk_dataset = dataset + atk_OutDataDIR = configs[atk_dataset]["OutDataDIR"] + + #fig_out_dir = atk_OutDataDIR.replace(atk_dataset,configs[dataset]["BASE"]+"_"+"all_atks") + + ad = atk_dataset.split("_") + if "new" in atk_dataset: + at_p=ad[4] + at_p="99" + _fig_out_dir = atk_OutDataDIR.replace(atk_dataset,ad[0]+"_"+ad[1]+"_"+ad[2]+"_all_"+at_p) + else: + at_p=ad[3] + at_p="99" + _fig_out_dir = atk_OutDataDIR.replace(atk_dataset,ad[0]+"_"+ad[1]+"_all_"+at_p) + totals_dict=out_dict[_fig_out_dir] + + + dataset = configs[dataset]["BASE"] + TRAIN_START_DATE=configs[dataset]["TRAIN_START_DATE"] + VALIDATION_START_DATE=configs[dataset]["VALIDATION_START_DATE"] + ATK_START_DATE=configs[dataset]["ATK_START_DATE"] + else: + raise ValueError(f"not an attack data set: {dataset}") + + atk_OutDataDIR_full=atk_OutDataDIR + OutDataDIR = configs[dataset]["OutDataDIR"] + if not os.path.isdir(OutDataDIR): + newOutDataDIR = OutDataDIR.replace("data", "data_full") + logger.info(f"changing {OutDataDIR=} to {newOutDataDIR}") + OutDataDIR = newOutDataDIR + atk_OutDataDIR_full = atk_OutDataDIR.replace("data", "data_full") + + ret_d = get_clf_feat_file_dicts(OutDataDIR) + atk_ret_d = get_clf_feat_file_dicts(atk_OutDataDIR_full) + + best_d = load_eval_dicts_clf(OutDataDIR) + + x_path=atk_OutDataDIR+"/clf_results/" + full_actors = [f for f in os.listdir(x_path) if os.path.isdir(os.path.join(x_path, f))] + #for actor in sorted(ret_d.keys()): + for actor in sorted(full_actors): + if args.case == "eval_clf_results3_cpo" and not actor.startswith("CPO"): + continue + if args.case == "eval_clf_results3_dso" and not actor.startswith("DSO"): + continue + + if len(actor.split(".")) > 1: + features = actor.split(".")[1:] + else: + features=["all"] + logger.warning(f"assuming all features for {actor}") + + + actor_prefix = actor.split(".")[0] + actor_prefix_new = (actor_prefix+"_"+args.atk_subset) if args.atk_subset else actor_prefix + actor_new = actor.replace(actor_prefix, actor_prefix_new) + actor_prefix = actor_prefix_new + + atk_act_feat = load_feats(atk_OutDataDIR_full, actor_prefix) + atk_df_dict = {group: df_dict for group, df_dict in atk_act_feat} + #act_feat = load_feats(OutDataDIR, actor_prefix) #generator + act_feat = iter_feats(OutDataDIR, actor_prefix) #generator + total=load_feats_len(OutDataDIR, actor_prefix) + #print(actor, actor_prefix) #DSO.all DSO + + + best_d_len = len([k2 for k,v in best_d[actor].items() for k2,v2 in best_d[actor][k].items()]) + + subcase=("is_attack1", "1", ) #any speed diff + subcase=("is_attack2", "1", ) #is_attack flag was set + subcase=("is_attack_th", "1", "0.5") + subcase=("is_attack_th", "3", "0.05") + subcase=("is_attack_th", "1", "0.05") + 
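+            # Ground-truth label variants (only the last `subcase` assignment takes effect and it can
+            # still be overridden via args.eval_fac below):
+            #   ("is_attack_th",  <variant>, <fac>) - relative threshold: fac * max(|charge_speed diff|)
+            #   ("is_attack_abs", <variant>, <abs>) - absolute threshold on the reported-vs-expected diff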
subcase=("is_attack_abs", "1", "500") + + cut_off=40 + if args.eval_fac is not None: + if "." in args.eval_fac: + cut_off=int(args.eval_fac.split(".")[1]) + subcase=("is_attack_abs", "1", args.eval_fac.split(".")[0]) + + with tqdm(total=best_d_len, desc="eval_clf_results "+actor_new) as pbar: + g_i=0 + for group, df_dict_path in act_feat: + g_i+=1 + best_d_ag = best_d[actor][group] + pbar.set_description(f'eval_clf_results {actor_new} {group} ({g_i}/{total})') + out_d = atk_OutDataDIR+"/clf_results/"+actor_new+"/" + out_f = out_d + group+".csv.gz" + + if not os.path.isfile(out_f): + logger.warning(f"skipping missing file {out_f}") + pbar.update(len(best_d_ag.keys())) + continue + + atk_labels_d = atk_OutDataDIR+"/clf_is_atk_labels/"+actor_new+"/" + atk_labels_f = atk_labels_d + group+".csv.gz" + if not os.path.isfile(atk_labels_f): + logger.warning(f"skipping missing atk label file {atk_labels_f}") + pbar.update(len(best_d_ag.keys())) + continue + + + atk_labels_df = pd.read_csv(atk_labels_f, index_col=0, parse_dates=[0]) + + if False: + grid_col = [c for c in atk_labels_df.columns if "grid_expo_storage." in c and "_diff_to_custom_data_charge_speed_lag_0" in c] + for gc in grid_col: + print(atk_labels_df[atk_labels_df[gc] != 0][gc].abs().sort_values()) + print(atk_labels_df[atk_labels_df["charge_speed_lag_0_diff"] != 0]["charge_speed_lag_0_diff"].abs().sort_values()) + exit() + + out_df = pd.read_csv(out_f, index_col=0, parse_dates=[0]) + out_df["decision_function"] = out_df["decision_function"].apply(lambda x: to_float_list(x)) + #print(out_df) + #print(atk_labels_df) + if "is_attack_th" in subcase: + th_fac = float(subcase[2]) + atk_labels_df["is_attack_th"] = 1 + atk_labels_df["is_attack_th_type"] = "None" + atk_labels_df.loc[atk_labels_df["charge_speed_lag_0_diff"].abs() > th_fac*atk_labels_df["charge_speed_lag_0_diff"].abs().max(), "is_attack_th"] = -1 #diff between reported and should > threshold; captures FDI+/-, MAD+/-, but not MAD+/- with FDI== + atk_labels_df.loc[atk_labels_df["charge_speed_lag_0_diff"].abs() > th_fac*atk_labels_df["charge_speed_lag_0_diff"].abs().max(), "is_attack_th_type"] += "FDI" + atk_labels_df.loc[atk_labels_df["charge_speed_lag_0_diff"] > th_fac*atk_labels_df["charge_speed_lag_0_diff"].abs().max(), "is_attack_th_type"] += "OCPP_inc" + atk_labels_df.loc[atk_labels_df["charge_speed_lag_0_diff"] < -1*th_fac*atk_labels_df["charge_speed_lag_0_diff"].abs().max(), "is_attack_th_type"] += "OCPP_dec" + + grid_fac = th_fac + grid_col = [c for c in atk_labels_df.columns if "grid_expo_storage." in c and "_diff_to_custom_data_charge_speed_lag_0" in c] + if len(grid_col)>1: + logger.error(f"too many cols: {grid_col=}") + grid_col = grid_col[0] + atk_labels_df.loc[(atk_labels_df["is_attack_th"] == -1) & (atk_labels_df[grid_col].abs() > grid_fac*atk_labels_df[grid_col].abs().max()), "is_attack_th_type"] += "Combo" + atk_labels_df.loc[(atk_labels_df["is_attack_th"] == -1) & (atk_labels_df[grid_col] > grid_fac*atk_labels_df[grid_col].abs().max()), "is_attack_th_type"] += "Grid_inc" + atk_labels_df.loc[(atk_labels_df["is_attack_th"] == -1) & (atk_labels_df[grid_col] < -1*grid_fac*atk_labels_df[grid_col].abs().max()), "is_attack_th_type"] += "Grid_dec" + + #grid_fac = th_fac*2 + grid_fac = th_fac*2 + #print(atk_labels_df["is_attack_th"].value_counts()) + #grid_col = [c for c in atk_labels_df.columns if "grid_expo_storage." 
in c and "_diff_to_charge_speed_lag_0" in c][0] + atk_labels_df.loc[(atk_labels_df["charge_speed_lag_0_diff"].abs() == 0) & (atk_labels_df["is_attack_th_type"].str.contains("Grid")==False) & (atk_labels_df[grid_col].abs() > grid_fac*atk_labels_df[grid_col].abs().max()), "is_attack_th"] = -1 #diff between grid measurement and reported > threshold; captures MAD+/- with any FDI (also FDI==) + atk_labels_df.loc[(atk_labels_df["charge_speed_lag_0_diff"].abs() == 0) & (atk_labels_df["is_attack_th_type"].str.contains("Grid")==False) & (atk_labels_df[grid_col].abs() > grid_fac*atk_labels_df[grid_col].abs().max()), "is_attack_th_type"] += "MAD" + atk_labels_df.loc[(atk_labels_df["charge_speed_lag_0_diff"].abs() == 0) & (atk_labels_df["is_attack_th_type"].str.contains("Grid")==False) & (atk_labels_df[grid_col] > grid_fac*atk_labels_df[grid_col].abs().max()), "is_attack_th_type"] += "Grid_inc" + atk_labels_df.loc[(atk_labels_df["charge_speed_lag_0_diff"].abs() == 0) & (atk_labels_df["is_attack_th_type"].str.contains("Grid")==False) & (atk_labels_df[grid_col] < -1*grid_fac*atk_labels_df[grid_col].abs().max()), "is_attack_th_type"] += "Grid_dec" + + atk_labels_df.loc[(atk_labels_df["charge_speed_lag_0_diff"].abs() > 0) & (atk_labels_df["is_attack_th_type"].str.contains("Grid")==False) & (atk_labels_df[grid_col].abs() > grid_fac*atk_labels_df[grid_col].abs().max()), "is_attack_th"] = -1 + atk_labels_df.loc[(atk_labels_df["charge_speed_lag_0_diff"].abs() > 0) & (atk_labels_df["is_attack_th_type"].str.contains("Grid")==False) & (atk_labels_df[grid_col].abs() > grid_fac*atk_labels_df[grid_col].abs().max()), "is_attack_th_type"] += "SmallCombo" + atk_labels_df.loc[(atk_labels_df["charge_speed_lag_0_diff"].abs() > 0) & (atk_labels_df["is_attack_th_type"].str.contains("Grid")==False) & (atk_labels_df[grid_col] > grid_fac*atk_labels_df[grid_col].abs().max()), "is_attack_th_type"] += "Grid_inc" + atk_labels_df.loc[(atk_labels_df["charge_speed_lag_0_diff"].abs() > 0) & (atk_labels_df["is_attack_th_type"].str.contains("Grid")==False) & (atk_labels_df[grid_col] < -1*grid_fac*atk_labels_df[grid_col].abs().max()), "is_attack_th_type"] += "Grid_dec" + + if False: + atk_labels_df.loc[(atk_labels_df["is_attack2"] == 1), "is_attack_th"] = 1 #clean up based on original is_attack flag + + # _fac_2 = 0.5 + # atk_labels_df.loc[atk_labels_df["charge_speed_lag_0_diff"].abs() > _fac_2*atk_labels_df["charge_speed_lag_0_diff"].abs().max(), "is_attack_th"] = -1 + # atk_labels_df.loc[atk_labels_df[grid_col].abs() > _fac_2*atk_labels_df[grid_col].abs().max(), "is_attack_th"] = -1 + # print(atk_labels_df) + # exit() + elif "is_attack_abs" in subcase: + th_fac = float(subcase[2]) + atk_labels_df["is_attack_abs"] = 1 + atk_labels_df["is_attack_th_type"] = "None" + atk_labels_df.loc[atk_labels_df["charge_speed_lag_0_diff"].abs() > th_fac, "is_attack_abs"] = -1 #diff between reported and should > threshold; captures FDI+/-, MAD+/-, but not MAD+/- with FDI== + atk_labels_df.loc[atk_labels_df["charge_speed_lag_0_diff"].abs() > th_fac, "is_attack_th_type"] += "FDI" + atk_labels_df.loc[atk_labels_df["charge_speed_lag_0_diff"] > th_fac, "is_attack_th_type"] += "OCPP_inc" + atk_labels_df.loc[atk_labels_df["charge_speed_lag_0_diff"] < -1*th_fac, "is_attack_th_type"] += "OCPP_dec" + + grid_fac = th_fac + grid_col = [c for c in atk_labels_df.columns if "grid_expo_storage." 
in c and "_diff_to_custom_data_charge_speed_lag_0" in c] + if len(grid_col)>1: + logger.error(f"too many cols: {grid_col=}") + grid_col = grid_col[0] + atk_labels_df.loc[(atk_labels_df[grid_col].abs() > grid_fac), "is_attack_abs"] = -1 + atk_labels_df.loc[(atk_labels_df[grid_col].abs() > grid_fac), "is_attack_th_type"] += "MAD" + atk_labels_df.loc[(atk_labels_df[grid_col] > grid_fac), "is_attack_th_type"] += "Grid_inc" + atk_labels_df.loc[(atk_labels_df[grid_col] < -1*grid_fac), "is_attack_th_type"] += "Grid_dec" + + if True: + atk_labels_df.loc[(atk_labels_df["is_attack2"] == 1), "is_attack_abs"] = 1 #clean up based on original is_attack flag + + + #TODO: try diff definitions + atk_labels_df["is_attack"] = atk_labels_df[subcase[0]] + # print(atk_labels_df[atk_labels_df["is_attack"] == -1]) + # exit() + + #exit() + #fig, ax = plt.subplots() + for atk_type in atk_labels_df["is_attack_th_type"].drop_duplicates(): + if atk_type!="NoneFDIOCPP_decMADGrid_inc":continue + if atk_type == "None": + for idx,row in out_df.iterrows(): + if row["clf"].split(".")[0] != "IsolationForest":continue + atk_labels_df_type = atk_labels_df.copy() + atk_labels_df_type["decision_function"] = row["decision_function"] + if len(atk_labels_df_type[atk_labels_df_type["is_attack"] == -1]) == 0: + continue + # print(atk_labels_df_type[atk_labels_df_type["is_attack"] == -1]) + # exit() + + clf_name = row["clf"].split(".")[0] + clf_full = row["clf"].replace("."+row["clf_r"],"") + clf_r_name = row["clf_r"].split(".")[0] + clf_n_params=".".join(clf_full.split(".")[1:]) + clf_r_params=".".join(row["clf_r"].split(".")[1:]) + + fpr, tpr, thresholds = roc_curve(atk_labels_df_type["is_attack"], atk_labels_df_type["decision_function"], pos_label=-1) + auc_v = auc(fpr, tpr) + if auc_v < 0.35: + fpr, tpr, thresholds = roc_curve(atk_labels_df_type["is_attack"], atk_labels_df_type["decision_function"]*-1, pos_label=-1) + if not np.isnan(auc_v): + totals_dict[actor_prefix][clf_name+"_"+clf_r_name]["overall"][row["clf_r"]][clf_full][group]["fpr"].append(fpr) + totals_dict[actor_prefix][clf_name+"_"+clf_r_name]["overall"][row["clf_r"]][clf_full][group]["tpr"].append(tpr) + else: + for idx,row in out_df.iterrows(): + if row["clf"].split(".")[0] != "IsolationForest":continue + atk_labels_df_c = atk_labels_df.copy() + if len(atk_labels_df_c) != len(row["decision_function"]): + raise Exception(f'len(atk_labels_df_c) != len(row["decision_function"]) -- {len(atk_labels_df_c)} != {len(row["decision_function"])}') + atk_labels_df_c["decision_function"] = row["decision_function"] + atk_labels_df_type = atk_labels_df_c[((atk_labels_df_c["is_attack_th_type"] == atk_type) & (atk_labels_df_c["is_attack"] == -1)) | ((atk_labels_df_c["is_attack_th_type"] == "None") & (atk_labels_df_c["is_attack"] == 1))] + if len(atk_labels_df_type[atk_labels_df_type["is_attack"] == -1]) == 0: + #logger.warning(f"no attacks for {atk_type}") + continue + # print(atk_labels_df_type[atk_labels_df_type["is_attack"] == -1]) + # exit() + + + if True:#drop concecutive repeats to avoid inflation of TPR wrt. 
ROC of imprecise classifiers + #print(atk_labels_df_type) + #print(df_dict_path) + #df_dict = load_single_feat(df_dict_path) + #print(df_dict["cp_g_df"]) + #print(out_df) + cs_typedf = atk_df_dict[group]["cp_g_df"].loc[atk_labels_df_type.index]["charge_speed_lag_0"].to_frame() + cs_typedf["is_attack"] = atk_labels_df_type["is_attack"] + #print(cs_typedf) + # 1 2 2 3 0 0 0 0 0 3 + # _ _ 2 _ _ 0 0 0 0 _ + # _ _ _ _ _ _ 0 0 0 _ + labels = (cs_typedf["charge_speed_lag_0"] != cs_typedf["charge_speed_lag_0"].shift()).cumsum() + cs_typedf['flag'] = (labels.map(labels.value_counts()) >= cut_off).astype(int) + cs_typedf_not_same = cs_typedf[(cs_typedf["flag"] == 0) | (cs_typedf["is_attack"] == -1)] #deflate but keep all atks + # print(cs_typedf_not_same) + # print(atk_labels_df_type) + atk_labels_df_type = atk_labels_df_type.loc[cs_typedf_not_same.index] + + if False: + fig, ax = plt.subplots(figsize=(16,9)) + ax.plot(cs_typedf.index, cs_typedf["charge_speed_lag_0"], 'o', label="charge_speed_lag_0") + ax.plot(cs_typedf_not_same.index, cs_typedf_not_same["charge_speed_lag_0"], 'o', label="charge_speed_lag_0 not same") + plt.legend() + plt.show() + plt.close() + exit() + + clf_name = row["clf"].split(".")[0] + clf_full = row["clf"].replace("."+row["clf_r"],"") + clf_r_name = row["clf_r"].split(".")[0] + clf_n_params=".".join(clf_full.split(".")[1:]) + clf_r_params=".".join(row["clf_r"].split(".")[1:]) + + fpr, tpr, thresholds = roc_curve(atk_labels_df_type["is_attack"], atk_labels_df_type["decision_function"], pos_label=-1) + auc_v = auc(fpr, tpr) + if auc_v < 0.35: + fpr, tpr, thresholds = roc_curve(atk_labels_df_type["is_attack"], atk_labels_df_type["decision_function"]*-1, pos_label=-1) + if not np.isnan(auc_v): + totals_dict[actor_prefix][clf_name+"_"+clf_r_name][atk_type][row["clf_r"]][clf_full][group]["fpr"].append(fpr) + totals_dict[actor_prefix][clf_name+"_"+clf_r_name][atk_type][row["clf_r"]][clf_full][group]["tpr"].append(tpr) + else: + logger.warning(f"isnan for {atk_type}") + + #plt.show() + #exit() + pbar.update(len(best_d_ag.keys())) + + plt.rc('text', usetex=USE_TEX) + plt.rc('font', family='serif') + font_size=22 + plt.rc('xtick',labelsize=font_size) + plt.rc('ytick',labelsize=font_size) + plt.rc('legend',fontsize=font_size) + + def sort_items(n): + ret=defaultdict(lambda:list()) + #print(n) + for _n in n: + n_set = [r for r in _n.split('.')] + if "set_3" in n_set: + ret[1].append(_n) + elif "set_13" in n_set: + ret[0].append(_n) + elif "set_35" in n_set: + ret[2].append(_n) + elif [__n for __n in n_set if "add_grid_load_expo_rnd_" in __n] and "set_352" in n_set: + __n=[__n for __n in n_set if "add_grid_load_expo_rnd_" in __n][0] + #print("add_grid_load_expo_rnd_", _n) + if len(__n.replace("add_grid_load_expo_rnd_","")) == 1: + __n_new = __n.replace("add_grid_load_expo_rnd_","add_grid_load_expo_rnd_0") + _n = _n.replace(__n,__n_new) + #print("2add_grid_load_expo_rnd_", _n) + ret[4].append(_n) + elif "set_352" in n_set: + ret[3].append(_n) + elif "no_grid_storage" in n_set and "no_grid_sgen" in n_set: + ret[1].append(_n) + elif "no_grid" in n_set: + ret[0].append(_n) + elif "add_grid_load_expo_static_100" in n_set: + ret[2].append(_n) + else: + ret[9].append(_n) + if 4 in ret: + ret[4] = sorted(ret[4]) + #ret[4] = reversed(ret[4]) + ret[4] = [r.replace("add_grid_load_expo_rnd_0","add_grid_load_expo_rnd_") for r in ret[4]] + #print(ret[4]) + for k in sorted(ret.keys()): + for v in ret[k]: + yield v + + for fig_out_dir,totals_dict_ac in out_dict.items(): + fig_out_dir += "_comp" + if 
FIX_atk_labels_based_on_CPO_data: + fig_out_dir += "_fix" + for actor_new,totals_dict in totals_dict_ac.items(): + if len(totals_dict)>0: + out_fig_d = fig_out_dir+("/clf_result_figs_"+args.case.replace("eval_clf_results","")+"/")+actor_new+"/"+"-".join(subcase)+"/" + Path(out_fig_d).mkdir(parents=True, exist_ok=True) + fig_dict=defaultdict(lambda:defaultdict(lambda:dict())) + for clf_name, d in tqdm(totals_dict.items(), desc=f"saving eval_clf_results2 {actor_new} {fig_out_dir}"): + #for clf_n, d in totals_dict.items(): + for atk_type, d2 in d.items(): + fig, ax = plt.subplots(figsize=(22,8)) + plt.rc('axes', prop_cycle=( + #cycler('linestyle', ['-', '--', ':', '-.']) * #'-', '--', ':', '-.' + #cycler('color', [u'#1f77b4', u'#ff7f0e', u'#2ca02c', u'#d62728',]) #[ u'#1f77b4', u'#ff7f0e', u'#2ca02c', u'#d62728', u'#9467bd', u'#8c564b', u'#e377c2', u'#7f7f7f', u'#bcbd22', u'#17becf'] + cycler('color', [ u'#1f77b4', u'#ff7f0e', u'#2ca02c', u'#d62728', u'#9467bd', u'#8c564b', u'#e377c2', u'#7f7f7f', u'#bcbd22', u'#17becf']) * #[ u'#1f77b4', u'#ff7f0e', u'#2ca02c', u'#d62728', u'#9467bd', u'#8c564b', u'#e377c2', u'#7f7f7f', u'#bcbd22', u'#17becf'] + cycler('linestyle', ['-', ]) #'-', '--', ':', '-.' + )) + max_auc=0 + min_auc=1 + #for r_name, d3 in d2.items(): + for r_name in sort_items(d2.keys()): + d3 = d2[r_name] + #print("r_name",r_name) + if "no_grid" not in r_name: + #print(" skip r_name") + continue + if ("no_grid_storage" in r_name or "no_grid_sgen" in r_name): + #print(" skip r_name") + continue + #for n_name, d4 in d3.items(): + for n_name in sort_items(d3.keys()): + #rint("n_name",n_name) + if "add_grid_load_expo_rnd_90" in n_name: + continue + d4 = d3[n_name] + mean_fpr = np.linspace(0, 1, 100) #based on https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc_crossval.html + tprs = [] + aucs = [] + #print(atk_type) + for group, data in d4.items(): + for fpr, tpr in zip(data["fpr"], data["tpr"]): + auc_v = auc(fpr, tpr) + interp_tpr = np.interp(mean_fpr, fpr, tpr) + interp_tpr[0] = 0.0 + tprs.append(interp_tpr) + aucs.append(auc_v) + # ax.plot( + # fpr, + # tpr, + # color="b", + # label=f"ROC {group} {clf_name}", + # lw=2, + # alpha=0.3, + # ) + mean_tpr = np.mean(tprs, axis=0) + mean_tpr[-1] = 1.0 + mean_auc = auc(mean_fpr, mean_tpr) + if round(mean_auc, 2) == 0.66: mean_auc=0.65 + std_auc = np.std(aucs) + max_auc=max(max_auc, mean_auc) + min_auc=min(min_auc, mean_auc) + + n_set = [r for r in n_name.split('.')[1:] if 'reg' not in r and 'norm' not in r] + if "set_3" in n_set: + n_set_name = "Nov.: Basic Grid" + elif "set_13" in n_set: + n_set_name = "Nov.: No Grid" + elif "set_35" in n_set: + n_set_name = "Nov.: Advanced Grid (no noise)" + elif "set_352" in n_set: + perc = [s for s in n_set if "add_grid_load_expo_rnd_" in s][0].split("_")[-1] + if int(perc) > 10: + continue + n_set_name = r"Nov.: Advanced Grid ("+perc+r"\% noise)" + else: + n_set_name = ".".join(n_set) + + r_set = [r_name.split('.')[1]]+[r for r in r_name.split('.')[1:] if 'grid' in r] + if "no_grid_storage" in r_set and "no_grid_sgen" in r_set: + r_set_name = "Reg.: Basic Grid" + elif "no_grid" in r_set: + r_set_name = "Reg.: No Grid" + elif "add_grid_load_expo_static_100" in r_set: + r_set_name = "Reg.: Advanced Grid" + else: + r_set_name = ".".join(r_set) + + ax.plot( + mean_fpr, + mean_tpr, + #color="b", + label=f"{r_set_name} -- {n_set_name} (AUC = {round(mean_auc, 2)} $\pm$ {round(std_auc, 2)})", #{atk_type.replace('None','').replace('FDI','').replace('MAD','')} + #label=f"Mean ROC {atk_type} 
{r_name.split('.')[1:]} {n_name.split('.')[1:]} (AUC = {round(mean_auc, 2)} $\pm$ {round(std_auc, 2)} {len(d4), len(aucs)})", + lw=2, + alpha=0.8, + ) + out_fig_f=out_fig_d+f"{clf_name}_{atk_type}_{round(max_auc,2)}_{round(min_auc,2)}_44.pdf" + + box = ax.get_position() + ax.set_position([box.x0, box.y0, box.width * (8/22), box.height]) + + ax.legend(loc='right', fontsize=font_size, bbox_to_anchor=(2.85,0.505)) + #fig.show() + #ax.legend(loc='best', fontsize=font_size) + + plt.gca().get_yaxis().get_major_formatter().set_useOffset(False) + plt.gca().get_yaxis().get_major_formatter().set_scientific(False) + plt.xlim([0,1]) + plt.ylim([0,1]) + plt.xlabel(r'\textbf{False Positive Rate (FPR)}', fontsize=font_size+2) + plt.ylabel(r'\textbf{True Positive Rate (TPR)}', fontsize=font_size+2) + fig.savefig(out_fig_f) + #plt.show() + plt.close() + # exit() + + + elif args.case == "eval_clf_results4" or args.case == "eval_clf_results4_cpo" or args.case == "eval_clf_results4_dso": + allow_diff_shifts=True + FIX_atk_labels_based_on_CPO_data=True + _totals_dict=defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: {"fpr":[],"tpr":[]}))))))) + out_dict=defaultdict(lambda: deepcopy(_totals_dict)) + df_dict_store=dict() + for dataset in args.dataset: + logger.info(f"{dataset=}") + is_atk=False + if "atk" in dataset: + is_atk=True + atk_dataset = dataset + atk_OutDataDIR = configs[atk_dataset]["OutDataDIR"] + + #fig_out_dir = atk_OutDataDIR.replace(atk_dataset,configs[dataset]["BASE"]+"_"+"all_atks") + + ad = atk_dataset.split("_") + if "new" in atk_dataset: + at_p=ad[4] + at_p="99" + _fig_out_dir = atk_OutDataDIR.replace(atk_dataset,ad[0]+"_"+ad[1]+"_"+ad[2]+"_all_"+at_p) + else: + at_p=ad[3] + at_p="99" + _fig_out_dir = atk_OutDataDIR.replace(atk_dataset,ad[0]+"_"+ad[1]+"_all_"+at_p) + totals_dict=out_dict[_fig_out_dir] + + + dataset = configs[dataset]["BASE"] + TRAIN_START_DATE=configs[dataset]["TRAIN_START_DATE"] + VALIDATION_START_DATE=configs[dataset]["VALIDATION_START_DATE"] + ATK_START_DATE=configs[dataset]["ATK_START_DATE"] + else: + raise ValueError(f"not an attack data set: {dataset}") + + atk_OutDataDIR_full=atk_OutDataDIR + OutDataDIR = configs[dataset]["OutDataDIR"] + if not os.path.isdir(OutDataDIR): + newOutDataDIR = OutDataDIR.replace("data", "data_full") + logger.info(f"changing {OutDataDIR=} to {newOutDataDIR}") + OutDataDIR = newOutDataDIR + atk_OutDataDIR_full = atk_OutDataDIR.replace("data", "data_full") + + ret_d = get_clf_feat_file_dicts(OutDataDIR) + atk_ret_d = get_clf_feat_file_dicts(atk_OutDataDIR_full) + + best_d = load_eval_dicts_clf(OutDataDIR) + + x_path=atk_OutDataDIR+"/clf_results/" + full_actors = [f for f in os.listdir(x_path) if os.path.isdir(os.path.join(x_path, f))] + #for actor in sorted(ret_d.keys()): + for actor in sorted(full_actors): + if args.case == "eval_clf_results4_cpo" and not actor.startswith("CPO"): + continue + if args.case == "eval_clf_results4_dso" and not actor.startswith("DSO"): + continue + + if len(actor.split(".")) > 1: + features = actor.split(".")[1:] + else: + features=["all"] + logger.warning(f"assuming all features for {actor}") + + + actor_prefix = actor.split(".")[0] + actor_prefix_new = (actor_prefix+"_"+args.atk_subset) if args.atk_subset else actor_prefix + actor_new = actor.replace(actor_prefix, actor_prefix_new) + actor_prefix = actor_prefix_new + + atk_act_feat = load_feats(atk_OutDataDIR_full, actor_prefix) + atk_df_dict = {group: df_dict for group, 
df_dict in atk_act_feat} + #act_feat = load_feats(OutDataDIR, actor_prefix) #generator + act_feat = iter_feats(OutDataDIR, actor_prefix) #generator + total=load_feats_len(OutDataDIR, actor_prefix) + #print(actor, actor_prefix) #DSO.all DSO + + + best_d_len = len([k2 for k,v in best_d[actor].items() for k2,v2 in best_d[actor][k].items()]) + + subcase=("is_attack1", "1", ) #any speed diff + subcase=("is_attack2", "1", ) #is_attack flag was set + subcase=("is_attack_th", "1", "0.5") + subcase=("is_attack_th", "3", "0.05") + subcase=("is_attack_th", "1", "0.05") + subcase=("is_attack_abs", "1", "500") + + cut_off=40 + if args.eval_fac is not None: + if "." in args.eval_fac: + cut_off=int(args.eval_fac.split(".")[1]) + subcase=("is_attack_abs", "1", args.eval_fac.split(".")[0]) + + with tqdm(total=best_d_len, desc="eval_clf_results "+actor_new) as pbar: + g_i=0 + for group, df_dict_path in act_feat: + g_i+=1 + best_d_ag = best_d[actor][group] + pbar.set_description(f'eval_clf_results {actor_new} {group} ({g_i}/{total})') + out_d = atk_OutDataDIR+"/clf_results/"+actor_new+"/" + out_f = out_d + group+".csv.gz" + + if not os.path.isfile(out_f): + logger.warning(f"skipping missing file {out_f}") + pbar.update(len(best_d_ag.keys())) + continue + + atk_labels_d = atk_OutDataDIR+"/clf_is_atk_labels/"+actor_new+"/" + atk_labels_f = atk_labels_d + group+".csv.gz" + if not os.path.isfile(atk_labels_f): + logger.warning(f"skipping missing atk label file {atk_labels_f}") + pbar.update(len(best_d_ag.keys())) + continue + + #print(best_d_ag) + + + atk_labels_df = pd.read_csv(atk_labels_f, index_col=0, parse_dates=[0]) + + if False: + grid_col = [c for c in atk_labels_df.columns if "grid_expo_storage." in c and "_diff_to_custom_data_charge_speed_lag_0" in c] + for gc in grid_col: + print(atk_labels_df[atk_labels_df[gc] != 0][gc].abs().sort_values()) + print(atk_labels_df[atk_labels_df["charge_speed_lag_0_diff"] != 0]["charge_speed_lag_0_diff"].abs().sort_values()) + exit() + + out_df = pd.read_csv(out_f, index_col=0, parse_dates=[0]) + out_df["decision_function"] = out_df["decision_function"].apply(lambda x: to_float_list(x)) + #print(out_df) + #print(atk_labels_df) + if "is_attack_th" in subcase: + th_fac = float(subcase[2]) + atk_labels_df["is_attack_th"] = 1 + atk_labels_df["is_attack_th_type"] = "None" + atk_labels_df.loc[atk_labels_df["charge_speed_lag_0_diff"].abs() > th_fac*atk_labels_df["charge_speed_lag_0_diff"].abs().max(), "is_attack_th"] = -1 #diff between reported and should > threshold; captures FDI+/-, MAD+/-, but not MAD+/- with FDI== + atk_labels_df.loc[atk_labels_df["charge_speed_lag_0_diff"].abs() > th_fac*atk_labels_df["charge_speed_lag_0_diff"].abs().max(), "is_attack_th_type"] += "FDI" + atk_labels_df.loc[atk_labels_df["charge_speed_lag_0_diff"] > th_fac*atk_labels_df["charge_speed_lag_0_diff"].abs().max(), "is_attack_th_type"] += "OCPP_inc" + atk_labels_df.loc[atk_labels_df["charge_speed_lag_0_diff"] < -1*th_fac*atk_labels_df["charge_speed_lag_0_diff"].abs().max(), "is_attack_th_type"] += "OCPP_dec" + + grid_fac = th_fac + grid_col = [c for c in atk_labels_df.columns if "grid_expo_storage." 
in c and "_diff_to_custom_data_charge_speed_lag_0" in c] + if len(grid_col)>1: + logger.error(f"too many cols: {grid_col=}") + grid_col = grid_col[0] + atk_labels_df.loc[(atk_labels_df["is_attack_th"] == -1) & (atk_labels_df[grid_col].abs() > grid_fac*atk_labels_df[grid_col].abs().max()), "is_attack_th_type"] += "Combo" + atk_labels_df.loc[(atk_labels_df["is_attack_th"] == -1) & (atk_labels_df[grid_col] > grid_fac*atk_labels_df[grid_col].abs().max()), "is_attack_th_type"] += "Grid_inc" + atk_labels_df.loc[(atk_labels_df["is_attack_th"] == -1) & (atk_labels_df[grid_col] < -1*grid_fac*atk_labels_df[grid_col].abs().max()), "is_attack_th_type"] += "Grid_dec" + + #grid_fac = th_fac*2 + grid_fac = th_fac*2 + #print(atk_labels_df["is_attack_th"].value_counts()) + #grid_col = [c for c in atk_labels_df.columns if "grid_expo_storage." in c and "_diff_to_charge_speed_lag_0" in c][0] + atk_labels_df.loc[(atk_labels_df["charge_speed_lag_0_diff"].abs() == 0) & (atk_labels_df["is_attack_th_type"].str.contains("Grid")==False) & (atk_labels_df[grid_col].abs() > grid_fac*atk_labels_df[grid_col].abs().max()), "is_attack_th"] = -1 #diff between grid measurement and reported > threshold; captures MAD+/- with any FDI (also FDI==) + atk_labels_df.loc[(atk_labels_df["charge_speed_lag_0_diff"].abs() == 0) & (atk_labels_df["is_attack_th_type"].str.contains("Grid")==False) & (atk_labels_df[grid_col].abs() > grid_fac*atk_labels_df[grid_col].abs().max()), "is_attack_th_type"] += "MAD" + atk_labels_df.loc[(atk_labels_df["charge_speed_lag_0_diff"].abs() == 0) & (atk_labels_df["is_attack_th_type"].str.contains("Grid")==False) & (atk_labels_df[grid_col] > grid_fac*atk_labels_df[grid_col].abs().max()), "is_attack_th_type"] += "Grid_inc" + atk_labels_df.loc[(atk_labels_df["charge_speed_lag_0_diff"].abs() == 0) & (atk_labels_df["is_attack_th_type"].str.contains("Grid")==False) & (atk_labels_df[grid_col] < -1*grid_fac*atk_labels_df[grid_col].abs().max()), "is_attack_th_type"] += "Grid_dec" + + atk_labels_df.loc[(atk_labels_df["charge_speed_lag_0_diff"].abs() > 0) & (atk_labels_df["is_attack_th_type"].str.contains("Grid")==False) & (atk_labels_df[grid_col].abs() > grid_fac*atk_labels_df[grid_col].abs().max()), "is_attack_th"] = -1 + atk_labels_df.loc[(atk_labels_df["charge_speed_lag_0_diff"].abs() > 0) & (atk_labels_df["is_attack_th_type"].str.contains("Grid")==False) & (atk_labels_df[grid_col].abs() > grid_fac*atk_labels_df[grid_col].abs().max()), "is_attack_th_type"] += "SmallCombo" + atk_labels_df.loc[(atk_labels_df["charge_speed_lag_0_diff"].abs() > 0) & (atk_labels_df["is_attack_th_type"].str.contains("Grid")==False) & (atk_labels_df[grid_col] > grid_fac*atk_labels_df[grid_col].abs().max()), "is_attack_th_type"] += "Grid_inc" + atk_labels_df.loc[(atk_labels_df["charge_speed_lag_0_diff"].abs() > 0) & (atk_labels_df["is_attack_th_type"].str.contains("Grid")==False) & (atk_labels_df[grid_col] < -1*grid_fac*atk_labels_df[grid_col].abs().max()), "is_attack_th_type"] += "Grid_dec" + + if False: + atk_labels_df.loc[(atk_labels_df["is_attack2"] == 1), "is_attack_th"] = 1 #clean up based on original is_attack flag + + # _fac_2 = 0.5 + # atk_labels_df.loc[atk_labels_df["charge_speed_lag_0_diff"].abs() > _fac_2*atk_labels_df["charge_speed_lag_0_diff"].abs().max(), "is_attack_th"] = -1 + # atk_labels_df.loc[atk_labels_df[grid_col].abs() > _fac_2*atk_labels_df[grid_col].abs().max(), "is_attack_th"] = -1 + # print(atk_labels_df) + # exit() + elif "is_attack_abs" in subcase: + th_fac = float(subcase[2]) + atk_labels_df["is_attack_abs"] 
= 1 + atk_labels_df["is_attack_th_type"] = "None" + atk_labels_df.loc[atk_labels_df["charge_speed_lag_0_diff"].abs() > th_fac, "is_attack_abs"] = -1 #diff between reported and should > threshold; captures FDI+/-, MAD+/-, but not MAD+/- with FDI== + atk_labels_df.loc[atk_labels_df["charge_speed_lag_0_diff"].abs() > th_fac, "is_attack_th_type"] += "FDI" + atk_labels_df.loc[atk_labels_df["charge_speed_lag_0_diff"] > th_fac, "is_attack_th_type"] += "OCPP_inc" + atk_labels_df.loc[atk_labels_df["charge_speed_lag_0_diff"] < -1*th_fac, "is_attack_th_type"] += "OCPP_dec" + + grid_fac = th_fac + grid_col = [c for c in atk_labels_df.columns if "grid_expo_storage." in c and "_diff_to_custom_data_charge_speed_lag_0" in c] + if len(grid_col)>1: + logger.error(f"too many cols: {grid_col=}") + grid_col = grid_col[0] + atk_labels_df.loc[(atk_labels_df[grid_col].abs() > grid_fac), "is_attack_abs"] = -1 + atk_labels_df.loc[(atk_labels_df[grid_col].abs() > grid_fac), "is_attack_th_type"] += "MAD" + atk_labels_df.loc[(atk_labels_df[grid_col] > grid_fac), "is_attack_th_type"] += "Grid_inc" + atk_labels_df.loc[(atk_labels_df[grid_col] < -1*grid_fac), "is_attack_th_type"] += "Grid_dec" + + if True: + atk_labels_df.loc[(atk_labels_df["is_attack2"] == 1), "is_attack_abs"] = 1 #clean up based on original is_attack flag + + + #TODO: try diff definitions + atk_labels_df["is_attack"] = atk_labels_df[subcase[0]] + atk_labels_df["ocpp_diff"] = atk_labels_df["charge_speed_lag_0_diff"] + atk_labels_df["grid_diff"] = atk_labels_df[grid_col] + # print(atk_labels_df[atk_labels_df["is_attack"] == -1]) + # exit() + if (actor_prefix,group) not in df_dict_store: + df_dict = load_single_feat(df_dict_path) + df_dict_store[(actor_prefix,group)] = df_dict + g_max_speed = df_dict_store[(actor_prefix,group)]["cp_g_df"]["charge_speed_lag_0"].max() + atk_labels_df["ocpp_diff_rel"] = atk_labels_df["ocpp_diff"] / g_max_speed + atk_labels_df["grid_diff_rel"] = atk_labels_df["grid_diff"] / g_max_speed + atk_labels_df["ocpp_diff_rel"] *= 100 + atk_labels_df["grid_diff_rel"] *= 100 + #print(atk_labels_df) + + #exit() + #fig, ax = plt.subplots() + for atk_type in atk_labels_df["is_attack_th_type"].drop_duplicates(): + #if atk_type!="NoneFDIOCPP_decMADGrid_inc":continue + if atk_type == "None": + for idx,row in out_df.iterrows(): + if row["clf"].split(".")[0] != "IsolationForest":continue + if row["clf_r"].split(".")[0] != "LinearSVR":continue + for diff_t in ["ocpp_diff_rel", "grid_diff_rel"]: + for atk_size in range(-50,51,2): + atk_size_l = diff_t+"."+str(atk_size) + atk_labels_df_type = atk_labels_df.copy() + atk_labels_df_type["decision_function"] = row["decision_function"] + + if atk_size < -2: + atk_labels_df_type = atk_labels_df_type[atk_labels_df_type[diff_t] <= atk_size] + elif atk_size > 2: + atk_labels_df_type = atk_labels_df_type[atk_labels_df_type[diff_t] >= atk_size] + else: + continue + + if len(atk_labels_df_type[atk_labels_df_type["is_attack"] == -1]) == 0: + continue + # print(atk_labels_df_type[atk_labels_df_type["is_attack"] == -1]) + # exit() + + clf_name = row["clf"].split(".")[0] + clf_full = row["clf"].replace("."+row["clf_r"],"") + clf_r_name = row["clf_r"].split(".")[0] + clf_n_params=".".join(clf_full.split(".")[1:]) + clf_r_params=".".join(row["clf_r"].split(".")[1:]) + + y = atk_labels_df_type["is_attack"].values + y_pred = [1 if x >= round(row["clf_o"],2) else -1 for x in atk_labels_df_type["decision_function"].values] + tn, fp, fn, tp = confusion_matrix(y, y_pred, labels=[1, -1]).ravel() + #accuracy = 
accuracy_score(y, y_pred) + if (tp+fn) != 0: tpr = tp/(tp+fn) + else: tpr=0 + if (fp+tn) != 0: fpr = fp/(fp+tn) + else: fpr=0 + + totals_dict[actor_prefix][clf_name+"_"+clf_r_name]["overall"][row["clf_r"]][clf_full][group][atk_size_l]["fpr"].append(fpr) + totals_dict[actor_prefix][clf_name+"_"+clf_r_name]["overall"][row["clf_r"]][clf_full][group][atk_size_l]["tpr"].append(tpr) + elif False: + for idx,row in out_df.iterrows(): + if row["clf"].split(".")[0] != "IsolationForest":continue + if row["clf_r"].split(".")[0] != "LinearSVR":continue + for diff_t in ["ocpp_diff_rel", "grid_diff_rel"]: + for atk_size in range(-50,51,2): + atk_size_l = diff_t+"."+str(atk_size) + # print(row) + # print(row["clf_o"]) + # print(best_d_ag[row["clf"]]["offset"]) + # exit() + atk_labels_df_c = atk_labels_df.copy() + if len(atk_labels_df_c) != len(row["decision_function"]): + raise Exception(f'len(atk_labels_df_c) != len(row["decision_function"]) -- {len(atk_labels_df_c)} != {len(row["decision_function"])}') + atk_labels_df_c["decision_function"] = row["decision_function"] + atk_labels_df_type = atk_labels_df_c[((atk_labels_df_c["is_attack_th_type"] == atk_type) & (atk_labels_df_c["is_attack"] == -1)) | ((atk_labels_df_c["is_attack_th_type"] == "None") & (atk_labels_df_c["is_attack"] == 1))] + + if atk_size < -2: + atk_labels_df_type = atk_labels_df_type[atk_labels_df_type[diff_t] <= atk_size] + elif atk_size > 2: + atk_labels_df_type = atk_labels_df_type[atk_labels_df_type[diff_t] >= atk_size] + else: + continue + + if len(atk_labels_df_type[atk_labels_df_type["is_attack"] == -1]) == 0: + #logger.warning(f"no attacks for {atk_type}") + continue + # print(atk_labels_df_type[atk_labels_df_type["is_attack"] == -1]) + # exit() + + + if True:#drop concecutive repeats to avoid inflation of TPR wrt. 
ROC of imprecise classifiers + #print(atk_labels_df_type) + #print(df_dict_path) + #df_dict = load_single_feat(df_dict_path) + #print(df_dict["cp_g_df"]) + #print(out_df) + cs_typedf = atk_df_dict[group]["cp_g_df"].loc[atk_labels_df_type.index]["charge_speed_lag_0"].to_frame() + cs_typedf["is_attack"] = atk_labels_df_type["is_attack"] + #print(cs_typedf) + # 1 2 2 3 0 0 0 0 0 3 + # _ _ 2 _ _ 0 0 0 0 _ + # _ _ _ _ _ _ 0 0 0 _ + labels = (cs_typedf["charge_speed_lag_0"] != cs_typedf["charge_speed_lag_0"].shift()).cumsum() + cs_typedf['flag'] = (labels.map(labels.value_counts()) >= cut_off).astype(int) + cs_typedf_not_same = cs_typedf[(cs_typedf["flag"] == 0) | (cs_typedf["is_attack"] == -1)] #deflate but keep all atks + # print(cs_typedf_not_same) + # print(atk_labels_df_type) + atk_labels_df_type = atk_labels_df_type.loc[cs_typedf_not_same.index] + + if False: + fig, ax = plt.subplots(figsize=(16,9)) + ax.plot(cs_typedf.index, cs_typedf["charge_speed_lag_0"], 'o', label="charge_speed_lag_0") + ax.plot(cs_typedf_not_same.index, cs_typedf_not_same["charge_speed_lag_0"], 'o', label="charge_speed_lag_0 not same") + plt.legend() + plt.show() + plt.close() + exit() + + clf_name = row["clf"].split(".")[0] + clf_full = row["clf"].replace("."+row["clf_r"],"") + clf_r_name = row["clf_r"].split(".")[0] + clf_n_params=".".join(clf_full.split(".")[1:]) + clf_r_params=".".join(row["clf_r"].split(".")[1:]) + + y = atk_labels_df_type["is_attack"].values + y_pred = [1 if x >= row["clf_o"] else -1 for x in atk_labels_df_type["decision_function"].values] + #y_pred = [1 if x >= 0 else -1 for x in atk_labels_df_type["decision_function"].values] + tn, fp, fn, tp = confusion_matrix(y, y_pred, labels=[1, -1]).ravel() + #accuracy = accuracy_score(y, y_pred) + if (tp+fn) != 0: tpr = tp/(tp+fn) + else: tpr=0 + if (fp+tn) != 0: fpr = fp/(fp+tn) + else: fpr=0 + + totals_dict[actor_prefix][clf_name+"_"+clf_r_name][atk_type][row["clf_r"]][clf_full][group][atk_size_l]["fpr"].append(fpr) + totals_dict[actor_prefix][clf_name+"_"+clf_r_name][atk_type][row["clf_r"]][clf_full][group][atk_size_l]["tpr"].append(tpr) + + #plt.show() + #exit() + pbar.update(len(best_d_ag.keys())) + + plt.rc('text', usetex=USE_TEX) + plt.rc('font', family='serif') + font_size=22 + plt.rc('xtick',labelsize=font_size) + plt.rc('ytick',labelsize=font_size) + plt.rc('legend',fontsize=font_size) + + + for fig_out_dir,totals_dict_ac in out_dict.items(): + fig_out_dir += "_comp" + if FIX_atk_labels_based_on_CPO_data: + fig_out_dir += "_fix" + for actor_new,totals_dict in totals_dict_ac.items(): + if len(totals_dict)>0: + out_fig_d = fig_out_dir+("/clf_result_figs_"+args.case.replace("eval_clf_results","")+"/")+actor_new+"/"+"-".join(subcase)+"/" + Path(out_fig_d).mkdir(parents=True, exist_ok=True) + fig_dict=defaultdict(lambda:defaultdict(lambda:dict())) + for clf_name, d in tqdm(totals_dict.items(), desc=f"saving eval_clf_results2 {actor_new} {fig_out_dir}"): + #for clf_n, d in totals_dict.items(): + for atk_type, d2 in d.items(): + #for r_name, d3 in d2.items(): + for r_name in d2.keys(): + d3 = d2[r_name] + #fig_out_dir+="_"+r_name + for n_name in d3.keys(): + #rint("n_name",n_name) + #fig_out_dir+="_"+n_name + fig, ax = plt.subplots(figsize=(22,8)) + plt.rc('axes', prop_cycle=( + #cycler('linestyle', ['-', '--', ':', '-.']) * #'-', '--', ':', '-.' 
+ #cycler('color', [u'#1f77b4', u'#ff7f0e', u'#2ca02c', u'#d62728',]) #[ u'#1f77b4', u'#ff7f0e', u'#2ca02c', u'#d62728', u'#9467bd', u'#8c564b', u'#e377c2', u'#7f7f7f', u'#bcbd22', u'#17becf'] + cycler('color', [ u'#1f77b4', u'#ff7f0e', u'#2ca02c', u'#d62728', u'#9467bd', u'#8c564b', u'#e377c2', u'#7f7f7f', u'#bcbd22', u'#17becf']) * #[ u'#1f77b4', u'#ff7f0e', u'#2ca02c', u'#d62728', u'#9467bd', u'#8c564b', u'#e377c2', u'#7f7f7f', u'#bcbd22', u'#17becf'] + cycler('linestyle', ['-', ]) #'-', '--', ':', '-.' + )) + + d4 = d3[n_name] + tpr_dict=defaultdict(lambda:list()) + fpr_dict=defaultdict(lambda:list()) + for group, data in d4.items(): + for atk_size_l, data2 in data.items(): + #print(atk_size_l, data2) + for fpr, tpr in zip(data2["fpr"], data2["tpr"]): + fpr_dict[atk_size_l].append(fpr) + tpr_dict[atk_size_l].append(tpr) + + # print(fpr_dict) + # print(tpr_dict) + plot_fpr=defaultdict(lambda:list()) + plot_tpr=defaultdict(lambda:list()) + total_fpr=list() + for diff_t in ["ocpp_diff_rel", "grid_diff_rel"]: + for atk_size in range(-50,51,2): + atk_size_l = diff_t+"."+str(atk_size) + if atk_size_l in fpr_dict: + plot_fpr[diff_t].append(mean(fpr_dict[atk_size_l])) + total_fpr.append(mean(fpr_dict[atk_size_l])) + else: + plot_fpr[diff_t].append(0) + if atk_size_l in tpr_dict: + plot_tpr[diff_t].append(mean(tpr_dict[atk_size_l])) + else: + plot_tpr[diff_t].append(0) + plot_atk_size=[atk_size for atk_size in range(-50,51,2)] + plot_mean_fpr=[mean(total_fpr) for atk_size in range(-50,51,2)] + # print(plot_fpr) + # print(plot_tpr) + # print(plot_atk_size) + + for diff_t in ["ocpp_diff_rel", "grid_diff_rel"]: + if diff_t == "ocpp_diff_rel": + diff_t_label="Change in Reported Load" + elif diff_t == "grid_diff_rel": + diff_t_label="Change in Grid Load" + + ax.plot( + plot_atk_size, + plot_tpr[diff_t], + #color="b", + label=f"TPR over {diff_t_label}", #{atk_type.replace('None','').replace('FDI','').replace('MAD','')} + #label=f"TPR over {diff_t_label} (mean = {round(mean([v for v in plot_tpr[diff_t] if v != 0]), 2)})", #{atk_type.replace('None','').replace('FDI','').replace('MAD','')} + lw=2, + alpha=0.8, + ) + # ax.plot( + # plot_atk_size, + # plot_fpr[diff_t], + # #color="b", + # label=f"FPR over {diff_t_label} (mean = {round(mean(plot_fpr[diff_t]), 2)})", #{atk_type.replace('None','').replace('FDI','').replace('MAD','')} + # lw=2, + # alpha=0.8, + # ) + ax.plot( + plot_atk_size, + plot_mean_fpr, + #color="b", + label=f"FPR (mean = {round(mean(plot_mean_fpr), 2)})", #{atk_type.replace('None','').replace('FDI','').replace('MAD','')} + lw=2, + alpha=0.8, + ) + out_fig_f=out_fig_d+f"{r_name}_{n_name}_{atk_type}_44.pdf" + # print(r_name,n_name) + # print(out_fig_f) + + box = ax.get_position() + ax.set_position([box.x0, box.y0, box.width * (8/22)*0.8, box.height*0.8]) + + ax.legend(loc='right', fontsize=font_size, bbox_to_anchor=(2.95,0.505)) + #fig.show() + #ax.legend(loc='best', fontsize=font_size) + plt.xticks([-50,-25,0,25,50]) + + plt.gca().get_yaxis().get_major_formatter().set_useOffset(False) + plt.gca().get_yaxis().get_major_formatter().set_scientific(False) + plt.ylim([0,1]) + plt.xlim([-50,50]) + plt.xlabel(r'\textbf{Min. 
Attack Magnitude}', fontsize=font_size+2) + plt.ylabel(r'\textbf{TPR/FPR}', fontsize=font_size+2) + if os.path.isfile(out_fig_f): + logger.warning(f"overwriting {out_fig_f=}") + fig.savefig(out_fig_f) + #plt.show() + plt.close() + # exit() \ No newline at end of file diff --git a/ids/load_data.py b/ids/load_data.py new file mode 100644 index 0000000000000000000000000000000000000000..321d73c501a0b89de28d2edef84fd3fb0d4030bc --- /dev/null +++ b/ids/load_data.py @@ -0,0 +1,603 @@ + +from collections import defaultdict +from copy import deepcopy +import gzip +import json +import pickle +import os +from typing import Dict, List, Tuple +import pandas as pd +from tqdm import tqdm +import yaml +import matplotlib.pyplot as plt + +import ast +import re + +import logging + +from features_aux import get_grid_measurements_from_export, get_grid_pp + +logger = logging.getLogger("WATTSON_EV_IDS.Load_Data") + +# os.environ["QT_QPA_PLATFORM"] = "wayland" +#os.environ["QT_QPA_PLATFORM"] = "xcb" +# py load_data.py -c=print_data -v test + + + +START_DATE="2023-11-01 22:13:00" + + +def get_dataset_l(configs): + #atk_1_0.2_powerowl_example + atk_regex = re.compile(r"^.*?\/(atk_[\d\.]+_[\d\.]+)_[\w\d_]+$") + + clean_dsets=[] + for dataset in list(configs.keys()): + if "atk" in dataset: + clean_dsets.append(dataset) + for subdir in configs[dataset]["DIR"]: + result = atk_regex.search(subdir) + if result: + new_dset=dataset.replace("_atk","")+"_"+result.group(1) + clean_dsets.append(new_dset) + configs[new_dset] = deepcopy(configs[dataset]) + configs[new_dset]["DIR"]=subdir + configs[new_dset]["OutDataDIR"] = "data/"+new_dset + else: + logger.error(f"get_dataset_l unk atk dir {subdir}") + else: + clean_dsets.append(dataset) + return clean_dsets + + +def clean_dataset_l(datasets, configs, update_config=False): + #atk_1_0.2_powerowl_example + atk_regex = re.compile(r"^.*?\/(atk_[\d\.]+_[\d\.]+)_[\w\d_]+$") + + clean_dsets=[] + for dataset in datasets: + if dataset.endswith("_atk"): + for subdir in configs[dataset]["DIR"]: + result = atk_regex.search(subdir) + if result: + new_dset=dataset.replace("_atk","")+"_"+result.group(1) + clean_dsets.append(new_dset) + if update_config: + configs[new_dset] = deepcopy(configs[dataset]) + configs[new_dset]["DIR"]=subdir + configs[new_dset]["OutDataDIR"] = "data/"+new_dset + else: + logger.error(f"clean_dataset_l unk atk dir {subdir}") + else: + clean_dsets.append(dataset) + return clean_dsets + +def print_data_dict(file_dict): + for d_type, x_path_dict in file_dict.items(): + logger.info(d_type) + for x_path, f_list in x_path_dict.items(): + logger.info("\t"+x_path) + logger.info(f"\t\t{len(f_list)}") + +def print_data_df(file_df): + logger.info(file_df) + + +def get_file_dfs(datasets, configs) -> Dict[str, pd.DataFrame]: + file_dict = {} + file_dfs = {} + for dataset in datasets: + DIR = configs[dataset]["DIR"] + file_dict[dataset] = [] + # file_dict[dataset]["controller-export"] = {} + # file_dict[dataset]["logs"] = {} + + for x in ["estimation", "measurements", "power-grid"]: + x_path = os.path.join(DIR, "controller-export", x) + onlyfiles = [f for f in os.listdir(x_path) if os.path.isfile(os.path.join(x_path, f))] + #file_dict[dataset]["controller-export"][x] = onlyfiles + for f in onlyfiles: + file_dict[dataset].append({"type":"controller-export", "sub_type":x, "x_path":x_path, "file":f}) + + node_regex = re.compile(r"^n\d+$") + sub_folders = [name for name in os.listdir(DIR) if os.path.isdir(os.path.join(DIR, name))] + for x in [f for f in sub_folders if 
f.startswith("CPO_") or f.startswith("NodeCP_") or node_regex.match(f)]: + x_path = os.path.join(DIR, x) + #/home/dk/git/wattson-artifacts/test_powerowl_example_2023-12-22-18-22-51/CPO_0/CPO_0-service-34.log + onlyfiles = [f for f in os.listdir(x_path) if os.path.isfile(os.path.join(x_path, f))] + #file_dict[dataset]["logs"][x] = onlyfiles + for f in onlyfiles: + file_dict[dataset].append({"type":"logs", "sub_type":x, "x_path":x_path, "file":f}) + + file_dfs[dataset] = pd.DataFrame().from_dict(file_dict[dataset]) + return file_dfs + + +def get_cpo_ocpp_data(file_df) -> Dict[str, Dict[str, List[Tuple[str, pd.DataFrame]]]]: + regex = re.compile(r"^\d{1,4}-\d{1,2}-\d{1,2} \d{1,2}:\d{1,2}:\d{1,2},\d{1,3} - CPOService - INFO - [\d\.]+ - on_tx (.*?)$") + cpo_tx_df_cps={} + for cpo_id in [s for s in file_df["sub_type"].drop_duplicates() if s.startswith("CPO_")]: + cpo_log = file_df[(file_df["type"] == "logs") & (file_df["sub_type"] == cpo_id) & (file_df["file"] == "CPO_0-service-34.log")][["x_path", "file"]] + cpo_log_file = os.path.join(*cpo_log.iloc[0].values) + logger.info(cpo_log_file) + + num_lines = sum(1 for line in open(cpo_log_file,'r')) + tx_data = [] + with open(cpo_log_file) as f: + for line in tqdm(f, total=num_lines, desc=f"Loading {cpo_id}"): + if "ERROR" in line or "Traceback" in line: + logger.error(f"found error in {cpo_log_file}:\t{line}") + result = regex.search(line) + #2023-12-23 01:39:29,777 - CPOService - INFO - 140672877031424.35739 - on_tx ('8ada5dedf167ddcb685bcc4e679570651', 'CP_14', '2023-01-01T22:57:07.401781', '2023-01-01T23:56:54.334702', [{'timestamp': '2023-01-01T23:45:00', 'sampled_value': [{'value': 659575}]}], {'vendor_id': 'wattson_v2g', 'atk_type': 'None'}) + if result: + try: + #('8ada5dedf167ddcb685bcc4e679570651', 'CP_14', '2023-01-01T22:57:07.401781', '2023-01-01T23:56:54.334702', [{'timestamp': '2023-01-01T23:45:00', 'sampled_value': [{'value': 659575}]}], {'vendor_id': 'wattson_v2g', 'atk_type': 'None'}) + #('1e9ce1f6770ed662e8810c827bf8f2611', 'CP_11', '2023-11-09T18:01:58.949633', '2023-11-09T18:00:02.158313', 'Updated', [{'timestamp': '2023-11-09T18:00:00', 'sampled_value': [{'value': 8905865}]}], {'vendor_id': 'wattson_v2g', 'atk_type': 'None'}) + g = result.group(1) + g_lit = ast.literal_eval(g) + except ValueError as e: + if "nan" in g or "inf" in g: + # g = g.replace("nan", "0") + # g_lit = ast.literal_eval(g) + # else: + logger.warning(f"nan or inf in {result.group(0)}") + continue + else: raise e + + tx_msg = { + "cp_c_id": g_lit[0], + "cp_group": g_lit[1], + "sim_time": g_lit[2], + "timestamp": g_lit[3], #tx msg timestamp + "event": g_lit[4], # + "meter_value_ts": g_lit[5][0]["timestamp"], #meter read timestamp + "meter_value_sampled_value": g_lit[5][0]["sampled_value"][0]["value"], + # "custom_data_vendor_id": g_lit[6]['vendor_id'], + # "custom_data_atk_type": g_lit[6]['atk_type'], + } + for k,v in g_lit[6].items(): + #{'vendor_id': 'wattson_v2g', 'atk_type': 'MAD', 'meter_no_atk': 7285447, 'energy_interval': 2.539881115023535, 'original_energy_interval': 2.43, 'average_power': 10.137, 'original_average_power': 9.69845} + tx_msg["custom_data_"+k] = v + tx_data.append(tx_msg) + + tx_df = pd.DataFrame(tx_data) + logger.info(tx_df.iloc[0:]) + tx_df["sim_time"] = pd.to_datetime(tx_df["sim_time"]) + tx_df["timestamp"] = pd.to_datetime(tx_df["timestamp"]) + tx_df["meter_value_ts"] = pd.to_datetime(tx_df["meter_value_ts"]) + #with pd.option_context('display.max_rows', 100, 'display.max_columns', 9): + logger.info(tx_df.iloc[0:]) + 
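+ # The per-CP loop below converts the cumulative meter readings into power values: charge_speed [W] = (meter_value[t] - meter_value[t-1]) [Wh] / elapsed_time [h]; the same computation on custom_data_meter_no_atk yields the attack-free reference speed custom_data_charge_speed.
+ # Illustrative example (made-up numbers): a meter going from 659575 Wh to 664575 Wh within 30 minutes corresponds to (664575 - 659575) / 0.5 = 10000 W.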
logger.debug(tx_df.columns) + + tx_df_cps=defaultdict(lambda: []) + for cp in tqdm(tx_df["cp_c_id"].drop_duplicates(), desc="Loading cp_c_id"): + # logger.debug(cp) + tx_df_cp = tx_df[tx_df["cp_c_id"] == cp].copy() + + tx_df_cp = tx_df_cp.sort_values(by="meter_value_ts", ascending=True) + tx_df_cp["meter_diff"] = tx_df_cp["meter_value_sampled_value"] - tx_df_cp["meter_value_sampled_value"].shift(1) + tx_df_cp["meter_diff"].fillna(0, inplace=True) + tx_df_cp["time_diff"] = (tx_df_cp["meter_value_ts"] - tx_df_cp["meter_value_ts"].shift(1)).apply(lambda x: x.total_seconds()) + tx_df_cp["time_diff"] = tx_df_cp["time_diff"] / (60*60) #s to h + tx_df_cp["charge_speed"] = tx_df_cp["meter_diff"] / tx_df_cp["time_diff"] #wh in w + tx_df_cp["charge_speed"].fillna(0, inplace=True) + + tx_df_cp["custom_data_meter_diff"] = tx_df_cp["custom_data_meter_no_atk"] - tx_df_cp["custom_data_meter_no_atk"].shift(1) + tx_df_cp["custom_data_meter_diff"].fillna(0, inplace=True) + tx_df_cp["custom_data_charge_speed"] = tx_df_cp["custom_data_meter_diff"] / tx_df_cp["time_diff"] #wh in w + tx_df_cp["custom_data_charge_speed"].fillna(0, inplace=True) + for c in ["custom_data_energy_interval", "custom_data_original_energy_interval", "custom_data_average_power", "custom_data_original_average_power"]: + tx_df_cp[c].fillna(0, inplace=True) + + # logger.debug(tx_df_cp) + # logger.debug(tx_df_cp[tx_df_cp["charge_speed"] < 0]) + # exit() + + tx_df_cp = tx_df_cp.set_index("meter_value_ts") + tx_df_cp = tx_df_cp.sort_index(ascending=True) + if len(tx_df_cp[tx_df_cp["charge_speed"] < 0]) > 0: + logger.warning(f"tx_df_cp {cp} w/ <0 speed") + logger.warning(tx_df_cp) + logger.warning(tx_df_cp[tx_df_cp["charge_speed"] < 0]) + logger.warning(tx_df_cp.index[0]) + exit() + + if False: #TODO: START_DATE? + tx_df_cp = tx_df_cp[["cp_group", "charge_speed"]] + tx_df_cp.loc[tx_df_cp.index[0] - pd.Timedelta(seconds=1)] = {"cp_group":tx_df_cp["cp_group"].iloc[0],"charge_speed":0} + tx_df_cp.loc[pd.to_datetime(START_DATE)] = {"cp_group":tx_df_cp["cp_group"].iloc[0],"charge_speed":0} + + tx_df_cp = tx_df_cp.sort_index(ascending=True) + + #tx_df_cp = tx_df_cp.asfreq(freq='5Min', method='bfill') + if False: #TODO: resample? 
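+ # (disabled) If enabled, the resampling below would backfill each CP series onto a fixed 5-minute grid, so the per-CP frames share common timestamps and can later be summed per charging group; the raw OCPP meter timestamps are otherwise irregular.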
+ tx_df_cp = tx_df_cp.resample('5Min', offset="0s").bfill() + # logger.debug(tx_df_cp.iloc[0:]) + tx_df_cps[tx_df_cp["cp_group"].iloc[0]].append((cp, tx_df_cp)) + cpo_tx_df_cps[cpo_id] = tx_df_cps + return cpo_tx_df_cps + +def plot_cpo_ocpp_data(tx_df_cps): + fig0, axes0 = plt.subplots() + for k,v in sorted(tx_df_cps.items()): + logger.debug(k,len(v)) + tx_df_group = pd.concat([_v[1] for _v in v]) + tx_df_group = tx_df_group.reset_index(drop=False) + # logger.debug(tx_df_group.iloc[0:]) + tx_df_group = tx_df_group[["meter_value_ts", "charge_speed"]] + + tx_df_group_sum = tx_df_group.groupby("meter_value_ts").sum() + # logger.debug(tx_df_group_sum.iloc[0:]) + # logger.debug(tx_df_group_sum.columns) + # break + tx_df_group_sum.plot(ax=axes0, y='charge_speed', label=k) + # for _v in v: + # #tx_df[tx_df["cp_c_id"] == _v[0]].plot(ax=axes0, x="meter_value_ts", y='charge_speed') + # _v[1].plot(ax=axes0, y='charge_speed') + #break + # plt.show() + # plt.close() + + +def get_dso_oscp_data(file_df, ts_string = "sim_time", meas_string = "charge_speed", time_offset_in_h=0) -> Dict[str, pd.DataFrame]: + #/home/dk/git/wattson-artifacts/test_powerowl_example_2023-12-22-18-22-51/n375/n375-service-35.log + dso_oscp = file_df[(file_df["type"] == "logs") & (file_df["sub_type"] == "n375")][["x_path", "file"]] + dso_oscp_log = dso_oscp[(dso_oscp["file"] == "n375-service-35.log")] + dso_oscp_log_file = os.path.join(*dso_oscp_log.iloc[0].values) + + regex_dso = re.compile("^\d{1,4}-\d{1,2}-\d{1,2} \d{1,2}:\d{1,2}:\d{1,2},\d{1,3} - DSO_OSCP - INFO - [\d\.]+ - handleUpdateGroupMeasurements (.*?)$") + num_lines = sum(1 for line in open(dso_oscp_log_file,'r')) + dso_tx_data = [] + with open(dso_oscp_log_file) as f: + for line in tqdm(f, total=num_lines, desc=f"Loading {dso_oscp_log['file'].iloc[0]}"): + if "ERROR" in line or "Traceback" in line: + logger.error(f"found error in {dso_oscp_log_file}:\t{line}") + result = regex_dso.search(line) + if result: + try: + #({'group_id': 'CP_4', 'measurements': [{'value': 9, 'phase': 'ALL', 'unit': 'WH', 'energy_type': 'FLEXIBLE', 'direction': 'NET', 'measure_time': '2023-01-02T14:05:00', 'initial_measure_time': '2023-01-01T17:34:05'}]}, 'wjm1neFbwSEVkns5IYstFsheEWR3tsueIWJbJvVFCtE') + g = result.group(1) + #('2023-11-03T09:24:52.617660', 0.25, {'group_id': 'CP_9', 'measurements': [{'value': 2533, 'phase': 'ALL', 'unit': 'WH', 'energy_type': 'FLEXIBLE', 'direction': 'NET', 'measure_time': '2023-11-03T10:12:14', 'initial_measure_time': '2023-11-02T21:42:00'}]}, 'Xng5QTKs7KgjYyJL-ygXl259BVYjUdYROq96ojOj9eI') + g_lit = ast.literal_eval(g) + except ValueError as e: + if "nan" in g or "inf" in g: + # g = g.replace("nan", "0") + # g_lit = ast.literal_eval(g) + # else: + logger.error(f"ValueError {e=} in {result.group(0)}") + continue + else: raise e + sim_time = g_lit[0] + oscp_interval_h = g_lit[1] + g_lit_d = g_lit[2] + tx_msg = { + "sim_time": sim_time, + "oscp_interval_h": oscp_interval_h, + "group_id": g_lit_d["group_id"], + # "token": g_lit[3], + } + for k,v in g_lit_d["measurements"][0].items(): + tx_msg["measurements_"+k] = v + dso_tx_data.append(tx_msg) + + dso_tx_df = pd.DataFrame(dso_tx_data) + dso_tx_df["sim_time"] = pd.to_datetime(dso_tx_df["sim_time"]) + dso_tx_df["measurements_measure_time"] = pd.to_datetime(dso_tx_df["measurements_measure_time"]) - pd.Timedelta(hours=time_offset_in_h) + dso_tx_df["measurements_initial_measure_time"] = pd.to_datetime(dso_tx_df["measurements_initial_measure_time"]) + + + # dso_tx_df["measurements_value"] = 
dso_tx_df["measurements_value"] / dso_tx_df["oscp_interval_h"] + oscp_interval = dso_tx_df["oscp_interval_h"].mean() * 60 + logger.info(dso_tx_df.iloc[0:]) + logger.debug(dso_tx_df.columns) + + dso_tx_df_cps = dso_tx_df["group_id"].drop_duplicates() + #fig, axes = plt.subplots(nrows=len(dso_tx_df_cps), ncols=1) + + # ts_strings = ["measurements_measure_time", "sim_time"] + # ts_string = "measurements_measure_time" + # ts_string = "sim_time" + # meas_string = "charge_speed" + # meas_string = "measurements_value" + # fig, axes = plt.subplots() + # for i,cp_g in enumerate(dso_tx_df_cps): + # dso_tx_df_cp = dso_tx_df[dso_tx_df["group_id"] == cp_g] + # dso_tx_df_cp = dso_tx_df_cp.sort_values(by=ts_string, ascending=True) + # dso_tx_df_cp.plot(ax=axes, x='measurements_measure_time', y='measurements_value', label=cp_g) + + # plt.show() + # fig.close() + + # for ts_string in ts_strings: + # fig2, axes2 = plt.subplots() + + dso_tx_df_cp_dict = {} + for i,cp_g in enumerate(sorted(dso_tx_df_cps)): + try: + dso_tx_df_cp = dso_tx_df[dso_tx_df["group_id"] == cp_g] + # dso_tx_df_cp = dso_tx_df_cp[::4] + + dso_tx_df_cp = dso_tx_df_cp.sort_values(by=ts_string, ascending=True) + if len(dso_tx_df_cp.loc[dso_tx_df_cp[ts_string].duplicated(keep="first")]) > 0: + logger.info(dso_tx_df_cp[dso_tx_df_cp[ts_string].duplicated(keep=False)]) + logger.info(dso_tx_df_cp[dso_tx_df_cp[ts_string].duplicated(keep="last")].index[0]) + logger.info(dso_tx_df_cp[dso_tx_df_cp[ts_string].duplicated(keep="first")].index[0]) + dso_tx_df_cp_t_i1 = dso_tx_df_cp[dso_tx_df_cp[ts_string].duplicated(keep="first")].index + dso_tx_df_cp_t_i2 = dso_tx_df_cp[dso_tx_df_cp[ts_string] > max(dso_tx_df_cp.loc[dso_tx_df_cp_t_i1][ts_string])].index + dso_tx_df_cp.loc[dso_tx_df_cp_t_i1 , ts_string] += pd.Timedelta(hours=1) + dso_tx_df_cp.loc[dso_tx_df_cp_t_i2 , ts_string] += pd.Timedelta(hours=1) + + #dso_tx_df_cp = dso_tx_df_cp.iloc[0::8] + + # print(dso_tx_df_cp) + if True: + OSCP_INTERVAL_M=15 + dso_tx_df_cp["meter_diff"] = dso_tx_df_cp["measurements_value"] - dso_tx_df_cp["measurements_value"].shift(1) + dso_tx_df_cp["meter_diff"].fillna(0, inplace=True) + dso_tx_df_cp["time_diff"] = OSCP_INTERVAL_M / 60 + dso_tx_df_cp["charge_speed"] = dso_tx_df_cp["meter_diff"] / dso_tx_df_cp["time_diff"] #wh in w + dso_tx_df_cp["charge_speed"].fillna(0, inplace=True) + if False: + dso_tx_df_cp["meter_diff"] = dso_tx_df_cp["measurements_value"] - dso_tx_df_cp["measurements_value"].shift(1) + dso_tx_df_cp["meter_diff"].fillna(0, inplace=True) + dso_tx_df_cp["time_diff"] = (dso_tx_df_cp[ts_string] - dso_tx_df_cp[ts_string].shift(1)).apply(lambda x: x.total_seconds()) + dso_tx_df_cp["time_diff"] = dso_tx_df_cp["time_diff"] / (60*60) #s to h + dso_tx_df_cp["charge_speed"] = dso_tx_df_cp["meter_diff"] / dso_tx_df_cp["time_diff"] #wh in w + dso_tx_df_cp["charge_speed"].fillna(0, inplace=True) + if False: + dso_tx_df_cp["charge_speed"] = dso_tx_df_cp["measurements_value"] + dso_tx_df_cp["time_diff"] = (dso_tx_df_cp[ts_string] - dso_tx_df_cp[ts_string].shift(1)).apply(lambda x: x.total_seconds()) + dso_tx_df_cp["time_diff"] = dso_tx_df_cp["time_diff"] / (60*60) #s to h + dso_tx_df_cp["meter_diff"] = dso_tx_df_cp["charge_speed"] * dso_tx_df_cp["time_diff"] + dso_tx_df_cp["meter_diff"].fillna(0, inplace=True) + dso_tx_df_cp["measurements_value"] = 0 + dso_tx_df_cp["measurements_value"] = dso_tx_df_cp["meter_diff"].cumsum() + # print(dso_tx_df_cp) + # exit() + # if cp_g == "CP_11": + # with pd.option_context('display.max_rows', 100, 'display.max_columns', 10): + # 
print(dso_tx_df_cp.head(100)[["sim_time", "meter_diff", "time_diff" , "charge_speed", "measurements_value"]]) + # exit() + + + dso_tx_df_cp = dso_tx_df_cp.set_index(ts_string) + dso_tx_df_cp = dso_tx_df_cp.sort_index(ascending=True) + # dso_tx_df_cp = dso_tx_df_cp.asfreq(freq='5Min', method='bfill') + + if False: #TODO: START_DATE= + dso_tx_df_cp = dso_tx_df_cp[["group_id", meas_string]] + # print(dso_tx_df_cp) + dso_tx_df_cp.loc[dso_tx_df_cp.index[0] - pd.Timedelta(seconds=1)] = {"group_id":dso_tx_df_cp["group_id"].iloc[0],meas_string:0} + dso_tx_df_cp.loc[pd.to_datetime(START_DATE)] = {"group_id":dso_tx_df_cp["group_id"].iloc[0],meas_string:0} + dso_tx_df_cp = dso_tx_df_cp.sort_index(ascending=True) + # print(dso_tx_df_cp) + # exit() + + #dso_tx_df_cp = dso_tx_df_cp.resample(str(oscp_interval)+'Min', offset="0s").bfill() + if False: #TODO: resample? + dso_tx_df_cp = dso_tx_df_cp.resample('5Min', offset="0s").bfill() + dso_tx_df_cp_dict[cp_g] = dso_tx_df_cp + # dso_tx_df_cp.plot(ax=axes2, y=meas_string, label=cp_g) + except Exception as e: + logger.error(e) + logger.error(dso_tx_df_cp.index) + logger.error(len(dso_tx_df_cp.index)) + logger.error(len(dso_tx_df_cp.index.drop_duplicates())) + logger.error(dso_tx_df_cp[dso_tx_df_cp.index.duplicated(keep=False)]) + raise e + # plt.show() + # plt.close() + return dso_tx_df_cp_dict + +def plot_dso_oscp_data(dso_tx_df_cp_dict, meas_string = "charge_speed"): + fig2, axes2 = plt.subplots() + for cp_g,dso_tx_df_cp in dso_tx_df_cp_dict.items(): + dso_tx_df_cp.plot(ax=axes2, y=meas_string, label=cp_g) + # plt.show() + # plt.close() + +def get_measurements_expo(file_df, DIR) -> pd.DataFrame: + pp=get_grid_pp(DIR) + + sgen = get_grid_measurements_from_export(DIR, pp, bus_ns=[], target_elem="sgen", target_var=".MEASUREMENT.active_power" ) + bus = get_grid_measurements_from_export(DIR, pp, bus_ns=[], target_elem="bus", target_var=".MEASUREMENT.active_power" ) + load = get_grid_measurements_from_export(DIR, pp, bus_ns=[], target_elem="load", target_var=".MEASUREMENT.active_power" ) + #storage = get_grid_measurements_from_export(DIR, pp, bus_ns=[], target_elem="storage", target_var=".MEASUREMENT.active_power" ) + elems=[sgen, bus, load]#, storage + + out_df = None + for e in elems: + for group,vals in e.items(): + for val in vals: + if out_df is None: + out_df = val.to_frame() + else: + out_df[val.name] = val + out_df[val.name] = val + new_name = val.name.replace(".MEASUREMENT","").replace("grid_expo_","0.0.") + out_df = out_df.rename(columns={val.name: new_name}).copy() + return out_df + + +def get_measurements(file_df, DataPointMapsDIR) -> pd.DataFrame: + regex_meas = re.compile(r"^measurements\-(\d+)\.jsonl$") + measurements = file_df[(file_df["type"] == "controller-export") & (file_df["sub_type"] == "measurements")][["x_path", "file"]] + logger.debug(measurements) + measurement_dict = defaultdict(lambda: []) + for index, row in tqdm(measurements.iterrows(), desc="Loading measurements"): + result = regex_meas.search(row["file"]) + if result: + with open(os.path.join(row["x_path"], row["file"]), "rb") as f: + for line in f: + measurement_data = json.loads(line) + measurement_dict[result.group(1)].append(measurement_data) + else: + logger.error(f"unk file {row['file']}") + + for k,v in list(measurement_dict.items()): + logger.debug(k, len(v)) + for v in measurement_dict["401"][:10]: + #{'coa': 401, 'value_map': {'10010': 1.0299999713897705, '10020': 0.0, '10030': -8289678.0, '10040': -10461183.0, '10050': 8289678.0, '10060': 10461183.0}, 'sim-time': 
1698972846.4926767, 'clock-time': 1703443759.360438} + #{'coa': 401, 'value_map': {'10010': 1.0299999713897705, '10020': 0.0}, 'sim-time': 1698982975.2340364, 'clock-time': 1703443770.6145942} + logger.debug(v) + + regex_dp = re.compile(r"^(\d+)\-data\-points\.yml$") + # DataPointMapsDIR = configs[dataset]["DataPointMapsDIR"] + onlyfiles = [f for f in os.listdir(DataPointMapsDIR) if os.path.isfile(os.path.join(DataPointMapsDIR, f))] + logger.debug(onlyfiles) + + dp_dict = defaultdict(lambda: []) + for f in tqdm(onlyfiles, desc="Loading Data Point Maps"): + result = regex_dp.search(f) + if result: + with open(os.path.join(DataPointMapsDIR, f), "rb") as f: + dp_data = yaml.load(f, Loader=yaml.FullLoader) + dp_dict[result.group(1)].append(dp_data) + else: + logger.error(f"unk file {f}") + + #[{'401': [{'identifier': '401.10010', 'protocol': '60870-5-104', 'protocol_data': {'coa': 401, 'cot': 1, 'direction': 'monitoring', 'ioa': 10010, 'type_id': 13}, 'providers': {'sources': [{'domain': 'source', 'provider_data': {'attribute': 'voltage', 'context': 'MEASUREMENT', 'grid_element': 'bus.0', 'type': 'float'}, 'provider_type': 'POWER_GRID'}]}, 'value': None}, {'identifier': '401.10020', 'protocol': '60870-5-104', 'protocol_data': {'coa': 401, 'cot': 1, .... + logger.debug(dp_dict["401"]) + + meas_data = [] + for coa,meas_s in tqdm(list(measurement_dict.items()), desc="Loading meas_data"): + for meas in meas_s: + if str(meas["coa"]) != coa: + logger.error(f"unk meas coa {coa, meas}") + dps = dp_dict[coa] + for dp in dps: + coa_dps = dp[coa] + for coa_dp in coa_dps: + ioa = coa_dp['protocol_data']["ioa"] #10010 + if str(ioa) not in meas["value_map"]: + continue #removes control IOAs... + + if str(coa_dp['protocol_data']["coa"]) != coa: + logger.error(f"unk dp coa {coa, coa_dp}") + coa_ioa = coa_dp["identifier"] #'401.10010' + + direction = coa_dp['protocol_data']["direction"] #control or monitoring + direction_s = "sources" if direction == "monitoring" else "targets" + + if direction_s not in coa_dp['providers']: + logger.error(f"no {direction_s=} in {coa, coa_dp}") + if len(coa_dp['providers'][direction_s]) > 1: + logger.error(f"unk {direction_s=} len>1 {coa, coa_dp}") + domain = coa_dp['providers'][direction_s][0]["domain"] #source or target + attribute = coa_dp['providers'][direction_s][0]["provider_data"]["attribute"] #voltage + grid_element = coa_dp['providers'][direction_s][0]["provider_data"]["grid_element"] #bus.0 + + meas_value = meas["value_map"][str(ioa)] + sim_time = meas["sim-time"] + #clock_time = meas["clock-time"] + + meas_data.append({"coa_ioa":coa_ioa,"coa":coa,"ioa":ioa,"direction":direction,"domain":domain,"attribute":attribute,"grid_element":grid_element,"meas_value":meas_value,"sim_time":sim_time})#,"clock_time":clock_time}) + + meas_df = pd.DataFrame(meas_data) + meas_df["sim_time"] = pd.to_datetime(meas_df["sim_time"], unit="s") + #meas_df["clock_time"] = pd.to_datetime(meas_df["clock_time"], unit="s") + meas_df.sort_values(by="sim_time", ascending=True) + logger.debug(meas_df) + meas_df2 = meas_df[(meas_df["direction"] == "monitoring") & (meas_df["domain"] == "source")] #does nothing + # logger.debug(meas_df) + + dups = defaultdict(lambda:[]) #dups for two sides of lines + + clean_meas_data = [] + for coa_ioa in tqdm(meas_df["coa_ioa"].drop_duplicates(), desc="Loading clean_meas_data"): + meas_df_i = meas_df[meas_df["coa_ioa"] == coa_ioa].copy() + # logger.debug(meas_df_i) + if len(meas_df_i["attribute"].drop_duplicates()) != 1 or 
len(meas_df_i["grid_element"].drop_duplicates()) != 1: + logger.error(f'error w dups in {meas_df_i["attribute"].drop_duplicates(), meas_df_i["grid_element"].drop_duplicates()}') + + att = meas_df_i["attribute"].drop_duplicates().iloc[0] + ge = meas_df_i["grid_element"].drop_duplicates().iloc[0] + + meas_df_i = meas_df_i.set_index("sim_time") + meas_df_i = meas_df_i.sort_index(ascending=True) + + def resample_meas(df, freq='15Min'): + oidx = df.index + nidx = pd.date_range(oidx.min(), oidx.max(), freq=freq, normalize=True) + res = df.reindex(oidx.union(nidx)).interpolate('time', limit_direction="both").reindex(nidx) + return res + + if True: #TODO: resample? -> yes in order to concat diff freqs... + #meas_df_i = meas_df_i.resample('5Min', offset="0s").bfill() + #meas_df_i = meas_df_i.resample('5Min', offset="0s").interpolate('time') + meas_df_i = resample_meas(meas_df_i, freq="5Min") + + # logger.debug(meas_df_i) + meas_df_i = meas_df_i[["meas_value"]] + meas_df_i = meas_df_i.rename(columns={"meas_value": coa_ioa+"."+ge+"."+att}) #duplicates lines witout coa_ioa... (dups for two sides of lines) + dups[ge+"."+att].append(coa_ioa) + # logger.debug(meas_df_i) + + clean_meas_data.append(meas_df_i) + + clean_meas_df = pd.concat(clean_meas_data, axis=1) + # logger.debug(clean_meas_df) + clean_meas_df = clean_meas_df.fillna(clean_meas_df.mean(skipna=True)) + logger.info(clean_meas_df) + + # for k,v in dups.items(): + # if len(v) > 1: + # logger.debug("dup",k,v) + return clean_meas_df + + + +def get_estimations(file_df, drop_dups=False) -> pd.DataFrame: + est = file_df[(file_df["type"] == "controller-export") & (file_df["sub_type"] == "estimation")][["x_path", "file"]] + logger.info(est) + est_dict = defaultdict(lambda: list()) + for index, row, in tqdm(est.iterrows(), total = len(est), desc="Loading estimates"): + + regex_est = re.compile(r"^WALL\-[\-\dT\+]+__SIM\-([\-\dT\+]+)\.powerowl\.p\.gz$") + #WALL-2023-12-27T15-50-38-049544+00-00__SIM-2023-11-03T02-28-11-789274+00-00.powerowl.p.gz + result = regex_est.search(row["file"]) + if result: + with gzip.open(os.path.join(row["x_path"], row["file"]), "rb") as f: + est_data = pickle.load(f) + pd.reset_option('^display.', silent=True) + # logger.info(est_data) + # logger.info(est_data["res_line_est"]) + # logger.info(est_data["line"]) + time = "".join(result.group(1).rsplit("-", 1)) + time = pd.to_datetime(time, format="%Y-%m-%dT%H-%M-%S-%f%z") #2023-11-23T19-05-29-585438+00-00 + + for c in est_data.keys(): + if c.startswith("res_") and c.endswith("_est"): + c_name = c.replace("res_", "").replace("_est", "") + est_df = est_data[c] + if est_df.empty: continue + df_out = est_df.unstack().to_frame().T.sort_index(level=0) + df_out.columns = [f'{c_name}.{j}.{i}' for i, j in df_out.columns] + df_out.index = [time] + est_dict[c_name].append(df_out) + else: + logger.error(f"unk file {row['file']}") + est_l=[] + for k,v in est_dict.items(): + k_df = pd.concat(v) + est_l.append(k_df) + est_df = pd.concat(est_l, axis=1) + + if drop_dups: + logger.warning("----dropping dups-----") + logger.warning(est_df) + # est_df = est_df.round(5) + logger.warning(est_df.std()) + est_df = est_df.drop(est_df.std()[(est_df.std() == 0)].index, axis=1) + logger.warning(est_df) + logger.warning("----dropping dups-----") + + est_df = est_df.sort_index(ascending=True) + if False: #TODO: resample? 
+ est_df = est_df.resample('5Min', offset="0s").bfill() + logger.info(est_df) + return est_df + + \ No newline at end of file diff --git a/ids/regression.py b/ids/regression.py new file mode 100644 index 0000000000000000000000000000000000000000..051e54f3c660c1444e84080329a372cec18f25a7 --- /dev/null +++ b/ids/regression.py @@ -0,0 +1,877 @@ + +from collections import defaultdict +from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor +from copy import deepcopy +import gc +import hashlib +import itertools +import json +import multiprocessing +from multiprocessing.pool import ThreadPool +import os +import warnings +import pandas as pd +from sklearn.exceptions import ConvergenceWarning +from sklearn.metrics import mean_squared_error +from sklearn.model_selection import KFold +from sklearn.neural_network import MLPRegressor +from sklearn.svm import LinearSVR +from sklearn.tree import DecisionTreeRegressor +from tqdm import tqdm + +from sklearn.ensemble import GradientBoostingRegressor, HistGradientBoostingRegressor, RandomForestRegressor + +import re + +import logging + +logger = logging.getLogger("WATTSON_EV_IDS.Regression") + + +m_dict = multiprocessing.Manager() +REG_CACHE=m_dict.dict() +#REG_CACHE=dict() +with open("ids.conf", 'r') as f: + conf = json.load(f) + NUM_THREADS = conf["NUM_THREADS"] +#NUM_THREADS=3 + +IGNORE_MISS_COLS=True + +def get_regression_pred_conc_kfold(df_dict, TRAIN_START_DATE, ATK_START_DATE, reg_type, conf, actor, features, num_shifts=1, pbar=None, kfold_splits=[None], atk_df=None, recursive=False): + y_pred_folds=[] + for kfold in kfold_splits: + if recursive: + y_preds = get_regression_pred_rec(df_dict, TRAIN_START_DATE, ATK_START_DATE, reg_type, conf, num_shifts=num_shifts, pbar=pbar, kfold=kfold, atk_df=atk_df) + else: + y_preds = get_regression_pred_conc(df_dict, TRAIN_START_DATE, ATK_START_DATE, reg_type, conf, num_shifts=num_shifts, pbar=pbar, kfold=kfold, atk_df=atk_df) + + #pbar.update(len(y_preds)) + y_pred = pd.concat(y_preds, axis=1) + # print(y_pred) + # exit() + y_pred["conf"] = [conf] * len(y_pred) + y_pred["features"] = [features] * len(y_pred) + y_pred["reg"] = reg_type + y_pred["actor"] = actor + # print(y_pred) + # exit() + y_pred_folds.append(y_pred) + + y_pred_all = pd.concat(y_pred_folds) + y_pred_all.sort_index(ascending=True, inplace=True) + return y_pred_all + + +def get_regression_pred_rec(df_dict, TRAIN_START_DATE, ATK_START_DATE, reg_type, conf, num_shifts=1, pbar=None, kfold=None, atk_df=None): + if kfold is None: + y_preds = get_regression_pred_rec_pred(df_dict, TRAIN_START_DATE, ATK_START_DATE, reg_type, conf, num_shifts=num_shifts, pbar=pbar, kfold=kfold, atk_df=atk_df) + else: + y_preds = get_regression_pred_rec_kfold(df_dict, TRAIN_START_DATE, ATK_START_DATE, reg_type, conf, num_shifts=num_shifts, pbar=pbar, kfold=kfold, atk_df=atk_df) + return y_preds + +def get_regression_pred_rec_pred(df_dict, TRAIN_START_DATE, ATK_START_DATE, reg_type, conf, num_shifts=1, pbar=None, kfold=None, atk_df=None): + y_preds = [] + atk_df_do = atk_df.copy() + atk_df_s = atk_df.copy() + + for x in range(num_shifts): + # index, y_pred = get_regression_pred(df_dict, TRAIN_START_DATE, ATK_START_DATE, reg_type, conf, shift=x) + index, y_pred, used_shift = get_regression_pred(df_dict, TRAIN_START_DATE, ATK_START_DATE, reg_type, conf, 0, atk_df=atk_df_do) + y_pred = pd.DataFrame(y_pred, index=index) + y_pred[0] = y_pred[0].shift(used_shift) + y_pred.rename(columns={0: "prediction_"+str(x)}, inplace=True) + y_preds.append(y_pred) + + # 
print(atk_df.loc[ATK_START_DATE:][[c for c in atk_df.columns if c.startswith("charge_speed_lag_")]]) + # print(y_pred.shift(1)) + y_pred_s = y_pred.shift(x+1) + atk_df_s = atk_df_do.copy() + atk_df_s["charge_speed_lag_"+str(x+1)] = y_pred_s + atk_df_s[atk_df_s["charge_speed_lag_"+str(x+1)].isna()] = atk_df[atk_df_s["charge_speed_lag_"+str(x+1)].isna()] + atk_df_do=atk_df_s + if pbar is not None: + pbar.update(1) + # print(atk_df_s.loc[ATK_START_DATE:][[c for c in atk_df.columns if c.startswith("charge_speed_lag_")]]) + # exit() + + return y_preds + + +def get_regression_pred_rec_kfold(df_dict, TRAIN_START_DATE, ATK_START_DATE, reg_type, conf, num_shifts=1, pbar=None, kfold=None, atk_df=None): + y_preds = [] + + df_all = df_dict["cp_g_df"].loc[TRAIN_START_DATE:ATK_START_DATE] #.copy() + + feat_cols = df_dict["feat_cols"] + pred_col = df_dict["pred_col"] + + y_preds = dict() + y_preds_fold = defaultdict(lambda:list()) + kf = KFold(n_splits=kfold, random_state=None, shuffle=False) + for i, (train_index, test_index) in enumerate(kf.split(df_all)): + df = df_all.iloc[train_index].copy() + tbd_df = df_all.iloc[test_index].copy() + + for x in range(num_shifts): + try: + index, y_pred, used_shift = _do_get_regression_pred(df, tbd_df, pred_col, feat_cols, reg_type, conf, shift=0) + + y_pred = pd.DataFrame(y_pred, index=index) + y_pred.rename(columns={0: "prediction_"+str(x)}, inplace=True) + y_preds_fold[x+1].append(y_pred) + + # print(tbd_df.loc[TRAIN_START_DATE:ATK_START_DATE][[c for c in tbd_df.columns if c.startswith("charge_speed_lag_")]]) + # print(y_pred.shift(0)) + y_pred_s = y_pred.shift(x+1) + tbd_df_s = tbd_df.copy() + tbd_df_s["charge_speed_lag_"+str(x+1)] = y_pred_s + tbd_df_s[tbd_df_s["charge_speed_lag_"+str(x+1)].isna()] = tbd_df[tbd_df_s["charge_speed_lag_"+str(x+1)].isna()] + tbd_df=tbd_df_s + # print(tbd_df_s.loc[TRAIN_START_DATE:ATK_START_DATE][[c for c in tbd_df_s.columns if c.startswith("charge_speed_lag_")]]) + # exit() + if pbar is not None: + pbar.update(1) + + except ValueError as e: + logger.error(df) + logger.error(tbd_df) + logger.error(pred_col) + logger.error(feat_cols) + logger.error(e) + raise e + + for k,v in y_preds_fold.items(): + y_pred = pd.concat(v) + y_pred.sort_index(ascending=True, inplace=True) + y_preds[k] = y_pred + ret_y_pred = [v for v in y_preds.values()] + return ret_y_pred + +def get_regression_pred_conc(df_dict, TRAIN_START_DATE, ATK_START_DATE, reg_type, conf, num_shifts=1, pbar=None, kfold=None, atk_df=None): + nt=NUM_THREADS + if reg_type == "RandomForestRegressor" or reg_type == "MLPRegressor": + nt=1 + y_preds = [] + results=[] + + + with ThreadPoolExecutor(nt) as pool: + #with ProcessPoolExecutor(nt) as pool: + for x in range(num_shifts): + # index, y_pred = get_regression_pred(df_dict, TRAIN_START_DATE, ATK_START_DATE, reg_type, conf, shift=x) + if kfold is None: #no kfold for atk data sets + if nt == 1: + results.append(get_regression_pred(df_dict, TRAIN_START_DATE, ATK_START_DATE, reg_type, conf, x, atk_df=atk_df)) + else: + results.append(pool.submit(get_regression_pred, df_dict, TRAIN_START_DATE, ATK_START_DATE, reg_type, conf, x, atk_df)) + else: # yes kfold for training data sets + if nt == 1: + results.append(get_regression_kfold(df_dict, TRAIN_START_DATE, ATK_START_DATE, reg_type, conf, x, kfold, pbar)) + else: + results.append(pool.submit(get_regression_kfold, df_dict, TRAIN_START_DATE, ATK_START_DATE, reg_type, conf, x, kfold, pbar)) + + for r in results: + if nt == 1: + index, y_pred, used_shift = r + else: + index, y_pred, 
used_shift = r.result() + if pbar is not None and kfold is None: + pbar.update(1) + y_pred = pd.DataFrame(y_pred, index=index) + y_pred[0] = y_pred[0].shift(used_shift) + y_pred.rename(columns={0: "prediction_"+str(used_shift)}, inplace=True) + y_preds.append(y_pred) + + return y_preds + +def get_regression_kfold(df_dict, TRAIN_START_DATE, ATK_START_DATE, reg_type, conf, shift=0, nfolds=5, pbar=None): + df_all = df_dict["cp_g_df"].loc[TRAIN_START_DATE:ATK_START_DATE] #.copy() + + feat_cols = df_dict["feat_cols"] + pred_col = df_dict["pred_col"] + + y_preds = [] + kf = KFold(n_splits=nfolds, random_state=None, shuffle=False) + for i, (train_index, test_index) in enumerate(kf.split(df_all)): + df = df_all.iloc[train_index].copy() + tbd_df = df_all.iloc[test_index].copy() + + try: + index, y_pred, used_shift = _do_get_regression_pred(df, tbd_df, pred_col, feat_cols, reg_type, conf, shift=shift) + except ValueError as e: + logger.error(df) + logger.error(tbd_df) + logger.error(pred_col) + logger.error(feat_cols) + logger.error(e) + raise e + del df + del tbd_df + gc.collect() + y_pred = pd.DataFrame(y_pred, index=index) + y_preds.append(y_pred) + if pbar is not None: + pbar.update(1) + y_pred = pd.concat(y_preds) + y_pred.sort_index(ascending=True, inplace=True) + return y_pred.index, y_pred[0], shift + + +def get_regression_pred(df_dict, TRAIN_START_DATE, ATK_START_DATE, reg_type, conf, shift=0, atk_df=None): + df = df_dict["cp_g_df"].loc[TRAIN_START_DATE:ATK_START_DATE].copy() + feat_cols = df_dict["feat_cols"] + pred_col = df_dict["pred_col"] + if atk_df is None: + tbd_df = df_dict["cp_g_df"].loc[ATK_START_DATE:].copy() + else: + tbd_df = atk_df.loc[ATK_START_DATE:].copy() + mis_col = [c for c in df.columns if c not in tbd_df.columns and "date_exog" in c] + tbd_df[mis_col] = 0 + + miss_cols = [c for c in feat_cols if c not in tbd_df.columns] + if len(miss_cols) > 0: + if IGNORE_MISS_COLS: + cp_miss = set() + other_miss = list() + for c in miss_cols: + if c.startswith("CP_"): + cp_miss.add("_".join(c.split("_")[0:2])) + else: + other_miss.append(c) + if len(cp_miss)>1: + logger.warning(f"ignoring missing columns in atk datafrtame, cp_miss: {len(cp_miss)}") + logger.debug(f"ignoring missing columns in atk datafrtame, cp_miss: {cp_miss}") + if len(other_miss)>1: + logger.warning(f"ignoring missing columns in atk datafrtame, other_miss: {len(other_miss),other_miss}") + feat_cols = [c for c in feat_cols if c in tbd_df.columns] + # print(feat_cols) + # exit() + # print([c for c in tbd_df.columns]) + #test = tbd_df[feat_cols] + return _do_get_regression_pred(df, tbd_df, pred_col, feat_cols, reg_type, conf, shift=shift) + + # df[pred_col] = df[pred_col].shift(shift*-1) + # df.dropna(subset=[pred_col], inplace=True) + + # tbd_df[pred_col] = tbd_df[pred_col].shift(shift*-1) + # tbd_df.dropna(subset=[pred_col], inplace=True) + + # return tbd_df.index, _do_regression(df, feat_cols, pred_col, tbd_df, conf, reg_type=reg_type), shift + +def _do_get_regression_pred(df, tbd_df, pred_col, feat_cols, reg_type, conf, shift=0): + df[pred_col] = df[pred_col].shift(shift*-1) + df.dropna(subset=[pred_col], inplace=True) + + tbd_df[pred_col] = tbd_df[pred_col].shift(shift*-1) + tbd_df#.dropna(subset=[pred_col], inplace=True) + + return tbd_df.index, _do_regression(df, feat_cols, pred_col, tbd_df, conf, reg_type=reg_type), shift + + +def _do_regression(df, feat_cols, pred_col, tbd_df, conf, reg_type=""): + # print(df) + # print(feat_cols) + # exit() + if reg_type == "RandomForestRegressor": + 
#_do_RandomForestRegressor(df, feat_cols, pred_col, tbd_df, conf) + return _do_RandomForestRegressor(df, feat_cols, pred_col, tbd_df, conf) + elif reg_type == "DecisionTreeRegressor" or reg_type == "DecisionTreeRegressor2": + return _do_DecisionTreeRegressor(df, feat_cols, pred_col, tbd_df, conf) + elif reg_type == "GradientBoostingRegressor": + return _do_GradientBoostingRegressor(df, feat_cols, pred_col, tbd_df, conf) + elif reg_type == "HistGradientBoostingRegressor": + return _do_HistGradientBoostingRegressor(df, feat_cols, pred_col, tbd_df, conf) + elif reg_type == "LinearSVR": + return _do_LinearSVR(df, feat_cols, pred_col, tbd_df, conf) + elif reg_type == "MLPRegressor" or reg_type == "MLPRegressor2": # or reg_type == "test": + return _do_MLPRegressor(df, feat_cols, pred_col, tbd_df, conf) + else: + logger.error(f"unk reg_type {reg_type}") + return [] + +def optimize_regression(df_dict, TRAIN_START_DATE, VALIDATION_START_DATE, ATK_START_DATE, reg_type, group=""): + df = df_dict["cp_g_df"].loc[TRAIN_START_DATE:VALIDATION_START_DATE] + feat_cols = df_dict["feat_cols"] + # print([c for c in df.columns]) + # print(feat_cols) + # exit() + pred_col = df_dict["pred_col"] + tbd_df = df_dict["cp_g_df"].loc[VALIDATION_START_DATE:ATK_START_DATE] + + df = df[feat_cols+[pred_col]].copy() + tbd_df = tbd_df[feat_cols+[pred_col]].copy() + if len(tbd_df) > 0: + ret_ds = _optimize_regression(df, feat_cols, pred_col, tbd_df, group=group, reg_type=reg_type) + else: + ret_ds=[] + logger.warning(f"Empty ATK df. Skipping {group=}") + ret_df = pd.DataFrame(ret_ds) + if len(ret_df) > 0: + ret_df = ret_df.sort_values(by="mse", ascending=True) + return ret_df + +def _optimize_regression(df, feat_cols, pred_col, tbd_df, group="", reg_type=""): + if reg_type == "RandomForestRegressor": + return optimize_RandomForestRegressor(df, feat_cols, pred_col, tbd_df, group) + elif reg_type == "DecisionTreeRegressor" or reg_type == "DecisionTreeRegressor2": + return optimize_DecisionTreeRegressor(df, feat_cols, pred_col, tbd_df, group) + elif reg_type == "GradientBoostingRegressor": + return optimize_GradientBoostingRegressor(df, feat_cols, pred_col, tbd_df, group) + elif reg_type == "HistGradientBoostingRegressor": + return optimize_HistGradientBoostingRegressor(df, feat_cols, pred_col, tbd_df, group) + elif reg_type == "LinearSVR": + return optimize_LinearSVR(df, feat_cols, pred_col, tbd_df, group) + elif reg_type == "MLPRegressor" or reg_type == "MLPRegressor2" or reg_type == "test": + return optimize_MLPRegressor(df, feat_cols, pred_col, tbd_df, group) + else: + logger.error(f"unk reg_type {reg_type}") + return [] + + +def _do_DecisionTreeRegressor(df, feat_cols, pred_col, tbd_df, conf): + lags = conf[6] + #forecaster_reg = DecisionTreeRegressor(**{'criterion': conf[0], 'max_depth': conf[1], 'min_samples_split': conf[2], 'min_samples_leaf': conf[3], 'max_leaf_nodes': conf[4], 'max_features': conf[5], + # 'ccp_alpha': conf[7], 'splitter': conf[8], 'min_weight_fraction_leaf': conf[9], 'min_impurity_decrease': conf[10], 'random_state': 1234}) # + reg_id = get_reg_id("DecisionTreeRegressor", df, feat_cols, pred_col, conf) + if reg_id in REG_CACHE: + logger.debug(f"hit {len(REG_CACHE)}") + forecaster_reg = deepcopy(REG_CACHE[reg_id]) + else: + logger.debug(f"miss {len(REG_CACHE)}") + forecaster_reg = DecisionTreeRegressor(**{'criterion': conf[0], 'max_depth': conf[1], 'min_samples_split': conf[2], 'min_samples_leaf': conf[3], 'max_leaf_nodes': conf[4], 'max_features': conf[5], + 'ccp_alpha': conf[7], 'splitter': conf[8], 
'min_weight_fraction_leaf': conf[9], 'min_impurity_decrease': conf[10], 'random_state': 1234}) # + forecaster_reg.fit(X = df[feat_cols], y = df[pred_col]) + REG_CACHE[reg_id] = forecaster_reg + + # forecaster_reg.n_estimators += n_estimators + #forecaster_reg.fit(X = df[feat_cols], y = df[pred_col]) + y_pred = forecaster_reg.predict(tbd_df[feat_cols]) + return y_pred + +def _optimize_DecisionTreeRegressor(_df, feat_cols, pred_col, _tbd_df, conf): + df = _df#.copy() + tbd_df = _tbd_df#.copy() + y_pred = _do_DecisionTreeRegressor(df, feat_cols, pred_col, tbd_df, conf) + y_true = tbd_df[pred_col] + mse = mean_squared_error(y_true, y_pred) + rmse = mean_squared_error(y_true, y_pred, squared=False) + #print(f"{group=} {mse=} {rmse=} {conf=}") + return {"conf":conf,"mse":mse,"rmse":rmse} + + +def optimize_DecisionTreeRegressor(df, feat_cols, pred_col, tbd_df, group=""): + #criterion = ["squared_error", "absolute_error", "friedman_mse", "poisson"] + criterion = ["squared_error", "absolute_error", "friedman_mse"] + max_depth = [14, None] + min_samples_split = [2,3] + min_samples_leaf = [1,2] + max_leaf_nodes = [None, 50] + max_features = ["sqrt", None] + lags=[16,32] + ccp_alpha = [0.0] + splitter = ["best", "random"] + min_weight_fraction_leaf = [0.0] + min_impurity_decrease = [0.0] + + criterion = ["absolute_error", "friedman_mse"] + # max_depth = [None] + min_samples_split = [2,5] + min_samples_leaf = [1,3] + max_leaf_nodes = [None] + max_features = [None] + #ccp_alpha = [0.0, 0.5] + splitter = ["best"] + # min_weight_fraction_leaf = [0.0, 0.1, 0.2] + #min_impurity_decrease = [0.0, 1.0] + lags=[None] + + + c = list(itertools.product(criterion, max_depth, min_samples_split, min_samples_leaf, max_leaf_nodes, max_features, lags, ccp_alpha, splitter, min_weight_fraction_leaf, min_impurity_decrease)) + #c = [('absolute_error', 14, 2, 1, None, None, None, 0.0, 'random', 0.0, 0.0), ] + #c = [('friedman_mse', None, 5, 3, None, None, None, 0.5, 'random', 0.0, 1.0), ] + + # ret_ds = [] + # for conf in tqdm(c, desc="conf "+group, disable=False): + # ret_ds.append(_optimize_DecisionTreeRegressor(df, feat_cols, pred_col, tbd_df, conf)) + # return ret_ds + + ret_ds = [] + results=[] + print() + with ThreadPoolExecutor(NUM_THREADS) as pool: + # with ProcessPoolExecutor(NUM_THREADS) as pool: + for conf in tqdm(c, desc="conf "+group, disable=True): + results.append(pool.submit(_optimize_DecisionTreeRegressor, df, feat_cols, pred_col, tbd_df, conf)) + #_optimize_DecisionTreeRegressor(df, feat_cols, pred_col, tbd_df,conf, ret_ds) + for r in tqdm(results, desc="conf "+group): + ret_ds.append(r.result()) + return ret_ds + + +def _do_RandomForestRegressor(df, feat_cols, pred_col, tbd_df, conf): + lags = conf[6] + n_estimators=conf[10] + # forecaster_reg = RandomForestRegressor(**{'criterion': conf[0], 'max_depth': conf[1], 'min_samples_split': conf[2], 'min_samples_leaf': conf[3], 'max_leaf_nodes': conf[4], 'max_features': conf[5], + # 'oob_score': conf[7], 'ccp_alpha': conf[8], 'max_samples': conf[9], 'n_estimators': n_estimators, 'n_jobs': min(NUM_THREADS, n_estimators), 'random_state': 1234, + # 'warm_start': False}) # + reg_id = get_reg_id("RandomForestRegressor", df, feat_cols, pred_col, conf) + if reg_id in REG_CACHE: + logger.debug(f"hit {len(REG_CACHE)}") + forecaster_reg = deepcopy(REG_CACHE[reg_id]) + else: + logger.debug(f"miss {len(REG_CACHE)}") + forecaster_reg = RandomForestRegressor(**{'criterion': conf[0], 'max_depth': conf[1], 'min_samples_split': conf[2], 'min_samples_leaf': conf[3], 
'max_leaf_nodes': conf[4], 'max_features': conf[5], + 'oob_score': conf[7], 'ccp_alpha': conf[8], 'max_samples': conf[9], 'n_estimators': n_estimators, 'n_jobs': min(NUM_THREADS, n_estimators), 'random_state': 1234, + 'warm_start': False}) # + forecaster_reg.fit(X = df[feat_cols], y = df[pred_col]) + REG_CACHE[reg_id] = forecaster_reg + + # forecaster_reg.n_estimators += n_estimators + #forecaster_reg.fit(X = df[feat_cols], y = df[pred_col]) + + y_pred = forecaster_reg.predict(tbd_df[feat_cols]) + return y_pred + +def _optimize_RandomForestRegressor(_df, feat_cols, pred_col, _tbd_df, conf): + df = _df#.copy() + tbd_df = _tbd_df#.copy() + y_pred = _do_RandomForestRegressor(df, feat_cols, pred_col, tbd_df, conf) + y_true = tbd_df[pred_col] + mse = mean_squared_error(y_true, y_pred) + rmse = mean_squared_error(y_true, y_pred, squared=False) + #print(f"{group=} {mse=} {rmse=} {conf=}") + return {"conf":conf,"mse":mse,"rmse":rmse} + +def optimize_RandomForestRegressor(df, feat_cols, pred_col, tbd_df, group=""): + #criterion = ["squared_error", "absolute_error", "friedman_mse", "poisson"] + criterion = ["squared_error", "absolute_error"] + #max_depth = [None, 14] + max_depth = [None, 12, 14] + min_samples_split = [2,3] + min_samples_leaf = [1,2] + max_leaf_nodes = [None, 5] + max_features = ["sqrt", None] + lags=[16,32] + oob_score = [False] + ccp_alpha = [0.0] + max_samples = [None, 0.8] + _n_estimators=[20,50,100,150] + + criterion = ["squared_error"] + max_depth = [None, 12] + # min_samples_split = [3] + min_samples_split = [2] + min_samples_leaf = [1] + max_leaf_nodes = [None] + max_features = [None] + lags=[None] + # max_samples = [None] + # _n_estimators=[40,50,60] + # _n_estimators=[5,10,15,20,30] + _n_estimators=[50,100,150] + + + c = list(itertools.product(criterion, max_depth, min_samples_split, min_samples_leaf, max_leaf_nodes, max_features, lags, oob_score, ccp_alpha, max_samples, _n_estimators)) + #c = [('squared_error', 12, 3, 2, None, None, None, False, 0.0, None, 20), ] + + ret_ds = [] + print() + for conf in tqdm(c, desc="conf "+group, disable=(len(c) == 1)): + res = _optimize_RandomForestRegressor(df, feat_cols, pred_col, tbd_df, conf) + #print(f"{group=} {mse=} {rmse=} {conf=}") + ret_ds.append(res) + return ret_ds + + +def _do_HistGradientBoostingRegressor(df, feat_cols, pred_col, tbd_df, conf): + #c = list(itertools.product(loss, learning_rate, max_iter, min_samples_leaf, max_depth, max_leaf_nodes)) + + reg_id = get_reg_id("HistGradientBoostingRegressor", df, feat_cols, pred_col, conf) + if reg_id in REG_CACHE: + logger.debug(f"hit {len(REG_CACHE)}") + forecaster_reg = deepcopy(REG_CACHE[reg_id]) + else: + logger.debug(f"miss {len(REG_CACHE)}") + forecaster_reg = HistGradientBoostingRegressor(**{'loss': conf[0], 'learning_rate': conf[1], 'max_iter': conf[2], 'min_samples_leaf': conf[3], 'max_depth': conf[4], 'max_leaf_nodes': conf[5], + 'random_state': 1234}) # + forecaster_reg.fit(X = df[feat_cols], y = df[pred_col]) + REG_CACHE[reg_id] = forecaster_reg + + # forecaster_reg.n_estimators += n_estimators + #forecaster_reg.fit(X = df[feat_cols], y = df[pred_col]) + + y_pred = forecaster_reg.predict(tbd_df[feat_cols]) + return y_pred + +def _optimize_HistGradientBoostingRegressor(_df, feat_cols, pred_col, _tbd_df, conf): + df = _df#.copy() + tbd_df = _tbd_df#.copy() + y_pred = _do_HistGradientBoostingRegressor(df, feat_cols, pred_col, tbd_df, conf) + y_true = tbd_df[pred_col] + mse = mean_squared_error(y_true, y_pred) + rmse = mean_squared_error(y_true, y_pred, squared=False) + 
#print(f"{group=} {mse=} {rmse=} {conf=}") + return{"conf":conf,"mse":mse,"rmse":rmse} + + +def optimize_HistGradientBoostingRegressor(df, feat_cols, pred_col, tbd_df, group=""): + loss=['absolute_error'] + learning_rate=[0.1, 0.5] + max_iter = [100] + max_leaf_nodes = [50, 100, 150] + max_depth = [None, 15] + min_samples_leaf = [10,10,30] + + + c = list(itertools.product(loss, learning_rate, max_iter, min_samples_leaf, max_depth, max_leaf_nodes)) + + ret_ds = [] + results=[] + print() + with ThreadPoolExecutor(NUM_THREADS) as pool: + # with ProcessPoolExecutor(NUM_THREADS) as pool: + for conf in tqdm(c, desc="conf "+group, disable=True): + results.append(pool.submit(_optimize_HistGradientBoostingRegressor, df, feat_cols, pred_col, tbd_df, conf)) + #_optimize_DecisionTreeRegressor(df, feat_cols, pred_col, tbd_df,conf, ret_ds) + for r in tqdm(results, desc="conf "+group): + ret_ds.append(r.result()) + return ret_ds + + +def _do_GradientBoostingRegressor(df, feat_cols, pred_col, tbd_df, conf): + n_estimators=conf[2] + + reg_id = get_reg_id("GradientBoostingRegressor", df, feat_cols, pred_col, conf) + if reg_id in REG_CACHE: + logger.debug(f"hit {len(REG_CACHE)}") + forecaster_reg = deepcopy(REG_CACHE[reg_id]) + else: + logger.debug(f"miss {len(REG_CACHE)}") + if len(conf) >14: + forecaster_reg = GradientBoostingRegressor(**{'loss': conf[0], 'learning_rate': conf[1], 'subsample': conf[3], 'criterion': conf[4], 'min_samples_split': conf[5], 'min_samples_leaf': conf[6], + 'min_weight_fraction_leaf': conf[7], 'max_depth': conf[8], 'min_impurity_decrease': conf[9], 'max_features': conf[10], 'alpha': conf[11], + 'max_leaf_nodes': conf[12], 'ccp_alpha': conf[13], 'n_iter_no_change': conf[14], + 'n_estimators': n_estimators, 'random_state': 1234}) # + else: + forecaster_reg = GradientBoostingRegressor(**{'loss': conf[0], 'learning_rate': conf[1], 'subsample': conf[3], 'criterion': conf[4], 'min_samples_split': conf[5], 'min_samples_leaf': conf[6], + 'min_weight_fraction_leaf': conf[7], 'max_depth': conf[8], 'min_impurity_decrease': conf[9], 'max_features': conf[10], 'alpha': conf[11], + 'max_leaf_nodes': conf[12], 'ccp_alpha': conf[13], + 'n_estimators': n_estimators, 'random_state': 1234}) # + forecaster_reg.fit(X = df[feat_cols], y = df[pred_col]) + REG_CACHE[reg_id] = forecaster_reg + + # forecaster_reg.n_estimators += n_estimators + #forecaster_reg.fit(X = df[feat_cols], y = df[pred_col]) + + y_pred = forecaster_reg.predict(tbd_df[feat_cols]) + return y_pred + +def _optimize_GradientBoostingRegressor(_df, feat_cols, pred_col, _tbd_df, conf): + df = _df#.copy() + tbd_df = _tbd_df#.copy() + y_pred = _do_GradientBoostingRegressor(df, feat_cols, pred_col, tbd_df, conf) + y_true = tbd_df[pred_col] + mse = mean_squared_error(y_true, y_pred) + rmse = mean_squared_error(y_true, y_pred, squared=False) + #print(f"{group=} {mse=} {rmse=} {conf=}") + return{"conf":conf,"mse":mse,"rmse":rmse} + + +def optimize_GradientBoostingRegressor(df, feat_cols, pred_col, tbd_df, group=""): + loss=['squared_error', 'absolute_error', 'huber', 'quantile'] + learning_rate=[0.1, 0.2] + n_estimators=[25, 50, 75, 100] + subsample=[0.9, 1.0] + criterion = ["squared_error", "friedman_mse"] + min_samples_split = [2,8] + min_samples_leaf = [1,4] + min_weight_fraction_leaf = [0.0] + max_depth = [None, 14] + min_impurity_decrease = [0.0] + max_features = ["sqrt", None] + alpha=[0.9, 0.95] + max_leaf_nodes = [None, 10, 15] + ccp_alpha = [0.0] + + loss=['absolute_error'] + learning_rate=[0.1] + n_estimators=[50, 100, 150] + 
subsample=[1.0] + criterion = ["friedman_mse"] + max_depth = [None] + max_features = [None] + max_leaf_nodes = [None, 10] + alpha=[0.9] + + + c = list(itertools.product(loss, learning_rate, n_estimators, subsample, criterion, min_samples_split, min_samples_leaf, min_weight_fraction_leaf, max_depth, min_impurity_decrease, max_features, alpha, max_leaf_nodes, ccp_alpha)) + #c = [('absolute_error', 0.1, 100, 1.0, 'friedman_mse', 8, 4, 0.0, None, 0.0, None, 0.9, 10, 0.0), ] + + ret_ds = [] + results=[] + print() + with ThreadPoolExecutor(NUM_THREADS) as pool: + # with ProcessPoolExecutor(NUM_THREADS) as pool: + for conf in tqdm(c, desc="conf "+group, disable=True): + results.append(pool.submit(_optimize_GradientBoostingRegressor, df, feat_cols, pred_col, tbd_df, conf)) + #_optimize_DecisionTreeRegressor(df, feat_cols, pred_col, tbd_df,conf, ret_ds) + for r in tqdm(results, desc="conf "+group): + ret_ds.append(r.result()) + return ret_ds + + +def _do_LinearSVR(df, feat_cols, pred_col, tbd_df, conf): + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=ConvergenceWarning) + #forecaster_reg = LinearSVR(**{'epsilon': conf[0], 'tol': conf[1], 'C': conf[2], 'loss': conf[3], 'fit_intercept': conf[4], 'intercept_scaling': conf[5], 'dual': conf[6], 'max_iter': conf[7], 'random_state': 1234}) # + + # print(feat_cols, pred_col) + # exit() + reg_id = get_reg_id("LinearSVR", df, feat_cols, pred_col, conf) + if reg_id in REG_CACHE: + logger.debug(f"hit {len(REG_CACHE)}") + forecaster_reg = deepcopy(REG_CACHE[reg_id]) + else: + logger.debug(f"miss {len(REG_CACHE)}") + forecaster_reg = LinearSVR(**{'epsilon': conf[0], 'tol': conf[1], 'C': conf[2], 'loss': conf[3], 'fit_intercept': conf[4], 'intercept_scaling': conf[5], 'dual': conf[6], 'max_iter': conf[7], 'random_state': 1234}) # + forecaster_reg.fit(X = df[feat_cols], y = df[pred_col]) + REG_CACHE[reg_id] = forecaster_reg + + # forecaster_reg.n_estimators += n_estimators + #forecaster_reg.fit(X = df[feat_cols], y = df[pred_col]) + + y_pred = forecaster_reg.predict(tbd_df[feat_cols]) + return y_pred + +def _optimize_LinearSVR(_df, feat_cols, pred_col, _tbd_df, conf): + df = _df#.copy() + tbd_df = _tbd_df#.copy() + y_pred = _do_LinearSVR(df, feat_cols, pred_col, tbd_df, conf) + y_true = tbd_df[pred_col] + mse = mean_squared_error(y_true, y_pred) + rmse = mean_squared_error(y_true, y_pred, squared=False) + #print(f"{group=} {mse=} {rmse=} {conf=}") + return {"conf":conf,"mse":mse,"rmse":rmse} + + +def optimize_LinearSVR(df, feat_cols, pred_col, tbd_df, group=""): + epsilon=[0.0, 100, 1000] + tol=[0.0001, 0.01, 0.3] + C=[1.0, 0.1, 2.0] + loss=["epsilon_insensitive", "squared_epsilon_insensitive"] + fit_intercept=[True, False] + intercept_scaling=[1.0, 2.0] + dual=[True, False] + max_iter=[1000, 1200] + + C=[1.0, 2.0] + loss=["squared_epsilon_insensitive"] + + c = list(itertools.product(epsilon, tol, C, loss, fit_intercept, intercept_scaling, dual, max_iter)) + #c = [(1000, 0.0001, 2.0, 'squared_epsilon_insensitive', False, 1.0, False, 1200), ] + + ret_ds = [] + results=[] + print() + # for conf in tqdm(c, desc="conf "+group, disable=False): + # if conf[4] == False and conf[5] != 1.0: #not fit_intercept and yes scaling + # continue + # if conf[3] == 'epsilon_insensitive' and conf[6] == False: #Parameters: penalty='l2', loss='epsilon_insensitive', dual=False + # continue + # try: + # ret_v = _optimize_LinearSVR(df, feat_cols, pred_col, tbd_df, conf) + # ret_ds.append(ret_v) + # except Exception as e: + # print(e) + + with 
ThreadPoolExecutor(NUM_THREADS) as pool: + # with ProcessPoolExecutor(NUM_THREADS) as pool: + for conf in tqdm(c, desc="conf "+group, disable=True): + if conf[4] == False and conf[5] != 1.0: #not fit_intercept and yes scaling + continue + results.append(pool.submit(_optimize_LinearSVR, df, feat_cols, pred_col, tbd_df, conf)) + #_optimize_DecisionTreeRegressor(df, feat_cols, pred_col, tbd_df,conf, ret_ds) + for r in tqdm(results, desc="conf "+group): + ret_ds.append(r.result()) + return ret_ds + + +def _do_MLPRegressor(df, feat_cols, pred_col, tbd_df, conf): + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=ConvergenceWarning) + + reg_id = get_reg_id("MLPRegressor", df, feat_cols, pred_col, conf) + if reg_id in REG_CACHE: + logger.debug(f"hit {len(REG_CACHE)}") + forecaster_reg = deepcopy(REG_CACHE[reg_id]) + else: + logger.debug(f"miss {len(REG_CACHE)}") + forecaster_reg = MLPRegressor(**{'hidden_layer_sizes': conf[0], 'solver': conf[1], 'alpha': conf[2], 'max_iter': conf[3], 'activation': conf[4], 'random_state': 1234}) # + forecaster_reg.fit(X = df[feat_cols], y = df[pred_col]) + REG_CACHE[reg_id] = forecaster_reg + + y_pred = forecaster_reg.predict(tbd_df[feat_cols])# + return y_pred + +def _optimize_MLPRegressor(_df, feat_cols, pred_col, _tbd_df, conf): + df = _df#.copy() + tbd_df = _tbd_df#.copy() + y_pred = _do_MLPRegressor(df, feat_cols, pred_col, tbd_df, conf) + y_true = tbd_df[pred_col] + mse = mean_squared_error(y_true, y_pred) + rmse = mean_squared_error(y_true, y_pred, squared=False) + #print(f"{group=} {mse=} {rmse=} {conf=}") + return {"conf":conf,"mse":mse,"rmse":rmse} + +def optimize_MLPRegressor(df, feat_cols, pred_col, tbd_df, group=""): + hidden_layer_sizes = [(10, 10),(10, 10, 10),(100, ),(10, ),] #,(10, 10, 10, 10), (100, 100, 100), (140, ), (100, 100), + solver = ["lbfgs", 'adam'] + alpha = [0.0001, 0.00005, 0.0005, 0.00001] # + max_iter = [20000, 200000] + activation=["relu", 'logistic'] + + # hidden_layer_sizes = [(100, ),(90, ),(80, ),(110, ),(120, ),] #,(10, 10, 10, 10) + solver = ["lbfgs"] + # alpha = [0.00001,0.000015,0.000005] # + max_iter = [20000] + activation=["relu"] + + # hidden_layer_sizes = [(120, ),(140, ),(160, ),(180, ),(200, ),] #,(10, 10, 10, 10) + # alpha = [0.00001,] # + + # hidden_layer_sizes = [(140, ),] #,(10, 10, 10, 10) + # alpha = [0.00001,0.000011, 0.000009,] # + + c = list(itertools.product(hidden_layer_sizes, solver, alpha, max_iter, activation)) + #c = [((100,), 'lbfgs', 1e-05, 20000, 'relu'), ] 4.363318e+04 + #c = [((140,), 'lbfgs', 1e-05, 20000, 'relu'), ] 4.255844e+04 + + results = [] + ret_ds = [] + print() + +# + if True: + for conf in tqdm(c, desc="conf "+group, disable=(len(c) == 1)): + res = _optimize_MLPRegressor(df, feat_cols, pred_col, tbd_df, conf) + ret_ds.append(res) + else: #slower + logger.warning(f"mlp w/ {NUM_THREADS}") + #with ThreadPoolExecutor(NUM_THREADS) as pool: + with ProcessPoolExecutor(NUM_THREADS) as pool: + for conf in tqdm(c, desc="conf "+group, disable=True): + results.append(pool.submit(_optimize_MLPRegressor, df, feat_cols, pred_col, tbd_df, conf)) + #_optimize_DecisionTreeRegressor(df, feat_cols, pred_col, tbd_df,conf, ret_ds) + for r in tqdm(results, desc="conf "+group): + ret_ds.append(r.result()) + + + return ret_ds + + +def eval_tuning(x_path, full=False): + onlyfiles = [f for f in os.listdir(x_path) if os.path.isfile(os.path.join(x_path, f))] + + regex = re.compile(r"^(CP_\d+)_(\w+)\.csv\.gz$") + eval_tuning_files=defaultdict(lambda: dict()) + for f in tqdm(onlyfiles, 
desc="eval_tuning", disable=True): + #cpo_ocpp_data_CP_1_0bc01518df38656accbde55bfb0e38591.csv.gz + result = regex.search(f) + if result: + f_df = pd.read_csv(os.path.join(x_path, f), index_col=0) + eval_tuning_files[result.group(2)][result.group(1)] = f_df + else: + logger.info(f"unk file {os.path.join(x_path, f)}") + continue + + if full: + return eval_tuning_files + + ret_dict=dict() + for reg,group_d in eval_tuning_files.items(): + for group, df in group_d.items(): + df["rmse_n"] = (df["rmse"] - df["rmse"].min()) / (df["rmse"].max() - df["rmse"].min()) + #print(reg,group,df[df["rmse"] == df["rmse"].min()]) + sum_df = pd.concat([df for group, df in group_d.items()]) + sum_df = sum_df.groupby(by="conf").sum() / len(group_d) + best_df = sum_df[sum_df["rmse_n"] == sum_df["rmse_n"].min()] + best_df = sum_df[sum_df["rmse"] == sum_df["rmse"].min()] + # print(reg,best_df) + for i,r in best_df.iterrows(): + ret_dict[reg] = {"conf":i, "eval":r} + + return ret_dict + + +def get_eval_dicts(OutDataDIR): + ret_d=dict() + DIR=OutDataDIR+"/results/" + sub_folders = [name for name in os.listdir(DIR) if os.path.isdir(os.path.join(DIR, name)) if name.startswith("CPO_")] + for cpo in sub_folders: + ret_dict=eval_tuning(OutDataDIR+"/results/"+cpo) + ret_d[cpo] = ret_dict + + sub_folders = [name for name in os.listdir(DIR) if os.path.isdir(os.path.join(DIR, name)) if name.startswith("DSO")] + for dso in sub_folders: + ret_dict=eval_tuning(OutDataDIR+"/results/"+dso) + ret_d[dso] = ret_dict + + return ret_d + +def get_cp_group_eval_dicts(OutDataDIR): + ret_d=dict() + DIR=OutDataDIR+"/results/" + sub_folders = [name for name in os.listdir(DIR) if os.path.isdir(os.path.join(DIR, name)) if name.startswith("CPO_")] + for cpo in sub_folders: + ret_dict=eval_tuning(OutDataDIR+"/results/"+cpo, full=True) + ret_d[cpo] = ret_dict + + sub_folders = [name for name in os.listdir(DIR) if os.path.isdir(os.path.join(DIR, name)) if name.startswith("DSO")] + for dso in sub_folders: + ret_dict=eval_tuning(OutDataDIR+"/results/"+dso, full=True) + ret_d[dso] = ret_dict + + return ret_d + + +def get_prediction_dicts(OutDataDIR): + regex = re.compile(r"^(CP_\d+)_(\w+)_(\d+).csv.gz$") + + ret_d=defaultdict(lambda:defaultdict(lambda:list())) + DIR=OutDataDIR+"/predictions/" + sub_folders = [name for name in os.listdir(DIR) if os.path.isdir(os.path.join(DIR, name))] + for actor in sub_folders: + if ".bak" in actor: # or ".all" in actor: + logger.warning(f'skiping {OutDataDIR+"/predictions/"+actor}') + continue + if len(actor.split(".")) > 1: + features = actor.split(".")[1:] + else: + features=["all"] + logger.warning(f"assuming all features for {actor}") + x_path=OutDataDIR+"/predictions/"+actor + onlyfiles = [f for f in os.listdir(x_path) if os.path.isfile(os.path.join(x_path, f))] + for f in onlyfiles: + if ".bak" in f or ".test" in f: # or ".all" in actor: + logger.warning(f'skiping {OutDataDIR+"/predictions/"+actor} {f}') + continue + result = regex.search(f) + if result: + result.group(1) + ret_d[actor][result.group(1)].append({"reg":result.group(2),"shifts":result.group(3),"file":os.path.join(x_path, f),"features":features}) + else: + logger.error(f"unk file {f}") + continue + return ret_d + + +def get_reg_id(reg_n, df, feat_cols, pred_col, conf): + h = hashlib.new('sha256') + h.update(repr(conf).encode("utf-8")) + h.update(pd.util.hash_pandas_object(df, index=True).values) + h.update(repr(sorted(feat_cols)).encode("utf-8")) + h.update(repr(pred_col).encode("utf-8")) + ha = h.hexdigest() + reg_id = (reg_n, ha) + return reg_id 
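
For reference, `get_reg_id` above keys the shared `REG_CACHE` by a SHA-256 digest over the regressor name, the hyperparameter tuple, the hashed training frame, the sorted feature columns, and the prediction column, so repeated evaluations of an identical configuration during tuning reuse the already fitted model (via `deepcopy`) instead of refitting. A minimal self-contained sketch of that pattern follows; it uses a plain dict instead of the module's `multiprocessing.Manager` dict, and `cache_key`, `fit_or_reuse`, the two-element `conf` tuple and the synthetic data are illustrative names, not taken from the repository:

```
from copy import deepcopy
import hashlib

import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeRegressor

REG_CACHE = {}  # plain dict here; the module shares a multiprocessing.Manager dict instead


def cache_key(reg_name, df, feat_cols, pred_col, conf):
    # Digest everything that influences the fitted model.
    h = hashlib.sha256()
    h.update(repr(conf).encode("utf-8"))
    h.update(pd.util.hash_pandas_object(df, index=True).values)
    h.update(repr(sorted(feat_cols)).encode("utf-8"))
    h.update(repr(pred_col).encode("utf-8"))
    return (reg_name, h.hexdigest())


def fit_or_reuse(df, feat_cols, pred_col, conf):
    key = cache_key("DecisionTreeRegressor", df, feat_cols, pred_col, conf)
    if key in REG_CACHE:
        return deepcopy(REG_CACHE[key])  # cache hit: no refit
    reg = DecisionTreeRegressor(max_depth=conf[0], min_samples_leaf=conf[1], random_state=1234)
    reg.fit(df[feat_cols], df[pred_col])
    REG_CACHE[key] = reg
    return reg


# Illustrative usage with synthetic data:
rng = np.random.default_rng(0)
demo = pd.DataFrame({"x1": rng.normal(size=200), "x2": rng.normal(size=200)})
demo["y"] = 2 * demo["x1"] - demo["x2"]
model = fit_or_reuse(demo, ["x1", "x2"], "y", (5, 1))  # fits and caches
model = fit_or_reuse(demo, ["x1", "x2"], "y", (5, 1))  # reuses the cached fit
```
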
diff --git a/ids/run_ids.sh b/ids/run_ids.sh new file mode 100644 index 0000000000000000000000000000000000000000..f618064ac592d8747e0a5156cde5dc14a5840012 --- /dev/null +++ b/ids/run_ids.sh @@ -0,0 +1,108 @@ +#!/bin/bash + + +# nohup bash run_dso3.sh >> rlog_dso3.log 2>&1 & #wat +# tail -f rlog_dso3.log + +#TRAIN: + +python3 ids.py -c=load_data -v -d elaadnl +python3 ids.py -c=get_features_cpo -v -d elaadnl #only for OSCP fix +python3 ids.py -c=get_features_dso -l 96 -v -d elaadnl +#python3 ids.py -c=plot_atks -v -d elaadnl + + + +#Regression without grid features: +#RandomForestRegressor DecisionTreeRegressor GradientBoostingRegressor LinearSVR MLPRegressor HistGradientBoostingRegressor +python3 ids.py -c=train_reg_dso -r LinearSVR -f only_pred_lag no_norm no_date1 no_grid -v -d elaadnl # +python3 ids.py -c=train_reg_dso -r RandomForestRegressor -f only_pred_lag no_norm no_date1 no_grid -v -d elaadnl # +python3 ids.py -c=train_reg_dso -r HistGradientBoostingRegressor -f only_pred_lag no_norm no_date1 no_grid -v -d elaadnl # +#python3 ids.py -c=train_reg_dso -r GradientBoostingRegressor -f only_pred_lag no_norm no_date1 no_grid -v -d elaadnl # +python3 ids.py -c=train_reg_dso -r DecisionTreeRegressor -f only_pred_lag no_norm no_date1 no_grid -v -d elaadnl # +python3 ids.py -c=train_reg_dso -r MLPRegressor -f only_pred_lag no_norm no_date1 no_grid -v -d elaadnl # + +python3 ids.py -c=train_reg_dso -r LinearSVR -f only_pred_lag only_norm no_norm1 no_date1 no_grid -v -d elaadnl # +python3 ids.py -c=train_reg_dso -r RandomForestRegressor -f only_pred_lag only_norm no_norm1 no_date1 no_grid -v -d elaadnl # +python3 ids.py -c=train_reg_dso -r HistGradientBoostingRegressor -f only_pred_lag only_norm no_norm1 no_date1 no_grid -v -d elaadnl # +#python3 ids.py -c=train_reg_dso -r GradientBoostingRegressor -f only_pred_lag only_norm no_norm1 no_date1 no_grid -v -d elaadnl # +python3 ids.py -c=train_reg_dso -r DecisionTreeRegressor -f only_pred_lag only_norm no_norm1 no_date1 no_grid -v -d elaadnl # +python3 ids.py -c=train_reg_dso -r MLPRegressor -f only_pred_lag only_norm no_norm1 no_date1 no_grid -v -d elaadnl # + + +#Regression with grid features: +python3 ids.py -c=train_reg_dso -r LinearSVR -f only_pred_lag no_norm no_date1 no_grid_storage no_grid_sgen add_bus_relations -v -d elaadnl # +python3 ids.py -c=train_reg_dso -r RandomForestRegressor -f only_pred_lag no_norm no_date1 no_grid_storage no_grid_sgen add_bus_relations -v -d elaadnl # +python3 ids.py -c=train_reg_dso -r HistGradientBoostingRegressor -f only_pred_lag no_norm no_date1 no_grid_storage no_grid_sgen add_bus_relations -v -d elaadnl # +#python3 ids.py -c=train_reg_dso -r GradientBoostingRegressor -f only_pred_lag no_norm no_date1 no_grid_storage no_grid_sgen add_bus_relations -v -d elaadnl # +python3 ids.py -c=train_reg_dso -r DecisionTreeRegressor -f only_pred_lag no_norm no_date1 no_grid_storage no_grid_sgen add_bus_relations -v -d elaadnl # +python3 ids.py -c=train_reg_dso -r MLPRegressor -f only_pred_lag no_norm no_date1 no_grid_storage no_grid_sgen add_bus_relations -v -d elaadnl # + +python3 ids.py -c=train_reg_dso -r LinearSVR -f only_pred_lag only_norm no_norm1 no_date1 no_grid_storage no_grid_sgen add_bus_relations -v -d elaadnl # +python3 ids.py -c=train_reg_dso -r RandomForestRegressor -f only_pred_lag only_norm no_norm1 no_date1 no_grid_storage no_grid_sgen add_bus_relations -v -d elaadnl # +python3 ids.py -c=train_reg_dso -r HistGradientBoostingRegressor -f only_pred_lag only_norm no_norm1 no_date1 no_grid_storage 
no_grid_sgen add_bus_relations -v -d elaadnl # +#python3 ids.py -c=train_reg_dso -r GradientBoostingRegressor -f only_pred_lag only_norm no_norm1 no_date1 no_grid_storage no_grid_sgen add_bus_relations -v -d elaadnl # +python3 ids.py -c=train_reg_dso -r DecisionTreeRegressor -f only_pred_lag only_norm no_norm1 no_date1 no_grid_storage no_grid_sgen add_bus_relations -v -d elaadnl # +python3 ids.py -c=train_reg_dso -r MLPRegressor -f only_pred_lag only_norm no_norm1 no_date1 no_grid_storage no_grid_sgen add_bus_relations -v -d elaadnl # + + +#exit 0 + +python3 ids.py -c=do_pred_dso -s 48 -R -v -d elaadnl + +python3 ids.py -c=get_features_clf_dso -v -d elaadnl + + + +#Novelty Detection without grid features: +# LocalOutlierFactor OneClassSVM EllipticEnvelope IsolationForest +python3 ids.py -c=optimize_clf_dso -C IsolationForest -F no_reg only_norm set_13 -v -d elaadnl -O # +python3 ids.py -c=optimize_clf_dso -C LocalOutlierFactor -F no_reg only_norm set_13 -v -d elaadnl -O # +#python3 ids.py -c=optimize_clf_dso -C LocalOutlierFactor -F no_reg only_norm train_conta set_13 -v -d elaadnl -O # +python3 ids.py -c=optimize_clf_dso -C OneClassSVM -F no_reg only_norm set_13 -v -d elaadnl -O # +python3 ids.py -c=optimize_clf_dso -C EllipticEnvelope -F no_reg only_norm set_13 -v -d elaadnl -O # + +#Novelty Detection with basic grid features: +python3 ids.py -c=optimize_clf_dso -C IsolationForest -F no_reg only_norm set_3 -v -d elaadnl -O # +python3 ids.py -c=optimize_clf_dso -C LocalOutlierFactor -F no_reg only_norm set_3 -v -d elaadnl -O # +# python3 ids.py -c=optimize_clf_dso -C LocalOutlierFactor -F no_reg only_norm train_conta set_3 -v -d elaadnl -O # +python3 ids.py -c=optimize_clf_dso -C OneClassSVM -F no_reg only_norm set_3 -v -d elaadnl -O # +python3 ids.py -c=optimize_clf_dso -C EllipticEnvelope -F no_reg only_norm set_3 -v -d elaadnl -O # + +#Novelty Detection with advanced grid features: +python3 ids.py -c=optimize_clf_dso -C IsolationForest -F no_reg only_norm add_grid_load_expo_static_100 set_35 -v -d elaadnl -O # +python3 ids.py -c=optimize_clf_dso -C LocalOutlierFactor -F no_reg only_norm add_grid_load_expo_static_100 set_35 -v -d elaadnl -O # +#python3 ids.py -c=optimize_clf_dso -C LocalOutlierFactor -F no_reg only_norm train_conta add_grid_load_expo_static_100 set_35 -v -d elaadnl -O # +python3 ids.py -c=optimize_clf_dso -C OneClassSVM -F no_reg only_norm add_grid_load_expo_static_100 set_35 -v -d elaadnl -O # +python3 ids.py -c=optimize_clf_dso -C EllipticEnvelope -F no_reg only_norm add_grid_load_expo_static_100 set_35 -v -d elaadnl -O # + +python3 ids.py -c=optimize_clf_dso -C IsolationForest -F no_reg only_norm add_grid_load_expo_rnd_2 set_352 -v -d elaadnl -O +python3 ids.py -c=optimize_clf_dso -C IsolationForest -F no_reg only_norm add_grid_load_expo_rnd_4 set_352 -v -d elaadnl -O +python3 ids.py -c=optimize_clf_dso -C IsolationForest -F no_reg only_norm add_grid_load_expo_rnd_6 set_352 -v -d elaadnl -O +python3 ids.py -c=optimize_clf_dso -C IsolationForest -F no_reg only_norm add_grid_load_expo_rnd_8 set_352 -v -d elaadnl -O +python3 ids.py -c=optimize_clf_dso -C IsolationForest -F no_reg only_norm add_grid_load_expo_rnd_10 set_352 -v -d elaadnl -O + + +python3 ids.py -c=eval_tuning_clf -v -d elaadnl -O #1 2 +#python3 ids.py -c=print_eval_tuning_clf -v -d elaadnl + +#TEST +python3 ids.py -c=load_data -v -d elaadnl_atk #1 2 +python3 ids.py -c=get_features_cpo -v -d elaadnl_atk #only for OSCP fix +python3 ids.py -c=get_features_dso -l 96 -v -d elaadnl_atk #1 2 + +python3 ids.py 
-c=do_pred_dso -s 48 -R -v -d elaadnl_atk
+#python3 ids.py -c=do_pred_dso -s 48 -R -v -d elaadnl_atk
+
+python3 ids.py -c=get_features_clf_dso -v -d elaadnl_atk #1
+python3 ids.py -c=get_is_atk_dfs_dso -v -d elaadnl_atk #1
+
+python3 ids.py -c=do_clf_dso -v -d elaadnl_atk -O
+
+#Eval Figures:
+# python3 ids.py -c=eval_clf_results4 -v -d elaadnl_atk #Figure 6
+# python3 ids.py -c=eval_clf_results62 -v -d elaadnl_atk -e 0.40 #Figure 7 & 8
+# python3 ids.py -c=eval_clf_results63 -v -d elaadnl_atk -e 0 #Figure 8
+# python3 ids.py -c=eval_clf_results7 -v -d elaadnl_atk -e 0 #Figure 10
+
+exit 0
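
The `optimize_clf_dso` / `do_clf_dso` stages in `run_ids.sh` train novelty detectors (IsolationForest, LocalOutlierFactor, OneClassSVM, EllipticEnvelope) on features from the benign run (`elaadnl`) and then score the attack run (`elaadnl_atk`). A minimal sketch of that novelty-detection setting with scikit-learn's IsolationForest is shown below; the feature matrix is synthetic and only stands in for the repository's actual feature sets (e.g. `set_13`, `set_3`, `set_35`), and the column names are illustrative:

```
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest

# Synthetic stand-in features: observed charge speed plus a regression residual.
rng = np.random.default_rng(1234)
train = pd.DataFrame({"charge_speed": rng.normal(50.0, 5.0, 500),
                      "prediction_error": rng.normal(0.0, 1.0, 500)})
test = pd.DataFrame({"charge_speed": rng.normal(50.0, 5.0, 200),
                     "prediction_error": rng.normal(0.0, 1.0, 200)})
test.loc[150:, "prediction_error"] += 8.0  # injected deviation mimicking an attack window

clf = IsolationForest(n_estimators=100, contamination="auto", random_state=1234)
clf.fit(train)                # fit on benign data only (novelty setting)
labels = clf.predict(test)    # +1 = inlier, -1 = flagged anomaly
print(f"{(labels == -1).sum()} of {len(test)} windows flagged")
```
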