# queries.py

import math
import pandas as pd

# Directory (relative path) that holds the .feather input files read by this script.
FEATHERS_DIR = "feathers"

import generate_graphs as gg
import helper_scripts.helper_functions as hf
import analyze_packets as ap


def main():
    """Read the main feather file and run the currently enabled analyses.

    Analyses are switched on and off by (un)commenting the calls below; the
    loaded frame is what the ``...(data)`` analyses receive.
    """
    feather_file = f"{FEATHERS_DIR}/data.feather"
    # feather_file = f"{FEATHERS_DIR}/data_run_20241028.feather"
    data = pd.read_feather(feather_file)

    # bandwith_calcs()
    loss_calculations()
    # static_scenario_statistical_analysis(data)
    # median_of_all_static_runs_per_algorithm(data)
    # stats_of_qtl95_of_packetloss(data)
    # error_count_and_rate(data)
    # measurements_with_negative_skewness(data)
    # iqr_kurtosis_of_delay_data(data)

    # print_kem_ids()


def bandwith_calcs(df=None):
    """Add the bandwidth-induced transmission delay for several link rates.

    For every bandwidth (given in Mbit/s) in the list below, a column
    ``t_delay_<bandwidth>`` with the delay in milliseconds is appended.

    Args:
        df: optional frame providing the columns ``cic``, ``sic`` and
            ``server_last_packet_length``. When omitted, the frame is loaded
            with ``get_cic_and_sic()``. A frame passed in is copied, so the
            caller's object is left untouched.

    Returns:
        The frame with the added ``t_delay_*`` columns (it is also printed).
    """
    df = get_cic_and_sic() if df is None else df.copy()

    def calc_delay_cuz_of_bandwidth_in_ms(cic, sic, lsf, bandwidth, packetlength=1242):
        """
        Calculates the delay in ms caused by bandwidth.

        Works on scalars as well as on whole pandas columns, because it only
        uses element-wise arithmetic.

        Args:
            cic: client initial count.
            sic: server initial count.
            lsf: last server ethernet frame length.
            bandwidth: bandwidth in bits per second.
            packetlength: length of a packet in bytes.

        Returns:
            delay in milliseconds.
        """
        MS_PER_SECOND = 1000
        # All client packets and all but the last server packet are counted
        # with the full packet length; the last server frame uses its own length.
        total_bytes = ((cic + (sic - 1)) * packetlength) + lsf
        return total_bytes * 8 / bandwidth * MS_PER_SECOND

    for bw in [0.1, 0.25, 0.5, 1, 3, 5, 500]:
        # Column-wise arithmetic instead of a row-wise apply: same values,
        # and it also works when the frame has no rows.
        df[f"t_delay_{bw}"] = calc_delay_cuz_of_bandwidth_in_ms(
            df["cic"], df["sic"], df["server_last_packet_length"], bw * 1000000
        )

    print(df)
    return df


def loss_calculations(df=None):
    """Compute loss-related handshake probabilities per row.

    Rows with ``cic`` or ``sic`` above 10 are dropped. The following columns
    are then appended, in this order:

    * ``sidc``: ``sic`` minus one when ``server_last_packet_length`` is below
      1200, otherwise ``sic``.
    * ``p_noLoss_<l>`` and ``p_noOneSec_<l>`` for the loss probabilities
      0.01, 0.05, 0.1 and 0.2.
    * ``l_for_noLoss_p50`` and ``l_for_noLoss_p95``.

    Args:
        df: optional frame providing the columns ``cic``, ``sic`` and
            ``server_last_packet_length``. When omitted, the frame is loaded
            with ``get_cic_and_sic()``. A frame passed in is not modified.

    Returns:
        The filtered frame with the added columns (it is also printed).
    """
    if df is None:
        df = get_cic_and_sic()

    # p_noOneSec does not make sense if cic or sic is bigger than 10 -> look thesis
    # The explicit copy makes the filtered frame independent of its source, so
    # the column assignments below neither warn nor write through to it.
    df = df.query("cic <= 10 and sic <= 10").copy()

    def calc_p_no_loss(cic, sic, loss):
        """
        Calculates the probability p_noLoss.

        Args:
            cic: client initial count.
            sic: server initial count.
            loss: loss probability.

        Returns:
            p_noLoss as defined in the thesis.
        """
        return (1 - loss) ** (cic + sic)

    def calc_p_no_one_sec_delay(cic, sidc, loss):
        """
        Calculates the probability p_noOneSec.

        Args:
            cic: client initial count.
            sidc: server initial decryptable count, without last packet of sic if length < 1200.
            loss: loss probability.

        Returns:
            p_noOneSec as defined in the thesis.
        """
        cic = int(cic)  # range() and math.comb() below need a plain integer
        term1 = (1 - loss) ** cic * (1 - loss ** (sidc + (cic - 1)))

        # range: upper is not inclusive, in math symbol SUM it is inclusive
        term2 = sum(
            math.comb(cic, i)
            * (1 - loss) ** (cic - i)
            * loss**i
            * (1 - loss ** (cic - i))
            for i in range(1, cic)
        )

        return term1 + term2

    def calc_l_for_no_loss_p(cic, sic, p):
        """
        Calculates the loss probability l at which p_noLoss equals p.

        Args:
            cic: client initial count.
            sic: server initial count.
            p: target probability p_noLoss.

        Returns:
            l as defined in the thesis.
        """
        return 1 - (p ** (1 / (cic + sic)))

    # A last server packet shorter than 1200 is not counted in sidc.
    short_last_packet = df["server_last_packet_length"] < 1200
    df["sidc"] = df["sic"].mask(short_last_packet, df["sic"] - 1)

    for loss in [0.01, 0.05, 0.10, 0.20]:
        # Pure arithmetic: evaluated on whole columns at once.
        df[f"p_noLoss_{loss}"] = calc_p_no_loss(df["cic"], df["sic"], loss)
        # Needs an integer loop bound per row, so it stays element-wise.
        df[f"p_noOneSec_{loss}"] = [
            calc_p_no_one_sec_delay(cic, sidc, loss)
            for cic, sidc in zip(df["cic"], df["sidc"])
        ]

    df["l_for_noLoss_p50"] = calc_l_for_no_loss_p(df["cic"], df["sic"], 0.50)
    df["l_for_noLoss_p95"] = calc_l_for_no_loss_p(df["cic"], df["sic"], 0.95)

    print(df)

    return df


def static_scenario_statistical_analysis(data):
    """Print how much the ``median`` column varies within each KEM algorithm.

    The frame is filtered with ``gg.filter_data`` (scenario "static", protocol
    "quic", security levels "secLevel1" and "secLevel1_hybrid"). For every
    distinct ``kem_alg`` the standard deviation of its ``median`` values is
    taken; the mean and the standard deviation of those per-algorithm values
    are printed.

    Args:
        data: frame accepted by ``gg.filter_data`` that provides the columns
            ``kem_alg`` and ``median``.
    """
    print("Static scenario statistical analysis")
    ldata = gg.filter_data(
        data,
        scenario="static",
        protocol="quic",
        sec_level=["secLevel1", "secLevel1_hybrid"],
    )

    kem_alg_column = ldata["kem_alg"]
    # A boolean mask is used instead of a string-built query so that algorithm
    # names containing quotes cannot break the selection.
    stdevs_of_medians = pd.Series(
        [
            ldata.loc[kem_alg_column == kem_alg_name, "median"].std()
            for kem_alg_name in kem_alg_column.unique()
        ],
        dtype="float64",
    )

    print("Mean of stdevs of medians")
    print(stdevs_of_medians.mean())
    print("Stdev of stdevs of medians")
    print(stdevs_of_medians.std())


def median_of_all_static_runs_per_algorithm(data):
    """Print, per KEM algorithm, the median over all pooled measurements.

    The frame is filtered with ``gg.filter_data`` (scenario "static", protocol
    "quic"). For every distinct ``kem_alg`` the iterables stored in the
    ``measurements`` column are concatenated and the median of the pooled
    values is printed.

    Args:
        data: frame accepted by ``gg.filter_data`` that provides the columns
            ``kem_alg`` and ``measurements``.
    """
    print("Median of all static runs per algorithm")
    ldata = gg.filter_data(data, scenario="static", protocol="quic")

    # compound per algorithm, then take the median of all
    for kem_alg_name in ldata["kem_alg"].unique():
        # Boolean mask instead of a string-built query (safe for any name),
        # and only the one needed column is iterated.
        per_row_measurements = ldata.loc[
            ldata["kem_alg"] == kem_alg_name, "measurements"
        ]
        pooled = [
            value
            for measurements in per_row_measurements
            for value in measurements
        ]
        print(f"Median of {kem_alg_name}")
        print(pd.Series(pooled).median())
        print()


def stats_of_qtl95_of_packetloss(data):
    """Print the packet-loss QUIC rows of two selected KEM algorithms.

    The frame is filtered with ``gg.filter_data`` (scenario "packetloss",
    protocol "quic"), restricted to the selected algorithms, and printed
    without the columns listed in ``hidden_columns``.

    Args:
        data: frame accepted by ``gg.filter_data`` that provides ``kem_alg``
            and the columns that are dropped before printing.
    """
    print("Stats of qtl95")

    packetloss_quic = gg.filter_data(data, scenario="packetloss", protocol="quic")

    selected_algs = ["x25519", "frodo640aes"]
    # selected_algs = ["mlkem1024", "frodo1344aes"]
    selection = packetloss_quic[packetloss_quic["kem_alg"].isin(selected_algs)]

    print("Showing data of packetloss quic")
    hidden_columns = [
        "scenario",
        "protocol",
        "sec_level",
        "cli_pkt_loss",
        "cli_delay",
        "cli_rate",
        "measurements",
    ]
    print(selection.drop(columns=hidden_columns))


# Old run (without the bigger crypto buffer): grep reports 83996 CRYPTO_BUFFER_EXCEEDED errors, while the total error count is only slightly higher at 84186.
# New run (with the fix): 187.0 other errors, probably from the server side, because the client reports 'Shutdown before completion' while waiting for the handshake to complete -> b'808B57C2E1760000:error:0A0000CF:SSL routines:quic_do_handshake:protocol is shutdown:ssl/quic/quic_impl.c:1717:\n'
def error_count_and_rate(data, high_error_threshold=12):
    """Print an overview of the recorded errors.

    Printed, in order: the number of rows, the sum of ``error_count``, and,
    for the rows with ``error_count > 0``, their number, a ``describe()`` of
    ``error_count`` and the row counts per scenario, per protocol, per
    scenario/protocol pair and per ``kem_alg``. Finally the rows whose
    ``error_count`` exceeds ``high_error_threshold`` are listed.

    Args:
        data: frame with the columns ``scenario``, ``protocol``, ``sec_level``,
            ``kem_alg``, ``error_count`` and ``error_rate``.
        high_error_threshold: rows with an ``error_count`` strictly above this
            value are listed at the end. The printed header is derived from
            it, so header and filter cannot disagree.
    """
    print("Error count and rate")
    print("Total index length")
    print(len(data.index))
    print("Total error count")
    print(data["error_count"].sum())

    with_errors = data.query("error_count > 0")
    print("Total index length with error count > 0")
    print(len(with_errors.index))
    print("Data with error count > 0 describe error_count")
    print(with_errors["error_count"].describe())
    print("How much each scenario has error count > 0")
    print(with_errors["scenario"].value_counts())
    print("How much each protocol has error count > 0")
    print(with_errors["protocol"].value_counts())
    print("How much each scenario protocol combinanation has error count > 0")
    print(with_errors.groupby(["scenario", "protocol"]).size())
    print("How much each kem_alg has error count > 0")
    print(with_errors["kem_alg"].value_counts())

    print(f"With error count > {high_error_threshold}")
    many_errors = with_errors[with_errors["error_count"] > high_error_threshold]
    print(
        many_errors[
            [
                "scenario",
                "protocol",
                "sec_level",
                "kem_alg",
                "error_count",
                "error_rate",
            ]
        ]
    )


def measurements_with_negative_skewness(data):
    """Print skewness statistics and where negative skewness occurs.

    Printed: a ``describe()`` of the ``skewness`` column, the number of rows
    with ``skewness < 0`` and, for those rows, the count per ``scenario``.

    Args:
        data: frame with the columns ``skewness`` and ``scenario``.
    """
    print("Measurements with negative skewness")

    skewness = data["skewness"]
    print("Skewness of data")
    print(skewness.describe())

    negatively_skewed = data[skewness < 0]
    print("Amount of data with negative skewness")
    print(len(negatively_skewed.index))

    # Count of negatively skewed measurements per scenario.
    # Observed: mostly reorder and jitter, rate a bit.
    print("Per scenario numbers of measurements with negative skewness")
    print(negatively_skewed["scenario"].value_counts())


def iqr_kurtosis_of_delay_data(data):
    """Print ``describe()`` of ``iqr`` and ``kurtosis``, overall and for delay.

    The first table covers every row, the second only the rows whose
    ``scenario`` equals "delay".

    Args:
        data: frame with the columns ``iqr``, ``kurtosis`` and ``scenario``.
    """
    print("Kurtosis of data, Fisher's definition, so 0 is normal distribution")
    stat_columns = ["iqr", "kurtosis"]
    print(data[stat_columns].describe())

    delay_rows = data[data["scenario"] == "delay"]
    print(delay_rows[stat_columns].describe())


def print_kem_ids():
    """Print whatever ``hf.get_kem_ids()`` returns."""
    print(hf.get_kem_ids())


def get_cic_and_sic():
    """Load the UDP packet data and derive the ``cic`` and ``sic`` columns.

    Reads ``udp_packets.feather`` from ``FEATHERS_DIR``, passes it through
    ``ap.get_packets_sent_by_node`` and reshapes the result: the two length
    columns are removed and the two ``*_sent_packets_with_crypto_count``
    columns are replaced by ``cic`` and ``sic``, appended at the end.

    Returns:
        The reshaped frame.
    """
    # Built from FEATHERS_DIR so the location is defined in a single place.
    udp_packets_df = pd.read_feather(f"{FEATHERS_DIR}/udp_packets.feather")
    packets_per_node = ap.get_packets_sent_by_node(udp_packets_df)

    # NOTE(review): this banner is printed for every caller, including
    # bandwith_calcs(); the text is kept as-is to leave the output unchanged.
    print("\n\n Loss calculations")

    df = packets_per_node.drop(columns=["length_public_key", "length_ciphertext"])
    # NOTE(review): one client packet is excluded from cic; the reason is not
    # visible in this file - confirm against the thesis.
    df["cic"] = df["client_sent_packets_with_crypto_count"] - 1
    df["sic"] = df["server_sent_packets_with_crypto_count"]
    return df.drop(
        columns=[
            "client_sent_packets_with_crypto_count",
            "server_sent_packets_with_crypto_count",
        ]
    )


# Run the analyses only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()