Commit 09d23c53 authored by Patrick Schlindwein's avatar Patrick Schlindwein
Browse files

Merge branch 'learn/#18-chatbot_quality' into 'master'

#18 chatbot quality

See merge request !65
parents 0b02cc47 799cc896
Pipeline #73948 passed with stages
in 2 minutes and 30 seconds
from vectorizerr import IVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from spacy_parser import spacy_tokenizer
import numpy as np
class BowVectorizer(IVectorizer):
    """Vectorizer producing L2-normalized bag-of-words vectors over
    uni- and bigrams, tokenized with spaCy."""

    id = "bow"

    def vectorize(self, strings: list[str]) -> list:
        """
        Turn a list of strings into a matrix of normalized BOW vectors.

        :param strings: The strings to vectorize
        :return: matrix with one L2-normalized row vector per input string
        """
        count_vectorizer = CountVectorizer(
            ngram_range=(1, 2),
            tokenizer=spacy_tokenizer
        )
        matrix = count_vectorizer.fit_transform(strings).toarray()
        # Normalize each row to unit length. Guard against all-zero rows
        # (e.g. a string whose tokens were all filtered out), which would
        # otherwise produce NaN via division by a zero norm.
        norms = np.linalg.norm(matrix, axis=1, keepdims=True)
        norms[norms == 0] = 1
        return matrix / norms
from measure import Measure
class d_bc(Measure):
    """Bray-Curtis dissimilarity between two equal-length vectors."""

    def measure(self, a, b) -> float:
        """
        Return sum_i |a_i - b_i| / sum_i |a_i + b_i|.

        :param a: First vector
        :param b: Second vector
        :return: Bray-Curtis dissimilarity
        """
        numerator = sum(abs(x - y) for x, y in zip(a, b))
        denominator = sum(abs(x + y) for x, y in zip(a, b))
        return numerator / denominator

    def get_id(self):
        """Return the identifier of this measure."""
        return "d_bc"
from measure import Measure
class d_can(Measure):
    """Canberra distance between two equal-length vectors."""

    def measure(self, a, b) -> float:
        """
        Return the Canberra distance sum_i |a_i - b_i| / (|a_i| + |b_i|).

        The previous implementation summed *signed* differences over a
        single global denominator, which is not the Canberra distance:
        it could return negative values and was antisymmetric
        (d(a, b) == -d(b, a)), violating the distance axioms. Terms where
        both components are zero contribute 0 (SciPy's convention), which
        also avoids division by zero.

        :param a: First vector
        :param b: Second vector
        :return: Canberra distance (always >= 0)
        """
        total = 0.0
        for x, y in zip(a, b):
            denominator = abs(x) + abs(y)
            if denominator != 0:
                total += abs(x - y) / denominator
        return total

    def get_id(self):
        """Return the identifier of this measure."""
        return "d_can"
from measure import Measure
class d_cheb(Measure):
    """Chebyshev (maximum-coordinate) distance."""

    def measure(self, a, b) -> float:
        """
        Return max_i |a_i - b_i|, or -1 for empty vectors (the original
        sentinel value).

        :param a: First vector
        :param b: Second vector
        :return: Chebyshev distance
        """
        return max((abs(x - y) for x, y in zip(a, b)), default=-1)

    def get_id(self):
        """Return the identifier of this measure."""
        return "d_cheb"
from measure import Measure
class d_city(Measure):
    """City-block (Manhattan) distance."""

    def measure(self, a, b) -> float:
        """
        Return sum_i |a_i - b_i|.

        :param a: First vector
        :param b: Second vector
        :return: city-block distance
        """
        return sum(abs(x - y) for x, y in zip(a, b))

    def get_id(self):
        """Return the identifier of this measure."""
        return "d_city"
from measure import Measure
class d_jac(Measure):
    """Jaccard distance on the binary support (non-zero positions) of
    two equal-length vectors."""

    def measure(self, a, b) -> float:
        """
        Return 1 - |intersection| / |union| over the non-zero positions.

        :param a: First vector
        :param b: Second vector
        :return: Jaccard distance in [0, 1]
        """
        union = 0
        intersection = 0
        for x, y in zip(a, b):
            if x != 0 or y != 0:
                union += 1
                # Intersection implies union, so only test it here.
                if x != 0 and y != 0:
                    intersection += 1
        # Two all-zero vectors have empty supports and are identical:
        # distance 0 by convention. This also fixes the ZeroDivisionError
        # the original raised for that input.
        if union == 0:
            return 0.0
        return (union - intersection) / union

    def get_id(self):
        """Return the identifier of this measure."""
        return "d_jac"
from measure import Measure
class d_lr(Measure):
    """Minkowski (L_r) distance with configurable order r."""

    def __init__(self, r):
        # Order of the norm: 1 = Manhattan, 2 = Euclidean, ...
        self.r = r

    def measure(self, a, b) -> float:
        """
        Return (sum_i |a_i - b_i|^r) ^ (1/r).

        :param a: First vector
        :param b: Second vector
        :return: L_r distance
        """
        total = sum(abs(x - y) ** self.r for x, y in zip(a, b))
        return total ** (1 / self.r)

    def get_id(self):
        """Return the identifier of this measure, e.g. "d_l2"."""
        return "d_l" + str(self.r)
from measure import Measure
import numpy as np
class d_tri(Measure):
    """Triangle discrimination distance: the square root of half the
    chi-squared-like sum of (a_i - b_i)^2 / (a_i + b_i)."""

    def measure(self, a, b) -> float:
        """
        Return sqrt(0.5 * sum_i (a_i - b_i)^2 / (a_i + b_i)), skipping
        terms whose denominator is zero.

        :param a: First vector
        :param b: Second vector
        :return: triangle discrimination distance
        """
        half_sum = 0.0
        for x, y in zip(a, b):
            denominator = x + y
            if denominator == 0:
                # Term contributes nothing; also avoids division by zero.
                continue
            half_sum += (x - y) ** 2 / denominator
        half_sum /= 2
        # Mixed-sign components can drive the accumulated value negative,
        # where sqrt would be undefined; clamp to 0 as the original did.
        if half_sum <= 0:
            return 0
        return np.sqrt(half_sum)

    def get_id(self):
        """Return the identifier of this measure."""
        return "d_tri"
from vectorizerr import IVectorizer
import numpy as np
def eff(strings: list[str], t: float, vectorizer: IVectorizer):
    """
    An efficient implementation of the general algorithm using cosine
    similarity. Assumes the vectorizer returns L2-normalized row vectors,
    so the dot product of two rows equals their cosine similarity.

    :param strings: The set of strings
    :param t: The similarity threshold
    :param vectorizer: Vectorization algorithm
    :return: array of [i, j] index pairs with similarity above t, and a
        constant 0 as a placeholder second element
    """
    vectors = vectorizer.vectorize(strings)
    similarity = vectors @ vectors.T
    # Subtracting the lower triangle (incl. diagonal) keeps only the
    # strict upper triangle, so every pair is reported once and
    # self-similarities are dropped.
    upper = similarity - np.tril(similarity)
    return np.argwhere(upper > t), 0
from vectorizerr import IVectorizer
from measure import Measure
def find_similar_strings(strings: list[str],
                         t: float,
                         vectorizer: IVectorizer,
                         measure: Measure,
                         comparer):
    """
    General algorithm to find similar strings in a set of strings.

    :param strings: The set of strings
    :param t: The threshold
    :param vectorizer: Vectorization algorithm
    :param measure: The measure used to compare the vectors
    :param comparer: Function comparing a measure value to the threshold
    :return: A list of [i, j] coordinate pairs (i > j) marking similar
        strings in the input set
    """
    vectors = vectorizer.vectorize(strings)
    matches = []
    # Visit every unordered pair exactly once (j < i).
    for i, left in enumerate(vectors):
        for j in range(i):
            if comparer(measure.measure(left, vectors[j]), t):
                matches.append([i, j])
    return matches
@misc{word2vec,
title = {Word vectors and semantic similarity},
howpublished = "\url{https://spacy.io/usage/linguistic-features##vectors-similarity}",
note = "[Online; accessed 25.05.2021]"
}
@misc{ngram,
title = {n-gram},
howpublished = "\url{https://en.wikipedia.org/wiki/N-gram}",
note = "[Online; accessed 25.05.2021]"
}
@misc{bow,
title = {Bag-of-words model},
howpublished = "\url{https://en.wikipedia.org/wiki/Bag-of-words_model}",
note = "[Online; accessed 25.05.2021]"
}
@misc{SciPy,
title = {SciPy distancefunctions},
howpublished = "\url{https://docs.scipy.org/doc/scipy/reference/spatial.distance.html}",
note = "[Online; accessed 25.05.2021]"
}
@misc{braycrutis,
title = {Bray-Curtis dissimilarity},
howpublished = "\url{https://en.wikipedia.org/wiki/Bray-Curtis_dissimilarity}",
note = "[Online; accessed 25.05.2021]"
}
@misc{canberra,
title={Canberra distance},
howpublished = "\url{https://en.wikipedia.org/wiki/Canberra_distance}",
note = "[Online; accessed 25.05.2021]"
}
@misc{chebyshev,
title={Chebyshev distance},
howpublished = "\url{https://en.wikipedia.org/wiki/Chebyshev_distance}",
note = "[Online; accessed 25.05.2021]"
}
@misc{cityblock,
title={City Block Distance},
howpublished = "\url{https://docs.tibco.com/pub/spotfire/6.5.2/doc/html/hc/hc_city_block_distance.htm}",
note = "[Online; accessed 25.05.2021]"
}
@misc{jaccard,
title={Jaccard index},
howpublished = "\url{https://en.wikipedia.org/wiki/Jaccard_index}",
note = "[Online; accessed 25.05.2021]"
}
@misc{cosinesim,
title={Cosine similarity},
howpublished = "\url{https://en.wikipedia.org/wiki/Cosine_similarity}",
note = "[Online; accessed 25.05.2021]"
}
@misc{angsim,
title={Angular similarity},
howpublished = "\url{https://en.wikipedia.org/wiki/Cosine_similarity##Angular_distance_and_similarity}",
note = "[Online; accessed 25.05.2021]"
}
@misc{pearson,
title={Pearson correlation coefficient},
howpublished = "\url{https://en.wikipedia.org/wiki/Pearson_correlation_coefficient}",
note = "[Online; accessed 25.05.2021]"
}
@article{fourmetrics,
author = "Richard Connor",
title = "{A Tale of Four Metrics}",
year = "2016",
}
@misc{contingencyTable,
title={Contingency table},
howpublished = "\url{https://www.statology.org/contingency-table-python/}"
}
@misc{chiquadrat,
title={Chi-Quadrat-Test},
howpublished = "\url{https://datatab.de/tutorial/chi-quadrat}"
}
@misc{ResearchDialogSkillAnalysis,
title={ResearchDialogSkillAnalysis},
howpublished = "\url{https://code.fbi.h-da.de/pse-trapp-public/intentfinder/-/blob/master/docs/ResearchDialogSkillAnalysis.pdf}"
}
\ No newline at end of file
from d_bc import d_bc
from d_can import d_can
from d_cheb import d_cheb
from d_city import d_city
from d_jac import d_jac
from d_lr import d_lr
from d_tri import d_tri
from s_cos import s_cos
from s_ang import s_ang
from s_pear import s_pear
from bow_vectorizer import BowVectorizer
from spacy_vectorizer import SpacyVectorizer
from general_algorithm import find_similar_strings
from efficient_algorithm import eff
from test_data import rki_faq
import time
"""
List of all vectorizers
"""
vectorizers = [
BowVectorizer(),
SpacyVectorizer()
]
"""
List of all similarity measures
"""
similarities = [
s_cos(),
s_ang(),
s_pear()
]
"""
List of all distance measures
"""
distances = [
d_bc(),
d_can(),
d_cheb(),
d_city(),
d_jac(),
d_lr(1),
d_lr(2),
d_lr(3),
d_tri()
]
"""
List of all test cases
"""
t_tests = {"bow": {}, "spacy": {}}
t_tests["bow"]["s_cos"] = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
t_tests["bow"]["s_ang"] = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
t_tests["bow"]["s_pear"] = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
t_tests["bow"]["d_l1"] = [1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 6, 8, 10]
t_tests["bow"]["d_l2"] = [0.5, 0.7, 0.9, 1.1, 1.3, 1.5, 1.7, 1.9, 2.1]
t_tests["bow"]["d_l3"] = [0.5, 0.7, 0.9, 1.1, 1.3, 1.5, 1.7, 1.9, 2.1]
t_tests["bow"]["d_bc"] = [0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5]
t_tests["bow"]["d_can"] = [-0.01, -0.05, -0.1, -0.15, -0.2, -0.3, -0.4, -0.5]
t_tests["bow"]["d_cheb"] = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
t_tests["bow"]["d_city"] = [1, 1.3, 1.5, 1.7, 1.9, 2.1, 2.3, 2.5, 2.7, 2.9]
t_tests["bow"]["d_tri"] = [1, 1.3, 1.5, 1.7, 1.9, 2.1, 2.3, 2.5, 2.7, 2.9]
t_tests["bow"]["d_jac"] = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
t_tests["spacy"]["s_cos"] = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
t_tests["spacy"]["s_ang"] = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
t_tests["spacy"]["s_pear"] = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
t_tests["spacy"]["d_l1"] = [1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 2, 4, 5, 6, 8, 10]
t_tests["spacy"]["d_l2"] = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
t_tests["spacy"]["d_l3"] = [0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5]
t_tests["spacy"]["d_bc"] = [0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5]
t_tests["spacy"]["d_can"] = [-1, -0.8, -0.6, -0.4, -0.2, 0, 0.2, 0.4]
t_tests["spacy"]["d_cheb"] = [0.01, 0.05, 0.08, 0.1, 0.15, 0.17]
t_tests["spacy"]["d_city"] = [1, 2, 3, 4, 5, 6, 7, 8, 9]
t_tests["spacy"]["d_tri"] = [1, 1.3, 1.5, 1.7, 1.9, 2.1, 2.3, 2.5, 2.7, 2.9]
t_tests["spacy"]["d_jac"] = []
f = open("results.csv", "w")
f.write("Vectorizer;Similarity;t;runtime\n")
def runtime_wrapper(function):
    """
    Run the given zero-argument function and measure its execution time.

    :param function: the function to run
    :return: tuple of (function result, runtime in seconds)
    """
    # perf_counter is monotonic and has the highest available resolution,
    # unlike time.time, which can jump (NTP adjustments) and is too coarse
    # for the sub-second runtimes measured here.
    start = time.perf_counter()
    result = function()
    end = time.perf_counter()
    return result, end - start
def run_test(dataset, vectorizer, measure, t, comparer):
    """
    Run a single similarity-search test and log its runtime.

    Prints every run to stdout; appends to the global results file `f`
    only when the run found between 6 and 7 similar pairs.

    :param dataset: The test data
    :param vectorizer: The test vectorizer
    :param measure: The test measure
    :param t: The test threshold
    :param comparer: The test comparer
    :return: None
    """
    result, elapsed = runtime_wrapper(
        lambda: find_similar_strings(dataset, t, vectorizer, measure,
                                     comparer))
    print(f"{vectorizer.id} {measure.get_id()} {t} {elapsed}")
    # Persist only configurations yielding a plausible number of matches.
    if 5 < len(result) < 8:
        f.write(f"{vectorizer.id};{measure.get_id()};{t};{elapsed}\n")
"""
Running all specified Tests
"""
for vectorizer in vectorizers:
for similarity in similarities:
for t in t_tests[vectorizer.id][similarity.get_id()]:
run_test(rki_faq, vectorizer, similarity, t, lambda a, b: (a > b))
for distance in distances:
for t in t_tests[vectorizer.id][distance.get_id()]:
run_test(rki_faq, vectorizer, distance, t, lambda a, b: (a < b))
"""
Run separate Test for efficient implementation
"""
_, runtime = runtime_wrapper(lambda: eff(rki_faq, 0.8, vectorizers[1]))
f.write("spacy;s_cos;0.8;" + str(runtime) + "\n")
from abc import ABC, abstractmethod
class Measure(ABC):
    """Abstract base class for all similarity and distance measures."""

    @abstractmethod
    def measure(self, a, b) -> float:
        """
        Measure the similarity or distance between a and b.

        :param a: First vector
        :param b: Second vector
        :return: Similarity or distance
        """
        pass

    @abstractmethod
    def get_id(self):
        """
        Get the identifier of this measure.

        :return: the id
        """
        raise NotImplementedError
\documentclass{article}
\usepackage{geometry}
\geometry{
left=2.5cm,
right=2.5cm,
top=2cm,
bottom=3cm
}
\usepackage[utf8]{inputenc}
\usepackage{graphicx}
\usepackage{amsmath}
\usepackage{listings}
\usepackage[utf8]{inputenc}
\usepackage{tabularx}
\usepackage{spverbatim}
\usepackage{amssymb}
\usepackage{hyperref}
\usepackage[utf8]{inputenc}
\usepackage[english]{babel}
\usepackage{makecell}
\usepackage[thinlines]{easytable}
\usepackage{csquotes}
\usepackage{biblatex}
\addbibresource{literatur.bib}
\newcommand{\norm}[1]{\left\lVert#1\right\rVert}
\author{Christof Walther, Areum Kim}
\title{Erhaltung der Qualität der Chatbot-Struktur}
\date{\today}
\begin{document}
\maketitle
\newpage
\tableofcontents
\newpage
\section{N-Gramm Model}
N-Gramme sind das Ergebnis der Zerlegung eines Textes in Fragmente. Der Text wird dabei zerlegt, und jeweils N aufeinanderfolgende Fragmente werden als N-Gramm zusammengefasst.\cite{ngram} Die Fragmente sind in unserem Fall Wörter (Token) die uns spaCy erzeugt.
\\
\\
\\
Beispiel: Mono- und Bigramme des Satzes "Hallo schöne Welt"\\
\\
Monogramme: Hallo, schöne, Welt\\
Bigramme: Hallo schöne, schöne Welt
\section{Vektorizierung von Strings}
Um Strings effizient miteinander vergleichen zu können, besteht die Möglichkeit diese als Vektor darzustellen.
\subsection{Bag-of-words (BOW) Model}
Ein Text wird als unstrukturierte Ansammlung von Wörtern betrachtet. In einem Vektor wird lediglich die Häufigkeit der aufgetretenen Wörter festgehalten.
\cite{bow}
\\
\\
Beispiel: \\
Satz 1: "Der Baum ist groß."\\
Satz 2: "Das Haus ist groß." \\
\\
Durch spaCy erhalten wir folgende Token:\\
"Der", "Baum", "ist", "groß"\\
"Das", "Haus", "ist", "groß"\\
\\
Aus diesen Tokens lässt sich nun folgende BOW-Darstellung ableiten:\\
\begin{table}[h]
\centering
\begin{tabular}{l|c|c|c|c|c|c}
\textbf{}& \textbf{Der} & \textbf{Das} &\textbf{Baum} &\textbf{Haus} &\textbf{ist} &\textbf{groß} \\
\hline
Satz 1 & 1 & 0 & 1 & 0 & 1 & 1\\
Satz 2 & 0 & 1 & 0 & 1 & 1 & 1\\
\end{tabular}
\end{table}
Wir erhalten letztendlich die Vektoren:
\begin{align*}
\text{Satz 1}\; {=}\ [1\; 0\; 1\; 0\; 1\; 1]^T\\
\text{Satz 2}\; {=}\ [0\; 1\; 0\; 1\; 1\; 1]^T\\
\end{align*}
Wichtig hierbei ist zu sehen, dass in den Ergebnissvektoren auch die Anzahl der Wörter vorkommen,
die nicht im Ursprungssatz vorhanden sind (0).
Dies ist notwendig um die Analyse der Vektoren von unterschiedlich langen Sätzen zu ermöglichen.\\
Des Weiteren ist es möglich, nicht nur Monogramme in diese Vektorform zu bringen, sondern auch beliebige N-Gramme, die aus den Tokens erzeugt werden.\\
\\
Beispielcode zum Erzeugen einer BOW Matrix aus mehreren Strings mittels sklearn:\\
\begin{spverbatim}
from sklearn.feature_extraction.text import CountVectorizer
A = ["string 1", "string 2", "string 3"]
count_vectorizer = CountVectorizer(
ngram_range=(1, 2),
tokenizer=spacy_tokenizer
)
features = count_vectorizer.fit_transform(A).toarray()
\end{spverbatim}
\subsection{word2vec}
spaCy kann aus Wörtern, durch das Verwenden eines neuronalen Netzwerks, Vektoren erzeugen.
Der große Vorteil gegenüber der BOW Vektoren ist, dass sie auch semantische Zusammenhänge darstellen
können, wie zum Beispiel Synonyme.
\cite{word2vec}
\\
\\Beispielcode zum Erzeugen einer word2vec Matrix aus mehreren Strings mittels spaCy:\\
\begin{spverbatim}
import spacy
parser = spacy.load('de_core_news_sm')
A = ["string 1", "string 2", "string 3"]
B = []
for sent in rki_faq:
doc = parser(sent)
B.append(doc.vector)
\end{spverbatim}
\section{Ähnlichkeitsanalyse}
\subsection{Ähnlichkeit zweier Vektoren}
Sei $I = \{v |\; v \in (\mathbb{R})^n, \; \norm{v}_2 = 1 \}$ eine Menge von normierten Vektoren. Ziel der Ähnlichkeitsanalyse ist es, die Ähnlichkeit bzw. Distanz zweier Vektoren $a,b \in I$ zueinander zu bestimmen. Hierfür gibt es verschiedene Distanz-/Ähnlichkeitsfunktionen.
Eine Funktion $s: I \times I \rightarrow \mathbb{R}$ heißt Ähnlichkeitsmaß/-funktion, falls für alle $a, b \in I$ gilt:
\begin{gather*}
s(a,b)\;=\;s(b,a)\\
s(a,b)\geq0\\
s(a,b) = 1 \iff a = b
\end{gather*}
Eine Funktion $d: I \times I \rightarrow \mathbb{R}$ heißt Distanzmaß/-funktion, falls für alle $a, b \in I$ gilt:
\begin{gather*}
d(a,b)\;=\;d(b,a)\\