Commit c780a4a3 authored by Christof Walther's avatar Christof Walther Committed by Patrick Schlindwein
Browse files

#60 refactor entire python codebase to get rid of code smells

parent 9581e9a2
from app.similarity.intent_cluster import IntentCluster
from app.similarity.intent import Intent from app.similarity.intent import Intent
from app.similarity.intent_cluster import IntentCluster
# very primitive clustering algrotihm for intents def find_intent_clusters(intent_list: list[Intent], t: float,
iterations: int) -> list:
"""
This method finds clusters in a list of Intents. The distance is calculated
by their cosine similarity
:param iterations: The number of Iterations.
:param t: [0, 1] The max cluster distance
:param intent_list: A list of Intents
:returns: cluster_list: A list of Intent Clusters
:rtype: list
"""
def find_intent_clusters(intent_list: list[Intent], t: float,
iterations: int) -> float:
# init # init
cluster_list = [] cluster_list = []
cluster_index = 0 cluster_index = 0
...@@ -23,7 +31,8 @@ def find_intent_clusters(intent_list: list[Intent], t: float, ...@@ -23,7 +31,8 @@ def find_intent_clusters(intent_list: list[Intent], t: float,
current_cluster.recalculate_center() current_cluster.recalculate_center()
added_to_cluster = True added_to_cluster = True
# if not then become a cluster center # if current_intent is not close to a cluster,
# become a cluster center
if not added_to_cluster: if not added_to_cluster:
cluster_list.append(IntentCluster( cluster_list.append(IntentCluster(
str(cluster_index), current_intent)) str(cluster_index), current_intent))
......
...@@ -12,12 +12,27 @@ class Intent: ...@@ -12,12 +12,27 @@ class Intent:
self.vector = self.n.vector self.vector = self.n.vector
self.vector_norm = self.n.vector_norm self.vector_norm = self.n.vector_norm
def similarity_intent(self, intent) -> float:
    """
    Calculate the cosine similarity between this Intent and another Intent.

    :param intent: The other Intent to compare against
    :returns: similarity: [0, 1] The cosine similarity of the two intents
    :rtype: float
    """
    # delegate to spaCy's Doc.similarity on the underlying documents
    return self.n.similarity(intent.n)
# calculating the smiliarity of the Intent to another vector # calculating the similarity of the Intent to another vector
def similarity(self, vector) -> float: def similarity(self, vector) -> float:
"""
This method calculates the cosine similarity to another Vector
:param vector: A vectorized Intent
:returns: similarity: [0, 1] The cosine similarity
:rtype: float
"""
norm = 0 norm = 0
for x in vector: for x in vector:
norm += x ** 2 norm += x ** 2
......
...@@ -4,12 +4,16 @@ from app.similarity.intent import Intent ...@@ -4,12 +4,16 @@ from app.similarity.intent import Intent
class IntentCluster(list): class IntentCluster(list):
def __init__(self, label: str, intent: Intent):
    """
    Create a cluster that initially contains a single intent.

    :param label: the identifier of this cluster
    :param intent: the first member; its vector becomes the initial center
    """
    # initialise the underlying list before appending any members
    super().__init__()
    self.center = intent.vector
    self.append(intent)
    self.label = label
# setting the center of the cluster equal to the median of all elements
def recalculate_center(self): def recalculate_center(self):
"""
This method calculates the center of the cluster and updates the center
member variable
"""
for i in range(0, len(self.center)): for i in range(0, len(self.center)):
tmp_list = [] tmp_list = []
for element in self: for element in self:
......
...@@ -106,28 +106,38 @@ class TestSimilarityAnalysis(TestCase): ...@@ -106,28 +106,38 @@ class TestSimilarityAnalysis(TestCase):
"Wo gibt es weitere Informationen?" "Wo gibt es weitere Informationen?"
] ]
def test_similarity(self):
    """
    In this case two similar Intents are created and the
    Intent.similarity_intent() method is tested.
    """
    # generate 2 similar ids from neighbouring FAQ questions
    ih = IntentHandler(TestSimilarityAnalysis.rki_faq[53])
    ih2 = IntentHandler(TestSimilarityAnalysis.rki_faq[54])
    id1 = ih.generate_intent_id(3).replace("_", " ")
    id2 = ih2.generate_intent_id(3).replace("_", " ")
    # init id vectors
    intent1 = Intent(id1)
    intent2 = Intent(id2)
    # check if they are similar
    self.assertGreaterEqual(intent1.similarity_intent(intent2), 0.85)
def test_cluster(self): def test_cluster(self):
"""
In this case the find_intent_clusters() method is tested.
"""
# init
intent_list = [] intent_list = []
# generate ids for all questions and init the id vectors # generate ids for all questions and init the id vectors
for q in TestSimilarityAnalysis.rki_faq: for q in TestSimilarityAnalysis.rki_faq:
ih = IntentHandler(q) ih = IntentHandler(q)
id = ih.generate_intent_id(3).replace("_", " ") id1 = ih.generate_intent_id(3).replace("_", " ")
intent = Intent(id) intent = Intent(id1)
if intent.vector_norm > 0: if intent.vector_norm > 0:
intent_list.append(intent) intent_list.append(intent)
......
from fastapi import FastAPI, Request from fastapi import FastAPI
from app.summary.summary_word_embedding import WordEmbeddingSummarizer from app.summary.summary_word_embedding import WordEmbeddingSummarizer
from app.utilities import generator from app.utilities import generator
...@@ -14,6 +14,12 @@ app = FastAPI( ...@@ -14,6 +14,12 @@ app = FastAPI(
" IntentFinder" " IntentFinder"
) )
"""
In this array, every summarization strategy should be instantiated exactly
once.
Note that every summarization strategy should implement the ISummaryStrategy
interface.
"""
strategies = [ strategies = [
SimpleSpacySummarizer(), SimpleSpacySummarizer(),
SentenceEmbeddingSummarizer(), SentenceEmbeddingSummarizer(),
...@@ -28,20 +34,32 @@ async def root(): ...@@ -28,20 +34,32 @@ async def root():
@app.get("/strategies")
async def api_strategies():
    """
    This function will generate a list of ids from all summarization strategies
    :return: The list of all summarization-strategy-ids in JSON format
    """
    # comprehension instead of a manual append loop
    return [strategy.id for strategy in strategies]
@app.post("/summarize/{strategy_id}", summary="Generate a summary of the given" @app.get("/summarize/{strategy_id}", summary="Generate a summary of the given"
" text.") " text.")
async def summarize(strategy_id: str, req: Request): async def summarize(strategy_id: str, text: str):
"""
This function will summarize a given text with a given summarization
strategy
:param strategy_id: The id of the strategy
:param text: The text to be summarized
:return: The summary, strategy and quality of the summary in JSON format
"""
for strategy in strategies: for strategy in strategies:
if strategy.id == strategy_id: if strategy.id == strategy_id:
quality = 0.5 quality = 0.5
req_json = await req.json() summary = strategy.summarize(text)
summary = strategy.summarize(req_json["text"])
return {"strategy": strategy_id, "quality": quality, return {"strategy": strategy_id, "quality": quality,
"summary": summary} "summary": summary}
...@@ -50,9 +68,9 @@ async def summarize(strategy_id: str, req: Request): ...@@ -50,9 +68,9 @@ async def summarize(strategy_id: str, req: Request):
@app.post("/intentid", summary="Generate an intent id from a given intent"
                               " text")
async def generate_intent_id(intent: str, max_tokens: int):
    """Generate a human readable reduced and yet expressive id for an intent
    based on the passed intent text.

    :param intent: the intent text to derive an id from
    :param max_tokens: maximum number of tokens in the generated id
    :return: the generated intent id
    """
    handler = generator.IntentHandler(intent)
    return handler.generate_intent_id(max_tokens)
...@@ -22,7 +22,7 @@ class SimpleSpacySummarizer(ISummaryStrategy): ...@@ -22,7 +22,7 @@ class SimpleSpacySummarizer(ISummaryStrategy):
word_frequency = self.divide_into_tokens(doc) word_frequency = self.divide_into_tokens(doc)
# now rank the sentences based on the word frequency # now rank the sentences based on the word frequency
sent_rank = self.rank_the_sentences(doc, word_frequency, text) sent_rank = self.rank_the_sentences(doc, word_frequency)
# get frequency of words # get frequency of words
top_sentences = (sorted(sent_rank.values())[::-1]) top_sentences = (sorted(sent_rank.values())[::-1])
...@@ -32,7 +32,16 @@ class SimpleSpacySummarizer(ISummaryStrategy): ...@@ -32,7 +32,16 @@ class SimpleSpacySummarizer(ISummaryStrategy):
result_text = self.create_the_summary(sent_rank, top_sent) result_text = self.create_the_summary(sent_rank, top_sent)
return result_text return result_text
def divide_into_tokens(self, doc): @staticmethod
def divide_into_tokens(doc):
"""
This method generates a word frequency dict from a given document
:param doc: document created by spacy
:returns: word_frequency: a dict containing all words and the number of
their occurrences
:rtype: dict
"""
corpus = [sent.text.lower() for sent in doc.sents] corpus = [sent.text.lower() for sent in doc.sents]
cv = CountVectorizer(stop_words=list(STOP_WORDS)) cv = CountVectorizer(stop_words=list(STOP_WORDS))
cv_fit = cv.fit_transform(corpus) cv_fit = cv.fit_transform(corpus)
...@@ -42,16 +51,20 @@ class SimpleSpacySummarizer(ISummaryStrategy): ...@@ -42,16 +51,20 @@ class SimpleSpacySummarizer(ISummaryStrategy):
return word_frequency return word_frequency
def get_frequency_of_words(self, word_frequency): @staticmethod
# get high frequency words def rank_the_sentences(doc, word_frequency):
val = sorted(word_frequency.values()) """
This method creates a sentences ranking based on the word frequency
list created by divide_into_tokens
# gets relative frequency of words :param doc: document created by spacy
higher_frequency = val[-1] :param word_frequency: a dict containing all words and the number of
for word in word_frequency.keys(): their occurrences
word_frequency[word] = (word_frequency[word] / higher_frequency) :returns: sent_rank: a list containing all sentences with their
associated ranking
:rtype: dict
"""
def rank_the_sentences(self, doc, word_frequency, text):
sent_rank = {} sent_rank = {}
for sent in doc.sents: for sent in doc.sents:
for word in sent: for word in sent:
...@@ -63,7 +76,18 @@ class SimpleSpacySummarizer(ISummaryStrategy): ...@@ -63,7 +76,18 @@ class SimpleSpacySummarizer(ISummaryStrategy):
return sent_rank return sent_rank
def create_the_summary(self, sent_rank, top_sent): @staticmethod
def create_the_summary(sent_rank, top_sent):
"""
This methods generates the summary
:param sent_rank: a list containing all sentences with their associated
ranking generated by rank_the_sentences
:param top_sent: the best ranked sentence
:returns: result_text: the summary
:rtype: str
"""
# create the summary # create the summary
summary = [] summary = []
for sent, strength in sent_rank.items(): for sent, strength in sent_rank.items():
......
from app.summary.summary_strategy_interface import ISummaryStrategy from app.summary.summary_strategy_interface import ISummaryStrategy
import spacy import spacy
from heapq import nlargest
import math import math
from nltk.stem import WordNetLemmatizer from nltk.stem import WordNetLemmatizer
import nltk import nltk
nltk.download('wordnet') nltk.download('wordnet')
class SummaryTFIDF(ISummaryStrategy): class SummaryTFIDF(ISummaryStrategy):
nlpGer = spacy.load('de_core_news_sm') nlpGer = spacy.load('de_core_news_sm')
nlpEng = spacy.load('en_core_web_sm')
lemmatizer = WordNetLemmatizer() lemmatizer = WordNetLemmatizer()
def __init__(self): def __init__(self):
...@@ -20,22 +19,18 @@ class SummaryTFIDF(ISummaryStrategy): ...@@ -20,22 +19,18 @@ class SummaryTFIDF(ISummaryStrategy):
def id(self): def id(self):
return self._id return self._id
def frequency_matrix(self, summary, language): def frequency_matrix(self, summary):
""" """
This method creates a tf-idf-matrix which is a list with all sentences This method creates a tf-idf-matrix which is a list with all sentences
containing a list with all words in the sentence and their containing a list with all words in the sentence and their
frequency as value frequency as value
:param summary: given text to summarize :param summary: given text to summarize
:param language: language of the text
:returns: freq_matrix: frequency matrix :returns: freq_matrix: frequency matrix
""" """
freq_matrix = {} freq_matrix = {}
if language == 'ger': stop_words = self.nlpGer.Defaults.stop_words
stopWords = self.nlpGer.Defaults.stop_words
elif language == 'en':
stopWords = self.nlpEng.Defaults.stop_words
for sent in summary: for sent in summary:
# dictionary with 'words' as the key # dictionary with 'words' as the key
...@@ -46,7 +41,7 @@ class SummaryTFIDF(ISummaryStrategy): ...@@ -46,7 +41,7 @@ class SummaryTFIDF(ISummaryStrategy):
for word in words: for word in words:
word = self.lemmatizer.lemmatize(word) # Lemmatize the word word = self.lemmatizer.lemmatize(word) # Lemmatize the word
if word not in stopWords: # Reject stopWords if word not in stop_words: # Reject stop_words
if word in freq_table: if word in freq_table:
freq_table[word] += 1 freq_table[word] += 1
else: else:
...@@ -56,7 +51,8 @@ class SummaryTFIDF(ISummaryStrategy): ...@@ -56,7 +51,8 @@ class SummaryTFIDF(ISummaryStrategy):
return freq_matrix return freq_matrix
def tf_matrix(self, freq_matrix): @staticmethod
def tf_matrix(freq_matrix):
""" """
This method calculates the term frequency for every word This method calculates the term frequency for every word
...@@ -78,7 +74,8 @@ class SummaryTFIDF(ISummaryStrategy): ...@@ -78,7 +74,8 @@ class SummaryTFIDF(ISummaryStrategy):
return tf_matrix return tf_matrix
def sentences_per_words(self, freq_matrix): @staticmethod
def sentences_per_words(freq_matrix):
""" """
This methods returns a list with all words and how often a word is This methods returns a list with all words and how often a word is
mentioned in a sentence mentioned in a sentence
...@@ -97,7 +94,8 @@ class SummaryTFIDF(ISummaryStrategy): ...@@ -97,7 +94,8 @@ class SummaryTFIDF(ISummaryStrategy):
return sent_per_words return sent_per_words
def idf_matrix(self, freq_matrix, sent_per_words, total_sentences): @staticmethod
def idf_matrix(freq_matrix, sent_per_words, total_sentences):
""" """
This methods calculates a idf score for every word This methods calculates a idf score for every word
...@@ -120,7 +118,8 @@ class SummaryTFIDF(ISummaryStrategy): ...@@ -120,7 +118,8 @@ class SummaryTFIDF(ISummaryStrategy):
return idf_matrix return idf_matrix
def tf_idf_matrix(self, tf_matrix, idf_matrix): @staticmethod
def tf_idf_matrix(tf_matrix, idf_matrix):
""" """
This methods calculates a tf-idf-score for every word This methods calculates a tf-idf-score for every word
...@@ -144,15 +143,16 @@ class SummaryTFIDF(ISummaryStrategy): ...@@ -144,15 +143,16 @@ class SummaryTFIDF(ISummaryStrategy):
return tf_idf_matrix return tf_idf_matrix
def score_sentences(self, tf_idf_matrix): @staticmethod
def score_sentences(tf_idf_matrix):
""" """
This methods calculates a sentence score for every sentence based on This methods calculates a sentence score for every sentence based on
the tf-idf-matrix the tf-idf-matrix
:param tf_idf_matrix: tf-idf-matrix :param tf_idf_matrix: tf-idf-matrix
:returns: sentenceScore: list of all sentences with sentence score :returns: sentence_score: list of all sentences with sentence score
""" """
sentenceScore = {} sentence_score = {}
for sent, f_table in tf_idf_matrix.items(): for sent, f_table in tf_idf_matrix.items():
total_tfidf_score_per_sentence = 0 total_tfidf_score_per_sentence = 0
...@@ -162,12 +162,13 @@ class SummaryTFIDF(ISummaryStrategy): ...@@ -162,12 +162,13 @@ class SummaryTFIDF(ISummaryStrategy):
total_tfidf_score_per_sentence += tf_idf_score total_tfidf_score_per_sentence += tf_idf_score
if total_words_in_sentence != 0: if total_words_in_sentence != 0:
sentenceScore[sent] = total_tfidf_score_per_sentence / \ sentence_score[sent] = total_tfidf_score_per_sentence / \
total_words_in_sentence total_words_in_sentence
return sentenceScore return sentence_score
def average_score(self, sentence_score): @staticmethod
def average_score(sentence_score):
""" """
This method calculates the average sentence score This method calculates the average sentence score
...@@ -183,14 +184,15 @@ class SummaryTFIDF(ISummaryStrategy): ...@@ -183,14 +184,15 @@ class SummaryTFIDF(ISummaryStrategy):
return average_sent_score return average_sent_score
def create_summary_strat1(self, sentences, sentence_score, threshold): @staticmethod
def create_summary(sentences, sentence_score, threshold):
""" """
This method returns a summary with all sentences having a higher This method returns a summary with all sentences having a higher
sentence score than the threshold sentence score than the threshold
:param sentences: list of all sentences :param sentences: list of all sentences
:param sentence_score: list of sentences with sentence score :param sentence_score: list of sentences with sentence score
:param threshold: threshhold for sentence score :param threshold: threshold for sentence score
:returns: summary: generated summary :returns: summary: generated summary
""" """
summary = '' summary = ''
...@@ -202,86 +204,15 @@ class SummaryTFIDF(ISummaryStrategy): ...@@ -202,86 +204,15 @@ class SummaryTFIDF(ISummaryStrategy):
return summary[1:] return summary[1:]
def create_summary_strat2(self, sentence_score, percentOfText): def summarize(self, text: str) -> str:
""" text = self.nlpGer(text)
This method returns a summary which length is a percentage of the
given text
:param sentence_score: list of sentences with sentence score
:param percentOfText: percentage of sentences in the summary in
relation to the given text
:returns: summary: generated summary
"""
top_sentences = (sorted(sentence_score.values())[::-1])
percentOfText = percentOfText/100
top_percent_sentence = int(percentOfText*len(top_sentences))
top_sent = top_sentences[:top_percent_sentence]
summary = ''
for sent, strength in sentence_score.items():