Commit f9651faf authored by Patrick Schlindwein
Browse files

Merge branch 'refactor/#60_python_code_smells' into 'master'

#60 refactor entire python codebase to get rid of code smells

See merge request !39
parents 9581e9a2 c780a4a3
Pipeline #71436 passed with stages
in 15 minutes and 46 seconds
from app.similarity.intent_cluster import IntentCluster
from app.similarity.intent import Intent
from app.similarity.intent_cluster import IntentCluster
# very primitive clustering algorithm for intents
def find_intent_clusters(intent_list: list[Intent], t: float,
iterations: int) -> list:
"""
This method finds clusters in a list of Intents. The distance is calculated
by their cosine similarity
:param iterations: The number of Iterations.
:param t: [0, 1] The max cluster distance
:param intent_list: A list of Intents
:returns: cluster_list: A list of Intent Clusters
:rtype: list
"""
def find_intent_clusters(intent_list: list[Intent], t: float,
iterations: int) -> float:
# init
cluster_list = []
cluster_index = 0
......@@ -23,7 +31,8 @@ def find_intent_clusters(intent_list: list[Intent], t: float,
current_cluster.recalculate_center()
added_to_cluster = True
# if not then become a cluster center
# if current_intent is not close to a cluster,
# become a cluster center
if not added_to_cluster:
cluster_list.append(IntentCluster(
str(cluster_index), current_intent))
......
......@@ -12,12 +12,27 @@ class Intent:
self.vector = self.n.vector
self.vector_norm = self.n.vector_norm
# calculating the similarity of the Intent to another intent
def similarity_intent(self, intent) -> float:
    """
    This method calculates the similarity to another Intent by delegating
    to the ``similarity`` method of the wrapped ``n`` object.

    :param intent: The other Intent; its ``n`` attribute is handed to
        ``self.n.similarity``
    :returns: similarity: the value returned by ``self.n.similarity``
        (presumably a cosine similarity -- NOTE(review): confirm the value
        range against the library that provides ``n``)
    :rtype: float
    """
    return self.n.similarity(intent.n)
# calculating the smiliarity of the Intent to another vector
# calculating the similarity of the Intent to another vector
def similarity(self, vector) -> float:
"""
This method calculates the cosine similarity to another Vector
:param vector: A vectorized Intent
:returns: similarity: [0, 1] The cosine similarity
:rtype: float
"""
norm = 0
for x in vector:
norm += x ** 2
......
......@@ -4,12 +4,16 @@ from app.similarity.intent import Intent
class IntentCluster(list):
def __init__(self, label: str, intent: Intent):
super().__init__()
self.center = intent.vector
self.append(intent)
self.label = label
# setting the center of the cluster equal to the median of all elements
def recalculate_center(self):
"""
This method calculates the center of the cluster and updates the center
member variable
"""
for i in range(0, len(self.center)):
tmp_list = []
for element in self:
......
......@@ -106,28 +106,38 @@ class TestSimilarityAnalysis(TestCase):
"Wo gibt es weitere Informationen?"
]
def test_similarity(self):
    """
    In this case two similar Intents are created and the
    Intent:similarity_intent() method is tested
    """
    # generate 2 similar ids from two entries of the FAQ fixture
    ih = IntentHandler(TestSimilarityAnalysis.rki_faq[53])
    ih2 = IntentHandler(TestSimilarityAnalysis.rki_faq[54])
    # the generated ids use "_" as separator; turn them back into
    # space-separated text before building the Intent objects
    id1 = ih.generate_intent_id(3).replace("_", " ")
    id2 = ih2.generate_intent_id(3).replace("_", " ")

    # init id vectors
    intent1 = Intent(id1)
    intent2 = Intent(id2)

    # check if they are similar
    self.assertGreaterEqual(intent1.similarity_intent(intent2), 0.85)
def test_cluster(self):
"""
In this case the find_intent_clusters() method is tested.
"""
# init
intent_list = []
# generate ids for all questions and init the id vectors
for q in TestSimilarityAnalysis.rki_faq:
ih = IntentHandler(q)
id = ih.generate_intent_id(3).replace("_", " ")
intent = Intent(id)
id1 = ih.generate_intent_id(3).replace("_", " ")
intent = Intent(id1)
if intent.vector_norm > 0:
intent_list.append(intent)
......
from fastapi import FastAPI, Request
from fastapi import FastAPI
from app.summary.summary_word_embedding import WordEmbeddingSummarizer
from app.utilities import generator
......@@ -14,6 +14,12 @@ app = FastAPI(
" IntentFinder"
)
"""
In this array, every summarization strategy should be instantiated exactly
once.
Note that every summarization strategy should implement the ISummaryStrategy
interface.
"""
strategies = [
SimpleSpacySummarizer(),
SentenceEmbeddingSummarizer(),
......@@ -28,20 +34,32 @@ async def root():
@app.get("/strategies")
async def api_strategies():
    """
    This function will generate a list of ids from all summarization strategies

    :return: The list of the ``id`` values of every entry in ``strategies``
        (returned as a plain list; presumably serialized to JSON by the
        framework)
    """
    return [strategy.id for strategy in strategies]
@app.post("/summarize/{strategy_id}", summary="Generate a summary of the given"
" text.")
async def summarize(strategy_id: str, req: Request):
@app.get("/summarize/{strategy_id}", summary="Generate a summary of the given"
" text.")
async def summarize(strategy_id: str, text: str):
"""
This function will summarize a given text with a given summarization
strategy
:param strategy_id: The id of the strategy
:param text: The text to be summarized
:return: The summary, strategy and quality of the summary in JSON format
"""
for strategy in strategies:
if strategy.id == strategy_id:
quality = 0.5
req_json = await req.json()
summary = strategy.summarize(req_json["text"])
summary = strategy.summarize(text)
return {"strategy": strategy_id, "quality": quality,
"summary": summary}
......@@ -50,9 +68,9 @@ async def summarize(strategy_id: str, req: Request):
@app.post("/intentid", summary="Generate an intent id from a given intent"
                               " text")
async def generate_intent_id(intent: str, max_tokens: int):
    """Generate a human readable reduced and yet expressive id for an intent
    based on the passed intent text.

    :param intent: The intent text the id is generated from
    :param max_tokens: Forwarded unchanged to
        ``IntentHandler.generate_intent_id`` (presumably the upper bound of
        tokens in the id -- confirm against the generator module)
    :return: The value returned by ``IntentHandler.generate_intent_id``
    """
    return generator.IntentHandler(intent).generate_intent_id(max_tokens)
......@@ -22,7 +22,7 @@ class SimpleSpacySummarizer(ISummaryStrategy):
word_frequency = self.divide_into_tokens(doc)
# now rank the sentences based on the word frequency
sent_rank = self.rank_the_sentences(doc, word_frequency, text)
sent_rank = self.rank_the_sentences(doc, word_frequency)
# get frequency of words
top_sentences = (sorted(sent_rank.values())[::-1])
......@@ -32,7 +32,16 @@ class SimpleSpacySummarizer(ISummaryStrategy):
result_text = self.create_the_summary(sent_rank, top_sent)
return result_text
def divide_into_tokens(self, doc):
@staticmethod
def divide_into_tokens(doc):
"""
This method generates a word frequency dict from a given document
:param doc: document created by spacy
:returns: word_frequency: a dict containing all words and the number of
their occurrences
:rtype: dict
"""
corpus = [sent.text.lower() for sent in doc.sents]
cv = CountVectorizer(stop_words=list(STOP_WORDS))
cv_fit = cv.fit_transform(corpus)
......@@ -42,16 +51,20 @@ class SimpleSpacySummarizer(ISummaryStrategy):
return word_frequency
def get_frequency_of_words(self, word_frequency):
# get high frequency words
val = sorted(word_frequency.values())
@staticmethod
def rank_the_sentences(doc, word_frequency):
"""
This method creates a sentences ranking based on the word frequency
list created by divide_into_tokens
# gets relative frequency of words
higher_frequency = val[-1]
for word in word_frequency.keys():
word_frequency[word] = (word_frequency[word] / higher_frequency)
:param doc: document created by spacy
:param word_frequency: a dict containing all words and the number of
their occurrences
:returns: sent_rank: a dict containing all sentences with their
associated ranking
:rtype: dict
"""
def rank_the_sentences(self, doc, word_frequency, text):
sent_rank = {}
for sent in doc.sents:
for word in sent:
......@@ -63,7 +76,18 @@ class SimpleSpacySummarizer(ISummaryStrategy):
return sent_rank
def create_the_summary(self, sent_rank, top_sent):
@staticmethod
def create_the_summary(sent_rank, top_sent):
"""
This method generates the summary
:param sent_rank: a dict containing all sentences with their associated
ranking generated by rank_the_sentences
:param top_sent: the best ranked sentence
:returns: result_text: the summary
:rtype: str
"""
# create the summary
summary = []
for sent, strength in sent_rank.items():
......
from app.summary.summary_strategy_interface import ISummaryStrategy
import spacy
from heapq import nlargest
import math
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('wordnet')
class SummaryTFIDF(ISummaryStrategy):
nlpGer = spacy.load('de_core_news_sm')
nlpEng = spacy.load('en_core_web_sm')
lemmatizer = WordNetLemmatizer()
def __init__(self):
......@@ -20,22 +19,18 @@ class SummaryTFIDF(ISummaryStrategy):
def id(self):
return self._id
def frequency_matrix(self, summary, language):
def frequency_matrix(self, summary):
"""
This method creates a tf-idf-matrix which is a list with all sentences
containing a list with all words in the sentence and their
frequency as value
:param summary: given text to summarize
:param language: language of the text
:returns: freq_matrix: frequency matrix
"""
freq_matrix = {}
if language == 'ger':
stopWords = self.nlpGer.Defaults.stop_words
elif language == 'en':
stopWords = self.nlpEng.Defaults.stop_words
stop_words = self.nlpGer.Defaults.stop_words
for sent in summary:
# dictionary with 'words' as the key
......@@ -46,7 +41,7 @@ class SummaryTFIDF(ISummaryStrategy):
for word in words:
word = self.lemmatizer.lemmatize(word) # Lemmatize the word
if word not in stopWords: # Reject stopWords
if word not in stop_words: # Reject stop_words
if word in freq_table:
freq_table[word] += 1
else:
......@@ -56,7 +51,8 @@ class SummaryTFIDF(ISummaryStrategy):
return freq_matrix
def tf_matrix(self, freq_matrix):
@staticmethod
def tf_matrix(freq_matrix):
"""
This method calculates the term frequency for every word
......@@ -78,7 +74,8 @@ class SummaryTFIDF(ISummaryStrategy):
return tf_matrix
def sentences_per_words(self, freq_matrix):
@staticmethod
def sentences_per_words(freq_matrix):
"""
This method returns a list with all words and how often a word is
mentioned in a sentence
......@@ -97,7 +94,8 @@ class SummaryTFIDF(ISummaryStrategy):
return sent_per_words
def idf_matrix(self, freq_matrix, sent_per_words, total_sentences):
@staticmethod
def idf_matrix(freq_matrix, sent_per_words, total_sentences):
"""
This method calculates an idf score for every word
......@@ -120,7 +118,8 @@ class SummaryTFIDF(ISummaryStrategy):
return idf_matrix
def tf_idf_matrix(self, tf_matrix, idf_matrix):
@staticmethod
def tf_idf_matrix(tf_matrix, idf_matrix):
"""
This method calculates a tf-idf-score for every word
......@@ -144,15 +143,16 @@ class SummaryTFIDF(ISummaryStrategy):
return tf_idf_matrix
def score_sentences(self, tf_idf_matrix):
@staticmethod
def score_sentences(tf_idf_matrix):
"""
This method calculates a sentence score for every sentence based on
the tf-idf-matrix
:param tf_idf_matrix: tf-idf-matrix
:returns: sentenceScore: list of all sentences with sentence score
:returns: sentence_score: list of all sentences with sentence score
"""
sentenceScore = {}
sentence_score = {}
for sent, f_table in tf_idf_matrix.items():
total_tfidf_score_per_sentence = 0
......@@ -162,12 +162,13 @@ class SummaryTFIDF(ISummaryStrategy):
total_tfidf_score_per_sentence += tf_idf_score
if total_words_in_sentence != 0:
sentenceScore[sent] = total_tfidf_score_per_sentence / \
total_words_in_sentence
sentence_score[sent] = total_tfidf_score_per_sentence / \
total_words_in_sentence
return sentenceScore
return sentence_score
def average_score(self, sentence_score):
@staticmethod
def average_score(sentence_score):
"""
This method calculates the average sentence score
......@@ -183,14 +184,15 @@ class SummaryTFIDF(ISummaryStrategy):
return average_sent_score
def create_summary_strat1(self, sentences, sentence_score, threshold):
@staticmethod
def create_summary(sentences, sentence_score, threshold):
"""
This method returns a summary with all sentences having a higher
sentence score than the threshold
:param sentences: list of all sentences
:param sentence_score: list of sentences with sentence score
:param threshold: threshhold for sentence score
:param threshold: threshold for sentence score
:returns: summary: generated summary
"""
summary = ''
......@@ -202,86 +204,15 @@ class SummaryTFIDF(ISummaryStrategy):
return summary[1:]
def create_summary_strat2(self, sentence_score, percentOfText):
    """
    This method returns a summary which length is a percentage of the
    given text

    :param sentence_score: dict mapping each sentence (an object with a
        ``text`` attribute) to its sentence score
    :param percentOfText: percentage of sentences in the summary in
        relation to the given text
    :returns: summary: generated summary; the selected sentences joined by
        single spaces in the iteration order of ``sentence_score``
    """
    # rank the scores from best to worst
    ranked_scores = sorted(sentence_score.values(), reverse=True)

    # number of scores to keep, rounded towards zero
    keep = int(percentOfText / 100 * len(ranked_scores))

    # selection is done by score value, so sentences that tie with a kept
    # score are included as well
    top_scores = set(ranked_scores[:keep])

    return " ".join(sent.text
                    for sent, strength in sentence_score.items()
                    if strength in top_scores)
def create_summary_strat3(self, sentence_score, numberOfSentences):
    """
    This method returns a summary with the number of sentences set

    :param sentence_score: dict mapping each sentence (an object with a
        ``text`` attribute) to its sentence score
    :param numberOfSentences: the number of sentences in the summary
    :returns: summary: generated summary; the best scored sentences joined
        by single spaces
    """
    # nlargest yields the sentences ordered by descending score, so the
    # summary follows the score ranking, not the order of the original text
    best_sentences = nlargest(
        numberOfSentences, sentence_score, key=sentence_score.get)
    return ' '.join(sent.text for sent in best_sentences)
def summarize(self, text: str, language: str = 'ger', strategy: int = 1,
percentOfText: int = 30, numberOfSentences: int = 3):
"""
This method returns a summary for the given text
:param text: str: text to create summary from
:param language: str: (Default value = 'ger') language of given text
:param strategy: int: (Default value = 1) strategy to use (1 for
average score as threshold, 2 for percentage of given text, 3 for
number of sentences)
:param percentOfText: int: (Default value = 30) value for strategy 2
(1-100)
:param numberOfSentences: int: (Default value = 3) value for strategy 3
:returns: summary: str: generated summary
:raises: ValueError: raises an exception when parameters are set to
wrong values
"""
# check parameters
if language != 'en' and language != 'ger':
raise ValueError("language must be 'en' or 'ger'")
if strategy < 1 or strategy > 3:
raise ValueError("strategy must be 1, 2 or 3")
if percentOfText < 1 or percentOfText > 100:
raise ValueError("percentOfText must be between 1 and 100")
# count number of words in original text
original_words = text.split()
original_words = [w for w in original_words if w.isalnum()]
# num_words_in_original_text = len(original_words)
# convert text to spacy object
if language == 'ger':
text = self.nlpGer(text)
elif language == 'en':
text = self.nlpEng(text)
def summarize(self, text: str) -> str:
text = self.nlpGer(text)
# put all sentences in a list
sentences = list(text.sents)
total_sentences = len(sentences)
# generate frequency matrix
freq_matrix = self.frequency_matrix(sentences, language)
freq_matrix = self.frequency_matrix(sentences)
# generate term frequency matrix
tf_matrix = self.tf_matrix(freq_matrix)
......@@ -299,21 +230,11 @@ class SummaryTFIDF(ISummaryStrategy):
# generate sentence score for every sentence
sentence_scores = self.score_sentences(tf_idf_matrix)
# set threshhold to average score
# set threshold to average score
threshold = self.average_score(sentence_scores)
# summary
# strategy 1
if strategy == 1:
summary = self.create_summary_strat1(
sentences, sentence_scores, threshold)
# strategy 2
elif strategy == 2:
summary = self.create_summary_strat2(
sentence_scores, percentOfText)
# strategy 3
elif strategy == 3:
summary = self.create_summary_strat3(
sentence_scores, numberOfSentences)
summary = self.create_summary(
sentences, sentence_scores, threshold)
return summary
def get_summary(text: str) -> str:
    """
    Placeholder summary: no real summarization is performed.

    :param text: The text to "summarize"
    :returns: the literal prefix "summary of " followed by every second
        character of text (starting with the first one)
    :rtype: str
    """
    return "summary of " + text[::2]
......@@ -73,12 +73,23 @@ class SentenceEmbeddingSummarizer(ISummaryStrategy):
@staticmethod
def __get_number_of_sentences(sentences):
    """
    Gets the number of sentences that will be part of the summary

    :param sentences: The sentences of the text; only their count is used
    :return: 30% of the sentence count, rounded down and clamped to the
        range [1, 10]
    """
    min_num_sentences_for_summary = 1
    max_num_sentences_for_summary = 10
    # share of the sentences that should end up in the summary
    summary_fraction = 0.3

    num_sentences_for_summary = math.floor(len(sentences) * summary_fraction)

    # clamp to [min, max]; the lower bound also guarantees that an empty
    # or very short input still yields 1
    return max(min_num_sentences_for_summary,
               min(num_sentences_for_summary, max_num_sentences_for_summary))
@staticmethod
......@@ -94,6 +105,10 @@ class SentenceEmbeddingSummarizer(ISummaryStrategy):
column and select the first row
(sentence having least distance from the mean)
3.Sort the sentences based on their sequence in the original text.
:returns: summary: a string representing a summarized version of the
input text
:rtype: str
"""
summary = ' ' \
.join(data
......
from abc import ABC, abstractmethod
# interface for summary strategies
class ISummaryStrategy(ABC):
# id of the summary strategy
@property
def id(self):
"""
The id for a specific summarization algorithm.
It will specify the path used to access the algorithm.
"""
raise NotImplementedError
# summarizes a text
@abstractmethod
def summarize(self, text: str) -> str:
"""