Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
pse-trapp-public
IntentFinder
Commits
f9651faf
Commit
f9651faf
authored
May 20, 2021
by
Patrick Schlindwein
Browse files
Merge branch 'refactor/#60_python_code_smells' into 'master'
#60
refactor entire python codebase to get rid of code smells See merge request
!39
parents
9581e9a2
c780a4a3
Pipeline
#71436
passed with stages
in 15 minutes and 46 seconds
Changes
19
Pipelines
1
Expand all
Hide whitespace changes
Inline
Side-by-side
src/nlp/app
/similarity/__init__.py
→
learning/team2/intent similarity
/similarity/__init__.py
View file @
f9651faf
File moved
src/nlp/app
/similarity/cluster_algorithm.py
→
learning/team2/intent similarity
/similarity/cluster_algorithm.py
View file @
f9651faf
from
app.similarity.intent_cluster
import
IntentCluster
from
app.similarity.intent
import
Intent
from
app.similarity.intent_cluster
import
IntentCluster
# very primitive clustering algorithm for intents
def
find_intent_clusters
(
intent_list
:
list
[
Intent
],
t
:
float
,
iterations
:
int
)
->
list
:
"""
This method finds clusters in a list of Intents. The distance is calculated
by their cosine similarity
:param iterations: The number of Iterations.
:param t: [0, 1] The max cluster distance
:param intent_list: A list of Intents
:returns: cluster_list: A list of Intent Clusters
:rtype: list
"""
def
find_intent_clusters
(
intent_list
:
list
[
Intent
],
t
:
float
,
iterations
:
int
)
->
float
:
# init
cluster_list
=
[]
cluster_index
=
0
...
...
@@ -23,7 +31,8 @@ def find_intent_clusters(intent_list: list[Intent], t: float,
current_cluster
.
recalculate_center
()
added_to_cluster
=
True
# if not then become a cluster center
# if current_intent is not close to a cluster,
# become a cluster center
if
not
added_to_cluster
:
cluster_list
.
append
(
IntentCluster
(
str
(
cluster_index
),
current_intent
))
...
...
src/nlp/app
/similarity/intent.py
→
learning/team2/intent similarity
/similarity/intent.py
View file @
f9651faf
...
...
@@ -12,12 +12,27 @@ class Intent:
self
.
vector
=
self
.
n
.
vector
self
.
vector_norm
=
self
.
n
.
vector_norm
# calculating the smiliarity of the Intent to another intent
def
similarityIntent
(
self
,
intent
)
->
float
:
def
similarity_intent
(
self
,
intent
)
->
float
:
"""
This method calculates the cosine similarity to another Intent
:param intent: The other Intent
:returns: similarity: [0, 1] The cosine similarity
:rtype: float
"""
return
self
.
n
.
similarity
(
intent
.
n
)
# calculating the smil
i
arity of the Intent to another vector
# calculating the s
i
milarity of the Intent to another vector
def
similarity
(
self
,
vector
)
->
float
:
"""
This method calculates the cosine similarity to another Vector
:param vector: A vectorized Intent
:returns: similarity: [0, 1] The cosine similarity
:rtype: float
"""
norm
=
0
for
x
in
vector
:
norm
+=
x
**
2
...
...
src/nlp/app
/similarity/intent_cluster.py
→
learning/team2/intent similarity
/similarity/intent_cluster.py
View file @
f9651faf
...
...
@@ -4,12 +4,16 @@ from app.similarity.intent import Intent
class
IntentCluster
(
list
):
def
__init__
(
self
,
label
:
str
,
intent
:
Intent
):
super
().
__init__
()
self
.
center
=
intent
.
vector
self
.
append
(
intent
)
self
.
label
=
label
# setting the center of the cluster equal to the median of all elements
def
recalculate_center
(
self
):
"""
This method calculates the center of the cluster and updates the center
member variable
"""
for
i
in
range
(
0
,
len
(
self
.
center
)):
tmp_list
=
[]
for
element
in
self
:
...
...
src/nlp/app/tests
/test_intent_id_similarity.py
→
learning/team2/intent similarity
/test_intent_id_similarity.py
View file @
f9651faf
...
...
@@ -106,28 +106,38 @@ class TestSimilarityAnalysis(TestCase):
"Wo gibt es weitere Informationen?"
]
def
test_similarty
(
self
):
# generate 2 similary ids
def
test_similarity
(
self
):
"""
In this case two similar Intents are created and the
Intent:similarity_intent() method is tested
"""
# generate 2 similar ids
ih
=
IntentHandler
(
TestSimilarityAnalysis
.
rki_faq
[
53
])
ih2
=
IntentHandler
(
TestSimilarityAnalysis
.
rki_faq
[
54
])
id
=
ih
.
generate_intent_id
(
3
).
replace
(
"_"
,
" "
)
id
1
=
ih
.
generate_intent_id
(
3
).
replace
(
"_"
,
" "
)
id2
=
ih2
.
generate_intent_id
(
3
).
replace
(
"_"
,
" "
)
# init id vetors
intent1
=
Intent
(
id
)
# init id ve
c
tors
intent1
=
Intent
(
id
1
)
intent2
=
Intent
(
id2
)
# check if they are similar
self
.
assertGreaterEqual
(
intent1
.
similarity
I
ntent
(
intent2
),
0.85
)
self
.
assertGreaterEqual
(
intent1
.
similarity
_i
ntent
(
intent2
),
0.85
)
def
test_cluster
(
self
):
"""
In this case the find_intent_clusters() method is tested.
"""
# init
intent_list
=
[]
# generate ids for all questions and init the id vectors
for
q
in
TestSimilarityAnalysis
.
rki_faq
:
ih
=
IntentHandler
(
q
)
id
=
ih
.
generate_intent_id
(
3
).
replace
(
"_"
,
" "
)
intent
=
Intent
(
id
)
id
1
=
ih
.
generate_intent_id
(
3
).
replace
(
"_"
,
" "
)
intent
=
Intent
(
id
1
)
if
intent
.
vector_norm
>
0
:
intent_list
.
append
(
intent
)
...
...
src/nlp/app/nlp_server.py
View file @
f9651faf
from
fastapi
import
FastAPI
,
Request
from
fastapi
import
FastAPI
from
app.summary.summary_word_embedding
import
WordEmbeddingSummarizer
from
app.utilities
import
generator
...
...
@@ -14,6 +14,12 @@ app = FastAPI(
" IntentFinder"
)
"""
In this array, every summarization strategy should be instantiated exactly
once.
Note that every summarization strategy should implement the ISummaryStrategy
interface.
"""
strategies
=
[
SimpleSpacySummarizer
(),
SentenceEmbeddingSummarizer
(),
...
...
@@ -28,20 +34,32 @@ async def root():
@app.get("/strategies")
async def api_strategies():
    """
    This function will generate a list of ids from all summarization strategies
    :return: The list of all summarization-strategy-ids in JSON format
    """
    # Collect the id of every registered strategy.
    return [strategy.id for strategy in strategies]
@
app
.
post
(
"/summarize/{strategy_id}"
,
summary
=
"Generate a summary of the given"
" text."
)
async
def
summarize
(
strategy_id
:
str
,
req
:
Request
):
@
app
.
get
(
"/summarize/{strategy_id}"
,
summary
=
"Generate a summary of the given"
" text."
)
async
def
summarize
(
strategy_id
:
str
,
text
:
str
):
"""
This function will summarize a given text with a given summarization
strategy
:param strategy_id: The id of the strategy
:param text: The text to be summarized
:return: The summary, strategy and quality of the summary in JSON format
"""
for
strategy
in
strategies
:
if
strategy
.
id
==
strategy_id
:
quality
=
0.5
req_json
=
await
req
.
json
()
summary
=
strategy
.
summarize
(
req_json
[
"text"
])
summary
=
strategy
.
summarize
(
text
)
return
{
"strategy"
:
strategy_id
,
"quality"
:
quality
,
"summary"
:
summary
}
...
...
@@ -50,9 +68,9 @@ async def summarize(strategy_id: str, req: Request):
@app.post("/intentid",
          summary="Generate an intent id from a given intent text")
async def generate_intent_id(intent: str, max_tokens: int):
    """Generate a human readable reduced and yet expressive id for an intent
    based on the passed intent text.
    """
    # Wrap the raw text in an IntentHandler and derive the id from it.
    handler = generator.IntentHandler(intent)
    return handler.generate_intent_id(max_tokens)
src/nlp/app/summary/simple_spacy_summarizer.py
View file @
f9651faf
...
...
@@ -22,7 +22,7 @@ class SimpleSpacySummarizer(ISummaryStrategy):
word_frequency
=
self
.
divide_into_tokens
(
doc
)
# now rank the sentences based on the word frequency
sent_rank
=
self
.
rank_the_sentences
(
doc
,
word_frequency
,
text
)
sent_rank
=
self
.
rank_the_sentences
(
doc
,
word_frequency
)
# get frequency of words
top_sentences
=
(
sorted
(
sent_rank
.
values
())[::
-
1
])
...
...
@@ -32,7 +32,16 @@ class SimpleSpacySummarizer(ISummaryStrategy):
result_text
=
self
.
create_the_summary
(
sent_rank
,
top_sent
)
return
result_text
def
divide_into_tokens
(
self
,
doc
):
@
staticmethod
def
divide_into_tokens
(
doc
):
"""
This method generates a word frequency dict from a given document
:param doc: document created by spacy
:returns: word_frequency: a dict containing all words and the number of
their occurrences
:rtype: dict
"""
corpus
=
[
sent
.
text
.
lower
()
for
sent
in
doc
.
sents
]
cv
=
CountVectorizer
(
stop_words
=
list
(
STOP_WORDS
))
cv_fit
=
cv
.
fit_transform
(
corpus
)
...
...
@@ -42,16 +51,20 @@ class SimpleSpacySummarizer(ISummaryStrategy):
return
word_frequency
def
get_frequency_of_words
(
self
,
word_frequency
):
# get high frequency words
val
=
sorted
(
word_frequency
.
values
())
@
staticmethod
def
rank_the_sentences
(
doc
,
word_frequency
):
"""
This method creates a sentences ranking based on the word frequency
list created by divide_into_tokens
# gets relative frequency of words
higher_frequency
=
val
[
-
1
]
for
word
in
word_frequency
.
keys
():
word_frequency
[
word
]
=
(
word_frequency
[
word
]
/
higher_frequency
)
:param doc: document created by spacy
:param word_frequency: a dict containing all words and the number of
their occurrences
:returns: sent_rank: a list containing all sentences with their
associated ranking
:rtype: dict
"""
def
rank_the_sentences
(
self
,
doc
,
word_frequency
,
text
):
sent_rank
=
{}
for
sent
in
doc
.
sents
:
for
word
in
sent
:
...
...
@@ -63,7 +76,18 @@ class SimpleSpacySummarizer(ISummaryStrategy):
return
sent_rank
def
create_the_summary
(
self
,
sent_rank
,
top_sent
):
@
staticmethod
def
create_the_summary
(
sent_rank
,
top_sent
):
"""
This methods generates the summary
:param sent_rank: a list containing all sentences with their associated
ranking generated by rank_the_sentences
:param top_sent: the best ranked sentence
:returns: result_text: the summary
:rtype: str
"""
# create the summary
summary
=
[]
for
sent
,
strength
in
sent_rank
.
items
():
...
...
src/nlp/app/summary/summarization_with_strategy_TFIDF.py
View file @
f9651faf
from
app.summary.summary_strategy_interface
import
ISummaryStrategy
import
spacy
from
heapq
import
nlargest
import
math
from
nltk.stem
import
WordNetLemmatizer
import
nltk
nltk
.
download
(
'wordnet'
)
class
SummaryTFIDF
(
ISummaryStrategy
):
nlpGer
=
spacy
.
load
(
'de_core_news_sm'
)
nlpEng
=
spacy
.
load
(
'en_core_web_sm'
)
lemmatizer
=
WordNetLemmatizer
()
def
__init__
(
self
):
...
...
@@ -20,22 +19,18 @@ class SummaryTFIDF(ISummaryStrategy):
def
id
(
self
):
return
self
.
_id
def
frequency_matrix
(
self
,
summary
,
language
):
def
frequency_matrix
(
self
,
summary
):
"""
This method creates a tf-idf-matrix which is a list with all sentences
containing a list with all words in the sentence and their
frequency as value
:param summary: given text to summarize
:param language: language of the text
:returns: freq_matrix: frequency matrix
"""
freq_matrix
=
{}
if
language
==
'ger'
:
stopWords
=
self
.
nlpGer
.
Defaults
.
stop_words
elif
language
==
'en'
:
stopWords
=
self
.
nlpEng
.
Defaults
.
stop_words
stop_words
=
self
.
nlpGer
.
Defaults
.
stop_words
for
sent
in
summary
:
# dictionary with 'words' as the key
...
...
@@ -46,7 +41,7 @@ class SummaryTFIDF(ISummaryStrategy):
for
word
in
words
:
word
=
self
.
lemmatizer
.
lemmatize
(
word
)
# Lemmatize the word
if
word
not
in
stop
W
ords
:
# Reject stop
W
ords
if
word
not
in
stop
_w
ords
:
# Reject stop
_w
ords
if
word
in
freq_table
:
freq_table
[
word
]
+=
1
else
:
...
...
@@ -56,7 +51,8 @@ class SummaryTFIDF(ISummaryStrategy):
return
freq_matrix
def
tf_matrix
(
self
,
freq_matrix
):
@
staticmethod
def
tf_matrix
(
freq_matrix
):
"""
This method calculates the term frequency for every word
...
...
@@ -78,7 +74,8 @@ class SummaryTFIDF(ISummaryStrategy):
return
tf_matrix
def
sentences_per_words
(
self
,
freq_matrix
):
@
staticmethod
def
sentences_per_words
(
freq_matrix
):
"""
This methods returns a list with all words and how often a word is
mentioned in a sentence
...
...
@@ -97,7 +94,8 @@ class SummaryTFIDF(ISummaryStrategy):
return
sent_per_words
def
idf_matrix
(
self
,
freq_matrix
,
sent_per_words
,
total_sentences
):
@
staticmethod
def
idf_matrix
(
freq_matrix
,
sent_per_words
,
total_sentences
):
"""
This methods calculates a idf score for every word
...
...
@@ -120,7 +118,8 @@ class SummaryTFIDF(ISummaryStrategy):
return
idf_matrix
def
tf_idf_matrix
(
self
,
tf_matrix
,
idf_matrix
):
@
staticmethod
def
tf_idf_matrix
(
tf_matrix
,
idf_matrix
):
"""
This methods calculates a tf-idf-score for every word
...
...
@@ -144,15 +143,16 @@ class SummaryTFIDF(ISummaryStrategy):
return
tf_idf_matrix
def
score_sentences
(
self
,
tf_idf_matrix
):
@
staticmethod
def
score_sentences
(
tf_idf_matrix
):
"""
This methods calculates a sentence score for every sentence based on
the tf-idf-matrix
:param tf_idf_matrix: tf-idf-matrix
:returns: sentence
S
core: list of all sentences with sentence score
:returns: sentence
_s
core: list of all sentences with sentence score
"""
sentence
S
core
=
{}
sentence
_s
core
=
{}
for
sent
,
f_table
in
tf_idf_matrix
.
items
():
total_tfidf_score_per_sentence
=
0
...
...
@@ -162,12 +162,13 @@ class SummaryTFIDF(ISummaryStrategy):
total_tfidf_score_per_sentence
+=
tf_idf_score
if
total_words_in_sentence
!=
0
:
sentence
S
core
[
sent
]
=
total_tfidf_score_per_sentence
/
\
total_words_in_sentence
sentence
_s
core
[
sent
]
=
total_tfidf_score_per_sentence
/
\
total_words_in_sentence
return
sentence
S
core
return
sentence
_s
core
def
average_score
(
self
,
sentence_score
):
@
staticmethod
def
average_score
(
sentence_score
):
"""
This method calculates the average sentence score
...
...
@@ -183,14 +184,15 @@ class SummaryTFIDF(ISummaryStrategy):
return
average_sent_score
def
create_summary_strat1
(
self
,
sentences
,
sentence_score
,
threshold
):
@
staticmethod
def
create_summary
(
sentences
,
sentence_score
,
threshold
):
"""
This method returns a summary with all sentences having a higher
sentence score than the threshold
:param sentences: list of all sentences
:param sentence_score: list of sentences with sentence score
:param threshold: thresh
h
old for sentence score
:param threshold: threshold for sentence score
:returns: summary: generated summary
"""
summary
=
''
...
...
@@ -202,86 +204,15 @@ class SummaryTFIDF(ISummaryStrategy):
return
summary
[
1
:]
def
create_summary_strat2
(
self
,
sentence_score
,
percentOfText
):
"""
This method returns a summary which length is a percentage of the
given text
:param sentence_score: list of sentences with sentence score
:param percentOfText: percentage of sentences in the summary in
relation to the given text
:returns: summary: generated summary
"""
top_sentences
=
(
sorted
(
sentence_score
.
values
())[::
-
1
])
percentOfText
=
percentOfText
/
100
top_percent_sentence
=
int
(
percentOfText
*
len
(
top_sentences
))
top_sent
=
top_sentences
[:
top_percent_sentence
]
summary
=
''
for
sent
,
strength
in
sentence_score
.
items
():
if
strength
in
top_sent
:
summary
+=
" "
+
sent
.
text
return
summary
[
1
:]
def
create_summary_strat3
(
self
,
sentence_score
,
numberOfSentences
):
"""
This method returns a summary with the number of sentences set
:param sentence_score: list of sentences with sentence score
:param numberOfSentences: the number of sentences in the summary
:returns: summary: generated summary
"""
summarized_sentences
=
nlargest
(
numberOfSentences
,
sentence_score
,
key
=
sentence_score
.
get
)
final_sentences
=
[
w
.
text
for
w
in
summarized_sentences
]
summary
=
' '
.
join
(
final_sentences
)
return
summary
def
summarize
(
self
,
text
:
str
,
language
:
str
=
'ger'
,
strategy
:
int
=
1
,
percentOfText
:
int
=
30
,
numberOfSentences
:
int
=
3
):
"""
This method returns a summary for the given text
:param text: str: text to create summary from
:param language: str: (Default value = 'ger') language of given text
:param strategy: int: (Default value = 1) strategy to use (1 for
average score as threshhold, 2 for percentage of given text, 3 for
number of sentences)
:param percentOfText: int: (Default value = 30) value for stragy 2
(1-100)
:param numberOfSentences: int: (Default value = 3) value for strategy 3
:returns: summary: str: generated summary
:raises: ValueError: raises an exception when parameters are set to
wrong values
"""
# check parameters
if
language
!=
'en'
and
language
!=
'ger'
:
raise
ValueError
(
"language must be 'en' or 'ger'"
)
if
strategy
<
1
or
strategy
>
3
:
raise
ValueError
(
"strategy must be 1, 2 or 3"
)
if
percentOfText
<
1
or
percentOfText
>
100
:
raise
ValueError
(
"percentOfText must be between 1 and 100"
)
# count number of words in original text
original_words
=
text
.
split
()
original_words
=
[
w
for
w
in
original_words
if
w
.
isalnum
()]
# num_words_in_original_text = len(original_words)
# convert text to spacy object
if
language
==
'ger'
:
text
=
self
.
nlpGer
(
text
)
elif
language
==
'en'
:
text
=
self
.
nlpEng
(
text
)
def
summarize
(
self
,
text
:
str
)
->
str
:
text
=
self
.
nlpGer
(
text
)
# put all sentences in a list
sentences
=
list
(
text
.
sents
)
total_sentences
=
len
(
sentences
)
# generate frequency matrix
freq_matrix
=
self
.
frequency_matrix
(
sentences
,
language
)
freq_matrix
=
self
.
frequency_matrix
(
sentences
)
# generate term frequency matrix
tf_matrix
=
self
.
tf_matrix
(
freq_matrix
)
...
...
@@ -299,21 +230,11 @@ class SummaryTFIDF(ISummaryStrategy):
# generate sentence score for every sentence
sentence_scores
=
self
.
score_sentences
(
tf_idf_matrix
)
# set thresh
h
old to average score
# set threshold to average score
threshold
=
self
.
average_score
(
sentence_scores
)
# summary
# strategy 1
if
strategy
==
1
:
summary
=
self
.
create_summary_strat1
(
sentences
,
sentence_scores
,
threshold
)
# strategy 2
elif
strategy
==
2
:
summary
=
self
.
create_summary_strat2
(
sentence_scores
,
percentOfText
)
# strategy 3
elif
strategy
==
3
:
summary
=
self
.
create_summary_strat3
(
sentence_scores
,
numberOfSentences
)
summary
=
self
.
create_summary
(
sentences
,
sentence_scores
,
threshold
)
return
summary
src/nlp/app/summary/summary.py
deleted
100644 → 0
View file @
9581e9a2
def get_summary(text: str):
    """
    Return a placeholder summary: the prefix 'summary of ' followed by
    every second character of the input text.

    :param text: the text to summarize
    :returns: the placeholder summary string
    """
    condensed = text[::2]
    return "summary of " + condensed
src/nlp/app/summary/summary_sentence_embedding.py
View file @
f9651faf
...
...
@@ -73,12 +73,23 @@ class SentenceEmbeddingSummarizer(ISummaryStrategy):
@
staticmethod
def
__get_number_of_sentences
(
sentences
):
if
math
.
floor
(
len
(
sentences
)
*
0.3
)
<
1
:
num_sentences_for_summary
=
1
elif
math
.
floor
(
len
(
sentences
)
*
0.3
)
>
10
:
num_sentences_for_summary
=
10
else
:
num_sentences_for_summary
=
math
.
floor
(
len
(
sentences
)
*
0.3
)
"""
Gets the number of sentences that will be part of the summary
:param sentences: Requested number of sentences
:return: sentences ∈ [1, 10] Granted number of sentences
"""
min_num_sentences_for_summary
=
1
max_num_sentences_for_summary
=
10
num_sentences_in_percent
=
0.3
num_sentences_for_summary
=
math
.
floor
(
len
(
sentences
)
*
num_sentences_in_percent
)
if
num_sentences_for_summary
<
min_num_sentences_for_summary
:
num_sentences_for_summary
=
min_num_sentences_for_summary
elif
num_sentences_for_summary
>
max_num_sentences_for_summary
:
num_sentences_for_summary
=
max_num_sentences_for_summary
return
num_sentences_for_summary
@
staticmethod
...
...
@@ -94,6 +105,10 @@ class SentenceEmbeddingSummarizer(ISummaryStrategy):
column and select the first row
(sentence having least distance from the mean)
3.Sort the sentences based on their sequence in the original text.
:returns: summary: a string representing a summarized version of the
input text
:rtype: str
"""
summary
=
' '
\
.
join
(
data
...
...
src/nlp/app/summary/summary_strategy_interface.py
View file @
f9651faf
from
abc
import
ABC
,
abstractmethod
# interface for summary strategies
class
ISummaryStrategy
(
ABC
):
# id of the suammary strategy
@
property
def
id
(
self
):
"""
The id for a specific summarization algorithm.
It will specify the path used to access the algorithm.
"""
raise
NotImplementedError
# summarizes a text
@
abstractmethod
def
summarize
(
self
,
text
:
str
)
->
str
:
"""