pse-trapp-public / IntentFinder · Commits

Commit c780a4a3
Authored May 20, 2021 by Christof Walther
Committed by Patrick Schlindwein, May 20, 2021

#60 refactor entire python codebase to get rid of code smells

Parent: 9581e9a2
Changes: 19

src/nlp/app/similarity/__init__.py → learning/team2/intent similarity/similarity/__init__.py
File moved

src/nlp/app/similarity/cluster_algorithm.py → learning/team2/intent similarity/similarity/cluster_algorithm.py

-from app.similarity.intent_cluster import IntentCluster
-from app.similarity.intent import Intent
+from app.similarity.intent import Intent
+from app.similarity.intent_cluster import IntentCluster

-# very primitive clustering algrotihm for intents
-def find_intent_clusters(intent_list: list[Intent], t: float,
-                         iterations: int) -> float:
+def find_intent_clusters(intent_list: list[Intent], t: float,
+                         iterations: int) -> list:
+    """
+    This method finds clusters in a list of Intents. The distance is calculated
+    by their cosine similarity
+    :param iterations: The number of Iterations.
+    :param t: [0, 1] The max cluster distance
+    :param intent_list: A list of Intents
+    :returns: cluster_list: A list of Intent Clusters
+    :rtype: list
+    """
     # init
     cluster_list = []
     cluster_index = 0
...
@@ -23,7 +31,8 @@ def find_intent_clusters(intent_list: list[Intent], t: float,
                 current_cluster.recalculate_center()
                 added_to_cluster = True
-        # if not then become a cluster center
+        # if current_intent is not close to a cluster,
+        # become a cluster center
         if not added_to_cluster:
             cluster_list.append(IntentCluster(
                 str(cluster_index), current_intent))
...
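
A rough usage sketch of the refactored clustering entry point, following the pattern used in test_intent_id_similarity.py; the example strings, the threshold t=0.3 and iterations=5 are illustrative placeholders, and it assumes the app.similarity package and the spaCy model behind Intent are available:

    from app.similarity.intent import Intent
    from app.similarity.cluster_algorithm import find_intent_clusters

    # build Intent objects from short intent-id strings, as the tests do
    intents = [Intent(text) for text in
               ["impfung termin", "impftermin absagen", "test ergebnis"]]
    # keep only intents that actually received a non-zero word vector
    intents = [i for i in intents if i.vector_norm > 0]

    # t is the maximum cluster distance in [0, 1], iterations the number of passes
    clusters = find_intent_clusters(intents, t=0.3, iterations=5)
    for cluster in clusters:
        print(cluster.label, len(cluster))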

src/nlp/app/similarity/intent.py → learning/team2/intent similarity/similarity/intent.py

...
@@ -12,12 +12,27 @@ class Intent:
         self.vector = self.n.vector
         self.vector_norm = self.n.vector_norm

-    # calculating the smiliarity of the Intent to another intent
-    def similarityIntent(self, intent) -> float:
+    def similarity_intent(self, intent) -> float:
+        """
+        This method calculates the cosine similarity to another Intent
+        :param intent: The other Intent
+        :returns: similarity: [0, 1] The cosine similarity
+        :rtype: float
+        """
         return self.n.similarity(intent.n)

-    # calculating the smiliarity of the Intent to another vector
+    # calculating the similarity of the Intent to another vector
     def similarity(self, vector) -> float:
+        """
+        This method calculates the cosine similarity to another Vector
+        :param vector: A vectorized Intent
+        :returns: similarity: [0, 1] The cosine similarity
+        :rtype: float
+        """
         norm = 0
         for x in vector:
             norm += x ** 2
...
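
The two similarity helpers above both come down to plain cosine similarity; for reference, a minimal self-contained version of that computation (independent of spaCy):

    import math

    def cosine_similarity(a, b):
        # dot product of the two vectors
        dot = sum(x * y for x, y in zip(a, b))
        # Euclidean norms, matching the norm += x ** 2 accumulation above
        norm_a = math.sqrt(sum(x ** 2 for x in a))
        norm_b = math.sqrt(sum(x ** 2 for x in b))
        if norm_a == 0 or norm_b == 0:
            return 0.0
        return dot / (norm_a * norm_b)

    print(cosine_similarity([1.0, 2.0, 3.0], [2.0, 4.0, 6.0]))  # 1.0 for parallel vectors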

src/nlp/app/similarity/intent_cluster.py → learning/team2/intent similarity/similarity/intent_cluster.py

...
@@ -4,12 +4,16 @@ from app.similarity.intent import Intent
 class IntentCluster(list):

     def __init__(self, label: str, intent: Intent):
+        super().__init__()
         self.center = intent.vector
         self.append(intent)
         self.label = label

-    # setting the center of the cluster equal to the median of all elements
     def recalculate_center(self):
+        """
+        This method calculates the center of the cluster and updates the center
+        member variable
+        """
         for i in range(0, len(self.center)):
             tmp_list = []
             for element in self:
...
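
The removed inline comment described recalculate_center as setting the center to the per-dimension median of the member vectors; the loop body itself is collapsed in the diff, so the following standalone sketch of that idea is an assumption:

    import statistics

    def recalculate_center(vectors):
        # vectors: equal-length member vectors of one cluster
        center = []
        for i in range(len(vectors[0])):
            # collect the i-th component of every member, as the tmp_list loop does
            tmp_list = [v[i] for v in vectors]
            center.append(statistics.median(tmp_list))
        return center

    print(recalculate_center([[0.0, 1.0], [2.0, 3.0], [4.0, 5.0]]))  # [2.0, 3.0]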

src/nlp/app/tests/test_intent_id_similarity.py → learning/team2/intent similarity/test_intent_id_similarity.py

...
@@ -106,28 +106,38 @@ class TestSimilarityAnalysis(TestCase):
         "Wo gibt es weitere Informationen?"
     ]

-    def test_similarty(self):
-        # generate 2 similary ids
+    def test_similarity(self):
+        """
+        In this case two similar Intents are created and the
+        Intent:similarity_intent() method is tested
+        """
+        # generate 2 similar ids
         ih = IntentHandler(TestSimilarityAnalysis.rki_faq[53])
         ih2 = IntentHandler(TestSimilarityAnalysis.rki_faq[54])
-        id = ih.generate_intent_id(3).replace("_", " ")
+        id1 = ih.generate_intent_id(3).replace("_", " ")
         id2 = ih2.generate_intent_id(3).replace("_", " ")

-        # init id vetors
-        intent1 = Intent(id)
+        # init id vectors
+        intent1 = Intent(id1)
         intent2 = Intent(id2)

         # check if they are similar
-        self.assertGreaterEqual(intent1.similarityIntent(intent2), 0.85)
+        self.assertGreaterEqual(intent1.similarity_intent(intent2), 0.85)

     def test_cluster(self):
+        """
+        In this case the find_intent_clusters() method is tested.
+        """
+        # init
         intent_list = []

         # generate ids for all questions and init the id vectors
         for q in TestSimilarityAnalysis.rki_faq:
             ih = IntentHandler(q)
-            id = ih.generate_intent_id(3).replace("_", " ")
-            intent = Intent(id)
+            id1 = ih.generate_intent_id(3).replace("_", " ")
+            intent = Intent(id1)
             if intent.vector_norm > 0:
                 intent_list.append(intent)
...

src/nlp/app/nlp_server.py

-from fastapi import FastAPI, Request
+from fastapi import FastAPI
 from app.summary.summary_word_embedding import WordEmbeddingSummarizer
 from app.utilities import generator
...
@@ -14,6 +14,12 @@ app = FastAPI(
     " IntentFinder"
 )

+"""
+In this array, every summarization strategy should be instantiated exactly
+once.
+Note that every summarization strategy should implement the ISummaryStrategy
+interface.
+"""
 strategies = [
     SimpleSpacySummarizer(),
     SentenceEmbeddingSummarizer(),
...
@@ -28,20 +34,32 @@ async def root():
 @app.get("/strategies")
 async def api_strategies():
+    """
+    This function will generate a list of ids from all summarization strategies
+    :return: The list of all summarization-strategy-ids in JSON format
+    """
     res = []
     for strategy in strategies:
         res.append(strategy.id)
     return res

-@app.post("/summarize/{strategy_id}", summary="Generate a summary of the given"
+@app.get("/summarize/{strategy_id}", summary="Generate a summary of the given"
                                               " text.")
-async def summarize(strategy_id: str, req: Request):
+async def summarize(strategy_id: str, text: str):
+    """
+    This function will summarize a given text with a given summarization
+    strategy
+    :param strategy_id: The id of the strategy
+    :param text: The text to be summarized
+    :return: The summary, strategy and quality of the summary in JSON format
+    """
     for strategy in strategies:
         if strategy.id == strategy_id:
             quality = 0.5
-            req_json = await req.json()
-            summary = strategy.summarize(req_json["text"])
+            summary = strategy.summarize(text)
             return {"strategy": strategy_id, "quality": quality,
                     "summary": summary}
...
@@ -50,9 +68,9 @@ async def summarize(strategy_id: str, req: Request):
 @app.post("/intentid", summary="Generate an intent id from a given intent"
                                " text")
-async def generate_intent_id(intent: str, maxTokens: int):
+async def generate_intent_id(intent: str, max_tokens: int):
     """Generate a human readable reduced and yet expressive id for an intent
     based on the passed
     intent text.
     """
-    return generator.IntentHandler(intent).generate_intent_id(maxTokens)
+    return generator.IntentHandler(intent).generate_intent_id(max_tokens)
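
Since the summarize endpoint now takes the text as a query parameter instead of a JSON body, a call could look roughly like this; host, port and the strategy id are placeholders (the real ids come from GET /strategies):

    import requests

    resp = requests.get(
        "http://localhost:8000/summarize/some_strategy_id",  # placeholder id
        params={"text": "Ein langer Beispieltext, der zusammengefasst werden soll."},
    )
    print(resp.json())  # {"strategy": ..., "quality": ..., "summary": ...}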

src/nlp/app/summary/simple_spacy_summarizer.py

...
@@ -22,7 +22,7 @@ class SimpleSpacySummarizer(ISummaryStrategy):
         word_frequency = self.divide_into_tokens(doc)

         # now rank the sentences based on the word frequency
-        sent_rank = self.rank_the_sentences(doc, word_frequency, text)
+        sent_rank = self.rank_the_sentences(doc, word_frequency)

         # get frequency of words
         top_sentences = (sorted(sent_rank.values())[::-1])
...
@@ -32,7 +32,16 @@ class SimpleSpacySummarizer(ISummaryStrategy):
         result_text = self.create_the_summary(sent_rank, top_sent)
         return result_text

-    def divide_into_tokens(self, doc):
+    @staticmethod
+    def divide_into_tokens(doc):
+        """
+        This method generates a word frequency dict from a given document
+        :param doc: document created by spacy
+        :returns: word_frequency: a dict containing all words and the number of
+        their occurrences
+        :rtype: dict
+        """
         corpus = [sent.text.lower() for sent in doc.sents]
         cv = CountVectorizer(stop_words=list(STOP_WORDS))
         cv_fit = cv.fit_transform(corpus)
...
@@ -42,16 +51,20 @@ class SimpleSpacySummarizer(ISummaryStrategy):
         return word_frequency

-    def get_frequency_of_words(self, word_frequency):
-        # get high frequency words
-        val = sorted(word_frequency.values())
-
-        # gets relative frequency of words
-        higher_frequency = val[-1]
-        for word in word_frequency.keys():
-            word_frequency[word] = (word_frequency[word] / higher_frequency)
-
-    def rank_the_sentences(self, doc, word_frequency, text):
+    @staticmethod
+    def rank_the_sentences(doc, word_frequency):
+        """
+        This method creates a sentences ranking based on the word frequency
+        list created by divide_into_tokens
+        :param doc: document created by spacy
+        :param word_frequency: a dict containing all words and the number of
+        their occurrences
+        :returns: sent_rank: a list containing all sentences with their
+        associated ranking
+        :rtype: dict
+        """
         sent_rank = {}
         for sent in doc.sents:
             for word in sent:
...
@@ -63,7 +76,18 @@ class SimpleSpacySummarizer(ISummaryStrategy):
         return sent_rank

-    def create_the_summary(self, sent_rank, top_sent):
+    @staticmethod
+    def create_the_summary(sent_rank, top_sent):
+        """
+        This methods generates the summary
+        :param sent_rank: a list containing all sentences with their associated
+        ranking generated by rank_the_sentences
+        :param top_sent: the best ranked sentence
+        :returns: result_text: the summary
+        :rtype: str
+        """
         # create the summary
         summary = []
         for sent, strength in sent_rank.items():
...
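
For context, the dropped get_frequency_of_words helper normalised word counts by the most frequent word. A compact standalone sketch of that normalisation plus a frequency-based sentence ranking in the spirit of rank_the_sentences follows; the ranking body is collapsed in the diff, so the exact scoring rule here is an assumption:

    def normalise_frequencies(word_frequency):
        # divide every count by the highest count, as the removed helper did
        highest = max(word_frequency.values())
        return {word: count / highest for word, count in word_frequency.items()}

    def rank_sentences(sentences, word_frequency):
        # score each sentence by the summed (normalised) frequency of its words
        freq = normalise_frequencies(word_frequency)
        return {s: sum(freq.get(w.lower(), 0.0) for w in s.split())
                for s in sentences}

    print(rank_sentences(["the cat sat", "a dog"],
                         {"the": 2, "cat": 1, "sat": 1, "a": 1, "dog": 1}))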

src/nlp/app/summary/summarization_with_strategy_TFIDF.py

 from app.summary.summary_strategy_interface import ISummaryStrategy
 import spacy
-from heapq import nlargest
 import math
 from nltk.stem import WordNetLemmatizer
 import nltk

 nltk.download('wordnet')


 class SummaryTFIDF(ISummaryStrategy):
     nlpGer = spacy.load('de_core_news_sm')
-    nlpEng = spacy.load('en_core_web_sm')
     lemmatizer = WordNetLemmatizer()

     def __init__(self):
...
@@ -20,22 +19,18 @@ class SummaryTFIDF(ISummaryStrategy):
     def id(self):
         return self._id

-    def frequency_matrix(self, summary, language):
+    def frequency_matrix(self, summary):
         """
         This method creates a tf-idf-matrix which is a list with all sentences
         containing a list with all words in the sentence and their
         frequency as value
         :param summary: given text to summarize
-        :param language: language of the text
         :returns: freq_matrix: frequency matrix
         """
         freq_matrix = {}
-        if language == 'ger':
-            stopWords = self.nlpGer.Defaults.stop_words
-        elif language == 'en':
-            stopWords = self.nlpEng.Defaults.stop_words
+        stop_words = self.nlpGer.Defaults.stop_words

         for sent in summary:
             # dictionary with 'words' as the key
...
@@ -46,7 +41,7 @@ class SummaryTFIDF(ISummaryStrategy):
             for word in words:
                 word = self.lemmatizer.lemmatize(word)  # Lemmatize the word
-                if word not in stopWords:  # Reject stopWords
+                if word not in stop_words:  # Reject stop_words
                     if word in freq_table:
                         freq_table[word] += 1
                     else:
...
@@ -56,7 +51,8 @@ class SummaryTFIDF(ISummaryStrategy):
         return freq_matrix

-    def tf_matrix(self, freq_matrix):
+    @staticmethod
+    def tf_matrix(freq_matrix):
         """
         This method calculates the term frequency for every word
...
@@ -78,7 +74,8 @@ class SummaryTFIDF(ISummaryStrategy):
         return tf_matrix

-    def sentences_per_words(self, freq_matrix):
+    @staticmethod
+    def sentences_per_words(freq_matrix):
         """
         This methods returns a list with all words and how often a word is
         mentioned in a sentence
...
@@ -97,7 +94,8 @@ class SummaryTFIDF(ISummaryStrategy):
         return sent_per_words

-    def idf_matrix(self, freq_matrix, sent_per_words, total_sentences):
+    @staticmethod
+    def idf_matrix(freq_matrix, sent_per_words, total_sentences):
         """
         This methods calculates a idf score for every word
...
@@ -120,7 +118,8 @@ class SummaryTFIDF(ISummaryStrategy):
         return idf_matrix

-    def tf_idf_matrix(self, tf_matrix, idf_matrix):
+    @staticmethod
+    def tf_idf_matrix(tf_matrix, idf_matrix):
         """
         This methods calculates a tf-idf-score for every word
...
@@ -144,15 +143,16 @@ class SummaryTFIDF(ISummaryStrategy):
         return tf_idf_matrix

-    def score_sentences(self, tf_idf_matrix):
+    @staticmethod
+    def score_sentences(tf_idf_matrix):
         """
         This methods calculates a sentence score for every sentence based on
         the tf-idf-matrix
         :param tf_idf_matrix: tf-idf-matrix
-        :returns: sentenceScore: list of all sentences with sentence score
+        :returns: sentence_score: list of all sentences with sentence score
         """
-        sentenceScore = {}
+        sentence_score = {}
         for sent, f_table in tf_idf_matrix.items():
             total_tfidf_score_per_sentence = 0
...
@@ -162,12 +162,13 @@ class SummaryTFIDF(ISummaryStrategy):
                 total_tfidf_score_per_sentence += tf_idf_score
             if total_words_in_sentence != 0:
-                sentenceScore[sent] = total_tfidf_score_per_sentence / \
+                sentence_score[sent] = total_tfidf_score_per_sentence / \
                     total_words_in_sentence
-        return sentenceScore
+        return sentence_score

-    def average_score(self, sentence_score):
+    @staticmethod
+    def average_score(sentence_score):
         """
         This method calculates the average sentence score
...
@@ -183,14 +184,15 @@ class SummaryTFIDF(ISummaryStrategy):
         return average_sent_score

-    def create_summary_strat1(self, sentences, sentence_score, threshold):
+    @staticmethod
+    def create_summary(sentences, sentence_score, threshold):
         """
         This method returns a summary with all sentences having a higher
         sentence score than the threshold
         :param sentences: list of all sentences
         :param sentence_score: list of sentences with sentence score
-        :param threshold: threshhold for sentence score
+        :param threshold: threshold for sentence score
         :returns: summary: generated summary
         """
         summary = ''
...
@@ -202,86 +204,15 @@ class SummaryTFIDF(ISummaryStrategy):
         return summary[1:]

-    def create_summary_strat2(self, sentence_score, percentOfText):
-        """
-        This method returns a summary which length is a percentage of the
-        given text
-        :param sentence_score: list of sentences with sentence score
-        :param percentOfText: percentage of sentences in the summary in
-        relation to the given text
-        :returns: summary: generated summary
-        """
-        top_sentences = (sorted(sentence_score.values())[::-1])
-        percentOfText = percentOfText / 100
-        top_percent_sentence = int(percentOfText * len(top_sentences))
-        top_sent = top_sentences[:top_percent_sentence]
-        summary = ''
-        for sent, strength in sentence_score.items():
+    def summarize(self, text: str) -> str:
+        text = self.nlpGer(text)
...
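
The visible part of score_sentences averages the tf-idf values of the words in each sentence; a small self-contained sketch of that averaging is below. The tf and idf formulas used by SummaryTFIDF are collapsed in this diff, so the log10-based idf helper is an assumption:

    import math

    def score_sentences(tf_idf_matrix):
        # average tf-idf over the scored words of each sentence
        sentence_score = {}
        for sent, f_table in tf_idf_matrix.items():
            if len(f_table) != 0:
                sentence_score[sent] = sum(f_table.values()) / len(f_table)
        return sentence_score

    def idf(total_sentences, sentences_containing_word):
        # a common idf variant; the class's own formula is not shown here
        return math.log10(total_sentences / sentences_containing_word)

    print(score_sentences({"sent one": {"covid": 0.12, "test": 0.05},
                           "sent two": {"impfung": 0.20}}))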