Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
pse-trapp-public
IntentFinder
Commits
8bd5d6d9
Commit
8bd5d6d9
authored
May 25, 2021
by
Patrick Schlindwein
Browse files
Merge branch 'feat/#61-replace_nltk_with_spacy' into 'master'
Feat/#61 replace nltk with spacy
See merge request !46
parents
d6cd3e8e
242b36b4
Pipeline #71928 passed with stages in 2 minutes and 48 seconds
Changes
4
Pipelines
1
Show whitespace changes
Inline
Side-by-side
src/nlp/app/summary/summarization_with_strategy_TFIDF.py
View file @
8bd5d6d9
import
spacy
import
math
from
nltk.stem
import
WordNetLemmatizer
import
nltk
from
app.summary.summary_strategy_interface
import
ISummaryStrategy
nltk
.
download
(
'wordnet'
)
class
SummaryTFIDF
(
ISummaryStrategy
):
nlpGer
=
spacy
.
load
(
'de_core_news_sm'
)
lemmatizer
=
WordNetLemmatizer
()
def
__init__
(
self
):
self
.
_id
=
"tfidf"
...
...
@@ -37,11 +32,11 @@ class SummaryTFIDF(ISummaryStrategy):
# and their 'frequency' as the value
freq_table
=
{}
words
=
[
word
.
text
.
lower
()
for
word
in
sent
if
word
.
text
.
isalnum
()]
words
=
[
word
.
lemma_
.
lower
()
for
word
in
sent
if
word
.
text
.
isalnum
()]
# Lemmatize the word
for
word
in
words
:
word
=
self
.
lemmatizer
.
lemmatize
(
word
)
# Lemmatize the word
if
word
not
in
stop_words
:
# Reject stop_words
if
word
not
in
stop_words
:
# Reject stopWords
if
word
in
freq_table
:
freq_table
[
word
]
+=
1
else
:
...
...
src/nlp/app/summary/summary_sentence_embedding.py
View file @
8bd5d6d9
import
math
import
nltk
import
pandas
as
pd
from
sentence_transformers
import
SentenceTransformer
from
nltk.cluster
import
KMeansClusterer
from
nltk.cluster.util
import
cosine_distance
import
numpy
as
np
from
scipy.spatial
import
distance_matrix
import
spacy
from
app.summary.summary_strategy_interface
import
ISummaryStrategy
nltk
.
download
(
'punkt'
)
class
SentenceEmbeddingSummarizer
(
ISummaryStrategy
):
model
=
SentenceTransformer
(
'T-Systems-onsite/cross-en-de-roberta-sentence-transformer'
)
nlp
=
spacy
.
load
(
'de_core_news_sm'
)
def
__init__
(
self
):
self
.
_id
=
"sentence_embedding"
...
...
@@ -24,12 +25,12 @@ class SentenceEmbeddingSummarizer(ISummaryStrategy):
return
self
.
_id
def
summarize
(
self
,
text
:
str
,
max_length
:
int
)
->
str
:
# convert the article/passage to a list of sentences using nltk's
# sentence tokenizer.
sentences
=
nltk
.
sent_tokenize
(
text
)
text
=
self
.
nlp
(
text
)
# convert the article/passage to a list of sentences using spacy
sentences
=
list
(
text
.
sents
)
# strip leading and trailing spaces
sentences
=
[
sentence
.
strip
()
for
sentence
in
sentences
]
sentences
=
[
sentence
.
text
.
strip
()
for
sentence
in
sentences
]
# for applying different transformations of the data efficiently,
# transform to Pandas Dataframe
...
...
@@ -46,7 +47,7 @@ class SentenceEmbeddingSummarizer(ISummaryStrategy):
iterations
=
25
embeddings
=
np
.
array
(
data
[
'embeddings'
].
tolist
())
kclusterer
=
KMeansClusterer
(
num_clusters
,
distance
=
nltk
.
cluster
.
util
.
cosine_distance
,
num_clusters
,
distance
=
cosine_distance
,
repeats
=
iterations
,
avoid_empty_clusters
=
True
)
assigned_clusters
=
kclusterer
\
.
cluster
(
embeddings
,
assign_clusters
=
True
)
...
...
src/nlp/app/tests/test_summary_sentence_embedding.py
View file @
8bd5d6d9
import
math
import
nltk
from
unittest
import
TestCase
from
app.summary.summary_sentence_embedding
import
SentenceEmbeddingSummarizer
import
spacy
class
TestSummarizationSentenceEmbedding
(
TestCase
):
...
...
@@ -184,9 +184,10 @@ class TestSummarizationSentenceEmbedding(TestCase):
summary
=
summary_sentence_embedding
.
summarize
(
self
.
test_text
,
max_length
=
130
)
sentences
=
nltk
.
sent_tokenize
(
self
.
test_text
)
sentences
=
[
sentence
.
strip
()
for
sentence
in
sentences
]
nlp
=
spacy
.
load
(
'de_core_news_sm'
)
text
=
nlp
(
self
.
test_text
)
sentences
=
list
(
text
.
sents
)
sentences
=
[
sentence
.
text
.
strip
()
for
sentence
in
sentences
]
total_sentences_text
=
len
(
sentences
)
min_num_sentences_for_summary
=
1
...
...
@@ -200,8 +201,9 @@ class TestSummarizationSentenceEmbedding(TestCase):
elif
num_sentences_for_summary
>
max_num_sentences_for_summary
:
num_sentences_for_summary
=
max_num_sentences_for_summary
sentences_summary
=
nltk
.
sent_tokenize
(
summary
)
sentences_summary
=
[
sentenceSummary
.
strip
()
for
sentenceSummary
in
text_summary
=
nlp
(
summary
)
sentences_summary
=
list
(
text_summary
.
sents
)
sentences_summary
=
[
sentence
.
text
.
strip
()
for
sentence
in
sentences_summary
]
total_sentences_summary
=
len
(
sentences_summary
)
...
...
src/nlp/requirements.txt
View file @
8bd5d6d9
...
...
@@ -72,7 +72,6 @@ watchgod==0.7
wcwidth==0.2.5
websockets==8.1
sentence-transformers==1.1.0
nltk==3.6.2
pandas==1.2.4
scipy==1.6.2
protobuf==3.16.0
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment