Commit 8bd5d6d9 authored by Patrick Schlindwein
Browse files

Merge branch 'feat/#61-replace_nltk_with_spacy' into 'master'

Feat/#61 replace nltk with spacy

See merge request !46
parents d6cd3e8e 242b36b4
Pipeline #71928 passed with stages
in 2 minutes and 48 seconds
import spacy
import math
from nltk.stem import WordNetLemmatizer
import nltk
from app.summary.summary_strategy_interface import ISummaryStrategy
nltk.download('wordnet')
class SummaryTFIDF(ISummaryStrategy):
nlpGer = spacy.load('de_core_news_sm')
lemmatizer = WordNetLemmatizer()
def __init__(self):
self._id = "tfidf"
......@@ -37,11 +32,11 @@ class SummaryTFIDF(ISummaryStrategy):
# and their 'frequency' as the value
freq_table = {}
words = [word.text.lower() for word in sent if word.text.isalnum()]
words = [word.lemma_.lower() for word in sent if
word.text.isalnum()] # Lemmatize the word
for word in words:
word = self.lemmatizer.lemmatize(word) # Lemmatize the word
if word not in stop_words: # Reject stop_words
if word not in stop_words: # Reject stopWords
if word in freq_table:
freq_table[word] += 1
else:
......
import math
import nltk
import pandas as pd
from sentence_transformers import SentenceTransformer
from nltk.cluster import KMeansClusterer
from nltk.cluster.util import cosine_distance
import numpy as np
from scipy.spatial import distance_matrix
import spacy
from app.summary.summary_strategy_interface import ISummaryStrategy
nltk.download('punkt')
class SentenceEmbeddingSummarizer(ISummaryStrategy):
model = SentenceTransformer(
'T-Systems-onsite/cross-en-de-roberta-sentence-transformer')
nlp = spacy.load('de_core_news_sm')
def __init__(self):
self._id = "sentence_embedding"
......@@ -24,12 +25,12 @@ class SentenceEmbeddingSummarizer(ISummaryStrategy):
return self._id
def summarize(self, text: str, max_length: int) -> str:
# convert the article/passage to a list of sentences using nltk’s
# sentence tokenizer.
sentences = nltk.sent_tokenize(text)
text = self.nlp(text)
# convert the article/passage to a list of sentences using spacy
sentences = list(text.sents)
# strip leading and trailing spaces
sentences = [sentence.strip() for sentence in sentences]
sentences = [sentence.text.strip() for sentence in sentences]
# for applying different transformations of the data efficiently,
# transform to Pandas Dataframe
......@@ -46,7 +47,7 @@ class SentenceEmbeddingSummarizer(ISummaryStrategy):
iterations = 25
embeddings = np.array(data['embeddings'].tolist())
kclusterer = KMeansClusterer(
num_clusters, distance=nltk.cluster.util.cosine_distance,
num_clusters, distance=cosine_distance,
repeats=iterations, avoid_empty_clusters=True)
assigned_clusters = kclusterer \
.cluster(embeddings, assign_clusters=True)
......
import math
import nltk
from unittest import TestCase
from app.summary.summary_sentence_embedding import SentenceEmbeddingSummarizer
import spacy
class TestSummarizationSentenceEmbedding(TestCase):
......@@ -184,9 +184,10 @@ class TestSummarizationSentenceEmbedding(TestCase):
summary = summary_sentence_embedding.summarize(self.test_text,
max_length=130)
sentences = nltk.sent_tokenize(self.test_text)
sentences = [sentence.strip() for sentence in sentences]
nlp = spacy.load('de_core_news_sm')
text = nlp(self.test_text)
sentences = list(text.sents)
sentences = [sentence.text.strip() for sentence in sentences]
total_sentences_text = len(sentences)
min_num_sentences_for_summary = 1
......@@ -200,8 +201,9 @@ class TestSummarizationSentenceEmbedding(TestCase):
elif num_sentences_for_summary > max_num_sentences_for_summary:
num_sentences_for_summary = max_num_sentences_for_summary
sentences_summary = nltk.sent_tokenize(summary)
sentences_summary = [sentenceSummary.strip() for sentenceSummary in
text_summary = nlp(summary)
sentences_summary = list(text_summary.sents)
sentences_summary = [sentence.text.strip() for sentence in
sentences_summary]
total_sentences_summary = len(sentences_summary)
......
......@@ -72,7 +72,6 @@ watchgod==0.7
wcwidth==0.2.5
websockets==8.1
sentence-transformers==1.1.0
nltk==3.6.2
pandas==1.2.4
scipy==1.6.2
protobuf==3.16.0
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment