Commit 242b36b4 authored by Nils König's avatar Nils König Committed by Patrick Schlindwein
Browse files

Feat/#61: replace NLTK with spaCy

parent da5f7a0a
import spacy import spacy
import math import math
from nltk.stem import WordNetLemmatizer
import nltk
from app.summary.summary_strategy_interface import ISummaryStrategy from app.summary.summary_strategy_interface import ISummaryStrategy
nltk.download('wordnet')
class SummaryTFIDF(ISummaryStrategy): class SummaryTFIDF(ISummaryStrategy):
nlpGer = spacy.load('de_core_news_sm') nlpGer = spacy.load('de_core_news_sm')
lemmatizer = WordNetLemmatizer()
def __init__(self): def __init__(self):
self._id = "tfidf" self._id = "tfidf"
...@@ -37,11 +32,11 @@ class SummaryTFIDF(ISummaryStrategy): ...@@ -37,11 +32,11 @@ class SummaryTFIDF(ISummaryStrategy):
# and their 'frequency' as the value # and their 'frequency' as the value
freq_table = {} freq_table = {}
words = [word.text.lower() for word in sent if word.text.isalnum()] words = [word.lemma_.lower() for word in sent if
word.text.isalnum()] # Lemmatize the word
for word in words: for word in words:
word = self.lemmatizer.lemmatize(word) # Lemmatize the word if word not in stop_words: # Reject stopWords
if word not in stop_words: # Reject stop_words
if word in freq_table: if word in freq_table:
freq_table[word] += 1 freq_table[word] += 1
else: else:
......
import math import math
import nltk
import pandas as pd import pandas as pd
from sentence_transformers import SentenceTransformer from sentence_transformers import SentenceTransformer
from nltk.cluster import KMeansClusterer from nltk.cluster import KMeansClusterer
from nltk.cluster.util import cosine_distance
import numpy as np import numpy as np
from scipy.spatial import distance_matrix from scipy.spatial import distance_matrix
import spacy
from app.summary.summary_strategy_interface import ISummaryStrategy from app.summary.summary_strategy_interface import ISummaryStrategy
nltk.download('punkt')
class SentenceEmbeddingSummarizer(ISummaryStrategy): class SentenceEmbeddingSummarizer(ISummaryStrategy):
model = SentenceTransformer( model = SentenceTransformer(
'T-Systems-onsite/cross-en-de-roberta-sentence-transformer') 'T-Systems-onsite/cross-en-de-roberta-sentence-transformer')
nlp = spacy.load('de_core_news_sm')
def __init__(self): def __init__(self):
self._id = "sentence_embedding" self._id = "sentence_embedding"
...@@ -24,12 +25,12 @@ class SentenceEmbeddingSummarizer(ISummaryStrategy): ...@@ -24,12 +25,12 @@ class SentenceEmbeddingSummarizer(ISummaryStrategy):
return self._id return self._id
def summarize(self, text: str, max_length: int) -> str: def summarize(self, text: str, max_length: int) -> str:
# convert the article/passage to a list of sentences using nltk’s text = self.nlp(text)
# sentence tokenizer. # convert the article/passage to a list of sentences using spacy
sentences = nltk.sent_tokenize(text) sentences = list(text.sents)
# strip leading and trailing spaces # strip leading and trailing spaces
sentences = [sentence.strip() for sentence in sentences] sentences = [sentence.text.strip() for sentence in sentences]
# for applying different transformations of the data efficiently, # for applying different transformations of the data efficiently,
# transform to Pandas Dataframe # transform to Pandas Dataframe
...@@ -46,7 +47,7 @@ class SentenceEmbeddingSummarizer(ISummaryStrategy): ...@@ -46,7 +47,7 @@ class SentenceEmbeddingSummarizer(ISummaryStrategy):
iterations = 25 iterations = 25
embeddings = np.array(data['embeddings'].tolist()) embeddings = np.array(data['embeddings'].tolist())
kclusterer = KMeansClusterer( kclusterer = KMeansClusterer(
num_clusters, distance=nltk.cluster.util.cosine_distance, num_clusters, distance=cosine_distance,
repeats=iterations, avoid_empty_clusters=True) repeats=iterations, avoid_empty_clusters=True)
assigned_clusters = kclusterer \ assigned_clusters = kclusterer \
.cluster(embeddings, assign_clusters=True) .cluster(embeddings, assign_clusters=True)
......
import math import math
import nltk
from unittest import TestCase from unittest import TestCase
from app.summary.summary_sentence_embedding import SentenceEmbeddingSummarizer from app.summary.summary_sentence_embedding import SentenceEmbeddingSummarizer
import spacy
class TestSummarizationSentenceEmbedding(TestCase): class TestSummarizationSentenceEmbedding(TestCase):
...@@ -184,9 +184,10 @@ class TestSummarizationSentenceEmbedding(TestCase): ...@@ -184,9 +184,10 @@ class TestSummarizationSentenceEmbedding(TestCase):
summary = summary_sentence_embedding.summarize(self.test_text, summary = summary_sentence_embedding.summarize(self.test_text,
max_length=130) max_length=130)
sentences = nltk.sent_tokenize(self.test_text) nlp = spacy.load('de_core_news_sm')
text = nlp(self.test_text)
sentences = [sentence.strip() for sentence in sentences] sentences = list(text.sents)
sentences = [sentence.text.strip() for sentence in sentences]
total_sentences_text = len(sentences) total_sentences_text = len(sentences)
min_num_sentences_for_summary = 1 min_num_sentences_for_summary = 1
...@@ -200,8 +201,9 @@ class TestSummarizationSentenceEmbedding(TestCase): ...@@ -200,8 +201,9 @@ class TestSummarizationSentenceEmbedding(TestCase):
elif num_sentences_for_summary > max_num_sentences_for_summary: elif num_sentences_for_summary > max_num_sentences_for_summary:
num_sentences_for_summary = max_num_sentences_for_summary num_sentences_for_summary = max_num_sentences_for_summary
sentences_summary = nltk.sent_tokenize(summary) text_summary = nlp(summary)
sentences_summary = [sentenceSummary.strip() for sentenceSummary in sentences_summary = list(text_summary.sents)
sentences_summary = [sentence.text.strip() for sentence in
sentences_summary] sentences_summary]
total_sentences_summary = len(sentences_summary) total_sentences_summary = len(sentences_summary)
......
...@@ -72,7 +72,6 @@ watchgod==0.7 ...@@ -72,7 +72,6 @@ watchgod==0.7
wcwidth==0.2.5
websockets==8.1
sentence-transformers==1.1.0
nltk==3.6.2
pandas==1.2.4
scipy==1.6.2
protobuf==3.16.0
\ No newline at end of file
Markdown is supported
0% — Attach a file by drag & drop or click to upload.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment