Commit d0a88176 authored by Tim Seegmüller

[feat,#58] implemented loading the models only once

parent a2821916
......@@ -8,7 +8,10 @@ from app.summary.summary_word_embedding import WordEmbeddingSummarizer
from app.utilities import generator
from app.questiongenerator import QuestionGenerator
from pydantic import BaseModel
from app.summary.summary_sentence_embedding import SentenceEmbeddingSummarizer
from app.summary.summary_word_embedding import WordEmbeddingSummarizer
from app.utilities import generator
from app.utilities.models import nlp, bert_model, bert_tokenizer
class Item(BaseModel):
text: str
......@@ -16,6 +19,7 @@ class Item(BaseModel):
answer_style: str
app = FastAPI(
title="IntentFinder: NLP-API",
version="1.0",
......@@ -31,11 +35,11 @@ Note that every summarization strategy should implement the ISummaryStrategy
interface.
"""
strategies = [
SimpleSpacySummarizer(),
SentenceEmbeddingSummarizer(),
SummaryTFIDF(),
BertSummary(),
WordEmbeddingSummarizer()]
SimpleSpacySummarizer(_nlp=nlp),
SentenceEmbeddingSummarizer(_nlp=nlp),
SummaryTFIDF(_nlp=nlp),
BertSummary(_bert_model=bert_model, _bert_tokenizer=bert_tokenizer),
WordEmbeddingSummarizer(_nlp=nlp)]
@app.post("/questionGenerator")
......
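The docstring above states that every summarization strategy implements ISummaryStrategy. That interface file is not part of this commit; a minimal sketch of what it plausibly looks like, inferred from the id properties and summarize signatures in the hunks below (the abstract-base-class details are an assumption):

# Hypothetical sketch of ISummaryStrategy, inferred from this diff; the actual
# app/summary/summary_strategy_interface.py may differ.
from abc import ABC, abstractmethod

class ISummaryStrategy(ABC):
    @property
    @abstractmethod
    def id(self) -> str:
        """Unique identifier of the strategy, e.g. 'bert' or 'tfidf'."""

    @abstractmethod
    def summarize(self, text: str, num_sentences: int) -> str:
        """Return a summary of text that is at most num_sentences long."""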
......@@ -2,12 +2,12 @@ from sklearn.feature_extraction.text import CountVectorizer
from spacy.lang.de.stop_words import STOP_WORDS
from app.summary.summary_strategy_interface import ISummaryStrategy
from app.utilities.modules import nlp
class SimpleSpacySummarizer(ISummaryStrategy):
def __init__(self):
def __init__(self, _nlp):
self._id = "simple_spacy_summarizer"
self._nlp = _nlp
super().__init__()
@property
......@@ -15,7 +15,8 @@ class SimpleSpacySummarizer(ISummaryStrategy):
return self._id
def summarize(self, text: str, num_sentences: int):
doc = nlp(text)
doc = self._nlp(text)
# Divide into tokens, vectorize and remove stop words
word_frequency = self.divide_into_tokens(doc)
......
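The divide_into_tokens helper called above is collapsed in this hunk, so its body is not shown by this commit. Purely as an illustration of the technique hinted at by the comment and the STOP_WORDS import (counting non-stop-word token frequencies), a hypothetical standalone version could look like this; it is not the project's actual implementation:

# Hypothetical illustration only; the real divide_into_tokens is a method on
# SimpleSpacySummarizer and may differ.
from string import punctuation
from spacy.lang.de.stop_words import STOP_WORDS

def divide_into_tokens(doc):
    """Count how often each non-stop-word token occurs in the spaCy doc."""
    word_frequency = {}
    for token in doc:
        word = token.text.lower()
        if word in STOP_WORDS or word in punctuation or word == '\n':
            continue
        word_frequency[word] = word_frequency.get(word, 0) + 1
    return word_frequency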
import math
from app.summary.summary_strategy_interface import ISummaryStrategy
from app.utilities.modules import nlp
class SummaryTFIDF(ISummaryStrategy):
def __init__(self):
def __init__(self, _nlp):
self._id = "tfidf"
self._nlp = _nlp
super().__init__()
@property
......@@ -24,7 +25,7 @@ class SummaryTFIDF(ISummaryStrategy):
"""
freq_matrix = {}
stop_words = nlp.Defaults.stop_words
stop_words = self._nlp.Defaults.stop_words
for sent in summary:
# dictionary with 'words' as the key
......@@ -191,7 +192,7 @@ class SummaryTFIDF(ISummaryStrategy):
return summary
def summarize(self, text: str, num_sentences: int) -> str:
text = nlp(text)
text = self._nlp(text)
# put all sentences in a list
sentences = list(text.sents)
......
from summarizer import Summarizer
from app.summary.summary_strategy_interface import ISummaryStrategy
from app.utilities.modules import bert_german_model, bert_german_tokenizer
class BertSummary(ISummaryStrategy):
def __init__(self):
def __init__(self, _bert_model, _bert_tokenizer):
self._id = "bert"
self._bert_model = _bert_model
self._bert_tokenizer = _bert_tokenizer
super().__init__()
@property
......@@ -27,7 +28,7 @@ class BertSummary(ISummaryStrategy):
# The library summarizes text differently otherwise, so a custom
# summarizer has to be created with a model and tokenizer
custom_summarizer = Summarizer(custom_model=bert_german_model,
custom_tokenizer=bert_german_tokenizer)
custom_summarizer = Summarizer(custom_model=self._bert_model,
custom_tokenizer=self._bert_tokenizer)
model = custom_summarizer
return model(text, num_sentences=num_sentences, use_first=False)
......@@ -5,12 +5,13 @@ from nltk.cluster.util import cosine_distance
from scipy.spatial import distance_matrix
from app.summary.summary_strategy_interface import ISummaryStrategy
from app.utilities.modules import nlp, model
from app.utilities.models import sentence_transformer
class SentenceEmbeddingSummarizer(ISummaryStrategy):
def __init__(self):
def __init__(self, _nlp):
self._id = "sentence_embedding"
self._nlp = _nlp
super().__init__()
@property
......@@ -18,7 +19,7 @@ class SentenceEmbeddingSummarizer(ISummaryStrategy):
return self._id
def summarize(self, text: str, num_sentences: int) -> str:
text = nlp(text)
text = self._nlp(text)
# convert the article/passage to a list of sentences using spacy
sentences = list(text.sents)
......@@ -32,7 +33,8 @@ class SentenceEmbeddingSummarizer(ISummaryStrategy):
# create new column 'embeddings using
data['embeddings'] = data['sentence']. \
apply(SentenceEmbeddingSummarizer.__get_sentence_embeddings)
apply(
SentenceEmbeddingSummarizer.__get_sentence_embeddings)
# cluster sentences that are contextually similar
num_clusters = num_sentences
......@@ -61,7 +63,7 @@ class SentenceEmbeddingSummarizer(ISummaryStrategy):
:param sentence: single sentence
:return: vector from given sentence
"""
embedding = model.encode([sentence])
embedding = sentence_transformer.encode([sentence])
return embedding[0]
@staticmethod
......
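The step referenced by the comment "cluster sentences that are contextually similar" is collapsed in this hunk. Purely as an illustration of that technique, a minimal sketch follows; the use of sklearn's KMeans and the helper name are assumptions, not the project's actual code:

# Hypothetical sketch: cluster sentence embeddings and keep one sentence per cluster.
import numpy as np
from sklearn.cluster import KMeans

def pick_representative_sentences(sentences, embeddings, num_sentences):
    """Cluster the embeddings and keep the sentence closest to each centroid."""
    kmeans = KMeans(n_clusters=num_sentences, random_state=0).fit(embeddings)
    chosen = []
    for centroid in kmeans.cluster_centers_:
        distances = np.linalg.norm(np.asarray(embeddings) - centroid, axis=1)
        chosen.append(int(np.argmin(distances)))
    # keep the original sentence order so the summary reads naturally
    return [sentences[i] for i in sorted(set(chosen))]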
......@@ -4,12 +4,12 @@ from typing import Any
from spacy.lang.de.stop_words import STOP_WORDS
from app.summary.summary_strategy_interface import ISummaryStrategy
from app.utilities.modules import nlp
class WordEmbeddingSummarizer(ISummaryStrategy):
def __init__(self):
def __init__(self, _nlp):
self._id = "word_embedding"
self._nlp = _nlp
super().__init__()
@property
......@@ -18,7 +18,7 @@ class WordEmbeddingSummarizer(ISummaryStrategy):
def summarize(self, text: str, num_sentences: int) -> str:
extra_words = list(STOP_WORDS) + list(punctuation) + ['\n']
docx = nlp(text)
docx = self._nlp(text)
# Technique for building a vocabulary
freq_word: dict[Any, float] = self.__get_vocab(docx, extra_words)
# Headline generation:
......
......@@ -2,6 +2,7 @@ from unittest import TestCase
from app.summary.simple_spacy_summarizer import SimpleSpacySummarizer
from app.tests.test_constants import test_text, num_sentences
from app.utilities.models import nlp
class TestBasicSimpleSpacySummarization(TestCase):
......@@ -12,7 +13,7 @@ class TestBasicSimpleSpacySummarization(TestCase):
# init test data
test_id = "simple_spacy_summarizer"
summarizer = SimpleSpacySummarizer()
summarizer = SimpleSpacySummarizer(_nlp=nlp)
self.assertEqual(test_id, summarizer.id)
def test_text_shortening(self):
......@@ -21,7 +22,7 @@ class TestBasicSimpleSpacySummarization(TestCase):
"""
# init test data
summarizer = SimpleSpacySummarizer()
summarizer = SimpleSpacySummarizer(_nlp=nlp)
summarized_text = summarizer.summarize(test_text,
num_sentences=num_sentences)
self.assertLess(len(summarized_text), len(test_text))
......@@ -2,19 +2,19 @@ from unittest import TestCase
from app.summary.summarization_with_strategy_TFIDF import SummaryTFIDF
from app.tests.test_constants import test_text, num_sentences
from app.utilities.modules import nlp
from app.utilities.models import nlp
class TestSummarizationSpacy(TestCase):
def test_summarize_text(self):
summary_tfidf = SummaryTFIDF()
summary_tfidf = SummaryTFIDF(_nlp=nlp)
summary = summary_tfidf.summarize(test_text,
num_sentences=num_sentences)
assert type(summary) == str
def test_summarize_text_length(self):
summary_tfidf = SummaryTFIDF()
summary_tfidf = SummaryTFIDF(_nlp=nlp)
summary = summary_tfidf \
.summarize(test_text, num_sentences=num_sentences)
......
......@@ -2,30 +2,34 @@ from unittest import TestCase
from app.summary.summary_bert import BertSummary
from app.tests.test_constants import test_text, num_sentences
from app.utilities.modules import nlp
from app.utilities.models import nlp, bert_tokenizer, bert_model
class TestBertSummarization(TestCase):
def test_strategy_id(self):
test_id = "bert"
summarizer = BertSummary()
summarizer = BertSummary(_bert_model=bert_model,
_bert_tokenizer=bert_tokenizer)
self.assertEqual(test_id, summarizer.id)
def test_text_shortening(self):
summarizer = BertSummary()
summarizer = BertSummary(_bert_model=bert_model,
_bert_tokenizer=bert_tokenizer)
summarized_text = summarizer.summarize(test_text,
num_sentences=num_sentences)
self.assertLess(len(summarized_text), len(test_text))
def test_text_str(self):
summary_bert = BertSummary()
summary = summary_bert.summarize(test_text,
num_sentences=num_sentences)
summarizer = BertSummary(_bert_model=bert_model,
_bert_tokenizer=bert_tokenizer)
summary = summarizer.summarize(test_text,
num_sentences=num_sentences)
assert type(summary) == str
def test_ratio_value(self):
sum_bert = BertSummary()
summary = sum_bert.summarize(test_text,
num_sentences=num_sentences)
summarizer = BertSummary(_bert_model=bert_model,
_bert_tokenizer=bert_tokenizer)
summary = summarizer.summarize(test_text,
num_sentences=num_sentences)
self.assertLessEqual(len(list(nlp(summary).sents)),
num_sentences)
......@@ -2,19 +2,19 @@ from unittest import TestCase
from app.summary.summary_sentence_embedding import SentenceEmbeddingSummarizer
from app.tests.test_constants import test_text, num_sentences
from app.utilities.modules import nlp
from app.utilities.models import nlp
class TestSummarizationSentenceEmbedding(TestCase):
def test_summarize_text(self):
summary_sentence_embedding = SentenceEmbeddingSummarizer()
summary_sentence_embedding = SentenceEmbeddingSummarizer(_nlp=nlp)
summary = summary_sentence_embedding \
.summarize(test_text, num_sentences=num_sentences)
assert type(summary) == str
def test_summarize_text_length(self):
summary_sentence_embedding = SentenceEmbeddingSummarizer()
summary_sentence_embedding = SentenceEmbeddingSummarizer(_nlp=nlp)
summary = summary_sentence_embedding \
.summarize(test_text, num_sentences=num_sentences)
......
......@@ -2,12 +2,12 @@ from unittest import TestCase
from app.summary.summary_word_embedding import WordEmbeddingSummarizer
from app.tests.test_constants import test_text, num_sentences
from app.utilities.modules import nlp
from app.utilities.models import nlp
class TestSummarizationSentenceEmbedding(TestCase):
def test_summarize_text(self):
summary_word_embedding = WordEmbeddingSummarizer()
summary_word_embedding = WordEmbeddingSummarizer(_nlp=nlp)
summary = summary_word_embedding \
.summarize(test_text, num_sentences=num_sentences)
......@@ -15,7 +15,7 @@ class TestSummarizationSentenceEmbedding(TestCase):
assert type(summary) == str
def test_summarize_text_length(self):
summary_sentence_embedding = WordEmbeddingSummarizer()
summary_sentence_embedding = WordEmbeddingSummarizer(_nlp=nlp)
summary = summary_sentence_embedding \
.summarize(test_text, num_sentences=num_sentences)
......
......@@ -3,9 +3,9 @@ from sentence_transformers import SentenceTransformer
from transformers import BertModel, BertTokenizer
nlp = spacy.load('de_core_news_sm')
model = SentenceTransformer(
'T-Systems-onsite/cross-en-de-roberta-sentence-transformer')
bert_german_model = BertModel.from_pretrained(
'bert-base-german-cased', output_hidden_states=True)
bert_german_tokenizer = BertTokenizer.from_pretrained(
'bert-base-german-cased')
sentence_transformer = SentenceTransformer(
'T-Systems-onsite/cross-en-de-roberta-sentence-transformer')
bert_model = BertModel.from_pretrained(
'bert-base-german-cased', output_hidden_states=True)
bert_tokenizer = BertTokenizer.from_pretrained(
'bert-base-german-cased')
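With the new app/utilities/models.py, the spaCy pipeline, the sentence transformer and the BERT model/tokenizer are created once at import time, and every summarizer receives the shared instances through its constructor. A small usage sketch, grounded in the hunks above (the German example text is made up):

# Usage sketch: the heavy objects are loaded once and injected everywhere.
from app.utilities.models import nlp, bert_model, bert_tokenizer
from app.summary.simple_spacy_summarizer import SimpleSpacySummarizer
from app.summary.summary_bert import BertSummary

spacy_summarizer = SimpleSpacySummarizer(_nlp=nlp)
bert_summarizer = BertSummary(_bert_model=bert_model,
                              _bert_tokenizer=bert_tokenizer)

summary = bert_summarizer.summarize("Ein langer deutscher Text ...",
                                    num_sentences=2)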
......@@ -75,8 +75,7 @@ sentence-transformers==1.1.0
pandas==1.2.4
scipy==1.6.2
protobuf==3.16.0
summarizer~=0.0.7
summarizer==0.0.7
bert-extractive-summarizer==0.7.1
sacremoses==0.0.43
sentencepiece==0.1.94
tokenizers==0.9.4
torch==1.7.1
sentencepiece==0.1.94
\ No newline at end of file