Commit f1eb645e authored by Christof Walther, committed by Patrick Schlindwein
Browse files

#65 add max length parameter to ISummaryStrategy

parent f9651faf
...@@ -47,11 +47,12 @@ async def api_strategies(): ...@@ -47,11 +47,12 @@ async def api_strategies():
@app.get("/summarize/{strategy_id}", summary="Generate a summary of the given" @app.get("/summarize/{strategy_id}", summary="Generate a summary of the given"
" text.") " text.")
async def summarize(strategy_id: str, text: str): async def summarize(strategy_id: str, max_length: int, text: str):
""" """
This function will summarize a given text with a given summarization This function will summarize a given text with a given summarization
strategy strategy
:param max_length: max number of characters in the summarization
:param strategy_id: The id of the strategy :param strategy_id: The id of the strategy
:param text: The text to be summarized :param text: The text to be summarized
:return: The summary, strategy and quality of the summary in JSON format :return: The summary, strategy and quality of the summary in JSON format
...@@ -59,7 +60,7 @@ async def summarize(strategy_id: str, text: str): ...@@ -59,7 +60,7 @@ async def summarize(strategy_id: str, text: str):
for strategy in strategies: for strategy in strategies:
if strategy.id == strategy_id: if strategy.id == strategy_id:
quality = 0.5 quality = 0.5
summary = strategy.summarize(text) summary = strategy.summarize(text, max_length)
return {"strategy": strategy_id, "quality": quality, return {"strategy": strategy_id, "quality": quality,
"summary": summary} "summary": summary}
......
...@@ -15,7 +15,7 @@ class SimpleSpacySummarizer(ISummaryStrategy): ...@@ -15,7 +15,7 @@ class SimpleSpacySummarizer(ISummaryStrategy):
def id(self): def id(self):
return self._id return self._id
def summarize(self, text: str): def summarize(self, text: str, max_length: int):
doc = self.nlp(text) doc = self.nlp(text)
# Divide into tokens, vectorize and remove stop words # Divide into tokens, vectorize and remove stop words
......
from app.summary.summary_strategy_interface import ISummaryStrategy
import spacy import spacy
import math import math
from nltk.stem import WordNetLemmatizer from nltk.stem import WordNetLemmatizer
import nltk import nltk
from app.summary.summary_strategy_interface import ISummaryStrategy
nltk.download('wordnet') nltk.download('wordnet')
...@@ -204,7 +204,7 @@ class SummaryTFIDF(ISummaryStrategy): ...@@ -204,7 +204,7 @@ class SummaryTFIDF(ISummaryStrategy):
return summary[1:] return summary[1:]
def summarize(self, text: str) -> str: def summarize(self, text: str, max_length: int) -> str:
text = self.nlpGer(text) text = self.nlpGer(text)
# put all sentences in a list # put all sentences in a list
......
...@@ -23,7 +23,7 @@ class SentenceEmbeddingSummarizer(ISummaryStrategy): ...@@ -23,7 +23,7 @@ class SentenceEmbeddingSummarizer(ISummaryStrategy):
def id(self): def id(self):
return self._id return self._id
def summarize(self, text: str) -> str: def summarize(self, text: str, max_length: int) -> str:
# convert the article/passage to a list of sentences using nltk’s # convert the article/passage to a list of sentences using nltk’s
# sentence tokenizer. # sentence tokenizer.
sentences = nltk.sent_tokenize(text) sentences = nltk.sent_tokenize(text)
......
...@@ -11,11 +11,12 @@ class ISummaryStrategy(ABC): ...@@ -11,11 +11,12 @@ class ISummaryStrategy(ABC):
raise NotImplementedError raise NotImplementedError
@abstractmethod @abstractmethod
def summarize(self, text: str) -> str: def summarize(self, text: str, max_length: int) -> str:
""" """
This methods generates the summary to a given text This methods generates the summary to a given text
:param text: text to summarize :param text: text to summarize
:param max_length: max number of characters in the summarization
:returns: summary: a string representing a summarized version of the :returns: summary: a string representing a summarized version of the
input text input text
""" """
......
...@@ -18,7 +18,7 @@ class WordEmbeddingSummarizer(ISummaryStrategy): ...@@ -18,7 +18,7 @@ class WordEmbeddingSummarizer(ISummaryStrategy):
def id(self): def id(self):
return self._id return self._id
def summarize(self, text: str) -> str: def summarize(self, text: str, max_length: int) -> str:
extra_words = list(STOP_WORDS) + list(punctuation) + ['\n'] extra_words = list(STOP_WORDS) + list(punctuation) + ['\n']
docx = self.nlp(text) docx = self.nlp(text)
# Technik um ein Vokabular anzulegen # Technik um ein Vokabular anzulegen
......
...@@ -12,7 +12,7 @@ class TestStrategy1(ISummaryStrategy): ...@@ -12,7 +12,7 @@ class TestStrategy1(ISummaryStrategy):
id = "test1" id = "test1"
def summarize(self, text: str) -> str: def summarize(self, text: str, max_length: int) -> str:
return text return text
...@@ -24,7 +24,7 @@ class TestStrategy2(ISummaryStrategy): ...@@ -24,7 +24,7 @@ class TestStrategy2(ISummaryStrategy):
id = "test2" id = "test2"
def summarize(self, text: str) -> str: def summarize(self, text: str, max_length: int) -> str:
return "result text" return "result text"
...@@ -65,8 +65,10 @@ class TestNlpApi(TestCase): ...@@ -65,8 +65,10 @@ class TestNlpApi(TestCase):
client = TestClient(app.nlp_server.app) client = TestClient(app.nlp_server.app)
# run test # run test
response1 = client.get("/summarize/test1?text=test%20text") response1 = client.get(
response2 = client.get("/summarize/test2?text=test%20text") "/summarize/test1?max_length=130&text=test%20text")
response2 = client.get(
"/summarize/test2?max_length=130&text=test%20text")
# assert result # assert result
assert response1.status_code == 200 assert response1.status_code == 200
......
...@@ -53,5 +53,5 @@ class TestBasicSimpleSpacySummarization(TestCase): ...@@ -53,5 +53,5 @@ class TestBasicSimpleSpacySummarization(TestCase):
# init test data # init test data
summarizer = SimpleSpacySummarizer() summarizer = SimpleSpacySummarizer()
summarized_text = summarizer.summarize(self.test_text) summarized_text = summarizer.summarize(self.test_text, max_length=130)
self.assertLess(len(summarized_text), len(self.test_text)) self.assertLess(len(summarized_text), len(self.test_text))
...@@ -165,5 +165,5 @@ class TestSummarizationSpacy(TestCase): ...@@ -165,5 +165,5 @@ class TestSummarizationSpacy(TestCase):
def test_summarize_text(self): def test_summarize_text(self):
summary_tfidf = SummaryTFIDF() summary_tfidf = SummaryTFIDF()
summary = summary_tfidf.summarize(self.test_text) summary = summary_tfidf.summarize(self.test_text, max_length=130)
assert type(summary) == str assert type(summary) == str
import math import math
from unittest import TestCase
import nltk import nltk
from unittest import TestCase
from app.summary.summary_sentence_embedding import SentenceEmbeddingSummarizer from app.summary.summary_sentence_embedding import SentenceEmbeddingSummarizer
...@@ -172,7 +171,8 @@ class TestSummarizationSentenceEmbedding(TestCase): ...@@ -172,7 +171,8 @@ class TestSummarizationSentenceEmbedding(TestCase):
summary_sentence_embedding = SentenceEmbeddingSummarizer() summary_sentence_embedding = SentenceEmbeddingSummarizer()
summary = summary_sentence_embedding.summarize(self.test_text) summary = summary_sentence_embedding.summarize(self.test_text,
max_length=130)
assert type(summary) == str assert type(summary) == str
def test_summarize_text_length(self): def test_summarize_text_length(self):
...@@ -182,14 +182,18 @@ class TestSummarizationSentenceEmbedding(TestCase): ...@@ -182,14 +182,18 @@ class TestSummarizationSentenceEmbedding(TestCase):
summary_sentence_embedding = SentenceEmbeddingSummarizer() summary_sentence_embedding = SentenceEmbeddingSummarizer()
summary = summary_sentence_embedding.summarize(self.test_text) summary = summary_sentence_embedding.summarize(self.test_text,
max_length=130)
sentences = nltk.sent_tokenize(self.test_text) sentences = nltk.sent_tokenize(self.test_text)
sentences = [sentence.strip() for sentence in sentences] sentences = [sentence.strip() for sentence in sentences]
total_sentences_text = len(sentences) total_sentences_text = len(sentences)
min_num_sentences_for_summary = 1 min_num_sentences_for_summary = 1
max_num_sentences_for_summary = 10 max_num_sentences_for_summary = 10
num_sentences_for_summary = math.floor(total_sentences_text * 0.3) num_sentences_in_percent = 0.3
num_sentences_for_summary = math.floor(total_sentences_text *
num_sentences_in_percent)
if num_sentences_for_summary < min_num_sentences_for_summary: if num_sentences_for_summary < min_num_sentences_for_summary:
num_sentences_for_summary = min_num_sentences_for_summary num_sentences_for_summary = min_num_sentences_for_summary
......
...@@ -8,7 +8,7 @@ class TestImplementationValid(ISummaryStrategy): ...@@ -8,7 +8,7 @@ class TestImplementationValid(ISummaryStrategy):
""" """
id = "id1" id = "id1"
def summarize(self, text: str) -> str: def summarize(self, text: str, max_length: int) -> str:
return text return text
...@@ -18,7 +18,7 @@ class TestImplementationInvalidOne(ISummaryStrategy): ...@@ -18,7 +18,7 @@ class TestImplementationInvalidOne(ISummaryStrategy):
This class will create a warning but the mistake is intended. This class will create a warning but the mistake is intended.
""" """
def summarize(self, text: str) -> str: def summarize(self, text: str, max_length: int) -> str:
return text return text
...@@ -35,7 +35,7 @@ class TestISummaryStrategy(TestCase): ...@@ -35,7 +35,7 @@ class TestISummaryStrategy(TestCase):
# run test functions # run test functions
ti = TestImplementationValid() ti = TestImplementationValid()
summary = ti.summarize(test_text) summary = ti.summarize(test_text, max_length=130)
# assert result # assert result
self.assertEqual(summary, test_text) self.assertEqual(summary, test_text)
......
...@@ -164,5 +164,6 @@ class TestSummarizationSentenceEmbedding(TestCase): ...@@ -164,5 +164,6 @@ class TestSummarizationSentenceEmbedding(TestCase):
def test_summarize_text(self): def test_summarize_text(self):
summary_word_embedding = WordEmbeddingSummarizer() summary_word_embedding = WordEmbeddingSummarizer()
summary = summary_word_embedding.summarize(self.text) summary = summary_word_embedding.summarize(self.text, max_length=130)
assert type(summary) == str assert type(summary) == str
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment