# summary_word_embedding.py
from typing import Any

from spacy.lang.de.stop_words import STOP_WORDS
from string import punctuation
import spacy

from app.summary.summary_strategy_interface import ISummaryStrategy


class WordEmbeddingSummarizer(ISummaryStrategy):
    """Extractive summarizer for German text.

    Scores each sentence by the sum of its words' normalized
    frequencies and returns the strongest ~50% of sentences,
    in original document order.
    """

    # Loaded once at class-definition time; shared by all instances.
    nlp = spacy.load('de_core_news_sm')

    def __init__(self):
        self._id = "word_embedding"
        super().__init__()

    @property
    def id(self):
        """Identifier used to select this summarization strategy."""
        return self._id

    def summarize(self, text: str) -> str:
        """Return an extractive summary of *text*.

        Returns an empty string when the text contains no scorable
        words (empty or punctuation/stopword-only input).
        """
        extra_words = list(STOP_WORDS) + list(punctuation) + ['\n']
        docx = self.nlp(text)

        # Frequency table over the document's content words.
        freq_word: dict[str, float] = self.__get_vocab(docx, extra_words)
        if not freq_word:
            # Guard: the original code crashed (IndexError) here on
            # empty or punctuation-only input.
            return ''

        # Normalize counts by the maximum frequency so every weight
        # lies in (0, 1].  (This is plain term-frequency normalization,
        # not TF-IDF: there is no document collection to derive an
        # inverse-document-frequency term from.)
        max_freq = max(freq_word.values())
        freq_word = {word: count / max_freq
                     for word, count in freq_word.items()}

        # Sentence strength: each sentence is weighted by the sum of
        # its known words' weights.
        sent_strength = self.__get_sentence_strength(docx, freq_word)

        # Keep the strongest half — but always at least one sentence,
        # so single-sentence inputs still produce a summary (the
        # original returned '' for them).
        top_sentences = sorted(sent_strength.values(), reverse=True)
        keep = max(1, int(0.5 * len(top_sentences)))
        top_sent = top_sentences[:keep]

        return self.__get_summary(sent_strength, top_sent)

    @staticmethod
    def __get_vocab(docx, extra_words):
        """Count occurrences of lower-cased alphabetic tokens that are
        neither stopwords nor punctuation."""
        freq_word: dict[str, int] = {}
        for token in docx:
            w = token.text.lower()
            if w.isalpha() and w not in extra_words:
                freq_word[w] = freq_word.get(w, 0) + 1
        return freq_word

    @staticmethod
    def __get_summary(sent_strength, top_sent):
        """Join the sentences whose score appears in *top_sent*,
        preserving original document order (dict insertion order).

        NOTE: a score shared by several sentences selects all of them.
        """
        return ' '.join(sent.text
                        for sent, strength in sent_strength.items()
                        if strength in top_sent)

    @staticmethod
    def __get_sentence_strength(docx, freq_word):
        """Map each sentence span to the sum of its words' weights.

        Duplicate words within a sentence contribute once per
        occurrence; sentences with no known word are omitted.
        """
        sent_strength: dict = {}
        for sent in docx.sents:
            for word in sent:
                weight = freq_word.get(word.text.lower())
                if weight is not None:
                    sent_strength[sent] = sent_strength.get(sent, 0.0) + weight
        return sent_strength