summarization_with_strategy_TFIDF.py 7.22 KB
Newer Older
1
2
import spacy
import math
3
from app.summary.summary_strategy_interface import ISummaryStrategy
4

5
6
7
8
9
10
11
12
13
14
15
16

class SummaryTFIDF(ISummaryStrategy):
    nlpGer = spacy.load('de_core_news_sm')

    def __init__(self) -> None:
        """Set the strategy id and run the base-class initializer."""
        # Identifier exposed through the read-only ``id`` property.
        self._id = "tfidf"
        super().__init__()

    @property
    def id(self) -> str:
        """Return the identifier stored in ``self._id``."""
        return self._id

17
    def frequency_matrix(self, summary):
18
19
20
21
22
23
24
25
26
27
        """
        This method creates a tf-idf-matrix which is a list with all sentences
            containing a list with all words in the sentence and their
            frequency as value

        :param summary: given text to summarize
        :returns: freq_matrix: frequency matrix
        """
        freq_matrix = {}

28
        stop_words = self.nlpGer.Defaults.stop_words
29
30

        for sent in summary:
31
32
            # dictionary with 'words' as the key
            # and their 'frequency' as the value
33
34
            freq_table = {}

Nils König's avatar
Nils König committed
35
36
            words = [word.lemma_.lower() for word in sent if
                     word.text.isalnum()]  # Lemmatize the word
37
38

            for word in words:
Nils König's avatar
Nils König committed
39
                if word not in stop_words:  # Reject stopWords
40
41
42
43
44
45
46
47
48
                    if word in freq_table:
                        freq_table[word] += 1
                    else:
                        freq_table[word] = 1

            freq_matrix[sent] = freq_table

        return freq_matrix

49
50
    @staticmethod
    def tf_matrix(freq_matrix):
51
52
53
54
55
56
57
58
59
60
        """
        This method calculates the term frequency for every word

        :param freq_matrix: frequency matrix
        :returns: tf_matrix: a list with all sentences containing a list with
            all words and their term frequency
        """
        tf_matrix = {}

        for sent, freq_table in freq_matrix.items():
61
            # dictionary with 'word' itself as the key and its TF as the value
62
63
64
65
66
67
68
69
70
71
            tf_table = {}

            total_words_in_sentence = len(freq_table)
            for word, count in freq_table.items():
                tf_table[word] = count / total_words_in_sentence

            tf_matrix[sent] = tf_table

        return tf_matrix

72
73
    @staticmethod
    def sentences_per_words(freq_matrix):
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
        """
        This methods returns a list with all words and how often a word is
            mentioned in a sentence

        :param freq_matrix: frequency matrix
        :returns: sent_per_words: sentences per words list
        """
        sent_per_words = {}

        for sent, f_table in freq_matrix.items():
            for word, count in f_table.items():
                if word in sent_per_words:
                    sent_per_words[word] += 1
                else:
                    sent_per_words[word] = 1

        return sent_per_words

92
93
    @staticmethod
    def idf_matrix(freq_matrix, sent_per_words, total_sentences):
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
        """
        This methods calculates a idf score for every word

        :param freq_matrix: frequency matrix
        :param sent_per_words: sentences per words list
        :param total_sentences: total sentences list
        :returns: idf_matrix: list of all sentences containing a list of
            all words in this sentence with their idf value
        """
        idf_matrix = {}

        for sent, f_table in freq_matrix.items():
            idf_table = {}

            for word in f_table.keys():
                idf_table[word] = math.log10(
                    total_sentences / float(sent_per_words[word]))

            idf_matrix[sent] = idf_table

        return idf_matrix

116
117
    @staticmethod
    def tf_idf_matrix(tf_matrix, idf_matrix):
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
        """
        This methods calculates a tf-idf-score for every word

        :param tf_matrix: tf-matrix
        :param idf_matrix: idf-matrix
        :returns: tf_idf_matrix: list of all sentences containing a list of
            all words with their tf-idf value
        """
        tf_idf_matrix = {}

        for (sent1, f_table1), (sent2, f_table2) in zip(
                tf_matrix.items(), idf_matrix.items()):

            tf_idf_table = {}

            for (word1, tf_value), (word2, idf_value) in zip(f_table1.items(),
                                                             f_table2.items()):
                tf_idf_table[word1] = float(tf_value * idf_value)

            tf_idf_matrix[sent1] = tf_idf_table

        return tf_idf_matrix

141
142
    @staticmethod
    def score_sentences(tf_idf_matrix):
143
144
145
146
147
        """
        This methods calculates a sentence score for every sentence based on
            the tf-idf-matrix

        :param tf_idf_matrix: tf-idf-matrix
148
        :returns: sentence_score: list of all sentences with sentence score
149
        """
150
        sentence_score = {}
151
152
153
154
155
156
157
158
159

        for sent, f_table in tf_idf_matrix.items():
            total_tfidf_score_per_sentence = 0

            total_words_in_sentence = len(f_table)
            for word, tf_idf_score in f_table.items():
                total_tfidf_score_per_sentence += tf_idf_score

            if total_words_in_sentence != 0:
160
161
                sentence_score[sent] = total_tfidf_score_per_sentence / \
                                       total_words_in_sentence
162

163
        return sentence_score
164

165
166
    @staticmethod
    def average_score(sentence_score):
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
        """
        This method calculates the average sentence score

        :param sentence_score: list of sentences with sentence score
        :returns: average_sent_score: average sentence score
        """

        total_score = 0
        for sent in sentence_score:
            total_score += sentence_score[sent]

        average_sent_score = (total_score / len(sentence_score))

        return average_sent_score

182
183
    @staticmethod
    def create_summary(sentences, sentence_score, threshold):
184
185
186
187
188
189
        """
        This method returns a summary with all sentences having a higher
            sentence score than the threshold

        :param sentences: list of all sentences
        :param sentence_score: list of sentences with sentence score
190
        :param threshold: threshold for sentence score
191
192
193
194
195
196
197
198
199
200
201
        :returns: summary: generated summary
        """
        summary = ''

        for sentence in sentences:
            if sentence in sentence_score and sentence_score[sentence] >= (
                    threshold):
                summary += " " + sentence.text

        return summary[1:]

202
    def summarize(self, text: str, max_length: int) -> str:
203
        text = self.nlpGer(text)
204
205
206
207
208
209

        # put all sentences in a list
        sentences = list(text.sents)
        total_sentences = len(sentences)

        # generate frequency matrix
210
        freq_matrix = self.frequency_matrix(sentences)
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227

        # generate term frequency matrix
        tf_matrix = self.tf_matrix(freq_matrix)

        # identify number of sentences with a specific word
        num_sent_per_words = self.sentences_per_words(freq_matrix)

        # generate id-frequency matrix
        idf_matrix = self.idf_matrix(
            freq_matrix, num_sent_per_words, total_sentences)

        # generate tf-idf-matrix
        tf_idf_matrix = self.tf_idf_matrix(tf_matrix, idf_matrix)

        # generate sentence score for every sentence
        sentence_scores = self.score_sentences(tf_idf_matrix)

228
        # set threshold to average score
229
230
231
        threshold = self.average_score(sentence_scores)

        # summary
232
233
        summary = self.create_summary(
            sentences, sentence_scores, threshold)
234
235

        return summary