"""Sentence-embedding based extractive summarization strategy."""

import math

import numpy as np
import pandas as pd
import spacy
from nltk.cluster import KMeansClusterer
from nltk.cluster.util import cosine_distance
from scipy.spatial import distance_matrix
from sentence_transformers import SentenceTransformer

from app.summary.summary_strategy_interface import ISummaryStrategy

class SentenceEmbeddingSummarizer(ISummaryStrategy):
    """Extractive summarizer based on sentence embeddings.

    Sentences are embedded with a multilingual (German/English)
    sentence-transformer, clustered with k-means over cosine distance,
    and the sentence closest to each cluster centroid is selected for
    the summary.
    """

    # Heavy models are loaded once at class-definition time and shared
    # by all instances.
    model = SentenceTransformer(
        'T-Systems-onsite/cross-en-de-roberta-sentence-transformer')
    nlp = spacy.load('de_core_news_sm')

    def __init__(self):
        self._id = "sentence_embedding"
        super().__init__()

    @property
    def id(self):
        """Identifier of this summary strategy."""
        return self._id

    def summarize(self, text: str, max_length: int) -> str:
        """Produce an extractive summary of ``text``.

        :param text: the input text to summarize
        :param max_length: NOTE(review): currently unused by this
            strategy; kept for interface compatibility with
            ``ISummaryStrategy`` — confirm against the interface
        :return: summary built from the most representative sentence of
            each cluster, in original text order
        """
        # Split the text into sentences using spaCy, stripping leading
        # and trailing whitespace from each sentence.
        doc = self.nlp(text)
        sentences = [sent.text.strip() for sent in doc.sents]

        # Nothing to cluster -> nothing to summarize. Without this guard
        # KMeansClusterer would fail on an empty embedding array.
        if not sentences:
            return ""

        # Pandas DataFrame lets us apply the per-sentence
        # transformations below efficiently.
        data = pd.DataFrame({'sentence': sentences})

        # Encode all sentences in a single batched call — much faster
        # than encoding one sentence at a time, with identical vectors.
        data['embeddings'] = list(
            SentenceEmbeddingSummarizer.model.encode(sentences))

        # Cluster contextually similar sentences.
        num_clusters = SentenceEmbeddingSummarizer.__get_number_of_sentences(
            sentences)
        iterations = 25
        embeddings = np.array(data['embeddings'].tolist())
        kclusterer = KMeansClusterer(
            num_clusters, distance=cosine_distance,
            repeats=iterations, avoid_empty_clusters=True)
        assigned_clusters = kclusterer.cluster(
            embeddings, assign_clusters=True)

        # For every sentence, record its cluster and the distance
        # between its vector and the cluster centroid (mean) vector.
        data['cluster'] = pd.Series(assigned_clusters, index=data.index)
        data['centroid'] = data['cluster'] \
            .apply(lambda c: kclusterer.means()[c])
        data['distance_from_centroid'] = data.apply(
            SentenceEmbeddingSummarizer.__distance_from_centroid, axis=1)

        return SentenceEmbeddingSummarizer.__generate_summary(data)

    @staticmethod
    def __get_number_of_sentences(sentences):
        """Determine how many sentences the summary should contain.

        Roughly 30 % of the input sentences, clamped to [1, 10].

        :param sentences: list of sentences of the input text
        :return: granted number of summary sentences, in [1, 10]
        """
        min_num_sentences_for_summary = 1
        max_num_sentences_for_summary = 10
        num_sentences_in_percent = 0.3
        num_sentences_for_summary = math.floor(
            len(sentences) * num_sentences_in_percent)

        # Clamp to the allowed range.
        return max(min_num_sentences_for_summary,
                   min(num_sentences_for_summary,
                       max_num_sentences_for_summary))

    @staticmethod
    def __distance_from_centroid(row):
        """Euclidean distance between a sentence vector and its centroid.

        :param row: DataFrame row with 'embeddings' and 'centroid'
        :return: scalar distance
        """
        return distance_matrix([row['embeddings']],
                               [row['centroid'].tolist()])[0][0]

    @staticmethod
    def __generate_summary(data):
        """Build the summary string from the clustered sentences.

        1. Group sentences by their cluster.
        2. Within each cluster, pick the sentence with the smallest
           distance from the centroid (the most representative one).
        3. Restore the picked sentences to their original text order.

        :param data: DataFrame with 'sentence', 'cluster' and
            'distance_from_centroid' columns
        :returns: summary: a string representing a summarized version
            of the input text
        :rtype: str
        """
        picked = (data
                  .sort_values('distance_from_centroid', ascending=True)
                  .groupby('cluster').head(1)
                  .sort_index()['sentence']
                  .tolist())
        return ' '.join(picked)