Commit c67386d8 authored by Mischa Buchhofer's avatar Mischa Buchhofer

add implementation for #35

parent 48ac3b63
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" languageLevel="JDK_15" project-jdk-name="Python 3.9" project-jdk-type="Python SDK">
<output url="file://$PROJECT_DIR$/out" />
</component>
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7 (nlp)" project-jdk-type="Python SDK" />
</project>
\ No newline at end of file
from fastapi import FastAPI, Request
from app.summary import summary
from app.utilities import generator
from .summary.simple_spacy_summarizer import SimpleSpacySummarizer
app = FastAPI(
title="IntentFinder: NLP-API",
@@ -8,7 +9,8 @@ app = FastAPI(
description="Based on spaCy, offers several NLP features such as summarization and intent-id generation, as needed by the IntentFinder"
)
strategies = [] # strategies = [strat1(), strat2(), ...]
strategies = [SimpleSpacySummarizer()] # strategies = [strat1(), strat2(), ...]
@app.get("/")
......
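Note: the hunk above only shows the strategy registration; the actual route handlers are collapsed in this view. Purely as an illustration, a minimal sketch of how such a registered strategy list could be exposed through a FastAPI route might look like the following. The /summary path, the request model, and the lookup by strategy id are assumptions for this sketch, not code from this commit.

# Hypothetical sketch -- not part of this commit. It only illustrates how a
# registered strategy list could be wired to a FastAPI route.
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel

app = FastAPI(title="IntentFinder: NLP-API")
strategies = []  # e.g. [SimpleSpacySummarizer()]


class SummaryRequest(BaseModel):
    text: str
    strategy_id: str = "simple_spacy_summarizer"  # assumed default


@app.post("/summary")  # route path is an assumption
def summarize(request: SummaryRequest):
    # pick the strategy whose id matches the requested one
    for strategy in strategies:
        if strategy.id == request.strategy_id:
            return {"strategy": strategy.id, "summary": strategy.summarize(request.text)}
    raise HTTPException(status_code=404, detail="unknown summary strategy")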
from spacy.lang.de.stop_words import STOP_WORDS
from sklearn.feature_extraction.text import CountVectorizer
import de_core_news_sm

from .summary_strategy_interface import ISummaryStrategy


class SimpleSpacySummarizer(ISummaryStrategy):

    def __init__(self):
        self._id = "simple_spacy_summarizer"
        super().__init__()

    @property
    def id(self):
        return self._id

    def summarize(self, text: str) -> str:
        nlp = de_core_news_sm.load()
        doc = nlp(text)

        # split into sentences, vectorize and drop German stop words
        corpus = [sent.text.lower() for sent in doc.sents]
        cv = CountVectorizer(stop_words=list(STOP_WORDS))
        cv_fit = cv.fit_transform(corpus)
        word_list = cv.get_feature_names()
        count_list = cv_fit.toarray().sum(axis=0)
        word_frequency = dict(zip(word_list, count_list))

        # normalise counts to relative frequencies (most frequent word -> 1.0)
        val = sorted(word_frequency.values())
        higher_frequency = val[-1]
        for word in word_frequency.keys():
            word_frequency[word] = word_frequency[word] / higher_frequency

        # rank each sentence by the summed relative frequency of its words
        sentence_rank = {}
        for sent in doc.sents:
            for word in sent:
                if word.text.lower() in word_frequency.keys():
                    if sent in sentence_rank.keys():
                        sentence_rank[sent] += word_frequency[word.text.lower()]
                    else:
                        sentence_rank[sent] = word_frequency[word.text.lower()]

        # keep the scores of the three highest-ranked sentences
        top_sentences = sorted(sentence_rank.values())[::-1]
        top_sent = top_sentences[:3]

        # collect the top sentences in document order
        summary = []
        for sent, strength in sentence_rank.items():
            if strength in top_sent:
                summary.append(sent)

        # concatenate the selected sentences into the summary text
        result_text = ""
        for i in summary:
            result_text += i.text + " "

        return result_text
\ No newline at end of file
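For reference, a small usage sketch of the new summarizer. The import path follows the package layout suggested by main.py, the sample text is invented, and running it requires the de_core_news_sm spaCy model to be installed.

# Usage sketch for SimpleSpacySummarizer (sample German text is made up).
from app.summary.simple_spacy_summarizer import SimpleSpacySummarizer

summarizer = SimpleSpacySummarizer()
german_text = (
    "Berlin ist die Hauptstadt von Deutschland. "
    "Die Stadt hat mehr als drei Millionen Einwohner. "
    "Viele Touristen besuchen jedes Jahr das Brandenburger Tor. "
    "Das Wetter in Berlin ist im Sommer oft angenehm."
)
print(summarizer.id)                      # "simple_spacy_summarizer"
print(summarizer.summarize(german_text))  # the three highest-ranked sentences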
@@ -3,7 +3,7 @@
<component name="NewModuleRootManager" inherit-compiler-output="true">
<exclude-output />
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="jdk" jdkName="Python 3.7 (nlp)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>
\ No newline at end of file
@@ -52,6 +52,7 @@ spacy-alignments==0.7.2
spacy-legacy==3.0.1
spacy-lookups-data==1.0.0
spacy-transformers==1.0.1
scikit-learn==0.24.2
srsly==2.4.0
starlette==0.13.6
thinc==8.0.1
......