Commit 45dc3284 authored by Chiraz Boukadida

Merge branch 'master' of https://code.fbi.h-da.de/pse-trapp-public/intentfinder into feat/#31_Endpoint_Hochladen_einer_DOCX-Datei

 Conflicts:
	src/ktor-server/src/main/kotlin/de/h_da/fbi/smebt/intentfinder/server/Application.kt
parents 7aa6788f 0b52cb67
@@ -10,4 +10,5 @@ src/ktor-server/.idea/misc.xml
.DS_Store
.idea/
.gradle/
src/mongodb_data/
src/nltk_data/
# Research: REST API best practices (URIs, parameters, return values, error handling)
1. **URI**: <br/>
In URIs: <br/>
- use **nouns** for resources <br/> Example: /users, /orders
- do **not use verbs** <br/> Counterexample: /getUsers, /getOrders
- name **collections in the plural** <br/> Example: /articles
<br/><br/>**CRUD operations**: <br/>
The following operations can be performed on a resource: <br/>
**Create (POST)**: create a new resource <br/>
**Read (GET)**: retrieve a resource <br/>
**Update (PUT or PATCH)**: replace a resource (PUT) or modify it partially (PATCH) <br/>
**Delete (DELETE)**: delete a resource
2. **Parameters**: <br/>
Parameters are used to search, filter and sort data. <br/>
- use **query parameters** to filter or sort a **collection** <br/>
Example: I want the details of all employees from Cameroon <br/> http://localhost:8080/Employees?country=cameroon <br/>
- use **path (URI) parameters** to address one specific resource <br/> Example: I want all details of department 123 <br/> http://localhost:8080/Departments/123 <br/>
3. **Return values**: should be in JSON format.
4. **Error handling**:
Errors must be handled and answered with the appropriate standard status code (a small sketch combining these conventions follows after this list): <br/># client-side errors:
**400**: Bad Request → the client-side input is invalid or failed validation <br/>
**401**: Unauthorized → the user is not authenticated and tries to access a resource <br/>
**403**: Forbidden → the user is authenticated but does not have permission to access the resource <br/>
**404**: Not Found → the requested resource does not exist <br/># server-side errors:
**500**: Internal Server Error → generic server-side error; something went wrong on the server <br/>
**502**: Bad Gateway → the server, acting as a gateway, received an invalid response from the upstream server <br/>
**503**: Service Unavailable → the server is temporarily unable to handle the request, e.g. because it is overloaded or down for maintenance <br/>
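The endpoints of this project are implemented with Ktor, but the conventions above are framework-independent. The following is only a minimal sketch using FastAPI (which this repository already uses for the NLP service); the `/employees` resource, its fields and its data are hypothetical and serve purely as illustration.

```python
from typing import Optional

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel

app = FastAPI()


# Hypothetical resource, used only to illustrate the conventions above.
class Employee(BaseModel):
    id: int
    name: str
    country: str


employees = [
    Employee(id=1, name="Alice", country="cameroon"),
    Employee(id=2, name="Bob", country="germany"),
]


# 1. Plural noun, no verb in the URI; 2. query parameter for filtering.
@app.get("/employees")
async def list_employees(country: Optional[str] = None):
    # 3. The return value is serialized to JSON automatically.
    if country is None:
        return employees
    return [e for e in employees if e.country == country]


# A path parameter addresses exactly one resource.
@app.get("/employees/{employee_id}")
async def get_employee(employee_id: int):
    for e in employees:
        if e.id == employee_id:
            return e
    # 4. Standard status code when the requested resource does not exist.
    raise HTTPException(status_code=404, detail="Employee not found")
```

Started with e.g. `uvicorn example:app` (module name hypothetical), this answers `/employees?country=cameroon` with a filtered JSON list and `/employees/999` with a 404.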
## Sources:
* https://www.merixstudio.com/blog/best-practices-rest-api-development/ <br/>
* https://stackoverflow.blog/2020/03/02/best-practices-for-rest-api-design/ <br/>
* https://blog.dreamfactory.com/best-practices-for-naming-rest-api-endpoints/ <br/>
* https://swagger.io/resources/articles/best-practices-in-api-design/ <br/>
# Research: other similar APIs, e.g. https://rapidapi.com/darkmanaminovic/api/question-generator (incl. documentation)
* https://opentdb.com/api_config.php <br/>
* https://jservice.io/ <br/>
* https://quizapi.io/ <br/>
* **Findings**: these APIs generate a fixed number of questions per category, including answer options <br/>
# Swagger-Doku
OpenAPI Specification (formerly Swagger Specification) is an API description format for REST APIs. <br/>
Swagger is a set of open-source tools built around the OpenAPI Specification that can help you design, build, document and consume REST APIs. <br/>
* Quelle: https://swagger.io/docs/specification/about/ <br/>
* Dokumentation: https://swagger.io/specification/ <br/>
### Docker based Setup
Use the following command to start all containers (the first run takes a while):
`docker-compose up --build`
Afterwards the following services are available:
* Port 8080: ktor-server
* Port 8000: nlp_server
* Port 27017: MongoDB-Server
To access the OpenAPI documentation of nlp_server open **http://localhost:8000/docs**.
The following paths are mounted by default:
* nltk_data: to store nltk-data, e.g. big models, which we don't want to download every time we start our container.
* mongodb_data: stores the data of the mongodb container.
version: "3.7"
services:
python_nlp:
build: nlp
ports:
- 8000:8000
volumes:
- ./nltk_data:/usr/share/nltk_data
ktor-server:
build: ktor-server
ports:
- 8080:8080
openapi_validator:
build: docker-openapi-validator
environment:
WAIT_HOSTS: python_nlp:8000, ktor-server:8080
WAIT_TIMEOUT: 3600
WAIT_HOSTS_TIMEOUT: 3600
WAIT_SLEEP_INTERVAL: 5
depends_on:
- python_nlp
- ktor-server
mongodb_container:
image: mongo:latest
environment:
MONGO_INITDB_ROOT_USERNAME: root
MONGO_INITDB_ROOT_PASSWORD: rootpassword
ports:
- 27017:27017
volumes:
- ./mongodb_data:/data/db
FROM node:15.14-alpine3.13
# ARG is used here to make auto-update easy
ARG version=0.46.0
RUN npm install -g ibm-openapi-validator@${version}
RUN apk add --no-cache wget
COPY ./entrypoint.sh /
# create lint-openapi config
RUN lint-openapi init
# download docker-compose-wait
ADD https://github.com/ufoscout/docker-compose-wait/releases/download/2.7.3/wait /wait
RUN chmod +x /wait
## Launch the wait tool and then your application
CMD /wait && /entrypoint.sh
#!/bin/sh
wget "http://python_nlp:8000/openapi.json"
#wget "http://ktor-server:8080/openapi.yml"
echo "####### Linting nlp_server OpenAPI.json #######"
lint-openapi ./openapi.json
#lint-openapi ./openapi.yml
FROM gradle:7.0.2-jdk11 AS build
COPY --chown=gradle:gradle . /home/gradle/src
WORKDIR /home/gradle/src
RUN gradle clean installDist --no-daemon
EXPOSE 8080
WORKDIR /home/gradle/src/build/install/ktor-server/bin
CMD ./ktor-server
@@ -5,22 +5,12 @@ import de.h_da.fbi.smebt.intentfinder.server.sources.DocxReader
import io.ktor.application.*
import io.ktor.features.*
import io.ktor.http.*
import io.ktor.http.content.*
import io.ktor.request.*
import io.ktor.response.*
import io.ktor.routing.*
import io.ktor.serialization.*
import kotlinx.serialization.json.Json
import registerUploadRoutes
import java.io.File
import java.nio.file.Files
import java.nio.file.Paths
import kotlinx.serialization.Serializable
@Serializable
data class FileStatus(val path: String, val status: String)
import java.lang.RuntimeException
fun main(args: Array<String>): Unit = io.ktor.server.netty.EngineMain.main(args)
@@ -32,6 +22,16 @@ fun Application.module(testing: Boolean = false) {
        })
    }
    install(StatusPages) {
        exception<InternalServerErrorException> { cause ->
            call.respond(HttpStatusCode.InternalServerError)
            throw cause
        }
        statusFile(HttpStatusCode.NotFound, filePattern = "error/error#.html")
    }
    routing {
        get("/summary") {
            val response = PythonBridge().getSummary("test bridge")
@@ -41,6 +41,47 @@ fun Application.module(testing: Boolean = false) {
            call.respondText("IntentFinder is available")
        }
        // Endpoint for uploading a DOCX file
        post("/file/{chatbotId}") {
            call.respondText("file was successfully uploaded")
        }
        // Endpoint for updating an already existing DOCX file
        put("/file/{chatbotId}/{id}/{filename}") {
        }
        // Endpoint for registering an FAQ web page via a JSON configuration
        post("/faqRessource/{chatbotId}/{jsonStructure}") {
        }
        // Get FAQ with JSON configuration
        get("/faqRessource") {
            // returns a JSON object (cf. #38)
        }
        // Endpoint for registering a DOCX file via a JSON configuration
        post("/docxRessource/{chatbotId}/{filename}") {
        }
        // Get DOCX file with JSON configuration
        get("/docxRessource") {
            // returns a JSON object (cf. #37)
        }
        // Endpoint for listing all uploaded DOCX files with their status
        get("/files") {
        }
        // Routes without functionality yet
        routing {
        }
        var fileDescription = ""
        var fileName = ""
        post("/file/{chatbotId}") {
@@ -132,4 +173,8 @@ fun Application.module(testing: Boolean = false) {
}
}
registerUploadRoutes()
}
\ No newline at end of file
}
class InternalServerErrorException : RuntimeException()
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>The Page cannot be found!</title>
</head>
<body>
<h1>Not Found!</h1>
</body>
</html>
\ No newline at end of file
FROM python:3.9.4
EXPOSE 8000
# Install Debian and PyPi packages
RUN apt-get update && apt-get install -y gcc python3-dev python3-pip libxml2-dev libxslt1-dev zlib1g-dev g++
COPY ./requirements.txt /requirements.txt
RUN pip install --upgrade pip
RUN pip install -r requirements.txt
# Copy app directory to container
COPY ./app /app
# RUN pip install fastapi uvicorn
# Run tests
# RUN python -m unittest /app/tests/test*
# Install and run linter
RUN pip install --no-input flake8
RUN python -m flake8 --extend-exclude venv ./app
# Start server
RUN pip install fastapi uvicorn
CMD ["uvicorn", "app.nlp_server:app", "--host", "0.0.0.0", "--port", "8000"]
# NLP with Python
## Setup
### Docker based Setup
Use the following command to start all containers (the first run takes a while):
`docker-compose up --build`
Afterwards the following services are available:
* Port 8080: ktor-server
* Port 8000: nlp_server
* Port 27017: MongoDB-Server
To access the OpenAPI documentation of nlp_server open **http://localhost:8000/docs**.
The following paths are mounted by default:
* nltk_data: to store nltk-data, e.g. big models, which we don't want to download every time we start our container.
* mongodb_data: stores the data of the mongodb container.
### Bare metal setup
- install the latest versions of Python and pip (tested with Python 3.9.2 and pip 21.0.1)
- open a command shell as **admin** and run
@@ -69,9 +89,9 @@ The OpenAPI doc is available under http://127.0.0.1:8000/docs
- the following parameters can be set:
  text (string): the text that will be summarized.
  max_length (int): the maximum allowed number of characters for this strategy
- the Bert strategy only works with a ratio parameter inside this class. This ratio
parameter controls the ratio of sentences in the summary, which does not imply the
same ratio of characters in the summary. So the ratio has to be calculated and
......
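A rough usage sketch for these parameters; the exact route and payload schema should be taken from the OpenAPI doc under http://127.0.0.1:8000/docs — the `/summary` path and the field names below are assumptions for illustration only.

```python
import requests

# Assumed endpoint and payload shape; verify against http://127.0.0.1:8000/docs.
payload = {
    "text": "Ktor is a framework for building asynchronous servers and clients in Kotlin.",
    "max_length": 100,  # maximum allowed number of characters for this strategy
}
response = requests.post("http://127.0.0.1:8000/summary", json=payload)
print(response.status_code, response.json())
```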
@@ -6,6 +6,15 @@ from app.summary.summary_bert import BertSummary
from app.summary.summary_sentence_embedding import SentenceEmbeddingSummarizer
from app.summary.summary_word_embedding import WordEmbeddingSummarizer
from app.utilities import generator
from app.questiongenerator import QuestionGenerator
from pydantic import BaseModel


class Item(BaseModel):
    text: str
    num_questions: int
    answer_style: str


app = FastAPI(
    title="IntentFinder: NLP-API",
@@ -29,6 +38,16 @@ strategies = [
    WordEmbeddingSummarizer()]


@app.post("/questionGenerator")
async def api_questionGenerator(item: Item):
    qg = QuestionGenerator()
    return qg.generate(
        item.text,
        num_questions=item.num_questions,
        answer_style=item.answer_style
    )


@app.get("/")
async def root():
    return {"message": "nlp server is available"}
......
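A usage sketch for the `/questionGenerator` endpoint added above. It assumes the nlp_server is reachable on port 8000 (as in the docker-compose setup); the sample text is arbitrary.

```python
import requests

payload = {
    "text": (
        "Ktor is a Kotlin framework for building asynchronous servers and clients. "
        "It was developed by JetBrains and uses coroutines for non-blocking I/O."
    ),
    "num_questions": 2,
    "answer_style": "multiple_choice",  # one of "all", "sentences", "multiple_choice"
}

# POST the payload to the endpoint defined in nlp_server.py above.
response = requests.post("http://localhost:8000/questionGenerator", json=payload)
response.raise_for_status()
print(response.json())  # generated question/answer pairs
```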
# import os
# import sys
# import math
import numpy as np
import torch
# import spacy
import re
import random
import json
import en_core_web_sm
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    AutoModelForSequenceClassification,
)


class QuestionGenerator:

    def __init__(self, model_dir=None):

        QG_PRETRAINED = "iarfmoose/t5-base-question-generator"
        self.ANSWER_TOKEN = "<answer>"
        self.CONTEXT_TOKEN = "<context>"
        self.SEQ_LENGTH = 512

        self.device = \
            torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.qg_tokenizer = \
            AutoTokenizer.from_pretrained(QG_PRETRAINED, use_fast=False)
        self.qg_model = \
            AutoModelForSeq2SeqLM.from_pretrained(QG_PRETRAINED)
        self.qg_model.to(self.device)

        self.qa_evaluator = QAEvaluator(model_dir)

    def generate(
        self,
        article,
        use_evaluator=True,
        num_questions=None,
        answer_style="all"
    ):

        print("Generating questions...\n")

        qg_inputs, qg_answers = \
            self.generate_qg_inputs(article, answer_style)
        generated_questions = \
            self.generate_questions_from_inputs(qg_inputs)

        message = "{} questions doesn't match {} answers".format(
            len(generated_questions), len(qg_answers)
        )
        assert len(generated_questions) == len(qg_answers), message

        if use_evaluator:

            print("Evaluating QA pairs...\n")

            encoded_qa_pairs = self.qa_evaluator.encode_qa_pairs(
                generated_questions, qg_answers
            )
            scores = self.qa_evaluator.get_scores(encoded_qa_pairs)

            if num_questions:
                qa_list = self._get_ranked_qa_pairs(
                    generated_questions, qg_answers, scores, num_questions
                )
            else:
                qa_list = self._get_ranked_qa_pairs(
                    generated_questions, qg_answers, scores
                )

        else:
            print("Skipping evaluation step.\n")
            qa_list = self._get_all_qa_pairs(generated_questions, qg_answers)

        return qa_list

    def generate_qg_inputs(self, text, answer_style):

        VALID_ANSWER_STYLES = ["all", "sentences", "multiple_choice"]

        if answer_style not in VALID_ANSWER_STYLES:
            raise ValueError(
                "Invalid answer style {}. Please choose from {}".format(
                    answer_style, VALID_ANSWER_STYLES
                )
            )

        inputs = []
        answers = []

        if answer_style == "sentences" or answer_style == "all":
            segments = self._split_into_segments(text)
            for segment in segments:
                sentences = self._split_text(segment)
                prepped_inputs, prepped_answers = self._prepare_qg_inputs(
                    sentences, segment
                )
                inputs.extend(prepped_inputs)
                answers.extend(prepped_answers)

        if answer_style == "multiple_choice" or answer_style == "all":
            sentences = self._split_text(text)
            prepped_inputs, prepped_answers = \
                self._prepare_qg_inputs_MC(sentences)
            inputs.extend(prepped_inputs)
            answers.extend(prepped_answers)

        return inputs, answers

    def generate_questions_from_inputs(self, qg_inputs):
        generated_questions = []

        for qg_input in qg_inputs:
            question = self._generate_question(qg_input)
            generated_questions.append(question)

        return generated_questions

    def _split_text(self, text):
        MAX_SENTENCE_LEN = 128

        sentences = re.findall(r".*?[.!\?]", text)

        cut_sentences = []
        for sentence in sentences:
            if len(sentence) > MAX_SENTENCE_LEN:
                cut_sentences.extend(re.split("[,;:)]", sentence))

        # temporary solution to remove useless post-quote sentence fragments
        cut_sentences = [s for s in sentences if len(s.split(" ")) > 5]
        sentences = sentences + cut_sentences

        return list(set([s.strip(" ") for s in sentences]))

    def _split_into_segments(self, text):
        MAX_TOKENS = 490

        paragraphs = text.split("\n")
        tokenized_paragraphs = [
            self.qg_tokenizer(p)["input_ids"] for p in paragraphs if len(p) > 0
        ]

        segments = []
        while len(tokenized_paragraphs) > 0:
            segment = []
            while len(segment) < MAX_TOKENS and len(tokenized_paragraphs) > 0:
                paragraph = tokenized_paragraphs.pop(0)
                segment.extend(paragraph)
            segments.append(segment)

        return [self.qg_tokenizer.decode(s) for s in segments]

    def _prepare_qg_inputs(self, sentences, text):
        inputs = []
        answers = []

        for sentence in sentences:
            qg_input = "{} {} {} {}".format(
                self.ANSWER_TOKEN, sentence, self.CONTEXT_TOKEN, text
            )
            inputs.append(qg_input)
            answers.append(sentence)

        return inputs, answers

    def _prepare_qg_inputs_MC(self, sentences):

        spacy_nlp = en_core_web_sm.load()
        docs = list(spacy_nlp.pipe(sentences, disable=["parser"]))
        inputs_from_text = []
        answers_from_text = []

        for i in range(len(sentences)):
            entities = docs[i].ents
            if entities:
                for entity in entities:
                    qg_input = "{} {} {} {}".format(
                        self.ANSWER_TOKEN,
                        entity,
                        self.CONTEXT_TOKEN,
                        sentences[i]
                    )
                    answers = self._get_MC_answers(entity, docs)
                    inputs_from_text.append(qg_input)
                    answers_from_text.append(answers)

        return inputs_from_text, answers_from_text

    def _get_MC_answers(self, correct_answer, docs):

        entities = []
        for doc in docs:
            entities.extend(
                [
                    {
                        "text": e.text,
                        "label_": e.label_
                    } for e in doc.ents
                ]
            )

        # remove duplicate elements
        entities_json = [json.dumps(kv) for kv in entities]
        pool = set(entities_json)
        num_choices = (
            min(4, len(pool)) - 1
        )  # -1 because we already have the correct answer

        # add the correct answer
        final_choices = []
        correct_label = correct_answer.label_
        final_choices.append({"answer": correct_answer.text, "correct": True})
        pool.remove(
            json.dumps(
                {"text": correct_answer.text, "label_": correct_answer.label_}
            )
        )

        # find answers with the same NER label
        matches = [e for e in pool if correct_label in e]

        # if we don't have enough then add some other random answers
        if len(matches) < num_choices:
            choices = matches
            pool = pool.difference(set(choices))
            choices.extend(random.sample(pool, num_choices - len(choices)))
        else:
            choices = random.sample(matches, num_choices)

        choices = [json.loads(s) for s in choices]

        for choice in choices:
            final_choices.append({"answer": choice["text"], "correct": False})

        random.shuffle(final_choices)
        return final_choices

    def _generate_question(self, qg_input):
        self.qg_model.eval()
        encoded_input = self._encode_qg_input(qg_input)
        with torch.no_grad():
            output = self.qg_model.generate(
                input_ids=encoded_input["input_ids"]
            )
        question = self.qg_tokenizer.decode(