Commit 45dc3284 authored by Chiraz Boukadida's avatar Chiraz Boukadida
Browse files

Merge branch 'master' of...

Merge branch 'master' of https://code.fbi.h-da.de/pse-trapp-public/intentfinder into feat/#31_Endpoint_Hochladen_einer_DOCX-Datei

 Conflicts:
	src/ktor-server/src/main/kotlin/de/h_da/fbi/smebt/intentfinder/server/Application.kt
parents 7aa6788f 0b52cb67
......@@ -10,4 +10,5 @@ src/ktor-server/.idea/misc.xml
.DS_Store
.idea/
.gradle/
src/mongodb_data/
src/nltk_data/
# Recherche best practice Rest-API (URI, Parameter, Rueckgabewerte, Fehlerbehandlung)
1. **URI**: <br/>
in URI: <br/>
- sollte man **Nomen** als Ressource verwenden <br/> Beispiel: /users, /orders
- sollte man **keine Verben** verwenden <br/> Beispiel: /getUsers, /getOrders
- sollten **collections in Plural** sein <br/> Beispiel: /articles
<br/><br/>**CRUD Operations**: <br/>
Man kann folgende Operationen auf eine Ressource machen: <br/>
**create (POST)** : um eine neue Ressource zu erstellen <br/>
**Read (GET)** : um eine Ressource abzurufen <br/>
**Update (PUT oder PATCH)** : um eine Ressource zu ersetzen (PUT) oder zu bearbeiten (PATCH) <br/>
**Delete (DELETE)** : um eine Ressource zu löschen
2. **Parameter**: <br/>
Parameter sind dafür da, um Daten zu suchen, zu filtern und zu sortieren. <br/>
- man sollte mit **query parameter** arbeiten, wenn man eine **kleine Collection** betrachtet <br/>
Beispiel: Ich möchte Details über Arbeiter aus Kamerun haben <br/> http://localhost:8080/Employees?country=cameroon <br/>
- man sollte mit uri parameter arbeiten, wenn man eine große Collection betrachtet<br/> Beispiel: Ich möchte alle Details von der Abteilung 1, 2, 3 haben <br/>http://localhost:8080/Departments/123 <br/>
3. **Rückgabewerte**: sollten in Json Format sein.
4. **Fehlerbehandlung**:
Die Fehler müssen behandelt werden und entsprechenden Standardfehlercode zurückgeben: <br/># clientseitige Fehler:
**400**: Bad Request → wenn die clientseitige Eingabe ungültig ist <br/>
**401**: Unauthorized → wenn der Benutzer nicht authentifiziert ist und versucht, auf eine Ressource
zuzugreifen <br/>
**403**: Forbidden → wenn der Benutzer authentifiziert ist, aber nicht berechtigt ist, auf eine Ressource
zuzugreifen <br/>
**404**: Not Found: wenn eine Ressource nicht gefunden wird <br/>
**500**: Internal Server Error → Standardfehlermeldung, wenn serverseitig etwas schiefgeht <br/>
**502**: Bad Gateway → wenn ein als Gateway oder Proxy arbeitender Server eine ungültige Response vom dahinterliegenden Server erhält <br/>
**503**: Service Unavailable → wenn der Server vorübergehend nicht verfügbar ist (z. B. wegen Überlastung oder Wartung) <br/>
## Quelle:
* https://www.merixstudio.com/blog/best-practices-rest-api-development/ <br/>
* https://stackoverflow.blog/2020/03/02/best-practices-for-rest-api-design/ <br/>
* https://blog.dreamfactory.com/best-practices-for-naming-rest-api-endpoints/ <br/>
* https://swagger.io/resources/articles/best-practices-in-api-design/ <br/>
# Recherche andere ähnliche APIs, z.B. https://rapidapi.com/darkmanaminovic/api/question-generator inkl. Dokumentation
* https://opentdb.com/api_config.php <br/>
* https://jservice.io/ <br/>
* https://quizapi.io/ <br/>
* **Ergebnisse**: festgelegte Anzahl von Fragen nach Kategorien mit Antwortmöglichkeiten generieren <br/>
# Swagger-Doku
OpenAPI Specification (formerly Swagger Specification) is an API description format for REST APIs. <br/>
Swagger is a set of open-source tools built around the OpenAPI Specification that can help you design, build, document and consume REST APIs <br/>
* Quelle: https://swagger.io/docs/specification/about/ <br/>
* Dokumentation: https://swagger.io/specification/ <br/>
### Docker based Setup
Use the following command to start all containers (First run takes a lot of time):
docker-compose up --build
Afterwards the following services are available:
* Port 8080: ktor-server
* Port 8000: nlp_server
* Port 27017: MongoDB-Server
To access the OpenAPI documentation of nlp_server open **http://localhost:8000/docs**.
The following paths are mounted by default:
* nltk_data: to store nltk-data, e.g. big models, which we don't want to download every time we start our container.
* mongodb_data: stores the data of the mongodb container.
# Compose setup: NLP server, ktor server, OpenAPI validator and MongoDB.
version: "3.7"

services:
  # Python NLP server, published on host port 8000.
  python_nlp:
    build: nlp
    ports:
      - 8000:8000
    volumes:
      # Keeps downloaded nltk data on the host between container runs.
      - ./nltk_data:/usr/share/nltk_data

  # Ktor server, published on host port 8080.
  ktor-server:
    build: ktor-server
    ports:
      - 8080:8080

  # Waits for both servers to accept connections before it runs.
  openapi_validator:
    build: docker-openapi-validator
    environment:
      WAIT_HOSTS: python_nlp:8000, ktor-server:8080
      WAIT_TIMEOUT: 3600
      WAIT_HOSTS_TIMEOUT: 3600
      WAIT_SLEEP_INTERVAL: 5
    depends_on:
      - python_nlp
      - ktor-server

  # MongoDB, published on host port 27017.
  mongodb_container:
    image: mongo:latest
    environment:
      # NOTE(review): credentials are hard-coded here - confirm this file
      # is only used for local development.
      MONGO_INITDB_ROOT_USERNAME: root
      MONGO_INITDB_ROOT_PASSWORD: rootpassword
    ports:
      - 27017:27017
    volumes:
      # Database files are stored on the host.
      - ./mongodb_data:/data/db
# Image that installs ibm-openapi-validator and runs /entrypoint.sh once
# the docker-compose-wait tool has finished.
FROM node:15.14-alpine3.13
# ARG is used here to make auto-update easy
ARG version=0.46.0
RUN npm install -g ibm-openapi-validator@${version}
# wget is installed explicitly via apk
RUN apk add --no-cache wget
# NOTE(review): COPY keeps the file mode of entrypoint.sh - confirm the
# script is executable in the build context, otherwise CMD cannot run it.
COPY ./entrypoint.sh /
# create lint-openapi config
RUN lint-openapi init
# download docker-compose-wait
ADD https://github.com/ufoscout/docker-compose-wait/releases/download/2.7.3/wait /wait
RUN chmod +x /wait
## Launch the wait tool and then your application
# Shell-form CMD: /entrypoint.sh only runs if /wait exits successfully.
CMD /wait && /entrypoint.sh
#!/bin/sh
# Download the nlp_server OpenAPI document and lint it.
#
# Abort on the first failing command or unset variable, so that a failed
# download is reported as such instead of linting a missing file.
set -eu

# -O pins the output name; without it a second run would save the
# download as openapi.json.1 and the old file would be linted again.
wget -O openapi.json "http://python_nlp:8000/openapi.json"
#wget "http://ktor-server:8080/openapi.yml"

echo "####### Linting nlp_server OpenAPI.json #######"
# Last command: its exit status becomes the exit status of the script.
lint-openapi ./openapi.json
#lint-openapi ./openapi.yml
# Builds the ktor-server distribution with Gradle and starts it from the
# same image.
# NOTE(review): the stage is named "build" but no later stage copies from
# it, so the final image is the full Gradle/JDK image - confirm intended.
FROM gradle:7.0.2-jdk11 AS build
COPY --chown=gradle:gradle . /home/gradle/src
WORKDIR /home/gradle/src
# --no-daemon: no Gradle daemon is started during the image build.
RUN gradle clean installDist --no-daemon
EXPOSE 8080
# The start script is run from this directory by the CMD below.
WORKDIR /home/gradle/src/build/install/ktor-server/bin
CMD ./ktor-server
......@@ -5,22 +5,12 @@ import de.h_da.fbi.smebt.intentfinder.server.sources.DocxReader
import io.ktor.application.*
import io.ktor.features.*
import io.ktor.http.*
import io.ktor.http.content.*
import io.ktor.request.*
import io.ktor.response.*
import io.ktor.routing.*
import io.ktor.serialization.*
import kotlinx.serialization.json.Json
import registerUploadRoutes
import java.io.File
import java.nio.file.Files
import java.nio.file.Paths
import kotlinx.serialization.Serializable
/** Serializable value holding a file path together with a status string. */
@Serializable
data class FileStatus(val path: String, val status: String)
import java.lang.RuntimeException
/** Program entry point: hands the command-line arguments to Ktor's Netty `EngineMain`. */
fun main(args: Array<String>): Unit = io.ktor.server.netty.EngineMain.main(args)
......@@ -32,6 +22,16 @@ fun Application.module(testing: Boolean = false) {
})
}
install(StatusPages){
exception<InternalServerErrorException> { cause ->
call.respond(HttpStatusCode.InternalServerError)
throw cause
}
statusFile(HttpStatusCode.NotFound, filePattern = "error/error#.html")
}
routing {
get("/summary") {
val response = PythonBridge().getSummary("test bridge")
......@@ -41,6 +41,47 @@ fun Application.module(testing: Boolean = false) {
call.respondText("IntentFinder is available")
}
// Definition eines Endpunkts zum Hochladen einer DOCX-Datei
post("/file/{chatbotId}") {
call.respondText("file was successful uploaded")
}
// Definition endpoint zur Änderung eine bereits existierende docx
put("/file/{chatbotId}/{id}/{filename}"){
}
// Definition eines Endpunkts zur Definition einer FAQ-Webseite mit JSON-Konfiguration
post("/faqRessource/{chatbotId}/{jsonStructure}"){
}
// get faq with Json Configuration
get("/faqRessource"){
//Rückgabe Json Object (vgl. #38)
}
// Definition eines Endpunkts zur Definition einer Docx-Datei mit JSON-Konfiguration
post("/docxRessource/{chatbotId}/{filename}"){
}
// get docx file with Json Configuration
get("/docxRessource"){
//Rückgabe Json Object (vgl. #37)
}
// Definition endpoint zum Auslesen aller hochgeladener docx mit status
get("/files"){
}
// Routen ohne Funktionalität
routing{
}
var fileDescription = ""
var fileName = ""
post("/file/{chatbotId}") {
......@@ -133,3 +174,7 @@ fun Application.module(testing: Boolean = false) {
}
registerUploadRoutes()
}
/**
 * Unchecked exception matched by the `exception<InternalServerErrorException>`
 * handler installed with StatusPages, which responds with HTTP 500.
 */
class InternalServerErrorException : RuntimeException()
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>The Page cannot be found!</title>
</head>
<body>
<h1>Not Found!</h1>
</body>
</html>
\ No newline at end of file
# Image for the NLP server: installs the dependencies, lints the app with
# flake8 at build time and serves app.nlp_server:app with uvicorn.
FROM python:3.9.4
EXPOSE 8000
# Install Debian and PyPi packages
RUN apt-get update && apt-get install -y gcc python3-dev python3-pip libxml2-dev libxslt1-dev zlib1g-dev g++
COPY ./requirements.txt /requirements.txt
RUN pip install --upgrade pip
# NOTE(review): relative path - this only finds /requirements.txt while the
# working directory is "/"; no WORKDIR is set in this file. Confirm.
RUN pip install -r requirements.txt
# Copy app directory to container
COPY ./app /app
# RUN pip install fastapi uvicorn
# Run tests
# RUN python -m unittest /app/tests/test*
# Install and run linter
RUN pip install --no-input flake8
# A non-zero flake8 exit status fails this RUN step and thus the build.
RUN python -m flake8 --extend-exclude venv ./app
# Start server
# NOTE(review): possibly redundant if requirements.txt already lists
# fastapi and uvicorn - confirm.
RUN pip install fastapi uvicorn
CMD ["uvicorn", "app.nlp_server:app", "--host", "0.0.0.0", "--port", "8000"]
# NLP with Python
## setup
## Setup
### Docker based Setup
Use the following command to start all containers (First run takes a lot of time):
docker-compose up --build
Afterwards the following services are available:
* Port 8080: ktor-server
* Port 8000: nlp_server
* Port 27017: MongoDB-Server
To access the OpenAPI documentation of nlp_server open **http://localhost:8000/docs**.
The following paths are mounted by default:
* nltk_data: to store nltk-data, e.g. big models, which we don't want to download every time we start our container.
* mongodb_data: stores the data of the mongodb container.
### Bare metal setup
- install latest version of python and pip, tested with Python 3.9.2 and pip 21.0.1
- open command shell as **admin** and run
......
......@@ -6,6 +6,15 @@ from app.summary.summary_bert import BertSummary
from app.summary.summary_sentence_embedding import SentenceEmbeddingSummarizer
from app.summary.summary_word_embedding import WordEmbeddingSummarizer
from app.utilities import generator
from app.questiongenerator import QuestionGenerator
from pydantic import BaseModel
# Request body of POST /questionGenerator. Plain comments are used on
# purpose so that the generated schema text stays unchanged.
class Item(BaseModel):
    # Passed to QuestionGenerator.generate as the article text.
    text: str
    # Passed to generate as ``num_questions``.
    num_questions: int
    # Passed to generate as ``answer_style``.
    answer_style: str
app = FastAPI(
title="IntentFinder: NLP-API",
......@@ -29,6 +38,16 @@ strategies = [
WordEmbeddingSummarizer()]
# Shared QuestionGenerator, created on first use. Building one loads the
# pretrained models, so it is constructed once and reused instead of being
# rebuilt for every request.
_question_generator = None


def _get_question_generator():
    """Return the shared QuestionGenerator, creating it on first use."""
    global _question_generator
    if _question_generator is None:
        _question_generator = QuestionGenerator()
    return _question_generator


# Generates question/answer pairs for the posted text.
#
# ``item`` carries the source text, the number of questions to return and
# the answer style; all three are forwarded to QuestionGenerator.generate
# and its result is returned as the response body.
# NOTE(review): generate() raises ValueError for an unknown answer_style;
# nothing here maps that to a client error status - confirm desired.
@app.post("/questionGenerator")
async def api_questionGenerator(item: Item):
    generator = _get_question_generator()
    return generator.generate(
        item.text,
        num_questions=item.num_questions,
        answer_style=item.answer_style
    )
# Root endpoint: answers with a fixed availability message.
@app.get("/")
async def root():
    availability = {"message": "nlp server is available"}
    return availability
......
# import os
# import sys
# import math
import numpy as np
import torch
# import spacy
import re
import random
import json
import en_core_web_sm
from transformers import (
AutoTokenizer,
AutoModelForSeq2SeqLM,
AutoModelForSequenceClassification,
)
class QuestionGenerator:
def __init__(self, model_dir=None):
    """Load the question-generation model and create the QA evaluator.

    Args:
        model_dir: Handed unchanged to ``QAEvaluator``.
    """
    pretrained_name = "iarfmoose/t5-base-question-generator"

    # Marker tokens used when assembling model inputs, and the length
    # that inputs are padded/truncated to when they are encoded.
    self.ANSWER_TOKEN = "<answer>"
    self.CONTEXT_TOKEN = "<context>"
    self.SEQ_LENGTH = 512

    # Use the GPU when torch reports one, otherwise the CPU.
    device_name = "cuda" if torch.cuda.is_available() else "cpu"
    self.device = torch.device(device_name)

    self.qg_tokenizer = AutoTokenizer.from_pretrained(
        pretrained_name, use_fast=False
    )
    self.qg_model = AutoModelForSeq2SeqLM.from_pretrained(
        pretrained_name
    )
    self.qg_model.to(self.device)

    self.qa_evaluator = QAEvaluator(model_dir)
def generate(
self,
article,
use_evaluator=True,
num_questions=None,
answer_style="all"
):
print("Generating questions...\n")
qg_inputs, qg_answers = \
self.generate_qg_inputs(article, answer_style)
generated_questions = \
self.generate_questions_from_inputs(qg_inputs)
message = "{} questions doesn't match {} answers".format(
len(generated_questions), len(qg_answers)
)
assert len(generated_questions) == len(qg_answers), message
if use_evaluator:
print("Evaluating QA pairs...\n")
encoded_qa_pairs = self.qa_evaluator.encode_qa_pairs(
generated_questions, qg_answers
)
scores = self.qa_evaluator.get_scores(encoded_qa_pairs)
if num_questions:
qa_list = self._get_ranked_qa_pairs(
generated_questions, qg_answers, scores, num_questions
)
else:
qa_list = self._get_ranked_qa_pairs(
generated_questions, qg_answers, scores
)
else:
print("Skipping evaluation step.\n")
qa_list = self._get_all_qa_pairs(generated_questions, qg_answers)
return qa_list
def generate_qg_inputs(self, text, answer_style):
VALID_ANSWER_STYLES = ["all", "sentences", "multiple_choice"]
if answer_style not in VALID_ANSWER_STYLES:
raise ValueError(
"Invalid answer style {}. Please choose from {}".format(
answer_style, VALID_ANSWER_STYLES
)
)
inputs = []
answers = []
if answer_style == "sentences" or answer_style == "all":
segments = self._split_into_segments(text)
for segment in segments:
sentences = self._split_text(segment)
prepped_inputs, prepped_answers = self._prepare_qg_inputs(
sentences, segment
)
inputs.extend(prepped_inputs)
answers.extend(prepped_answers)
if answer_style == "multiple_choice" or answer_style == "all":
sentences = self._split_text(text)
prepped_inputs, prepped_answers = \
self._prepare_qg_inputs_MC(sentences)
inputs.extend(prepped_inputs)
answers.extend(prepped_answers)
return inputs, answers
def generate_questions_from_inputs(self, qg_inputs):
generated_questions = []
for qg_input in qg_inputs:
question = self._generate_question(qg_input)
generated_questions.append(question)
return generated_questions
def _split_text(self, text):
MAX_SENTENCE_LEN = 128
sentences = re.findall(r".*?[.!\?]", text)
cut_sentences = []
for sentence in sentences:
if len(sentence) > MAX_SENTENCE_LEN:
cut_sentences.extend(re.split("[,;:)]", sentence))
# temporary solution to remove useless post-quote sentence fragments
cut_sentences = [s for s in sentences if len(s.split(" ")) > 5]
sentences = sentences + cut_sentences
return list(set([s.strip(" ") for s in sentences]))
def _split_into_segments(self, text):
MAX_TOKENS = 490
paragraphs = text.split("\n")
tokenized_paragraphs = [
self.qg_tokenizer(p)["input_ids"] for p in paragraphs if len(p) > 0
]
segments = []
while len(tokenized_paragraphs) > 0:
segment = []
while len(segment) < MAX_TOKENS and len(tokenized_paragraphs) > 0:
paragraph = tokenized_paragraphs.pop(0)
segment.extend(paragraph)
segments.append(segment)
return [self.qg_tokenizer.decode(s) for s in segments]
def _prepare_qg_inputs(self, sentences, text):
inputs = []
answers = []
for sentence in sentences:
qg_input = "{} {} {} {}".format(
self.ANSWER_TOKEN, sentence, self.CONTEXT_TOKEN, text
)
inputs.append(qg_input)
answers.append(sentence)
return inputs, answers
def _prepare_qg_inputs_MC(self, sentences):
    """Build multiple-choice inputs from the entities in ``sentences``.

    Every entity found in a sentence yields one model input (the entity
    as answer, its sentence as context) and one list of answer choices
    from ``_get_MC_answers``, drawn from the entities of all sentences.

    Returns:
        ``(inputs, answers)`` as two lists of equal length.
    """
    nlp = en_core_web_sm.load()
    # The parser component is disabled for this pass.
    parsed = list(nlp.pipe(sentences, disable=["parser"]))

    template = "{} {} {} {}"
    mc_inputs = []
    mc_answers = []
    for sentence, doc in zip(sentences, parsed):
        for entity in doc.ents:
            qg_input = template.format(
                self.ANSWER_TOKEN, entity, self.CONTEXT_TOKEN, sentence
            )
            choices = self._get_MC_answers(entity, parsed)
            mc_inputs.append(qg_input)
            mc_answers.append(choices)

    return mc_inputs, mc_answers
def _get_MC_answers(self, correct_answer, docs):
entities = []
for doc in docs:
entities.extend(
[
{
"text": e.text,
"label_": e.label_
} for e in doc.ents
]
)
# remove duplicate elements
entities_json = [json.dumps(kv) for kv in entities]
pool = set(entities_json)
num_choices = (
min(4, len(pool)) - 1
) # -1 because we already have the correct answer
# add the correct answer
final_choices = []
correct_label = correct_answer.label_
final_choices.append({"answer": correct_answer.text, "correct": True})
pool.remove(
json.dumps(
{"text": correct_answer.text, "label_": correct_answer.label_}
)
)
# find answers with the same NER label
matches = [e for e in pool if correct_label in e]
# if we don't have enough then add some other random answers
if len(matches) < num_choices:
choices = matches
pool = pool.difference(set(choices))
choices.extend(random.sample(pool, num_choices - len(choices)))
else:
choices = random.sample(matches, num_choices)
choices = [json.loads(s) for s in choices]
for choice in choices:
final_choices.append({"answer": choice["text"], "correct": False})
random.shuffle(final_choices)
return final_choices
def _generate_question(self, qg_input):
self.qg_model.eval()
encoded_input = self._encode_qg_input(qg_input)
with torch.no_grad():
output = self.qg_model.generate(
input_ids=encoded_input["input_ids"]
)
question = self.qg_tokenizer.decode(
output[0],
skip_special_tokens=True
)
return question
def _encode_qg_input(self, qg_input):
return self.qg_tokenizer(
qg_input,
padding='max_length',
max_length=self.SEQ_LENGTH,
truncation=True,
return_tensors="pt",
).to(self.device)