Commit e68f9cf6 authored by Ala Rouis
Browse files

Merge branch 'fix/#56-Fragemöglichkeit_mit_Bibliothek_Question_Generator_generieren_' into 'master'

#56 fragemöglichkeit mit bibliothek question generator generieren

See merge request !82
parents a8af12e8 2e390103
Pipeline #79945 passed with stages
in 3 minutes and 7 seconds
import json
import os
from chatette.facade import Facade
from fastapi import FastAPI, Request
from fastapi.encoders import jsonable_encoder
from fastapi.exceptions import RequestValidationError
......@@ -27,12 +23,6 @@ from app.utilities.utilities import Utilities
class Item(BaseModel):
    # Request body of the /questionGenerator endpoint.
    text: str  # source text handed to the question generator
    num_questions: int  # forwarded as `num_questions` to generate()
    answer_style: str  # forwarded as `answer_style` to generate()
class Item2(BaseModel):
    # Request body of the /chatette endpoint.
    input_path: str  # used as `master_file_path` of the Chatette Facade
app = FastAPI(
......@@ -86,28 +76,18 @@ async def exception_handler(request: Request,
@app.post("/questionGenerator")
async def api_question_generator(item: Item):
    """
    Generate questions from a text with the Question Generator NLP system.

    The first call into the library takes a long time.
    A generated question may appear in English, but the answer is always
    in German.

    :param item: request body; its ``text``, ``num_questions`` and
        ``answer_style`` fields are forwarded to the generator
    :return: The list of all questions
    """
    # A new generator is built on every request; `nlp` comes from the
    # enclosing scope.
    question_generator = QuestionGenerator(_nlp=nlp)
    return question_generator.generate(
        item.text,
        num_questions=item.num_questions,
        answer_style=item.answer_style
    )
@app.post("/chatette")
async def test(item2: Item2):
facade = Facade(
master_file_path=item2.input_path,
output_dir_path="./app/output",
force_overwriting=True
answer_style='sentences'
)
facade.run()
with open(os.path.join(
"./app/output/train/output.json"),
"r"
) as json_output:
return_json = json_output.read()
return json.loads(return_json)
@app.get("/")
......
# pylint: skip-file
# import os
# import sys
# import math
import json
import random
# import spacy
import re
import numpy as np
import torch
from transformers import (
AutoTokenizer,
......@@ -17,13 +10,11 @@ from transformers import (
class QuestionGenerator:
def __init__(self, _nlp, model_dir=None):
def __init__(self, _nlp):
qg_pretrained = "iarfmoose/t5-base-question-generator"
self.ANSWER_TOKEN = "<answer>"
self.CONTEXT_TOKEN = "<context>"
self.SEQ_LENGTH = 512
self.answer_token = "<answer>"
self.context_token = "<context>"
self.device = \
torch.device("cuda" if torch.cuda.is_available() else "cpu")
......@@ -33,7 +24,7 @@ class QuestionGenerator:
AutoModelForSeq2SeqLM.from_pretrained(qg_pretrained)
self.qg_model.to(self.device)
self.qa_evaluator = QAEvaluator(model_dir)
self.qa_evaluator = QAEvaluator()
self._nlp = _nlp
......@@ -45,8 +36,6 @@ class QuestionGenerator:
answer_style="all"
):
print("Generating questions...\n")
qg_inputs, qg_answers = \
self.generate_qg_inputs(article, answer_style)
generated_questions = \
......@@ -58,9 +47,6 @@ class QuestionGenerator:
assert len(generated_questions) == len(qg_answers), message
if use_evaluator:
print("Evaluating QA pairs...\n")
encoded_qa_pairs = self.qa_evaluator.encode_qa_pairs(
generated_questions, qg_answers
)
......@@ -75,7 +61,6 @@ class QuestionGenerator:
)
else:
print("Skipping evaluation step.\n")
qa_list = self._get_all_qa_pairs(generated_questions, qg_answers)
return qa_list
......@@ -94,7 +79,7 @@ class QuestionGenerator:
inputs = []
answers = []
if answer_style == "sentences" or answer_style == "all":
if answer_style == "sentences":
segments = self._split_into_segments(text)
for segment in segments:
sentences = self._split_text(segment)
......@@ -104,13 +89,6 @@ class QuestionGenerator:
inputs.extend(prepped_inputs)
answers.extend(prepped_answers)
if answer_style == "multiple_choice" or answer_style == "all":
sentences = self._split_text(text)
prepped_inputs, prepped_answers = \
self._prepare_qg_inputs_mc(sentences)
inputs.extend(prepped_inputs)
answers.extend(prepped_answers)
return inputs, answers
def generate_questions_from_inputs(self, qg_inputs):
......@@ -135,8 +113,9 @@ class QuestionGenerator:
# temporary solution to remove useless post-quote sentence fragments
cut_sentences = [s for s in sentences if len(s.split(" ")) > 5]
sentences = sentences + cut_sentences
return list(set([s.strip(" ") for s in sentences]))
test1 = [s.strip(" ") for s in sentences]
test = set(test1)
return list(test)
def _split_into_segments(self, text):
max_tokens = 490
......@@ -161,7 +140,7 @@ class QuestionGenerator:
for sentence in sentences:
qg_input = "{} {} {} {}".format(
self.ANSWER_TOKEN, sentence, self.CONTEXT_TOKEN, text
self.answer_token, sentence, self.context_token, text
)
inputs.append(qg_input)
answers.append(sentence)
......@@ -174,15 +153,15 @@ class QuestionGenerator:
inputs_from_text = []
answers_from_text = []
for i in range(len(sentences)):
for i, sentence in enumerate(sentences):
entities = docs[i].ents
if entities:
for entity in entities:
qg_input = "{} {} {} {}".format(
self.ANSWER_TOKEN,
self.answer_token,
entity,
self.CONTEXT_TOKEN,
sentences[i]
self.context_token,
sentence
)
answers = self._get_mc_answers(entity, docs)
inputs_from_text.append(qg_input)
......@@ -255,7 +234,7 @@ class QuestionGenerator:
return self.qg_tokenizer(
qg_input,
padding='max_length',
max_length=self.SEQ_LENGTH,
max_length=512,
truncation=True,
return_tensors="pt",
).to(self.device)
......@@ -265,42 +244,36 @@ class QuestionGenerator:
):
if num_questions > len(scores):
num_questions = len(scores)
print(
"\nWas only able to generate {} questions".format(
num_questions
)
)
qa_list = []
for i in range(num_questions):
index = scores[i]
qa = self._make_dict(
q_a = self._make_dict(
generated_questions[index].split("?")[0] + "?",
qg_answers[index]
)
qa_list.append(qa)
qa_list.append(q_a)
return qa_list
def _get_all_qa_pairs(self, generated_questions, qg_answers):
qa_list = []
for i in range(len(generated_questions)):
qa = self._make_dict(
generated_questions[i].split("?")[0] + "?", qg_answers[i]
for i, generated_question in enumerate(generated_questions):
q_a = self._make_dict(
generated_question.split("?")[0] + "?", qg_answers[i]
)
qa_list.append(qa)
qa_list.append(q_a)
return qa_list
@staticmethod
def _make_dict(question, answer):
qa = {"question": question, "answer": answer}
return qa
q_a = {"question": question, "answer": answer}
return q_a
class QAEvaluator:
def __init__(self, model_dir=None):
def __init__(self):
qae_pretrained = "iarfmoose/bert-base-cased-qa-evaluator"
self.SEQ_LENGTH = 512
self.device = torch.\
device("cuda" if torch.cuda.is_available() else "cpu")
......@@ -313,8 +286,8 @@ class QAEvaluator:
def encode_qa_pairs(self, questions, answers):
encoded_pairs = []
for i in range(len(questions)):
encoded_qa = self._encode_qa(questions[i], answers[i])
for i, question in enumerate(questions):
encoded_qa = self._encode_qa(question, answers[i])
encoded_pairs.append(encoded_qa.to(self.device))
return encoded_pairs
......@@ -322,8 +295,8 @@ class QAEvaluator:
scores = {}
self.qae_model.eval()
with torch.no_grad():
for i in range(len(encoded_qa_pairs)):
scores[i] = self._evaluate_qa(encoded_qa_pairs[i])
for i, encoded_qa_pair in enumerate(encoded_qa_pairs):
scores[i] = self._evaluate_qa(encoded_qa_pair)
return [
k for k, v in sorted(
......@@ -334,17 +307,17 @@ class QAEvaluator:
]
def _encode_qa(self, question, answer):
if type(answer) is list:
for a in answer:
if a["correct"]:
correct_answer = a["answer"]
if isinstance(answer, list):
for a_a in answer:
if a_a["correct"]:
correct_answer = a_a["answer"]
else:
correct_answer = answer
return self.qae_tokenizer(
text=question,
text_pair=correct_answer,
padding="max_length",
max_length=self.SEQ_LENGTH,
max_length=512,
truncation=True,
return_tensors="pt",
)
......@@ -352,47 +325,3 @@ class QAEvaluator:
def _evaluate_qa(self, encoded_qa_pair):
    """
    Run the QA-evaluator model on one encoded question/answer pair.

    :param encoded_qa_pair: mapping that is unpacked into the keyword
        arguments of ``self.qae_model``
    :return: the element at index ``[0][0][1]`` of the model output
    """
    output = self.qae_model(**encoded_qa_pair)
    # NOTE(review): presumably the class-1 logit of the first batch entry,
    # used as the pair's score -- confirm against the model's output layout.
    return output[0][0][1]
def print_qa(qa_list, show_answers=True):
    """
    Print a numbered list of question/answer pairs to stdout.

    :param qa_list: sequence of dicts with a "question" key and an "answer"
        key. The answer is either a plain value (full-sentence answer) or a
        list of dicts with "answer" and "correct" keys (multiple choice).
    :param show_answers: when true, answers are printed and every correct
        multiple-choice option is tagged "(correct)". When false, the
        multiple-choice options are still listed but untagged, and
        full-sentence answers are omitted.
    """
    for number, q_a in enumerate(qa_list, start=1):
        # wider indent once the question number has two digits
        space = " " * (3 if number < 10 else 4)
        print("{}) Q: {}".format(number, q_a["question"]))
        answer = q_a["answer"]

        # full-sentence answer
        if not isinstance(answer, list):
            if show_answers:
                print("{}A:".format(space), answer, "\n")
            continue

        # multiple-choice answer: only the first option carries the "A:" tag
        for position, option in enumerate(answer, start=1):
            if position == 1:
                prefix = "{}A: 1.".format(space)
            else:
                prefix = "{}{}.".format(space + " ", position)
            if show_answers:
                # Every option uses the same truthiness test for "correct";
                # the empty tag keeps the trailing separator on other lines.
                tag = "(correct)" if option["correct"] else ""
                print(prefix, option["answer"], tag)
            else:
                print(prefix, option["answer"])
        print("")
......@@ -78,6 +78,5 @@ protobuf==3.16.0
summarizer==0.0.7
bert-extractive-summarizer==0.7.1
sentencepiece==0.1.95
chatette==1.6.3
pymongo==3.11.4
mongomock==3.23.0
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment