Commit 6870db2e authored by Iulia Damian's avatar Iulia Damian
Browse files

Merge branch '#38_Q&A_Exctraction_HTML' into 'master'

#38  Extraction von Fragen und Antworten einer HTML-FAQ-Datei auf Basis einer Konfiguration

See merge request !61
parents 18098e32 ef5738a6
Pipeline #73942 passed with stages
in 2 minutes and 12 seconds
# Default ignored files
/shelf/
/workspace.xml
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml
# Editor-based HTTP Client requests
/httpRequests/
<component name="ProjectCodeStyleConfiguration">
<code_scheme name="Project" version="173">
<JetCodeStyleSettings>
<option name="CODE_STYLE_DEFAULTS" value="KOTLIN_OFFICIAL" />
</JetCodeStyleSettings>
<codeStyleSettings language="kotlin">
<option name="CALL_PARAMETERS_WRAP" value="5" />
<option name="CALL_PARAMETERS_LPAREN_ON_NEXT_LINE" value="true" />
<option name="CALL_PARAMETERS_RPAREN_ON_NEXT_LINE" value="true" />
<option name="METHOD_PARAMETERS_WRAP" value="5" />
<option name="METHOD_PARAMETERS_LPAREN_ON_NEXT_LINE" value="true" />
<option name="METHOD_PARAMETERS_RPAREN_ON_NEXT_LINE" value="true" />
<option name="EXTENDS_LIST_WRAP" value="1" />
<option name="METHOD_CALL_CHAIN_WRAP" value="1" />
<option name="ASSIGNMENT_WRAP" value="1" />
</codeStyleSettings>
</code_scheme>
</component>
\ No newline at end of file
<component name="ProjectCodeStyleConfiguration">
<state>
<option name="USE_PER_PROJECT_SETTINGS" value="true" />
</state>
</component>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="GradleMigrationSettings" migrationVersion="1" />
<component name="GradleSettings">
<option name="linkedExternalProjectsSettings">
<GradleProjectSettings>
<option name="delegatedBuild" value="true" />
<option name="testRunner" value="GRADLE" />
<option name="distributionType" value="DEFAULT_WRAPPED" />
<option name="externalProjectPath" value="$PROJECT_DIR$/src/ktor-server" />
<option name="gradleJvm" value="11" />
<option name="modules">
<set>
<option value="$PROJECT_DIR$/src/ktor-server" />
</set>
</option>
</GradleProjectSettings>
</option>
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<module type="JAVA_MODULE" version="4">
<component name="NewModuleRootManager" inherit-compiler-output="true">
<exclude-output />
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="RemoteRepositoriesConfiguration">
<remote-repository>
<option name="id" value="central" />
<option name="name" value="Maven Central repository" />
<option name="url" value="https://repo1.maven.org/maven2" />
</remote-repository>
<remote-repository>
<option name="id" value="jboss.community" />
<option name="name" value="JBoss Community repository" />
<option name="url" value="https://repository.jboss.org/nexus/content/repositories/public/" />
</remote-repository>
<remote-repository>
<option name="id" value="MavenRepo" />
<option name="name" value="MavenRepo" />
<option name="url" value="https://repo.maven.apache.org/maven2/" />
</remote-repository>
<remote-repository>
<option name="id" value="BintrayJCenter" />
<option name="name" value="BintrayJCenter" />
<option name="url" value="https://jcenter.bintray.com/" />
</remote-repository>
<remote-repository>
<option name="id" value="maven" />
<option name="name" value="maven" />
<option name="url" value="https://kotlin.bintray.com/kotlinx" />
</remote-repository>
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="libraries-with-intellij-classes">
<option name="intellijApiContainingLibraries">
<list>
<LibraryCoordinatesState>
<option name="artifactId" value="ideaIU" />
<option name="groupId" value="com.jetbrains.intellij.idea" />
</LibraryCoordinatesState>
<LibraryCoordinatesState>
<option name="artifactId" value="ideaIU" />
<option name="groupId" value="com.jetbrains" />
</LibraryCoordinatesState>
<LibraryCoordinatesState>
<option name="artifactId" value="ideaIC" />
<option name="groupId" value="com.jetbrains.intellij.idea" />
</LibraryCoordinatesState>
<LibraryCoordinatesState>
<option name="artifactId" value="ideaIC" />
<option name="groupId" value="com.jetbrains" />
</LibraryCoordinatesState>
<LibraryCoordinatesState>
<option name="artifactId" value="pycharmPY" />
<option name="groupId" value="com.jetbrains.intellij.pycharm" />
</LibraryCoordinatesState>
<LibraryCoordinatesState>
<option name="artifactId" value="pycharmPY" />
<option name="groupId" value="com.jetbrains" />
</LibraryCoordinatesState>
<LibraryCoordinatesState>
<option name="artifactId" value="pycharmPC" />
<option name="groupId" value="com.jetbrains.intellij.pycharm" />
</LibraryCoordinatesState>
<LibraryCoordinatesState>
<option name="artifactId" value="pycharmPC" />
<option name="groupId" value="com.jetbrains" />
</LibraryCoordinatesState>
<LibraryCoordinatesState>
<option name="artifactId" value="clion" />
<option name="groupId" value="com.jetbrains.intellij.clion" />
</LibraryCoordinatesState>
<LibraryCoordinatesState>
<option name="artifactId" value="clion" />
<option name="groupId" value="com.jetbrains" />
</LibraryCoordinatesState>
<LibraryCoordinatesState>
<option name="artifactId" value="riderRD" />
<option name="groupId" value="com.jetbrains.intellij.rider" />
</LibraryCoordinatesState>
<LibraryCoordinatesState>
<option name="artifactId" value="riderRD" />
<option name="groupId" value="com.jetbrains" />
</LibraryCoordinatesState>
<LibraryCoordinatesState>
<option name="artifactId" value="goland" />
<option name="groupId" value="com.jetbrains.intellij.goland" />
</LibraryCoordinatesState>
<LibraryCoordinatesState>
<option name="artifactId" value="goland" />
<option name="groupId" value="com.jetbrains" />
</LibraryCoordinatesState>
</list>
</option>
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ExternalStorageConfigurationManager" enabled="true" />
<component name="FrameworkDetectionExcludesConfiguration">
<file type="web" url="file://$PROJECT_DIR$/src/ktor-server" />
</component>
<component name="ProjectRootManager" version="2" project-jdk-name="11" project-jdk-type="JavaSDK" />
<component name="SuppressABINotification">
<option name="modulesWithSuppressedNotConfigured">
<set>
<option value="ktor-server" />
</set>
</option>
</component>
<component name="SuppressKotlinCodeStyleNotification">
<option name="disableForAll" value="true" />
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/intentfinder.iml" filepath="$PROJECT_DIR$/.idea/intentfinder.iml" />
</modules>
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="" vcs="Git" />
</component>
</project>
\ No newline at end of file
......@@ -3,10 +3,10 @@
"name": "https://www.berlin.de/sen/finanzen/steuern/informationen-fuer-steuerzahler-/faq-steuern/artikel.697552.php",
"question" : {
"type": "text",
"css_selector": "div.block a"
"css_selector": "div.block .land-toggler-button"
},
"answer": {
"type": "text",
"css_selector": "div.block .text"
"css_selector": "div.block .html5-section.collapse"
}
}
\ No newline at end of file
Diese Seite kann aufgrund der starken Verschachtelung nicht ausgelesen werden.
{
"type": "url",
"name": "https://www.dge.de/wissenschaft/weitere-publikationen/faqs/?L=0",
......
......@@ -7,7 +7,6 @@
},
"answer": {
"type": "href",
"href_css_selector": "table.category > tbody td.list-title > a",
"css_selector": "div.item-page > div:nth-child(5)"
"css_selector": "#content > div.item-page > [itemprop = articleBody]"
}
}
\ No newline at end of file
......@@ -9,7 +9,7 @@
.idea/**/usage.statistics.xml
.idea/**/dictionaries
.idea/**/shelf
.idea/
# Generated files
.idea/**/contentModel.xml
......@@ -72,3 +72,5 @@ fabric.properties
# Android studio 3.1+ serialized cache file
.idea/caches/build_file_checksums.ser
#mongodb_data
mongodb_data
\ No newline at end of file
......@@ -4,6 +4,9 @@
<component name="FrameworkDetectionExcludesConfiguration">
<file type="web" url="file://$PROJECT_DIR$" />
</component>
<component name="ProjectPlainTextFileTypeManager">
<file url="file://$PROJECT_DIR$/src/main/kotlin/de/h_da/fbi/smebt/intentfinder/server/FileReader.kt" />
</component>
<component name="ProjectRootManager" version="2" languageLevel="JDK_11" default="true" project-jdk-name="11" project-jdk-type="JavaSDK">
<output url="file://$PROJECT_DIR$/out" />
</component>
......
......@@ -2,7 +2,7 @@ package de.h_da.fbi.smebt.intentfinder.server
import com.squareup.moshi.JsonDataException
import de.h_da.fbi.smebt.intentfinder.server.nlp.PythonBridge
import de.h_da.fbi.smebt.intentfinder.server.sources.MoshiReader
import de.h_da.fbi.smebt.intentfinder.server.sources.*
import io.ktor.application.*
import io.ktor.features.*
import io.ktor.http.*
......@@ -25,7 +25,7 @@ fun Application.module() {
})
}
install(StatusPages){
install(StatusPages) {
exception<InternalServerErrorException> { cause ->
call.respond(HttpStatusCode.InternalServerError)
......@@ -40,6 +40,8 @@ fun Application.module() {
val response = PythonBridge().getSummary("test bridge")
call.respond(response)
}
post("/{name}") {
......@@ -51,112 +53,87 @@ fun Application.module() {
}
// Definition endpoint zur Änderung eine bereits existierende docx
put("/file/{chatbotId}/{id}/{filename}"){
put("/file/{chatbotId}/{id}/{filename}") {
}
// Definition eines Endpunkts zur Definition einer FAQ-Webseite mit JSON-Konfiguration
post("/faqRessource/{chatbotId}/{jsonStructure}"){
try{
post("/faqResource/{chatbotId}/{jsonStructure}") {
var jsonObj: JSONUrlConfig? = null
var qAaMap: HashMap<String, String>? = null
try {
//parse file title
var title =call.parameters["jsonStructure"]
var title = call.parameters["jsonStructure"]
if (title != null) {
title=title.replace(".json", "", false)
title = title.replace(".json", "", false)
}
//set file directory
val uploadDir = "tmp"
val multipart = call.receiveMultipart()
var file = File("")
multipart.forEachPart { part ->
when (part) {
is PartData.FormItem -> {
if (part.name == "title") {
title = part.value
}
}
is PartData.FileItem -> {
val ext = File(part.originalFileName).extension
//generate file title
file = File(uploadDir, "upload-${System.currentTimeMillis()}-${title.hashCode()}.$ext")
val fileBytes = part.streamProvider().readBytes()
File(uploadDir).mkdir()
file.writeBytes(fileBytes)
}
}
part.dispose()
}
val fileReader = FileReader(multipart, title)
val file = fileReader.readFile()
//building json String from File
val stringList = file.readLines()
var json = ""
for(element in stringList){
json+=element
for (element in stringList) {
json += element
}
try{
//Generate DataObject from json string
val moshiReader = MoshiReader("url")
val jsonO = moshiReader.getObjectFromJson(json)
}
catch (exception: Exception){
when(exception) {
is JsonDataException -> {
val errorMessage = "The uploaded file doesn't match the predefined structure. The only valid structure is: \n" +
" {\n" +
" \"type\": \"url\",\n" +
" \"name\": \"https://your.url\",\n" +
" \"question\" : \n " +
" {\n" +
" \"type\": \"text\",\n" +
" \"css_selector\": \"table.category > tbody td.list-title > a\"\n" +
" },\n" +
" \"answer\": \n " +
" {\n" +
" \"type\": \"[text|href]\",\n" +
" \"href_css_selector\": \"table.category > tbody td.list-title > a\",\n" +
" \"css_selector\": \"div.item-page > div:nth-child(5)\"\n" +
" }\n" +
"} \n"+
"The following field is missing or couldn't be parsed: "
call.respondText(errorMessage+exception.message.toString())
}
else -> throw exception
}
}
}
catch (exception: Exception){
call.respondText("The uploaded File can not be parsed. Please choose another .json File")
println(exception.message.toString())
//Generate DataObject from json string
val moshiReader = MoshiReader("url")
jsonObj = moshiReader.getObjectFromJson(json) as JSONUrlConfig
//extract question answer pairs
val urlReader = UrlReader()
val urlDoc = urlReader.getDocument(jsonObj.name)
qAaMap = urlReader.extraction(
urlDoc,
jsonObj.question.css_selector,
jsonObj.answer.css_selector,
jsonObj.answer.type
)
} catch (exception: Exception) {
call.respondText(UrlReaderException("").getExceptionText(exception))
}
//speichern in db vgl. #39
call.respondText(
"{number_of_pairs: " +
qAaMap?.size +
"}"
)
}
// get faq with Json Configuration
get("/faqRessource"){
//Rückgabe Json Object (vgl. #38)
get("/faqResource") {
// Rückgabe Json Object (vgl. #38)
}
// Definition eines Endpunkts zur Definition einer Docx-Datei mit JSON-Konfiguration
post("/docxRessource/{chatbotId}/{filename}"){
post("/docxResource/{chatbotId}/{filename}") {
}
// get docx file with Json Configuration
get("/docxRessource"){
get("/docxResource") {
//Rückgabe Json Object (vgl. #37)
}
// Definition endpoint zum Auslesen aller hochgeladener docx mit status
get("/files"){
get("/files") {
}
// Routen ohne Funktionalität
routing{
routing {
}
}
// registerUploadRoutes()
// registerUploadRoutes()
}
/**
 * Signals an unexpected server-side failure.
 *
 * The `StatusPages` feature installed in `Application.module` registers a handler for this
 * exception type that responds with HTTP 500 (Internal Server Error).
 */
class InternalServerErrorException : RuntimeException()
......
package de.h_da.fbi.smebt.intentfinder.server.sources
import io.ktor.http.content.*
import java.io.File
/**
 * Stores the file part of a multipart upload in the local upload directory.
 *
 * @property multipart the multipart request body to consume
 * @property title initial title used when naming the stored file; it is replaced by the value of
 * a form field named "title" if such a field is encountered before the file part
 */
class FileReader(private val multipart: MultiPartData, private var title: String?) {
    // Directory (relative to the working directory) that receives the uploaded files
    private val uploadDir = "tmp"

    /**
     * Consumes every part of [multipart] and writes the uploaded file to [uploadDir].
     *
     * If the request contains several file parts, each one is written and the last one is returned.
     *
     * @return the stored file, or `File("")` when the request contained no file part
     */
    suspend fun readFile(): File {
        var file = File("")
        multipart.forEachPart { part ->
            try {
                handlePart(part)?.let { file = it }
            } finally {
                // Release the part's resources even when storing it failed
                part.dispose()
            }
        }
        return file
    }

    /**
     * Processes a single part: a "title" form field updates [title], a file part is written to disk.
     *
     * @return the written file for a file part, `null` for every other kind of part
     */
    private fun handlePart(part: PartData): File? {
        if (part is PartData.FormItem) {
            if (part.name == "title") {
                title = part.value
            }
            return null
        }
        if (part !is PartData.FileItem) {
            return null
        }

        // The client may omit the filename; fall back to an empty extension instead of crashing
        val ext = part.originalFileName?.let { File(it).extension }.orEmpty()
        // Generate the file name from the current time and the title's hash
        val target = File(uploadDir, "upload-${System.currentTimeMillis()}-${title.hashCode()}.$ext")
        // Close the upload stream as soon as its content has been read
        val fileBytes = part.streamProvider().use { it.readBytes() }
        File(uploadDir).mkdirs()
        target.writeBytes(fileBytes)
        return target
    }
}
\ No newline at end of file
......@@ -7,12 +7,11 @@ package de.h_da.fbi.smebt.intentfinder.server.sources
"question" :
{
"type": "text",
"css_selector": "table.category > tbody td.list-title > a",
"css_selector": "table.category > tbody td.list-title > a"
},
"answer":
{
"type": "href",
"href_css_selector": "table.category > tbody td.list-title > a",
"css_selector": "div.item-page > div:nth-child(5)"
}
}
......@@ -34,7 +33,5 @@ data class Question(
)
/**
 * Answer section of a URL configuration: describes how answers are located.
 */
data class Answer(
// Answer mode; UrlReader.extraction handles "text" and "href" and rejects any other value
val type:String,
// NOTE(review): not read by any code visible here; this change appears to drop the field - confirm
val href_css_selector: String,
// CSS selector for the answer elements; passed to UrlReader.extraction as the answers selector
val css_selector:String
)
\ No newline at end of file
package de.h_da.fbi.smebt.intentfinder.server.sources
import org.jsoup.*
import org.jsoup.nodes.*
/**
 * Extracts question/answer pairs from an FAQ web page by means of CSS selectors.
 *
 * The reader keeps no state between calls: every [extraction] call returns a fresh map.
 */
class UrlReader() {

    /**
     * Connects to [websiteUrl] and returns the parsed page.
     */
    fun getDocument(websiteUrl: String?): Document {
        return Jsoup.connect(websiteUrl).get()
    }

    /**
     * Extracts question/answer pairs from [doc].
     *
     * @param doc page containing the questions
     * @param questionsTag CSS selector matching the question elements
     * @param answersTag CSS selector matching the answer elements
     * @param answerType "text" if the answers are on the same page as the questions, "href" if
     * every question element links (via its `href` attribute) to a page containing its answer
     * @return map from question text to answer text
     * @throws UrlReaderException if [answerType] is unknown, if the number of questions and
     * answers differs, or if nothing was matched at all
     */
    fun extraction(
        doc: Document,
        questionsTag: String?,
        answersTag: String?,
        answerType: String?
    ): HashMap<String, String> {
        val questions = doc.select(questionsTag)
        val answers: List<Element> = when (answerType) {
            "text" -> doc.select(answersTag)
            // Follow the link of every question element and collect the answers found there.
            // NOTE(review): the matches of all linked pages are concatenated and later paired with
            // the questions by position, so this only lines up when each linked page yields
            // exactly one match - confirm that this holds for the configured selectors.
            "href" -> questions.flatMap { question ->
                Jsoup.connect(question.attr("abs:href")).get().select(answersTag)
            }
            else -> throw UrlReaderException("Unknown answer type. \n" + UrlReaderException("").errorMessage)
        }
        return generateQuestionAnswerPairs(questions, answers)
    }

    /**
     * Pairs [questions] and [answers] by position.
     *
     * @throws UrlReaderException if the sizes differ or both lists are empty
     */
    private fun generateQuestionAnswerPairs(
        questions: List<Element>,
        answers: List<Element>
    ): HashMap<String, String> {
        if (questions.size != answers.size) {
            throw UrlReaderException(
                "Based on your configuration file, no matching between " +
                    "answers (size: " + answers.size + " ) and questions (size:" + questions.size + " ) were found."
            )
        }
        if (questions.isEmpty()) {
            throw UrlReaderException(
                "No data could be extracted from the website. Please check your " +
                    "configuration file"
            )
        }
        // Build a new map per call so that results of earlier extractions never leak into this one
        val pairs = HashMap<String, String>()
        for ((index, question) in questions.withIndex()) {
            pairs[question.text()] = answers[index].text()
        }
        return pairs
    }
}