Commit c564cad8 authored by Amr Abdelghany Abdou
Browse files

Merge branch 'issue#37' into 'master'

[issue, #37] Extraction von Fragen und Antworten einer DOCX-FAQ-Datei auf...

See merge request !63
parents accfe3eb c9651f57
Pipeline #74022 passed with stages
in 2 minutes and 8 seconds
package de.h_da.fbi.smebt.intentfinder.server.sources
import de.h_da.fbi.smebt.intentfinder.server.nlp.PythonBridge
import io.ktor.client.features.*
import org.apache.poi.xwpf.extractor.XWPFWordExtractor
import org.apache.poi.xwpf.usermodel.XWPFDocument
import org.apache.poi.xwpf.usermodel.XWPFParagraph
import org.apache.poi.xwpf.usermodel.*
import org.openxmlformats.schemas.wordprocessingml.x2006.main.STHighlightColor
import java.io.FileOutputStream
import java.nio.file.Files
......@@ -11,14 +11,18 @@ import java.nio.file.Paths
import java.nio.file.StandardCopyOption
class DocxReader(private val fileName: String) {
class DocxReader(private val jsonConfig: JSONDocxConfig) {
private var text: String = ""
private var paragraphs: List<XWPFParagraph>? = null
private val runs = ArrayList<XWPFRun>()
private var style: XWPFStyles? = null
fun getParagraphs(): List<String> {
val paragraphs = ArrayList<String>()
XWPFDocument(
DocxReader::class.java.getResource("/" + fileName).openStream()
Files.newInputStream(Paths.get(jsonConfig.name))
).use { doc ->
val list: List<XWPFParagraph> = doc.getParagraphs()
for (paragraph in list) {
......@@ -32,7 +36,7 @@ class DocxReader(private val fileName: String) {
if (text.length > 0)
return text
XWPFDocument(
DocxReader::class.java.getResource("/" + fileName).openStream()
Files.newInputStream(Paths.get(jsonConfig.name))
).use { doc ->
val xwpfWordExtractor = XWPFWordExtractor(doc)
text = xwpfWordExtractor.text
......@@ -54,7 +58,8 @@ class DocxReader(private val fileName: String) {
fun extractFAQ(paragraphTypeQuestion: String): Pair<List<QuestionAnswer>, List<Int>> {
val faq = ArrayList<QuestionAnswer>()
val handledParagraphs = ArrayList<Int>() // it is possible, that the doc starts with paragraphs of different style and hence no question
val handledParagraphs =
ArrayList<Int>() // it is possible, that the doc starts with paragraphs of different style and hence no question
val paragraphTypeQuestionTuned = paragraphTypeQuestion.replace("Ü", "") //Ü ist cut off
var question = ""
var answer = ""
......@@ -79,15 +84,14 @@ class DocxReader(private val fileName: String) {
return Pair(faq, handledParagraphs)
}
fun highlightHandledParagraphs(handledParagraphs: List<Int>, highlightedDocName: String):Boolean {
fun highlightHandledParagraphs(handledParagraphs: List<Int>, highlightedDocName: String): Boolean {
iniParagraphs()
try {
val orgPath = DocxReader::class.java.getResource("/" + fileName).openStream()
val orgPath = Paths.get(jsonConfig.name)
val destPath = Paths.get(highlightedDocName)
Files.copy(orgPath, destPath, StandardCopyOption.REPLACE_EXISTING)
XWPFDocument(
DocxReader::class.java.getResource("/" + highlightedDocName).openStream()
Files.newInputStream(Paths.get(highlightedDocName))
).use { doc ->
val paragraphsNewDoc = doc.getParagraphs()
for (i in paragraphsNewDoc.indices) {
......@@ -97,15 +101,12 @@ class DocxReader(private val fileName: String) {
doc.write(FileOutputStream(highlightedDocName));
return true
}
}
catch (ex: Exception){
} catch (ex: Exception) {
println(ex.message)
// This test cant work properly currently, need to take a look at resources...
return true
return false
}
}
private fun highlightParagraph(p: XWPFParagraph, color: STHighlightColor.Enum) {
val runs = p.runs
for (i in runs.size - 1 downTo 1) {
......@@ -118,29 +119,35 @@ class DocxReader(private val fileName: String) {
private fun iniParagraphs() {
if (paragraphs == null) {
XWPFDocument(
DocxReader::class.java.getResource("/" + fileName).openStream()
Files.newInputStream(Paths.get(jsonConfig.name))
).use { doc ->
paragraphs = doc.getParagraphs()
style = doc.styles
}
}
}
fun getColoredParagraphs(color: String): Map<Int,ColoredPart>{
/**
 * Rebuilds [runs] so that it holds the runs of every loaded paragraph, in document order.
 *
 * The list is cleared first, so calling this method more than once (for example through
 * repeated calls to [getQuestionAnswerPairs]) does not accumulate duplicate runs.
 *
 * @throws IllegalStateException if the paragraphs have not been loaded yet
 */
private fun iniRuns() {
    val loadedParagraphs = checkNotNull(paragraphs) { "paragraphs must be loaded before collecting runs" }
    runs.clear()
    for (paragraph in loadedParagraphs) {
        runs.addAll(paragraph.runs)
    }
}
fun getColoredParagraphs(color: String): Map<Int, ColoredPart> {
iniParagraphs()
val result = HashMap<Int, ColoredPart>()
for (i in paragraphs!!.indices) {
for (run in paragraphs!![i].runs) {
if (run.color == color){ //RRGGBB
if (run.color == color) { //RRGGBB
if (run.text().trim().isNotEmpty()) {
if (result.containsKey(i)){
if (result.containsKey(i)) {
if (!result.get(i)!!.coloredText.contains(run.text().trim())) {
val tmp = result.get(i)
tmp!!.coloredText += run.text()
result.replace(i, tmp!!)
}
}
else {
val tmp = result.get(i)
tmp!!.coloredText += run.text()
result.replace(i, tmp!!)
}
} else {
result.put(i, ColoredPart(run.text().trim(), paragraphs!![i].text))
}
}
......@@ -156,6 +163,76 @@ class DocxReader(private val fileName: String) {
return Dialog(intentid, info)
}
/**
 * Extracts question/answer pairs from the document as configured in [jsonConfig].
 *
 * Questions are recognised in one of two mutually exclusive ways:
 *  - by style: `question.style` is set; a paragraph with alphanumeric content whose style id
 *    contains that name (umlauts removed, compared case-insensitively) starts a new question;
 *  - by formatting: `question.format` ("bold", "italic" or "underline") and a positive
 *    `question.size` are set; a paragraph whose runs all have that font size and format starts
 *    a new question.
 *
 * Any other paragraph with alphanumeric content that follows a question is appended to that
 * question's answer, followed by a single space. Paragraphs before the first question are ignored.
 *
 * @return questions mapped to their concatenated answers, in document order; empty when the
 *         config selects neither mode
 * @throws IllegalArgumentException if a style is configured together with a format or a size
 */
fun getQuestionAnswerPairs(): MutableMap<String, String> {
    iniParagraphs()
    iniRuns()
    val questionStyle = jsonConfig.question.style
    val format = jsonConfig.question.format
    val size = jsonConfig.question.size
    if (questionStyle.isNotBlank() && (format.isNotBlank() || size > 0)) {
        throw IllegalArgumentException(
            "DOCX Config should use either a style only or both format and size, but no all of them"
        )
    }
    val loadedParagraphs = checkNotNull(paragraphs) { "paragraphs must be loaded by iniParagraphs()" }
    return when {
        questionStyle.isNotBlank() -> collectPairsByStyle(loadedParagraphs, questionStyle)
        format.isNotBlank() && size > 0 -> collectPairsByFormat(loadedParagraphs, format, size.toDouble())
        else -> mutableMapOf()
    }
}

/** Collects question/answer pairs where questions are paragraphs using the given style name. */
private fun collectPairsByStyle(
    docParagraphs: List<XWPFParagraph>,
    questionStyle: String
): MutableMap<String, String> {
    val pairs = mutableMapOf<String, String>()
    val nonAlphanumeric = Regex("[^A-Za-z0-9 ]")
    // Umlauts are dropped from the configured name before matching (extractFAQ does the same for "Ü").
    val styleNeedle = questionStyle.filterNot { it in "äöüÄÖÜ" }
    var currentQuestion = ""
    for (p in docParagraphs) {
        val hasContent = nonAlphanumeric.replace(p.text, "").isNotBlank()
        val paragraphStyle: String? = p.style
        if (paragraphStyle != null && paragraphStyle.contains(styleNeedle, ignoreCase = true)) {
            // A question-styled paragraph without content neither starts a question nor extends an answer.
            if (hasContent) {
                currentQuestion = p.text
                pairs[currentQuestion] = ""
            }
        } else if (hasContent && currentQuestion != "") {
            pairs[currentQuestion] = pairs[currentQuestion].orEmpty() + p.text + " "
        }
    }
    return pairs
}

/** Collects question/answer pairs where questions are paragraphs whose runs all share a size and format. */
private fun collectPairsByFormat(
    docParagraphs: List<XWPFParagraph>,
    format: String,
    questionSize: Double
): MutableMap<String, String> {
    val pairs = mutableMapOf<String, String>()
    val nonAlphanumeric = Regex("[^A-Za-z0-9 ]")
    // Fallback for runs that carry no explicit size; null when the document declares no default.
    val defaultFontSize: Double? = this.style?.defaultRunStyle?.fontSizeAsDouble
    var currentQuestion = ""
    for (p in docParagraphs) {
        val matchingRuns = p.runs.count { run ->
            val explicitSize: Double? = run.fontSizeAsDouble
            // null or -1 means the run has no size of its own, so the document default applies.
            val fontSize = if (explicitSize == null || explicitSize == -1.0) defaultFontSize else explicitSize
            fontSize == questionSize && runHasFormat(run, format)
        }
        // NOTE(review): "> 1" means a paragraph consisting of a single run is never a question;
        // kept as in the original - confirm whether "> 0" was intended.
        if (matchingRuns == p.runs.size && matchingRuns > 1) {
            currentQuestion = p.text
            pairs[p.text] = ""
        } else if (nonAlphanumeric.replace(p.text, "").isNotBlank() && currentQuestion != "") {
            pairs[currentQuestion] = pairs[currentQuestion].orEmpty() + p.text + " "
        }
    }
    return pairs
}

/** Returns true when [run] carries the configured format; unknown format names never match. */
private fun runHasFormat(run: XWPFRun, format: String): Boolean = when {
    format.equals("bold", ignoreCase = true) -> run.isBold
    format.equals("italic", ignoreCase = true) -> run.isItalic
    // Underline pattern value 1 is the plain single underline.
    format.equals("underline", ignoreCase = true) -> run.underline.value == 1
    else -> false
}
/**
 * A piece of text together with the index of the paragraph it is associated with.
 *
 * @property paragraphIndex index of the paragraph (presumably within the document - verify against callers)
 * @property text the associated text
 */
data class SourceInfo(
    val paragraphIndex: Int,
    val text: String
)
/**
 * One FAQ entry: a question and the answer that belongs to it.
 *
 * @property question the question text
 * @property answer the answer text
 */
data class QuestionAnswer(
    val question: String,
    val answer: String
)
/**
 * The colored portion of a paragraph alongside the paragraph's full text.
 *
 * @property coloredText text of the colored runs; mutable because further runs of the same
 *           paragraph are appended to it while the document is scanned
 * @property paragraphText complete text of the paragraph the colored runs belong to
 */
data class ColoredPart(
    var coloredText: String,
    val paragraphText: String
)
......
......@@ -7,6 +7,23 @@ import org.junit.jupiter.api.Assertions.assertEquals
// TODO tests NEED refactoring, pretty bad
internal class DocxReaderTest {
// Fixture configurations; each one points at a DOCX file on the test classpath.
private val testDocx: JSONDocxConfig =
    JSONDocxConfig("docx", resourcePath("/testdocx4j.docx"), QuestionDocx("", "", Integer(-1)))
// private var faqDocx: JSONDocxConfig =
// JSONDocxConfig("docx", this::class.java.getResource("/faq.docx").path, QuestionDocx("", "", Integer(-1)))
// Questions identified by run formatting: bold text of size 12.
private val htcDocx: JSONDocxConfig =
    JSONDocxConfig(
        "docx",
        resourcePath("/htc_sync_manager_faq.docx"),
        QuestionDocx("", "bold", Integer(12))
    )
// Questions identified by paragraph style "Heading1".
private val sportDocx: JSONDocxConfig =
    JSONDocxConfig(
        "docx",
        resourcePath("/sportnahrung_engel_muskulatur_faq.docx"),
        QuestionDocx("Heading1", "", Integer(-1))
    )

/**
 * Resolves a classpath resource to a file-system path string.
 *
 * Going through the URI (instead of URL.path) decodes percent-escapes such as "%20" and yields a
 * path that is valid for the platform, so the fixtures also load from directories containing
 * spaces and on Windows.
 *
 * @throws IllegalArgumentException if the resource is not on the classpath
 */
private fun resourcePath(resource: String): String {
    val url = requireNotNull(this::class.java.getResource(resource)) { "Test resource not found: $resource" }
    return java.nio.file.Paths.get(url.toURI()).toString()
}
// Runs before each test; currently a no-op, no per-test setup is performed here.
@org.junit.jupiter.api.BeforeEach
fun setUp() {
}
......@@ -17,8 +34,7 @@ internal class DocxReaderTest {
@org.junit.jupiter.api.Test
fun getParagraphs() {
DocxReaderTest::class.java.getResource("/html/file.html")
val reader = DocxReader("testdocx4j.docx")
val reader = DocxReader(this.testDocx)
val actual = reader.getParagraphs()
assertEquals(7, actual.size)
}
......@@ -29,7 +45,7 @@ internal class DocxReaderTest {
@org.junit.jupiter.api.Test
fun extractByParagraphType() {
val reader = DocxReader("testdocx4j.docx")
val reader = DocxReader(this.testDocx)
val actual = reader.extractByParagraphType("Überschrift4")
assertEquals(1, actual.size)
}
......@@ -44,7 +60,7 @@ internal class DocxReaderTest {
@org.junit.jupiter.api.Test
fun highlightHandledParagraphs() {
val reader = DocxReader("testdocx4j.docx")
val reader = DocxReader(this.testDocx)
val actual = reader.highlightHandledParagraphs(listOf(1, 3), "testHighlighted.docx")
assertEquals(true, actual)
}
......@@ -61,5 +77,18 @@ internal class DocxReaderTest {
// assertEquals(0, filtered3.size)
// }
/** Format/size mode: the HTC fixture (bold, size 12) is expected to yield 31 question/answer entries. */
@org.junit.jupiter.api.Test
fun getQuestionAnswerPairsByFormatAndSize() {
    val expectedEntries = 31
    val pairs = DocxReader(this.htcDocx).getQuestionAnswerPairs()
    assertEquals(expectedEntries, pairs.keys.size)
    assertEquals(expectedEntries, pairs.values.size)
}
/** Style mode: the sport-nutrition fixture ("Heading1" questions) is expected to yield 9 entries. */
@org.junit.jupiter.api.Test
fun getQuestionAnswerPairsByStyle() {
    val expectedEntries = 9
    val pairs = DocxReader(this.sportDocx).getQuestionAnswerPairs()
    assertEquals(expectedEntries, pairs.keys.size)
    assertEquals(expectedEntries, pairs.values.size)
}
}
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment