Skip to content
Snippets Groups Projects
Commit b295d361 authored by jmzk96's avatar jmzk96
Browse files

added create_corpus.py

parent f4894eeb
No related branches found
No related tags found
No related merge requests found
import ast
import json
import nltk
from nltk import ngrams
import xml.etree.ElementTree as ET
"""
corpus example:
{
"a":
{
'apple tastes good: 12,
'ananas is sweet: 43,
...
},
"b":
{
('banana gives energy'): 4,
('brand new world'): 28,
...
},
...
}
"""
# set n (size of the n-grams that are counted)
n = 3

# training data the corpus is built from (path is relative to the working directory)
TRAIN_XML_PATH = '../data/aesw2016(v1.2)_train.xml'


def build_sentence(sentence_elem):
    """Return the corrected text of one sentence element.

    The text of the element itself and of every nested ``ins`` element is
    kept; the text of ``del`` elements is explicitly dropped.  The tail text
    following any nested element (including ``del``) is kept because it is
    part of the surrounding sentence.  Newlines are removed.

    Args:
        sentence_elem: an ``xml.etree.ElementTree.Element`` for one sentence.

    Returns:
        The reconstructed sentence as a single string.
    """
    pieces = []
    for part in sentence_elem.iter():
        # explicitly don't use 'del'
        if part.tag in ("sentence", "ins") and part.text is not None:
            pieces.append(part.text)
        # The tail of the sentence element itself lies *after* its closing
        # tag, so it belongs to the parent and is not part of the sentence.
        if part is not sentence_elem and part.tail:
            pieces.append(part.tail)
    return "".join(pieces).replace("\n", "")


def add_ngrams(corpus, n_grams):
    """Count every n-gram of ``n_grams`` in ``corpus`` (modified in place).

    Args:
        corpus: dict mapping a first character to a dict of
            ``n-gram string -> count``.
        n_grams: iterable of token tuples.

    Returns:
        The same ``corpus`` object, for convenience.
    """
    for n_gram in n_grams:
        n_gram_str = " ".join(n_gram)
        if not n_gram_str:
            # nothing to index an empty n-gram under
            continue
        bucket = corpus.setdefault(n_gram_str[0], {})
        bucket[n_gram_str] = bucket.get(n_gram_str, 0) + 1
    return corpus


def build_corpus(root, n):
    """Build the n-gram corpus from the parsed XML tree.

    Walks ``root`` three levels deep: only second-level elements that carry
    attributes are considered, and each of their children is treated as one
    sentence.

    Args:
        root: root element of the parsed dataset.
        n: size of the n-grams.

    Returns:
        dict mapping a first character to a dict of ``n-gram string -> count``.
    """
    corpus = {}
    for elem in root:
        for subelem in elem:
            # only elements with attributes hold sentences
            if not subelem.attrib:
                continue
            for sentence_elem in subelem:
                sentence = build_sentence(sentence_elem)
                # TODO: remove punctuation? add _START_/_END_ tags?
                tokens = nltk.word_tokenize(sentence)
                add_ngrams(corpus, ngrams(tokens, n))
    return corpus


def main():
    """Parse the training XML, print the corpus and save it as JSON."""
    root = ET.parse(TRAIN_XML_PATH).getroot()
    corpus = build_corpus(root, n)
    # print built corpus
    for k, v in corpus.items():
        print(f"{k}: {v}")
    # save corpus as json
    with open(f"corpus_n={n}.json", "w", encoding="utf-8") as outfile:
        json.dump(corpus, outfile)


if __name__ == "__main__":
    main()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment