diff --git a/create_corpus.py b/create_corpus.py new file mode 100644 index 0000000000000000000000000000000000000000..9c582a2495689523555a25c7f4124e91a2bb5e37 --- /dev/null +++ b/create_corpus.py @@ -0,0 +1,86 @@ +import ast +import json +import nltk +from nltk import ngrams +import xml.etree.ElementTree as ET + +""" +corpus example: +{ + "a": + { + 'apple tastes good: 12, + 'ananas is sweet: 43, + ... + }, + "b": + { + ('banana gives energy'): 4, + ('brand new world'): 28, + ... + }, + ... +} +""" + +# set n +n = 3 + +# create corpus dictionary +corpus = {} + +# create tree from xml +tree = ET.parse('../data/aesw2016(v1.2)_train.xml') +root = tree.getroot() + +# go through tree +for elem in root: + for subelem in elem: + # if subelem has attributes + if subelem.attrib: + # for each sentence in the dataset + for subsubelem in subelem: + # get sid + sid = subsubelem.attrib["sid"] + + # build sentence using only grammatically correct parts + sentence = "" + parts = list(subsubelem.iter()) + for part in parts: + if part.tag in ["sentence", "ins"] and part.text is not None: # explicitly don't use 'del' + sentence += part.text + # add trailing text if exists + if part.tail: + sentence += part.tail + sentence = sentence.replace("\n", "") + + # print sentence + # print(f"{sid}: {sentence}") + + # TODO: remove punctuation? add _START_/_END_ tags? + + # get n-grams of sentence + n_grams = list(ngrams(nltk.word_tokenize(sentence), n)) + + # for each n-gram + for n_gram in n_grams: + n_gram_str = " ".join(n_gram) + first_char = n_gram_str[0] + # add dict entry if it doesn't exist + if first_char not in corpus: + corpus[first_char] = {} + # if n-gram is already in corpus + if n_gram_str in corpus[first_char].keys(): + corpus[first_char][n_gram_str] += 1 + # if n-gram is not already in corpus + else: + # add a corpus entry + corpus[first_char][n_gram_str] = 1 + +# print built corpus +for k, v in corpus.items(): + print(f"{k}: {v}") + +# save corpus as json +with open(f"corpus_n={n}.json", "w") as outfile: + json.dump(corpus, outfile)