create_corpus.py
    import json
    import xml.etree.ElementTree as ET

    import nltk
    from nltk import ngrams
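    # note: nltk.word_tokenize relies on the Punkt tokenizer models;
    # if they are not installed yet, run nltk.download('punkt') once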
    
    """
    corpus example:
    {
        "a":
            {
                'apple tastes good: 12,
                'ananas is sweet: 43,
                ...
            },
        "b":
            {
                ('banana gives energy'): 4,
                ('brand new world'): 28,
                ...
            },
        ...
    }
    """
    
    # n-gram order (3 = trigrams)
    n = 3
    
    # create corpus dictionary
    corpus = {}
    
    # create tree from xml
    tree = ET.parse('../data/aesw2016(v1.2)_train.xml')
    root = tree.getroot()
    
    # go through tree
    for elem in root:
        for subelem in elem:
            # if subelem has attributes
            if subelem.attrib:
                # for each sentence in the dataset
                for subsubelem in subelem:
                    # get sid
                    sid = subsubelem.attrib["sid"]
    
                    # build sentence using only the grammatically correct parts
                    sentence = ""
                    for part in subsubelem.iter():
                        # take the element's own text only for the sentence itself and
                        # for 'ins' (inserted) parts; explicitly skip 'del' content
                        if part.tag in ["sentence", "ins"] and part.text is not None:
                            sentence += part.text
                        # always keep the trailing text: it lies outside the element,
                        # so text that follows a 'del' is preserved as well
                        if part.tail:
                            sentence += part.tail
                    sentence = sentence.replace("\n", "")
    
                    # print sentence
                    # print(f"{sid}: {sentence}")
    
                    # TODO: remove punctuation? add _START_/_END_ tags?
    
                    # get n-grams of sentence
                    n_grams = list(ngrams(nltk.word_tokenize(sentence), n))
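                    # note: sentences with fewer than n tokens produce no n-grams here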
    
                    # for each n-gram
                    for n_gram in n_grams:
                        n_gram_str = " ".join(n_gram)
                        first_char = n_gram_str[0]
                        # add dict entry if it doesn't exist
                        if first_char not in corpus:
                            corpus[first_char] = {}
                        # if n-gram is already in corpus
                        if n_gram_str in corpus[first_char]:
                            corpus[first_char][n_gram_str] += 1
                        # if n-gram is not already in corpus
                        else:
                            # add a corpus entry
                            corpus[first_char][n_gram_str] = 1
    
    # print built corpus
    for k, v in corpus.items():
        print(f"{k}: {v}")
    
    # save corpus as json
    with open(f"corpus_n={n}.json", "w") as outfile:
        json.dump(corpus, outfile)
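
    # Usage sketch (not run by this script): load the saved corpus and look up
    # an n-gram count; "apple tastes good" is just an example key, e.g.
    # with open(f"corpus_n={n}.json") as infile:
    #     corpus = json.load(infile)
    # print(corpus.get("a", {}).get("apple tastes good", 0))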