create_corpus.py
    import json
    import xml.etree.ElementTree as ET

    import nltk
    from nltk import ngrams
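    # note: nltk.word_tokenize relies on the Punkt tokenizer models;
    # if they are not installed yet, run nltk.download('punkt') once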
    
    """
    corpus example:
    {
        "a":
            {
                'apple tastes good: 12,
                'ananas is sweet: 43,
                ...
            },
        "b":
            {
                ('banana gives energy'): 4,
                ('brand new world'): 28,
                ...
            },
        ...
    }
    """
    
    # n-gram order (3 = trigrams)
    n = 3
    
    # create corpus dictionary
    corpus = {}
    
    # create tree from xml
    tree = ET.parse('../data/aesw2016(v1.2)_train.xml')
    root = tree.getroot()
    
    # go through tree
    for elem in root:
        for subelem in elem:
            # if subelem has attributes
            if subelem.attrib:
                # for each sentence in the dataset
                for subsubelem in subelem:
                    # get sid
                    sid = subsubelem.attrib["sid"]
    
                    # build sentence using only the grammatically correct parts
                    sentence = ""
                    for part in subsubelem.iter():
                        # take the element's own text only for the sentence itself and
                        # for 'ins' (inserted) parts; explicitly skip 'del' content
                        if part.tag in ["sentence", "ins"] and part.text is not None:
                            sentence += part.text
                        # always keep the trailing text: it lies outside the element,
                        # so text that follows a 'del' is preserved as well
                        if part.tail:
                            sentence += part.tail
                    sentence = sentence.replace("\n", "")
    
                    # print sentence
                    # print(f"{sid}: {sentence}")
    
                    # TODO: remove punctuation? add _START_/_END_ tags?
    
                    # get n-grams of sentence
                    n_grams = list(ngrams(nltk.word_tokenize(sentence), n))
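                    # note: sentences with fewer than n tokens produce no n-grams here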
    
                    # for each n-gram
                    for n_gram in n_grams:
                        n_gram_str = " ".join(n_gram)
                        first_char = n_gram_str[0]
                        # add dict entry if it doesn't exist
                        if first_char not in corpus:
                            corpus[first_char] = {}
                        # if n-gram is already in corpus
                        if n_gram_str in corpus[first_char]:
                            corpus[first_char][n_gram_str] += 1
                        # if n-gram is not already in corpus
                        else:
                            # add a corpus entry
                            corpus[first_char][n_gram_str] = 1
    
    # print built corpus
    for k, v in corpus.items():
        print(f"{k}: {v}")
    
    # save corpus as json
    with open(f"corpus_n={n}.json", "w") as outfile:
        json.dump(corpus, outfile)
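
    # Usage sketch (not run by this script): load the saved corpus and look up
    # an n-gram count; "apple tastes good" is just an example key, e.g.
    # with open(f"corpus_n={n}.json") as infile:
    #     corpus = json.load(infile)
    # print(corpus.get("a", {}).get("apple tastes good", 0))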