diff --git a/create_corpus.py b/create_corpus.py
new file mode 100644
index 0000000000000000000000000000000000000000..9c582a2495689523555a25c7f4124e91a2bb5e37
--- /dev/null
+++ b/create_corpus.py
@@ -0,0 +1,86 @@
+import ast
+import json
+import nltk
+from nltk import ngrams
+import xml.etree.ElementTree as ET
+
+"""
+corpus example (n = 3): first character -> {n-gram: count}
+{
+    "a":
+        {
+            "apple tastes good": 12,
+            "ananas is sweet": 43,
+            ...
+        },
+    "b":
+        {
+            "banana gives energy": 4,
+            "brand new world": 28,
+            ...
+        },
+    ...
+}
+"""
+
+# set n: the n-gram order (3 = trigrams); also used in the output file name
+n = 3
+
+# create corpus dictionary:
+# first character of an n-gram -> {n-gram string: occurrence count}
+corpus = {}
+
+# create tree from xml
+tree = ET.parse('../data/aesw2016(v1.2)_train.xml')
+root = tree.getroot()
+
+
+def _corrected_text(node):
+    """Return the text below ``node`` with every ``<del>`` span left out.
+
+    Text inside ``<del>`` elements is dropped and text inside any other
+    element (``<ins>`` in particular) is kept. The text that follows a
+    child element (its ``tail``) is always kept, because it belongs to
+    the parent even when the child itself is a deletion.
+    """
+    if node.tag == "del":
+        return ""
+    pieces = [node.text or ""]
+    for child in node:
+        pieces.append(_corrected_text(child))
+        # the tail sits after the child's closing tag, i.e. outside of it
+        pieces.append(child.tail or "")
+    return "".join(pieces)
+
+
+# go through tree
+for elem in root:
+    for subelem in elem:
+        # only sub-elements that carry attributes are searched
+        if not subelem.attrib:
+            continue
+        # each child is handled as one sentence of the dataset
+        for subsubelem in subelem:
+            # build sentence using only grammatically correct parts:
+            # deleted spans are left out; inserted spans and the text
+            # following either kind of edit are kept
+            sentence = _corrected_text(subsubelem)
+            # line breaks are removed outright, not replaced by a space
+            sentence = sentence.replace("\n", "")
+
+            # TODO: remove punctuation? add _START_/_END_ tags?
+
+            # count each n-gram in the bucket of its first character
+            for n_gram in ngrams(nltk.word_tokenize(sentence), n):
+                n_gram_str = " ".join(n_gram)
+                # setdefault creates the bucket on first use
+                bucket = corpus.setdefault(n_gram_str[0], {})
+                bucket[n_gram_str] = bucket.get(n_gram_str, 0) + 1
+
+# print built corpus
+for k, v in corpus.items():
+    print(f"{k}: {v}")
+
+# save corpus as json
+with open(f"corpus_n={n}.json", "w", encoding="utf-8") as outfile:
+    json.dump(corpus, outfile)