import json
import nltk
from nltk import ngrams
import xml.etree.ElementTree as ET

# NOTE: nltk.word_tokenize relies on the Punkt tokenizer data;
# run nltk.download('punkt') once if it is not already installed.
"""
corpus example:
{
"a":
{
'apple tastes good: 12,
'ananas is sweet: 43,
...
},
"b":
{
('banana gives energy'): 4,
('brand new world'): 28,
...
},
...
}
"""
# set n
n = 3
# create corpus dictionary
corpus = {}
# create tree from xml
tree = ET.parse('../data/aesw2016(v1.2)_train.xml')
root = tree.getroot()
# go through tree
for elem in root:
    for subelem in elem:
        # if subelem has attributes
        if subelem.attrib:
            # for each sentence in the dataset
            for subsubelem in subelem:
                # get sid
                sid = subsubelem.attrib["sid"]
                # build sentence using only grammatically correct parts
                sentence = ""
                parts = list(subsubelem.iter())
                for part in parts:
                    # explicitly skip 'del' text: keep only the sentence text and insertions
                    if part.tag in ["sentence", "ins"] and part.text is not None:
                        sentence += part.text
                    # add trailing text if it exists
                    if part.tail:
                        sentence += part.tail
                sentence = sentence.replace("\n", "")
                # print(f"{sid}: {sentence}")
                # TODO: remove punctuation? add _START_/_END_ tags?
                # get n-grams of the sentence
                n_grams = list(ngrams(nltk.word_tokenize(sentence), n))
                for n_gram in n_grams:
                    n_gram_str = " ".join(n_gram)
                    first_char = n_gram_str[0]
                    # add a sub-dictionary for this first character if it doesn't exist
                    if first_char not in corpus:
                        corpus[first_char] = {}
                    # count the n-gram
                    if n_gram_str in corpus[first_char]:
                        corpus[first_char][n_gram_str] += 1
                    else:
                        corpus[first_char][n_gram_str] = 1

# print the built corpus
for k, v in corpus.items():
    print(f"{k}: {v}")

# save the corpus as json
with open(f"corpus_n={n}.json", "w") as outfile:
    json.dump(corpus, outfile)
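
# --- optional sanity check (a sketch, not part of the original pipeline) ---
# The saved JSON mirrors the structure documented at the top of the file:
# corpus[first_char][n_gram_string] -> count. The query string below is only an
# illustrative trigram; any whitespace-joined n-gram of length n can be looked up.
with open(f"corpus_n={n}.json") as infile:
    loaded = json.load(infile)
query = "the number of"  # hypothetical example trigram
count = loaded.get(query[0], {}).get(query, 0)
print(f"'{query}' appears {count} times in the corpus")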