# standard library
import json
import threading
from time import sleep

# third-party
import nltk
import numpy as np
import requests
from nltk import pos_tag
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from pyinflect import getAllInflections

# local
from tester import Tester


class GrammarChecker:
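    """N-gram based grammar checker.

    Scores a sentence's bi-grams and tri-grams against the Google Books Ngram
    corpus, flags the word covered by the least probable n-grams, and suggests
    a replacement from that word's inflections (via WordNet and pyinflect).
    """
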
    def __init__(self):
        # required NLTK resources
        nltk.download('punkt')
        nltk.download('averaged_perceptron_tagger')
        nltk.download('wordnet')
        # fallback probability for n-grams the corpus has never seen
        self.default_prob = 1e-10
        # per-order thresholds on the negative log of the geometric mean
        # of the sentence's n-gram probabilities
        self.thresholds = {2: 5.6, 3: 7.7}
        # tokenizer that keeps word characters only (drops punctuation)
        self.tokenizer = nltk.RegexpTokenizer(r"\w+")
        # lemmatizer
        self.lemmatizer = WordNetLemmatizer()

    def check(self, sentence):
        """ checks a sentence for errors and suggests corrections for them """
        # lower-case and tokenize the sentence once
        tokens = self.tokenizer.tokenize(sentence.lower())
        # create uni-, bi- and tri-grams
        n_grams = {n: list(nltk.ngrams(tokens, n)) for n in [1, 2, 3]}
        # find errors
        i_errors = self.find_index_of_error(n_grams)
        # get corrections
        unigrams, i_corrections = self.get_corrections(n_grams, i_errors)
        print(f"unigrams: {unigrams}")
        print(f"i_corrections: {i_corrections}")
        return unigrams, i_corrections

    def get_corrections(self, n_grams, i_errors):
        """ gets corrections for errors """
        # get unigrams and initialize corrections with the original words
        unigrams = [unigram[0] for unigram in n_grams[1]]
        i_corrections = {i_error: unigrams[i_error] for i_error in i_errors}
        # if errors are found
        if i_corrections:
            # collect probabilities of inflections for all errors
            probs = {}
            for i_error, word in i_corrections.items():
                probs[i_error] = {}
                try:
                    inflections = set(self.suggest_inflections(word))
                except Exception:
                    # no inflections could be suggested for this word
                    continue
                for n in n_grams:
                    # uni-grams carry no context, skip them
                    if n == 1:
                        continue
                    probs[i_error][n] = {}
                    n_gram_indexes = self.get_n_gram_indexes_from_word_index(n, len(n_grams[n]), i_error)
                    error_n_grams = [n_grams[n][n_gram_index] for n_gram_index in n_gram_indexes]
                    # check error_n_grams against all inflections in parallel
                    threads = []
                    for error_n_gram in error_n_grams:
                        threads.append(threading.Thread(target=self.check_n_gram_inflections, args=(probs, i_error, n, error_n_gram, inflections, word)))
                        threads[-1].daemon = True
                        threads[-1].start()
                    for thread in threads:
                        thread.join()
            # voting mechanism: sum each inflection's probability over every
            # n-gram order and context it appeared in
            prob_accumulator = {}
            for i_error, ns in probs.items():
                prob_accumulator[i_error] = {}
                for n, error_n_grams in ns.items():
                    for error_n_gram, inflections in error_n_grams.items():
                        for inflection, prob in inflections.items():
                            if inflection in prob_accumulator[i_error]:
                                prob_accumulator[i_error][inflection] += prob
                            else:
                                prob_accumulator[i_error][inflection] = prob
            # pick the inflection with the highest accumulated probability
            for i_error, inflections in prob_accumulator.items():
                if inflections:
                    i_corrections[i_error] = max(inflections.items(), key=lambda item: item[1])[0]
        return unigrams, i_corrections

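    # Illustration of the voting above (numbers are made up): if "goes"
    # scores 0.002 across its bi-gram contexts and 0.0007 across its tri-gram
    # contexts, it accumulates 0.0027 and beats a candidate like "gone" that
    # only reached, say, 0.0004 in total.
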
    def check_n_gram_inflections(self, probs, i_error, n, error_n_gram, inflections, word):
        """ scores every inflection of word inside the given n-gram context """
        probs[i_error][n][error_n_gram] = {}
        # fix the iteration order so probabilities can be matched up below
        inflections = list(inflections)
        # build one candidate n-gram per inflection by swapping the word out
        inflection_n_grams = []
        for inflection in inflections:
            tmp = list(error_n_gram)
            index = tmp.index(word)  # first occurrence only
            tmp[index] = inflection
            inflection_n_grams.append(tmp)
        inflection_probs = self.get_probs_of_n_grams(inflection_n_grams)
        for i, inflection in enumerate(inflections):
            probs[i_error][n][error_n_gram][inflection] = inflection_probs[i]

    def suggest_inflections(self, word):
        """ suggests alternative forms of word based on its POS tag """
        # tag the word in isolation
        pos = pos_tag([word])[0][1]
        if pos.startswith("N"):
            # nouns
            pos_type, lemma_pos = "N", "n"
        elif pos.startswith("R"):
            # adverbs
            pos_type, lemma_pos = "A", "r"
        elif pos.startswith("J"):
            # adjectives
            pos_type, lemma_pos = "A", "a"
        elif pos.startswith("V"):
            # verbs
            pos_type, lemma_pos = "V", "v"
        elif pos in ("PRP", "PRP$"):
            # if the word is a (possessive) pronoun, try all pronouns
            return ["I", "you", "he", "she", "it", "we", "they", "me", "him",
                    "her", "us", "my", "mine", "our", "ours", "its", "his",
                    "hers", "their", "theirs", "your", "yours"]
        else:
            # unsupported POS: return an empty list rather than None
            return []
        if lemma_pos == "r":
            # adverbs have no inflections of their own, so map the adverb to
            # a related adjective via WordNet pertainyms (e.g. "angrily" ->
            # "angry") and inflect that instead
            suggestion = ""
            lemmas = [lemma for ss in wn.synsets(word) for lemma in ss.lemmas()]
            for lemma in lemmas:
                pertainyms = lemma.pertainyms()
                if not pertainyms:
                    continue
                candidate = pertainyms[0].name()
                # crude stem check: accept the pertainym if its first 3 or 4
                # characters match the adverb's
                if candidate[:3] == word[:3] or candidate[:4] == word[:4]:
                    suggestion = candidate
                    break
            word = suggestion
        # lemmatize, then collect all inflected forms of the lemma, filtered
        # by the word's part of speech
        lemma = self.lemmatizer.lemmatize(word, lemma_pos)
        inflections = getAllInflections(lemma, pos_type)
        return [form for forms in inflections.values() for form in forms]

    def get_google_ngram_prob(self, n_gram):
        """ gets the probability of the given n_gram from the Google Books Ngram API """
        url = f"https://books.google.com/ngrams/json?content={' '.join(n_gram)}&case_insensitive=true"
        # retry with exponential backoff, give up once the wait exceeds 10s
        wait_time = 0.0001
        while wait_time < 10:
            response = requests.get(url)
            if response.ok:
                results = json.loads(response.content)
                if not results:
                    return None
                # take the highest frequency any matching n-gram ever reached
                max_prob = 0.0
                for result in results:
                    cur_max_prob = max(result["timeseries"], default=0.0)
                    max_prob = max(max_prob, cur_max_prob)
                return max_prob
            # no valid response yet: back off and try again
            sleep(wait_time)
            wait_time *= 10
        # the caller treats None as "no data" and falls back to default_prob
        return None

    def get_prob_of_n_gram(self, n_gram, probs, i):
        """ calculates probability of n_gram and stores it at probs[i] """
        prob = self.get_google_ngram_prob(n_gram)
        # fall back to a tiny default probability for unseen n-grams
        probs[i] = prob if prob else self.default_prob

    def get_probs_of_n_grams(self, n_grams):
        """ fetches the probabilities of all n_grams in parallel """
        # create target list
        probs = [None] * len(n_grams)
        # create and start one thread per n-gram
        threads = []
        for i, n_gram in enumerate(n_grams):
            threads.append(threading.Thread(target=self.get_prob_of_n_gram, args=(n_gram, probs, i)))
            threads[-1].daemon = True
            threads[-1].start()
        # join threads
        for thread in threads:
            thread.join()
        return probs

    def get_word_indexes_from_n_gram_index(self, n, n_gram_index):
        """ returns the word indexes covered by the n-gram at n_gram_index """
        # the n-gram starting at index i covers the words i .. i+n-1
        return list(range(n_gram_index, n_gram_index + n))

    def get_n_gram_indexes_from_word_index(self, n, n_gram_cnt, word_index):
        """ returns the indexes of all n-grams containing the word at word_index """
        # the first covering n-gram starts at most n-1 positions earlier
        n_gram_indexes = [max(0, word_index - n + 1)]
        # a word is covered by at most n n-grams; stop at the sentence end
        for _ in range(min(word_index, n - 1)):
            nxt = n_gram_indexes[-1] + 1
            if nxt < n_gram_cnt:
                n_gram_indexes.append(nxt)
        return n_gram_indexes

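    # Worked example for the two index helpers above: a 6-token sentence has
    # 5 bi-grams; token 3 is covered by the bi-grams at indexes 2 and 3, so
    # get_n_gram_indexes_from_word_index(2, 5, 3) == [2, 3], and
    # get_word_indexes_from_n_gram_index(2, 2) == [2, 3] maps back again.
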
    def find_index_of_error(self, n_grams):
        """ finds the indexes of the most likely error words in n_grams """
        # for every word, count how often it is covered by the least
        # probable n-gram of each order
        probs = {}
        thresholds_passed = {}
        smallest_prob_counter = {2: {i: 0 for i in range(len(n_grams[1]))}, 3: {i: 0 for i in range(len(n_grams[1]))}}
        for n in n_grams:
            # don't take 1-grams into account
            if n == 1:
                continue
            probs[n] = self.get_probs_of_n_grams(n_grams[n])
            try:
                for index in self.get_word_indexes_from_n_gram_index(n, probs[n].index(min(probs[n]))):
                    smallest_prob_counter[n][index] += 1
            except Exception:
                # very short sentences may not have any n-grams of this order
                pass
            # thresholds check: negative log of the geometric mean of the
            # n-gram probabilities, computed as the mean of the logs so the
            # product cannot underflow to zero on long sentences
            thresholds_passed[n] = -np.mean(np.log10(probs[n])) <= self.thresholds[n]
        # determine indexes of errors: all words sharing the highest count
        i_errors = []
        max_counter = 0
        total_smallest_prob_counter = {i: 0 for i in range(len(n_grams[1]))}
        for n, smallest_probs in smallest_prob_counter.items():
            # note: the threshold gate is intentionally disabled (see below)
            for index in total_smallest_prob_counter:
                total_smallest_prob_counter[index] += smallest_probs[index]
        for index, counter in sorted(total_smallest_prob_counter.items(), key=lambda item: -item[1]):
            if counter >= max_counter and counter != 0:
                i_errors.append(index)
                max_counter = counter
        print(f"below thresholds? bi-gram: {thresholds_passed[2]}, tri-gram: {thresholds_passed[3]}")
        print("notice: sentence level thresholds ignored. we currently always assume an error.")
        return i_errors


if __name__ == "__main__":
    # get sentences
    tester = Tester()
    # create grammar checker
    grammar_checker = GrammarChecker()
    # check the correct sentences, then every error type
    print("CORRECT SENTENCES\n\n")
    for sentence in tester.correct_sentences:
        print(sentence.text)
        grammar_checker.check(sentence.text)
        print()
    error_sentence_sets = [
        (1, tester.type_1_error_sentences),
        (2, tester.type_2_error_sentences),
        (3, tester.type_3_error_sentences),
    ]
    for error_type, sentences in error_sentence_sets:
        print(f"\nTYPE {error_type} ERROR SENTENCES\n\n")
        for sentence in sentences:
            print(sentence.text)
            print(sentence.original)
            grammar_checker.check(sentence.text)
            print()
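
# Minimal direct-usage sketch (no Tester required). The exact suggestions
# depend on live Google Books Ngram responses, so the output below is only
# illustrative, not guaranteed:
#
#     checker = GrammarChecker()
#     unigrams, corrections = checker.check("she go to school every day")
#     # `corrections` maps word index -> best suggestion, e.g. {1: "goes"}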