# "Newer" / "Older" pagination labels left over from the web capture of this
# file; kept as a comment so the module remains importable.
import json
import math
from time import sleep

import nltk
import requests
from nltk import pos_tag
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from pyinflect import getAllInflections, getInflection
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
lemmatizer = WordNetLemmatizer()
def suggest_inflection(word):
pos = pos_tag([word])[0][1]
if pos.startswith("N"):
# Nouns mapped with noun markers
startswith ="N"
lemmparam = "n"
list_of_suggestions = None
elif pos.startswith("R"):
# adverbs mapped with adverb markers
startswith ="A"
lemmparam ="r"
list_of_suggestions = None
elif pos.startswith("J"):
# adjectives mapped with adjective markers
startswith ="A"
lemmparam ="a"
list_of_suggestions = None
elif pos.startswith("V"):
# Verbs mapped with verb markers
startswith ="V"
lemmparam ="v"
list_of_suggestions = None
elif pos == "PRP" or pos =="PRP$":
# If word in posessive pronoun, try all posessive pronouns
list_of_suggestions = ["I","you", "he", "she", "it", "we", "they", "me", "him", "her", "us","my", "mine", "our", "ours", "its",\
"his", "her", "hers", "their", "theirs", "your" , "yours"]
startswith = None
else:
# Else, return nothing
startswith = None
list_of_suggestions = None
if list_of_suggestions is None and startswith is not None :
# if startswith is not None return list of suggestions/ inflections of the word given based on the POS tag
if lemmparam == "r":
# for adverbs , inflections of th
s = []
suggestion = ""
for ss in wn.synsets(word):
for lemmas in ss.lemmas(): # all possible lemmas.
s.append(lemmas)
for pers in s:
posword = pers.pertainyms()
if len(posword) == 0:
continue
else:
posword = posword[0].name()
if posword[0:3] == word[0:3] or posword[0:4] == word[0:4] :
suggestion = posword
break
word = lemmatizer.lemmatize(suggestion,lemmparam)
inflections = getAllInflections(word)
tags =[ key for key in inflections.keys() ]
suggestion_list = [inflections[tag] for tag in tags]
suggestion = [i for sub in suggestion_list for i in sub]
return suggestion
else:
word = lemmatizer.lemmatize(word,lemmparam)
inflections = getAllInflections(word)
tags =[ key for key in inflections.keys() ]
suggestion_list = [inflections[tag] for tag in tags]
suggestion = [i for sub in suggestion_list for i in sub]
return suggestion
elif list_of_suggestions is not None and startswith is None:
return list_of_suggestions
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
class GrammarCheckerGoogle:
def __init__(self,n,float_min,threshold):
# required
nltk.download('punkt')
self.float_min= float_min
# variables
self.threshold = threshold
self.n = n
# tokenizer
self.tokenizer = nltk.RegexpTokenizer(r"\w+")
def check(self, sentence):
""" checks a sentence for errors and recursively corrects the first one """
# lower case sentence
sentence = sentence.lower()
n_grams = list(nltk.ngrams(self.tokenizer.tokenize(sentence),self.n))
# find error
i_error = self.find_index_of_error(n_grams)
return i_error
def get_google_ngram_prob(self, n_gram):
""" gets probability for given n_gram """
url = f"https://books.google.com/ngrams/json?content={' '.join(n_gram)}&case_insensitive=true"
successful = False
wait_time = 0.0001
while not successful:
response = requests.get(url)
sleep(wait_time)
if response.ok:
successful = True
results = json.loads(response.content)
if results:
max_prob = 0.0
for result in results:
cur_max_prob = max(results[0]["timeseries"])
max_prob = cur_max_prob if cur_max_prob > max_prob else max_prob
return max_prob
else:
return None
if not successful:
if wait_time < 10:
# print(f"no response: increasing wait time from {wait_time} to {wait_time*10}.")
wait_time *= 10
else:
pass
# print("still no response.")
def get_prob_of_n_gram(self, n_gram):
""" calculates probability of n_gram """
# smallest possible positive float (1e-324 == 0.0)
# float_min = 1e-323
# get n_gram probability
prob = self.get_google_ngram_prob(n_gram)
return prob if prob != 0.0 and prob != None else self.float_min
def find_index_of_error(self, n_grams):
""" finds index of greatest error in n_grams"""
if len(n_grams) != 0:
probs = [self.get_prob_of_n_gram(n_gram) for n_gram in n_grams]
else:
probs = [self.float_min]
chained_probs = math.prod(probs)
diff_list = [(j-i) for i, j in zip(probs[:-1], probs[1:])]
logged_chained_prob = -np.log10((chained_probs)**(1/len(n_grams))) if (chained_probs)**(1/len(n_grams)) != 0 else (chained_probs)**(1/len(n_grams))
if logged_chained_prob <= self.threshold:
return probs.index(min(probs))
else:
return None