Skip to content
Snippets Groups Projects
Commit 2e9bdd5e authored by jmzk96's avatar jmzk96
Browse files

added codes to script grammar_checker_google_jeremy

parent 06bd8a2f
Branches main
No related tags found
No related merge requests found
...@@ -26,7 +26,7 @@ class GrammarChecker: ...@@ -26,7 +26,7 @@ class GrammarChecker:
sentence = sentence.lower() sentence = sentence.lower()
# create n_grams # create n_grams
n_grams = {1: list(nltk.ngrams(self.tokenizer.tokenize(sentence), 1))} n_grams = {1: list(nltk.ngrams(self.tokenizer.tokenize(sentence), 1))}
for n in [2, 3] for n in [2, 3]:
n_grams[n] = list(nltk.ngrams(self.tokenizer.tokenize(sentence), n)) n_grams[n] = list(nltk.ngrams(self.tokenizer.tokenize(sentence), n))
# find error # find error
i_errors = self.find_index_of_error(n_grams) i_errors = self.find_index_of_error(n_grams)
......
import json
import nltk
import math
import requests
from time import sleep
class GrammarCheckerGoogle:
    """Locate the least probable n-gram of a sentence via Google Books Ngram data.

    The sentence is lower-cased, tokenized into word tokens and split into
    n-grams of a fixed size. Each n-gram's probability is looked up online;
    when the geometric mean of those probabilities is at or below a
    threshold, the index of the least probable n-gram is reported.
    """

    def __init__(self, n, float_min, threshold):
        """Store the configuration and build the tokenizer.

        Args:
            n: Size of the n-grams built from each sentence.
            float_min: Probability substituted for n-grams whose lookup
                yields no data or a probability of exactly 0.0.
            threshold: Upper bound on the geometric mean of the n-gram
                probabilities for a sentence to be reported as erroneous.
        """
        # required
        nltk.download('punkt')
        self.float_min = float_min
        # variables
        self.threshold = threshold
        self.n = n
        # tokenizer: keeps runs of word characters, dropping punctuation
        self.tokenizer = nltk.RegexpTokenizer(r"\w+")

    def check(self, sentence):
        """Return the index of the least probable n-gram in ``sentence``.

        Args:
            sentence: Text to check; it is lower-cased before tokenizing.

        Returns:
            The index (into the sentence's n-gram list) of the least probable
            n-gram, or ``None`` when the sentence is above the threshold.
        """
        tokens = self.tokenizer.tokenize(sentence.lower())
        n_grams = list(nltk.ngrams(tokens, self.n))
        return self.find_index_of_error(n_grams)

    @staticmethod
    def _max_timeseries_prob(results):
        """Return the largest "timeseries" value over all results.

        Args:
            results: Parsed JSON response: a list of dicts that each carry a
                "timeseries" list of numbers.

        Returns:
            ``None`` for an empty result list, otherwise the peak value
            across every result (never below 0.0).
        """
        if not results:
            return None
        peak = 0.0
        for result in results:
            # Inspect every result, not only the first one.
            peak = max(peak, max(result["timeseries"], default=0.0))
        return peak

    @staticmethod
    def _geometric_mean(probs):
        """Return the geometric mean of ``probs``, computed in log space.

        Multiplying many tiny probabilities underflows to 0.0; averaging
        their logarithms avoids that.
        """
        if any(p <= 0.0 for p in probs):
            # log() is undefined here; a zero factor makes the mean zero.
            return 0.0
        return math.exp(math.fsum(math.log(p) for p in probs) / len(probs))

    def get_google_ngram_prob(self, n_gram):
        """Fetch the peak probability of ``n_gram`` from Google Books Ngrams.

        Args:
            n_gram: Sequence of tokens; they are joined with single spaces.

        Returns:
            The largest timeseries value over all returned results, or
            ``None`` when the service returns an empty result list.

        NOTE(review): the request is retried without an upper bound, so a
        persistently failing request never returns — confirm this is wanted.
        """
        url = f"https://books.google.com/ngrams/json?content={' '.join(n_gram)}&case_insensitive=true"
        wait_time = 0.0001
        while True:
            response = requests.get(url)
            if response.ok:
                return self._max_timeseries_prob(json.loads(response.content))
            # Back off before retrying; grow the delay tenfold until it
            # reaches roughly ten seconds.
            sleep(wait_time)
            if wait_time < 10:
                wait_time *= 10

    def get_prob_of_n_gram(self, n_gram):
        """Return the probability of ``n_gram``, never ``None`` or zero.

        Missing data (``None``) and a probability of exactly 0.0 are both
        replaced by ``self.float_min``.
        """
        prob = self.get_google_ngram_prob(n_gram)
        if prob is None or prob == 0.0:
            return self.float_min
        return prob

    def find_index_of_error(self, n_grams):
        """Find the index of the least probable n-gram in ``n_grams``.

        Args:
            n_grams: List of n-grams (token sequences) of one sentence.

        Returns:
            The index of the lowest-probability n-gram when the geometric
            mean of all probabilities is at or below ``self.threshold``,
            otherwise ``None``. An empty ``n_grams`` is scored as a single
            probability of ``self.float_min``, so it yields 0 or ``None``.
        """
        if n_grams:
            probs = [self.get_prob_of_n_gram(n_gram) for n_gram in n_grams]
        else:
            probs = [self.float_min]
        # Normalize by len(probs), which is never zero, rather than
        # len(n_grams), which is zero for the empty case.
        if self._geometric_mean(probs) <= self.threshold:
            return probs.index(min(probs))
        return None
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment