import json import nltk import requests import threading import numpy as np from time import sleep from tester import Tester from nltk.stem import WordNetLemmatizer from nltk.corpus import wordnet as wn from nltk import pos_tag from pyinflect import getAllInflections class GrammarChecker: def __init__(self): # required nltk.download('punkt') nltk.download('averaged_perceptron_tagger') nltk.download('wordnet') # variables self.default_prob = 1e-10 self.thresholds = {2: 5.6, 3: 7.7} # tokenizer self.tokenizer = nltk.RegexpTokenizer(r"\w+") # lemmatizer self.lemmatizer = WordNetLemmatizer() def check(self, sentence): """ checks a sentence for errors and recursively corrects the first one """ # lower case sentence sentence = sentence.lower() # create n_grams n_grams = {1: list(nltk.ngrams(self.tokenizer.tokenize(sentence), 1))} for n in [2, 3]: n_grams[n] = list(nltk.ngrams(self.tokenizer.tokenize(sentence), n)) # find errors i_errors = self.find_index_of_error(n_grams) # get corrections unigrams, i_corrections = self.get_corrections(n_grams, i_errors) print(f"unigrams: {unigrams}") print(f"i_corrections: {i_corrections}") return unigrams, i_corrections def get_corrections(self, n_grams, i_errors): """ gets corrections for errors """ # get unigrams and create result corrections dict unigrams = [unigram[0] for unigram in n_grams[1]] i_corrections = {i_error: unigrams[i_error] for i_error in i_errors} # if errors are found if i_corrections: # collect probabilities of inflections for all errors probs = {} for i_error, word in i_corrections.items(): probs[i_error] = {} try: inflections = set(self.suggest_inflections(word)) except Exception: continue for n in n_grams: if n == 1: continue probs[i_error][n] = {} n_gram_indexes = self.get_n_gram_indexes_from_word_index(n, len(n_grams[n]), i_error) error_n_grams = [n_grams[n][n_gram_index] for n_gram_index in n_gram_indexes] # threads for checking error_n_grams with inflections in parallel threads = [] for error_n_gram in error_n_grams: threads.append(threading.Thread(target=self.check_n_gram_inflections, args=(probs, i_error, n, error_n_gram, inflections, word))) threads[-1].setDaemon(True) threads[-1].start() for thread in threads: thread.join() # voting mechanism prob_accumulator = {} for i_error, ns in probs.items(): prob_accumulator[i_error] = {} for n, error_n_grams in ns.items(): for error_n_gram, inflections in error_n_grams.items(): for inflection, prob in inflections.items(): if inflection in prob_accumulator[i_error]: prob_accumulator[i_error][inflection] += prob else: prob_accumulator[i_error][inflection] = prob # determine best inflections for i_error, inflections in prob_accumulator.items(): if inflections: i_corrections[i_error] = sorted(inflections.items(), key=lambda index: -index[1])[0][0] return unigrams, i_corrections def check_n_gram_inflections(self, probs, i_error, n, error_n_gram, inflections, word): probs[i_error][n][error_n_gram] = {} inflection_n_grams = [] for inflection in inflections: tmp = list(error_n_gram) index = tmp.index(word) tmp[index] = inflection inflection_n_grams.append(tmp) inflection_probs = self.get_probs_of_n_grams(inflection_n_grams) for i, inflection in enumerate(inflections): probs[i_error][n][error_n_gram][inflection] = inflection_probs[i] def suggest_inflections(self, word): pos = pos_tag([word])[0][1] if pos.startswith("N"): # Nouns mapped with noun markers startswith ="N" lemmparam = "n" list_of_suggestions = None elif pos.startswith("R"): # adverbs mapped with adverb markers startswith ="A" lemmparam ="r" list_of_suggestions = None elif pos.startswith("J"): # adjectives mapped with adjective markers startswith ="A" lemmparam ="a" list_of_suggestions = None elif pos.startswith("V"): # Verbs mapped with verb markers startswith ="V" lemmparam ="v" list_of_suggestions = None elif pos == "PRP" or pos =="PRP$": # If word in posessive pronoun, try all posessive pronouns list_of_suggestions = ["I","you", "he", "she", "it", "we", "they", "me", "him", "her", "us","my", "mine", "our", "ours", "its",\ "his", "her", "hers", "their", "theirs", "your" , "yours"] startswith = None else: # Else, return nothing startswith = None list_of_suggestions = None if list_of_suggestions is None and startswith is not None: # if startswith is not None return list of suggestions/ inflections of the word given based on the POS tag if lemmparam == "r": # for adverbs , inflections of th s = [] suggestion = "" for ss in wn.synsets(word): for lemmas in ss.lemmas(): # all possible lemmas. s.append(lemmas) for pers in s: posword = pers.pertainyms() if len(posword) == 0: continue else: posword = posword[0].name() if posword[0:3] == word[0:3] or posword[0:4] == word[0:4] : suggestion = posword break word = self.lemmatizer.lemmatize(suggestion,lemmparam) inflections = getAllInflections(word) tags =[ key for key in inflections.keys() ] suggestion_list = [inflections[tag] for tag in tags] suggestion = [i for sub in suggestion_list for i in sub] return suggestion else: word = self.lemmatizer.lemmatize(word,lemmparam) inflections = getAllInflections(word) tags =[ key for key in inflections.keys() ] suggestion_list = [inflections[tag] for tag in tags] suggestion = [i for sub in suggestion_list for i in sub] return suggestion elif list_of_suggestions is not None and startswith is None: return list_of_suggestions def get_google_ngram_prob(self, n_gram): """ gets probability for given n_gram """ url = f"https://books.google.com/ngrams/json?content={' '.join(n_gram)}&case_insensitive=true" successful = False wait_time = 0.0001 while not successful: response = requests.get(url) sleep(wait_time) if response.ok: successful = True results = json.loads(response.content) if results: max_prob = 0.0 for result in results: cur_max_prob = max(results[0]["timeseries"]) max_prob = cur_max_prob if cur_max_prob > max_prob else max_prob return max_prob else: return None if not successful: if wait_time < 10: # print(f"no response: increasing wait time from {wait_time} to {wait_time*10}.") wait_time *= 10 else: pass # print("still no response.") def get_prob_of_n_gram(self, n_gram, probs, i): """ calculates probability of n_gram """ # get n_gram probability prob = self.get_google_ngram_prob(n_gram) probs[i] = prob if prob != 0.0 and prob != None else self.default_prob def get_probs_of_n_grams(self, n_grams): # create target list probs = [None]*len(n_grams) # create and start threads threads = [] for i, n_gram in enumerate(n_grams): threads.append(threading.Thread(target=self.get_prob_of_n_gram, args=(n_gram, probs, i))) threads[-1].setDaemon(True) threads[-1].start() # join threads for thread in threads: thread.join() return probs def get_word_indexes_from_n_gram_index(self, n, n_gram_index): word_indexes = [n_gram_index] for i in range(n-1): word_indexes.append(word_indexes[-1]+1) return word_indexes def get_n_gram_indexes_from_word_index(self, n, n_gram_cnt, word_index): n_gram_indexes = [0] if word_index < n else [word_index-n+1] for i in range(word_index%n if word_index < n else n-1): nxt = n_gram_indexes[-1]+1 if nxt < n_gram_cnt: n_gram_indexes.append(nxt) return n_gram_indexes def find_index_of_error(self, n_grams): """ finds index of greatest error in n_grams""" # get probabilities for all n_grams probs = {} thresholds_passed = {} smallest_prob_counter = {2: {i: 0 for i in range(len(n_grams[1]))}, 3: {i: 0 for i in range(len(n_grams[1]))}} for n in n_grams: # don't take 1-grams into account if n == 1: continue # smallest prob probs[n] = self.get_probs_of_n_grams(n_grams[n]) try: for index in self.get_word_indexes_from_n_gram_index(n, probs[n].index(min(probs[n]))): smallest_prob_counter[n][index] += 1 except Exception: pass # threshholds check if np.prod(probs[n]) == 0: thresholds_passed[n] = True else: thresholds_passed[n] = -np.log10((np.prod(probs[n]))**(1/len(n_grams[n]))) <= self.thresholds[n] # determine indexes of errors i_errors = [] max_counter = 0 total_smallest_prob_counter = {i: 0 for i in range(len(n_grams[1]))} for n, smallest_probs in smallest_prob_counter.items(): if True: # thresholds_passed[n]: for index in total_smallest_prob_counter: total_smallest_prob_counter[index] += smallest_probs[index] for index, counter in sorted(total_smallest_prob_counter.items(), key=lambda index: -index[1]): if counter >= max_counter and counter != 0: i_errors.append(index) max_counter = counter print(f"below thresholds? bi-gram: {thresholds_passed[2]}, tri-gram: {thresholds_passed[3]}") print("notice: sentence level thresholds ignored. we currently always assume an error.") return i_errors if __name__ == "__main__": # get sentences tester = Tester() # create grammar checker grammar_checker = GrammarChecker() # check sentences print("CORRECT SENTENCES\n\n") for sentence in tester.correct_sentences: print(sentence.text) grammar_checker.check(sentence.text) print() print("\nTYPE 1 ERROR SENTENCES\n\n") for sentence in tester.type_1_error_sentences: print(sentence.text) print(sentence.original) grammar_checker.check(sentence.text) print() print("\nTYPE 2 ERROR SENTENCES\n\n") for sentence in tester.type_2_error_sentences: print(sentence.text) print(sentence.original) grammar_checker.check(sentence.text) print() print("\nTYPE 3 ERROR SENTENCES\n\n") for sentence in tester.type_3_error_sentences: print(sentence.text) print(sentence.original) grammar_checker.check(sentence.text) print()