# grammar_checker_google_jeremy.py
# (recovered from a GitLab snippet page; web-page chrome removed)
import json
import math
from time import sleep

import nltk
import numpy as np
import requests
from nltk import pos_tag
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from pyinflect import getAllInflections, getInflection
    
    lemmatizer = WordNetLemmatizer()
    
    def suggest_inflection(word):
        pos = pos_tag([word])[0][1]
        if pos.startswith("N"):
            # Nouns mapped with noun markers
            startswith ="N"
            lemmparam = "n"
            list_of_suggestions = None
        elif pos.startswith("R"):
            # adverbs mapped with adverb markers
            startswith ="A"
            lemmparam ="r"
            list_of_suggestions = None
        elif pos.startswith("J"):
            # adjectives mapped with adjective markers
            startswith ="A"
            lemmparam ="a"
            list_of_suggestions = None  
        elif pos.startswith("V"):
            # Verbs mapped with verb markers
            startswith ="V"
            lemmparam ="v"
            list_of_suggestions = None
        elif pos == "PRP" or pos =="PRP$":
            # If word in posessive pronoun, try all posessive pronouns
            list_of_suggestions = ["I","you", "he", "she", "it", "we", "they", "me", "him", "her", "us","my", "mine", "our", "ours", "its",\
                                   "his", "her", "hers", "their", "theirs", "your" , "yours"]
            startswith = None
        else:
            # Else, return nothing
            startswith = None
            list_of_suggestions = None
        if list_of_suggestions is None and startswith is not None :
            # if startswith is not None return list of suggestions/ inflections of the word given based on the POS tag
            if lemmparam == "r":
                # for adverbs , inflections of th
                s = []
                suggestion = ""
                for ss in wn.synsets(word):
                    for lemmas in ss.lemmas(): # all possible lemmas.
                        s.append(lemmas)
    
                for pers in s:
                    posword = pers.pertainyms()
                    if len(posword) == 0:
                        continue
                    else:
                        posword = posword[0].name()
                        if posword[0:3] == word[0:3] or posword[0:4] == word[0:4] :
                            suggestion = posword
                            break
                word = lemmatizer.lemmatize(suggestion,lemmparam)
                inflections = getAllInflections(word)
                tags =[ key for key in inflections.keys()  ]
                suggestion_list =  [inflections[tag] for tag in tags]
                suggestion = [i for sub in suggestion_list for i in sub]
                return suggestion
            else:
                word = lemmatizer.lemmatize(word,lemmparam)
                inflections = getAllInflections(word)
                tags =[ key for key in inflections.keys()  ]
                suggestion_list =  [inflections[tag] for tag in tags]
                suggestion = [i for sub in suggestion_list for i in sub]
                return suggestion
        elif list_of_suggestions is not None and startswith is None:
            return list_of_suggestions
    
    
    class GrammarCheckerGoogle:
        def __init__(self,n,float_min,threshold):
            # required
            nltk.download('punkt')
            self.float_min= float_min
            # variables 
            self.threshold = threshold
            self.n = n
            # tokenizer
            self.tokenizer = nltk.RegexpTokenizer(r"\w+") 
    
        def check(self, sentence):
            """ checks a sentence for errors and recursively corrects the first one """
            # lower case sentence
            sentence = sentence.lower()
        
            n_grams = list(nltk.ngrams(self.tokenizer.tokenize(sentence),self.n))
            # find error
            i_error = self.find_index_of_error(n_grams)
            
            return i_error
    
        def get_google_ngram_prob(self, n_gram):
            """ gets probability for given n_gram """
            url = f"https://books.google.com/ngrams/json?content={' '.join(n_gram)}&case_insensitive=true"
            successful = False
            wait_time = 0.0001
            while not successful:
                response = requests.get(url)
                sleep(wait_time)
                if response.ok:
                    successful = True
                    results = json.loads(response.content)
                    if results:
                        max_prob = 0.0
                        for result in results:
                            cur_max_prob = max(results[0]["timeseries"])
                            max_prob = cur_max_prob if cur_max_prob > max_prob else max_prob
                        return max_prob
                    else:
                        return None
                if not successful:
                    if wait_time < 10:
                        # print(f"no response: increasing wait time from {wait_time} to {wait_time*10}.")
                        wait_time *= 10
                    else:
                        pass
                        # print("still no response.")
            
        def get_prob_of_n_gram(self, n_gram):
            """ calculates probability of n_gram """
            # smallest possible positive float (1e-324 == 0.0)
            # float_min = 1e-323
            # get n_gram probability
            prob = self.get_google_ngram_prob(n_gram)
            return prob if prob != 0.0 and prob != None else self.float_min
    
        def find_index_of_error(self, n_grams):
            """ finds index of greatest error in n_grams"""
            if len(n_grams) != 0:
                probs = [self.get_prob_of_n_gram(n_gram) for n_gram in n_grams]
            else:
                probs = [self.float_min]
            chained_probs = math.prod(probs)
            diff_list = [(j-i) for i, j in zip(probs[:-1], probs[1:])]
    
    jmzk96's avatar
    jmzk96 committed
            logged_chained_prob = -np.log10((chained_probs)**(1/len(n_grams))) if (chained_probs)**(1/len(n_grams)) != 0 else (chained_probs)**(1/len(n_grams))
    
            if logged_chained_prob <= self.threshold:
    
                return probs.index(min(probs))
            else:
                return None