# grammar_checker_google_jeremy.py
# (recovered from a GitLab snippet page; web-page chrome removed)
import json
import math
from time import sleep

import nltk
import numpy as np
import requests
from nltk import pos_tag
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from pyinflect import getAllInflections, getInflection
    
    lemmatizer = WordNetLemmatizer()
    
    def suggest_inflection(word):
        pos = pos_tag([word])[0][1]
        if pos.startswith("N"):
            # Nouns mapped with noun markers
            startswith ="N"
            lemmparam = "n"
            list_of_suggestions = None
        elif pos.startswith("R"):
            # adverbs mapped with adverb markers
            startswith ="A"
            lemmparam ="r"
            list_of_suggestions = None
        elif pos.startswith("J"):
            # adjectives mapped with adjective markers
            startswith ="A"
            lemmparam ="a"
            list_of_suggestions = None  
        elif pos.startswith("V"):
            # Verbs mapped with verb markers
            startswith ="V"
            lemmparam ="v"
            list_of_suggestions = None
        elif pos == "PRP" or pos =="PRP$":
            # If word in posessive pronoun, try all posessive pronouns
            list_of_suggestions = ["I","you", "he", "she", "it", "we", "they", "me", "him", "her", "us","my", "mine", "our", "ours", "its",\
                                   "his", "her", "hers", "their", "theirs", "your" , "yours"]
            startswith = None
        else:
            # Else, return nothing
            startswith = None
            list_of_suggestions = None
        if list_of_suggestions is None and startswith is not None :
            # if startswith is not None return list of suggestions/ inflections of the word given based on the POS tag
            if lemmparam == "r":
                # for adverbs , inflections of th
                s = []
                suggestion = ""
                for ss in wn.synsets(word):
                    for lemmas in ss.lemmas(): # all possible lemmas.
                        s.append(lemmas)
    
                for pers in s:
                    posword = pers.pertainyms()
                    if len(posword) == 0:
                        continue
                    else:
                        posword = posword[0].name()
                        if posword[0:3] == word[0:3] or posword[0:4] == word[0:4] :
                            suggestion = posword
                            break
                word = lemmatizer.lemmatize(suggestion,lemmparam)
                inflections = getAllInflections(word)
                tags =[ key for key in inflections.keys()  ]
                suggestion_list =  [inflections[tag] for tag in tags]
                suggestion = [i for sub in suggestion_list for i in sub]
                return suggestion
            else:
                word = lemmatizer.lemmatize(word,lemmparam)
                inflections = getAllInflections(word)
                tags =[ key for key in inflections.keys()  ]
                suggestion_list =  [inflections[tag] for tag in tags]
                suggestion = [i for sub in suggestion_list for i in sub]
                return suggestion
        elif list_of_suggestions is not None and startswith is None:
            return list_of_suggestions
    
    
    class GrammarCheckerGoogle:
        def __init__(self,n,float_min,threshold):
            # required
            nltk.download('punkt')
            self.float_min= float_min
            # variables 
            self.threshold = threshold
            self.n = n
            # tokenizer
            self.tokenizer = nltk.RegexpTokenizer(r"\w+") 
    
        def check(self, sentence):
            """ checks a sentence for errors and recursively corrects the first one """
            # lower case sentence
            sentence = sentence.lower()
        
            n_grams = list(nltk.ngrams(self.tokenizer.tokenize(sentence),self.n))
            # find error
            i_error = self.find_index_of_error(n_grams)
            
            return i_error
    
        def get_google_ngram_prob(self, n_gram):
            """ gets probability for given n_gram """
            url = f"https://books.google.com/ngrams/json?content={' '.join(n_gram)}&case_insensitive=true"
            successful = False
            wait_time = 0.0001
            while not successful:
                response = requests.get(url)
                sleep(wait_time)
                if response.ok:
                    successful = True
                    results = json.loads(response.content)
                    if results:
                        max_prob = 0.0
                        for result in results:
                            cur_max_prob = max(results[0]["timeseries"])
                            max_prob = cur_max_prob if cur_max_prob > max_prob else max_prob
                        return max_prob
                    else:
                        return None
                if not successful:
                    if wait_time < 10:
                        # print(f"no response: increasing wait time from {wait_time} to {wait_time*10}.")
                        wait_time *= 10
                    else:
                        pass
                        # print("still no response.")
            
        def get_prob_of_n_gram(self, n_gram):
            """ calculates probability of n_gram """
            # smallest possible positive float (1e-324 == 0.0)
            # float_min = 1e-323
            # get n_gram probability
            prob = self.get_google_ngram_prob(n_gram)
            return prob if prob != 0.0 and prob != None else self.float_min
    
        def find_index_of_error(self, n_grams):
            """ finds index of greatest error in n_grams"""
            if len(n_grams) != 0:
                probs = [self.get_prob_of_n_gram(n_gram) for n_gram in n_grams]
            else:
                probs = [self.float_min]
            chained_probs = math.prod(probs)
            diff_list = [(j-i) for i, j in zip(probs[:-1], probs[1:])]
    
    jmzk96's avatar
    jmzk96 committed
            logged_chained_prob = -np.log10((chained_probs)**(1/len(n_grams))) if (chained_probs)**(1/len(n_grams)) != 0 else (chained_probs)**(1/len(n_grams))
    
            if logged_chained_prob <= self.threshold:
    
                return probs.index(min(probs))
            else:
                return None