grammar_checker.py

import json
import nltk
import requests
import threading
import numpy as np
from time import sleep
from tester import Tester

from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk import pos_tag
from pyinflect import getAllInflections


class GrammarChecker:
    def __init__(self):
        # required
        nltk.download('punkt')
        nltk.download('averaged_perceptron_tagger')
        nltk.download('wordnet')
        # variables
        self.default_prob = 1e-10
        self.thresholds = {2: 5.6, 3: 7.7}
        # tokenizer
        self.tokenizer = nltk.RegexpTokenizer(r"\w+") 
        # lemmatizer
        self.lemmatizer = WordNetLemmatizer()

    def check(self, sentence):
        """ checks a sentence for errors and recursively corrects the first one """
        # lower case sentence
        sentence = sentence.lower()
        # create n_grams
        n_grams = {1: list(nltk.ngrams(self.tokenizer.tokenize(sentence), 1))}
        for n in [2, 3]:
            n_grams[n] = list(nltk.ngrams(self.tokenizer.tokenize(sentence), n))
        # find errors
        i_errors = self.find_index_of_error(n_grams)
        # get corrections
        unigrams, i_corrections = self.get_corrections(n_grams, i_errors)
        print(f"unigrams: {unigrams}")
        print(f"i_corrections: {i_corrections}")
        return unigrams, i_corrections

    def get_corrections(self, n_grams, i_errors):
        """ gets corrections for errors """
        # get unigrams and create result corrections dict
        unigrams = [unigram[0] for unigram in n_grams[1]]
        i_corrections = {i_error: unigrams[i_error] for i_error in i_errors}

        # if errors are found
        if i_corrections:
            # collect probabilities of inflections for all errors
            probs = {}
            for i_error, word in i_corrections.items():
                probs[i_error] = {}
                try:
                    inflections = set(self.suggest_inflections(word))
                except Exception:
                    continue
                for n in n_grams:
                    if n == 1:
                        continue
                    probs[i_error][n] = {}
                    n_gram_indexes = self.get_n_gram_indexes_from_word_index(n, len(n_grams[n]), i_error)
                    error_n_grams = [n_grams[n][n_gram_index] for n_gram_index in n_gram_indexes]
                    # threads for checking error_n_grams with inflections in parallel
                    threads = []
                    for error_n_gram in error_n_grams:
                        threads.append(threading.Thread(target=self.check_n_gram_inflections, args=(probs, i_error, n, error_n_gram, inflections, word)))
                        threads[-1].setDaemon(True)
                        threads[-1].start()
                    for thread in threads:
                        thread.join()

            # voting mechanism
            prob_accumulator = {}
            for i_error, ns in probs.items():
                prob_accumulator[i_error] = {}
                for n, error_n_grams in ns.items():
                    for error_n_gram, inflections in error_n_grams.items():
                        for inflection, prob in inflections.items():
                            if inflection in prob_accumulator[i_error]:
                                prob_accumulator[i_error][inflection] += prob
                            else:
                                prob_accumulator[i_error][inflection] = prob

            # determine best inflections
            for i_error, inflections in prob_accumulator.items():
                if inflections:
                    i_corrections[i_error] = sorted(inflections.items(), key=lambda index: -index[1])[0][0]

        return unigrams, i_corrections

    def check_n_gram_inflections(self, probs, i_error, n, error_n_gram, inflections, word):
        probs[i_error][n][error_n_gram] = {}
        inflection_n_grams = []
        for inflection in inflections:
            tmp = list(error_n_gram)
            index = tmp.index(word)
            tmp[index] = inflection
            inflection_n_grams.append(tmp)
        inflection_probs = self.get_probs_of_n_grams(inflection_n_grams)
        for i, inflection in enumerate(inflections):
            probs[i_error][n][error_n_gram][inflection] = inflection_probs[i]

    def suggest_inflections(self, word):
        pos = pos_tag([word])[0][1]
        if pos.startswith("N"):
            # Nouns mapped with noun markers
            startswith ="N"
            lemmparam = "n"
            list_of_suggestions = None
        elif pos.startswith("R"):
            # adverbs mapped with adverb markers
            startswith ="A"
            lemmparam ="r"
            list_of_suggestions = None
        elif pos.startswith("J"):
            # adjectives mapped with adjective markers
            startswith ="A"
            lemmparam ="a"
            list_of_suggestions = None  
        elif pos.startswith("V"):
            # Verbs mapped with verb markers
            startswith ="V"
            lemmparam ="v"
            list_of_suggestions = None
        elif pos == "PRP" or pos =="PRP$":
            # If word in posessive pronoun, try all posessive pronouns
            list_of_suggestions = ["I","you", "he", "she", "it", "we", "they", "me", "him", "her", "us","my", "mine", "our", "ours", "its",\
                                   "his", "her", "hers", "their", "theirs", "your" , "yours"]
            startswith = None
        else:
            # Else, return nothing
            startswith = None
            list_of_suggestions = None
        if list_of_suggestions is None and startswith is not None:
            # if startswith is not None return list of suggestions/ inflections of the word given based on the POS tag
            if lemmparam == "r":
                # for adverbs , inflections of th
                s = []
                suggestion = ""
                for ss in wn.synsets(word):
                    for lemmas in ss.lemmas(): # all possible lemmas.
                        s.append(lemmas)

                for pers in s:
                    posword = pers.pertainyms()
                    if len(posword) == 0:
                        continue
                    else:
                        posword = posword[0].name()
                        if posword[0:3] == word[0:3] or posword[0:4] == word[0:4] :
                            suggestion = posword
                            break
                word = self.lemmatizer.lemmatize(suggestion,lemmparam)
                inflections = getAllInflections(word)
                tags =[ key for key in inflections.keys()  ]
                suggestion_list =  [inflections[tag] for tag in tags]
                suggestion = [i for sub in suggestion_list for i in sub]
                return suggestion
            else:
                word = self.lemmatizer.lemmatize(word,lemmparam)
                inflections = getAllInflections(word)
                tags =[ key for key in inflections.keys()  ]
                suggestion_list =  [inflections[tag] for tag in tags]
                suggestion = [i for sub in suggestion_list for i in sub]
                return suggestion
        elif list_of_suggestions is not None and startswith is None:
            return list_of_suggestions

    def get_google_ngram_prob(self, n_gram):
        """ gets probability for given n_gram """
        url = f"https://books.google.com/ngrams/json?content={' '.join(n_gram)}&case_insensitive=true"
        successful = False
        wait_time = 0.0001
        while not successful:
            response = requests.get(url)
            sleep(wait_time)
            if response.ok:
                successful = True
                results = json.loads(response.content)
                if results:
                    max_prob = 0.0
                    for result in results:
                        cur_max_prob = max(results[0]["timeseries"])
                        max_prob = cur_max_prob if cur_max_prob > max_prob else max_prob
                    return max_prob
                else:
                    return None
            if not successful:
                if wait_time < 10:
                    # print(f"no response: increasing wait time from {wait_time} to {wait_time*10}.")
                    wait_time *= 10
                else:
                    pass
                    # print("still no response.")
        
    def get_prob_of_n_gram(self, n_gram, probs, i):
        """ calculates probability of n_gram """
        # get n_gram probability
        prob = self.get_google_ngram_prob(n_gram)
        probs[i] = prob if prob != 0.0 and prob != None else self.default_prob

    def get_probs_of_n_grams(self, n_grams):
        # create target list
        probs = [None]*len(n_grams)
        # create and start threads
        threads = []
        for i, n_gram in enumerate(n_grams):
            threads.append(threading.Thread(target=self.get_prob_of_n_gram, args=(n_gram, probs, i)))
            threads[-1].setDaemon(True)
            threads[-1].start()
        # join threads
        for thread in threads:
            thread.join()
        return probs

    def get_word_indexes_from_n_gram_index(self, n, n_gram_index):
        word_indexes = [n_gram_index]
        for i in range(n-1):
            word_indexes.append(word_indexes[-1]+1)
        return word_indexes

    def get_n_gram_indexes_from_word_index(self, n, n_gram_cnt, word_index):
        n_gram_indexes = [0] if word_index < n else [word_index-n+1]
        for i in range(word_index%n if word_index < n else n-1):
            nxt = n_gram_indexes[-1]+1
            if nxt < n_gram_cnt:
                n_gram_indexes.append(nxt)
        return n_gram_indexes

    def find_index_of_error(self, n_grams):
        """ finds index of greatest error in n_grams"""
        # get probabilities for all n_grams
        probs = {}
        thresholds_passed = {}
        smallest_prob_counter = {2: {i: 0 for i in range(len(n_grams[1]))}, 3: {i: 0 for i in range(len(n_grams[1]))}}
        for n in n_grams:
            # don't take 1-grams into account
            if n == 1:
                continue
            # smallest prob
            probs[n] = self.get_probs_of_n_grams(n_grams[n])
            try:
                for index in self.get_word_indexes_from_n_gram_index(n, probs[n].index(min(probs[n]))):
                    smallest_prob_counter[n][index] += 1
            except Exception:
                pass
            # threshholds check
            if np.prod(probs[n]) == 0:
                thresholds_passed[n] = True
            else:
                thresholds_passed[n] = -np.log10((np.prod(probs[n]))**(1/len(n_grams[n]))) <= self.thresholds[n]

        # determine indexes of errors
        i_errors = []
        max_counter = 0
        total_smallest_prob_counter = {i: 0 for i in range(len(n_grams[1]))}
        for n, smallest_probs in smallest_prob_counter.items():
            if True:  # thresholds_passed[n]:
                for index in total_smallest_prob_counter:
                    total_smallest_prob_counter[index] += smallest_probs[index]
        for index, counter in sorted(total_smallest_prob_counter.items(), key=lambda index: -index[1]):
            if counter >= max_counter and counter != 0:
                i_errors.append(index)
                max_counter = counter

        print(f"below thresholds? bi-gram: {thresholds_passed[2]}, tri-gram: {thresholds_passed[3]}")
        print("notice: sentence level thresholds ignored. we currently always assume an error.")
        return i_errors


if __name__ == "__main__":
    # get sentences
    tester = Tester()

    # create grammar checker
    grammar_checker = GrammarChecker()

    # check sentences
    print("CORRECT SENTENCES\n\n")
    for sentence in tester.correct_sentences:
        print(sentence.text)
        grammar_checker.check(sentence.text)
        print()

    print("\nTYPE 1 ERROR SENTENCES\n\n")
    for sentence in tester.type_1_error_sentences:
        print(sentence.text)
        print(sentence.original)
        grammar_checker.check(sentence.text)
        print()

    print("\nTYPE 2 ERROR SENTENCES\n\n")
    for sentence in tester.type_2_error_sentences:
        print(sentence.text)
        print(sentence.original)
        grammar_checker.check(sentence.text)
        print()

    print("\nTYPE 3 ERROR SENTENCES\n\n")
    for sentence in tester.type_3_error_sentences:
        print(sentence.text)
        print(sentence.original)
        grammar_checker.check(sentence.text)
        print()