Skip to content
Snippets Groups Projects
grammar_checker.py 12.4 KiB
Newer Older
  • Learn to ignore specific revisions
  • Heiko Raible's avatar
    Heiko Raible committed
    import json
    import nltk
    import requests
    import threading
    import numpy as np
    from time import sleep
    from tester import Tester
    
    from nltk.stem import WordNetLemmatizer
    from nltk.corpus import wordnet as wn
    from nltk import pos_tag
    from pyinflect import getAllInflections
    
    
    class GrammarChecker:
        def __init__(self):
            # required
            nltk.download('punkt')
            nltk.download('averaged_perceptron_tagger')
            nltk.download('wordnet')
            # variables
            self.default_prob = 1e-10
            self.thresholds = {2: 5.6, 3: 7.7}
            # tokenizer
            self.tokenizer = nltk.RegexpTokenizer(r"\w+") 
            # lemmatizer
            self.lemmatizer = WordNetLemmatizer()
    
        def check(self, sentence):
            """ checks a sentence for errors and recursively corrects the first one """
            # lower case sentence
            sentence = sentence.lower()
            # create n_grams
            n_grams = {1: list(nltk.ngrams(self.tokenizer.tokenize(sentence), 1))}
            for n in [2, 3]:
                n_grams[n] = list(nltk.ngrams(self.tokenizer.tokenize(sentence), n))
            # find errors
            i_errors = self.find_index_of_error(n_grams)
            # get corrections
            unigrams, i_corrections = self.get_corrections(n_grams, i_errors)
            print(f"unigrams: {unigrams}")
            print(f"i_corrections: {i_corrections}")
            return unigrams, i_corrections
    
        def get_corrections(self, n_grams, i_errors):
            """ gets corrections for errors """
            # get unigrams and create result corrections dict
            unigrams = [unigram[0] for unigram in n_grams[1]]
            i_corrections = {i_error: unigrams[i_error] for i_error in i_errors}
    
            # if errors are found
            if i_corrections:
                # collect probabilities of inflections for all errors
                probs = {}
                for i_error, word in i_corrections.items():
                    probs[i_error] = {}
                    try:
                        inflections = set(self.suggest_inflections(word))
                    except Exception:
                        continue
                    for n in n_grams:
                        if n == 1:
                            continue
                        probs[i_error][n] = {}
                        n_gram_indexes = self.get_n_gram_indexes_from_word_index(n, len(n_grams[n]), i_error)
                        error_n_grams = [n_grams[n][n_gram_index] for n_gram_index in n_gram_indexes]
                        # threads for checking error_n_grams with inflections in parallel
                        threads = []
                        for error_n_gram in error_n_grams:
                            threads.append(threading.Thread(target=self.check_n_gram_inflections, args=(probs, i_error, n, error_n_gram, inflections, word)))
                            threads[-1].setDaemon(True)
                            threads[-1].start()
                        for thread in threads:
                            thread.join()
    
                # voting mechanism
                prob_accumulator = {}
                for i_error, ns in probs.items():
                    prob_accumulator[i_error] = {}
                    for n, error_n_grams in ns.items():
                        for error_n_gram, inflections in error_n_grams.items():
                            for inflection, prob in inflections.items():
                                if inflection in prob_accumulator[i_error]:
                                    prob_accumulator[i_error][inflection] += prob
                                else:
                                    prob_accumulator[i_error][inflection] = prob
    
                # determine best inflections
                for i_error, inflections in prob_accumulator.items():
                    if inflections:
                        i_corrections[i_error] = sorted(inflections.items(), key=lambda index: -index[1])[0][0]
    
            return unigrams, i_corrections
    
        def check_n_gram_inflections(self, probs, i_error, n, error_n_gram, inflections, word):
            probs[i_error][n][error_n_gram] = {}
            inflection_n_grams = []
            for inflection in inflections:
                tmp = list(error_n_gram)
                index = tmp.index(word)
                tmp[index] = inflection
                inflection_n_grams.append(tmp)
            inflection_probs = self.get_probs_of_n_grams(inflection_n_grams)
            for i, inflection in enumerate(inflections):
                probs[i_error][n][error_n_gram][inflection] = inflection_probs[i]
    
        def suggest_inflections(self, word):
            pos = pos_tag([word])[0][1]
            if pos.startswith("N"):
                # Nouns mapped with noun markers
                startswith ="N"
                lemmparam = "n"
                list_of_suggestions = None
            elif pos.startswith("R"):
                # adverbs mapped with adverb markers
                startswith ="A"
                lemmparam ="r"
                list_of_suggestions = None
            elif pos.startswith("J"):
                # adjectives mapped with adjective markers
                startswith ="A"
                lemmparam ="a"
                list_of_suggestions = None  
            elif pos.startswith("V"):
                # Verbs mapped with verb markers
                startswith ="V"
                lemmparam ="v"
                list_of_suggestions = None
            elif pos == "PRP" or pos =="PRP$":
                # If word in posessive pronoun, try all posessive pronouns
                list_of_suggestions = ["I","you", "he", "she", "it", "we", "they", "me", "him", "her", "us","my", "mine", "our", "ours", "its",\
                                       "his", "her", "hers", "their", "theirs", "your" , "yours"]
                startswith = None
            else:
                # Else, return nothing
                startswith = None
                list_of_suggestions = None
            if list_of_suggestions is None and startswith is not None:
                # if startswith is not None return list of suggestions/ inflections of the word given based on the POS tag
                if lemmparam == "r":
                    # for adverbs , inflections of th
                    s = []
                    suggestion = ""
                    for ss in wn.synsets(word):
                        for lemmas in ss.lemmas(): # all possible lemmas.
                            s.append(lemmas)
    
                    for pers in s:
                        posword = pers.pertainyms()
                        if len(posword) == 0:
                            continue
                        else:
                            posword = posword[0].name()
                            if posword[0:3] == word[0:3] or posword[0:4] == word[0:4] :
                                suggestion = posword
                                break
                    word = self.lemmatizer.lemmatize(suggestion,lemmparam)
                    inflections = getAllInflections(word)
                    tags =[ key for key in inflections.keys()  ]
                    suggestion_list =  [inflections[tag] for tag in tags]
                    suggestion = [i for sub in suggestion_list for i in sub]
                    return suggestion
                else:
                    word = self.lemmatizer.lemmatize(word,lemmparam)
                    inflections = getAllInflections(word)
                    tags =[ key for key in inflections.keys()  ]
                    suggestion_list =  [inflections[tag] for tag in tags]
                    suggestion = [i for sub in suggestion_list for i in sub]
                    return suggestion
            elif list_of_suggestions is not None and startswith is None:
                return list_of_suggestions
    
        def get_google_ngram_prob(self, n_gram):
            """ gets probability for given n_gram """
            url = f"https://books.google.com/ngrams/json?content={' '.join(n_gram)}&case_insensitive=true"
            successful = False
            wait_time = 0.0001
            while not successful:
                response = requests.get(url)
                sleep(wait_time)
                if response.ok:
                    successful = True
                    results = json.loads(response.content)
                    if results:
                        max_prob = 0.0
                        for result in results:
                            cur_max_prob = max(results[0]["timeseries"])
                            max_prob = cur_max_prob if cur_max_prob > max_prob else max_prob
                        return max_prob
                    else:
                        return None
                if not successful:
                    if wait_time < 10:
                        # print(f"no response: increasing wait time from {wait_time} to {wait_time*10}.")
                        wait_time *= 10
                    else:
                        pass
                        # print("still no response.")
            
        def get_prob_of_n_gram(self, n_gram, probs, i):
            """ calculates probability of n_gram """
            # get n_gram probability
            prob = self.get_google_ngram_prob(n_gram)
            probs[i] = prob if prob != 0.0 and prob != None else self.default_prob
    
        def get_probs_of_n_grams(self, n_grams):
            # create target list
            probs = [None]*len(n_grams)
            # create and start threads
            threads = []
            for i, n_gram in enumerate(n_grams):
                threads.append(threading.Thread(target=self.get_prob_of_n_gram, args=(n_gram, probs, i)))
                threads[-1].setDaemon(True)
                threads[-1].start()
            # join threads
            for thread in threads:
                thread.join()
            return probs
    
        def get_word_indexes_from_n_gram_index(self, n, n_gram_index):
            word_indexes = [n_gram_index]
            for i in range(n-1):
                word_indexes.append(word_indexes[-1]+1)
            return word_indexes
    
        def get_n_gram_indexes_from_word_index(self, n, n_gram_cnt, word_index):
            n_gram_indexes = [0] if word_index < n else [word_index-n+1]
            for i in range(word_index%n if word_index < n else n-1):
                nxt = n_gram_indexes[-1]+1
                if nxt < n_gram_cnt:
                    n_gram_indexes.append(nxt)
            return n_gram_indexes
    
        def find_index_of_error(self, n_grams):
            """ finds index of greatest error in n_grams"""
            # get probabilities for all n_grams
            probs = {}
            thresholds_passed = {}
            smallest_prob_counter = {2: {i: 0 for i in range(len(n_grams[1]))}, 3: {i: 0 for i in range(len(n_grams[1]))}}
            for n in n_grams:
                # don't take 1-grams into account
                if n == 1:
                    continue
                # smallest prob
                probs[n] = self.get_probs_of_n_grams(n_grams[n])
                try:
                    for index in self.get_word_indexes_from_n_gram_index(n, probs[n].index(min(probs[n]))):
                        smallest_prob_counter[n][index] += 1
                except Exception:
                    pass
                # threshholds check
                if np.prod(probs[n]) == 0:
                    thresholds_passed[n] = True
                else:
                    thresholds_passed[n] = -np.log10((np.prod(probs[n]))**(1/len(n_grams[n]))) <= self.thresholds[n]
    
            # determine indexes of errors
            i_errors = []
            max_counter = 0
            total_smallest_prob_counter = {i: 0 for i in range(len(n_grams[1]))}
            for n, smallest_probs in smallest_prob_counter.items():
                if True:  # thresholds_passed[n]:
                    for index in total_smallest_prob_counter:
                        total_smallest_prob_counter[index] += smallest_probs[index]
            for index, counter in sorted(total_smallest_prob_counter.items(), key=lambda index: -index[1]):
                if counter >= max_counter and counter != 0:
                    i_errors.append(index)
                    max_counter = counter
    
    
    Heiko Raible's avatar
    Heiko Raible committed
            print(f"below thresholds? bi-gram: {thresholds_passed[2]}, tri-gram: {thresholds_passed[3]}")
            print("notice: sentence level thresholds ignored. we currently always assume an error.")
    
    Heiko Raible's avatar
    Heiko Raible committed
            return i_errors
    
    
    if __name__ == "__main__":
        # get sentences
        tester = Tester()
    
        # create grammar checker
        grammar_checker = GrammarChecker()
    
        # check sentences
        print("CORRECT SENTENCES\n\n")
        for sentence in tester.correct_sentences:
            print(sentence.text)
            grammar_checker.check(sentence.text)
            print()
    
        print("\nTYPE 1 ERROR SENTENCES\n\n")
        for sentence in tester.type_1_error_sentences:
            print(sentence.text)
            print(sentence.original)
            grammar_checker.check(sentence.text)
            print()
    
        print("\nTYPE 2 ERROR SENTENCES\n\n")
        for sentence in tester.type_2_error_sentences:
            print(sentence.text)
            print(sentence.original)
            grammar_checker.check(sentence.text)
            print()
    
        print("\nTYPE 3 ERROR SENTENCES\n\n")
        for sentence in tester.type_3_error_sentences:
            print(sentence.text)
            print(sentence.original)
            grammar_checker.check(sentence.text)
            print()