Upload New File

Merged Heiko Raible requested to merge stheraib-main-patch-70913 into main
1 file  +314  −0
 
import json
import threading
from time import sleep

import nltk
import numpy as np
import requests
from nltk import pos_tag
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from pyinflect import getAllInflections

from tester import Tester
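# Assumed third-party requirements for this script (they are not pinned anywhere in this MR):
# nltk, numpy, requests and pyinflect, plus the local tester.py module that provides Tester.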
 
 
class GrammarChecker:

    def __init__(self):
        # required NLTK resources
        nltk.download('punkt')
        nltk.download('averaged_perceptron_tagger')
        nltk.download('wordnet')
        # variables
        self.default_prob = 1e-10
        self.thresholds = {2: 5.6, 3: 7.7}
        # tokenizer
        self.tokenizer = nltk.RegexpTokenizer(r"\w+")
        # lemmatizer
        self.lemmatizer = WordNetLemmatizer()
 
    def check(self, sentence):
        """ checks a sentence for errors and recursively corrects the first one """
        # lower-case the sentence and tokenize it once
        sentence = sentence.lower()
        tokens = self.tokenizer.tokenize(sentence)
        # create n-grams
        n_grams = {n: list(nltk.ngrams(tokens, n)) for n in (1, 2, 3)}
        # find errors
        i_errors = self.find_index_of_error(n_grams)
        # get corrections
        unigrams, i_corrections = self.get_corrections(n_grams, i_errors)
        print(f"unigrams: {unigrams}")
        print(f"i_corrections: {i_corrections}")
        return unigrams, i_corrections
 
 
    def get_corrections(self, n_grams, i_errors):
        """ gets corrections for errors """
        # get unigrams and create result corrections dict
        unigrams = [unigram[0] for unigram in n_grams[1]]
        i_corrections = {i_error: unigrams[i_error] for i_error in i_errors}

        # if errors are found
        if i_corrections:
            # collect probabilities of inflections for all errors
            probs = {}
            for i_error, word in i_corrections.items():
                probs[i_error] = {}
                try:
                    inflections = set(self.suggest_inflections(word))
                except Exception:
                    # no inflections could be suggested for this word
                    continue
                for n in n_grams:
                    if n == 1:
                        continue
                    probs[i_error][n] = {}
                    n_gram_indexes = self.get_n_gram_indexes_from_word_index(n, len(n_grams[n]), i_error)
                    error_n_grams = [n_grams[n][n_gram_index] for n_gram_index in n_gram_indexes]
                    # threads for checking error_n_grams with inflections in parallel
                    threads = []
                    for error_n_gram in error_n_grams:
                        threads.append(threading.Thread(target=self.check_n_gram_inflections, args=(probs, i_error, n, error_n_gram, inflections, word)))
                        threads[-1].daemon = True
                        threads[-1].start()
                    for thread in threads:
                        thread.join()

            # voting mechanism: accumulate each inflection's probability over all n-gram sizes
            prob_accumulator = {}
            for i_error, ns in probs.items():
                prob_accumulator[i_error] = {}
                for n, error_n_grams in ns.items():
                    for error_n_gram, inflections in error_n_grams.items():
                        for inflection, prob in inflections.items():
                            if inflection in prob_accumulator[i_error]:
                                prob_accumulator[i_error][inflection] += prob
                            else:
                                prob_accumulator[i_error][inflection] = prob

            # determine best inflections
            for i_error, inflections in prob_accumulator.items():
                if inflections:
                    i_corrections[i_error] = sorted(inflections.items(), key=lambda item: -item[1])[0][0]

        return unigrams, i_corrections
 
 
    def check_n_gram_inflections(self, probs, i_error, n, error_n_gram, inflections, word):
        """ fills probs with the probability of every inflection substituted into error_n_gram """
        probs[i_error][n][error_n_gram] = {}
        # fix the iteration order so both passes below see the inflections in the same sequence
        inflections = list(inflections)
        inflection_n_grams = []
        for inflection in inflections:
            tmp = list(error_n_gram)
            index = tmp.index(word)
            tmp[index] = inflection
            inflection_n_grams.append(tmp)
        inflection_probs = self.get_probs_of_n_grams(inflection_n_grams)
        for i, inflection in enumerate(inflections):
            probs[i_error][n][error_n_gram][inflection] = inflection_probs[i]
 
 
    def suggest_inflections(self, word):
        """ suggests alternative word forms based on the word's POS tag """
        pos = pos_tag([word])[0][1]
        if pos.startswith("N"):
            # nouns mapped with noun markers
            startswith = "N"
            lemmparam = "n"
            list_of_suggestions = None
        elif pos.startswith("R"):
            # adverbs mapped with adverb markers
            startswith = "A"
            lemmparam = "r"
            list_of_suggestions = None
        elif pos.startswith("J"):
            # adjectives mapped with adjective markers
            startswith = "A"
            lemmparam = "a"
            list_of_suggestions = None
        elif pos.startswith("V"):
            # verbs mapped with verb markers
            startswith = "V"
            lemmparam = "v"
            list_of_suggestions = None
        elif pos == "PRP" or pos == "PRP$":
            # if the word is a (possessive) pronoun, try all pronouns
            list_of_suggestions = ["I", "you", "he", "she", "it", "we", "they", "me", "him", "her", "us",
                                   "my", "mine", "our", "ours", "its", "his", "hers", "their", "theirs",
                                   "your", "yours"]
            startswith = None
        else:
            # else, return nothing
            startswith = None
            list_of_suggestions = None

        if list_of_suggestions is None and startswith is not None:
            # return inflections of the given word based on its POS tag
            if lemmparam == "r":
                # for adverbs, look for a related (pertainym) form that shares a stem with the word
                s = []
                suggestion = ""
                for ss in wn.synsets(word):
                    for lemma in ss.lemmas():  # all possible lemmas
                        s.append(lemma)
                for pers in s:
                    posword = pers.pertainyms()
                    if len(posword) == 0:
                        continue
                    posword = posword[0].name()
                    if posword[0:3] == word[0:3] or posword[0:4] == word[0:4]:
                        suggestion = posword
                        break
                word = self.lemmatizer.lemmatize(suggestion, lemmparam)
            else:
                word = self.lemmatizer.lemmatize(word, lemmparam)
            inflections = getAllInflections(word)
            return [form for forms in inflections.values() for form in forms]
        elif list_of_suggestions is not None and startswith is None:
            return list_of_suggestions
 
 
    def get_google_ngram_prob(self, n_gram):
        """ gets probability for given n_gram from the Google Books Ngram Viewer """
        url = f"https://books.google.com/ngrams/json?content={' '.join(n_gram)}&case_insensitive=true"
        successful = False
        wait_time = 0.0001
        while not successful:
            response = requests.get(url)
            sleep(wait_time)
            if response.ok:
                successful = True
                results = json.loads(response.content)
                if results:
                    # take the highest relative frequency over all returned time series
                    max_prob = 0.0
                    for result in results:
                        cur_max_prob = max(result["timeseries"])
                        max_prob = cur_max_prob if cur_max_prob > max_prob else max_prob
                    return max_prob
                else:
                    return None
            if not successful:
                if wait_time < 10:
                    # print(f"no response: increasing wait time from {wait_time} to {wait_time*10}.")
                    wait_time *= 10
                else:
                    pass
                    # print("still no response.")
 
 
    def get_prob_of_n_gram(self, n_gram, probs, i):
        """ calculates probability of n_gram """
        # get n_gram probability, falling back to a small default for missing or zero counts
        prob = self.get_google_ngram_prob(n_gram)
        probs[i] = prob if prob else self.default_prob
 
 
    def get_probs_of_n_grams(self, n_grams):
        # create target list
        probs = [None] * len(n_grams)
        # create and start threads
        threads = []
        for i, n_gram in enumerate(n_grams):
            threads.append(threading.Thread(target=self.get_prob_of_n_gram, args=(n_gram, probs, i)))
            threads[-1].daemon = True
            threads[-1].start()
        # join threads
        for thread in threads:
            thread.join()
        return probs
 
 
    def get_word_indexes_from_n_gram_index(self, n, n_gram_index):
        """ maps an n_gram index to the indexes of the words it covers """
        word_indexes = [n_gram_index]
        for i in range(n - 1):
            word_indexes.append(word_indexes[-1] + 1)
        return word_indexes
 
 
    def get_n_gram_indexes_from_word_index(self, n, n_gram_cnt, word_index):
        """ maps a word index to the indexes of the n_grams that contain it """
        n_gram_indexes = [0] if word_index < n else [word_index - n + 1]
        for i in range(word_index % n if word_index < n else n - 1):
            nxt = n_gram_indexes[-1] + 1
            if nxt < n_gram_cnt:
                n_gram_indexes.append(nxt)
        return n_gram_indexes
 
 
    def find_index_of_error(self, n_grams):
        """ finds index of greatest error in n_grams """
        # get probabilities for all n_grams
        probs = {}
        thresholds_passed = {}
        smallest_prob_counter = {2: {i: 0 for i in range(len(n_grams[1]))}, 3: {i: 0 for i in range(len(n_grams[1]))}}
        for n in n_grams:
            # don't take 1-grams into account
            if n == 1:
                continue
            # guard against sentences shorter than n words
            if not n_grams[n]:
                thresholds_passed[n] = False
                continue
            # count how often each word position belongs to the least probable n_gram
            probs[n] = self.get_probs_of_n_grams(n_grams[n])
            try:
                for index in self.get_word_indexes_from_n_gram_index(n, probs[n].index(min(probs[n]))):
                    smallest_prob_counter[n][index] += 1
            except Exception:
                pass
            # thresholds check
            if np.prod(probs[n]) == 0:
                thresholds_passed[n] = True
            else:
                thresholds_passed[n] = -np.log10((np.prod(probs[n])) ** (1 / len(n_grams[n]))) <= self.thresholds[n]

        # determine indexes of errors
        i_errors = []
        max_counter = 0
        total_smallest_prob_counter = {i: 0 for i in range(len(n_grams[1]))}
        for n, smallest_probs in smallest_prob_counter.items():
            if True:  # thresholds_passed[n]:
                for index in total_smallest_prob_counter:
                    total_smallest_prob_counter[index] += smallest_probs[index]
        for index, counter in sorted(total_smallest_prob_counter.items(), key=lambda item: -item[1]):
            if counter >= max_counter and counter != 0:
                i_errors.append(index)
                max_counter = counter

        print(f"thresholds_passed: {thresholds_passed}")
        return i_errors
 
 
 
if __name__ == "__main__":
    # get sentences
    tester = Tester()

    # create grammar checker
    grammar_checker = GrammarChecker()

    # check sentences
    print("CORRECT SENTENCES\n\n")
    for sentence in tester.correct_sentences:
        print(sentence.text)
        grammar_checker.check(sentence.text)
        print()

    print("\nTYPE 1 ERROR SENTENCES\n\n")
    for sentence in tester.type_1_error_sentences:
        print(sentence.text)
        print(sentence.original)
        grammar_checker.check(sentence.text)
        print()

    print("\nTYPE 2 ERROR SENTENCES\n\n")
    for sentence in tester.type_2_error_sentences:
        print(sentence.text)
        print(sentence.original)
        grammar_checker.check(sentence.text)
        print()

    print("\nTYPE 3 ERROR SENTENCES\n\n")
    for sentence in tester.type_3_error_sentences:
        print(sentence.text)
        print(sentence.original)
        grammar_checker.check(sentence.text)
        print()
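# Standalone usage sketch without tester.py (the sentence below is a made-up example,
# not part of the test data referenced by this MR):
#
#   checker = GrammarChecker()
#   unigrams, corrections = checker.check("he go to school every day")
#   # `unigrams` is the tokenized sentence; `corrections` maps word indexes to suggested forms.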