Commit 9cd37f64 authored by Jeremy Mah Zhee Kein

Merge branch 'pos_tag_threshold' into 'main'

Pos tag threshold

See merge request !7
parents 0aa038c2 3a3f7f9d
{"TO": {"TO": 473369}, "VB": {"VB": 648041}, "DT": {"DT": 3074013}, "JJR": {"JJR": 68807}, "NN": {"NN": 4094732}, "IN": {"IN": 3328924}, "PRP": {"PRP": 423778}, "VBP": {"VBP": 520568}, "NNP": {"NNP": 865041}, "CC": {"CC": 711476}, "NNS": {"NNS": 1433890}, "JJ": {"JJ": 2288679}, "MATH": {"MATH": 1278037}, ",": {",": 1349627}, "MATHDISP": {"MATHDISP": 188349}, "WRB": {"WRB": 120758}, "CD": {"CD": 379731}, "RB": {"RB": 842861}, ".": {".": 1176329}, "VBZ": {"VBZ": 895876}, "MD": {"MD": 233961}, "VBG": {"VBG": 382263}, "REF": {"REF": 226499}, "VBN": {"VBN": 732389}, ":": {":": 129205}, "(": {"(": 336356}, ")": {")": 337872}, "RBR": {"RBR": 29970}, "WDT": {"WDT": 149617}, "JJS": {"JJS": 34222}, "VBD": {"VBD": 176140}, "EX": {"EX": 40107}, "CITE": {"CITE": 152054}, "WP": {"WP": 8880}, "``": {"``": 20466}, "''": {"''": 21342}, "NNPS": {"NNPS": 2245}, "POS": {"POS": 29793}, "PRP$": {"PRP$": 93378}, "FW": {"FW": 15290}, "RP": {"RP": 16874}, "PDT": {"PDT": 14990}, "RBS": {"RBS": 7680}, "WP$": {"WP$": 5088}, "LS": {"LS": 114}, "$": {"$": 2387}, "UH": {"UH": 669}, "SYM": {"SYM": 151}, "#": {"#": 343}}
\ No newline at end of file
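The JSON above is the unigram (n = 1) POS-tag count file added by this merge: each tag (NLTK's Penn Treebank tagset, plus the custom MATH, MATHDISP, REF and CITE placeholder tags) maps to its occurrence count in the corpus. A minimal sketch of how such a file yields the default probability used by the checker, mirroring the 100/sum(counts) computation in the evaluation script below; the file name is an assumption following the corpus_POS_n={n}.json pattern used there:

import json

# Assumed path, following the corpus_POS_n={n}.json naming convention.
with open("../data/corpus_POS_n=1.json") as infile:
    counts = json.load(infile)
# each inner dict holds one count per tag, e.g. {"NN": {"NN": 4094732}}
total = sum(v for inner in counts.values() for v in inner.values())
default_prob = 100 / total  # same heuristic as in the evaluation script
print(f"total tagged tokens: {total}, default_prob: {default_prob:.3e}")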
Two additional file diffs in this commit are collapsed.
@@ -241,5 +241,5 @@ if __name__ == "__main__":
     # threshold = 10
     # print(f"threshold={threshold}")
     # trim_corpus(n, threshold)
-    print(get_google_ngram_occurences( "I lovess Jesus","eng_2012",2002,2003,2003))
-    # create_POS_corpus(3)
+    # print(get_google_ngram_occurences( "I lovess Jesus","eng_2012",2002,2003,2003))
+    create_POS_corpus(4)
\ No newline at end of file
import sys
import json
import nltk
from nltk.util import ngrams
from nltk import pos_tag
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix
import logging
handler = logging.StreamHandler(sys.stdout)
handler.setLevel(logging.DEBUG)
log = logging.getLogger(__name__)
log.setLevel(logging.DEBUG)
log.addHandler(handler)
class GrammarCheckerWithPOS:
    def __init__(self, n, default_prob):
        # required NLTK resources: tokenizer models and the POS tagger
        nltk.download('punkt')
        nltk.download('averaged_perceptron_tagger')
        # variables
        self.n = n
        self.default_prob = default_prob
        self.start_tag = "START "
        self.end_tag = " END"
        self.__old_sentence = None
        # tokenizer and tagger
        self.tokenizer = nltk.RegexpTokenizer(r"\w+")
        self.pos_tag = pos_tag
        # load the POS n-gram corpus
        self.corpus = {}
        with open(f"../data/corpus_POS_n={n}.json", "r") as infile:
            self.corpus = json.load(infile)
    def check(self, sentence, threshold):
        """ checks a sentence for errors; returns 1 if an error is detected, else 0
        (the recursive correction step is currently disabled below) """
        sentence = sentence.lower()
        self.threshold = threshold
        # add padding
        # self.__old_sentence = self.tokenizer.tokenize(sentence)
        # sentence = self.start_tag*(self.n-1) + sentence + self.end_tag*(self.n-1)
        # build n-gram lists over the POS tags and over the words
        tokens_with_tags = pos_tag(self.tokenizer.tokenize(sentence))
        tags = [tag for token, tag in tokens_with_tags]
        words = [token for token, tag in tokens_with_tags]
        n_grams = list(ngrams(tags, self.n))
        word_n_grams = list(ngrams(words, self.n))
        # find error
        i_error = self.find_index_of_error(n_grams)
        if i_error is not None:
            # print(f"erroneous n-gram:\n{word_n_grams[i_error]} at position {i_error}")
            return 1
        else:
            return 0
# if error detected
# if i_error:
# # find correction
# correction = self.find_correction(n_grams[i_error])
# # if correction found
# if correction:
# # apply correction
# corrected_sentence = ""
# for i, n_gram in enumerate(n_grams[:-2]):
# corrected_sentence += correction.split(" ")[-1] if i == i_error else n_gram[-1]
# if i+1 != len(n_grams[:-2]):
# corrected_sentence += " "
# # recheck
# if i_error and " ".join(self.__old_sentence) != corrected_sentence:
# return self.check(corrected_sentence)
# else:
# raise Exception("Could not determine correction for mistake!")
# else:
# raise Exception("Could not determine correction for mistake!")
# else:
# # return sentence
# return sentence
    def get_prob_of_n_gram(self, n_gram):
        """ calculates the conditional probability of n_gram """
        # smallest possible positive float (1e-324 == 0.0)
        # float_min = 1e-323
        # first tag of the n-gram and the n-gram without its last tag
        first_tag = n_gram[0]
        all_but_last_tokens = n_gram[:-1]
        n_gram = " ".join(n_gram)
        # if the n-gram isn't in the corpus, return the default probability
        if n_gram not in self.corpus[first_tag]:
            return self.default_prob
        # get the n-gram's occurrences and the total occurrences of all
        # n-grams starting with the same n-1 tags
        n_gram_occurrences = self.corpus[first_tag][n_gram]
        total_value = 0
        for key, value in self.corpus[first_tag].items():
            # split key string into a list of tags
            splitted = key.split(" ")
            # if the first n-1 tags are the same as those of n_gram
            if splitted[:-1] == list(all_but_last_tokens):
                # add its occurrences to the total
                total_value += value
        # calculate the n-gram's probability
        prob = n_gram_occurrences / total_value
        # return it if it's not 0, else return the default probability
        return prob if prob != 0.0 else self.default_prob
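    # Example: with n = 3, get_prob_of_n_gram(("DT", "NN", "VBZ")) returns
    # corpus["DT"]["DT NN VBZ"] divided by the summed counts of all
    # corpus["DT"] entries whose first two tags are "DT NN", i.e. an
    # estimate of P(VBZ | DT, NN).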
    def get_chained_probabilities(self, probs):
        """ get list of cumulative products (chained probabilities) of probs """
        chained_probs = [probs[0]]
        for i in range(1, len(probs)):
            chained_probs.append(chained_probs[-1] * probs[i])
        return chained_probs
    def find_index_of_error(self, n_grams):
        """ finds the index of the least probable n-gram, or None if the sentence clears the threshold """
        # print("N-grams: " + str(n_grams))
        # get probabilities of n_grams
        if len(n_grams) != 0:
            probs = [self.get_prob_of_n_gram(n_gram) for n_gram in n_grams]
        else:
            probs = [self.default_prob]
        # get cumulative chained probs
        chained_probs = self.get_chained_probabilities(probs)
        # print((chained_probs[-1])**(1/len(n_grams)))
        # print(f"list of chained probabilities:\n{chained_probs}\n")
        # differences between consecutive values in chained_probs (currently unused)
        diff_list = [abs(j - i) for i, j in zip(chained_probs[:-1], chained_probs[1:])]
        # print(f"list of differences:\n{diff_list}\n")
        # flag an error if the geometric mean of the n-gram probabilities,
        # chained_probs[-1] ** (1/len(n_grams)), is at most the threshold
        if len(n_grams) != 0:
            return probs.index(min(probs)) if (chained_probs[-1]) ** (1 / len(n_grams)) <= self.threshold else None
        else:
            return probs.index(min(probs))
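    # Worked example of the threshold rule above (illustrative numbers):
    # probs = [0.5, 0.1] gives chained_probs = [0.5, 0.05]; the geometric mean
    # is 0.05 ** (1/2) ~= 0.224, so the sentence is flagged for any threshold
    # >= 0.224, and the returned index is probs.index(0.1) == 1, i.e. the
    # least probable n-gram.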
# def find_correction(self, bad_n_gram):
# """ corrects error (now most occurrences, later combination with character similarity) """
# # create string from first n-1 words of n_gram
# n_gram_str = " ".join(bad_n_gram[:-1])
# # collect suggestions
# suggestions = []
# for n_gram, occurrence in self.corpus[n_gram_str[0]].items():
# if n_gram.startswith(n_gram_str):
# suggestions.append([occurrence, n_gram])
# # determine winner (now returns n_gram with most occurrences)
# suggestions = sorted(suggestions)
# return suggestions[-1][1] if suggestions else None
if __name__ == "__main__":
# text = "Note that in are contour time-integrals we essentially integrate over _MATH_."
# sentence_list = [
# "If many node moves by M1, it will not make any other membership move.",
# "I sees movies",
# "There cans be at most _MATH_ membership moves.",
# "In particular, the sequence _MATH_ has a limit in the sense of Painleve-Kuratowski if and only if the following conditions hold: _MATHDISP_.",
# "If _MATH_ is small enough relative to _MATH_, then we will see that this number exceeds _MATH_.",
# "Obviously, networks evolve over time, and even static systems will need to update the membership list now and then, but the presumption is that this can be treated as an offline activity.",
# "We use the fact that the difference between _MATH_ and _MATH_ is at most the total number of arrivals and departures during the time interval between _MATH_ and _MATH_. ",
# "In this distorted image, we needs to find the point that corresponds to the true center of the solar disk.",
# "I don't texts while driving"
# ]
# print(text)
with open("../data/corpus_POS_n=3.json") as corpusjson:
corpus3 = json.load(corpusjson)
list_of_values_3 = []
for key in corpus3.keys():
values = list(corpus3[key].values())
list_of_values_3.append(values)
flat_list_3 = [item for sublist in list_of_values_3 for item in sublist]
grammar_checker = GrammarCheckerWithPOS(3,100/sum(flat_list_3))
testdf = pd.read_csv("../testsentences.csv",sep=";")
list_of_thresholds = [x for x in np.linspace(0.01,0.4,20)]
list_of_precision = list()
list_of_recall = list()
list_of_f1 = list()
for threshold in list_of_thresholds:
try:
testdf["results"] = testdf["Sentence"].apply(lambda x: grammar_checker.check(x,threshold))
truth = testdf["Incorrect"]
pred = testdf["results"]
confmat = confusion_matrix(truth,pred)
precision = confmat[1][1]/(confmat[1][1]+confmat[0][1])
recall = confmat[1][1]/(confmat[1][1]+confmat[1][0])
f1score = (2*precision*recall)/(precision+recall)
list_of_precision.append(precision)
list_of_recall.append(recall)
list_of_f1.append(f1score)
except Exception as e:
log.error(e)
df_list = [list_of_thresholds,list_of_precision,list_of_recall,list_of_f1]
result = pd.DataFrame(df_list).transpose()
result.columns= ["threshold","precision","recall","f1"]
print(result)
\ No newline at end of file
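A minimal usage sketch for the class above, assuming ../data/corpus_POS_n=3.json exists; the threshold and default probability here are illustrative values, not tuned ones:

# Illustrative only: instantiate the checker and flag single sentences.
checker = GrammarCheckerWithPOS(3, default_prob=1e-7)  # 1e-7 is an arbitrary example
for sentence in ["I sees movies", "I like movies"]:
    flagged = checker.check(sentence, threshold=0.1)
    print(sentence, "->", "error" if flagged else "ok")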