From 0dd6fcaf2213c6e5f53ec4bc4dbe320469941e81 Mon Sep 17 00:00:00 2001 From: Naa <Heiko.Raible@gmail.com> Date: Thu, 25 Nov 2021 11:16:26 +0100 Subject: [PATCH] fix code cleanup --- src/grammar_checker.py | 111 +++++++++++++++++++---------------------- 1 file changed, 50 insertions(+), 61 deletions(-) diff --git a/src/grammar_checker.py b/src/grammar_checker.py index 0d2512b..be0aab9 100644 --- a/src/grammar_checker.py +++ b/src/grammar_checker.py @@ -1,7 +1,6 @@ +import sys import json import nltk -from typing import Union -import sys from nltk.util import ngrams @@ -21,42 +20,6 @@ class GrammarChecker: with open(f"../data/corpus_n={n}.json", "r") as infile: self.corpus = json.load(infile) - def get_probs_of_gram(self,gram: Union[list,tuple]) -> float: - # Get first character of toekn/word in gram - first_char = gram[0][0] - # Get length of gram -1 for preceeding tokens - length_gram = len(gram) -1 - # Get first n-1 tokens in gram - first_n_1_token = gram[:length_gram] - # Search corpus for first character - search_corpus =self.corpus[first_char] - #Initiate count for total of grams with preceding tokens and - #also count for total grams with exactly the same gram - total_value = 0 - conditional_value = 0 - for key, value in search_corpus.items(): - # split words into list of tokens//grams in corpus - splitted = key.split() - if splitted[:length_gram] == list(first_n_1_token): - total_value += value - if splitted == list(gram): - conditional_value += value - try: - prob = conditional_value/total_value - if prob == 0.0: - prob = 1/sys.maxsize - except ZeroDivisionError: - prob = 1/sys.maxsize - return prob - - def get_chained_probability(self,list_of_ngrams:list) -> list: - list_of_probs=[self.get_probs_of_gram(i) for i in list_of_ngrams] - # list_of_cum_probs = [get_probs_of_gram(i,dictionary) for i in split_sliding_ngram(input,n)[:position]] - list_of_cum =[list_of_probs[0]] - for i in range(1,len(list_of_probs)): - 
list_of_cum.append(list_of_cum[-1]*list_of_probs[i]) - return list_of_cum - def check(self, sentence): """ checks a sentence for errors and recursively corrects the first one """ # add padding @@ -64,9 +27,10 @@ class GrammarChecker: sentence = self.start_tag*(self.n-1) + sentence + self.end_tag*(self.n-1) # create n_grams list n_grams = list(nltk.ngrams(self.tokenizer.tokenize(sentence), self.n)) - # find first error and it's correction - i_error = self.find_gram_lowest_prob(n_grams) - print(n_grams[i_error]) + # find error + i_error = self.find_index_of_error(n_grams) + print(f"erroneous n-gram:\n{n_grams[i_error]} at position {i_error}") + # if error detected # if i_error: # # find correction @@ -90,23 +54,50 @@ class GrammarChecker: # # return sentence # return sentence - # def find_first_error(self, n_grams): - # """ finds error (now look up, later markov chains) """ - # # iterate through n_grams - # for i, n_gram in enumerate(n_grams): - # # determine first character for look up in corpus - # initial_char = n_gram[0][0] - # # create n_gram_str to look up - # n_gram_str = " ".join(n_gram) - # # if n_gram is not in corpus - # if n_gram_str not in self.corpus[initial_char]: - # return i - # return None + def get_prob_of_n_gram(self, n_gram): + """ calculates probability of n_gram """ + # smallest possible positive float (1e-324 == 0.0) + float_min = 1e-323 + # get first character n_gram and n_gram without last word + first_char = n_gram[0][0] + all_but_last_tokens = n_gram[:-1] + # if n_gram isn't in corpus + if n_gram not in self.corpus[first_char]: + # return smallest possible positive float + return float_min + # get n_gram occurences and total occurences starting with the same n-1 words + n_gram_occurrences = self.corpus[first_char][n_gram] + total_value = 0 + for key, value in self.corpus[first_char].items(): + # split key string into list of tokens + splitted = key.split(" ") + # if first n-1 words are the same as of n_gram + if splitted[:-1] == 
list(all_but_last_tokens): + # add occurrences to total number of occurrences + total_value += value + # calculate n_gram probability + prob = n_gram_occurrences/total_value + # return it if it's not 0, else return smallest possible positive float + return prob if prob != 0.0 else float_min + + def get_chained_probabilities(self, probs): + """ get list of cumulative markov chains for probs """ + chained_probs =[probs[0]] + for i in range(1, len(probs)): + chained_probs.append(chained_probs[-1]*probs[i]) + return chained_probs - def find_gram_lowest_prob(self,ngrams:list): - probs_list = self.get_chained_probability(ngrams) - diff_list = [abs(j-i) for i, j in zip(probs_list[:-1], probs_list[1:])] - print(probs_list,"\n",diff_list) + def find_index_of_error(self, n_grams): + """ finds index of greatest error in n_grams""" + # get probabilities of n_grams + probs = [self.get_prob_of_n_gram(n_gram) for n_gram in n_grams] + print(f"\nlist of probabilities:\n{probs}\n") + # get cumulative chained probs + chained_probs = self.get_chained_probabilities(probs) + print(f"list of chained probabilities:\n{chained_probs}\n") + # calculate differences between values in chained_probs + diff_list = [abs(j-i) for i, j in zip(chained_probs[:-1], chained_probs[1:])] + print(f"list of differences:\n{diff_list}\n") return diff_list.index(max(diff_list))+1 def find_correction(self, bad_n_gram): @@ -124,10 +115,8 @@ class GrammarChecker: if __name__ == "__main__": - text = "Note that in all contour time-integrals we essentially integrates _MATH_." - # text = "Optimal filters was categorized to recursive and batch filters." + text = "Note that in all contour time-integral we essentially integrate over _MATH_.\n" + print(text) grammar_checker = GrammarChecker(3) grammar_checker.check(text) - print(text) - -- GitLab