push after meeting 25.11.2021

1811e34b · jmzk96 · 2a535025 · 1811e34b
Commit 1811e34b authored 3 years ago by jmzk96
--- a/src/grammar_checker.py
+++ b/src/grammar_checker.py
@@ -21,6 +21,42 @@ class GrammarChecker:
        with open(f"../data/corpus_n={n}.json", "r") as infile:
            self.corpus = json.load(infile)

+    def get_probs_of_gram(self,gram: Union[list,tuple]) -> float:
+        # Get first character of the first token/word in gram
+        first_char = gram[0][0]
+        # Get length of gram minus 1, i.e. the number of preceding tokens
+        length_gram = len(gram) -1
+        # Get first n-1 tokens in gram
+        first_n_1_token = gram[:length_gram]
+        # Search corpus for first character
+        search_corpus =self.corpus[first_char]
+        # Initialise the count of all grams that share the preceding tokens and
+        # the count of grams that match this gram exactly
+        total_value = 0
+        conditional_value = 0
+        for key, value  in search_corpus.items():
+            # split the corpus key (a space-separated gram) into its list of tokens
+            splitted = key.split()
+            if  splitted[:length_gram] == list(first_n_1_token):
+                total_value += value
+            if splitted == list(gram):
+                conditional_value += value
+        try:
+            prob = conditional_value/total_value
+            if prob == 0.0:
+                prob = 1/sys.maxsize
+        except ZeroDivisionError:
+            prob = 1/sys.maxsize
+        return prob
+
+    def get_chained_probability(self,list_of_ngrams:list) -> list:
+        list_of_probs=[self.get_probs_of_gram(i) for i in list_of_ngrams]
+        # list_of_cum_probs = [get_probs_of_gram(i,dictionary) for i in split_sliding_ngram(input,n)[:position]]
+        list_of_cum =[list_of_probs[0]]
+        for i in range(1,len(list_of_probs)):
+            list_of_cum.append(list_of_cum[-1]*list_of_probs[i])
+        return  list_of_cum
+
    def check(self, sentence):
        """ checks a sentence for errors and recursively corrects the first one """
        # add padding
@@ -28,9 +64,9 @@ class GrammarChecker:
        sentence = self.start_tag*(self.n-1) + sentence + self.end_tag*(self.n-1)
        # create n_grams list
        n_grams = list(nltk.ngrams(self.tokenizer.tokenize(sentence), self.n))
-        # find error
-        i_error = self.find_index_of_error(n_grams)
-        print(f"erroneous n-gram:\n{n_grams[i_error]}")
+        # find first error and its correction
+        i_error = self.find_gram_lowest_prob(n_grams)
+        print(n_grams[i_error])
        # if error detected
        # if i_error:
        #     # find correction
@@ -67,48 +103,10 @@ class GrammarChecker:
    #             return i
    #     return None

-    def get_prob_of_gram(self, n_gram):
-        """ calculates probability of n_gram """
-        # smallest possible positive float (1e-324 == 0.0)
-        float_min = 1e-323
-        # get first character n_gram and n_gram without last word
-        first_char = n_gram[0][0]
-        all_but_last_tokens = n_gram[:-1]
-        # if n_gram isn't in corpus
-        if n_gram not in self.corpus[first_char]:
-            # return smallest possible positive float
-            return float_min
-        # get n_gram occurences and total occurences starting with the same n-1 words
-        n_gram_occurrences = self.corpus[first_char][n_gram]
-        total_value = 0
-        for key, value in self.corpus[first_char].items():
-            # split key string into list of tokens
-            splitted = key.split(" ")
-            # if first n-1 words are the same as of n_gram
-            if splitted[:-1] == list(all_but_last_tokens):
-                # add occurences to total number of occurrences
-                total_value += value
-        # calculate n_gram probability
-        prob = conditional_value/total_value
-        # return it if it's not 0, else return smallest possible positive float
-        return prob if prob != 0.0 else smallest_possible_positive_float
-
-    def get_chained_probabilities(self, n_grams):
-        """ get list of cumulative markov chains for n_grams """
-        probs = [self.get_prob_of_gram(n_gram) for n_gram in n_grams]
-        print(f"list of probabilities:\n{probs}\n")
-        chained_probs =[probs[0]]
-        for i in range(1, len(list_of_probs)):
-            chained_probs.append(chained_probs[-1]*probs[i])
-        print(f"list of chained probabilities:\n{chained_probs}\n")
-        return chained_probs
-
-    def find_index_of_error(self, n_grams):
-        """ finds index of greatest error in n_grams"""
-        chained_probs = self.get_chained_probabilities(n_grams)
-        # calculate differences between values in chained_probs
-        diff_list = [abs(j-i) for i, j in zip(chained_probs[:-1], chained_probs[1:])]
-        print("list of differences:\n{chained_probs}\n")
+    def find_gram_lowest_prob(self,ngrams:list):
+        probs_list = self.get_chained_probability(ngrams)
+        diff_list = [abs(j-i) for i, j in zip(probs_list[:-1], probs_list[1:])]
+        print(probs_list,"\n",diff_list)
        return diff_list.index(max(diff_list))+1

    def find_correction(self, bad_n_gram):
@@ -126,7 +124,7 @@ class GrammarChecker:


 if __name__ == "__main__":
-    text = "Note that in all contour time-integral we essentially integrate over _MATH_."
+    text = "Note that in all contour time-integrals we essentially integrates _MATH_."
    # text = "Optimal filters was categorized to recursive and batch filters."

    grammar_checker = GrammarChecker(3)