From 0dd6fcaf2213c6e5f53ec4bc4dbe320469941e81 Mon Sep 17 00:00:00 2001 From: Naa <Heiko.Raible@gmail.com> Date: Thu, 25 Nov 2021 11:16:26 +0100 Subject: [PATCH] fix code cleanup --- src/grammar_checker.py | 111 +++++++++++++++++++---------------------- 1 file changed, 50 insertions(+), 61 deletions(-) diff --git a/src/grammar_checker.py b/src/grammar_checker.py index 0d2512b..be0aab9 100644 --- a/src/grammar_checker.py +++ b/src/grammar_checker.py @@ -1,7 +1,6 @@ +import sys import json import nltk -from typing import Union -import sys from nltk.util import ngrams @@ -21,42 +20,6 @@ class GrammarChecker: with open(f"../data/corpus_n={n}.json", "r") as infile: self.corpus = json.load(infile) - def get_probs_of_gram(self,gram: Union[list,tuple]) -> float: - # Get first character of toekn/word in gram - first_char = gram[0][0] - # Get length of gram -1 for preceeding tokens - length_gram = len(gram) -1 - # Get first n-1 tokens in gram - first_n_1_token = gram[:length_gram] - # Search corpus for first character - search_corpus =self.corpus[first_char] - #Initiate count for total of grams with preceding tokens and - #also count for total grams with exactly the same gram - total_value = 0 - conditional_value = 0 - for key, value in search_corpus.items(): - # split words into list of tokens//grams in corpus - splitted = key.split() - if splitted[:length_gram] == list(first_n_1_token): - total_value += value - if splitted == list(gram): - conditional_value += value - try: - prob = conditional_value/total_value - if prob == 0.0: - prob = 1/sys.maxsize - except ZeroDivisionError: - prob = 1/sys.maxsize - return prob - - def get_chained_probability(self,list_of_ngrams:list) -> list: - list_of_probs=[self.get_probs_of_gram(i) for i in list_of_ngrams] - # list_of_cum_probs = [get_probs_of_gram(i,dictionary) for i in split_sliding_ngram(input,n)[:position]] - list_of_cum =[list_of_probs[0]] - for i in range(1,len(list_of_probs)): - 
list_of_cum.append(list_of_cum[-1]*list_of_probs[i]) - return list_of_cum - def check(self, sentence): """ checks a sentence for errors and recursively corrects the first one """ # add padding @@ -64,9 +27,10 @@ class GrammarChecker: sentence = self.start_tag*(self.n-1) + sentence + self.end_tag*(self.n-1) # create n_grams list n_grams = list(nltk.ngrams(self.tokenizer.tokenize(sentence), self.n)) - # find first error and it's correction - i_error = self.find_gram_lowest_prob(n_grams) - print(n_grams[i_error]) + # find error + i_error = self.find_index_of_error(n_grams) + print(f"erroneous n-gram:\n{n_grams[i_error]} at position {i_error}") + # if error detected # if i_error: # # find correction @@ -90,23 +54,50 @@ class GrammarChecker: # # return sentence # return sentence - # def find_first_error(self, n_grams): - # """ finds error (now look up, later markov chains) """ - # # iterate through n_grams - # for i, n_gram in enumerate(n_grams): - # # determine first character for look up in corpus - # initial_char = n_gram[0][0] - # # create n_gram_str to look up - # n_gram_str = " ".join(n_gram) - # # if n_gram is not in corpus - # if n_gram_str not in self.corpus[initial_char]: - # return i - # return None + def get_prob_of_n_gram(self, n_gram): + """ calculates probability of n_gram """ + # smallest possible positive float (1e-324 == 0.0) + float_min = 1e-323 + # get first character n_gram and n_gram without last word + first_char = n_gram[0][0] + all_but_last_tokens = n_gram[:-1] + # if n_gram isn't in corpus + if n_gram not in self.corpus[first_char]: + # return smallest possible positive float + return float_min + # get n_gram occurences and total occurences starting with the same n-1 words + n_gram_occurrences = self.corpus[first_char][n_gram] + total_value = 0 + for key, value in self.corpus[first_char].items(): + # split key string into list of tokens + splitted = key.split(" ") + # if first n-1 words are the same as of n_gram + if splitted[:-1] == 
list(all_but_last_tokens): + # add occurrences to total number of occurrences + total_value += value + # calculate n_gram probability + prob = n_gram_occurrences/total_value + # return it if it's not 0, else return smallest possible positive float + return prob if prob != 0.0 else float_min + + def get_chained_probabilities(self, probs): + """ get list of cumulative markov chains for probs """ + chained_probs =[probs[0]] + for i in range(1, len(probs)): + chained_probs.append(chained_probs[-1]*probs[i]) + return chained_probs - def find_gram_lowest_prob(self,ngrams:list): - probs_list = self.get_chained_probability(ngrams) - diff_list = [abs(j-i) for i, j in zip(probs_list[:-1], probs_list[1:])] - print(probs_list,"\n",diff_list) + def find_index_of_error(self, n_grams): + """ finds index of greatest error in n_grams""" + # get probabilities of n_grams + probs = [self.get_prob_of_n_gram(n_gram) for n_gram in n_grams] + print(f"\nlist of probabilities:\n{probs}\n") + # get cumulative chained probs + chained_probs = self.get_chained_probabilities(probs) + print(f"list of chained probabilities:\n{chained_probs}\n") + # calculate differences between values in chained_probs + diff_list = [abs(j-i) for i, j in zip(chained_probs[:-1], chained_probs[1:])] + print(f"list of differences:\n{diff_list}\n") return diff_list.index(max(diff_list))+1 def find_correction(self, bad_n_gram): @@ -124,10 +115,8 @@ class GrammarChecker: if __name__ == "__main__": - text = "Note that in all contour time-integrals we essentially integrates _MATH_." - # text = "Optimal filters was categorized to recursive and batch filters." + text = "Note that in all contour time-integral we essentially integrate over _MATH_.\n" + print(text) grammar_checker = GrammarChecker(3) grammar_checker.check(text) - print(text) - -- GitLab