diff --git a/src/grammar_checker_google.py b/src/grammar_checker_google.py index 2fd246883bd5c0ba7de07fd9f76738d9ad9acdf9..1345bd8eecd33944c62910101421abccf22f20ab 100644 --- a/src/grammar_checker_google.py +++ b/src/grammar_checker_google.py @@ -35,8 +35,9 @@ class GrammarChecker: for i_error in sorted(i_errors): error.append(n_grams[1][i_error][0]) print(f"Error: {' '.join(error)}") + self.suggest_correction(n_grams, i_errors) - def get_google_ngram_prob(self, n_gram): + def get_google_ngram_prob(self, n_gram, suggestion=False): """ gets probability for given n_gram """ url = f"https://books.google.com/ngrams/json?content={' '.join(n_gram)}&case_insensitive=true" successful = False @@ -49,6 +50,8 @@ class GrammarChecker: results = json.loads(response.content) if results: max_prob = 0.0 + if suggestion: + print(results) for result in results: cur_max_prob = max(results[0]["timeseries"]) max_prob = cur_max_prob if cur_max_prob > max_prob else max_prob @@ -66,7 +69,7 @@ class GrammarChecker: def get_prob_of_n_gram(self, n_gram): """ calculates probability of n_gram """ # smallest possible positive float (1e-324 == 0.0) - float_min = 1e-100 + float_min = 1e-6 # float_min = 1e-323 # get n_gram probability prob = self.get_google_ngram_prob(n_gram) @@ -78,6 +81,14 @@ class GrammarChecker: word_indexes.append(word_indexes[-1]+1) return word_indexes + def get_n_gram_indexes_from_word_index(self, n, n_gram_cnt, word_index): + n_gram_indexes = [0] if word_index < n else [word_index-n+1] + for i in range(word_index%n if word_index < n else n-1): + nxt = n_gram_indexes[-1]+1 + if nxt < n_gram_cnt: + n_gram_indexes.append(nxt) + return n_gram_indexes + def find_index_of_error(self, n_grams): """ finds index of greatest error in n_grams""" # get probabilities for all n_grams @@ -106,9 +117,34 @@ class GrammarChecker: if counter >= max_counter: i_errors.append(index) max_counter = counter + # over_threshold = True if (chained_probs)**(1/len(n_grams)) <= self.threshold else False return i_errors - # return i_errors if markov[?] <= self.threshold else None + def suggest_correction(self, n_grams, i_errors): + print() + + for i_error in i_errors: + print(f"i_error: {i_error}") + print(n_grams[1][i_error]) + for n, grams in n_grams.items(): + # skip unigrams + if n == 1: + continue + # create asterisk n_grams + n_gram_indexes = self.get_n_gram_indexes_from_word_index(n, len(grams), i_error) + for i, n_gram_index in enumerate(n_gram_indexes): + pos = i_error-i if i_error < n else n-1-i + tmp = list(grams[n_gram_index]) + tmp[pos] = "*" + # create n_gram + construct = " ".join(tmp) + print(construct) + # get suggestions + # self.get_google_ngram_prob(construct, suggestion=True) + print() + print() + print() + assert False if __name__ == "__main__":