Upload New File

Merged Heiko Raible requested to merge stheraib-main-patch-70913 into main
1 file  +314  −0
 
import json
import threading
from time import sleep

import nltk
import numpy as np
import requests
from nltk import pos_tag
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from pyinflect import getAllInflections

from tester import Tester
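# Assumed third-party requirements for this script (they are not pinned anywhere in this MR):
# nltk, numpy, requests and pyinflect, plus the local tester.py module that provides Tester.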
 
 
class GrammarChecker:

    def __init__(self):
        # required NLTK resources
        nltk.download('punkt')
        nltk.download('averaged_perceptron_tagger')
        nltk.download('wordnet')
        # variables
        self.default_prob = 1e-10
        self.thresholds = {2: 5.6, 3: 7.7}
        # tokenizer
        self.tokenizer = nltk.RegexpTokenizer(r"\w+")
        # lemmatizer
        self.lemmatizer = WordNetLemmatizer()
 
    def check(self, sentence):
        """ checks a sentence for errors and recursively corrects the first one """
        # lower-case the sentence and tokenize it once
        sentence = sentence.lower()
        tokens = self.tokenizer.tokenize(sentence)
        # create n-grams
        n_grams = {n: list(nltk.ngrams(tokens, n)) for n in (1, 2, 3)}
        # find errors
        i_errors = self.find_index_of_error(n_grams)
        # get corrections
        unigrams, i_corrections = self.get_corrections(n_grams, i_errors)
        print(f"unigrams: {unigrams}")
        print(f"i_corrections: {i_corrections}")
        return unigrams, i_corrections
 
 
    def get_corrections(self, n_grams, i_errors):
        """ gets corrections for errors """
        # get unigrams and create result corrections dict
        unigrams = [unigram[0] for unigram in n_grams[1]]
        i_corrections = {i_error: unigrams[i_error] for i_error in i_errors}

        # if errors are found
        if i_corrections:
            # collect probabilities of inflections for all errors
            probs = {}
            for i_error, word in i_corrections.items():
                probs[i_error] = {}
                try:
                    inflections = set(self.suggest_inflections(word))
                except Exception:
                    # no inflections could be suggested for this word
                    continue
                for n in n_grams:
                    if n == 1:
                        continue
                    probs[i_error][n] = {}
                    n_gram_indexes = self.get_n_gram_indexes_from_word_index(n, len(n_grams[n]), i_error)
                    error_n_grams = [n_grams[n][n_gram_index] for n_gram_index in n_gram_indexes]
                    # threads for checking error_n_grams with inflections in parallel
                    threads = []
                    for error_n_gram in error_n_grams:
                        threads.append(threading.Thread(target=self.check_n_gram_inflections, args=(probs, i_error, n, error_n_gram, inflections, word)))
                        threads[-1].daemon = True
                        threads[-1].start()
                    for thread in threads:
                        thread.join()

            # voting mechanism: accumulate each inflection's probability over all n-gram sizes
            prob_accumulator = {}
            for i_error, ns in probs.items():
                prob_accumulator[i_error] = {}
                for n, error_n_grams in ns.items():
                    for error_n_gram, inflections in error_n_grams.items():
                        for inflection, prob in inflections.items():
                            if inflection in prob_accumulator[i_error]:
                                prob_accumulator[i_error][inflection] += prob
                            else:
                                prob_accumulator[i_error][inflection] = prob

            # determine best inflections
            for i_error, inflections in prob_accumulator.items():
                if inflections:
                    i_corrections[i_error] = sorted(inflections.items(), key=lambda item: -item[1])[0][0]

        return unigrams, i_corrections
 
 
    def check_n_gram_inflections(self, probs, i_error, n, error_n_gram, inflections, word):
        """ fills probs with the probability of every inflection substituted into error_n_gram """
        probs[i_error][n][error_n_gram] = {}
        # fix the iteration order so both passes below see the inflections in the same sequence
        inflections = list(inflections)
        inflection_n_grams = []
        for inflection in inflections:
            tmp = list(error_n_gram)
            index = tmp.index(word)
            tmp[index] = inflection
            inflection_n_grams.append(tmp)
        inflection_probs = self.get_probs_of_n_grams(inflection_n_grams)
        for i, inflection in enumerate(inflections):
            probs[i_error][n][error_n_gram][inflection] = inflection_probs[i]
 
 
    def suggest_inflections(self, word):
        """ suggests alternative word forms based on the word's POS tag """
        pos = pos_tag([word])[0][1]
        if pos.startswith("N"):
            # nouns mapped with noun markers
            startswith = "N"
            lemmparam = "n"
            list_of_suggestions = None
        elif pos.startswith("R"):
            # adverbs mapped with adverb markers
            startswith = "A"
            lemmparam = "r"
            list_of_suggestions = None
        elif pos.startswith("J"):
            # adjectives mapped with adjective markers
            startswith = "A"
            lemmparam = "a"
            list_of_suggestions = None
        elif pos.startswith("V"):
            # verbs mapped with verb markers
            startswith = "V"
            lemmparam = "v"
            list_of_suggestions = None
        elif pos == "PRP" or pos == "PRP$":
            # if the word is a (possessive) pronoun, try all pronouns
            list_of_suggestions = ["I", "you", "he", "she", "it", "we", "they", "me", "him", "her", "us",
                                   "my", "mine", "our", "ours", "its", "his", "hers", "their", "theirs",
                                   "your", "yours"]
            startswith = None
        else:
            # else, return nothing
            startswith = None
            list_of_suggestions = None

        if list_of_suggestions is None and startswith is not None:
            # return inflections of the given word based on its POS tag
            if lemmparam == "r":
                # for adverbs, look for a related (pertainym) form that shares a stem with the word
                s = []
                suggestion = ""
                for ss in wn.synsets(word):
                    for lemma in ss.lemmas():  # all possible lemmas
                        s.append(lemma)
                for pers in s:
                    posword = pers.pertainyms()
                    if len(posword) == 0:
                        continue
                    posword = posword[0].name()
                    if posword[0:3] == word[0:3] or posword[0:4] == word[0:4]:
                        suggestion = posword
                        break
                word = self.lemmatizer.lemmatize(suggestion, lemmparam)
            else:
                word = self.lemmatizer.lemmatize(word, lemmparam)
            inflections = getAllInflections(word)
            return [form for forms in inflections.values() for form in forms]
        elif list_of_suggestions is not None and startswith is None:
            return list_of_suggestions
 
 
    def get_google_ngram_prob(self, n_gram):
        """ gets probability for given n_gram from the Google Books Ngram Viewer """
        url = f"https://books.google.com/ngrams/json?content={' '.join(n_gram)}&case_insensitive=true"
        successful = False
        wait_time = 0.0001
        while not successful:
            response = requests.get(url)
            sleep(wait_time)
            if response.ok:
                successful = True
                results = json.loads(response.content)
                if results:
                    # take the highest relative frequency over all returned time series
                    max_prob = 0.0
                    for result in results:
                        cur_max_prob = max(result["timeseries"])
                        max_prob = cur_max_prob if cur_max_prob > max_prob else max_prob
                    return max_prob
                else:
                    return None
            if not successful:
                if wait_time < 10:
                    # print(f"no response: increasing wait time from {wait_time} to {wait_time*10}.")
                    wait_time *= 10
                else:
                    pass
                    # print("still no response.")
 
 
    def get_prob_of_n_gram(self, n_gram, probs, i):
        """ calculates probability of n_gram """
        # get n_gram probability, falling back to a small default for missing or zero counts
        prob = self.get_google_ngram_prob(n_gram)
        probs[i] = prob if prob else self.default_prob
 
 
    def get_probs_of_n_grams(self, n_grams):
        # create target list
        probs = [None] * len(n_grams)
        # create and start threads
        threads = []
        for i, n_gram in enumerate(n_grams):
            threads.append(threading.Thread(target=self.get_prob_of_n_gram, args=(n_gram, probs, i)))
            threads[-1].daemon = True
            threads[-1].start()
        # join threads
        for thread in threads:
            thread.join()
        return probs
 
 
    def get_word_indexes_from_n_gram_index(self, n, n_gram_index):
        """ maps an n_gram index to the indexes of the words it covers """
        word_indexes = [n_gram_index]
        for i in range(n - 1):
            word_indexes.append(word_indexes[-1] + 1)
        return word_indexes
 
 
    def get_n_gram_indexes_from_word_index(self, n, n_gram_cnt, word_index):
        """ maps a word index to the indexes of the n_grams that contain it """
        n_gram_indexes = [0] if word_index < n else [word_index - n + 1]
        for i in range(word_index % n if word_index < n else n - 1):
            nxt = n_gram_indexes[-1] + 1
            if nxt < n_gram_cnt:
                n_gram_indexes.append(nxt)
        return n_gram_indexes
 
 
    def find_index_of_error(self, n_grams):
        """ finds index of greatest error in n_grams """
        # get probabilities for all n_grams
        probs = {}
        thresholds_passed = {}
        smallest_prob_counter = {2: {i: 0 for i in range(len(n_grams[1]))}, 3: {i: 0 for i in range(len(n_grams[1]))}}
        for n in n_grams:
            # don't take 1-grams into account
            if n == 1:
                continue
            # guard against sentences shorter than n words
            if not n_grams[n]:
                thresholds_passed[n] = False
                continue
            # count how often each word position belongs to the least probable n_gram
            probs[n] = self.get_probs_of_n_grams(n_grams[n])
            try:
                for index in self.get_word_indexes_from_n_gram_index(n, probs[n].index(min(probs[n]))):
                    smallest_prob_counter[n][index] += 1
            except Exception:
                pass
            # thresholds check
            if np.prod(probs[n]) == 0:
                thresholds_passed[n] = True
            else:
                thresholds_passed[n] = -np.log10((np.prod(probs[n])) ** (1 / len(n_grams[n]))) <= self.thresholds[n]

        # determine indexes of errors
        i_errors = []
        max_counter = 0
        total_smallest_prob_counter = {i: 0 for i in range(len(n_grams[1]))}
        for n, smallest_probs in smallest_prob_counter.items():
            if True:  # thresholds_passed[n]:
                for index in total_smallest_prob_counter:
                    total_smallest_prob_counter[index] += smallest_probs[index]
        for index, counter in sorted(total_smallest_prob_counter.items(), key=lambda item: -item[1]):
            if counter >= max_counter and counter != 0:
                i_errors.append(index)
                max_counter = counter

        print(f"thresholds_passed: {thresholds_passed}")
        return i_errors
 
 
 
if __name__ == "__main__":
    # get sentences
    tester = Tester()

    # create grammar checker
    grammar_checker = GrammarChecker()

    # check sentences
    print("CORRECT SENTENCES\n\n")
    for sentence in tester.correct_sentences:
        print(sentence.text)
        grammar_checker.check(sentence.text)
        print()

    print("\nTYPE 1 ERROR SENTENCES\n\n")
    for sentence in tester.type_1_error_sentences:
        print(sentence.text)
        print(sentence.original)
        grammar_checker.check(sentence.text)
        print()

    print("\nTYPE 2 ERROR SENTENCES\n\n")
    for sentence in tester.type_2_error_sentences:
        print(sentence.text)
        print(sentence.original)
        grammar_checker.check(sentence.text)
        print()

    print("\nTYPE 3 ERROR SENTENCES\n\n")
    for sentence in tester.type_3_error_sentences:
        print(sentence.text)
        print(sentence.original)
        grammar_checker.check(sentence.text)
        print()
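# Standalone usage sketch without tester.py (the sentence below is a made-up example,
# not part of the test data referenced by this MR):
#
#   checker = GrammarChecker()
#   unigrams, corrections = checker.check("he go to school every day")
#   # `unigrams` is the tokenized sentence; `corrections` maps word indexes to suggested forms.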