Commit 9cd37f64 authored by Jeremy Mah Zhee Kein

Merge branch 'pos_tag_threshold' into 'main'

Pos tag threshold

See merge request !7
parents 0aa038c2 3a3f7f9d
{"TO": {"TO": 473369}, "VB": {"VB": 648041}, "DT": {"DT": 3074013}, "JJR": {"JJR": 68807}, "NN": {"NN": 4094732}, "IN": {"IN": 3328924}, "PRP": {"PRP": 423778}, "VBP": {"VBP": 520568}, "NNP": {"NNP": 865041}, "CC": {"CC": 711476}, "NNS": {"NNS": 1433890}, "JJ": {"JJ": 2288679}, "MATH": {"MATH": 1278037}, ",": {",": 1349627}, "MATHDISP": {"MATHDISP": 188349}, "WRB": {"WRB": 120758}, "CD": {"CD": 379731}, "RB": {"RB": 842861}, ".": {".": 1176329}, "VBZ": {"VBZ": 895876}, "MD": {"MD": 233961}, "VBG": {"VBG": 382263}, "REF": {"REF": 226499}, "VBN": {"VBN": 732389}, ":": {":": 129205}, "(": {"(": 336356}, ")": {")": 337872}, "RBR": {"RBR": 29970}, "WDT": {"WDT": 149617}, "JJS": {"JJS": 34222}, "VBD": {"VBD": 176140}, "EX": {"EX": 40107}, "CITE": {"CITE": 152054}, "WP": {"WP": 8880}, "``": {"``": 20466}, "''": {"''": 21342}, "NNPS": {"NNPS": 2245}, "POS": {"POS": 29793}, "PRP$": {"PRP$": 93378}, "FW": {"FW": 15290}, "RP": {"RP": 16874}, "PDT": {"PDT": 14990}, "RBS": {"RBS": 7680}, "WP$": {"WP$": 5088}, "LS": {"LS": 114}, "$": {"$": 2387}, "UH": {"UH": 669}, "SYM": {"SYM": 151}, "#": {"#": 343}}
\ No newline at end of file
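The JSON above is the unigram (n = 1) POS-tag count file added by this merge: each tag (NLTK's Penn Treebank tagset, plus the custom MATH, MATHDISP, REF and CITE placeholder tags) maps to its occurrence count in the corpus. A minimal sketch of how such a file yields the default probability used by the checker, mirroring the 100/sum(counts) computation in the evaluation script below; the file name is an assumption following the corpus_POS_n={n}.json pattern used there:

import json

# Assumed path, following the corpus_POS_n={n}.json naming convention.
with open("../data/corpus_POS_n=1.json") as infile:
    counts = json.load(infile)
# each inner dict holds one count per tag, e.g. {"NN": {"NN": 4094732}}
total = sum(v for inner in counts.values() for v in inner.values())
default_prob = 100 / total  # same heuristic as in the evaluation script
print(f"total tagged tokens: {total}, default_prob: {default_prob:.3e}")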
Two additional file diffs in this commit are collapsed.
@@ -241,5 +241,5 @@ if __name__ == "__main__":
     # threshold = 10
     # print(f"threshold={threshold}")
     # trim_corpus(n, threshold)
-    print(get_google_ngram_occurences( "I lovess Jesus","eng_2012",2002,2003,2003))
-    # create_POS_corpus(3)
+    # print(get_google_ngram_occurences( "I lovess Jesus","eng_2012",2002,2003,2003))
+    create_POS_corpus(4)
\ No newline at end of file
import sys
import json
import nltk
from nltk.util import ngrams
from nltk import pos_tag
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix
import logging
handler = logging.StreamHandler(sys.stdout)
handler.setLevel(logging.DEBUG)
log = logging.getLogger(__name__)
log.setLevel(logging.DEBUG)
log.addHandler(handler)
class GrammarCheckerWithPOS:
    def __init__(self, n, default_prob):
        # required NLTK resources: tokenizer models and the POS tagger
        nltk.download('punkt')
        nltk.download('averaged_perceptron_tagger')
        # variables
        self.n = n
        self.default_prob = default_prob
        self.start_tag = "START "
        self.end_tag = " END"
        self.__old_sentence = None
        # tokenizer and tagger
        self.tokenizer = nltk.RegexpTokenizer(r"\w+")
        self.pos_tag = pos_tag
        # load the POS n-gram corpus
        self.corpus = {}
        with open(f"../data/corpus_POS_n={n}.json", "r") as infile:
            self.corpus = json.load(infile)
    def check(self, sentence, threshold):
        """ checks a sentence for errors; returns 1 if an error is detected, else 0
        (the recursive correction step is currently disabled below) """
        sentence = sentence.lower()
        self.threshold = threshold
        # add padding
        # self.__old_sentence = self.tokenizer.tokenize(sentence)
        # sentence = self.start_tag*(self.n-1) + sentence + self.end_tag*(self.n-1)
        # build n-gram lists over the POS tags and over the words
        tokens_with_tags = pos_tag(self.tokenizer.tokenize(sentence))
        tags = [tag for token, tag in tokens_with_tags]
        words = [token for token, tag in tokens_with_tags]
        n_grams = list(ngrams(tags, self.n))
        word_n_grams = list(ngrams(words, self.n))
        # find error
        i_error = self.find_index_of_error(n_grams)
        if i_error is not None:
            # print(f"erroneous n-gram:\n{word_n_grams[i_error]} at position {i_error}")
            return 1
        else:
            return 0
# if error detected
# if i_error:
# # find correction
# correction = self.find_correction(n_grams[i_error])
# # if correction found
# if correction:
# # apply correction
# corrected_sentence = ""
# for i, n_gram in enumerate(n_grams[:-2]):
# corrected_sentence += correction.split(" ")[-1] if i == i_error else n_gram[-1]
# if i+1 != len(n_grams[:-2]):
# corrected_sentence += " "
# # recheck
# if i_error and " ".join(self.__old_sentence) != corrected_sentence:
# return self.check(corrected_sentence)
# else:
# raise Exception("Could not determine correction for mistake!")
# else:
# raise Exception("Could not determine correction for mistake!")
# else:
# # return sentence
# return sentence
    def get_prob_of_n_gram(self, n_gram):
        """ calculates the conditional probability of n_gram """
        # smallest possible positive float (1e-324 == 0.0)
        # float_min = 1e-323
        # first tag of the n-gram and the n-gram without its last tag
        first_tag = n_gram[0]
        all_but_last_tokens = n_gram[:-1]
        n_gram = " ".join(n_gram)
        # if the n-gram isn't in the corpus, return the default probability
        if n_gram not in self.corpus[first_tag]:
            return self.default_prob
        # get the n-gram's occurrences and the total occurrences of all
        # n-grams starting with the same n-1 tags
        n_gram_occurrences = self.corpus[first_tag][n_gram]
        total_value = 0
        for key, value in self.corpus[first_tag].items():
            # split key string into a list of tags
            splitted = key.split(" ")
            # if the first n-1 tags are the same as those of n_gram
            if splitted[:-1] == list(all_but_last_tokens):
                # add its occurrences to the total
                total_value += value
        # calculate the n-gram's probability
        prob = n_gram_occurrences / total_value
        # return it if it's not 0, else return the default probability
        return prob if prob != 0.0 else self.default_prob
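    # Example: with n = 3, get_prob_of_n_gram(("DT", "NN", "VBZ")) returns
    # corpus["DT"]["DT NN VBZ"] divided by the summed counts of all
    # corpus["DT"] entries whose first two tags are "DT NN", i.e. an
    # estimate of P(VBZ | DT, NN).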
    def get_chained_probabilities(self, probs):
        """ get list of cumulative products (chained probabilities) of probs """
        chained_probs = [probs[0]]
        for i in range(1, len(probs)):
            chained_probs.append(chained_probs[-1] * probs[i])
        return chained_probs
    def find_index_of_error(self, n_grams):
        """ finds the index of the least probable n-gram, or None if the sentence clears the threshold """
        # print("N-grams: " + str(n_grams))
        # get probabilities of n_grams
        if len(n_grams) != 0:
            probs = [self.get_prob_of_n_gram(n_gram) for n_gram in n_grams]
        else:
            probs = [self.default_prob]
        # get cumulative chained probs
        chained_probs = self.get_chained_probabilities(probs)
        # print((chained_probs[-1])**(1/len(n_grams)))
        # print(f"list of chained probabilities:\n{chained_probs}\n")
        # differences between consecutive values in chained_probs (currently unused)
        diff_list = [abs(j - i) for i, j in zip(chained_probs[:-1], chained_probs[1:])]
        # print(f"list of differences:\n{diff_list}\n")
        # flag an error if the geometric mean of the n-gram probabilities,
        # chained_probs[-1] ** (1/len(n_grams)), is at most the threshold
        if len(n_grams) != 0:
            return probs.index(min(probs)) if (chained_probs[-1]) ** (1 / len(n_grams)) <= self.threshold else None
        else:
            return probs.index(min(probs))
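    # Worked example of the threshold rule above (illustrative numbers):
    # probs = [0.5, 0.1] gives chained_probs = [0.5, 0.05]; the geometric mean
    # is 0.05 ** (1/2) ~= 0.224, so the sentence is flagged for any threshold
    # >= 0.224, and the returned index is probs.index(0.1) == 1, i.e. the
    # least probable n-gram.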
# def find_correction(self, bad_n_gram):
# """ corrects error (now most occurrences, later combination with character similarity) """
# # create string from first n-1 words of n_gram
# n_gram_str = " ".join(bad_n_gram[:-1])
# # collect suggestions
# suggestions = []
# for n_gram, occurrence in self.corpus[n_gram_str[0]].items():
# if n_gram.startswith(n_gram_str):
# suggestions.append([occurrence, n_gram])
# # determine winner (now returns n_gram with most occurrences)
# suggestions = sorted(suggestions)
# return suggestions[-1][1] if suggestions else None
if __name__ == "__main__":
# text = "Note that in are contour time-integrals we essentially integrate over _MATH_."
# sentence_list = [
# "If many node moves by M1, it will not make any other membership move.",
# "I sees movies",
# "There cans be at most _MATH_ membership moves.",
# "In particular, the sequence _MATH_ has a limit in the sense of Painleve-Kuratowski if and only if the following conditions hold: _MATHDISP_.",
# "If _MATH_ is small enough relative to _MATH_, then we will see that this number exceeds _MATH_.",
# "Obviously, networks evolve over time, and even static systems will need to update the membership list now and then, but the presumption is that this can be treated as an offline activity.",
# "We use the fact that the difference between _MATH_ and _MATH_ is at most the total number of arrivals and departures during the time interval between _MATH_ and _MATH_. ",
# "In this distorted image, we needs to find the point that corresponds to the true center of the solar disk.",
# "I don't texts while driving"
# ]
# print(text)
with open("../data/corpus_POS_n=3.json") as corpusjson:
corpus3 = json.load(corpusjson)
list_of_values_3 = []
for key in corpus3.keys():
values = list(corpus3[key].values())
list_of_values_3.append(values)
flat_list_3 = [item for sublist in list_of_values_3 for item in sublist]
grammar_checker = GrammarCheckerWithPOS(3,100/sum(flat_list_3))
testdf = pd.read_csv("../testsentences.csv",sep=";")
list_of_thresholds = [x for x in np.linspace(0.01,0.4,20)]
list_of_precision = list()
list_of_recall = list()
list_of_f1 = list()
for threshold in list_of_thresholds:
try:
testdf["results"] = testdf["Sentence"].apply(lambda x: grammar_checker.check(x,threshold))
truth = testdf["Incorrect"]
pred = testdf["results"]
confmat = confusion_matrix(truth,pred)
precision = confmat[1][1]/(confmat[1][1]+confmat[0][1])
recall = confmat[1][1]/(confmat[1][1]+confmat[1][0])
f1score = (2*precision*recall)/(precision+recall)
list_of_precision.append(precision)
list_of_recall.append(recall)
list_of_f1.append(f1score)
except Exception as e:
log.error(e)
df_list = [list_of_thresholds,list_of_precision,list_of_recall,list_of_f1]
result = pd.DataFrame(df_list).transpose()
result.columns= ["threshold","precision","recall","f1"]
print(result)
\ No newline at end of file
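A minimal usage sketch for the class above, assuming ../data/corpus_POS_n=3.json exists; the threshold and default probability here are illustrative values, not tuned ones:

# Illustrative only: instantiate the checker and flag single sentences.
checker = GrammarCheckerWithPOS(3, default_prob=1e-7)  # 1e-7 is an arbitrary example
for sentence in ["I sees movies", "I like movies"]:
    flagged = checker.check(sentence, threshold=0.1)
    print(sentence, "->", "error" if flagged else "ok")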