NLP_WS_2021 / Statistical_Grammar_Checker · Merge request !11

Upload New File

Merged · Heiko Raible requested to merge stheraib-main-patch-70913 into main · 3 years ago
Compare main (base) and latest version (469bf123) · 1 commit · 1 file · +314 −0

deployment/grammar_checker.py · new file (0 → 100644) · +314 −0
import ast
import sys
import time
import json
import math
import nltk
import requests
import pandas as pd
import threading
import numpy as np
from nltk import TweetTokenizer
from nltk.util import ngrams
from time import sleep
from tester import Tester
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk import pos_tag
from pyinflect import getAllInflections
class GrammarChecker:
    def __init__(self):
        # required
        nltk.download('punkt')
        nltk.download('averaged_perceptron_tagger')
        nltk.download('wordnet')
        # variables
        self.default_prob = 1e-10
        self.thresholds = {2: 5.6, 3: 7.7}
        # tokenizer
        self.tokenizer = nltk.RegexpTokenizer(r"\w+")
        # lemmatizer
        self.lemmatizer = WordNetLemmatizer()
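    # Note on the constants above (explanatory, not part of the original file):
    # default_prob is the floor probability used when the Google Books Ngram API
    # returns 0 or no data for an n-gram (see get_prob_of_n_gram), and thresholds
    # are the per-n cutoffs on the negative log10 geometric-mean probability
    # checked in find_index_of_error (2-grams: 5.6, 3-grams: 7.7).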
    def check(self, sentence):
        """ checks a sentence for errors and recursively corrects the first one """
        # lower case sentence
        sentence = sentence.lower()
        # create n_grams
        n_grams = {1: list(nltk.ngrams(self.tokenizer.tokenize(sentence), 1))}
        for n in [2, 3]:
            n_grams[n] = list(nltk.ngrams(self.tokenizer.tokenize(sentence), n))
        # find errors
        i_errors = self.find_index_of_error(n_grams)
        # get corrections
        unigrams, i_corrections = self.get_corrections(n_grams, i_errors)
        print(f"unigrams: {unigrams}")
        print(f"i_corrections: {i_corrections}")
        return unigrams, i_corrections
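    # Illustrative usage of check() (not part of the original file; the concrete
    # correction depends on live Google Books Ngram counts):
    #
    #   checker = GrammarChecker()
    #   unigrams, i_corrections = checker.check("he drive a car")
    #   # unigrams      -> ['he', 'drive', 'a', 'car']
    #   # i_corrections -> e.g. {1: 'drives'} if the bigram/trigram probabilities
    #   #                  favour "drives" over the other inflections of "drive"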
    def get_corrections(self, n_grams, i_errors):
        """ gets corrections for errors """
        # get unigrams and create result corrections dict
        unigrams = [unigram[0] for unigram in n_grams[1]]
        i_corrections = {i_error: unigrams[i_error] for i_error in i_errors}
        # if errors are found
        if i_corrections:
            # collect probabilities of inflections for all errors
            probs = {}
            for i_error, word in i_corrections.items():
                probs[i_error] = {}
                try:
                    inflections = set(self.suggest_inflections(word))
                except Exception:
                    continue
                for n in n_grams:
                    if n == 1:
                        continue
                    probs[i_error][n] = {}
                    n_gram_indexes = self.get_n_gram_indexes_from_word_index(n, len(n_grams[n]), i_error)
                    error_n_grams = [n_grams[n][n_gram_index] for n_gram_index in n_gram_indexes]
                    # threads for checking error_n_grams with inflections in parallel
                    threads = []
                    for error_n_gram in error_n_grams:
                        threads.append(threading.Thread(target=self.check_n_gram_inflections,
                                                        args=(probs, i_error, n, error_n_gram, inflections, word)))
                        threads[-1].setDaemon(True)
                        threads[-1].start()
                    for thread in threads:
                        thread.join()
            # voting mechanism
            prob_accumulator = {}
            for i_error, ns in probs.items():
                prob_accumulator[i_error] = {}
                for n, error_n_grams in ns.items():
                    for error_n_gram, inflections in error_n_grams.items():
                        for inflection, prob in inflections.items():
                            if inflection in prob_accumulator[i_error]:
                                prob_accumulator[i_error][inflection] += prob
                            else:
                                prob_accumulator[i_error][inflection] = prob
            # determine best inflections
            for i_error, inflections in prob_accumulator.items():
                if inflections:
                    i_corrections[i_error] = sorted(inflections.items(), key=lambda index: -index[1])[0][0]
        return unigrams, i_corrections
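    # How the voting above resolves a correction (illustrative numbers only):
    # every bigram and trigram containing the error word is re-scored with each
    # candidate inflection substituted in, and the probabilities are summed per
    # inflection; e.g. if "drives" accumulates 3e-7 + 5e-8 while "drove" only
    # reaches 1e-7, "drives" is chosen as the correction for that error index.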
    def check_n_gram_inflections(self, probs, i_error, n, error_n_gram, inflections, word):
        probs[i_error][n][error_n_gram] = {}
        inflection_n_grams = []
        for inflection in inflections:
            tmp = list(error_n_gram)
            index = tmp.index(word)
            tmp[index] = inflection
            inflection_n_grams.append(tmp)
        inflection_probs = self.get_probs_of_n_grams(inflection_n_grams)
        for i, inflection in enumerate(inflections):
            probs[i_error][n][error_n_gram][inflection] = inflection_probs[i]
    def suggest_inflections(self, word):
        pos = pos_tag([word])[0][1]
        if pos.startswith("N"):
            # Nouns mapped with noun markers
            startswith = "N"
            lemmparam = "n"
            list_of_suggestions = None
        elif pos.startswith("R"):
            # adverbs mapped with adverb markers
            startswith = "A"
            lemmparam = "r"
            list_of_suggestions = None
        elif pos.startswith("J"):
            # adjectives mapped with adjective markers
            startswith = "A"
            lemmparam = "a"
            list_of_suggestions = None
        elif pos.startswith("V"):
            # Verbs mapped with verb markers
            startswith = "V"
            lemmparam = "v"
            list_of_suggestions = None
        elif pos == "PRP" or pos == "PRP$":
            # If the word is a personal or possessive pronoun, try all pronouns
            list_of_suggestions = ["I", "you", "he", "she", "it", "we", "they", "me", "him", "her", "us",
                                   "my", "mine", "our", "ours", "its", "his", "her", "hers", "their",
                                   "theirs", "your", "yours"]
            startswith = None
        else:
            # Else, return nothing
            startswith = None
            list_of_suggestions = None
        if list_of_suggestions is None and startswith is not None:
            # if startswith is not None, return suggestions/inflections of the given word based on its POS tag
            if lemmparam == "r":
                # for adverbs, look for a pertainym (related base form) among the word's lemmas
                s = []
                suggestion = ""
                for ss in wn.synsets(word):
                    for lemmas in ss.lemmas():
                        # all possible lemmas
                        s.append(lemmas)
                for pers in s:
                    posword = pers.pertainyms()
                    if len(posword) == 0:
                        continue
                    else:
                        posword = posword[0].name()
                    if posword[0:3] == word[0:3] or posword[0:4] == word[0:4]:
                        suggestion = posword
                        break
                word = self.lemmatizer.lemmatize(suggestion, lemmparam)
                inflections = getAllInflections(word)
                tags = [key for key in inflections.keys()]
                suggestion_list = [inflections[tag] for tag in tags]
                suggestion = [i for sub in suggestion_list for i in sub]
                return suggestion
            else:
                word = self.lemmatizer.lemmatize(word, lemmparam)
                inflections = getAllInflections(word)
                tags = [key for key in inflections.keys()]
                suggestion_list = [inflections[tag] for tag in tags]
                suggestion = [i for sub in suggestion_list for i in sub]
                return suggestion
        elif list_of_suggestions is not None and startswith is None:
            return list_of_suggestions
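    # Illustrative shape of pyinflect.getAllInflections as used above (approximate,
    # and dependent on the pyinflect version): it maps Penn Treebank tags to tuples
    # of surface forms, e.g.
    #   getAllInflections("drive") -> {'NN': ('drive',), 'NNS': ('drives',),
    #                                  'VB': ('drive',), 'VBD': ('drove',),
    #                                  'VBG': ('driving',), 'VBN': ('driven',),
    #                                  'VBZ': ('drives',)}
    # so the flattening above yields a plain list of candidate inflections.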
    def get_google_ngram_prob(self, n_gram):
        """ gets probability for given n_gram """
        url = f"https://books.google.com/ngrams/json?content={' '.join(n_gram)}&case_insensitive=true"
        successful = False
        wait_time = 0.0001
        while not successful:
            response = requests.get(url)
            sleep(wait_time)
            if response.ok:
                successful = True
                results = json.loads(response.content)
                if results:
                    max_prob = 0.0
                    for result in results:
                        # use the current result's timeseries (the original indexed results[0] here)
                        cur_max_prob = max(result["timeseries"])
                        max_prob = cur_max_prob if cur_max_prob > max_prob else max_prob
                    return max_prob
                else:
                    return None
            if not successful:
                if wait_time < 10:
                    # print(f"no response: increasing wait time from {wait_time} to {wait_time*10}.")
                    wait_time *= 10
                else:
                    pass
                    # print("still no response.")
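    # Illustrative response of the (undocumented) Google Books Ngram JSON endpoint
    # queried above; the exact fields may change, but each entry carries the query
    # string and a "timeseries" list of relative frequencies per year, e.g.
    #   [{"ngram": "he drives", "type": "NGRAM",
    #     "timeseries": [1.2e-07, 1.3e-07, ...]}]
    # max(result["timeseries"]) therefore picks the highest yearly frequency.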
    def get_prob_of_n_gram(self, n_gram, probs, i):
        """ calculates probability of n_gram """
        # get n_gram probability
        prob = self.get_google_ngram_prob(n_gram)
        probs[i] = prob if prob != 0.0 and prob is not None else self.default_prob
    def get_probs_of_n_grams(self, n_grams):
        # create target list
        probs = [None] * len(n_grams)
        # create and start threads
        threads = []
        for i, n_gram in enumerate(n_grams):
            threads.append(threading.Thread(target=self.get_prob_of_n_gram, args=(n_gram, probs, i)))
            threads[-1].setDaemon(True)
            threads[-1].start()
        # join threads
        for thread in threads:
            thread.join()
        return probs
    def get_word_indexes_from_n_gram_index(self, n, n_gram_index):
        word_indexes = [n_gram_index]
        for i in range(n - 1):
            word_indexes.append(word_indexes[-1] + 1)
        return word_indexes
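    # Worked example (explanatory only): for trigrams (n=3) the n-gram at index 2
    # covers the words at indexes 2, 3 and 4, so
    #   get_word_indexes_from_n_gram_index(3, 2) -> [2, 3, 4]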
    def get_n_gram_indexes_from_word_index(self, n, n_gram_cnt, word_index):
        n_gram_indexes = [0] if word_index < n else [word_index - n + 1]
        for i in range(word_index % n if word_index < n else n - 1):
            nxt = n_gram_indexes[-1] + 1
            if nxt < n_gram_cnt:
                n_gram_indexes.append(nxt)
        return n_gram_indexes
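    # Worked example (explanatory only): in a 5-word sentence there are 4 bigrams
    # (n_gram_cnt=4); the word at index 2 appears in the bigrams at indexes 1 and 2:
    #   get_n_gram_indexes_from_word_index(2, 4, 2) -> [1, 2]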
    def find_index_of_error(self, n_grams):
        """ finds index of greatest error in n_grams """
        # get probabilities for all n_grams
        probs = {}
        thresholds_passed = {}
        smallest_prob_counter = {2: {i: 0 for i in range(len(n_grams[1]))},
                                 3: {i: 0 for i in range(len(n_grams[1]))}}
        for n in n_grams:
            # don't take 1-grams into account
            if n == 1:
                continue
            # smallest prob
            probs[n] = self.get_probs_of_n_grams(n_grams[n])
            try:
                for index in self.get_word_indexes_from_n_gram_index(n, probs[n].index(min(probs[n]))):
                    smallest_prob_counter[n][index] += 1
            except Exception:
                pass
            # thresholds check
            if np.prod(probs[n]) == 0:
                thresholds_passed[n] = True
            else:
                thresholds_passed[n] = -np.log10((np.prod(probs[n])) ** (1 / len(n_grams[n]))) <= self.thresholds[n]
        # determine indexes of errors
        i_errors = []
        max_counter = 0
        total_smallest_prob_counter = {i: 0 for i in range(len(n_grams[1]))}
        for n, smallest_probs in smallest_prob_counter.items():
            if True:  # thresholds_passed[n]:
                for index in total_smallest_prob_counter:
                    total_smallest_prob_counter[index] += smallest_probs[index]
        for index, counter in sorted(total_smallest_prob_counter.items(), key=lambda index: -index[1]):
            if counter >= max_counter and counter != 0:
                i_errors.append(index)
                max_counter = counter
        print(f"thresholds_passed: {thresholds_passed}")
        return i_errors
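    # Worked example of the threshold check above (explanatory, with made-up
    # probabilities): for bigram probs [1e-6, 1e-8, 1e-7] the geometric mean is
    # (1e-21) ** (1/3) = 1e-7, so -log10(...) = 7.0 > 5.6 and thresholds_passed[2]
    # would be False; note the result is currently only printed, because the
    # voting loop uses "if True:" instead of "if thresholds_passed[n]:".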
if __name__ == "__main__":
    # get sentences
    tester = Tester()
    # create grammar checker
    grammar_checker = GrammarChecker()
    # check sentences
    print("CORRECT SENTENCES\n\n")
    for sentence in tester.correct_sentences:
        print(sentence.text)
        grammar_checker.check(sentence.text)
        print()
    print("\nTYPE 1 ERROR SENTENCES\n\n")
    for sentence in tester.type_1_error_sentences:
        print(sentence.text)
        print(sentence.original)
        grammar_checker.check(sentence.text)
        print()
    print("\nTYPE 2 ERROR SENTENCES\n\n")
    for sentence in tester.type_2_error_sentences:
        print(sentence.text)
        print(sentence.original)
        grammar_checker.check(sentence.text)
        print()
    print("\nTYPE 3 ERROR SENTENCES\n\n")
    for sentence in tester.type_3_error_sentences:
        print(sentence.text)
        print(sentence.original)
        grammar_checker.check(sentence.text)
        print()