Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
S
Statistical_Grammar_Checker
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Iterations
Wiki
Requirements
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Locked files
Build
Pipelines
Jobs
Pipeline schedules
Test cases
Artifacts
Deploy
Releases
Package registry
Container registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Code review analytics
Issue analytics
Insights
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
NLP_WS_2021
Statistical_Grammar_Checker
Commits
2e9bdd5e
Commit
2e9bdd5e
authored
3 years ago
by
jmzk96
Browse files
Options
Downloads
Patches
Plain Diff
added codes to script grammar_checker_google_jeremy
parent
06bd8a2f
Branches
main
No related tags found
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
src/grammar_checker_google.py
+1
-1
1 addition, 1 deletion
src/grammar_checker_google.py
src/grammar_checker_google_jeremy.py
+78
-0
78 additions, 0 deletions
src/grammar_checker_google_jeremy.py
with
79 additions
and
1 deletion
src/grammar_checker_google.py
+
1
−
1
View file @
2e9bdd5e
...
@@ -26,7 +26,7 @@ class GrammarChecker:
...
@@ -26,7 +26,7 @@ class GrammarChecker:
sentence
=
sentence
.
lower
()
sentence
=
sentence
.
lower
()
# create n_grams
# create n_grams
n_grams
=
{
1
:
list
(
nltk
.
ngrams
(
self
.
tokenizer
.
tokenize
(
sentence
),
1
))}
n_grams
=
{
1
:
list
(
nltk
.
ngrams
(
self
.
tokenizer
.
tokenize
(
sentence
),
1
))}
for
n
in
[
2
,
3
]
for
n
in
[
2
,
3
]
:
n_grams
[
n
]
=
list
(
nltk
.
ngrams
(
self
.
tokenizer
.
tokenize
(
sentence
),
n
))
n_grams
[
n
]
=
list
(
nltk
.
ngrams
(
self
.
tokenizer
.
tokenize
(
sentence
),
n
))
# find error
# find error
i_errors
=
self
.
find_index_of_error
(
n_grams
)
i_errors
=
self
.
find_index_of_error
(
n_grams
)
...
...
This diff is collapsed.
Click to expand it.
src/grammar_checker_google_jeremy.py
+
78
−
0
View file @
2e9bdd5e
import
json
import
nltk
import
math
import
requests
from
time
import
sleep
class GrammarCheckerGoogle:
    """Grammar checker that scores a sentence's n-grams via the Google Books
    Ngram Viewer API and flags the least probable n-gram as the likely error.
    """

    def __init__(self, n, float_min, threshold):
        """
        :param n: n-gram order used by check() (e.g. 2 or 3)
        :param float_min: fallback probability for n-grams the API has no
            data for (stands in for "smallest positive float")
        :param threshold: if the geometric mean of the n-gram probabilities
            is at or below this value, the sentence is flagged as erroneous
        """
        # required NLTK resource for tokenization
        nltk.download('punkt')
        self.float_min = float_min
        # variables
        self.threshold = threshold
        self.n = n
        # tokenizer: keeps alphanumeric word runs, drops punctuation
        self.tokenizer = nltk.RegexpTokenizer(r"\w+")

    def check(self, sentence):
        """
        Check a sentence for errors.

        Returns the index of the most suspicious n-gram (per
        find_index_of_error), or None if the sentence looks acceptable.
        """
        # lower case sentence
        sentence = sentence.lower()
        # create n_grams of order self.n
        n_grams = list(nltk.ngrams(self.tokenizer.tokenize(sentence), self.n))
        # find error
        i_error = self.find_index_of_error(n_grams)
        return i_error

    def get_google_ngram_prob(self, n_gram):
        """
        Fetch the peak yearly relative frequency of n_gram from the Google
        Books Ngram Viewer JSON endpoint.

        Returns the maximum timeseries value over all case-insensitive
        variants, or None when the API reports no data. On a failed HTTP
        response, retries indefinitely with an exponentially increasing
        wait time capped at 10 seconds (behavior kept from the original).
        """
        url = f"https://books.google.com/ngrams/json?content={' '.join(n_gram)}&case_insensitive=true"
        wait_time = 0.0001
        while True:
            response = requests.get(url)
            # brief pause to be polite to the API / back off after failures
            sleep(wait_time)
            if response.ok:
                results = json.loads(response.content)
                if not results:
                    return None
                # BUG FIX: the original indexed results[0] inside the loop,
                # so all case-insensitive variants after the first were
                # ignored; take the max over every returned series.
                max_prob = 0.0
                for result in results:
                    cur_max_prob = max(result["timeseries"])
                    max_prob = cur_max_prob if cur_max_prob > max_prob else max_prob
                return max_prob
            # no usable response: increase wait time up to the 10 s cap
            if wait_time < 10:
                wait_time *= 10

    def get_prob_of_n_gram(self, n_gram):
        """
        Probability of n_gram, substituting self.float_min when the API
        returns no data (None) or a zero probability, so downstream
        products never collapse to 0.
        """
        prob = self.get_google_ngram_prob(n_gram)
        return prob if prob != 0.0 and prob is not None else self.float_min

    def find_index_of_error(self, n_grams):
        """
        Find the index of the greatest error in n_grams.

        Returns the index of the least probable n-gram if the geometric
        mean of all n-gram probabilities is at or below self.threshold,
        else None. An empty n_grams list is scored with float_min alone.
        """
        if n_grams:
            probs = [self.get_prob_of_n_gram(n_gram) for n_gram in n_grams]
        else:
            probs = [self.float_min]
        chained_probs = math.prod(probs)
        # BUG FIX: the exponent previously used len(n_grams), which is 0
        # for an empty sentence and raised ZeroDivisionError; len(probs)
        # is always >= 1 and equals len(n_grams) otherwise.
        if chained_probs ** (1 / len(probs)) <= self.threshold:
            return probs.index(min(probs))
        return None
\ No newline at end of file
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment