Commit 0aa038c2 authored by Jeremy Mah Zhee Kein

Merge branch 'google_ngram' into 'main'

Google ngram

See merge request !3
parents 66f634cc 9537e0c2
@@ -4,6 +4,15 @@ import nltk
from tqdm import tqdm
from nltk import ngrams
import xml.etree.ElementTree as ET
import requests
import logging
import sys

# send DEBUG-level log output to stdout
handler = logging.StreamHandler(sys.stdout)
handler.setLevel(logging.DEBUG)
log = logging.getLogger(__name__)
log.setLevel(logging.DEBUG)
log.addHandler(handler)
""" """
corpus example: corpus example:
...@@ -95,6 +104,115 @@ def create_corpus(n, verbose=False): ...@@ -95,6 +104,115 @@ def create_corpus(n, verbose=False):
json.dump(corpus, outfile) json.dump(corpus, outfile)
print("corpus saved successfully!") print("corpus saved successfully!")

def create_POS_corpus(n, verbose=False, tagset: str = None):
    """ creates a corpus of POS-tag n-grams from the training data """
    # create empty corpus
    corpus = {}
    # create tree from xml
    tree = ET.parse('../data/aesw2016(v1.2)_train.xml')
    root = tree.getroot()
    # go through tree
    for elem in tqdm(root[1]):
        # if the element has attributes
        if elem.attrib:
            # for each sentence in the dataset
            for subelem in elem:
                # get sentence id
                sid = subelem.attrib["sid"]
                # build the sentence using only grammatically correct parts
                sentence = ""
                parts = list(subelem.iter())
                for part in parts:
                    if part.tag in ["sentence", "ins"] and part.text is not None:  # explicitly don't use 'del'
                        sentence += part.text
                    # add trailing text if it exists
                    if part.tail:
                        sentence += part.tail
                if verbose:
                    # print sentence
                    print(f"{sid}: {sentence}")
                # unmangle underscores: first "__" -> " and ", the rest -> " or ", "_" -> " "
                sentence = sentence.replace("__", " and ", 1).replace("__", " or ").replace("_", " ")
                tokens = nltk.word_tokenize(sentence)
                tokens_with_tags = nltk.pos_tag(tokens, tagset=tagset)
                # pos_tag cannot know the AESW placeholder tokens, so force
                # each placeholder's tag to be the token itself
                for number, (token, _) in enumerate(tokens_with_tags):
                    if token in ("MATHDISP", "MATH", "CITE", "REF"):
                        tokens_with_tags[number] = (token, token)
                tags = [tag for token, tag in tokens_with_tags]
                n_grams = list(ngrams(tags, n))
                for n_gram in n_grams:
                    n_gram_str = " ".join(n_gram)
                    tag = n_gram[0]
                    if tag not in corpus:
                        corpus[tag] = {}
                    if n_gram_str in corpus[tag]:
                        corpus[tag][n_gram_str] += 1
                    else:
                        corpus[tag][n_gram_str] = 1
    with open(f"../data/corpus_POS_n={n}.json", "w") as outfile:
        json.dump(corpus, outfile)
    log.info("corpus saved successfully!")

def get_google_ngram_occurences(query: str, corpus: str, start_year: int, end_year: int,
                                specific_year: int, case_insensitive: bool = True, smoothing: int = 3):
    """ queries the Google Books Ngram Viewer JSON endpoint; returns the relative
    frequency of `query` for `specific_year`, or the full results otherwise """
    # corpus names mapped to the numeric ids the Ngram Viewer expects
    corpora = dict(eng_us_2012=17, eng_us_2009=5, eng_gb_2012=18, eng_gb_2009=6,
                   chi_sim_2012=23, chi_sim_2009=11, eng_2012=15, eng_2009=0,
                   eng_fiction_2012=16, eng_fiction_2009=4, eng_1m_2009=1,
                   fre_2012=19, fre_2009=7, ger_2012=20, ger_2009=8, heb_2012=24,
                   heb_2009=9, spa_2012=21, spa_2009=10, rus_2012=25, rus_2009=12,
                   ita_2012=22)
    if corpus is None:
        corpus = "eng_us_2012"
    # let requests URL-encode the query; the endpoint expects lowercase booleans
    params = {
        "content": query,
        "year_start": start_year,
        "year_end": end_year,
        "corpus": corpora[corpus],
        "smoothing": smoothing,
        "case_insensitive": str(case_insensitive).lower(),
    }
    response = requests.get("https://books.google.com/ngrams/json", params=params)
    log.debug(response.content)
    if response.ok:
        results = json.loads(response.content)
        if results and specific_year and start_year <= specific_year <= end_year:
            # offset of the requested year within the returned timeseries
            index = specific_year - start_year
            return results[0]["timeseries"][index]
        if len(results) == 0:
            log.info("No record or results")
            return None
        log.info("No specific year chosen or wrong year order")
        return results
    log.info("No response found for query")
    return None
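
# Usage sketch (hypothetical query): relative frequency of "Albert Einstein"
# in the eng_2012 corpus for the single year 2000:
#   get_google_ngram_occurences("Albert Einstein", "eng_2012", 1950, 2000, 2000)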

def trim_corpus(n, threshold):
    """ removes n-grams with fewer than threshold occurrences """
@@ -113,14 +231,15 @@ def trim_corpus(n, threshold):
            corpus.pop(init_char, None)
    with open(f"../data/corpus_n={n}_threshold={threshold}.json", "w") as outfile:
        json.dump(corpus, outfile)
    log.info("corpus trimmed successfully!")

if __name__ == "__main__":
    # for n in range(1, 6):
    #     print(f"n={n}")
    #     create_corpus(n)
    #     threshold = 10
    #     print(f"threshold={threshold}")
    #     trim_corpus(n, threshold)
    print(get_google_ngram_occurences("I lovess Jesus", "eng_2012", 2002, 2003, 2003))
    # create_POS_corpus(3)
\ No newline at end of file
import logging
logging.basicConfig(format="%(asctime)s %(filename)s: %(message)s")