Commit 0aa038c2 authored by Jeremy Mah Zhee Kein

Merge branch 'google_ngram' into 'main'

Google ngram

See merge request !3
parents 66f634cc 9537e0c2
@@ -4,6 +4,15 @@ import nltk
from tqdm import tqdm
from nltk import ngrams
import xml.etree.ElementTree as ET
import requests
import logging
import sys
# route DEBUG-level log output to stdout
handler = logging.StreamHandler(sys.stdout)
handler.setLevel(logging.DEBUG)
log = logging.getLogger(__name__)
log.setLevel(logging.DEBUG)
log.addHandler(handler)
"""
corpus example:
@@ -95,6 +104,115 @@ def create_corpus(n, verbose=False):
        json.dump(corpus, outfile)
    print("corpus saved successfully!")
def create_POS_corpus(n, verbose=False, tagset: str = None):
    """ creates corpus of POS-tag n-grams from training data """
    # create empty corpus
    corpus = {}
    # define word tokenizer (not used below; nltk.word_tokenize is used instead)
    tokenizer = nltk.RegexpTokenizer(r"\w+")
    # create tree from xml
    tree = ET.parse('../data/aesw2016(v1.2)_train.xml')
    root = tree.getroot()
    # go through tree
    for elem in tqdm(root[1]):
        # if subelem has attributes
        if elem.attrib:
            # for each sentence in the dataset
            for subelem in elem:
                # get sid
                sid = subelem.attrib["sid"]
                # build sentence using only grammatically correct parts
                sentence = ""
                parts = list(subelem.iter())
                for part in parts:
                    if part.tag in ["sentence", "ins"] and part.text is not None:  # explicitly don't use 'del'
                        sentence += part.text
                    # add trailing text if it exists
                    if part.tail:
                        sentence += part.tail
                if verbose:
                    # print sentence
                    print(f"{sid}: {sentence}")
                # expand underscore placeholders before tokenizing
                sentence = sentence.replace("__", " and ", 1).replace("__", " or ").replace("_", " ")
                tokens = nltk.word_tokenize(sentence)
                # pass tagset through so the parameter actually takes effect
                tokens_with_tags = nltk.pos_tag(tokens, tagset=tagset)
                # dataset placeholders should act as their own POS tag
                # instead of whatever tag the tagger guessed for them
                for number, (token, _) in enumerate(tokens_with_tags):
                    if token in ("MATHDISP", "MATH", "CITE", "REF"):
                        tokens_with_tags[number] = (token, token)
                tags = [tag for token, tag in tokens_with_tags]
                n_grams = list(ngrams(tags, n))
                # index n-grams by their first tag and count occurrences
                for n_gram in n_grams:
                    n_gram_str = " ".join(n_gram)
                    tag = n_gram[0]
                    if tag not in corpus:
                        corpus[tag] = {}
                    if n_gram_str in corpus[tag]:
                        corpus[tag][n_gram_str] += 1
                    else:
                        corpus[tag][n_gram_str] = 1
    with open(f"../data/corpus_POS_n={n}.json", "w") as outfile:
        json.dump(corpus, outfile)
    log.info("corpus saved successfully!")
def get_google_ngram_occurences(query: str, corpus: str, start_year: int, end_year: int,
                                specific_year: int, case_insensitive: bool = True, smoothing: int = 3):
    """ queries the Google Books Ngram Viewer JSON endpoint for a phrase's frequency """
    # numeric ids of the Google Books corpora accepted by the endpoint
    corpora = dict(eng_us_2012=17, eng_us_2009=5, eng_gb_2012=18, eng_gb_2009=6,
                   chi_sim_2012=23, chi_sim_2009=11, eng_2012=15, eng_2009=0,
                   eng_fiction_2012=16, eng_fiction_2009=4, eng_1m_2009=1,
                   fre_2012=19, fre_2009=7, ger_2012=20, ger_2009=8, heb_2012=24,
                   heb_2009=9, spa_2012=21, spa_2009=10, rus_2012=25, rus_2009=12,
                   ita_2012=22)
    if corpus is None:
        corpus = "eng_us_2012"
    url = (f"https://books.google.com/ngrams/json?content={query}"
           f"&year_start={start_year}&year_end={end_year}"
           f"&corpus={corpora[corpus]}&smoothing={smoothing}"
           f"&case_insensitive={case_insensitive}")
    response = requests.get(url)
    log.debug(response.content)
    if response.ok:
        results = json.loads(response.content)
        if results and specific_year and start_year <= specific_year <= end_year:
            # the timeseries holds one value per year, so offset from start_year
            index = specific_year - start_year
            specified_year = results[0]["timeseries"][index]
            return specified_year
        else:
            if len(results) == 0:
                log.info("No record or results")
                return None
            else:
                log.info("No specific year chosen or wrong year order")
                return results
    else:
        log.info("No response found for query")
        return None
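# Illustrative call (hypothetical query and numbers): with a valid specific_year
# inside [start_year, end_year] the function returns a single smoothed relative
# frequency for that year (e.g. ~8.1e-06); otherwise it returns the full JSON
# result list, or None when the request fails or matches nothing.
# freq = get_google_ngram_occurences("kindergarten", None, 2000, 2005, 2003)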
def trim_corpus(n, threshold):
    """ removes n-grams with fewer than threshold occurrences """
@@ -113,14 +231,15 @@ def trim_corpus(n, threshold):
        corpus.pop(init_char, None)
    with open(f"../data/corpus_n={n}_threshold={threshold}.json", "w") as outfile:
        json.dump(corpus, outfile)
    log.info("corpus trimmed successfully!")
if __name__ == "__main__":
    # for n in range(1, 6):
    #     print(f"n={n}")
    #     create_corpus(n)
    # threshold = 10
    # print(f"threshold={threshold}")
    # trim_corpus(n, threshold)
    print(get_google_ngram_occurences("I lovess Jesus", "eng_2012", 2002, 2003, 2003))
    # create_POS_corpus(3)
\ No newline at end of file
import logging
logging.basicConfig(format="%(asctime)s %(filename)s: %(message)s")