Commit 0aa038c2 authored by Jeremy Mah Zhee Kein

Merge branch 'google_ngram' into 'main'

Google ngram

See merge request !3
parents 66f634cc 9537e0c2
@@ -4,6 +4,15 @@ import nltk
from tqdm import tqdm
from nltk import ngrams
import xml.etree.ElementTree as ET
import requests
import logging
import sys

# send DEBUG-level log output to stdout
handler = logging.StreamHandler(sys.stdout)
handler.setLevel(logging.DEBUG)
log = logging.getLogger(__name__)
log.setLevel(logging.DEBUG)
log.addHandler(handler)
""" """
corpus example: corpus example:
...@@ -95,6 +104,115 @@ def create_corpus(n, verbose=False): ...@@ -95,6 +104,115 @@ def create_corpus(n, verbose=False):
json.dump(corpus, outfile) json.dump(corpus, outfile)
print("corpus saved successfully!") print("corpus saved successfully!")

def create_POS_corpus(n, verbose=False, tagset: str = None):
    """ creates a corpus of POS-tag n-grams from the training data """
    # create empty corpus
    corpus = {}
    # create tree from xml
    tree = ET.parse('../data/aesw2016(v1.2)_train.xml')
    root = tree.getroot()
    # go through tree
    for elem in tqdm(root[1]):
        # if the element has attributes
        if elem.attrib:
            # for each sentence in the dataset
            for subelem in elem:
                # get sentence id
                sid = subelem.attrib["sid"]
                # build the sentence using only grammatically correct parts
                sentence = ""
                parts = list(subelem.iter())
                for part in parts:
                    if part.tag in ["sentence", "ins"] and part.text is not None:  # explicitly don't use 'del'
                        sentence += part.text
                    # add trailing text if it exists
                    if part.tail:
                        sentence += part.tail
                if verbose:
                    # print sentence
                    print(f"{sid}: {sentence}")
                # unmangle underscores: first "__" -> " and ", the rest -> " or ", "_" -> " "
                sentence = sentence.replace("__", " and ", 1).replace("__", " or ").replace("_", " ")
                tokens = nltk.word_tokenize(sentence)
                tokens_with_tags = nltk.pos_tag(tokens, tagset=tagset)
                # pos_tag cannot know the AESW placeholder tokens, so force
                # each placeholder's tag to be the token itself
                for number, (token, _) in enumerate(tokens_with_tags):
                    if token in ("MATHDISP", "MATH", "CITE", "REF"):
                        tokens_with_tags[number] = (token, token)
                tags = [tag for token, tag in tokens_with_tags]
                n_grams = list(ngrams(tags, n))
                for n_gram in n_grams:
                    n_gram_str = " ".join(n_gram)
                    tag = n_gram[0]
                    if tag not in corpus:
                        corpus[tag] = {}
                    if n_gram_str in corpus[tag]:
                        corpus[tag][n_gram_str] += 1
                    else:
                        corpus[tag][n_gram_str] = 1
    with open(f"../data/corpus_POS_n={n}.json", "w") as outfile:
        json.dump(corpus, outfile)
    log.info("corpus saved successfully!")

def get_google_ngram_occurences(query: str, corpus: str, start_year: int, end_year: int,
                                specific_year: int, case_insensitive: bool = True, smoothing: int = 3):
    """ queries the Google Books Ngram Viewer JSON endpoint; returns the relative
    frequency of `query` for `specific_year`, or the full results otherwise """
    # corpus names mapped to the numeric ids the Ngram Viewer expects
    corpora = dict(eng_us_2012=17, eng_us_2009=5, eng_gb_2012=18, eng_gb_2009=6,
                   chi_sim_2012=23, chi_sim_2009=11, eng_2012=15, eng_2009=0,
                   eng_fiction_2012=16, eng_fiction_2009=4, eng_1m_2009=1,
                   fre_2012=19, fre_2009=7, ger_2012=20, ger_2009=8, heb_2012=24,
                   heb_2009=9, spa_2012=21, spa_2009=10, rus_2012=25, rus_2009=12,
                   ita_2012=22)
    if corpus is None:
        corpus = "eng_us_2012"
    # let requests URL-encode the query; the endpoint expects lowercase booleans
    params = {
        "content": query,
        "year_start": start_year,
        "year_end": end_year,
        "corpus": corpora[corpus],
        "smoothing": smoothing,
        "case_insensitive": str(case_insensitive).lower(),
    }
    response = requests.get("https://books.google.com/ngrams/json", params=params)
    log.debug(response.content)
    if response.ok:
        results = json.loads(response.content)
        if results and specific_year and start_year <= specific_year <= end_year:
            # offset of the requested year within the returned timeseries
            index = specific_year - start_year
            return results[0]["timeseries"][index]
        if len(results) == 0:
            log.info("No record or results")
            return None
        log.info("No specific year chosen or wrong year order")
        return results
    log.info("No response found for query")
    return None
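
# Usage sketch (hypothetical query): relative frequency of "Albert Einstein"
# in the eng_2012 corpus for the single year 2000:
#   get_google_ngram_occurences("Albert Einstein", "eng_2012", 1950, 2000, 2000)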

def trim_corpus(n, threshold):
    """ removes n-grams with fewer than threshold occurrences """
@@ -113,14 +231,15 @@ def trim_corpus(n, threshold):
            corpus.pop(init_char, None)
    with open(f"../data/corpus_n={n}_threshold={threshold}.json", "w") as outfile:
        json.dump(corpus, outfile)
    log.info("corpus trimmed successfully!")

if __name__ == "__main__":
    # for n in range(1, 6):
    #     print(f"n={n}")
    #     create_corpus(n)
    #     threshold = 10
    #     print(f"threshold={threshold}")
    #     trim_corpus(n, threshold)
    print(get_google_ngram_occurences("I lovess Jesus", "eng_2012", 2002, 2003, 2003))
    # create_POS_corpus(3)
\ No newline at end of file
import logging
logging.basicConfig(format="%(asctime)s %(filename)s: %(message)s")