# Course 2 - N-grams

## Import librairies 

In [1]:
from datasets import load_dataset
from collections import Counter, defaultdict
import math
from nltk.tokenize import word_tokenize, sent_tokenize

  from .autonotebook import tqdm as notebook_tqdm


## Import dataset

In [13]:
dataset = load_dataset("gustavecortal/DreamBank-annotated")
df = dataset["train"].to_pandas()
df["report"] = df["report"].astype(str)

train_df = df.query("id != 'vietnam_vet3'")
train_list = list(train_df["report"])
print("Number of dream reports in the training set:",len(train_list))

test_df = df.query("id == 'vietnam_vet3'")
test_list = list(test_df["report"])
print("Number of dream reports in the test set:",len(test_list))

Number of dream reports in the training set: 27489
Number of dream reports in the test set: 463


In [14]:
train_strings = " ".join(train_list)
train_strings = train_strings.lower()
train_tokens = word_tokenize(train_strings)
print("Number of tokens in the training set:",len(train_tokens))

vocab = set(train_tokens)
print("Vocabulary size:",len(vocab))

test_strings = " ".join(test_list)
test_strings = test_strings.lower()
test_tokens = word_tokenize(test_strings)
print("Number of tokens in the test set:",len(test_tokens))

Number of tokens in the training set: 5538015
Vocabulary size: 52195
Number of tokens in the test set: 78254


## Train n-grams

In [4]:
def tokenize(text):
    """Tokenize the input text."""
    
    return word_tokenize(text)

def count_ngrams(tokens, n):
    """Counts n-grams."""
    
    ngrams = [tuple(tokens[i:i+n]) for i in range(len(tokens)-n+1)]
    
    return Counter(ngrams)

def calculate_ngram_probabilities(train_tokens, n, test_tokens, k=0.00001):
    """Calculates n-gram probabilities."""
    
    vocab = set(train_tokens)
    V = len(vocab)
    ngram_counts = count_ngrams(train_tokens, n)
    n_minus_one_gram_counts = count_ngrams(train_tokens, n-1)
    ngram_probabilities = defaultdict(float)
    
    for ngram in ngram_counts:
        prefix = ngram[:-1]
        ngram_counts[ngram] += k
        n_minus_one_gram_counts[prefix] += k
        ngram_probabilities[ngram] = (ngram_counts[ngram] + k) / (n_minus_one_gram_counts[prefix] + k*V)

    for i in range(len(test_tokens)-n+1):
        ngram = tuple(test_tokens[i:i+n])
        if ngram not in ngram_counts:
            ngram_counts[ngram] = k
            prefix = ngram[:-1]
            if prefix not in n_minus_one_gram_counts:
                n_minus_one_gram_counts[prefix] = k
            ngram_probabilities[ngram] = (ngram_counts[ngram] + k) / (n_minus_one_gram_counts[prefix] + k*V)
    
    return ngram_probabilities

## Example

In [15]:
n = 5

ngram_probabilities = calculate_ngram_probabilities(train_tokens, n, test_tokens)
print(f"Number of {n}-grams:",len(ngram_probabilities))

Number of 5-grams: 4864073


In [16]:
ngram_probabilities

defaultdict(float,
            {('the', 'one', 'at', 'the', 'meads'): 0.1329467319687954,
             ('one', 'at', 'the', 'meads', "'s"): 0.6570606323425058,
             ('at', 'the', 'meads', "'s", 'house'): 0.6570606323425058,
             ('the', 'meads', "'s", 'house', ','): 0.6570606323425058,
             ('meads', "'s", 'house', ',', 'where'): 0.6570606323425058,
             ("'s", 'house', ',', 'where', 'it'): 0.1050224953686006,
             ('house', ',', 'where', 'it', "'s"): 0.6570606323425058,
             (',', 'where', 'it', "'s", 'bigger'): 0.11734624429121941,
             ('where', 'it', "'s", 'bigger', 'inside'): 0.6570606323425058,
             ('it', "'s", 'bigger', 'inside', 'than'): 0.6570606323425058,
             ("'s", 'bigger', 'inside', 'than', 'out'): 0.6570606323425058,
             ('bigger', 'inside', 'than', 'out', ';'): 0.6570606323425058,
             ('inside', 'than', 'out', ';', 'there'): 0.6570606323425058,
             ('than', 'out', ';', 't

## Evaluate perplexity

In [17]:
def calculate_perplexity(test_tokens, ngram_probabilities, n):
    """Calculates the perplexity of a test corpus given n-gram probabilities."""
    log_probability_sum = 0
    ngram_count = 0
    
    for i in range(len(test_tokens)-n+1):
        ngram = tuple(test_tokens[i:i+n])
        log_probability_sum += math.log2(ngram_probabilities[ngram])
        ngram_count += 1
    
    average_log_probability = -log_probability_sum / ngram_count
    perplexity = math.pow(2, average_log_probability)
    
    return perplexity

In [18]:
calculate_perplexity(train_tokens, ngram_probabilities, n)

2.4339180118336756

In [19]:
calculate_perplexity(test_tokens, ngram_probabilities, n)

17168.83336216221

In [20]:
def greedy_sampling(context, vocab, ngram_probabilities, n, max_length = 50):
    
    sentence = []

    if len(context) < (n-1):
        print("len(context) < n")
        return sentence

    context = context[-(n-1):]
    
    for i in range(max_length):

        probs = dict()
        
        for v in vocab:

            ngram = list(context)
            ngram.append(v)
            ngram = tuple(ngram)
            probs[v] = ngram_probabilities[ngram]

        best_token = max(probs, key=probs.get) # greedy 
        #print(best_v)
        #print(probs[best_v])
        
        if probs[best_token] == 0:
            print("prob = 0")
            return sentence
            
        sentence.append(best_token)
        context = list(context)[1:]
        context.append(best_token)
        context = tuple(context)
            
    return sentence  

In [21]:
context = ['the', 'one', 'at', 'the']
sentence = greedy_sampling(context, vocab, ngram_probabilities, n, max_length = 200)
print(" ".join(context) + " " +  " ".join(sentence))

the one at the meads 's house , where i was going . i was walking down the street and i was on the phone . i am constantly interrupted by the antics of his 2-year-old boy who i am apparently babysitting . the boy runs and jumps from a high place . i yell , `` get him in the car ! i 'll be right back . '' i said , `` i 'm going to be late for work and i ca n't remember what it was . i was in a room with my friends niles zelling , matt yams , queen billman , and another woman . i am in a house . i was in a room with my friends niles zelling , matt yams , queen billman , and another woman . i am in a house . i was in a room with my friends niles zelling , matt yams , queen billman , and another woman . i am in a house . i was in a room with my friends niles zelling , matt yams , queen billman , and another woman . i am in a house . i was in a room


## TODO

padding, backoff, interpolation, better sampling methods (top-k, top-p), etc.

In [None]:
def padding(sequence, n, pad_left=False, pad_right=False, left_pad_symbol=None, right_pad_symbol=None):
    if pad_left:
        sequence = [left_pad_symbol] * (n-1) + sequence
    if pad_right:
        sequence += [right_pad_symbol] * (n-1)
    return sequence

# Using NLTK.lm

## Import dataset and prepare training and test sets

In [98]:
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.lm import MLE, StupidBackoff, Laplace

In [99]:
train_sents = sent_tokenize(train_strings)
print(len(train_sents))
train_sents_tokens = [word_tokenize(s) for s in train_sents]

test_sents = sent_tokenize(test_strings)
print(len(test_sents))
test_sents_tokens = [word_tokenize(s) for s in test_sents]

351146
5627


In [100]:
test_sents_tokens[:2]

[['i',
  "'m",
  'in',
  'honduras',
  'at',
  'some',
  'kind',
  'of',
  'border',
  'crossing',
  '.'],
 ['a',
  'wide',
  'corridor',
  'of',
  'thick',
  'fenced',
  'scrub',
  'is',
  'off',
  'on',
  'either',
  'side',
  '..',
  'everything',
  'is',
  'grim',
  ',',
  'sinister',
  ',',
  'ominous',
  '.']]

In [101]:
n = 3

train, vocab = padded_everygram_pipeline(n, train_sents_tokens)

#lm = MLE(n) # Maximum Likelihood Estimate
#lm = StupidBackoff(order = n) # Stupid Backoff
lm = Laplace(n) # Laplace smoothing

In [102]:
lm.fit(train, vocab)
print(lm.vocab)
print(len(lm.vocab))

<Vocabulary with cutoff=1 unk_label='<UNK>' and 52182 items>
52182


In [103]:
lm.vocab.lookup(["france", "gustave"])

('france', '<UNK>')

In [104]:
print(lm.counts)

<NgramCounter with 3 ngram orders and 19774470 ngrams>


In [105]:
lm.counts['gustave']

0

In [106]:
lm.counts['france']

62

In [107]:
lm.counts['dream']

7106

In [108]:
lm.counts[['i']]['love']

288

In [109]:
lm.counts[['i']]['want']

1624

In [110]:
lm.score("i")

0.038742394727067954

In [111]:
lm.score("france")

9.006667507288967e-06

In [112]:
lm.score("gustave")

1.429629763061741e-07

In [113]:
lm.score("i", ["love"])

7.431076763022962e-05

In [114]:
lm.perplexity(train_sents_tokens)

50465.94290553111

In [115]:
lm.perplexity(test_sents_tokens)

49653.97430774097

In [116]:
lm.generate(20, random_seed=42)

['land',
 ')',
 'blue',
 ',',
 'showing',
 'my',
 'stamina',
 '.',
 '</s>',
 '</s>',
 '</s>',
 '</s>',
 '</s>',
 '</s>',
 '</s>',
 '</s>',
 '</s>',
 '</s>',
 '</s>',
 '</s>']

In [117]:
lm.generate(20, text_seed=['i'], random_seed=42)

['say',
 ',',
 '``',
 'hon',
 ',',
 'relax',
 '.',
 "''",
 '</s>',
 '</s>',
 '</s>',
 '</s>',
 '</s>',
 '</s>',
 '</s>',
 '</s>',
 '</s>',
 '</s>',
 '</s>',
 '</s>']