Namesformer

Before we get into the lecture, you can play with the trained model here: Namesformer Streamlit app.

Inspired by Andrej Karpathy's makemore lecture, which generates English names.

The code was fully written using ChatGPT with minimal corrections. My first query was:

I am preparing a lecture for my students on AI basics. They already know how to use attention in PyTorch to create self-attention layers. What I want to explain them is how to make a simplest possible transformer architecture (with minimal amount of code).
 As a dataset I will use a csv with names:
    john
    peter
    mike
    ...
And the goal will be to generate more names that sound name-like.
Give me an implementation with PyTorch trying to keep it as minimal as possible.

After that I had to ask for a couple of corrections, like avoiding the use of the Transformer layer, adding comments, and fixing a bug in token indexing. All were relatively easy to spot, and in less than an hour this notebook was generating plausible-sounding names.

I decided to replace the original dataset, since I found a list of Lithuanian names that is easy to extract from vardai.vlkk.lt using the following code snippet:

import requests
from bs4 import BeautifulSoup

names = []
# Loop over the alphabetical index pages of vardai.vlkk.lt (the '-2' keys are the pages
# for the accented letters č, š and ž)
for key in ['a', 'b', 'c', 'c-2', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l',
            'm', 'n', 'o', 'p', 'r', 's', 's-2', 't', 'u', 'v', 'z', 'z-2']:
    url = f'https://vardai.vlkk.lt/sarasas/{key}/'
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    # Keep only the male-name links on the page
    links = soup.find_all('a', class_='names_list__links names_list__links--man')
    names += [name.text for name in links]
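
The cells below read the names from a CSV file with a single name column, so after scraping you can dump the list to disk like this (a small sketch; the filename matches what the notebook loads below):

import pandas as pd

# Save the scraped names as a one-column CSV with a 'name' header, which pd.read_csv below expects
pd.DataFrame({'name': names}).to_csv('vardai.txt', index=False)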

If you want to play with English names, download them from here and use names.txt instead of vardai.txt.

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
names = pd.read_csv('vardai.txt')['name'].values
names
array(['Ãbas', 'Ãbdijus', 'Abdònas', ..., 'Žilvynas', 'Žimantas',
       'Žydrunas'], dtype=object)
len(names)
3495

Let’s add a space at the end of each name to mark where it ends.

names += ' '  # numpy applies this elementwise, appending a space to every name
names[0]
'Ãbas '

Note that this dataset is not simple, since it uses accent marks and capital letters. Let’s intentionally keep it like this and see if the model can figure it out.

Our transformer will be based on self-attention.
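
As a quick reminder (a standalone toy sketch, not part of the model code below), a single self-attention step takes a batch of embeddings and mixes each position with every other position according to similarity scores; in the real layer the queries, keys and values are first passed through learned linear projections:

import torch

x = torch.randn(2, 5, 16)                     # toy batch: 2 sequences, 5 positions, 16-dim embeddings
scores = x @ x.transpose(-2, -1) / 16 ** 0.5  # (2, 5, 5) scaled dot-product similarities
weights = torch.softmax(scores, dim=-1)       # attention weights, each row sums to 1
attended = weights @ x                        # (2, 5, 16): each position becomes a weighted mix of all positions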

As a sanity check (these cells were evaluated after constructing the dataset defined below), the vocabulary contains 83 characters:

dataset.vocab_size
83
len(dataset.int_to_char)
83
# Adjusted NameDataset
class NameDataset(Dataset):
    def __init__(self, csv_file):
        self.names = pd.read_csv(csv_file)['name'].values
        self.chars = sorted(list(set(''.join(self.names) + ' ')))  # Including a padding character
        self.char_to_int = {c: i for i, c in enumerate(self.chars)}
        self.int_to_char = {i: c for c, i in self.char_to_int.items()}
        self.vocab_size = len(self.chars)

    def __len__(self):
        return len(self.names)

    def __getitem__(self, idx):
        name = self.names[idx] + ' '  # Adding padding character at the end
        encoded_name = [self.char_to_int[char] for char in name]
        return torch.tensor(encoded_name)

# Custom collate function for padding
def pad_collate(batch):
    # Pad with 0, which is the index of ' ' (the space sorts first), i.e. our end-of-name character
    padded_seqs = pad_sequence(batch, batch_first=True, padding_value=0)
    input_seq = padded_seqs[:, :-1]
    target_seq = padded_seqs[:, 1:]
    return input_seq, target_seq
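
To see what the collate function produces, here is a tiny worked example with two hand-made "encoded names" (arbitrary indices, not real characters). The target is simply the input shifted one step to the left, so at every position the model is trained to predict the next character:

batch = [torch.tensor([5, 6, 7, 1]), torch.tensor([8, 9, 1])]  # two fake names of different lengths
input_seq, target_seq = pad_collate(batch)
print(input_seq)   # tensor([[5, 6, 7], [8, 9, 1]])
print(target_seq)  # tensor([[6, 7, 1], [9, 1, 0]]) - shifted left by one, the short name padded with 0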

# Minimal Transformer Model
class MinimalTransformer(nn.Module):
    def __init__(self, vocab_size, embed_size, num_heads, forward_expansion):
        super(MinimalTransformer, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        # Learned positional encoding for up to 100 positions (longer than any name we will see)
        self.positional_encoding = nn.Parameter(torch.randn(1, 100, embed_size))
        # A single self-attention encoder layer; forward_expansion is kept in the signature but
        # not wired into the layer in this minimal version. The default batch_first=False is
        # also what triggers the UserWarning printed below.
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=embed_size, nhead=num_heads)
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=1)
        self.output_layer = nn.Linear(embed_size, vocab_size)

    def forward(self, x):
        # Add the positional encoding (sliced to the sequence length) to the character embeddings
        x = self.embed(x) + self.positional_encoding[:, :x.size(1), :]
        x = self.transformer_encoder(x)
        x = self.output_layer(x)  # one vector of vocabulary logits per position
        return x
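
Before training it is worth checking the shapes end to end. A quick sanity-check sketch (the tmp_ objects are throwaway and not used later):

tmp_dataset = NameDataset('vardai.txt')
tmp_loader = DataLoader(tmp_dataset, batch_size=4, shuffle=True, collate_fn=pad_collate)
tmp_model = MinimalTransformer(vocab_size=tmp_dataset.vocab_size, embed_size=128, num_heads=8, forward_expansion=4)
input_seq, target_seq = next(iter(tmp_loader))
print(input_seq.shape, target_seq.shape)  # both (4, L): inputs and their next-character targets
print(tmp_model(input_seq).shape)         # (4, L, vocab_size): one vector of logits per position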

# Training Loop
def train_model(model, dataloader, epochs=10):
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters())

    for epoch in range(epochs):
        model.train()  # Ensure the model is in training mode
        total_loss = 0.0
        batch_count = 0

        for batch_idx, (input_seq, target_seq) in enumerate(dataloader):
            optimizer.zero_grad()
            output = model(input_seq)
            # CrossEntropyLoss expects (batch, classes, seq_len), hence the transpose of the vocab dimension
            loss = criterion(output.transpose(1, 2), target_seq)
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
            batch_count += 1

        average_loss = total_loss / batch_count
        print(f'Epoch {epoch+1}, Average Loss: {average_loss}')


csv_file = 'vardai.txt'
dataset = NameDataset(csv_file)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=pad_collate)
model = MinimalTransformer(vocab_size=dataset.vocab_size, embed_size=128, num_heads=8, forward_expansion=4)
train_model(model, dataloader)
/opt/homebrew/Caskroom/miniforge/base/envs/torch/lib/python3.10/site-packages/torch/nn/modules/transformer.py:286: UserWarning: enable_nested_tensor is True, but self.use_nested_tensor is False because encoder_layer.self_attn.batch_first was not True(use batch_first for better inference performance)
  warnings.warn(f"enable_nested_tensor is True, but self.use_nested_tensor is False because {why_not_sparsity_fast_path}")
Epoch 1, Average Loss: 1.5661300399086693
Epoch 2, Average Loss: 1.3030138059095904
Epoch 3, Average Loss: 1.274652126160535
Epoch 4, Average Loss: 1.2546222600069914
Epoch 5, Average Loss: 1.2314461583440954
Epoch 6, Average Loss: 1.22686504017223
Epoch 7, Average Loss: 1.2192441631447186
Epoch 8, Average Loss: 1.2121069810607217
Epoch 9, Average Loss: 1.2129392515529285
Epoch 10, Average Loss: 1.2083363023671236

And now let’s generate a name by predicting the next letter, one character at a time.

def sample(model, dataset, start_str='a', max_length=20):
    model.eval()  # Switch to evaluation mode
    with torch.no_grad():
        # Convert start string to tensor
        chars = [dataset.char_to_int[c] for c in start_str]
        input_seq = torch.tensor(chars).unsqueeze(0)  # Add batch dimension
        
        output_name = start_str
        for _ in range(max_length - len(start_str)):
            output = model(input_seq)
            
            # Get the last character from the output
            probabilities = torch.softmax(output[0, -1], dim=0)
            # Sample a character from the probability distribution
            next_char_idx = torch.multinomial(probabilities, 1).item()
            next_char = dataset.int_to_char[next_char_idx]
            
            if next_char == ' ':  # Assume ' ' is your end-of-sequence character
                break
            
            output_name += next_char
            # Update the input sequence for the next iteration
            input_seq = torch.cat([input_seq, torch.tensor([[next_char_idx]])], dim=1)
        
        return output_name

# After training your model, generate a name starting with a specific letter
for _ in range(10):
    generated_name = sample(model, dataset, start_str='R')
    print(generated_name)
Regontas
Rongonis
Rẽšvijus
Ralijijus
Rugaustinas
Rámeñtas
Raùcius
Rigū̃sintas
Rùntis
Rorū́lis

Not bad! Note that the last generated name is not in our names list:

generated_name
'Rorū́lis'
generated_name + ' ' in names
False

Let’s train for longer.

train_model(model, dataloader, epochs=200)
Epoch 1, Average Loss: 1.2126153149388053
Epoch 2, Average Loss: 1.2008646195585078
Epoch 3, Average Loss: 1.2044431323354894
Epoch 4, Average Loss: 1.2084777745333586
Epoch 5, Average Loss: 1.2039412173357877
Epoch 6, Average Loss: 1.2010385670445183
Epoch 7, Average Loss: 1.2084186375141144
Epoch 8, Average Loss: 1.2009209258989855
Epoch 9, Average Loss: 1.1919255847280676
Epoch 10, Average Loss: 1.196088556809859
Epoch 11, Average Loss: 1.1934009887955406
Epoch 12, Average Loss: 1.1910438266667454
Epoch 13, Average Loss: 1.1883929929950021
Epoch 14, Average Loss: 1.18849547884681
Epoch 15, Average Loss: 1.1841346204280854
Epoch 16, Average Loss: 1.1886381100524555
Epoch 17, Average Loss: 1.1862260487946596
Epoch 18, Average Loss: 1.1879008596593683
Epoch 19, Average Loss: 1.1960965817624873
Epoch 20, Average Loss: 1.182948770306327
Epoch 21, Average Loss: 1.1842969905246388
Epoch 22, Average Loss: 1.184790024432269
Epoch 23, Average Loss: 1.181914219531146
Epoch 24, Average Loss: 1.1763226807117462
Epoch 25, Average Loss: 1.179220708391883
Epoch 26, Average Loss: 1.1783306847919117
Epoch 27, Average Loss: 1.1771398804404518
Epoch 28, Average Loss: 1.1773126699707726
Epoch 29, Average Loss: 1.176071114431728
Epoch 30, Average Loss: 1.1774945611303502
Epoch 31, Average Loss: 1.1753104345365004
Epoch 32, Average Loss: 1.1693926020102068
Epoch 33, Average Loss: 1.1799512798135932
Epoch 34, Average Loss: 1.1740663609721445
Epoch 35, Average Loss: 1.1694568932056426
Epoch 36, Average Loss: 1.1811962263150648
Epoch 37, Average Loss: 1.17307439067147
Epoch 38, Average Loss: 1.176123644547029
Epoch 39, Average Loss: 1.1735039640556681
Epoch 40, Average Loss: 1.1717253424904563
Epoch 41, Average Loss: 1.177093555168672
Epoch 42, Average Loss: 1.171743896332654
Epoch 43, Average Loss: 1.1714473003690893
Epoch 44, Average Loss: 1.1676099918105385
Epoch 45, Average Loss: 1.1768520918759433
Epoch 46, Average Loss: 1.173742602630095
Epoch 47, Average Loss: 1.180647957866842
Epoch 48, Average Loss: 1.1670065533031118
Epoch 49, Average Loss: 1.1708998994393782
Epoch 50, Average Loss: 1.174356814406135
Epoch 51, Average Loss: 1.171154983477159
Epoch 52, Average Loss: 1.1716643262993205
Epoch 53, Average Loss: 1.1670574442906814
Epoch 54, Average Loss: 1.1785864970900797
Epoch 55, Average Loss: 1.1648033169182863
Epoch 56, Average Loss: 1.1688325421376662
Epoch 57, Average Loss: 1.17388866760514
Epoch 58, Average Loss: 1.1666124620220877
Epoch 59, Average Loss: 1.172797761180184
Epoch 60, Average Loss: 1.176192511211742
Epoch 61, Average Loss: 1.169443035667593
Epoch 62, Average Loss: 1.1800002092664892
Epoch 63, Average Loss: 1.171481213244525
Epoch 64, Average Loss: 1.1669982026923786
Epoch 65, Average Loss: 1.1764582509344275
Epoch 66, Average Loss: 1.163921559940685
Epoch 67, Average Loss: 1.1642179787158966
Epoch 68, Average Loss: 1.1651612466031855
Epoch 69, Average Loss: 1.1650332786820152
Epoch 70, Average Loss: 1.1688154946674
Epoch 71, Average Loss: 1.1705795759504491
Epoch 72, Average Loss: 1.1614629257809033
Epoch 73, Average Loss: 1.1699370909820903
Epoch 74, Average Loss: 1.1682413280010224
Epoch 75, Average Loss: 1.1624731887470592
Epoch 76, Average Loss: 1.1658373101191086
Epoch 77, Average Loss: 1.1636967192996632
Epoch 78, Average Loss: 1.1620631938630885
Epoch 79, Average Loss: 1.1607454305345362
Epoch 80, Average Loss: 1.1687679886817932
Epoch 81, Average Loss: 1.1678154587745666
Epoch 82, Average Loss: 1.1654754454439336
Epoch 83, Average Loss: 1.155006990107623
Epoch 84, Average Loss: 1.163443606008183
Epoch 85, Average Loss: 1.1602861404418945
Epoch 86, Average Loss: 1.167173838073557
Epoch 87, Average Loss: 1.1719990968704224
Epoch 88, Average Loss: 1.1669102771715685
Epoch 89, Average Loss: 1.1618749781088396
Epoch 90, Average Loss: 1.165453993732279
Epoch 91, Average Loss: 1.1623399273915724
Epoch 92, Average Loss: 1.1563565720211375
Epoch 93, Average Loss: 1.163650984655727
Epoch 94, Average Loss: 1.1724212256344881
Epoch 95, Average Loss: 1.167005370421843
Epoch 96, Average Loss: 1.1674671113491057
Epoch 97, Average Loss: 1.1701534655961123
Epoch 98, Average Loss: 1.1644188783385536
Epoch 99, Average Loss: 1.1655203813856299
Epoch 100, Average Loss: 1.1715436978773637
Epoch 101, Average Loss: 1.1552146738225764
Epoch 102, Average Loss: 1.1608184684406628
Epoch 103, Average Loss: 1.159850030595606
Epoch 104, Average Loss: 1.1684163879264484
Epoch 105, Average Loss: 1.1663107931613923
Epoch 106, Average Loss: 1.158122861927206
Epoch 107, Average Loss: 1.1624844074249268
Epoch 108, Average Loss: 1.1574821585958655
Epoch 109, Average Loss: 1.1655770285563036
Epoch 110, Average Loss: 1.1647455410523848
Epoch 111, Average Loss: 1.1573507780378516
Epoch 112, Average Loss: 1.1607191784815354
Epoch 113, Average Loss: 1.1629504626447504
Epoch 114, Average Loss: 1.1642750745469874
Epoch 115, Average Loss: 1.1613674646074121
Epoch 116, Average Loss: 1.166970002109354
Epoch 117, Average Loss: 1.1576437251134353
Epoch 118, Average Loss: 1.1650751937519421
Epoch 119, Average Loss: 1.1617956735871056
Epoch 120, Average Loss: 1.164371263439005
Epoch 121, Average Loss: 1.1629116388884457
Epoch 122, Average Loss: 1.1550310535864396
Epoch 123, Average Loss: 1.1581492738290267
Epoch 124, Average Loss: 1.158476221561432
Epoch 125, Average Loss: 1.1614776307886296
Epoch 126, Average Loss: 1.1575686704028736
Epoch 127, Average Loss: 1.1618938050486824
Epoch 128, Average Loss: 1.1620031795718453
Epoch 129, Average Loss: 1.1597038236531345
Epoch 130, Average Loss: 1.1628466026349502
Epoch 131, Average Loss: 1.158760524337942
Epoch 132, Average Loss: 1.1628416088494387
Epoch 133, Average Loss: 1.1603023599494586
Epoch 134, Average Loss: 1.1653292233293706
Epoch 135, Average Loss: 1.1570394418456338
Epoch 136, Average Loss: 1.159251956506209
Epoch 137, Average Loss: 1.15108516107906
Epoch 138, Average Loss: 1.157251045920632
Epoch 139, Average Loss: 1.1605818797241558
Epoch 140, Average Loss: 1.162669566002759
Epoch 141, Average Loss: 1.1601114251396873
Epoch 142, Average Loss: 1.16759164983576
Epoch 143, Average Loss: 1.1619377488439733
Epoch 144, Average Loss: 1.1621505347165195
Epoch 145, Average Loss: 1.1543098693544214
Epoch 146, Average Loss: 1.1532247613776814
Epoch 147, Average Loss: 1.1500764911825008
Epoch 148, Average Loss: 1.1602625012397767
Epoch 149, Average Loss: 1.1586852783506567
Epoch 150, Average Loss: 1.1529835099523718
Epoch 151, Average Loss: 1.1618636467240073
Epoch 152, Average Loss: 1.1549416943029924
Epoch 153, Average Loss: 1.1662258408286355
Epoch 154, Average Loss: 1.1571188888766548
Epoch 155, Average Loss: 1.1591200124133716
Epoch 156, Average Loss: 1.1585772801529277
Epoch 157, Average Loss: 1.1559988807548176
Epoch 158, Average Loss: 1.1582237693396482
Epoch 159, Average Loss: 1.1598566656762903
Epoch 160, Average Loss: 1.150617011568763
Epoch 161, Average Loss: 1.1551114580848
Epoch 162, Average Loss: 1.1640103768218648
Epoch 163, Average Loss: 1.160990835319866
Epoch 164, Average Loss: 1.1596852134574542
Epoch 165, Average Loss: 1.1487048609690234
Epoch 166, Average Loss: 1.1591558082537217
Epoch 167, Average Loss: 1.1607873385602778
Epoch 168, Average Loss: 1.1556667615066876
Epoch 169, Average Loss: 1.1585448243401266
Epoch 170, Average Loss: 1.1581260957501152
Epoch 171, Average Loss: 1.1559378802776337
Epoch 172, Average Loss: 1.161676243760369
Epoch 173, Average Loss: 1.158989368243651
Epoch 174, Average Loss: 1.1545235828919844
Epoch 175, Average Loss: 1.1571894456039775
Epoch 176, Average Loss: 1.1542898205193606
Epoch 177, Average Loss: 1.1522481392730366
Epoch 178, Average Loss: 1.1551333757964048
Epoch 179, Average Loss: 1.153004600243135
Epoch 180, Average Loss: 1.1598306520418686
Epoch 181, Average Loss: 1.162586285309358
Epoch 182, Average Loss: 1.1609084183519536
Epoch 183, Average Loss: 1.1571473636410452
Epoch 184, Average Loss: 1.1611249522729354
Epoch 185, Average Loss: 1.1568290174007416
Epoch 186, Average Loss: 1.1532831658016551
Epoch 187, Average Loss: 1.1452395493333989
Epoch 188, Average Loss: 1.1520163557746195
Epoch 189, Average Loss: 1.1546604254029014
Epoch 190, Average Loss: 1.1602082713083788
Epoch 191, Average Loss: 1.1623538266528737
Epoch 192, Average Loss: 1.1638677071441303
Epoch 193, Average Loss: 1.1488759859041735
Epoch 194, Average Loss: 1.14711737036705
Epoch 195, Average Loss: 1.1594428961927241
Epoch 196, Average Loss: 1.1597402187910948
Epoch 197, Average Loss: 1.1606087662956932
Epoch 198, Average Loss: 1.1493876272981818
Epoch 199, Average Loss: 1.1451929959383877
Epoch 200, Average Loss: 1.1518649155443366
for _ in range(10):
    generated_name = sample(model, dataset, start_str='R')
    print(generated_name)
Ritãperdas
Rãlantas
Rãtardas
Ripõnas
Rìlmiktẽras
Rárvydas
Rìnis
Rarntìndas
Rusktaugas
Rĩšvas

If we want to control how adventurous the model is, we can add a temperature parameter: the logits are divided by the temperature before the softmax, so temperatures below 1 make the distribution sharper (more confident picks) and temperatures above 1 make it flatter (more diverse/creative names).
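
A tiny illustration with made-up logits shows the effect:

logits = torch.tensor([2.0, 1.0, 0.1])  # made-up logits for three characters
for temperature in (0.5, 1.0, 1.5):
    probabilities = torch.softmax(logits / temperature, dim=0)
    # low temperature concentrates probability on the top character, high temperature spreads it out
    print(temperature, probabilities.tolist())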

def sample(model, dataset, start_str='a', max_length=20, temperature=1.0):
    assert temperature > 0, "Temperature must be greater than 0"
    model.eval()  # Switch model to evaluation mode
    with torch.no_grad():
        # Convert start string to tensor
        chars = [dataset.char_to_int[c] for c in start_str]
        input_seq = torch.tensor(chars).unsqueeze(0)  # Add batch dimension
        
        output_name = start_str
        for _ in range(max_length - len(start_str)):
            output = model(input_seq)
            
            # Apply temperature scaling
            logits = output[0, -1] / temperature
            probabilities = torch.softmax(logits, dim=0)
            
            # Sample a character from the probability distribution
            next_char_idx = torch.multinomial(probabilities, 1).item()
            next_char = dataset.int_to_char[next_char_idx]
            
            if next_char == ' ':  # Assume ' ' is your end-of-sequence character
                break
            
            output_name += next_char
            # Update the input sequence for the next iteration
            input_seq = torch.cat([input_seq, torch.tensor([[next_char_idx]])], dim=1)
        
        return output_name

# Example usage with different temperatures
print('More confident:')
for _ in range(10):
    print(' ', sample(model, dataset, start_str='R', temperature=0.5))  # More confident

print('\nMore diverse/creative:')
for _ in range(10):
    print(' ', sample(model, dataset, start_str='R', temperature=1.5))  # More diverse
More confident:
  Rìnartas
  Rèntas
  Rĩgis
  Rùrìldas
  Ròrijus
  Rìnis
  Rarìtas
  Rėnas
  Relinijus
  Rìnijus

More diverse/creative:
  Rūdrìmildas
  Ratėnas
  Revýdridas
  Rõr̃mijus
  Rýgmas
  Ròtotrdas
  Rãan
  Riguòmas
  Ridmistis
  Rìrhez

Here we go, we have a Lithuanian name generator!

import json

torch.save(model, '../namesformer_app/namesformer_model.pt')
    
with open('../namesformer_app/int_to_char.json', 'w') as f:
    json.dump(dataset.int_to_char, f)

with open('../namesformer_app/char_to_int.json', 'w') as f:
    json.dump(dataset.char_to_int, f)
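
For completeness, loading these files back (for example in the Streamlit app) could look roughly like this. This is a sketch rather than the app's actual code; note that json turns the integer keys of int_to_char into strings, so they have to be converted back:

import json
import torch

# The MinimalTransformer class definition must be importable when unpickling the full model
# (on newer PyTorch versions you may also need torch.load(..., weights_only=False))
model = torch.load('../namesformer_app/namesformer_model.pt')
model.eval()

with open('../namesformer_app/int_to_char.json') as f:
    int_to_char = {int(k): v for k, v in json.load(f).items()}
with open('../namesformer_app/char_to_int.json') as f:
    char_to_int = json.load(f)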