Lecture 10: Neural Networks and Transformers¶
CS4787/5777 — Principles of Large-Scale Machine Learning Systems¶
$\newcommand{\R}{\mathbb{R}}$ $\newcommand{\norm}[1]{\left\|#1\right\|}$ $\newcommand{\Exv}[1]{\mathbf{E}\left[#1\right]}$ $\newcommand{\Prob}[1]{\mathbf{P}\left(#1\right)}$ $\newcommand{\Var}[1]{\operatorname{Var}\left(#1\right)}$ $\newcommand{\Abs}[1]{\left|#1\right|}$
import torch
import transformers
import matplotlib
from matplotlib import pyplot
import time
matplotlib.rcParams.update({'font.size': 14, 'figure.figsize': (6.0, 6.0)})
torch.set_grad_enabled(False)
<torch.autograd.grad_mode.set_grad_enabled at 0x1044ab8b0>
Review: Linear models and neural networks.¶
From the homework and your past machine learning course, you should all be familiar with the notion of a linear model hypothesis class. For example, for multinomial logistic regression, we had the hypothesis class $$h_W(x) = \operatorname{softmax}(Wx).$$ This is a specific example of a more general linear model of the form $$h_W(x) = \sigma(Wx)$$ for some inputs $x \in \R^d$, matrix $W \in \R^{D \times d}$, and function $\sigma: \R^D \rightarrow \R^D$. Many important methods in machine learning use linear model hypothesis classes, including linear regression, logistic regression, and SVM.
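As a concrete illustration, here is a minimal sketch of such a linear model in PyTorch; the dimensions and the random weight matrix below are made up purely for illustration.
# a minimal sketch of the linear model h_W(x) = softmax(Wx); d, D, W, and x are made up
d, D = 8, 3                       # input dimension and number of classes (hypothetical)
W = torch.randn(D, d)             # model parameters
x = torch.randn(d)                # a single example
h = torch.softmax(W @ x, dim=0)   # prediction: a distribution over the D classes
print(h, h.sum())                 # entries are nonnegative and sum to 1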
One naive way that we can combine two hypothesis classes is by stacking or layering them. If I have one class of hypotheses $h^{(1)}_{W_1}$ that maps from $\R^{d_0}$ to $\R^{d_1}$ and a second class of hypotheses $h^{(2)}_{W_2}$ that maps from $\R^{d_1}$ to $\R^{d_2}$, then I can form the layered hypothesis class $$h_{W_1,W_2}(x) = h^{(2)}_{W_2}(h^{(1)}_{W_1}(x))$$ that results from first applying $h^{(1)}$ and then applying $h^{(2)}$. Intuitively, we're first having $h^{(1)}$ make a prediction and then using the result of that prediction as an input to $h^{(2)}$ to make our final prediction. If both our constituent hypothesis classes are linear models, we can write this out more explicitly as $$h_{W_1,W_2}(x) = \sigma_2(W_2 \cdot \sigma_1(W_1 x)).$$ Of course, we don't need to limit ourselves to layering just two linear classifiers. We could layer as many as we want. For example, if we had $\mathcal{L}$ total layers, then our hypothesis would look like $$h_{W_1,W_2,\ldots,W_{\mathcal{L}}}(x) = \sigma_{\mathcal{L}}(W_{\mathcal{L}} \cdot \sigma_{\mathcal{L}-1}(W_{\mathcal{L}-1} \cdots \sigma_2(W_2 \cdot \sigma_1(W_1 x)) \cdots)).$$ We can write this out more generally and explicitly in terms of a recurrence relation. \begin{align*} o_0 &= x && \text{Typical runtime cost:}\\ \forall l \in \{1,\ldots,\mathcal{L}\}, \hspace{1em} a_l &= W_l \cdot o_{l-1} + b_l && \fbox{\rule[2em]{10em}{0pt}} \\ \forall l \in \{1,\ldots,\mathcal{L}\}, \hspace{1em} o_l &= \sigma_l(a_l) && \fbox{\rule[2em]{10em}{0pt}} \\ h_{W_1,b_1,W_2,b_2,\ldots,W_{\mathcal{L}},b_{\mathcal{L}}}(x) &= o_{\mathcal{L}}. && \fbox{\rule[2em]{10em}{0pt}} \end{align*} where $a_l, o_l \in \R^{d_l}$, and here we've also added an explicit bias parameter $b_l \in \R^{d_l}$ to each layer. This type of model is called a multilayer perceptron (MLP), artificial neural network (ANN), or deep neural network (DNN). (Specifically, it's a type of deep neural network called a feedforward neural network.) Here, the functions $\sigma_l$ are called the activation functions and are almost always chosen to operate independently along each dimension; that is (with abuse of notation) $$\left( \sigma_l(x) \right)_i = \sigma_l(x_i).$$ Note that this is not true for the softmax, but it's true about pretty much every other major activation function.
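To make the recurrence concrete, here's a minimal sketch of the forward pass of such an MLP; the layer widths are made up, and we (arbitrarily) assume ReLU activations for the hidden layers and a softmax at the output.
# a minimal sketch of the MLP recurrence above; the widths and activation choices are assumptions
widths = [8, 16, 16, 4]           # d_0, d_1, ..., d_L (hypothetical)
Ws = [torch.randn(widths[l+1], widths[l]) for l in range(len(widths) - 1)]
bs = [torch.zeros(widths[l+1]) for l in range(len(widths) - 1)]

def mlp(x):
    o = x                                      # o_0 = x
    for l, (W, b) in enumerate(zip(Ws, bs)):
        a = W @ o + b                          # a_l = W_l o_{l-1} + b_l
        o = torch.relu(a) if l + 1 < len(Ws) else torch.softmax(a, dim=0)  # o_l = sigma_l(a_l)
    return o                                   # h(x) = o_L

print(mlp(torch.randn(widths[0])))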
Variants of neural networks:¶
- Residual neural networks add skip (shortcut) connections, so that a block's input is added directly to its output and the block only needs to learn a residual correction.
- Convolutional neural networks restrict some of the linear transformations $W_l$ to be members of some subset of linear transformations, typically convolutions with some filter.
- Recurrent neural networks repeat the same layers to process a sequence.
- Transformers use attention blocks to process sequences and spatially/temporally structured data in a unified way.
Transformers¶
Designed to process sequential data, but can generalize to any sort of structured data.
Represents an example as a matrix in $\R^{n \times d}$ where $n$ is the sequence length (a.k.a. $n$ "tokens") and $d$ is the representation dimension. A mini-batch of $B$ examples is then a tensor in $\R^{B \times n \times d}$.
Most characteristic layer: attention layer (more formally, "Scaled Dot-Product Attention"). Given input activation matrices $Q \in \mathbb{R}^{n \times d_k}$ (the "query" matrix), $K \in \mathbb{R}^{n \times d_k}$ (the "key" matrix), and $V \in \mathbb{R}^{n \times d_v}$ (the "value" matrix), the attention layer outputs $$\operatorname{Attention}(Q,K,V) = \operatorname{softmax}\left( \frac{Q K^T}{\sqrt{d_k}} \right) V,$$ where this softmax applies along the rows of the matrix (i.e. each row of $\operatorname{softmax}( \cdot )$ sums to $1$). You can think of this as a "soft" or "weighted" lookup. This formulation lets every token (every sequence element) look up into every other one: if we want to restrict this, we can use an attention mask $M \in \mathbb{R}^{n \times n}$, usually with elements in $\{-\infty, 0\}$, and set $$\operatorname{MaskedAttention}(Q,K,V) = \operatorname{softmax}\left( \frac{Q K^T}{\sqrt{d_k}} + M \right) V.$$ This "zeros out" the entries of $\operatorname{softmax}( \cdot )$ for which $M_{ij} = -\infty$.
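Here is a minimal sketch of (masked) scaled dot-product attention following the formula above; the sizes $n$, $d_k$, and $d_v$ below are made up for illustration.
# a minimal sketch of (masked) scaled dot-product attention
def attention(Q, K, V, M=None):
    # Q: (n, d_k), K: (n, d_k), V: (n, d_v), optional M: (n, n) with entries in {-inf, 0}
    scores = Q @ K.t() / (Q.shape[-1] ** 0.5)   # (n, n) matrix of scaled dot products
    if M is not None:
        scores = scores + M                     # masked entries become -inf
    weights = torch.softmax(scores, dim=-1)     # each row sums to 1
    return weights @ V                          # (n, d_v): a weighted lookup into V

n, d_k, d_v = 5, 8, 8
print(attention(torch.randn(n, d_k), torch.randn(n, d_k), torch.randn(n, d_v)).shape)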
Multiple attention layers are combined together to form a multi-head attention layer. Such a layer with $h$ "heads" takes as input tensors $Q \in \mathbb{R}^{n \times h \times d_k}$, $K \in \mathbb{R}^{n \times h \times d_k}$, and $V \in \mathbb{R}^{n \times h \times d_v}$, and outputs a tensor of size $(n \times h \times d_v)$ such that $$\operatorname{MultiHeadAttention}(Q,K,V)_{:,i,:} = \operatorname{MaskedAttention}(Q_{:,i,:},K_{:,i,:},V_{:,i,:});$$ that is, it's just $h$ attention layers running in parallel along the head dimension.
A typical multi-head attention block with representation dimension $d$ and number of heads $h$ (where $h$ evenly divides $d$) has $d_k = d_v = d/h$ and is parameterized by four matrices: $W_K \in \mathbb{R}^{d \times d}$, $W_Q \in \mathbb{R}^{d \times d}$, $W_V \in \mathbb{R}^{d \times d}$ and $W_O \in \mathbb{R}^{d \times d}$. Given input $X \in \mathbb{R}^{n \times d}$, it outputs $$\operatorname{MultiHeadAttention}(X W_Q^T, X W_K^T, X W_V^T) W_O^T$$ where here we reshape $\operatorname{MultiHeadAttention}$ to operate on matrices like $Q \in \mathbb{R}^{n \times hd_k}$ rather than on tensors.
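A minimal sketch of such a block, reusing the single-head attention function sketched above and splitting the $d$-dimensional projections into $h$ heads of width $d/h$; the dimensions and weights below are made up.
# a minimal sketch of a multi-head attention block with projections W_Q, W_K, W_V, W_O
def multi_head_attention_block(X, W_Q, W_K, W_V, W_O, h):
    n, d = X.shape
    d_head = d // h
    Q, K, V = X @ W_Q.t(), X @ W_K.t(), X @ W_V.t()    # each (n, d)
    # split into h heads of width d/h and run attention independently on each head
    heads = [attention(Q[:, i*d_head:(i+1)*d_head],
                       K[:, i*d_head:(i+1)*d_head],
                       V[:, i*d_head:(i+1)*d_head]) for i in range(h)]
    return torch.cat(heads, dim=-1) @ W_O.t()          # (n, d)

d, h, n = 16, 4, 5
projs = [torch.randn(d, d) for _ in range(4)]          # W_Q, W_K, W_V, W_O (hypothetical)
print(multi_head_attention_block(torch.randn(n, d), *projs, h).shape)  # torch.Size([5, 16])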
Typical transformers alternate attention layers with multi-layer perceptrons (MLPs) operating independently along token positions. E.g. for Llama, $$\operatorname{MLP}(X; W_{\text{up}}, W_{\text{gate}}, W_{\text{down}}) = ((X W_{\text{up}}^T) \odot \operatorname{SiLU}(X W_{\text{gate}}^T)) W_{\text{down}}^T.$$
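A minimal sketch of this gated MLP, with made-up dimensions ($d$ is the representation dimension and d_ff the hidden width):
# a minimal sketch of the Llama-style gated MLP above; d, d_ff, and the weights are made up
def llama_mlp(X, W_up, W_gate, W_down):
    return ((X @ W_up.t()) * torch.nn.functional.silu(X @ W_gate.t())) @ W_down.t()

d, d_ff, n = 16, 64, 5
W_up, W_gate, W_down = torch.randn(d_ff, d), torch.randn(d_ff, d), torch.randn(d, d_ff)
print(llama_mlp(torch.randn(n, d), W_up, W_gate, W_down).shape)  # torch.Size([5, 16])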
Position¶
How do transformers handle positions? Three main approaches:
- Absolute positional encoding/embedding
- Relative positional encoding (e.g. Rotary Position Embedding RoPE)
- Causal attention masks
An absolute positional embedding combines a token embedding with an embedding of its position, e.g. by adding or concatenating them (that is, the number of distinct possible "inputs" to the transformer is equal to the vocabulary size times the number of positions).
A relative positional embedding modifies the attention mechanism to take the position into account. Main idea: let $U$ be some fixed orthogonal matrix (rotation matrix, $U^{-1} = U^T$). For a query row vector $q$ at position $m$ and key row vector $k$ at position $n$, modify them by doing $$\hat q = q U^m \;\;\text{and}\;\; \hat k = k U^n.$$ Then, when we take their dot product in the attention operator, we'll get $$\hat q (\hat k)^T = q U^m (k U^n)^T = q U^m (U^n)^T k^T = q U^m U^{-n} k^T = q U^{m-n} k^T.$$ That is, this entry of $Q K^T$ is transformed in a way that depends only on the relative positional difference between the key and query vectors!
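We can check this identity numerically with a small sketch, using an arbitrary $2 \times 2$ rotation matrix as $U$ and made-up positions $m$ and $n$:
# a numerical check of q U^m (k U^n)^T = q U^(m-n) k^T; the angle and positions are arbitrary
import math
theta = 0.3
U = torch.tensor([[math.cos(theta), -math.sin(theta)],
                  [math.sin(theta),  math.cos(theta)]])   # a 2x2 rotation matrix
q, k = torch.randn(1, 2), torch.randn(1, 2)               # query and key row vectors
m, n = 7, 3                                               # their (hypothetical) positions
q_hat = q @ torch.linalg.matrix_power(U, m)
k_hat = k @ torch.linalg.matrix_power(U, n)
print(q_hat @ k_hat.t())                                  # left-hand side
print(q @ torch.linalg.matrix_power(U, m - n) @ k.t())    # right-hand side: same value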
A causal attention mask also modifies the attention mechanism, letting each query token depend only on key tokens at its own position or in the past. This is critical for efficient autoregressive modeling! (Why?)
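For instance, a causal mask for a length-$n$ sequence puts $-\infty$ strictly above the diagonal; a minimal sketch (it could be passed as the mask $M$ in the attention sketch above):
# a minimal sketch of a causal attention mask: query i may attend only to keys j <= i
n = 5
M = torch.triu(torch.full((n, n), float('-inf')), diagonal=1)   # -inf strictly above the diagonal
print(M)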
Layer normalization¶
Deep neural networks can have problems during training because of higher-order effects that arise from the composition of many layers. Small changes made to earlier layers can have a major impact on what happens later in the network. This makes training difficult, because we often want to set the step size to be very small to damp out these higher-order effects, but this also makes training slow. One way to address this is layer normalization, which reparameterizes a deep neural network to reduce these effects.
Suppose that we are concerned with a single (vector) activation in the network. Let $x \in \R^d$ denote the vector of activations of the original network. RMS normalization (one variant of layer normalization) replaces this $x$ with $$\mathsf{RMSNorm}(x; w) = \frac{x}{\sqrt{\epsilon + \frac{1}{d} \sum_{i=1}^d x_i^2}} \odot w = \frac{x}{\sqrt{\epsilon + \frac{1}{d} \norm{x}^2}} \odot w$$ where $\epsilon$ is some small fixed positive number (e.g. $10^{-6}$) used to avoid dividing by zero, and $w \in \R^d$ is a weight vector of the RMSNorm layer. We're basically just rescaling the signal by its root-mean-square magnitude, then reweighting each coordinate by $w$.
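A minimal sketch of RMSNorm following the formula above (with an arbitrary $\epsilon$ and made-up dimensions):
# a minimal sketch of RMSNorm; eps, d, and the weights are made up for illustration
def rms_norm(x, w, eps=1e-6):
    return x / torch.sqrt(eps + (x * x).mean()) * w   # divide x by its root-mean-square, then scale by w

d = 8
x, w = torch.randn(d), torch.ones(d)
print(rms_norm(x, w))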
Another, much weirder type of normalization is batch normalization, which normalizes across the batch dimension instead.
Autoregressive models: the language-modeling head.¶
Maps the final hidden state at each position to a prediction over the vocabulary for the next token in the sequence.
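A minimal sketch of what this head computes, with made-up dimensions (here $d$ is the hidden size and $V$ the vocabulary size):
# a minimal sketch of a language-modeling head; d, V, n, and the weights are made up
d, V, n = 16, 100, 5
W_lm = torch.randn(V, d)                                 # the head's weight matrix
H = torch.randn(n, d)                                    # final hidden states for n tokens
next_token_probs = torch.softmax(H @ W_lm.t(), dim=-1)   # (n, V): next-token distribution at each position
print(next_token_probs.shape)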
The Key Feature of Autoregressive Transformers: the KV Cache¶
Since models are causal and MLP layers are per-token, later token predictions depend on earlier ones only through the attention mechanism.
The attention mechanism depends on past tokens only through the $K$ and $V$ tensors.
To make future predictions during inference, we only need to save the $K$ and $V$ values from the past! We can get rid of all the other intermediates.
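A minimal sketch of the idea (not the transformers library's actual cache implementation): each time a new token arrives, we append its key and value rows to the cache, and the new query attends over everything cached so far. All shapes below are made up for illustration.
# a minimal sketch of a KV cache for a single attention head
d_k, d_v = 8, 8
K_cache = torch.zeros(0, d_k)   # cached keys from past tokens
V_cache = torch.zeros(0, d_v)   # cached values from past tokens

def attend_one_step(q_new, k_new, v_new):
    global K_cache, V_cache
    K_cache = torch.cat([K_cache, k_new])             # grow the cache by one row
    V_cache = torch.cat([V_cache, v_new])
    scores = q_new @ K_cache.t() / (d_k ** 0.5)       # (1, t) for t cached tokens
    return torch.softmax(scores, dim=-1) @ V_cache    # (1, d_v)

for t in range(4):   # simulate generating four tokens one at a time
    out = attend_one_step(torch.randn(1, d_k), torch.randn(1, d_k), torch.randn(1, d_v))
print(out.shape, K_cache.shape)   # torch.Size([1, 8]) torch.Size([4, 8])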
Putting it all together. The Llama-3 Model: Demo.¶
model_name = 'meta-llama/Meta-Llama-3.1-8B-Instruct'
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name,local_files_only=True)
model = transformers.AutoModelForCausalLM.from_pretrained(model_name,torch_dtype='auto',local_files_only=True)
model
LlamaForCausalLM(
(model): LlamaModel(
(embed_tokens): Embedding(128256, 4096)
(layers): ModuleList(
(0-31): 32 x LlamaDecoderLayer(
(self_attn): LlamaAttention(
(q_proj): Linear(in_features=4096, out_features=4096, bias=False)
(k_proj): Linear(in_features=4096, out_features=1024, bias=False)
(v_proj): Linear(in_features=4096, out_features=1024, bias=False)
(o_proj): Linear(in_features=4096, out_features=4096, bias=False)
)
(mlp): LlamaMLP(
(gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
(up_proj): Linear(in_features=4096, out_features=14336, bias=False)
(down_proj): Linear(in_features=14336, out_features=4096, bias=False)
(act_fn): SiLU()
)
(input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
(post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
)
)
(norm): LlamaRMSNorm((4096,), eps=1e-05)
(rotary_emb): LlamaRotaryEmbedding()
)
(lm_head): Linear(in_features=4096, out_features=128256, bias=False)
)
Grouped-Query Attention¶
Reduces the cost of the KV cache by using each $K$ and $V$ vector multiple times across multiple queries.
Makes the $Q$ tensor larger than the $K$ and $V$ tensors.
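A minimal sketch of the idea, with made-up head counts (for reference, Llama-3.1-8B itself uses 32 query heads and 8 key/value heads, which is why k_proj and v_proj above map 4096 down to 1024):
# a minimal sketch of grouped-query attention: several query heads share each KV head
n, d_head, n_q_heads, n_kv_heads = 5, 4, 8, 2        # all made up for illustration
Q = torch.randn(n, n_q_heads, d_head)
K = torch.randn(n, n_kv_heads, d_head)               # the KV cache only needs these smaller tensors
V = torch.randn(n, n_kv_heads, d_head)
# repeat each KV head so it is shared by n_q_heads // n_kv_heads query heads
K_exp = K.repeat_interleave(n_q_heads // n_kv_heads, dim=1)   # (n, n_q_heads, d_head)
V_exp = V.repeat_interleave(n_q_heads // n_kv_heads, dim=1)
scores = torch.einsum('qhd,khd->hqk', Q, K_exp) / (d_head ** 0.5)
out = torch.einsum('hqk,khd->qhd', torch.softmax(scores, dim=-1), V_exp)
print(out.shape)   # torch.Size([5, 8, 4])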
prompt = 'Please tell me all the ways Sun Wukong became immortal in Journey to the West.'
input = tokenizer(f'<|start_header_id|>user<|end_header_id|>\n\n{prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>', return_tensors='pt')
input
{'input_ids': tensor([[128000, 128006, 882, 128007, 271, 5618, 3371, 757, 682,
279, 5627, 8219, 468, 3178, 647, 6244, 60214, 304,
43680, 311, 279, 4410, 13, 128009, 128006, 78191, 128007]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1]])}
tokenizer.batch_decode(input['input_ids'].t())
['<|begin_of_text|>', '<|start_header_id|>', 'user', '<|end_header_id|>', '\n\n', 'Please', ' tell', ' me', ' all', ' the', ' ways', ' Sun', ' W', 'uk', 'ong', ' became', ' immortal', ' in', ' Journey', ' to', ' the', ' West', '.', '<|eot_id|>', '<|start_header_id|>', 'assistant', '<|end_header_id|>']
output = model.generate(**input, max_length=256)
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
output
tensor([[128000, 128006, 882, 128007, 271, 5618, 3371, 757, 682,
279, 5627, 8219, 468, 3178, 647, 6244, 60214, 304,
43680, 311, 279, 4410, 13, 128009, 128006, 78191, 128007,
271, 32, 2294, 3488, 922, 832, 315, 8620, 17649,
596, 1455, 28530, 5885, 2268, 11439, 311, 279, 11670,
11775, 43680, 311, 279, 4410, 11, 8219, 468, 3178,
647, 11, 1101, 3967, 439, 279, 58937, 6342, 11,
6244, 60214, 1555, 264, 10824, 315, 813, 1866, 13736,
11, 24632, 4455, 11, 323, 279, 56650, 315, 5370,
409, 1385, 13, 5810, 527, 279, 1401, 5627, 568,
17427, 4998, 76052, 1473, 16, 13, 3146, 59204, 505,
264, 9998, 96618, 8219, 468, 3178, 647, 574, 9405,
505, 264, 9998, 304, 264, 16700, 26457, 11, 902,
574, 1071, 311, 387, 279, 1121, 315, 264, 24632,
25885, 13, 1115, 19018, 7342, 13160, 1461, 439, 264,
1694, 449, 24674, 13736, 323, 18000, 627, 17, 13,
3146, 43066, 287, 279, 21594, 35257, 44187, 96618, 1666,
264, 3995, 39803, 11, 8219, 468, 3178, 647, 11352,
264, 24632, 14098, 430, 11938, 1461, 4998, 76052, 13,
578, 14098, 11, 3967, 439, 279, 21594, 35257, 44187,
320, 19171, 2663, 279, 330, 38120, 12, 3968, 3372,
44187, 1, 477, 330, 38120, 12, 3968, 3372, 44187,
315, 279, 18288, 61269, 4063, 574, 1071, 311, 6782,
279, 28591, 315, 279, 4330, 5540, 25, 7732, 11,
4027, 11, 9578, 11, 9501, 11, 323, 3090, 627,
18, 13, 3146, 38030, 449, 279, 23860, 1132, 3804,
71, 32973, 96618, 8219, 468, 3178, 647, 574, 3010,
11352, 555, 279, 23860, 1132, 3804, 71, 32973, 11,
264, 47841, 7491, 889]])
tokenizer.decode(output[0])
'<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nPlease tell me all the ways Sun Wukong became immortal in Journey to the West.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nA great question about one of Chinese literature\'s most beloved characters!\n\nAccording to the classic novel Journey to the West, Sun Wukong, also known as the Monkey King, became immortal through a combination of his own powers, magical events, and the blessings of various deities. Here are the key ways he achieved immortality:\n\n1. **Born from a stone**: Sun Wukong was born from a stone in a mountain cave, which was said to be the result of a magical phenomenon. This unusual birth marked him as a being with extraordinary powers and abilities.\n2. **Consuming the Five Elements Fruit**: As a young monkey, Sun Wukong discovered a magical fruit that granted him immortality. The fruit, known as the Five Elements Fruit (also called the "Five-Flavor Fruit" or "Five-Flavor Fruit of the Golden Lotus"), was said to contain the essence of the five elements: wood, fire, earth, metal, and water.\n3. **Training with the Patriarch Subhuti**: Sun Wukong was later discovered by the Patriarch Subhuti, a Buddhist master who'
Appendix/Review: Overfitting and Underfitting and Neural Networks.¶
- Underfitting informally means that the training error is high.
- Overfitting informally means that the difference between the test error and the training error is high.
- Capacity of a model informally refers to the ability of the model to fit a wide range of possible functions.
- Models with high capacity tend to overfit.
- Models with low capacity tend to underfit.
- The representational capacity of a parameterized class of models informally refers to the extent to which for a wide range of possible functions, some model in the class approximates that function well.
- Deep neural networks have very high representational capacity.
- In fact, they're universal approximators.
- The effective capacity of a parameterized class of models given a specific learning algorithm with a specific amount of data refers to the extent to which for a wide range of possible functions, the model in the class produced by the learning algorithm can approximate that function well.
- For convex optimization problems (what we've studied so far), all the algorithms we've studied converge to the global optimum, so effective capacity will be equal to representational capacity.
- On the other hand, changing the optimization algorithm for a deep learning task can alter the effective capacity. Even changing the hyperparameters of an algorithm can have this effect.
Take away point: In addition to thinking about how changes to a model or algorithm affect optimization parameters like $n$, $d$, and $\kappa$, we also need to reason about how these methods interact with the capacity of the model, especially when we're training a model with a non-convex loss.