Coverage for transformer_lens/components/bert_block.py: 89% (44 statements)
coverage.py v7.4.4, created at 2024-06-11 01:46 +0000
1"""Hooked Transformer Bert Block Component.
3This module contains all the component :class:`BertBlock`.
4"""
from typing import Optional

import torch
import torch.nn as nn
from jaxtyping import Float

from transformer_lens.components import MLP, Attention, LayerNorm
from transformer_lens.hook_points import HookPoint
from transformer_lens.HookedTransformerConfig import HookedTransformerConfig
from transformer_lens.utils import repeat_along_head_dimension


class BertBlock(nn.Module):
    """
    BERT Block. Similar to the TransformerBlock, except that the LayerNorms are applied after the attention and MLP, rather than before.
    """
    def __init__(self, cfg: HookedTransformerConfig):
        super().__init__()
        self.cfg = cfg

        self.attn = Attention(cfg)
        self.ln1 = LayerNorm(cfg)
        self.mlp = MLP(cfg)
        self.ln2 = LayerNorm(cfg)

        self.hook_q_input = HookPoint()  # [batch, pos, n_heads, d_model]
        self.hook_k_input = HookPoint()  # [batch, pos, n_heads, d_model]
        self.hook_v_input = HookPoint()  # [batch, pos, n_heads, d_model]

        self.hook_attn_out = HookPoint()  # [batch, pos, d_model]
        self.hook_mlp_in = HookPoint()  # [batch, pos, d_model]
        self.hook_mlp_out = HookPoint()  # [batch, pos, d_model]
        self.hook_resid_pre = HookPoint()  # [batch, pos, d_model]
        self.hook_resid_mid = HookPoint()  # [batch, pos, d_model]
        self.hook_resid_post = HookPoint()  # [batch, pos, d_model]
        self.hook_normalized_resid_post = HookPoint()  # [batch, pos, d_model]

    def forward(
        self,
        resid_pre: Float[torch.Tensor, "batch pos d_model"],
        additive_attention_mask: Optional[Float[torch.Tensor, "batch 1 1 pos"]] = None,
    ):
        resid_pre = self.hook_resid_pre(resid_pre)

        query_input = resid_pre
        key_input = resid_pre
        value_input = resid_pre
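
        # Optionally give each attention head its own hookable copy of the residual stream
        # (only taken when cfg.use_split_qkv_input is set).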
        if self.cfg.use_split_qkv_input:  # branch not covered: the condition was never true in this run
            n_heads = self.cfg.n_heads
            query_input = self.hook_q_input(repeat_along_head_dimension(query_input, n_heads))
            key_input = self.hook_k_input(repeat_along_head_dimension(key_input, n_heads))
            value_input = self.hook_v_input(repeat_along_head_dimension(value_input, n_heads))

        attn_out = self.hook_attn_out(
            self.attn(
                query_input,
                key_input,
                value_input,
                additive_attention_mask=additive_attention_mask,
            )
        )
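        # Post-LN ordering: the residual add happens first, and LayerNorm (ln1) is
        # applied afterwards, on the input to the MLP.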
        resid_mid = self.hook_resid_mid(resid_pre + attn_out)

        mlp_in = resid_mid if not self.cfg.use_hook_mlp_in else self.hook_mlp_in(resid_mid.clone())
        normalized_resid_mid = self.ln1(mlp_in)
        mlp_out = self.hook_mlp_out(self.mlp(normalized_resid_mid))
        resid_post = self.hook_resid_post(normalized_resid_mid + mlp_out)
        normalized_resid_post = self.hook_normalized_resid_post(self.ln2(resid_post))

        return normalized_resid_post
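
As a hedged illustration (not part of the covered module), the sketch below constructs a BertBlock from a minimal HookedTransformerConfig, registers a hook on hook_attn_out, and runs a random residual stream through the block. The config values and the BertBlock import path are assumptions and may differ between transformer_lens versions.

import torch

from transformer_lens.components import BertBlock  # import path assumed
from transformer_lens.HookedTransformerConfig import HookedTransformerConfig

# Hypothetical minimal config; the exact set of required fields may vary by version.
cfg = HookedTransformerConfig(
    n_layers=1,
    d_model=64,
    n_ctx=16,
    d_head=16,
    n_heads=4,
    d_mlp=256,
    act_fn="gelu",
)

block = BertBlock(cfg)

# Inspect the attention output via the block's hook point.
def print_attn_out_shape(tensor, hook):
    # tensor: [batch, pos, d_model]
    print("attn_out shape:", tuple(tensor.shape))
    return tensor

block.hook_attn_out.add_hook(print_attn_out_shape)

resid_pre = torch.randn(2, 16, cfg.d_model)  # [batch, pos, d_model]
normalized_resid_post = block(resid_pre)
print("output shape:", tuple(normalized_resid_post.shape))  # expected (2, 16, 64)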