Coverage for transformer_lens/components/t5_block.py: 88%
63 statements
coverage.py v7.4.4, created at 2024-06-11 01:46 +0000
from typing import Optional

import torch
import torch.nn as nn
from jaxtyping import Float

from transformer_lens.components import MLP, RMSNorm, T5Attention
from transformer_lens.hook_points import HookPoint
from transformer_lens.HookedTransformerConfig import HookedTransformerConfig
from transformer_lens.past_key_value_caching import HookedTransformerKeyValueCacheEntry
from transformer_lens.utils import repeat_along_head_dimension


class T5Block(nn.Module):
    """
    A T5 block. Uses the T5 variants of LayerNorm (RMSNorm) and attention (T5Attention)
    instead of the usual ones, and adds cross attention if is_decoder is True.
    """

    def __init__(self, cfg: HookedTransformerConfig, block_index: int, is_decoder: bool):
        super().__init__()
        self.cfg = cfg
        self.is_decoder = is_decoder

        self.ln1 = RMSNorm(cfg)
        self.attn = T5Attention(cfg, has_relative_attention_bias=block_index == 0)
        self.ln2 = RMSNorm(cfg)
        if self.is_decoder:
            self.cross_attn = T5Attention(cfg)
            self.ln3 = RMSNorm(cfg)
        self.mlp = MLP(cfg)  # [batch, pos, d_model]

        self.hook_q_input = HookPoint()  # [batch, pos, n_heads, d_model]
        self.hook_k_input = HookPoint()  # [batch, pos, n_heads, d_model]
        self.hook_v_input = HookPoint()  # [batch, pos, n_heads, d_model]

        self.hook_attn_in = HookPoint()  # [batch, pos, n_heads, d_model]
        self.hook_attn_out = HookPoint()  # [batch, pos, d_model]
        if self.is_decoder:
            self.hook_cross_attn_in = HookPoint()  # [batch, pos, d_model]
            self.hook_cross_attn_out = HookPoint()  # [batch, pos, d_model]
            self.hook_resid_mid_cross = HookPoint()  # [batch, pos, d_model]

        self.hook_mlp_in = HookPoint()  # [batch, pos, d_model]
        self.hook_mlp_out = HookPoint()  # [batch, pos, d_model]
        self.hook_resid_pre = HookPoint()  # [batch, pos, d_model]
        self.hook_resid_mid = HookPoint()  # [batch, pos, d_model]
        self.hook_resid_post = HookPoint()  # [batch, pos, d_model]

    def forward(
        self,
        resid_pre: Float[torch.Tensor, "batch pos d_model"],
        additive_attention_mask: Optional[Float[torch.Tensor, "batch 1 1 pos"]] = None,
        encoder_additive_attention_mask: Optional[
            Float[torch.Tensor, "batch 1 1 encoder_pos"]
        ] = None,
        position_bias: Optional[Float[torch.Tensor, "1 head_index pos kv_pos"]] = None,
        encoder_hidden_states: Optional[Float[torch.Tensor, "batch encoder_pos d_model"]] = None,
        past_kv_cache_entry: Optional[HookedTransformerKeyValueCacheEntry] = None,
    ) -> Float[torch.Tensor, "batch pos d_model"]:
61 """A single Transformer block.
63 Args:
64 resid_pre (torch.Tensor): The residual stream - shape [batch, pos, d_model]
65 encoder_hidden_states (torch.Tensor): The hidden states of the encoder for cross attention - shape [batch, encoder_pos, d_model]
66 cache (HookedTransformerKeyValueCache): A cache of previous keys and values, used only when generating text. Defaults to None.
67 attention_mask (torch.Tensor, optional): The attention mask for padded tokens. Defaults to None.
69 Returns:
70 _type_: _description_
71 """
        resid_pre = self.hook_resid_pre(resid_pre)  # [batch, pos, d_model]

        attn_in = resid_pre

        if self.cfg.use_attn_in:  # coverage: line 76 never jumped to line 77 (condition never true)
            attn_in = self.hook_attn_in(
                repeat_along_head_dimension(resid_pre, n_heads=self.cfg.n_heads)
            )

        if self.cfg.use_split_qkv_input:  # coverage: line 81 never jumped to line 82
            n_kv_heads = (
                self.cfg.n_key_value_heads
                if self.cfg.n_key_value_heads is not None
                else self.cfg.n_heads
            )
            query_input = self.hook_q_input(
                repeat_along_head_dimension(resid_pre, n_heads=self.cfg.n_heads)
            )
            key_input = self.hook_k_input(
                repeat_along_head_dimension(resid_pre, n_heads=n_kv_heads)
            )
            value_input = self.hook_v_input(
                repeat_along_head_dimension(resid_pre, n_heads=n_kv_heads)
            )
        else:
            query_input = attn_in
            key_input = attn_in
            value_input = attn_in

        attn_out = self.hook_attn_out(
            # hook the residual stream states that are used to calculate the
            # queries, keys and values, independently.
            # Then take the layer norm of these inputs, and pass these to the attention module.
            self.attn(
                query_input=self.ln1(query_input),
                key_input=self.ln1(key_input),
                value_input=self.ln1(value_input),
                past_kv_cache_entry=past_kv_cache_entry,
                additive_attention_mask=additive_attention_mask,
                position_bias=position_bias,
            )
        )

        # [batch, pos, d_model]
        resid_mid = self.hook_resid_mid(resid_pre + attn_out)  # [batch, pos, d_model]

        if self.is_decoder:
            cross_attn_in = (
                resid_mid
                if not self.cfg.use_attn_in
                else self.hook_cross_attn_in(resid_mid.clone())
            )

            if encoder_hidden_states is None:  # coverage: line 126 never jumped to line 127 (condition never true)
                raise ValueError("Encoder hidden states must be provided for cross attention!")

            cross_attn_out = self.hook_cross_attn_out(
                self.cross_attn(
                    query_input=self.ln2(cross_attn_in),
                    key_input=encoder_hidden_states,
                    value_input=encoder_hidden_states,
                    additive_attention_mask=encoder_additive_attention_mask,
                )
            )
            resid_mid_cross = self.hook_resid_mid_cross(resid_mid + cross_attn_out)

            mlp_in = (
                resid_mid_cross
                if not self.cfg.use_hook_mlp_in
                else self.hook_mlp_in(resid_mid_cross.clone())
            )

            normalized_resid_mid = self.ln3(mlp_in)
        else:
            mlp_in = (
                resid_mid if not self.cfg.use_hook_mlp_in else self.hook_mlp_in(resid_mid.clone())
            )
            normalized_resid_mid = self.ln2(mlp_in)

        mlp_out = self.hook_mlp_out(self.mlp(normalized_resid_mid))  # [batch, pos, d_model]
        resid_post = self.hook_resid_post(mlp_in + mlp_out)  # [batch, pos, d_model]

        return resid_post
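
The partial branches flagged above are the use_attn_in path (line 76), the use_split_qkv_input path (line 81), and the missing-encoder-states check (line 126). Below is a minimal sketch of how the block could be exercised directly to hit the first two. It is illustrative rather than an existing test: the relative-attention config fields and the assumption that T5Attention computes its own position bias when position_bias is None are assumptions, and the small config values are arbitrary.

import torch

from transformer_lens.components.t5_block import T5Block
from transformer_lens.HookedTransformerConfig import HookedTransformerConfig

# Toy config; values are arbitrary, and the relative-attention fields are
# assumed to exist on HookedTransformerConfig (block_index == 0 gives the
# block a relative attention bias, so T5Attention needs them).
cfg = HookedTransformerConfig(
    n_layers=1,
    d_model=16,
    n_ctx=8,
    d_head=4,
    n_heads=4,
    d_mlp=32,
    act_fn="relu",
    d_vocab=100,
    relative_attention_num_buckets=32,
    relative_attention_max_distance=128,
    use_attn_in=True,          # exercises the branch at line 76
    use_split_qkv_input=True,  # exercises the branch at line 81
    use_hook_mlp_in=True,
)

decoder_block = T5Block(cfg, block_index=0, is_decoder=True)

batch, pos, encoder_pos = 2, 5, 7
resid_pre = torch.randn(batch, pos, cfg.d_model)
encoder_hidden_states = torch.randn(batch, encoder_pos, cfg.d_model)

# Passing encoder_hidden_states avoids the ValueError at line 127; calling the
# decoder block without it would exercise that raise path instead.
resid_post = decoder_block(resid_pre, encoder_hidden_states=encoder_hidden_states)
assert resid_post.shape == (batch, pos, cfg.d_model)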