Coverage for transformer_lens/components/abstract_attention.py: 80%
236 statements
coverage.py v7.4.4, created at 2024-11-19 14:42 +0000
1 import math
2 from abc import ABC
3 from typing import Dict, Optional, Tuple, Union
5 import einops
6 import torch
7 import torch.nn as nn
8 import torch.nn.functional as F
9 from better_abc import abstract_attribute
10 from jaxtyping import Float, Int
11 from transformers.utils import is_bitsandbytes_available
13 from transformer_lens.FactoredMatrix import FactoredMatrix
14 from transformer_lens.hook_points import HookPoint
15 from transformer_lens.HookedTransformerConfig import HookedTransformerConfig
16 from transformer_lens.past_key_value_caching import HookedTransformerKeyValueCacheEntry
17 from transformer_lens.utilities.attention import complex_attn_linear, simple_attn_linear
18 from transformer_lens.utils import get_offset_position_ids
20 if is_bitsandbytes_available():  # partial branch: 20 ↛ 21 (condition on line 20 was never true)
21 import bitsandbytes as bnb
22 from bitsandbytes.nn.modules import Params4bit
25 class AbstractAttention(ABC, nn.Module):
26 alibi: Union[torch.Tensor, None]
28 def __init__(
29 self,
30 cfg: Union[Dict, HookedTransformerConfig],
31 attn_type: str = "global",
32 layer_id: Optional[int] = None,
33 ):
34 """Abstract Base Class of Attention Blocks, featuring common functionality of both Attention and GroupedQueryAttention blocks.
36 Query and Output projections are defined in this class as they are the same for regular and grouped query attention.
37 Attributes related to Key and Value projections are abstract as their implementations may differ. For example, in GroupedQueryAttention there are fewer key and value heads than query heads.
38 To enforce implementation of W_K, W_V, b_K, and b_V by child classes, the better_abc.abstract_attribute helper is used. See here for details: https://stackoverflow.com/questions/23831510/abstract-attribute-not-property.
40 Args:
41 cfg (Union[Dict, HookedTransformerConfig]): Config
42 attn_type (str, optional): "global" or "local", used by GPT-Neo. Local attention means the model can only attend back cfg.window_size tokens (here, 256). Not used by any other model at the moment. Defaults to "global".
43 layer_id (int, optional): The index of the current layer. Used by the Mistral models (labelled here as stanford-gpt2) to scale down attention scores pre softmax for numerical stability reasons by 1/(layer_id+1). Defaults to None.
44 """
45 super().__init__()
46 self.cfg = HookedTransformerConfig.unwrap(cfg)
48 if self.cfg.load_in_4bit:  # partial branch: 48 ↛ 49 (condition on line 48 was never true)
49 nq = int((self.cfg.d_model * self.cfg.d_head * self.cfg.n_heads) / 2)
50 self.W_Q = Params4bit(torch.empty(nq, 1, dtype=torch.uint8), requires_grad=False)
51 self.W_O = Params4bit(torch.empty(nq, 1, dtype=torch.uint8), requires_grad=False)
52 else:
53 self.W_Q = nn.Parameter(
54 torch.empty(
55 self.cfg.n_heads,
56 self.cfg.d_model,
57 self.cfg.d_head,
58 dtype=self.cfg.dtype,
59 )
60 )
61 self.W_O = nn.Parameter(
62 torch.empty(
63 self.cfg.n_heads,
64 self.cfg.d_head,
65 self.cfg.d_model,
66 dtype=self.cfg.dtype,
67 )
68 )
69 self.W_K = abstract_attribute()
70 self.W_V = abstract_attribute()
72 self.b_Q = nn.Parameter(
73 torch.zeros(self.cfg.n_heads, self.cfg.d_head, dtype=self.cfg.dtype)
74 )
75 self.b_K: nn.Parameter = abstract_attribute()
76 self.b_V: nn.Parameter = abstract_attribute()
77 self.b_O = nn.Parameter(torch.zeros(self.cfg.d_model, dtype=self.cfg.dtype))
79 self.attn_type = attn_type
80 # Create a max_ctx x max_ctx mask, with True iff that query position
81 # can attend to that key position (query is first axis, key is second axis)
82 causal_mask = torch.tril(torch.ones((self.cfg.n_ctx, self.cfg.n_ctx)).bool())
83 if self.attn_type == "global":
84 # For global attention, this is a lower triangular matrix - key <= query
85 self.register_buffer("mask", causal_mask)
86 elif self.attn_type == "local":  # partial branch: 86 ↛ 92 (condition on line 86 was never false)
87 # For local, this is banded, query - window_size < key <= query
88 if not isinstance(self.cfg.window_size, int):  # partial branch: 88 ↛ 89 (condition on line 88 was never true)
89 raise ValueError("Window size must be an integer for local attention")
90 self.register_buffer("mask", torch.triu(causal_mask, 1 - self.cfg.window_size))
91 else:
92 raise ValueError(f"Invalid attention type: {self.attn_type}")
94 self.register_buffer("IGNORE", torch.tensor(-torch.inf))
96 self.layer_id = layer_id
98 # attn_scale is a constant that we divide the attention scores by pre-softmax. The usual choice is sqrt(d_head): it keeps the variance of the query-key dot products roughly independent of d_head, so the softmax doesn't saturate and stays numerically stable.
99 if self.cfg.use_attn_scale:
100 self.attn_scale = self.cfg.attn_scale # Defaults to sqrt(d_head)
101 else:
102 self.attn_scale = 1.0
103 if self.cfg.scale_attn_by_inverse_layer_idx:
104 if self.layer_id is None:  # keep mypy happy; partial branch: 104 ↛ 105 (condition on line 104 was never true)
105 raise ValueError("Layer ID must be provided to scale attention scores")
106 self.attn_scale *= self.layer_id + 1
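# Worked example (editor's sketch, hypothetical values): with use_attn_scale and d_head=64, attn_scale
# defaults to sqrt(64) = 8, so scores are divided by 8. If scale_attn_by_inverse_layer_idx is also set
# and layer_id=2, the divisor becomes 8 * (2 + 1) = 24, i.e. scores are additionally scaled by
# 1/(layer_id + 1) as described in the class docstring.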
108 self.hook_k = HookPoint() # [batch, pos, head_index, d_head]
109 self.hook_q = HookPoint() # [batch, pos, head_index, d_head]
110 self.hook_v = HookPoint() # [batch, pos, head_index, d_head]
111 self.hook_z = HookPoint() # [batch, pos, head_index, d_head]
112 self.hook_attn_scores = HookPoint() # [batch, head_index, query_pos, key_pos]
113 self.hook_pattern = HookPoint() # [batch, head_index, query_pos, key_pos]
114 self.hook_result = HookPoint() # [batch, pos, head_index, d_model]
116 # See HookedTransformerConfig for more details.
117 if self.cfg.positional_embedding_type == "shortformer":
118 # This tracks the input to the keys and queries, which is resid_pre + pos_embeds
119 self.hook_attn_input = HookPoint() # [batch, pos, d_model]
120 elif self.cfg.positional_embedding_type == "rotary":
121 # Applies a rotation to each two-element chunk of keys and queries pre dot producting to bake in relative position. See HookedTransformerConfig for details
122 self.hook_rot_k = HookPoint()
123 self.hook_rot_q = HookPoint()
124 if self.cfg.rotary_dim is None:  # keep mypy happy; partial branch: 124 ↛ 125 (condition on line 124 was never true)
125 raise ValueError("Rotary dim must be provided for rotary positional embeddings")
126 sin, cos = self.calculate_sin_cos_rotary(
127 self.cfg.rotary_dim,
128 self.cfg.n_ctx,
129 base=self.cfg.rotary_base,
130 dtype=self.cfg.dtype,
131 )
132 self.register_buffer("rotary_sin", sin)
133 self.register_buffer("rotary_cos", cos)
134 elif self.cfg.positional_embedding_type == "alibi":
135 # ALiBi bias will be constructed on the first forward pass.
136 # Note: While computationally efficient, initializing a bias at the maximum n_ctx, e.g. shape (16, 1024, 1024) in float32, occupies ~64MiB of contiguous GPU memory, which may not be optimal for memory usage.
137 self.alibi = None
139 elif self.cfg.positional_embedding_type == "relative_positional_bias":
140 # will be overwritten by the child T5Attention class
141 self.has_relative_attention_bias = False
143 @property
144 def OV(self) -> FactoredMatrix:
145 """
146 OV-Circuit, as defined in A Mathematical Framework. Because there's no non-linearity between the value vector and the output of the layer, the output is purely determined by the matrix W_OV = W_V @ W_O, and not W_V or W_O individually. (Mathematically, for a single head, output == pattern @ residual @ W_V @ W_O, see the glossary for more)
148 Done in the order W_V, W_O because the paper uses left-multiplying weight matrices, and TransformerLens uses right-multiplying, sorry!
150 Returns a FactoredMatrix, with left matrix W_V [head_index, d_model, d_head] and right matrix W_O [head_index, d_head, d_model] - this is a low rank factorisation of the underlying [head_index, d_model, d_model]. FactoredMatrix has helper functions to deal with these large matrices efficiently. To get the OV circuit of a head k, attn.OV[k] works.
151 """
152 return FactoredMatrix(self.W_V, self.W_O)
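# Usage sketch (editor's example; assumes a loaded HookedTransformer named `model`, which is not part
# of this file):
#   >>> ov = model.blocks[0].attn.OV      # FactoredMatrix of shape [n_heads, d_model, d_model]
#   >>> head_ov = ov[3]                   # low-rank OV circuit of head 3, as noted in the docstring
#   >>> full = head_ov.AB                 # materialise the [d_model, d_model] product only if needed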
154 @property
155 def QK(self) -> FactoredMatrix:
156 """
157 QK-Circuit, as defined in A Mathematical Framework. Because there's no non-linearity in the key-query dot product, the output is purely determined by the matrix W_QK = W_Q.T @ W_K, and not W_Q or W_K individually. (Mathematically, for a single head, pattern = destination_residual.T @ W_Q.T @ W_K @ source-residual, see the glossary for more).
159 Done in the order Q on the left, K on the right, because the pattern has dimensions [destination_pos, source_pos]
161 Returns a FactoredMatrix, with left matrix W_Q [head_index, d_model, d_head] and right matrix W_K.T [head_index, d_head, d_model] - this is a low rank factorisation of the underlying [head_index, d_model, d_model] matrix. FactoredMatrix has helper functions to deal with these large matrices efficiently. To get the QK circuit of a head k, attn.QK[k] works.
162 """
163 W_K_transpose = einops.rearrange(
164 self.W_K, "head_index d_model d_head -> head_index d_head d_model"
165 )
166 return FactoredMatrix(self.W_Q, W_K_transpose)
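# Usage sketch (same hypothetical `model` as above; editor's example): the QK circuit of head k is
# attn.QK[k], a low-rank [d_model, d_model] FactoredMatrix mapping destination-side residual
# directions to source-side residual directions.
#   >>> qk_head = model.blocks[0].attn.QK[3]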
168 def forward(
169 self,
170 query_input: Union[
171 Float[torch.Tensor, "batch pos d_model"],
172 Float[torch.Tensor, "batch pos head_index d_model"],
173 ],
174 key_input: Union[
175 Float[torch.Tensor, "batch kv_pos d_model"],
176 Float[torch.Tensor, "batch kv_pos head_index d_model"],
177 Float[torch.Tensor, "batch kv_pos kv_head_index d_model"],
178 ],
179 value_input: Union[
180 Float[torch.Tensor, "batch kv_pos d_model"],
181 Float[torch.Tensor, "batch kv_pos head_index d_model"],
182 Float[torch.Tensor, "batch kv_pos kv_head_index d_model"],
183 ],
184 past_kv_cache_entry: Optional[HookedTransformerKeyValueCacheEntry] = None,
185 additive_attention_mask: Optional[Float[torch.Tensor, "batch 1 1 kv_pos"]] = None,
186 attention_mask: Optional[Int[torch.Tensor, "batch offset_pos"]] = None,
187 position_bias: Optional[Float[torch.Tensor, "1 head_index pos kv_pos"]] = None,
188 ) -> Float[torch.Tensor, "batch pos d_model"]:
189 """
190 shortformer_pos_embed is only used if self.cfg.positional_embedding_type == "shortformer", else defaults to None and is irrelevant. See HookedTransformerConfig for more details
191 past_kv_cache_entry is an optional entry of past keys and values for this layer, only relevant if generating text. Defaults to None
192 additive_attention_mask is an optional mask to add to the attention weights. Defaults to None.
193 attention_mask is the attention mask for padded tokens. Defaults to None.
194 """
196 q, k, v = self.calculate_qkv_matrices(query_input, key_input, value_input)
198 if past_kv_cache_entry is not None:
199 # Appends the new keys and values to the cached values, and automatically updates the cache
200 kv_cache_pos_offset = past_kv_cache_entry.past_keys.size(1)
201 k, v = past_kv_cache_entry.append(k, v)
202 else:
203 # Not using a cache
204 kv_cache_pos_offset = 0
206 if self.cfg.positional_embedding_type == "rotary":
207 q = self.hook_rot_q(self.apply_rotary(q, kv_cache_pos_offset, attention_mask))
208 k = self.hook_rot_k(
209 self.apply_rotary(k, 0, attention_mask)
210 ) # keys are cached so no offset
212 if self.cfg.dtype not in [torch.float32, torch.float64]:  # partial branch: 212 ↛ 214 (condition on line 212 was never true)
213 # If using 16 bits, increase the precision to avoid numerical instabilities
214 q = q.to(torch.float32)
215 k = k.to(torch.float32)
217 attn_scores = self.calculate_attention_scores(
218 q, k
219 ) # [batch, head_index, query_pos, key_pos]
221 if self.cfg.positional_embedding_type == "alibi":
222 query_ctx = attn_scores.size(-2)
223 # The key context length is the number of positions in the past - this includes all positions in the cache
224 key_ctx = attn_scores.size(-1)
226 # only recompute when necessary to increase efficiency.
227 if self.alibi is None or key_ctx > self.alibi.size(-1):  # partial branch: 227 ↛ 233 (condition on line 227 was never false)
228 self.alibi = AbstractAttention.create_alibi_bias(
229 self.cfg.n_heads, key_ctx, self.cfg.device
230 )
232 # Take the last query_ctx positions so it also works with past_kv_cache
233 attn_scores += self.alibi[
234 :, -query_ctx:, :key_ctx
235 ] # [batch, head_index, query_pos, key_pos]
236 elif self.cfg.positional_embedding_type == "relative_positional_bias":
237 if position_bias is None:
238 if self.has_relative_attention_bias:  # partial branch: 238 ↛ 239 (condition on line 238 was never true)
239 raise ValueError("Positional bias is required for relative_positional_bias")
240 else:
241 position_bias = torch.zeros(
242 1,
243 self.cfg.n_heads,
244 attn_scores.shape[2],
245 attn_scores.shape[3],
246 device=attn_scores.device,
247 )
249 attn_scores += position_bias
250 if self.cfg.attention_dir == "causal":
251 # If causal attention, we mask it to only attend backwards. If bidirectional, we don't mask.
252 attn_scores = self.apply_causal_mask(
253 attn_scores, kv_cache_pos_offset, attention_mask
254 ) # [batch, head_index, query_pos, key_pos]
255 if additive_attention_mask is not None:
256 attn_scores += additive_attention_mask
258 attn_scores = self.hook_attn_scores(attn_scores)
259 pattern = F.softmax(attn_scores, dim=-1)
260 pattern = torch.where(torch.isnan(pattern), torch.zeros_like(pattern), pattern)
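# Why the torch.where above exists (editor's note): a query row whose keys are all masked (e.g. a
# fully padded position) has every score set to -inf, and softmax over an all -inf row returns NaN:
#   >>> F.softmax(torch.tensor([float("-inf"), float("-inf")]), dim=-1)
#   tensor([nan, nan])
# Replacing those NaNs with zeros gives such positions an all-zero pattern instead of propagating NaN.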
261 pattern = self.hook_pattern(pattern) # [batch, head_index, query_pos, key_pos]
262 pattern = pattern.to(self.cfg.dtype)
263 pattern = pattern.to(v.device)
264 z = self.calculate_z_scores(v, pattern) # [batch, pos, head_index, d_head]
265 if not self.cfg.use_attn_result:
266 if self.cfg.load_in_4bit:  # partial branch: 266 ↛ 268
267 # call bitsandbytes method to dequantize and multiply
268 out = (
269 bnb.matmul_4bit(
270 z.reshape(z.shape[0], z.shape[1], self.cfg.d_head * self.cfg.n_heads),
271 self.W_O.t(),
272 # bias=self.W_O.t(),
273 bias=None,
274 quant_state=self.W_O.quant_state,
275 )
276 + self.b_O
277 )
278 else:
279 w = einops.rearrange(
280 self.W_O, "head_index d_head d_model -> d_model (head_index d_head)"
281 )
282 out = F.linear(
283 z.reshape(z.shape[0], z.shape[1], self.cfg.d_head * self.cfg.n_heads),
284 w,
285 self.b_O,
286 )
287 else:
288 # Explicitly calculate the attention result so it can be accessed by a hook
289 # This is off by default because it can easily eat through your GPU memory.
290 if self.cfg.load_in_4bit:  # partial branch: 290 ↛ 291 (condition on line 290 was never true)
291 result = self.hook_result(
292 bnb.matmul_4bit(
293 z.reshape(z.shape[0], z.shape[1], self.cfg.d_head * self.cfg.n_heads),
294 self.W_O.t(),
295 bias=None,
296 quant_state=self.W_O.quant_state,
297 )
298 )
299 else:
300 w = einops.rearrange(
301 self.W_O,
302 "head_index d_head d_model -> d_model head_index d_head",
303 )
304 result = self.hook_result(
305 einops.einsum(
306 z,
307 w,
308 "... head_index d_head, d_model head_index d_head -> ... head_index d_model",
309 )
310 ) # [batch, pos, head_index, d_model]
311 out = (
312 einops.reduce(result, "batch position index model->batch position model", "sum")
313 + self.b_O
314 ) # [batch, pos, d_model]
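# Equivalence note (editor's sketch): both branches compute the same projection,
#   out[b, p, :] = sum_h z[b, p, h, :] @ W_O[h] + b_O
# The flattened F.linear form stacks the per-head blocks of W_O, while the hook_result path first
# materialises each per-head term z[b, p, h, :] @ W_O[h] so it can be inspected, at the cost of an
# extra [batch, pos, head_index, d_model] tensor.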
315 return out
317 def calculate_qkv_matrices(
318 self,
319 query_input: Union[
320 Float[torch.Tensor, "batch pos d_model"],
321 Float[torch.Tensor, "batch pos head_index d_model"],
322 ],
323 key_input: Union[
324 Float[torch.Tensor, "batch kv_pos d_model"],
325 Float[torch.Tensor, "batch kv_pos head_index d_model"],
326 ],
327 value_input: Union[
328 Float[torch.Tensor, "batch kv_pos d_model"],
329 Float[torch.Tensor, "batch kv_pos head_index d_model"],
330 ],
331 ) -> Tuple[
332 Float[torch.Tensor, "batch pos head_index d_head"],
333 Float[torch.Tensor, "batch kv_pos head_index d_head"],
334 Float[torch.Tensor, "batch kv_pos head_index d_head"],
335 ]:
336 attn_fn = (
337 complex_attn_linear
338 if self.cfg.use_split_qkv_input or self.cfg.use_attn_in
339 else simple_attn_linear
340 )
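# Editor's note, based on the input types in the signature above: simple_attn_linear projects a shared
# [batch, pos, d_model] residual stream for all heads at once, while complex_attn_linear handles the
# per-head inputs ([batch, pos, head_index, d_model]) produced when use_split_qkv_input or use_attn_in
# is enabled. Both return [batch, pos, head_index, d_head].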
341 if self.cfg.load_in_4bit:  # partial branch: 341 ↛ 342 (condition on line 341 was never true)
342 q = self.hook_q(
343 # call bitsandbytes method to dequantize and multiply
344 bnb.matmul_4bit(
345 query_input,
346 self.W_Q.t(),
347 bias=None,
348 quant_state=self.W_Q.quant_state,
349 ).reshape(
350 query_input.shape[0],
351 query_input.shape[1],
352 self.cfg.n_heads,
353 self.cfg.d_head,
354 )
355 + self.b_Q
356 )
357 else:
358 q = self.hook_q(attn_fn(query_input, self.W_Q, self.b_Q))
359 if self.cfg.load_in_4bit:  # partial branch: 359 ↛ 360 (condition on line 359 was never true)
360 if not isinstance(self.W_K, Params4bit):
361 raise ValueError("W_K must be a Params4bit object if load_in_4bit is True")
362 k = self.hook_k(
363 # call bitsandbytes method to dequantize and multiply
364 bnb.matmul_4bit(
365 key_input, self.W_K.t(), bias=None, quant_state=self.W_K.quant_state
366 ).reshape(
367 key_input.shape[0],
368 key_input.shape[1],
369 self.cfg.n_heads,
370 self.cfg.d_head,
371 )
372 + self.b_K
373 )
374 else:
375 k = self.hook_k(attn_fn(key_input, self.W_K, self.b_K))
377 if self.cfg.load_in_4bit:  # partial branch: 377 ↛ 378 (condition on line 377 was never true)
378 if not isinstance(self.W_V, Params4bit):
379 raise ValueError("W_V must be a Params4bit object if load_in_4bit is True")
380 v = self.hook_v(
381 # call bitsandbytes method to dequantize and multiply
382 bnb.matmul_4bit(
383 value_input,
384 self.W_V.t(),
385 bias=None,
386 quant_state=self.W_V.quant_state,
387 ).reshape(
388 value_input.shape[0],
389 value_input.shape[1],
390 self.cfg.n_heads,
391 self.cfg.d_head,
392 )
393 + self.b_V
394 )
395 else:
396 v = self.hook_v(attn_fn(value_input, self.W_V, self.b_V))
398 return q, k, v
400 def calculate_attention_scores(
401 self,
402 q: Float[torch.Tensor, "batch query_pos head_index d_head"],
403 k: Float[torch.Tensor, "batch key_pos head_index d_head"],
404 ) -> Float[torch.Tensor, "batch head_index query_pos key_pos"]:
405 q_ = einops.rearrange(
406 q, "batch query_pos head_index d_head -> batch head_index query_pos d_head"
407 )
408 k_ = einops.rearrange(
409 k, "batch key_pos head_index d_head -> batch head_index d_head key_pos"
410 )
411 attn_scores = q_ @ k_ / self.attn_scale
412 if self.cfg.attn_scores_soft_cap > 0:  # partial branch: 412 ↛ 413 (condition on line 412 was never true)
413 attn_scores = self.cfg.attn_scores_soft_cap * F.tanh(
414 attn_scores / self.cfg.attn_scores_soft_cap
415 )
416 return attn_scores
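# Worked example (editor's sketch, hypothetical cap): with attn_scores_soft_cap = 50, a raw score of
# 200 becomes 50 * tanh(200 / 50) = 50 * tanh(4) ≈ 49.97, so scores are smoothly squashed into
# (-cap, cap) while small scores pass through almost unchanged (tanh(x) ≈ x for |x| << 1).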
418 def calculate_z_scores(
419 self,
420 v: Float[torch.Tensor, "batch key_pos head_index d_head"],
421 pattern: Float[torch.Tensor, "batch head_index query_pos key_pos"],
422 ) -> Float[torch.Tensor, "batch query_pos head_index d_head"]:
423 v_ = einops.rearrange(
424 v, "batch key_pos head_index d_head -> batch head_index key_pos d_head"
425 )
426 pattern_ = einops.rearrange(
427 pattern,
428 "batch head_index query_pos key_pos -> batch head_index query_pos key_pos",
429 )
430 z = self.hook_z(
431 einops.rearrange(
432 pattern_ @ v_,
433 "batch head_index query_pos d_head -> batch query_pos head_index d_head",
434 )
435 )
436 return z
438 def apply_causal_mask(
439 self,
440 attn_scores: Float[torch.Tensor, "batch head_index pos pos_plus_past_kv_pos_offset"],
441 past_kv_pos_offset: int = 0,
442 attention_mask: Optional[Int[torch.Tensor, "batch offset_pos"]] = None,
443 ):
444 # The query context length is the number of positions we take queries from - if not using a past_kv_cache this is just the context length (for the current prompt), but if we're caching it can be different.
445 query_ctx_length = attn_scores.size(-2)
446 # The key context length is the number of positions in the past - this includes all positions in the cache
447 # If not caching, query_ctx_length == key_ctx_length
448 key_ctx_length = attn_scores.size(-1)
450 if query_ctx_length + past_kv_pos_offset != key_ctx_length:  # partial branch: 450 ↛ 451 (condition on line 450 was never true)
451 raise ValueError(
452 f"query_ctx_length {query_ctx_length} + past_kv_pos_offset {past_kv_pos_offset} != key_ctx_length {key_ctx_length} - you likely have a bug."
453 )
455 # Index back to front to ensure local attention works
456 final_mask = self.mask[None, None, -query_ctx_length:, -key_ctx_length:] # [1, 1, pos, pos]
457 if attention_mask is not None:
458 # Apply a causal mask to the attention scores considering the padding
459 einsum_str = "batch head pos offset_pos, batch offset_pos -> batch head pos offset_pos"
460 final_mask = final_mask.to(attention_mask.device)
461 final_mask = einops.einsum(final_mask, attention_mask, einsum_str).bool()
463 attn_scores = attn_scores.to(final_mask.device)
464 return torch.where(final_mask, attn_scores, self.IGNORE)
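# Editor's note: after the einsum above, final_mask[b, h, q, k] is truthy only when key position k is
# both causally visible from query q (per self.mask) and not a padding token (attention_mask[b, k] == 1),
# so padded keys get IGNORE (-inf) in every row and receive zero probability after the softmax.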
466 def calculate_sin_cos_rotary(
467 self,
468 rotary_dim: int,
469 n_ctx: int,
470 base: int = 10000,
471 dtype: torch.dtype = torch.float32,
472 ) -> Tuple[Float[torch.Tensor, "n_ctx rotary_dim"], Float[torch.Tensor, "n_ctx rotary_dim"]]:
473 """
474 Calculate the sine and cosine waves to use in a rotary embedding. See https://blog.eleuther.ai/rotary-embeddings/ for details
476 Note: For some inexplicable reason, in GPT-J each ADJACENT pair of elements in k and q are rotated, in GPT-NeoX the pair of elements at k and k+n//2 are rotated (ie folding the full length in half, and then looking at pairs accordingly). I have absolutely no clue why, it should be completely equivalent.
477 To resolve this, I've coded it to default to the GPT-J mode, but to explicitly check whether it's GPT-NeoX and then do the GPT-NeoX thing if it is.
478 """
479 high_precision = torch.float32 if dtype != torch.float64 else torch.float64
480 pos = torch.arange(n_ctx, dtype=high_precision)
481 dim = torch.arange(rotary_dim // 2, dtype=high_precision)
483 # Llama-3.1 uses NTK-by-Parts Rotary Embedding introduced in Section 3.2 in https://arxiv.org/pdf/2309.00071
484 # Implementation copied from https://github.com/huggingface/transformers/blob/v4.46.0/src/transformers/modeling_rope_utils.py#L310
485 if self.cfg.use_NTK_by_parts_rope:  # partial branch: 485 ↛ 486 (condition on line 485 was never true)
486 inv_freq = 1.0 / (
487 base ** (torch.arange(0, rotary_dim, 2, dtype=torch.int64).float() / rotary_dim)
488 )
489 factor = self.cfg.NTK_by_parts_factor
490 low_freq_factor = self.cfg.NTK_by_parts_low_freq_factor
491 high_freq_factor = self.cfg.NTK_by_parts_high_freq_factor
492 old_context_len = n_ctx
494 low_freq_wavelen = old_context_len / low_freq_factor
495 high_freq_wavelen = old_context_len / high_freq_factor
497 wavelen = 2 * math.pi / inv_freq
498 inv_freq_llama = torch.where(wavelen > low_freq_wavelen, inv_freq / factor, inv_freq)
499 smooth_factor = (old_context_len / wavelen - low_freq_factor) / (
500 high_freq_factor - low_freq_factor
501 )
502 smoothed_inv_freq = (
503 1 - smooth_factor
504 ) * inv_freq_llama / factor + smooth_factor * inv_freq_llama
505 is_medium_freq = ~(wavelen < high_freq_wavelen) * ~(wavelen > low_freq_wavelen)
506 inv_freq_llama = torch.where(is_medium_freq, smoothed_inv_freq, inv_freq_llama)
507 freq = 1 / inv_freq_llama
508 else:
509 freq = base ** (dim / (rotary_dim / 2))
510 if self.cfg.rotary_adjacent_pairs:  # partial branch: 510 ↛ 511 (condition on line 510 was never true)
511 freq = einops.repeat(freq, "d -> (d 2)")
512 else:
513 freq = einops.repeat(freq, "d -> (2 d)")
514 # Create a n_ctx x rotary_dim tensor, where each column is an arithmetic sequence of angles in that frequency
515 angles = pos[:, None] / freq[None, :]
516 return torch.sin(angles).to(dtype), torch.cos(angles).to(dtype)
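# Worked example (editor's sketch, hypothetical sizes): with rotary_dim=4, base=10000 and the default
# non-adjacent "(2 d)" layout, dim = [0, 1] gives freq = 10000 ** ([0, 1] / 2) = [1, 100], repeated to
# [1, 100, 1, 100]. Position p then gets angles [p / 1, p / 100, p / 1, p / 100], i.e. each rotary pair
# rotates at its own wavelength, in the spirit of the EleutherAI rotary post linked in the docstring.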
518 def rotate_every_two(
519 self, x: Float[torch.Tensor, "... rotary_dim"]
520 ) -> Float[torch.Tensor, "... rotary_dim"]:
521 """
522 Rotary helper function, splits x into blocks of size 2 along the final axis and maps [x0, x1] to [-x1, x0]
524 The final axis of x must have even length.
526 GPT-NeoX and GPT-J do rotary subtly differently, see calculate_sin_cos_rotary for details.
527 """
528 rot_x = x.clone()
529 if self.cfg.rotary_adjacent_pairs:  # partial branch: 529 ↛ 530 (condition on line 529 was never true)
530 rot_x[..., ::2] = -x[..., 1::2]
531 rot_x[..., 1::2] = x[..., ::2]
532 else:
533 n = x.size(-1) // 2
534 rot_x[..., :n] = -x[..., n:]
535 rot_x[..., n:] = x[..., :n]
537 return rot_x
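# Editor's sketch of the two layouts on a length-4 vector x = [x0, x1, x2, x3]:
#   rotary_adjacent_pairs=True  (GPT-J style):    [-x1,  x0, -x3,  x2]   # adjacent pairs (x0, x1), (x2, x3)
#   rotary_adjacent_pairs=False (GPT-NeoX style): [-x2, -x3,  x0,  x1]   # pairs (x0, x2), (x1, x3): halves swapped and negated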
539 def apply_rotary(
540 self,
541 x: Float[torch.Tensor, "batch pos head_index d_head"],
542 past_kv_pos_offset=0,
543 attention_mask: Optional[Int[torch.Tensor, "batch offset_pos"]] = None,
544 ) -> Float[torch.Tensor, "batch pos head_index d_head"]:
545 # Only apply rotary to first rotary_dim dimensions (eg, if rotary_dim=64 and d_head=256, only apply to first 1/4 of dimensions)
546 x_pos = x.size(1)
547 x_rot = x[..., : self.cfg.rotary_dim]
548 x_pass = x[..., self.cfg.rotary_dim :]
549 x_flip = self.rotate_every_two(x_rot)
551 if attention_mask is None:
552 rotary_cos = self.rotary_cos[
553 None, past_kv_pos_offset : past_kv_pos_offset + x_pos, None, :
554 ]
555 rotary_sin = self.rotary_sin[
556 None, past_kv_pos_offset : past_kv_pos_offset + x_pos, None, :
557 ]
558 x_rotated = x_rot * rotary_cos + x_flip * rotary_sin
559 else:
560 offset_position_ids = get_offset_position_ids(past_kv_pos_offset, attention_mask)
561 offset_position_ids = offset_position_ids.to(self.rotary_cos.device)
562 mask_rotary_cos = self.rotary_cos[offset_position_ids, None, :]
563 mask_rotary_sin = self.rotary_sin[offset_position_ids, None, :]
564 x_rotated = x_rot * mask_rotary_cos + x_flip * mask_rotary_sin
566 return torch.cat([x_rotated, x_pass], dim=-1)
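# Editor's note: for one GPT-J-style pair (a, b) with angle t, the line x_rotated = x_rot * cos + x_flip * sin
# expands to (a*cos(t) - b*sin(t), b*cos(t) + a*sin(t)), the standard 2D rotation by t; dimensions past
# rotary_dim (x_pass) are concatenated back unrotated.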
568 @staticmethod
569 def create_alibi_slope(
570 n_ctx: int, device: Optional[Union[str, torch.device]] = None
571 ) -> Float[torch.Tensor, "query key"]:
572 """Create an ALiBi Slope Matrix.
574 Create the slope matrix used in ALiBi, before it is multiplied by the head-specific scalar.
576 See :meth:`create_alibi_bias` for the full ALiBi bias calculation.
578 Examples:
580 >>> AbstractAttention.create_alibi_slope(3)
581 tensor([[ 0., 0., 0.],
582 [-1., 0., 0.],
583 [-2., -1., 0.]])
585 >>> AbstractAttention.create_alibi_slope(4)
586 tensor([[ 0., 0., 0., 0.],
587 [-1., 0., 0., 0.],
588 [-2., -1., 0., 0.],
589 [-3., -2., -1., 0.]])
591 Args:
592 n_ctx: The maximum number of tokens in a prompt.
594 Returns:
595 A tensor of shape (n_ctx, n_ctx), where the upper triangle is zero and the lower
596 triangle is decreasing by a constant slope of 1 (towards the bottom left corner).
597 """
598 # set rows as [[0,1,2...]]
599 rows = torch.arange(n_ctx, device=device).unsqueeze(0)
601 # Set cols as [[0],[1],[2]...]
602 cols = torch.arange(n_ctx, device=device).unsqueeze(1)
604 # Use broadcasting to create the desired lower triangular part of the matrix
605 slope_matrix = rows - cols
607 # Use the clamp method to set all positive values (upper right triangle) to zero
608 return slope_matrix.clamp(max=0).to(torch.float32)
610 @staticmethod
611 def create_alibi_multipliers(
612 n_heads: int, device: Optional[Union[str, torch.device]] = None
613 ) -> Float[torch.Tensor, "head_idx"]:
614 """Create the ALiBi Scalar Multipliers for each Head.
616 For n heads, the set of multipliers (m) is the geometric sequence that starts at 2^(-8/n), and
617 uses that same value as its ratio. For example, with 8 heads the values would be [1/(2^1),
618 1/(2^2), ... , 1/(2^8)]. With 16 heads the values would be [1/(2^0.5), 1/(2^1), ... , 1/(2^8)].
620 See :meth:`create_alibi_bias` for the full ALiBi bias calculation.
622 Examples:
624 >>> AbstractAttention.create_alibi_multipliers(8)
625 tensor([0.5000, 0.2500, 0.1250, 0.0625, 0.0312, 0.0156, 0.0078, 0.0039])
627 >>> AbstractAttention.create_alibi_multipliers(16)
628 tensor([0.7071, 0.5000, 0.3536, 0.2500, 0.1768, 0.1250, 0.0884, 0.0625, 0.0442, 0.0312,
629 0.0221, 0.0156, 0.0110, 0.0078, 0.0055, 0.0039])
631 Args:
632 n_heads: The number of heads in a layer.
633 device: The device to create the tensor on.
635 Returns:
636 A tensor of shape (n_heads,) containing the scalar multiplier for each head.
637 """
638 # Calculate the starting value
639 start = 2 ** (-8 / n_heads)
641 # Generate the indices [0, 1, ..., n_heads-1]
642 indices = torch.arange(n_heads, device=device)
644 # Compute the multipliers, with the starting value being the same as the ratio
645 multipliers = start * (start**indices)
647 return multipliers
649 @staticmethod
650 def create_alibi_bias(
651 n_heads: int, n_ctx: int, device: Optional[Union[torch.device, str]] = None
652 ) -> Float[torch.Tensor, "head_idx query key"]:
653 """Create the ALiBi Bias for all Heads.
655 Calculate the ALiBi bias (https://arxiv.org/pdf/2108.12409.pdf) for all heads in a layer.
657 The broad idea behind ALiBi is to remove the positional encoding from the original transformer
658 model, and instead apply a bias to each attention score. This bias is proportional to the
659 distance between the query and key (i.e. it encourages paying less attention to more distant
660 tokens), and is added to the attention scores before the softmax. It is used in models such as
661 Bloom.
663 Examples:
665 >>> AbstractAttention.create_alibi_bias(2, 4, torch.device('cpu'))
666 tensor([[[ 0.0000, 0.0000, 0.0000, 0.0000],
667 [-0.0625, 0.0000, 0.0000, 0.0000],
668 [-0.1250, -0.0625, 0.0000, 0.0000],
669 [-0.1875, -0.1250, -0.0625, 0.0000]],
670 [[ 0.0000, 0.0000, 0.0000, 0.0000],
671 [-0.0039, 0.0000, 0.0000, 0.0000],
672 [-0.0078, -0.0039, 0.0000, 0.0000],
673 [-0.0117, -0.0078, -0.0039, 0.0000]]])
675 Args:
676 n_heads: The number of heads in a layer.
677 n_ctx: The maximum number of tokens in a prompt.
678 device: The device to create the tensor on.
680 Returns:
681 The ALiBi bias that should be added to the attention scores before the softmax.
682 """
683 # Create the slope matrix
684 slope: Float[torch.Tensor, "query key"] = AbstractAttention.create_alibi_slope(
685 n_ctx, device
686 )
688 # Create the scalar multiplier for each head.
689 multipliers: Float[torch.Tensor, "head_idx"] = AbstractAttention.create_alibi_multipliers(
690 n_heads, device
691 )
693 # The ALiBi bias is then m * slope_matrix
694 alibi_bias = torch.einsum("ij,k->kij", slope, multipliers)
696 return alibi_bias
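# Consistency check (editor's note): for the 2-head example in the docstring above,
# create_alibi_multipliers(2) gives start = 2 ** (-8 / 2) = 0.0625, so the multipliers are
# [0.0625, 0.00390625]; scaling the slope matrix by these reproduces the -0.0625-per-step and
# -0.0039-per-step rows shown in the example output of create_alibi_bias.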