Coverage for transformer_lens/components/bert_block.py: 89%

44 statements  

coverage.py v7.4.4, created at 2024-06-11 01:46 +0000

1"""Hooked Transformer Bert Block Component. 

2 

3This module contains all the component :class:`BertBlock`. 

4""" 

from typing import Optional

import torch
import torch.nn as nn
from jaxtyping import Float

from transformer_lens.components import MLP, Attention, LayerNorm
from transformer_lens.hook_points import HookPoint
from transformer_lens.HookedTransformerConfig import HookedTransformerConfig
from transformer_lens.utils import repeat_along_head_dimension


class BertBlock(nn.Module):
    """
    BERT Block. Similar to the TransformerBlock, except that the LayerNorms are applied after the attention and MLP, rather than before.
    """

    def __init__(self, cfg: HookedTransformerConfig):
        super().__init__()
        self.cfg = cfg

        self.attn = Attention(cfg)
        self.ln1 = LayerNorm(cfg)
        self.mlp = MLP(cfg)
        self.ln2 = LayerNorm(cfg)

        self.hook_q_input = HookPoint()  # [batch, pos, n_heads, d_model]
        self.hook_k_input = HookPoint()  # [batch, pos, n_heads, d_model]
        self.hook_v_input = HookPoint()  # [batch, pos, n_heads, d_model]

        self.hook_attn_out = HookPoint()  # [batch, pos, d_model]
        self.hook_mlp_in = HookPoint()  # [batch, pos, d_model]
        self.hook_mlp_out = HookPoint()  # [batch, pos, d_model]
        self.hook_resid_pre = HookPoint()  # [batch, pos, d_model]
        self.hook_resid_mid = HookPoint()  # [batch, pos, d_model]
        self.hook_resid_post = HookPoint()  # [batch, pos, d_model]
        self.hook_normalized_resid_post = HookPoint()  # [batch, pos, d_model]

    def forward(
        self,
        resid_pre: Float[torch.Tensor, "batch pos d_model"],
        additive_attention_mask: Optional[Float[torch.Tensor, "batch 1 1 pos"]] = None,
    ):
        resid_pre = self.hook_resid_pre(resid_pre)

        query_input = resid_pre
        key_input = resid_pre
        value_input = resid_pre

        if self.cfg.use_split_qkv_input:  # coverage: this condition was never true in this run, so the branch body was not executed
            n_heads = self.cfg.n_heads
            query_input = self.hook_q_input(repeat_along_head_dimension(query_input, n_heads))
            key_input = self.hook_k_input(repeat_along_head_dimension(key_input, n_heads))
            value_input = self.hook_v_input(repeat_along_head_dimension(value_input, n_heads))

        attn_out = self.hook_attn_out(
            self.attn(
                query_input,
                key_input,
                value_input,
                additive_attention_mask=additive_attention_mask,
            )
        )
        resid_mid = self.hook_resid_mid(resid_pre + attn_out)

        mlp_in = resid_mid if not self.cfg.use_hook_mlp_in else self.hook_mlp_in(resid_mid.clone())
        normalized_resid_mid = self.ln1(mlp_in)
        mlp_out = self.hook_mlp_out(self.mlp(normalized_resid_mid))
        resid_post = self.hook_resid_post(normalized_resid_mid + mlp_out)
        normalized_resid_post = self.hook_normalized_resid_post(self.ln2(resid_post))

        return normalized_resid_post
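
To make the docstring's point about LayerNorm placement concrete: the block is post-LN, i.e. each LayerNorm is applied to the residual stream after the corresponding sublayer's output has been added, rather than to the sublayer's input as in the pre-LN TransformerBlock. Below is a minimal sketch of that ordering in plain PyTorch with stand-in attention and MLP modules (attn_fn and mlp_fn are hypothetical placeholders, not the TransformerLens components); only the arithmetic mirrors forward() above.

import torch
import torch.nn as nn

d_model = 16
ln1, ln2 = nn.LayerNorm(d_model), nn.LayerNorm(d_model)
attn_fn = nn.Linear(d_model, d_model)  # stand-in for self.attn(query, key, value, ...)
mlp_fn = nn.Sequential(nn.Linear(d_model, 4 * d_model), nn.GELU(), nn.Linear(4 * d_model, d_model))

resid_pre = torch.randn(2, 5, d_model)                            # [batch, pos, d_model]
resid_mid = resid_pre + attn_fn(resid_pre)                        # residual + attention output
normalized_resid_mid = ln1(resid_mid)                             # LayerNorm applied after attention
resid_post = normalized_resid_mid + mlp_fn(normalized_resid_mid)  # residual + MLP output
normalized_resid_post = ln2(resid_post)                           # LayerNorm applied after the MLP
print(normalized_resid_post.shape)                                # torch.Size([2, 5, 16])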
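
The one branch not exercised in this run is the use_split_qkv_input path. Going by the hook shape comments ([batch, pos, n_heads, d_model]), repeat_along_head_dimension appears to broadcast the [batch, pos, d_model] residual stream to one copy per attention head, so that hook_q_input, hook_k_input, and hook_v_input can be patched per head. A rough sketch of that reshaping, assuming einops semantics and not claiming to match the utility's actual source:

import torch
from einops import repeat

def repeat_along_head_dimension_sketch(x: torch.Tensor, n_heads: int) -> torch.Tensor:
    # [batch, pos, d_model] -> [batch, pos, n_heads, d_model]: one identical copy per head,
    # so downstream hooks can edit the query/key/value inputs of individual heads.
    return repeat(x, "batch pos d_model -> batch pos n_heads d_model", n_heads=n_heads)

x = torch.randn(2, 5, 16)  # [batch, pos, d_model]
y = repeat_along_head_dimension_sketch(x, n_heads=8)
print(y.shape)             # torch.Size([2, 5, 8, 16])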