Coverage for transformer_lens/components/t5_block.py: 88%

63 statements  

coverage.py v7.4.4, created at 2024-06-11 01:46 +0000
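
A report like this one is produced by coverage.py's HTML reporter. Below is a minimal sketch of the Python API; the project's actual test invocation, source filter, and output directory are assumptions for illustration, not taken from this report:

import coverage

cov = coverage.Coverage(source=["transformer_lens"])  # measure only the package
cov.start()
# ... run the test suite here, e.g. pytest.main(["tests"]) ...
cov.stop()
cov.save()
cov.html_report(directory="htmlcov")  # emits per-file pages like this one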

from typing import Optional

import torch
import torch.nn as nn
from jaxtyping import Float

from transformer_lens.components import MLP, RMSNorm, T5Attention
from transformer_lens.hook_points import HookPoint
from transformer_lens.HookedTransformerConfig import HookedTransformerConfig
from transformer_lens.past_key_value_caching import HookedTransformerKeyValueCacheEntry
from transformer_lens.utils import repeat_along_head_dimension



class T5Block(nn.Module):
    """
    T5 block. Uses RMSNorm (the T5-style LayerNorm) and T5Attention instead of the
    usual LayerNorm and Attention, and also uses cross attention if is_decoder is True.
    """

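    # Only block 0 is built with has_relative_attention_bias=True; as in the reference
    # T5 implementation, later blocks are expected to reuse the first block's bias via
    # the `position_bias` argument to forward().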

    def __init__(self, cfg: HookedTransformerConfig, block_index: int, is_decoder: bool):
        super().__init__()
        self.cfg = cfg
        self.is_decoder = is_decoder

        self.ln1 = RMSNorm(cfg)
        self.attn = T5Attention(cfg, has_relative_attention_bias=block_index == 0)
        self.ln2 = RMSNorm(cfg)
        if self.is_decoder:
            self.cross_attn = T5Attention(cfg)
            self.ln3 = RMSNorm(cfg)
        self.mlp = MLP(cfg)  # [batch, pos, d_model]

        self.hook_q_input = HookPoint()  # [batch, pos, n_heads, d_model]
        self.hook_k_input = HookPoint()  # [batch, pos, n_heads, d_model]
        self.hook_v_input = HookPoint()  # [batch, pos, n_heads, d_model]

        self.hook_attn_in = HookPoint()  # [batch, pos, d_model]
        self.hook_attn_out = HookPoint()  # [batch, pos, d_model]
        if self.is_decoder:
            self.hook_cross_attn_in = HookPoint()  # [batch, pos, d_model]
            self.hook_cross_attn_out = HookPoint()  # [batch, pos, d_model]
            self.hook_resid_mid_cross = HookPoint()  # [batch, pos, d_model]

        self.hook_mlp_in = HookPoint()  # [batch, pos, d_model]
        self.hook_mlp_out = HookPoint()  # [batch, pos, d_model]
        self.hook_resid_pre = HookPoint()  # [batch, pos, d_model]
        self.hook_resid_mid = HookPoint()  # [batch, pos, d_model]
        self.hook_resid_post = HookPoint()  # [batch, pos, d_model]


    def forward(
        self,
        resid_pre: Float[torch.Tensor, "batch pos d_model"],
        additive_attention_mask: Optional[Float[torch.Tensor, "batch 1 1 pos"]] = None,
        encoder_additive_attention_mask: Optional[
            Float[torch.Tensor, "batch 1 1 encoder_pos"]
        ] = None,
        position_bias: Optional[Float[torch.Tensor, "1 head_index pos kv_pos"]] = None,
        encoder_hidden_states: Optional[Float[torch.Tensor, "batch encoder_pos d_model"]] = None,
        past_kv_cache_entry: Optional[HookedTransformerKeyValueCacheEntry] = None,
    ) -> Float[torch.Tensor, "batch pos d_model"]:
        """A single Transformer block.

        Args:
            resid_pre (torch.Tensor): The residual stream - shape [batch, pos, d_model]
            additive_attention_mask (torch.Tensor, optional): An additive mask for padded tokens in self-attention - shape [batch, 1, 1, pos]. Defaults to None.
            encoder_additive_attention_mask (torch.Tensor, optional): An additive mask for padded tokens in the encoder sequence, used by cross attention - shape [batch, 1, 1, encoder_pos]. Defaults to None.
            position_bias (torch.Tensor, optional): The T5 relative position bias added to the attention scores; computed by the first block and reused by later blocks - shape [1, head_index, pos, kv_pos]. Defaults to None.
            encoder_hidden_states (torch.Tensor, optional): The hidden states of the encoder, used as keys and values for cross attention - shape [batch, encoder_pos, d_model]. Required when is_decoder is True. Defaults to None.
            past_kv_cache_entry (HookedTransformerKeyValueCacheEntry, optional): A cache of previous keys and values, used only when generating text. Defaults to None.

        Returns:
            torch.Tensor: The residual stream after this block - shape [batch, pos, d_model]
        """

        resid_pre = self.hook_resid_pre(resid_pre)  # [batch, pos, d_model]

        attn_in = resid_pre

        # coverage: this branch was never taken (the condition was never true in the test run)
        if self.cfg.use_attn_in:
            attn_in = self.hook_attn_in(
                repeat_along_head_dimension(resid_pre, n_heads=self.cfg.n_heads)
            )

        # coverage: this branch was never taken
        if self.cfg.use_split_qkv_input:
            n_kv_heads = (
                self.cfg.n_key_value_heads
                if self.cfg.n_key_value_heads is not None
                else self.cfg.n_heads
            )
            query_input = self.hook_q_input(
                repeat_along_head_dimension(resid_pre, n_heads=self.cfg.n_heads)
            )
            key_input = self.hook_k_input(
                repeat_along_head_dimension(resid_pre, n_heads=n_kv_heads)
            )
            value_input = self.hook_v_input(
                repeat_along_head_dimension(resid_pre, n_heads=n_kv_heads)
            )
        else:
            query_input = attn_in
            key_input = attn_in
            value_input = attn_in

        attn_out = self.hook_attn_out(
            # hook the residual stream states that are used to calculate the
            # queries, keys and values, independently.
            # Then take the layer norm of these inputs, and pass these to the attention module.
            self.attn(
                query_input=self.ln1(query_input),
                key_input=self.ln1(key_input),
                value_input=self.ln1(value_input),
                past_kv_cache_entry=past_kv_cache_entry,
                additive_attention_mask=additive_attention_mask,
                position_bias=position_bias,
            )
        )  # [batch, pos, d_model]

        resid_mid = self.hook_resid_mid(resid_pre + attn_out)  # [batch, pos, d_model]

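        # Decoder blocks add a cross-attention sublayer between self-attention and the MLP.
        # Its keys and values come directly from `encoder_hidden_states`; no extra
        # normalization is applied to them inside this block.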

        if self.is_decoder:
            cross_attn_in = (
                resid_mid
                if not self.cfg.use_attn_in
                else self.hook_cross_attn_in(resid_mid.clone())
            )

            # coverage: this branch was never taken (encoder_hidden_states was always provided)
            if encoder_hidden_states is None:
                raise ValueError("Encoder hidden states must be provided for cross attention!")

            cross_attn_out = self.hook_cross_attn_out(
                self.cross_attn(
                    query_input=self.ln2(cross_attn_in),
                    key_input=encoder_hidden_states,
                    value_input=encoder_hidden_states,
                    additive_attention_mask=encoder_additive_attention_mask,
                )
            )
            resid_mid_cross = self.hook_resid_mid_cross(resid_mid + cross_attn_out)

            mlp_in = (
                resid_mid_cross
                if not self.cfg.use_hook_mlp_in
                else self.hook_mlp_in(resid_mid_cross.clone())
            )

            normalized_resid_mid = self.ln3(mlp_in)
        else:
            mlp_in = (
                resid_mid if not self.cfg.use_hook_mlp_in else self.hook_mlp_in(resid_mid.clone())
            )
            normalized_resid_mid = self.ln2(mlp_in)

        mlp_out = self.hook_mlp_out(self.mlp(normalized_resid_mid))  # [batch, pos, d_model]
        resid_post = self.hook_resid_post(mlp_in + mlp_out)  # [batch, pos, d_model]

        return resid_post
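
A minimal smoke-test sketch for the block above, separate from the covered module. The HookedTransformerConfig field values are assumptions for illustration; in particular, the relative_attention_* field names are a guess at what T5Attention needs when has_relative_attention_bias=True, and may need adjusting to the installed TransformerLens version:

import torch

from transformer_lens.components.t5_block import T5Block
from transformer_lens.HookedTransformerConfig import HookedTransformerConfig

# Assumed minimal config; the relative_attention_* fields are hypothetical names for
# the settings behind T5Attention's learned relative position bias.
cfg = HookedTransformerConfig(
    n_layers=1,
    d_model=64,
    n_ctx=16,
    d_head=16,
    n_heads=4,
    d_mlp=256,
    d_vocab=128,
    act_fn="relu",
    relative_attention_num_buckets=32,
    relative_attention_max_distance=128,
)

# block_index=0 so the block computes its own relative position bias; for a decoder
# block, pass is_decoder=True and supply encoder_hidden_states in the forward call.
block = T5Block(cfg, block_index=0, is_decoder=False)
resid = torch.randn(2, 16, cfg.d_model)
resid_post = block(resid)
print(resid_post.shape)  # expected: torch.Size([2, 16, 64])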