Coverage for transformer_lens/HookedTransformerConfig.py: 87%

1"""Hooked Transformer Config. 

2 

3Module with a dataclass for storing the configuration of a 

4:class:`transformer_lens.HookedTransformer` model. 

5""" 

6 

7from __future__ import annotations 

8 

9import logging 

10import pprint 

11import random 

12from dataclasses import dataclass 

13from typing import Any, Dict, List, Optional, Union 

14 

15import numpy as np 

16import torch 

17 

18from transformer_lens import utils 

19 

20SUPPORTED_ACTIVATIONS = ["relu", "gelu", "silu", "gelu_new", "solu_ln", "gelu_fast"] 

21 

22 

23@dataclass 23 ↛ 25line 23 didn't jump to line 25, because

24class HookedTransformerConfig: 

25 """ 

26 Configuration class to store the configuration of a HookedTransformer model. 

27 

28 See further_comments.md for more details on the more complex arguments. 

29 

30 Args: 

31 d_model (int): The dimensionality of the embeddings. 

32 d_head (int): The dimensionality of each attention head. 

33 n_layers (int): The number of transformer blocks (one block = one attn layer AND one MLP layer). 

34 n_ctx (int): The maximum sequence length. 

35 n_heads (int): The number of attention heads. If not 

36 specified, will be set to d_model // d_head. (This is represented by a default value of -1) 

37 d_mlp (int, *optional*): The dimensionality of the feedforward mlp 

38 network. Defaults to 4 * d_model, and in an attn-only model is None. 

39 d_vocab (int): The size of the vocabulary. Defaults to -1, which means not set. If not set, will be 

40 automatically set from the tokenizer's vocab size. 

41 act_fn (str, *optional*): The activation function to use. Always 

42 lowercase. Supports ['relu', 'gelu', 'silu', 'gelu_new', 'solu_ln', 

43 'gelu_fast']. Must be set unless using an attn-only model. 

44 eps (float): The epsilon value to use for layer normalization. Defaults 

45 to 1e-5 

46 use_attn_result (bool): whether to explicitly calculate the amount 

47 each head adds to the residual stream (with a hook) and THEN add it 

48 up, vs just calculating the sum. This can be very memory intensive 

49 for large models, so defaults to False 

50 use_split_qkv_input (bool): whether to explicitly calculate the input of 

51 each head separately, with a hook. Defaults to false to save memory. 

52 use_hook_mlp_in (bool): whether to use a hook to get the input to the 

53 MLP layer. Defaults to false to save memory. 

54 use_attn_in (bool): whether to explicitly calculate the input of each 

55 attention head separately, with a hook. Defaults to false to save memory 

56 use_attn_scale (bool): whether to scale the attention weights by 

57 1/sqrt(d_head) 

58 model_name (str): the name of the model, used to load 

59 weights from HuggingFace or initialized to "custom" if not passed 

60 original_architecture (str, *optional*): the family of the model, used 

61 to help load 

62 weights from HuggingFace or initialized to "custom" if not passed 

63 from_checkpoint (bool): Whether the model weights were 

64 loaded from a checkpoint (only applies to pretrained models) 

65 checkpoint_index (int, *optional*): The index of the 

66 checkpoint loaded (only applies to pretrained models). 

67 checkpoint_label_type (str, *optional*): Whether 

68 checkpoints are labelled by the number of steps or number of tokens. 

69 checkpoint_value (int, *optional*): The value of the 

70 checkpoint label (whether of steps or tokens). 

71 tokenizer_name (str, *optional*): the full name of the model, passed into 

72 HuggingFace to access the tokenizer. Only used when passing in 

73 custom config, if loading from pretrained then this is not needed. 

74 use_local_attn (bool): whether to use local attention - ie each 

75 destination token can only attend to source tokens a certain distance back. 

76 window_size (int, *optional*): the size of the window for local 

77 attention 

78 attn_types (List[str], *optional*): the types of attention to use for 

79 local attention 

80 weight_init_mode (str): the initialization mode to use for the 

81 weights. Only relevant for custom models, ignored for pre-trained. 

82 We now support 'gpt2', 'xavier_uniform', 'xavier_normal', 'kaiming_uniform', 

83 'kaiming_normal'. MuP support to come. Defaults to 'gpt2'. 

84 normalization_type (str, *optional*): the type of normalization to use. 

85 Options are None (no normalization), 'LN' (use LayerNorm, including weights 

86 & biases) and 'LNPre' (use LayerNorm, but no weights & biases). 

87 Defaults to LN 

88 device(str): The device to use for the model. Defaults to 'cuda' if 

89 available, else 'cpu'. Must be 'cuda' if `n_devices` > 1. 

90 n_devices (int): The number of devices to use for the model. Defaults to 1. Layers are loaded 

91 to support "pipeline parallelism", where each device is responsible for a subset of the layers. 

92 attention_dir (str): Whether to use causal (aka unidirectional aka GPT-2 

93 style) or bidirectional attention. Options are 'causal' and 

94 'bidirectional'. Defaults to 'causal' 

95 attn_only (bool): Whether to only use attention layers, no feedforward 

96 layers. Defaults to False 

97 seed (int, *optional*): The seed to use for the model. 

98 Used to set sources of randomness (Python, PyTorch and NumPy) and to initialize weights. 

99 Defaults to None. We recommend setting a seed, so your experiments are reproducible. 

100 initializer_range (float): The standard deviation of the normal used to 

101 initialise the weights, initialized to 0.8 / sqrt(d_model). If weight_init_mode is 

102 'xavier_uniform' or 'xavier_normal', this value is instead treated as the `gain` parameter for the weight 

103 initialisation (a constant factor to scale the weights by). Defaults to -1.0, which means not set. 

104 init_weights (bool): Whether to initialize the weights. Defaults to 

105 True. If False, does not initialize weights. 

106 scale_attn_by_inverse_layer_idx (bool): Whether to scale the attention 

107 weights by 1/(layer_id+1), used by Mistral (Stanford) models for numerical stability when 

108 training in FP16. Defaults to False. 

109 positional_embedding_type (str): The positional embedding used. Options 

110 are 'standard' (ie GPT-2 style, absolute, randomly initialized learned positional 

111 embeddings, directly added to the residual stream), 'rotary' 

112 (described here: https://blog.eleuther.ai/rotary-embeddings/ ) and 

113 'shortformer' (GPT-2 style absolute & learned, but rather than being 

114 added to the residual stream they're only added to the inputs to the 

115 keys and the queries (ie key = W_K(res_stream + pos_embed), but 

116 values and MLPs don't get any positional info)). Sinusoidal are not 

117 currently supported. Defaults to 'standard'. 

118 final_rms (bool): Whether to replace the final normalization (just 

119 before the unembed) with RMSNorm (ie no centering or bias, just 

120 scaling + weights). Only included because of a dumb bug in my 

121 original SoLU code. Defaults to False. 

122 d_vocab_out (int, *optional*): The size of the output vocabulary. Defaults to -1, which means not set. If not 

123 set, will be equal to d_vocab. Mainly useful for algorithmic tasks 

124 where the input and output vocabularies may be different. 

125 parallel_attn_mlp (bool): Whether to parallelize the attention and MLP 

126 layers - a weird cursed thing done by GPT-J. Means that 

127 mlp_out=MLP(ln1(resid_pre)) and resid_post=resid_pre+attn_out+mlp_out. Defaults to False. 

128 rotary_dim (int, *optional*): The dimensionality of the rotary 

129 embeddings, may be d_head in which case only the first rotary_dim 

130 dimensions of each head are rotated. Defaults to None, if 

131 positional_embedding_type=="rotary" it defaults to d_head. 

132 n_params (int, *optional*): The number of (hidden weight) 

133 parameters in the model. This is automatically calculated and not 

134 intended to be set by the user. (Non embedding parameters, because 

135 the [scaling laws paper](https://arxiv.org/pdf/2001.08361.pdf) found 

136 that that was a more meaningful number. Ignoring biases and layer 

137 norms, for convenience) 

138 use_hook_tokens (bool): Will add a hook point on the token input to 

139 HookedTransformer.forward, which lets you cache or intervene on the tokens. 

140 Defaults to False. 

141 default_prepend_bos (bool, optional): Default behavior of whether to prepend the BOS token when the 

142 methods of HookedTransformer process input text to tokenize (only when input is a string). 

143 Defaults to True - even for models not explicitly trained with this, heads often use the 

144 first position as a resting position and accordingly lose information from the first token, 

145 so this empirically seems to give better results. To change the default behavior to False, pass in 

146 default_prepend_bos=False. Note that you can also locally override the default behavior by passing 

147 in prepend_bos=True/False when you call a method that processes the input string. 

148 dtype (torch.dtype, *optional*): The model's dtype. Defaults to torch.float32. 

149 tokenizer_prepends_bos (bool, *optional*): This flag is set by set_tokenizer. It is set to True only 

150 when the tokenizer automatically prepends the BOS token if initialized with add_bos_token=True. 

151 We need this information to dynamically control bos prepending. 

152 load_in_4bit(bool): If this flag is set, then it's assumed that parameters are 4-bit quantized 

153 with bitsandbytes. Currently only supported for Llama. 

154 n_key_value_heads (int, *optional*): The number of groups of heads that use the same key and value matrix. 

155 Only for models that use Grouped Query Attention. 

156 post_embedding_ln (bool): Whether to apply layer normalization after embedding the tokens. Defaults 

157 to False. 

158 num_experts (int, *optional*): The number of experts to use in the MoE layer. If set, experts_per_token 

159 must also be set. Set to None if not using MoE. 

160 experts_per_token (int, *optional*): The number of experts to use for each pass in the MoE layer. If set, 

161 num_experts must also be set. Set to None if not using MoE. 

162 relative_attention_max_distance (int, *optional*): The maximum distance between tokens for relative 

163 attention. If set, relative_attention_num_buckets must also be set.Only used in EncoderDecoder models, like T5. 

164 relative_attention_num_buckets (int, *optional*): The number of buckets to use for relative attention. 

165 If set, relative_attention_max_distance must also be set.Only used in EncoderDecoder models, like T5. 

166 decoder_start_token_id (int, *optional*): The start token id for the decoder. Only used in EncoderDecoder models, like T5. 

167 tie_word_embeddings (bool): Whether to tie the word embeddings and the output layer weights. Defaults to False. Only used in EncoderDecoder (T5) by now. 
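
    Example:
        A minimal, illustrative construction (the values are arbitrary, not recommended
        hyperparameters); derived fields such as n_heads and d_mlp are filled in by
        __post_init__ when left at their defaults:

        >>> cfg = HookedTransformerConfig(
        ...     n_layers=2,
        ...     d_model=128,
        ...     n_ctx=64,
        ...     d_head=32,
        ...     act_fn="gelu",
        ...     d_vocab=1000,
        ... )
        >>> cfg.n_heads  # inferred as d_model // d_head
        4
        >>> cfg.d_mlp  # defaults to 4 * d_model
        512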

168 """ 

169 

170 n_layers: int 

171 d_model: int 

172 n_ctx: int 

173 d_head: int 

174 model_name: str = "custom" 

175 n_heads: int = -1 

176 d_mlp: Optional[int] = None 

177 act_fn: Optional[str] = None 

178 d_vocab: int = -1 

179 eps: float = 1e-5 

180 use_attn_result: bool = False 

181 use_attn_scale: bool = True 

182 use_split_qkv_input: bool = False 

183 use_hook_mlp_in: bool = False 

184 use_attn_in: bool = False 

185 use_local_attn: bool = False 

186 original_architecture: Optional[str] = None 

187 from_checkpoint: bool = False 

188 checkpoint_index: Optional[int] = None 

189 checkpoint_label_type: Optional[str] = None 

190 checkpoint_value: Optional[int] = None 

191 tokenizer_name: Optional[str] = None 

192 window_size: Optional[int] = None 

193 attn_types: Optional[List] = None 

194 init_mode: str = "gpt2" 

195 normalization_type: Optional[str] = "LN" 

196 device: Optional[str] = None 

197 n_devices: int = 1 

198 attention_dir: str = "causal" 

199 attn_only: bool = False 

200 seed: Optional[int] = None 

201 initializer_range: float = -1.0 

202 init_weights: bool = True 

203 scale_attn_by_inverse_layer_idx: bool = False 

204 positional_embedding_type: str = "standard" 

205 final_rms: bool = False 

206 d_vocab_out: int = -1 

207 parallel_attn_mlp: bool = False 

208 rotary_dim: Optional[int] = None 

209 n_params: Optional[int] = None 

210 use_hook_tokens: bool = False 

211 gated_mlp: bool = False 

212 default_prepend_bos: bool = True 

213 dtype: torch.dtype = torch.float32 

214 tokenizer_prepends_bos: Optional[bool] = None 

215 n_key_value_heads: Optional[int] = None 

216 post_embedding_ln: bool = False 

217 rotary_base: int = 10000 

218 trust_remote_code: bool = False 

219 rotary_adjacent_pairs: bool = False 

220 load_in_4bit: bool = False 

221 num_experts: Optional[int] = None 

222 experts_per_token: Optional[int] = None 

223 relative_attention_max_distance: Optional[int] = None 

224 relative_attention_num_buckets: Optional[int] = None 

225 decoder_start_token_id: Optional[int] = None 

226 tie_word_embeddings: bool = False 

227 

228 def __post_init__(self): 

229 if self.n_heads == -1: 

230 self.n_heads = self.d_model // self.d_head 

231 

232 if not self.d_model % (self.d_head) == 0: 232 ↛ 233line 232 didn't jump to line 233, because the condition on line 232 was never true

233 logging.warning( 

234 "d_model %d is not divisible by d_head %d." 

235 "n_heads was inferred to be %d, rounding down the ratio.", 

236 self.d_model, 

237 self.d_head, 

238 self.n_heads, 

239 ) 

240 

241 if self.seed is not None: 241 ↛ 242line 241 didn't jump to line 242, because the condition on line 241 was never true

242 self.set_seed_everywhere(self.seed) 

243 if self.use_local_attn: 

244 assert self.window_size is not None, "window_size must be specified for local attention" 

245 assert self.attn_types is not None, "attn_types must be specified for local attention" 

246 if not self.attn_only: 

247 if self.d_mlp is None: 

248 # For some reason everyone hard codes in this hyper-parameter! 

249 self.d_mlp: int = self.d_model * 4 

250 assert self.act_fn is not None, "act_fn must be specified for non-attn-only models" 

251 assert ( 

252 self.act_fn in SUPPORTED_ACTIVATIONS 

253 ), f"act_fn={self.act_fn} must be one of {SUPPORTED_ACTIVATIONS}" 

254 if self.initializer_range < 0 and self.init_mode == "gpt2": 254 ↛ 257line 254 didn't jump to line 257, because the condition on line 254 was never false

255 # Roughly copy the GPT-2 value, but proportional to sqrt(1/d_model) 

256 self.initializer_range = 0.8 / np.sqrt(self.d_model) 

257 if self.initializer_range < 0 and self.init_mode != "gpt2": 257 ↛ 259line 257 didn't jump to line 259, because the condition on line 257 was never true

258 # This is the gain parameter for the weight initialisation 

259 self.initializer_range = 1.0 

260 

261 if self.d_vocab_out == -1: 

262 # d_vocab_out defaults to d_vocab, unless there's an algorithmic task 

263 # If d_vocab is not set, it'll be inferred from tokenizer_name or from a tokenizer 

264 # explicitly passed to HookedTransformer initialisation. 

265 self.d_vocab_out = self.d_vocab 

266 

267 if self.positional_embedding_type == "rotary" and self.rotary_dim is None: 267 ↛ 268line 267 didn't jump to line 268, because the condition on line 267 was never true

268 self.rotary_dim = self.d_head 

269 

270 if self.num_experts is not None: 270 ↛ 271line 270 didn't jump to line 271, because the condition on line 270 was never true

271 assert ( 

272 self.experts_per_token is not None 

273 ), "experts_per_token must be set if num_experts is set" 

274 if self.experts_per_token is not None: 274 ↛ 275line 274 didn't jump to line 275, because the condition on line 274 was never true

275 assert ( 

276 self.num_experts is not None 

277 ), "num_experts must be set if experts_per_token is set" 

278 

279 # The number of parameters in attention layers (ignoring biases and layer norm). 4 because W_Q, W_K, W_V and W_O 

280 self.n_params = self.n_layers * ((self.d_model * self.d_head * self.n_heads * 4)) 

281 if not self.attn_only: 

282 assert self.d_mlp is not None # mypy 

283 # Number of parameters in MLP layers (ignoring biases and layer norm). 2 because W_in and W_out 

284 mlp_params_per_layer = self.d_model * self.d_mlp * (2 + self.gated_mlp) 

285 

286 if self.num_experts: 286 ↛ 288line 286 didn't jump to line 288, because the condition on line 286 was never true

287 # If we are using MoE, we multiply by num_experts, and add the expert gate parameters (d_model * num_experts) 

288 mlp_params_per_layer = (mlp_params_per_layer + self.d_model) * self.num_experts 

289 self.n_params += self.n_layers * mlp_params_per_layer 
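            # Worked example (illustrative, GPT-2-small-like shapes): with n_layers=12,
            # d_model=768, d_head=64, n_heads=12, d_mlp=3072 and no MoE, attention
            # contributes 12 * (768 * 64 * 12 * 4) ~= 28.3M parameters and the MLPs
            # contribute 12 * (768 * 3072 * 2) ~= 56.6M, giving n_params ~= 84.9M.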

        if self.device is None:
            self.device = utils.get_device()

        if self.n_devices > 1:
            assert (
                torch.cuda.device_count() >= self.n_devices
            ), f"Not enough CUDA devices to support n_devices {self.n_devices}"

        assert self.default_prepend_bos in [
            True,
            False,
        ], f"default_prepend_bos must be either True or False, but {self.default_prepend_bos} is given"

    @classmethod
    def unwrap(cls, config: Union[Dict, "HookedTransformerConfig"]) -> HookedTransformerConfig:
        """
        Convenience function to avoid duplicate code from a common way config is passed to various components.
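
        Example (illustrative field values):

        >>> cfg = HookedTransformerConfig.unwrap(
        ...     {"n_layers": 1, "d_model": 64, "n_ctx": 32, "d_head": 16, "attn_only": True}
        ... )
        >>> HookedTransformerConfig.unwrap(cfg) is cfg
        True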

308 """ 

309 return HookedTransformerConfig.from_dict(config) if isinstance(config, Dict) else config 

310 

311 @classmethod 

312 def from_dict(cls, config_dict: Dict[str, Any]) -> HookedTransformerConfig: 

313 """ 

314 Instantiates a `HookedTransformerConfig` from a Python dictionary of 

315 parameters. 

316 """ 

317 return cls(**config_dict) 

318 

319 def to_dict(self): 

320 return self.__dict__ 

321 

322 def __repr__(self): 

323 return "HookedTransformerConfig:\n" + pprint.pformat(self.to_dict()) 

324 

325 def set_seed_everywhere(self, seed: int): 

326 torch.manual_seed(seed) 

327 random.seed(seed) 

328 np.random.seed(seed)
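

if __name__ == "__main__":
    # Illustrative usage sketch (hypothetical values, not recommended hyperparameters):
    # build a config by hand, let __post_init__ fill in the derived fields, and
    # round-trip it through to_dict / from_dict.
    demo_cfg = HookedTransformerConfig(
        n_layers=2,
        d_model=128,
        n_ctx=64,
        d_head=32,
        act_fn="relu",
        d_vocab=1000,
        seed=0,
    )
    print(demo_cfg)  # __repr__ pretty-prints the full config dict
    rebuilt = HookedTransformerConfig.from_dict(demo_cfg.to_dict())
    assert rebuilt.n_heads == 4  # inferred as d_model // d_head
    assert rebuilt.d_mlp == 512  # defaults to 4 * d_model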