Coverage for transformer_lens/components/mlps/gated_mlp_4bit.py: 42%
32 statements
1"""Hooked Transformer Gated MLP Component.
3This module contains all the component :class:`GatedMLP`.
4"""
from typing import Dict, Union

import torch
import torch.nn as nn
from jaxtyping import Float
from transformers.utils import is_bitsandbytes_available

from transformer_lens.components.mlps.can_be_used_as_mlp import CanBeUsedAsMLP
from transformer_lens.hook_points import HookPoint
from transformer_lens.HookedTransformerConfig import HookedTransformerConfig

if is_bitsandbytes_available():
    import bitsandbytes as bnb
    from bitsandbytes.nn.modules import Params4bit
class GatedMLP4Bit(CanBeUsedAsMLP):
    """
    The equation of a gated MLP:
    pre = x @ W_gate
    pre_linear = x @ W_in
    post = Gelu(pre) * (pre_linear) + b_in
    mlp_out = post @ W_out + b_out

    In one equation, mlp_out = (Gelu(x @ W_gate) * (x @ W_in) + b_in) @ W_out + b_out
    """
    def __init__(self, cfg: Union[Dict, HookedTransformerConfig]):
        super().__init__(cfg)
        self.select_activation_function()

        # 4-bit quantization packs two weights into each uint8 byte, so the
        # packed weight tensors hold d_model * d_mlp / 2 bytes.
        nq = int((self.cfg.d_model * self.d_mlp) / 2)
        self.W_in = Params4bit(torch.empty(nq, 1, dtype=torch.uint8), requires_grad=False)
        self.W_gate = Params4bit(torch.empty(nq, 1, dtype=torch.uint8), requires_grad=False)
        self.W_out = Params4bit(torch.empty(nq, 1, dtype=torch.uint8), requires_grad=False)

        self.b_in = nn.Parameter(torch.zeros(self.d_mlp, dtype=self.cfg.dtype))
        self.b_out = nn.Parameter(torch.zeros(self.cfg.d_model, dtype=self.cfg.dtype))

        # hook on gate output but before act_fn
        self.hook_pre = HookPoint()  # [batch, pos, d_mlp]
        # hook on the linear component of the input
        self.hook_pre_linear = HookPoint()  # [batch, pos, d_mlp]
        # hook on act_fn(gate_output) * W_in(x) + b_in
        self.hook_post = HookPoint()  # [batch, pos, d_mlp]
    def forward(
        self, x: Float[torch.Tensor, "batch pos d_model"]
    ) -> Float[torch.Tensor, "batch pos d_model"]:
        # Technically, these matmuls could be fused into a single matmul, but this is more readable.
        pre_act = self.hook_pre(
            bnb.matmul_4bit(x, self.W_gate.t(), bias=None, quant_state=self.W_gate.quant_state)
        )

        if (
            self.cfg.is_layer_norm_activation()
            and self.hook_mid is not None
            and self.ln is not None
        ):
            mid_act = self.hook_mid(self.act_fn(pre_act))  # [batch, pos, d_mlp]
            post_act = self.hook_post(self.ln(mid_act))
        else:
            pre_linear = self.hook_pre_linear(
                bnb.matmul_4bit(x, self.W_in.t(), bias=None, quant_state=self.W_in.quant_state)
            )

            post_act = self.hook_post(
                (self.act_fn(pre_act) * pre_linear) + self.b_in
            )  # [batch, pos, d_mlp]

        return bnb.matmul_4bit(
            post_act, self.W_out.t(), bias=None, quant_state=self.W_out.quant_state
        )
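

# ---------------------------------------------------------------------------
# Illustrative reference (not part of the module above): a minimal dense
# sketch of the gated-MLP equation from the class docstring,
#     mlp_out = (Gelu(x @ W_gate) * (x @ W_in) + b_in) @ W_out + b_out,
# using ordinary float tensors in place of the packed 4-bit weights. The
# tensor names and toy shapes are assumptions chosen to mirror the docstring;
# GatedMLP4Bit itself stores W_in / W_gate / W_out as Params4bit buffers and
# multiplies them with bnb.matmul_4bit instead of `@`.
if __name__ == "__main__":
    import torch.nn.functional as F

    batch, pos, d_model, d_mlp = 2, 5, 16, 64
    x = torch.randn(batch, pos, d_model)
    W_gate = torch.randn(d_model, d_mlp)
    W_in = torch.randn(d_model, d_mlp)
    W_out = torch.randn(d_mlp, d_model)
    b_in = torch.zeros(d_mlp)
    b_out = torch.zeros(d_model)

    pre = x @ W_gate                        # gate branch, [batch, pos, d_mlp]
    pre_linear = x @ W_in                   # linear branch, [batch, pos, d_mlp]
    post = F.gelu(pre) * pre_linear + b_in  # gated activation, [batch, pos, d_mlp]
    mlp_out = post @ W_out + b_out          # [batch, pos, d_model]
    print(mlp_out.shape)                    # torch.Size([2, 5, 16])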