# Model Properties Table
model |
n_params |
n_layers |
d_model |
n_heads |
act_fn |
n_ctx |
d_vocab |
d_head |
d_mlp |
n_key_value_heads |
---|---|---|---|---|---|---|---|---|---|---|
gpt2-small |
85M |
12 |
768 |
12 |
gelu |
1024 |
50257 |
64 |
3072 |
|
gpt2-medium |
302M |
24 |
1024 |
16 |
gelu |
1024 |
50257 |
64 |
4096 |
|
gpt2-large |
708M |
36 |
1280 |
20 |
gelu |
1024 |
50257 |
64 |
5120 |
|
gpt2-xl |
1.5B |
48 |
1600 |
25 |
gelu |
1024 |
50257 |
64 |
6400 |
|
distillgpt2 |
42M |
6 |
768 |
12 |
gelu |
1024 |
50257 |
64 |
3072 |
|
opt-125m |
85M |
12 |
768 |
12 |
relu |
2048 |
50272 |
64 |
3072 |
|
opt-1.3b |
1.2B |
24 |
2048 |
32 |
relu |
2048 |
50272 |
64 |
8192 |
|
opt-2.7b |
2.5B |
32 |
2560 |
32 |
relu |
2048 |
50272 |
80 |
10240 |
|
opt-6.7b |
6.4B |
32 |
4096 |
32 |
relu |
2048 |
50272 |
128 |
16384 |
|
opt-13b |
13B |
40 |
5120 |
40 |
relu |
2048 |
50272 |
128 |
20480 |
|
opt-30b |
30B |
48 |
7168 |
56 |
relu |
2048 |
50272 |
128 |
28672 |
|
opt-66b |
65B |
64 |
9216 |
72 |
relu |
2048 |
50272 |
128 |
36864 |
|
gpt-neo-125M |
85M |
12 |
768 |
12 |
gelu |
2048 |
50257 |
64 |
3072 |
|
gpt-neo-1.3B |
1.2B |
24 |
2048 |
16 |
gelu |
2048 |
50257 |
128 |
8192 |
|
gpt-neo-2.7B |
2.5B |
32 |
2560 |
20 |
gelu |
2048 |
50257 |
128 |
10240 |
|
gpt-j-6B |
5.6B |
28 |
4096 |
16 |
gelu |
2048 |
50400 |
256 |
16384 |
|
gpt-neox-20b |
20B |
44 |
6144 |
64 |
gelu |
2048 |
50432 |
96 |
24576 |
|
stanford-gpt2-small-a |
85M |
12 |
768 |
12 |
gelu |
1024 |
50257 |
64 |
3072 |
|
stanford-gpt2-small-b |
85M |
12 |
768 |
12 |
gelu |
1024 |
50257 |
64 |
3072 |
|
stanford-gpt2-small-c |
85M |
12 |
768 |
12 |
gelu |
1024 |
50257 |
64 |
3072 |
|
stanford-gpt2-small-d |
85M |
12 |
768 |
12 |
gelu |
1024 |
50257 |
64 |
3072 |
|
stanford-gpt2-small-e |
85M |
12 |
768 |
12 |
gelu |
1024 |
50257 |
64 |
3072 |
|
stanford-gpt2-medium-a |
302M |
24 |
1024 |
16 |
gelu |
1024 |
50257 |
64 |
4096 |
|
stanford-gpt2-medium-b |
302M |
24 |
1024 |
16 |
gelu |
1024 |
50257 |
64 |
4096 |
|
stanford-gpt2-medium-c |
302M |
24 |
1024 |
16 |
gelu |
1024 |
50257 |
64 |
4096 |
|
stanford-gpt2-medium-d |
302M |
24 |
1024 |
16 |
gelu |
1024 |
50257 |
64 |
4096 |
|
stanford-gpt2-medium-e |
302M |
24 |
1024 |
16 |
gelu |
1024 |
50257 |
64 |
4096 |
|
pythia-14m |
1.2M |
6 |
128 |
4 |
gelu |
2048 |
50304 |
32 |
512 |
|
pythia-31m |
4.7M |
6 |
256 |
8 |
gelu |
2048 |
50304 |
32 |
1024 |
|
pythia-70m |
19M |
6 |
512 |
8 |
gelu |
2048 |
50304 |
64 |
2048 |
|
pythia-160m |
85M |
12 |
768 |
12 |
gelu |
2048 |
50304 |
64 |
3072 |
|
pythia-410m |
302M |
24 |
1024 |
16 |
gelu |
2048 |
50304 |
64 |
4096 |
|
pythia-1b |
805M |
16 |
2048 |
8 |
gelu |
2048 |
50304 |
256 |
8192 |
|
pythia-1.4b |
1.2B |
24 |
2048 |
16 |
gelu |
2048 |
50304 |
128 |
8192 |
|
pythia-2.8b |
2.5B |
32 |
2560 |
32 |
gelu |
2048 |
50304 |
80 |
10240 |
|
pythia-6.9b |
6.4B |
32 |
4096 |
32 |
gelu |
2048 |
50432 |
128 |
16384 |
|
pythia-12b |
11B |
36 |
5120 |
40 |
gelu |
2048 |
50688 |
128 |
20480 |
|
pythia-70m-deduped |
19M |
6 |
512 |
8 |
gelu |
2048 |
50304 |
64 |
2048 |
|
pythia-160m-deduped |
85M |
12 |
768 |
12 |
gelu |
2048 |
50304 |
64 |
3072 |
|
pythia-410m-deduped |
302M |
24 |
1024 |
16 |
gelu |
2048 |
50304 |
64 |
4096 |
|
pythia-1b-deduped |
805M |
16 |
2048 |
8 |
gelu |
2048 |
50304 |
256 |
8192 |
|
pythia-1.4b-deduped |
1.2B |
24 |
2048 |
16 |
gelu |
2048 |
50304 |
128 |
8192 |
|
pythia-2.8b-deduped |
2.5B |
32 |
2560 |
32 |
gelu |
2048 |
50304 |
80 |
10240 |
|
pythia-6.9b-deduped |
6.4B |
32 |
4096 |
32 |
gelu |
2048 |
50432 |
128 |
16384 |
|
pythia-12b-deduped |
11B |
36 |
5120 |
40 |
gelu |
2048 |
50688 |
128 |
20480 |
|
pythia-70m-v0 |
19M |
6 |
512 |
8 |
gelu |
2048 |
50304 |
64 |
2048 |
|
pythia-160m-v0 |
85M |
12 |
768 |
12 |
gelu |
2048 |
50304 |
64 |
3072 |
|
pythia-410m-v0 |
302M |
24 |
1024 |
16 |
gelu |
2048 |
50304 |
64 |
4096 |
|
pythia-1b-v0 |
805M |
16 |
2048 |
8 |
gelu |
2048 |
50304 |
256 |
8192 |
|
pythia-1.4b-v0 |
1.2B |
24 |
2048 |
16 |
gelu |
2048 |
50304 |
128 |
8192 |
|
pythia-2.8b-v0 |
2.5B |
32 |
2560 |
32 |
gelu |
2048 |
50304 |
80 |
10240 |
|
pythia-6.9b-v0 |
6.4B |
32 |
4096 |
32 |
gelu |
2048 |
50432 |
128 |
16384 |
|
pythia-12b-v0 |
11B |
36 |
5120 |
40 |
gelu |
2048 |
50688 |
128 |
20480 |
|
pythia-70m-deduped-v0 |
19M |
6 |
512 |
8 |
gelu |
2048 |
50304 |
64 |
2048 |
|
pythia-160m-deduped-v0 |
85M |
12 |
768 |
12 |
gelu |
2048 |
50304 |
64 |
3072 |
|
pythia-410m-deduped-v0 |
302M |
24 |
1024 |
16 |
gelu |
2048 |
50304 |
64 |
4096 |
|
pythia-1b-deduped-v0 |
805M |
16 |
2048 |
8 |
gelu |
2048 |
50304 |
256 |
8192 |
|
pythia-1.4b-deduped-v0 |
1.2B |
24 |
2048 |
16 |
gelu |
2048 |
50304 |
128 |
8192 |
|
pythia-2.8b-deduped-v0 |
2.5B |
32 |
2560 |
32 |
gelu |
2048 |
50304 |
80 |
10240 |
|
pythia-6.9b-deduped-v0 |
6.4B |
32 |
4096 |
32 |
gelu |
2048 |
50432 |
128 |
16384 |
|
pythia-12b-deduped-v0 |
11B |
36 |
5120 |
40 |
gelu |
2048 |
50688 |
128 |
20480 |
|
pythia-160m-seed1 |
85M |
12 |
768 |
12 |
gelu |
2048 |
50304 |
64 |
3072 |
|
pythia-160m-seed2 |
85M |
12 |
768 |
12 |
gelu |
2048 |
50304 |
64 |
3072 |
|
pythia-160m-seed3 |
85M |
12 |
768 |
12 |
gelu |
2048 |
50304 |
64 |
3072 |
|
solu-1l-pile |
13M |
1 |
1024 |
16 |
solu |
1024 |
50278 |
64 |
4096 |
|
solu-2l-pile |
13M |
2 |
736 |
11 |
solu |
1024 |
50278 |
64 |
2944 |
|
solu-4l-pile |
13M |
4 |
512 |
8 |
solu |
1024 |
50278 |
64 |
2048 |
|
solu-6l-pile |
42M |
6 |
768 |
12 |
solu |
1024 |
50278 |
64 |
3072 |
|
solu-8l-pile |
101M |
8 |
1024 |
16 |
solu |
1024 |
50278 |
64 |
4096 |
|
solu-10l-pile |
197M |
10 |
1280 |
20 |
solu |
1024 |
50278 |
64 |
5120 |
|
solu-12l-pile |
340M |
12 |
1536 |
24 |
solu |
1024 |
50278 |
64 |
6144 |
|
solu-1l |
3.1M |
1 |
512 |
8 |
solu |
1024 |
48262 |
64 |
2048 |
|
solu-2l |
6.3M |
2 |
512 |
8 |
solu |
1024 |
48262 |
64 |
2048 |
|
solu-3l |
9.4M |
3 |
512 |
8 |
solu |
1024 |
48262 |
64 |
2048 |
|
solu-4l |
13M |
4 |
512 |
8 |
solu |
1024 |
48262 |
64 |
2048 |
|
solu-6l |
42M |
6 |
768 |
12 |
solu |
1024 |
48262 |
64 |
3072 |
|
solu-8l |
101M |
8 |
1024 |
16 |
solu |
1024 |
48262 |
64 |
4096 |
|
solu-10l |
197M |
10 |
1280 |
20 |
solu |
1024 |
48262 |
64 |
5120 |
|
solu-12l |
340M |
12 |
1536 |
24 |
solu |
1024 |
48262 |
64 |
6144 |
|
gelu-1l |
3.1M |
1 |
512 |
8 |
gelu |
1024 |
48262 |
64 |
2048 |
|
gelu-2l |
6.3M |
2 |
512 |
8 |
gelu |
1024 |
48262 |
64 |
2048 |
|
gelu-3l |
9.4M |
3 |
512 |
8 |
gelu |
1024 |
48262 |
64 |
2048 |
|
gelu-4l |
13M |
4 |
512 |
8 |
gelu |
1024 |
48262 |
64 |
2048 |
|
attn-only-1l |
1.0M |
1 |
512 |
8 |
attn_only |
1024 |
48262 |
64 |
2048 |
|
attn-only-2l |
2.1M |
2 |
512 |
8 |
attn_only |
1024 |
48262 |
64 |
2048 |
|
attn-only-3l |
3.1M |
3 |
512 |
8 |
attn_only |
1024 |
48262 |
64 |
2048 |
|
attn-only-4l |
4.2M |
4 |
512 |
8 |
attn_only |
1024 |
48262 |
64 |
2048 |
|
attn-only-2l-demo |
2.1M |
2 |
512 |
8 |
attn_only |
1024 |
50277 |
64 |
2048 |
|
solu-1l-wiki |
3.1M |
1 |
512 |
8 |
solu |
1024 |
48262 |
64 |
2048 |
|
solu-4l-wiki |
13M |
4 |
512 |
8 |
solu |
1024 |
48262 |
64 |
2048 |
|
redwood_attn_2l |
524K |
2 |
256 |
8 |
attn_only |
2048 |
50259 |
32 |
-1 |
|
llama-7b |
6.5B |
32 |
4096 |
32 |
silu |
2048 |
32000 |
128 |
11008 |
|
llama-13b |
13B |
40 |
5120 |
40 |
silu |
2048 |
32000 |
128 |
13824 |
|
llama-30b |
32B |
60 |
6656 |
52 |
silu |
2048 |
32000 |
128 |
17920 |
|
llama-65b |
65B |
80 |
8192 |
64 |
silu |
2048 |
32000 |
128 |
22016 |
|
Llama-2-7b |
6.5B |
32 |
4096 |
32 |
silu |
4096 |
32000 |
128 |
11008 |
|
Llama-2-7b-chat |
6.5B |
32 |
4096 |
32 |
silu |
4096 |
32000 |
128 |
11008 |
|
Llama-2-13b |
13B |
40 |
5120 |
40 |
silu |
4096 |
32000 |
128 |
13824 |
|
Llama-2-13b-chat |
13B |
40 |
5120 |
40 |
silu |
4096 |
32000 |
128 |
13824 |
|
Llama-2-70b-chat |
78B |
80 |
8192 |
64 |
silu |
4096 |
32000 |
128 |
28672 |
8 |
CodeLlamallama-2-7b |
6.5B |
32 |
4096 |
32 |
silu |
4096 |
32016 |
128 |
11008 |
|
CodeLlama-7b-python |
6.5B |
32 |
4096 |
32 |
silu |
4096 |
32000 |
128 |
11008 |
|
CodeLlama-7b-instruct |
6.5B |
32 |
4096 |
32 |
silu |
4096 |
32016 |
128 |
11008 |
|
meta-llama/Meta-Llama-3-8B |
7.8B |
32 |
4096 |
32 |
silu |
8192 |
128256 |
128 |
14336 |
8 |
meta-llama/Meta-Llama-3-8B-Instruct |
7.8B |
32 |
4096 |
32 |
silu |
8192 |
128256 |
128 |
14336 |
8 |
meta-llama/Meta-Llama-3-70B |
78B |
80 |
8192 |
64 |
silu |
8192 |
128256 |
128 |
28672 |
8 |
meta-llama/Meta-Llama-3-70B-Instruct |
78B |
80 |
8192 |
64 |
silu |
8192 |
128256 |
128 |
28672 |
8 |
meta-llama/Llama-3.2-1B |
1.1B |
16 |
2048 |
32 |
silu |
2048 |
128256 |
64 |
8192 |
8 |
meta-llama/Llama-3.2-3B |
3.2B |
28 |
3072 |
24 |
silu |
2048 |
128256 |
128 |
8192 |
8 |
meta-llama/Llama-3.2-1B-Instruct |
1.1B |
16 |
2048 |
32 |
silu |
2048 |
128256 |
64 |
8192 |
8 |
meta-llama/Llama-3.2-3B-Instruct |
3.2B |
28 |
3072 |
24 |
silu |
2048 |
128256 |
128 |
8192 |
8 |
meta-llama/Llama-3.1-70B |
78B |
80 |
8192 |
64 |
silu |
2048 |
128256 |
128 |
28672 |
8 |
meta-llama/Llama-3.1-8B |
7.8B |
32 |
4096 |
32 |
silu |
2048 |
128256 |
128 |
14336 |
8 |
meta-llama/Llama-3.1-8B-Instruct |
7.8B |
32 |
4096 |
32 |
silu |
2048 |
128256 |
128 |
14336 |
8 |
meta-llama/Llama-3.1-70B-Instruct |
78B |
80 |
8192 |
64 |
silu |
2048 |
128256 |
128 |
28672 |
8 |
othello-gpt |
25M |
8 |
512 |
8 |
gelu |
59 |
61 |
64 |
2048 |
|
bert-base-cased |
85M |
12 |
768 |
12 |
gelu |
512 |
28996 |
64 |
3072 |
|
tiny-stories-1M |
393K |
8 |
64 |
16 |
gelu |
2048 |
50257 |
4 |
256 |
|
tiny-stories-3M |
1.6M |
8 |
128 |
16 |
gelu |
2048 |
50257 |
8 |
512 |
|
tiny-stories-8M |
6.3M |
8 |
256 |
16 |
gelu |
2048 |
50257 |
16 |
1024 |
|
tiny-stories-28M |
25M |
8 |
512 |
16 |
gelu |
2048 |
50257 |
32 |
2048 |
|
tiny-stories-33M |
28M |
4 |
768 |
16 |
gelu |
2048 |
50257 |
48 |
3072 |
|
tiny-stories-instruct-1M |
393K |
8 |
64 |
16 |
gelu |
2048 |
50257 |
4 |
256 |
|
tiny-stories-instruct-3M |
1.6M |
8 |
128 |
16 |
gelu |
2048 |
50257 |
8 |
512 |
|
tiny-stories-instruct-8M |
6.3M |
8 |
256 |
16 |
gelu |
2048 |
50257 |
16 |
1024 |
|
tiny-stories-instruct-28M |
25M |
8 |
512 |
16 |
gelu |
2048 |
50257 |
32 |
2048 |
|
tiny-stories-instruct-33M |
28M |
4 |
768 |
16 |
gelu |
2048 |
50257 |
48 |
3072 |
|
tiny-stories-1L-21M |
13M |
1 |
1024 |
16 |
gelu |
2048 |
50257 |
64 |
4096 |
|
tiny-stories-2L-33M |
25M |
2 |
1024 |
16 |
gelu |
2048 |
50257 |
64 |
4096 |
|
tiny-stories-instruct-1L-21M |
13M |
1 |
1024 |
16 |
gelu |
2048 |
50257 |
64 |
4096 |
|
tiny-stories-instruct-2L-33M |
25M |
2 |
1024 |
16 |
gelu |
2048 |
50257 |
64 |
4096 |
|
stablelm-base-alpha-3b |
3.2B |
16 |
4096 |
32 |
gelu |
4096 |
50688 |
128 |
16384 |
|
stablelm-base-alpha-7b |
7.2B |
16 |
6144 |
48 |
gelu |
4096 |
50432 |
128 |
24576 |
|
stablelm-tuned-alpha-3b |
3.2B |
16 |
4096 |
32 |
gelu |
4096 |
50688 |
128 |
16384 |
|
stablelm-tuned-alpha-7b |
7.2B |
16 |
6144 |
48 |
gelu |
4096 |
50432 |
128 |
24576 |
|
mistral-7b |
7.8B |
32 |
4096 |
32 |
silu |
2048 |
32000 |
128 |
14336 |
8 |
mistral-7b-instruct |
7.8B |
32 |
4096 |
32 |
silu |
2048 |
32000 |
128 |
14336 |
8 |
mistral-nemo-base-2407 |
12B |
40 |
5120 |
32 |
silu |
2048 |
131072 |
128 |
14336 |
8 |
mixtral |
47B |
32 |
4096 |
32 |
silu |
32768 |
32000 |
128 |
14336 |
8 |
mixtral-instruct |
47B |
32 |
4096 |
32 |
silu |
32768 |
32000 |
128 |
14336 |
8 |
bloom-560m |
302M |
24 |
1024 |
16 |
gelu |
2048 |
250880 |
64 |
4096 |
|
bloom-1b1 |
679M |
24 |
1536 |
16 |
gelu |
2048 |
250880 |
96 |
6144 |
|
bloom-1b7 |
1.2B |
24 |
2048 |
16 |
gelu |
2048 |
250880 |
128 |
8192 |
|
bloom-3b |
2.4B |
30 |
2560 |
32 |
gelu |
2048 |
250880 |
80 |
10240 |
|
bloom-7b1 |
6.0B |
30 |
4096 |
32 |
gelu |
2048 |
250880 |
128 |
16384 |
|
santacoder |
1.2B |
24 |
2048 |
16 |
gelu |
2048 |
49280 |
128 |
8192 |
|
qwen-1.8b |
1.2B |
24 |
2048 |
16 |
silu |
2048 |
151936 |
128 |
5504 |
|
qwen-7b |
6.5B |
32 |
4096 |
32 |
silu |
2048 |
151936 |
128 |
11008 |
|
qwen-14b |
13B |
40 |
5120 |
40 |
silu |
2048 |
152064 |
128 |
13696 |
|
qwen-1.8b-chat |
1.2B |
24 |
2048 |
16 |
silu |
2048 |
151936 |
128 |
5504 |
|
qwen-7b-chat |
6.5B |
32 |
4096 |
32 |
silu |
2048 |
151936 |
128 |
11008 |
|
qwen-14b-chat |
13B |
40 |
5120 |
40 |
silu |
2048 |
152064 |
128 |
13696 |
|
qwen1.5-0.5b |
308M |
24 |
1024 |
16 |
silu |
2048 |
151936 |
64 |
2816 |
16 |
qwen1.5-0.5b-chat |
308M |
24 |
1024 |
16 |
silu |
2048 |
151936 |
64 |
2816 |
16 |
qwen1.5-1.8b |
1.2B |
24 |
2048 |
16 |
silu |
2048 |
151936 |
128 |
5504 |
16 |
qwen1.5-1.8b-chat |
1.2B |
24 |
2048 |
16 |
silu |
2048 |
151936 |
128 |
5504 |
16 |
qwen1.5-4b |
3.2B |
40 |
2560 |
20 |
silu |
2048 |
151936 |
128 |
6912 |
20 |
qwen1.5-4b-chat |
3.2B |
40 |
2560 |
20 |
silu |
2048 |
151936 |
128 |
6912 |
20 |
qwen1.5-7b |
6.5B |
32 |
4096 |
32 |
silu |
2048 |
151936 |
128 |
11008 |
32 |
qwen1.5-7b-chat |
6.5B |
32 |
4096 |
32 |
silu |
2048 |
151936 |
128 |
11008 |
32 |
qwen1.5-14b |
13B |
40 |
5120 |
40 |
silu |
2048 |
152064 |
128 |
13696 |
40 |
qwen1.5-14b-chat |
13B |
40 |
5120 |
40 |
silu |
2048 |
152064 |
128 |
13696 |
40 |
Qwen/Qwen2-0.5B |
391M |
24 |
896 |
14 |
silu |
2048 |
151936 |
64 |
4864 |
2 |
Qwen/Qwen2-0.5B-Instruct |
391M |
24 |
896 |
14 |
silu |
2048 |
151936 |
64 |
4864 |
2 |
Qwen/Qwen2-1.5B |
1.4B |
28 |
1536 |
12 |
silu |
2048 |
151936 |
128 |
8960 |
2 |
Qwen/Qwen2-1.5B-Instruct |
1.4B |
28 |
1536 |
12 |
silu |
2048 |
151936 |
128 |
8960 |
2 |
Qwen/Qwen2-7B |
7.1B |
28 |
3584 |
28 |
silu |
2048 |
152064 |
128 |
18944 |
4 |
Qwen/Qwen2-7B-Instruct |
7.1B |
28 |
3584 |
28 |
silu |
2048 |
152064 |
128 |
18944 |
4 |
phi-1 |
1.2B |
24 |
2048 |
32 |
gelu |
2048 |
51200 |
64 |
8192 |
|
phi-1_5 |
1.2B |
24 |
2048 |
32 |
gelu |
2048 |
51200 |
64 |
8192 |
|
phi-2 |
2.5B |
32 |
2560 |
32 |
gelu |
2048 |
51200 |
80 |
10240 |
|
phi-3 |
3.6B |
32 |
3072 |
32 |
silu |
4096 |
32064 |
96 |
8192 |
|
gemma-2b |
2.1B |
18 |
2048 |
8 |
gelu |
8192 |
256000 |
256 |
16384 |
1 |
gemma-7b |
7.8B |
28 |
3072 |
16 |
gelu |
8192 |
256000 |
256 |
24576 |
16 |
gemma-2b-it |
2.1B |
18 |
2048 |
8 |
gelu |
8192 |
256000 |
256 |
16384 |
1 |
gemma-7b-it |
7.8B |
28 |
3072 |
16 |
gelu |
8192 |
256000 |
256 |
24576 |
16 |
gemma-2-2b |
2.1B |
26 |
2304 |
8 |
gelu_pytorch_tanh |
8192 |
256000 |
256 |
9216 |
4 |
gemma-2-2b-it |
2.1B |
26 |
2304 |
8 |
gelu_pytorch_tanh |
8192 |
256000 |
256 |
9216 |
4 |
gemma-2-9b |
8.9B |
42 |
3584 |
16 |
gelu_pytorch_tanh |
8192 |
256000 |
256 |
14336 |
8 |
gemma-2-9b-it |
8.9B |
42 |
3584 |
16 |
gelu_pytorch_tanh |
8192 |
256000 |
256 |
14336 |
8 |
gemma-2-27b |
27B |
46 |
4608 |
32 |
gelu_pytorch_tanh |
8192 |
256000 |
128 |
36864 |
16 |
gemma-2-27b-it |
27B |
46 |
4608 |
32 |
gelu_pytorch_tanh |
8192 |
256000 |
128 |
36864 |
16 |
yi-6b |
6.5B |
32 |
4096 |
32 |
silu |
4096 |
64000 |
128 |
11008 |
4 |
yi-34b |
39B |
60 |
7168 |
56 |
silu |
4096 |
64000 |
128 |
20480 |
8 |
yi-6b-chat |
6.5B |
32 |
4096 |
32 |
silu |
4096 |
64000 |
128 |
11008 |
4 |
yi-34b-chat |
39B |
60 |
7168 |
56 |
silu |
4096 |
64000 |
128 |
20480 |
8 |
t5-small |
19M |
6 |
512 |
8 |
relu |
20 |
32128 |
64 |
2048 |
|
t5-base |
85M |
12 |
768 |
12 |
relu |
20 |
32128 |
64 |
3072 |
|
t5-large |
302M |
24 |
1024 |
16 |
relu |
20 |
32128 |
64 |
4096 |
|
mGPT |
1.2B |
24 |
2048 |
16 |
gelu |
2048 |
100000 |
128 |
8192 |