# Model Properties Table
See also the interactive model table.
name.default_alias |
name.huggingface |
n_params.as_str |
n_params.as_int |
cfg.n_params |
cfg.n_layers |
cfg.n_heads |
cfg.d_model |
cfg.d_vocab |
cfg.act_fn |
cfg.positional_embedding_type |
cfg.parallel_attn_mlp |
cfg.original_architecture |
cfg.normalization_type |
tokenizer.name |
tokenizer.class |
tokenizer.vocab_size |
tokenizer.vocab_hash |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
gpt2-small |
85M |
84934656 |
84934656 |
12 |
12 |
768 |
50257 |
gelu_new |
standard |
False |
GPT2LMHeadModel |
LN |
gpt2 |
GPT2TokenizerFast |
50257.0 |
v8xfIj5kwZX5RwgLU66lZNZUlE4= |
|
gpt2-medium |
302M |
301989888 |
301989888 |
24 |
16 |
1024 |
50257 |
gelu_new |
standard |
False |
GPT2LMHeadModel |
LN |
gpt2-medium |
GPT2TokenizerFast |
50257.0 |
v8xfIj5kwZX5RwgLU66lZNZUlE4= |
|
gpt2-large |
708M |
707788800 |
707788800 |
36 |
20 |
1280 |
50257 |
gelu_new |
standard |
False |
GPT2LMHeadModel |
LN |
gpt2-large |
GPT2TokenizerFast |
50257.0 |
v8xfIj5kwZX5RwgLU66lZNZUlE4= |
|
gpt2-xl |
1.5B |
1474560000 |
1474560000 |
48 |
25 |
1600 |
50257 |
gelu_new |
standard |
False |
GPT2LMHeadModel |
LN |
gpt2-xl |
GPT2TokenizerFast |
50257.0 |
v8xfIj5kwZX5RwgLU66lZNZUlE4= |
|
distillgpt2 |
42M |
42467328 |
42467328 |
6 |
12 |
768 |
50257 |
gelu_new |
standard |
False |
GPT2LMHeadModel |
LN |
distilgpt2 |
GPT2TokenizerFast |
50257.0 |
v8xfIj5kwZX5RwgLU66lZNZUlE4= |
|
opt-125m |
85M |
84934656 |
84934656 |
12 |
12 |
768 |
50272 |
relu |
standard |
False |
OPTForCausalLM |
LN |
facebook/opt-125m |
GPT2TokenizerFast |
50265.0 |
f1FIzqnRiMYzke1CU0hp8TDxq7k= |
|
opt-1.3b |
1.2B |
1207959552 |
1207959552 |
24 |
32 |
2048 |
50272 |
relu |
standard |
False |
OPTForCausalLM |
LN |
facebook/opt-1.3b |
GPT2TokenizerFast |
50265.0 |
f1FIzqnRiMYzke1CU0hp8TDxq7k= |
|
opt-2.7b |
2.5B |
2516582400 |
2516582400 |
32 |
32 |
2560 |
50272 |
relu |
standard |
False |
OPTForCausalLM |
LN |
facebook/opt-2.7b |
GPT2TokenizerFast |
50265.0 |
f1FIzqnRiMYzke1CU0hp8TDxq7k= |
|
opt-6.7b |
6.4B |
6442450944 |
6442450944 |
32 |
32 |
4096 |
50272 |
relu |
standard |
False |
OPTForCausalLM |
LN |
facebook/opt-6.7b |
GPT2TokenizerFast |
50265.0 |
f1FIzqnRiMYzke1CU0hp8TDxq7k= |
|
opt-13b |
13B |
12582912000 |
12582912000 |
40 |
40 |
5120 |
50272 |
relu |
standard |
False |
OPTForCausalLM |
LN |
facebook/opt-13b |
GPT2TokenizerFast |
50265.0 |
f1FIzqnRiMYzke1CU0hp8TDxq7k= |
|
opt-30b |
30B |
29595009024 |
29595009024 |
48 |
56 |
7168 |
50272 |
relu |
standard |
False |
OPTForCausalLM |
LN |
facebook/opt-30b |
GPT2TokenizerFast |
50265.0 |
f1FIzqnRiMYzke1CU0hp8TDxq7k= |
|
opt-66b |
65B |
65229815808 |
65229815808 |
64 |
72 |
9216 |
50272 |
relu |
standard |
False |
OPTForCausalLM |
LN |
facebook/opt-66b |
GPT2TokenizerFast |
50265.0 |
f1FIzqnRiMYzke1CU0hp8TDxq7k= |
|
gpt-neo-125M |
85M |
84934656 |
84934656 |
12 |
12 |
768 |
50257 |
gelu_new |
standard |
False |
GPTNeoForCausalLM |
LN |
EleutherAI/gpt-neo-125M |
GPT2TokenizerFast |
50257.0 |
v8xfIj5kwZX5RwgLU66lZNZUlE4= |
|
gpt-neo-1.3B |
1.2B |
1207959552 |
1207959552 |
24 |
16 |
2048 |
50257 |
gelu_new |
standard |
False |
GPTNeoForCausalLM |
LN |
EleutherAI/gpt-neo-1.3B |
GPT2TokenizerFast |
50257.0 |
v8xfIj5kwZX5RwgLU66lZNZUlE4= |
|
gpt-neo-2.7B |
2.5B |
2516582400 |
2516582400 |
32 |
20 |
2560 |
50257 |
gelu_new |
standard |
False |
GPTNeoForCausalLM |
LN |
EleutherAI/gpt-neo-2.7B |
GPT2TokenizerFast |
50257.0 |
v8xfIj5kwZX5RwgLU66lZNZUlE4= |
|
gpt-j-6B |
5.6B |
5637144576 |
5637144576 |
28 |
16 |
4096 |
50400 |
gelu_new |
rotary |
True |
GPTJForCausalLM |
LN |
EleutherAI/gpt-j-6B |
GPT2TokenizerFast |
50257.0 |
aKfp-BCA9d3W27qknxFiS0DGC5s= |
|
gpt-neox-20b |
20B |
19931332608 |
19931332608 |
44 |
64 |
6144 |
50432 |
gelu_fast |
rotary |
True |
GPTNeoXForCausalLM |
LN |
EleutherAI/gpt-neox-20b |
GPTNeoXTokenizerFast |
50254.0 |
96EawM8Lij99W7OBTk0KW2ELUrQ= |
|
stanford-gpt2-small-a |
85M |
84934656 |
84934656 |
12 |
12 |
768 |
50257 |
gelu_new |
standard |
False |
GPT2LMHeadModel |
LN |
stanford-crfm/alias-gpt2-small-x21 |
GPT2TokenizerFast |
50257.0 |
v8xfIj5kwZX5RwgLU66lZNZUlE4= |
|
stanford-gpt2-small-b |
85M |
84934656 |
84934656 |
12 |
12 |
768 |
50257 |
gelu_new |
standard |
False |
GPT2LMHeadModel |
LN |
stanford-crfm/battlestar-gpt2-small-x49 |
GPT2TokenizerFast |
50257.0 |
v8xfIj5kwZX5RwgLU66lZNZUlE4= |
|
stanford-gpt2-small-c |
85M |
84934656 |
84934656 |
12 |
12 |
768 |
50257 |
gelu_new |
standard |
False |
GPT2LMHeadModel |
LN |
stanford-crfm/caprica-gpt2-small-x81 |
GPT2TokenizerFast |
50257.0 |
v8xfIj5kwZX5RwgLU66lZNZUlE4= |
|
stanford-gpt2-small-d |
85M |
84934656 |
84934656 |
12 |
12 |
768 |
50257 |
gelu_new |
standard |
False |
GPT2LMHeadModel |
LN |
stanford-crfm/darkmatter-gpt2-small-x343 |
GPT2TokenizerFast |
50257.0 |
v8xfIj5kwZX5RwgLU66lZNZUlE4= |
|
stanford-gpt2-small-e |
85M |
84934656 |
84934656 |
12 |
12 |
768 |
50257 |
gelu_new |
standard |
False |
GPT2LMHeadModel |
LN |
stanford-crfm/expanse-gpt2-small-x777 |
GPT2TokenizerFast |
50257.0 |
v8xfIj5kwZX5RwgLU66lZNZUlE4= |
|
stanford-gpt2-medium-a |
302M |
301989888 |
301989888 |
24 |
16 |
1024 |
50257 |
gelu_new |
standard |
False |
GPT2LMHeadModel |
LN |
stanford-crfm/arwen-gpt2-medium-x21 |
GPT2TokenizerFast |
50257.0 |
v8xfIj5kwZX5RwgLU66lZNZUlE4= |
|
stanford-gpt2-medium-b |
302M |
301989888 |
301989888 |
24 |
16 |
1024 |
50257 |
gelu_new |
standard |
False |
GPT2LMHeadModel |
LN |
stanford-crfm/beren-gpt2-medium-x49 |
GPT2TokenizerFast |
50257.0 |
v8xfIj5kwZX5RwgLU66lZNZUlE4= |
|
stanford-gpt2-medium-c |
302M |
301989888 |
301989888 |
24 |
16 |
1024 |
50257 |
gelu_new |
standard |
False |
GPT2LMHeadModel |
LN |
stanford-crfm/celebrimbor-gpt2-medium-x81 |
GPT2TokenizerFast |
50257.0 |
v8xfIj5kwZX5RwgLU66lZNZUlE4= |
|
stanford-gpt2-medium-d |
302M |
301989888 |
301989888 |
24 |
16 |
1024 |
50257 |
gelu_new |
standard |
False |
GPT2LMHeadModel |
LN |
stanford-crfm/durin-gpt2-medium-x343 |
GPT2TokenizerFast |
50257.0 |
v8xfIj5kwZX5RwgLU66lZNZUlE4= |
|
stanford-gpt2-medium-e |
302M |
301989888 |
301989888 |
24 |
16 |
1024 |
50257 |
gelu_new |
standard |
False |
GPT2LMHeadModel |
LN |
stanford-crfm/eowyn-gpt2-medium-x777 |
GPT2TokenizerFast |
50257.0 |
v8xfIj5kwZX5RwgLU66lZNZUlE4= |
|
pythia-14m |
1.2M |
1179648 |
1179648 |
6 |
4 |
128 |
50304 |
gelu |
rotary |
True |
GPTNeoXForCausalLM |
LN |
EleutherAI/pythia-14m |
GPTNeoXTokenizerFast |
50254.0 |
96EawM8Lij99W7OBTk0KW2ELUrQ= |
|
pythia-31m |
4.7M |
4718592 |
4718592 |
6 |
8 |
256 |
50304 |
gelu |
rotary |
True |
GPTNeoXForCausalLM |
LN |
EleutherAI/pythia-31m |
GPTNeoXTokenizerFast |
50254.0 |
96EawM8Lij99W7OBTk0KW2ELUrQ= |
|
pythia-70m |
19M |
18874368 |
18874368 |
6 |
8 |
512 |
50304 |
gelu |
rotary |
True |
GPTNeoXForCausalLM |
LN |
EleutherAI/pythia-70m |
GPTNeoXTokenizerFast |
50254.0 |
96EawM8Lij99W7OBTk0KW2ELUrQ= |
|
pythia-160m |
85M |
84934656 |
84934656 |
12 |
12 |
768 |
50304 |
gelu |
rotary |
True |
GPTNeoXForCausalLM |
LN |
EleutherAI/pythia-160m |
GPTNeoXTokenizerFast |
50254.0 |
96EawM8Lij99W7OBTk0KW2ELUrQ= |
|
pythia-410m |
302M |
301989888 |
301989888 |
24 |
16 |
1024 |
50304 |
gelu |
rotary |
True |
GPTNeoXForCausalLM |
LN |
EleutherAI/pythia-410m |
GPTNeoXTokenizerFast |
50254.0 |
96EawM8Lij99W7OBTk0KW2ELUrQ= |
|
pythia-1b |
805M |
805306368 |
805306368 |
16 |
8 |
2048 |
50304 |
gelu |
rotary |
True |
GPTNeoXForCausalLM |
LN |
EleutherAI/pythia-1b |
GPTNeoXTokenizerFast |
50254.0 |
96EawM8Lij99W7OBTk0KW2ELUrQ= |
|
pythia-1.4b |
1.2B |
1207959552 |
1207959552 |
24 |
16 |
2048 |
50304 |
gelu |
rotary |
True |
GPTNeoXForCausalLM |
LN |
EleutherAI/pythia-1.4b |
GPTNeoXTokenizerFast |
50254.0 |
96EawM8Lij99W7OBTk0KW2ELUrQ= |
|
pythia-2.8b |
2.5B |
2516582400 |
2516582400 |
32 |
32 |
2560 |
50304 |
gelu |
rotary |
True |
GPTNeoXForCausalLM |
LN |
EleutherAI/pythia-2.8b |
GPTNeoXTokenizerFast |
50254.0 |
96EawM8Lij99W7OBTk0KW2ELUrQ= |
|
pythia-6.9b |
6.4B |
6442450944 |
6442450944 |
32 |
32 |
4096 |
50432 |
gelu |
rotary |
True |
GPTNeoXForCausalLM |
LN |
EleutherAI/pythia-6.9b |
GPTNeoXTokenizerFast |
50254.0 |
96EawM8Lij99W7OBTk0KW2ELUrQ= |
|
pythia-12b |
11B |
11324620800 |
11324620800 |
36 |
40 |
5120 |
50688 |
gelu |
rotary |
True |
GPTNeoXForCausalLM |
LN |
EleutherAI/pythia-12b |
GPTNeoXTokenizerFast |
50254.0 |
96EawM8Lij99W7OBTk0KW2ELUrQ= |
|
pythia-70m-deduped |
19M |
18874368 |
18874368 |
6 |
8 |
512 |
50304 |
gelu |
rotary |
True |
GPTNeoXForCausalLM |
LN |
EleutherAI/pythia-70m-deduped |
GPTNeoXTokenizerFast |
50254.0 |
96EawM8Lij99W7OBTk0KW2ELUrQ= |
|
pythia-160m-deduped |
85M |
84934656 |
84934656 |
12 |
12 |
768 |
50304 |
gelu |
rotary |
True |
GPTNeoXForCausalLM |
LN |
EleutherAI/pythia-160m-deduped |
GPTNeoXTokenizerFast |
50254.0 |
96EawM8Lij99W7OBTk0KW2ELUrQ= |
|
pythia-410m-deduped |
302M |
301989888 |
301989888 |
24 |
16 |
1024 |
50304 |
gelu |
rotary |
True |
GPTNeoXForCausalLM |
LN |
EleutherAI/pythia-410m-deduped |
GPTNeoXTokenizerFast |
50254.0 |
96EawM8Lij99W7OBTk0KW2ELUrQ= |
|
pythia-1b-deduped |
805M |
805306368 |
805306368 |
16 |
8 |
2048 |
50304 |
gelu |
rotary |
True |
GPTNeoXForCausalLM |
LN |
EleutherAI/pythia-1b-deduped |
GPTNeoXTokenizerFast |
50254.0 |
96EawM8Lij99W7OBTk0KW2ELUrQ= |
|
pythia-1.4b-deduped |
1.2B |
1207959552 |
1207959552 |
24 |
16 |
2048 |
50304 |
gelu |
rotary |
True |
GPTNeoXForCausalLM |
LN |
EleutherAI/pythia-1.4b-deduped |
GPTNeoXTokenizerFast |
50254.0 |
96EawM8Lij99W7OBTk0KW2ELUrQ= |
|
pythia-2.8b-deduped |
2.5B |
2516582400 |
2516582400 |
32 |
32 |
2560 |
50304 |
gelu |
rotary |
True |
GPTNeoXForCausalLM |
LN |
EleutherAI/pythia-2.8b-deduped |
GPTNeoXTokenizerFast |
50254.0 |
96EawM8Lij99W7OBTk0KW2ELUrQ= |
|
pythia-6.9b-deduped |
6.4B |
6442450944 |
6442450944 |
32 |
32 |
4096 |
50432 |
gelu |
rotary |
True |
GPTNeoXForCausalLM |
LN |
EleutherAI/pythia-6.9b-deduped |
GPTNeoXTokenizerFast |
50254.0 |
96EawM8Lij99W7OBTk0KW2ELUrQ= |
|
pythia-12b-deduped |
11B |
11324620800 |
11324620800 |
36 |
40 |
5120 |
50688 |
gelu |
rotary |
True |
GPTNeoXForCausalLM |
LN |
EleutherAI/pythia-12b-deduped |
GPTNeoXTokenizerFast |
50254.0 |
96EawM8Lij99W7OBTk0KW2ELUrQ= |
|
pythia-70m-v0 |
19M |
18874368 |
18874368 |
6 |
8 |
512 |
50304 |
gelu |
rotary |
True |
GPTNeoXForCausalLM |
LN |
EleutherAI/pythia-70m-v0 |
GPTNeoXTokenizerFast |
50254.0 |
96EawM8Lij99W7OBTk0KW2ELUrQ= |
|
pythia-160m-v0 |
85M |
84934656 |
84934656 |
12 |
12 |
768 |
50304 |
gelu |
rotary |
True |
GPTNeoXForCausalLM |
LN |
EleutherAI/pythia-160m-v0 |
GPTNeoXTokenizerFast |
50254.0 |
96EawM8Lij99W7OBTk0KW2ELUrQ= |
|
pythia-410m-v0 |
302M |
301989888 |
301989888 |
24 |
16 |
1024 |
50304 |
gelu |
rotary |
True |
GPTNeoXForCausalLM |
LN |
EleutherAI/pythia-410m-v0 |
GPTNeoXTokenizerFast |
50254.0 |
96EawM8Lij99W7OBTk0KW2ELUrQ= |
|
pythia-1b-v0 |
805M |
805306368 |
805306368 |
16 |
8 |
2048 |
50304 |
gelu |
rotary |
True |
GPTNeoXForCausalLM |
LN |
EleutherAI/pythia-1b-v0 |
GPTNeoXTokenizerFast |
50254.0 |
96EawM8Lij99W7OBTk0KW2ELUrQ= |
|
pythia-1.4b-v0 |
1.2B |
1207959552 |
1207959552 |
24 |
16 |
2048 |
50304 |
gelu |
rotary |
True |
GPTNeoXForCausalLM |
LN |
EleutherAI/pythia-1.4b-v0 |
GPTNeoXTokenizerFast |
50254.0 |
96EawM8Lij99W7OBTk0KW2ELUrQ= |
|
pythia-2.8b-v0 |
2.5B |
2516582400 |
2516582400 |
32 |
32 |
2560 |
50304 |
gelu |
rotary |
True |
GPTNeoXForCausalLM |
LN |
EleutherAI/pythia-2.8b-v0 |
GPTNeoXTokenizerFast |
50254.0 |
96EawM8Lij99W7OBTk0KW2ELUrQ= |
|
pythia-6.9b-v0 |
6.4B |
6442450944 |
6442450944 |
32 |
32 |
4096 |
50432 |
gelu |
rotary |
True |
GPTNeoXForCausalLM |
LN |
EleutherAI/pythia-6.9b-v0 |
GPTNeoXTokenizerFast |
50254.0 |
96EawM8Lij99W7OBTk0KW2ELUrQ= |
|
pythia-12b-v0 |
11B |
11324620800 |
11324620800 |
36 |
40 |
5120 |
50688 |
gelu |
rotary |
True |
GPTNeoXForCausalLM |
LN |
EleutherAI/pythia-12b-v0 |
GPTNeoXTokenizerFast |
50254.0 |
96EawM8Lij99W7OBTk0KW2ELUrQ= |
|
pythia-70m-deduped-v0 |
19M |
18874368 |
18874368 |
6 |
8 |
512 |
50304 |
gelu |
rotary |
True |
GPTNeoXForCausalLM |
LN |
EleutherAI/pythia-70m-deduped-v0 |
GPTNeoXTokenizerFast |
50254.0 |
96EawM8Lij99W7OBTk0KW2ELUrQ= |
|
pythia-160m-deduped-v0 |
85M |
84934656 |
84934656 |
12 |
12 |
768 |
50304 |
gelu |
rotary |
True |
GPTNeoXForCausalLM |
LN |
EleutherAI/pythia-160m-deduped-v0 |
GPTNeoXTokenizerFast |
50254.0 |
96EawM8Lij99W7OBTk0KW2ELUrQ= |
|
pythia-410m-deduped-v0 |
302M |
301989888 |
301989888 |
24 |
16 |
1024 |
50304 |
gelu |
rotary |
True |
GPTNeoXForCausalLM |
LN |
EleutherAI/pythia-410m-deduped-v0 |
GPTNeoXTokenizerFast |
50254.0 |
96EawM8Lij99W7OBTk0KW2ELUrQ= |
|
pythia-1b-deduped-v0 |
805M |
805306368 |
805306368 |
16 |
8 |
2048 |
50304 |
gelu |
rotary |
True |
GPTNeoXForCausalLM |
LN |
EleutherAI/pythia-1b-deduped-v0 |
GPTNeoXTokenizerFast |
50254.0 |
96EawM8Lij99W7OBTk0KW2ELUrQ= |
|
pythia-1.4b-deduped-v0 |
1.2B |
1207959552 |
1207959552 |
24 |
16 |
2048 |
50304 |
gelu |
rotary |
True |
GPTNeoXForCausalLM |
LN |
EleutherAI/pythia-1.4b-deduped-v0 |
GPTNeoXTokenizerFast |
50254.0 |
96EawM8Lij99W7OBTk0KW2ELUrQ= |
|
pythia-2.8b-deduped-v0 |
2.5B |
2516582400 |
2516582400 |
32 |
32 |
2560 |
50304 |
gelu |
rotary |
True |
GPTNeoXForCausalLM |
LN |
EleutherAI/pythia-2.8b-deduped-v0 |
GPTNeoXTokenizerFast |
50254.0 |
96EawM8Lij99W7OBTk0KW2ELUrQ= |
|
pythia-6.9b-deduped-v0 |
6.4B |
6442450944 |
6442450944 |
32 |
32 |
4096 |
50432 |
gelu |
rotary |
True |
GPTNeoXForCausalLM |
LN |
EleutherAI/pythia-6.9b-deduped-v0 |
GPTNeoXTokenizerFast |
50254.0 |
96EawM8Lij99W7OBTk0KW2ELUrQ= |
|
pythia-12b-deduped-v0 |
11B |
11324620800 |
11324620800 |
36 |
40 |
5120 |
50688 |
gelu |
rotary |
True |
GPTNeoXForCausalLM |
LN |
EleutherAI/pythia-12b-deduped-v0 |
GPTNeoXTokenizerFast |
50254.0 |
96EawM8Lij99W7OBTk0KW2ELUrQ= |
|
pythia-160m-seed1 |
85M |
84934656 |
84934656 |
12 |
12 |
768 |
50304 |
gelu |
rotary |
True |
GPTNeoXForCausalLM |
LN |
EleutherAI/pythia-160m-seed1 |
GPTNeoXTokenizerFast |
50254.0 |
96EawM8Lij99W7OBTk0KW2ELUrQ= |
|
pythia-160m-seed2 |
85M |
84934656 |
84934656 |
12 |
12 |
768 |
50304 |
gelu |
rotary |
True |
GPTNeoXForCausalLM |
LN |
EleutherAI/pythia-160m-seed2 |
GPTNeoXTokenizerFast |
50254.0 |
96EawM8Lij99W7OBTk0KW2ELUrQ= |
|
pythia-160m-seed3 |
85M |
84934656 |
84934656 |
12 |
12 |
768 |
50304 |
gelu |
rotary |
True |
GPTNeoXForCausalLM |
LN |
EleutherAI/pythia-160m-seed3 |
GPTNeoXTokenizerFast |
50254.0 |
96EawM8Lij99W7OBTk0KW2ELUrQ= |
|
solu-1l-pile |
13M |
12582912 |
12582912 |
1 |
16 |
1024 |
50278 |
solu_ln |
standard |
False |
neel-solu-old |
LN |
EleutherAI/gpt-neox-20b |
GPTNeoXTokenizerFast |
50254.0 |
96EawM8Lij99W7OBTk0KW2ELUrQ= |
|
solu-2l-pile |
13M |
12812288 |
12812288 |
2 |
11 |
736 |
50278 |
solu_ln |
standard |
False |
neel-solu-old |
LNPre |
EleutherAI/gpt-neox-20b |
GPTNeoXTokenizerFast |
50254.0 |
96EawM8Lij99W7OBTk0KW2ELUrQ= |
|
solu-4l-pile |
13M |
12582912 |
12582912 |
4 |
8 |
512 |
50278 |
solu_ln |
standard |
False |
neel-solu-old |
LNPre |
EleutherAI/gpt-neox-20b |
GPTNeoXTokenizerFast |
50254.0 |
96EawM8Lij99W7OBTk0KW2ELUrQ= |
|
solu-6l-pile |
42M |
42467328 |
42467328 |
6 |
12 |
768 |
50278 |
solu_ln |
standard |
False |
neel-solu-old |
LNPre |
EleutherAI/gpt-neox-20b |
GPTNeoXTokenizerFast |
50254.0 |
96EawM8Lij99W7OBTk0KW2ELUrQ= |
|
solu-8l-pile |
101M |
100663296 |
100663296 |
8 |
16 |
1024 |
50278 |
solu_ln |
standard |
False |
neel-solu-old |
LNPre |
EleutherAI/gpt-neox-20b |
GPTNeoXTokenizerFast |
50254.0 |
96EawM8Lij99W7OBTk0KW2ELUrQ= |
|
solu-10l-pile |
197M |
196608000 |
196608000 |
10 |
20 |
1280 |
50278 |
solu_ln |
standard |
False |
neel-solu-old |
LNPre |
EleutherAI/gpt-neox-20b |
GPTNeoXTokenizerFast |
50254.0 |
96EawM8Lij99W7OBTk0KW2ELUrQ= |
|
solu-12l-pile |
340M |
339738624 |
339738624 |
12 |
24 |
1536 |
50278 |
solu_ln |
standard |
False |
neel-solu-old |
LN |
EleutherAI/gpt-neox-20b |
GPTNeoXTokenizerFast |
50254.0 |
96EawM8Lij99W7OBTk0KW2ELUrQ= |
|
solu-1l |
3.1M |
3145728 |
3145728 |
1 |
8 |
512 |
48262 |
solu_ln |
standard |
False |
neel |
LN |
NeelNanda/gpt-neox-tokenizer-digits |
PreTrainedTokenizerFast |
48262.0 |
AsGo9tS8Sq4-rlVHM2o3-GyDkJU= |
|
solu-2l |
6.3M |
6291456 |
6291456 |
2 |
8 |
512 |
48262 |
solu_ln |
standard |
False |
neel |
LN |
NeelNanda/gpt-neox-tokenizer-digits |
PreTrainedTokenizerFast |
48262.0 |
AsGo9tS8Sq4-rlVHM2o3-GyDkJU= |
|
solu-3l |
9.4M |
9437184 |
9437184 |
3 |
8 |
512 |
48262 |
solu_ln |
standard |
False |
neel |
LN |
NeelNanda/gpt-neox-tokenizer-digits |
PreTrainedTokenizerFast |
48262.0 |
AsGo9tS8Sq4-rlVHM2o3-GyDkJU= |
|
solu-4l |
13M |
12582912 |
12582912 |
4 |
8 |
512 |
48262 |
solu_ln |
standard |
False |
neel |
LN |
NeelNanda/gpt-neox-tokenizer-digits |
PreTrainedTokenizerFast |
48262.0 |
AsGo9tS8Sq4-rlVHM2o3-GyDkJU= |
|
solu-6l |
42M |
42467328 |
42467328 |
6 |
12 |
768 |
48262 |
solu_ln |
standard |
False |
neel |
LN |
NeelNanda/gpt-neox-tokenizer-digits |
PreTrainedTokenizerFast |
48262.0 |
AsGo9tS8Sq4-rlVHM2o3-GyDkJU= |
|
solu-8l |
101M |
100663296 |
100663296 |
8 |
16 |
1024 |
48262 |
solu_ln |
standard |
False |
neel |
LN |
NeelNanda/gpt-neox-tokenizer-digits |
PreTrainedTokenizerFast |
48262.0 |
AsGo9tS8Sq4-rlVHM2o3-GyDkJU= |
|
solu-10l |
197M |
196608000 |
196608000 |
10 |
20 |
1280 |
48262 |
solu_ln |
standard |
False |
neel |
LN |
NeelNanda/gpt-neox-tokenizer-digits |
PreTrainedTokenizerFast |
48262.0 |
AsGo9tS8Sq4-rlVHM2o3-GyDkJU= |
|
solu-12l |
340M |
339738624 |
339738624 |
12 |
24 |
1536 |
48262 |
solu_ln |
standard |
False |
neel |
LN |
NeelNanda/gpt-neox-tokenizer-digits |
PreTrainedTokenizerFast |
48262.0 |
AsGo9tS8Sq4-rlVHM2o3-GyDkJU= |
|
gelu-1l |
3.1M |
3145728 |
3145728 |
1 |
8 |
512 |
48262 |
gelu |
standard |
False |
neel |
LN |
NeelNanda/gpt-neox-tokenizer-digits |
PreTrainedTokenizerFast |
48262.0 |
AsGo9tS8Sq4-rlVHM2o3-GyDkJU= |
|
gelu-2l |
6.3M |
6291456 |
6291456 |
2 |
8 |
512 |
48262 |
gelu |
standard |
False |
neel |
LN |
NeelNanda/gpt-neox-tokenizer-digits |
PreTrainedTokenizerFast |
48262.0 |
AsGo9tS8Sq4-rlVHM2o3-GyDkJU= |
|
gelu-3l |
9.4M |
9437184 |
9437184 |
3 |
8 |
512 |
48262 |
gelu |
standard |
False |
neel |
LN |
NeelNanda/gpt-neox-tokenizer-digits |
PreTrainedTokenizerFast |
48262.0 |
AsGo9tS8Sq4-rlVHM2o3-GyDkJU= |
|
gelu-4l |
13M |
12582912 |
12582912 |
4 |
8 |
512 |
48262 |
gelu |
standard |
False |
neel |
LN |
NeelNanda/gpt-neox-tokenizer-digits |
PreTrainedTokenizerFast |
48262.0 |
AsGo9tS8Sq4-rlVHM2o3-GyDkJU= |
|
attn-only-1l |
1.0M |
1048576 |
1048576 |
1 |
8 |
512 |
48262 |
solu_ln |
standard |
False |
neel |
LN |
NeelNanda/gpt-neox-tokenizer-digits |
PreTrainedTokenizerFast |
48262.0 |
AsGo9tS8Sq4-rlVHM2o3-GyDkJU= |
|
attn-only-2l |
2.1M |
2097152 |
2097152 |
2 |
8 |
512 |
48262 |
solu_ln |
standard |
False |
neel |
LN |
NeelNanda/gpt-neox-tokenizer-digits |
PreTrainedTokenizerFast |
48262.0 |
AsGo9tS8Sq4-rlVHM2o3-GyDkJU= |
|
attn-only-3l |
3.1M |
3145728 |
3145728 |
3 |
8 |
512 |
48262 |
solu_ln |
standard |
False |
neel |
LN |
NeelNanda/gpt-neox-tokenizer-digits |
PreTrainedTokenizerFast |
48262.0 |
AsGo9tS8Sq4-rlVHM2o3-GyDkJU= |
|
attn-only-4l |
4.2M |
4194304 |
4194304 |
4 |
8 |
512 |
48262 |
solu_ln |
standard |
False |
neel |
LN |
NeelNanda/gpt-neox-tokenizer-digits |
PreTrainedTokenizerFast |
48262.0 |
AsGo9tS8Sq4-rlVHM2o3-GyDkJU= |
|
attn-only-2l-demo |
2.1M |
2097152 |
2097152 |
2 |
8 |
512 |
50277 |
solu_ln |
shortformer |
False |
neel |
EleutherAI/gpt-neox-20b |
GPTNeoXTokenizerFast |
50254.0 |
96EawM8Lij99W7OBTk0KW2ELUrQ= |
||
solu-1l-wiki |
3.1M |
3145728 |
3145728 |
1 |
8 |
512 |
48262 |
solu_ln |
standard |
False |
neel |
LN |
NeelNanda/gpt-neox-tokenizer-digits |
PreTrainedTokenizerFast |
48262.0 |
AsGo9tS8Sq4-rlVHM2o3-GyDkJU= |
|
solu-4l-wiki |
13M |
12582912 |
12582912 |
4 |
8 |
512 |
48262 |
solu_ln |
standard |
False |
neel |
LN |
NeelNanda/gpt-neox-tokenizer-digits |
PreTrainedTokenizerFast |
48262.0 |
AsGo9tS8Sq4-rlVHM2o3-GyDkJU= |
|
redwood_attn_2l |
524K |
524288 |
524288 |
2 |
8 |
256 |
50259 |
gelu_new |
shortformer |
False |
neel |
LN |
ArthurConmy/redwood_tokenizer |
GPT2TokenizerFast |
50257.0 |
J8auoAiqFanHN7mOtkTrFA9voRk= |
|
llama-7b |
6.5B |
6476005376 |
6476005376 |
32 |
32 |
4096 |
32000 |
silu |
rotary |
False |
LlamaForCausalLM |
RMS |
huggyllama/llama-7b |
LlamaTokenizerFast |
32000.0 |
e3A7wYziNQPAWcJ15GMAQY8qZqw= |
|
llama-13b |
13B |
12687769600 |
12687769600 |
40 |
40 |
5120 |
32000 |
silu |
rotary |
False |
LlamaForCausalLM |
RMS |
huggyllama/llama-13b |
LlamaTokenizerFast |
32000.0 |
e3A7wYziNQPAWcJ15GMAQY8qZqw= |
|
llama-30b |
32B |
32102154240 |
32102154240 |
60 |
52 |
6656 |
32000 |
silu |
rotary |
False |
LlamaForCausalLM |
RMS |
huggyllama/llama-30b |
LlamaTokenizerFast |
32000.0 |
e3A7wYziNQPAWcJ15GMAQY8qZqw= |
|
llama-65b |
65B |
64760053760 |
64760053760 |
80 |
64 |
8192 |
32000 |
silu |
rotary |
False |
LlamaForCausalLM |
RMS |
huggyllama/llama-65b |
LlamaTokenizerFast |
32000.0 |
e3A7wYziNQPAWcJ15GMAQY8qZqw= |
|
Llama-2-7b |
6.5B |
6476005376 |
6476005376 |
32 |
32 |
4096 |
32000 |
silu |
rotary |
False |
LlamaForCausalLM |
RMS |
meta-llama/Llama-2-7b-hf |
LlamaTokenizerFast |
32000.0 |
e3A7wYziNQPAWcJ15GMAQY8qZqw= |
|
Llama-2-7b-chat |
6.5B |
6476005376 |
6476005376 |
32 |
32 |
4096 |
32000 |
silu |
rotary |
False |
LlamaForCausalLM |
RMS |
meta-llama/Llama-2-7b-chat-hf |
LlamaTokenizerFast |
32000.0 |
e3A7wYziNQPAWcJ15GMAQY8qZqw= |
|
Llama-2-13b |
13B |
12687769600 |
12687769600 |
40 |
40 |
5120 |
32000 |
silu |
rotary |
False |
LlamaForCausalLM |
RMS |
meta-llama/Llama-2-13b-hf |
LlamaTokenizerFast |
32000.0 |
e3A7wYziNQPAWcJ15GMAQY8qZqw= |
|
Llama-2-13b-chat |
13B |
12687769600 |
12687769600 |
40 |
40 |
5120 |
32000 |
silu |
rotary |
False |
LlamaForCausalLM |
RMS |
meta-llama/Llama-2-13b-chat-hf |
LlamaTokenizerFast |
32000.0 |
e3A7wYziNQPAWcJ15GMAQY8qZqw= |
|
Llama-2-70b-chat |
78B |
77846282240 |
77846282240 |
80 |
64 |
8192 |
32000 |
silu |
rotary |
False |
LlamaForCausalLM |
RMS |
meta-llama/Llama-2-70b-chat-hf |
LlamaTokenizerFast |
32000.0 |
e3A7wYziNQPAWcJ15GMAQY8qZqw= |
|
CodeLlamallama-2-7b |
6.5B |
6476005376 |
6476005376 |
32 |
32 |
4096 |
32016 |
silu |
rotary |
False |
LlamaForCausalLM |
RMS |
codellama/CodeLlama-7b-hf |
CodeLlamaTokenizerFast |
32016.0 |
Tq7bUWJcm1X5kj9R-2uR1o7lSq8= |
|
CodeLlama-7b-python |
6.5B |
6476005376 |
6476005376 |
32 |
32 |
4096 |
32000 |
silu |
rotary |
False |
LlamaForCausalLM |
RMS |
codellama/CodeLlama-7b-Python-hf |
CodeLlamaTokenizerFast |
32000.0 |
8UzKES6KUWi5kIvXOLgDHqiLXZk= |
|
CodeLlama-7b-instruct |
6.5B |
6476005376 |
6476005376 |
32 |
32 |
4096 |
32016 |
silu |
rotary |
False |
LlamaForCausalLM |
RMS |
codellama/CodeLlama-7b-Instruct-hf |
CodeLlamaTokenizerFast |
32016.0 |
Tq7bUWJcm1X5kj9R-2uR1o7lSq8= |
|
meta-llama/Meta-Llama-3-8B |
7.8B |
7784628224 |
7784628224 |
32 |
32 |
4096 |
128256 |
silu |
rotary |
False |
LlamaForCausalLM |
RMS |
meta-llama/Meta-Llama-3-8B |
PreTrainedTokenizerFast |
128000.0 |
RnzNv9w_ITBp6b2dcibKR7_l85I= |
|
meta-llama/Meta-Llama-3-8B-Instruct |
7.8B |
7784628224 |
7784628224 |
32 |
32 |
4096 |
128256 |
silu |
rotary |
False |
LlamaForCausalLM |
RMS |
meta-llama/Meta-Llama-3-8B-Instruct |
PreTrainedTokenizerFast |
128000.0 |
RnzNv9w_ITBp6b2dcibKR7_l85I= |
|
meta-llama/Meta-Llama-3-70B |
78B |
77846282240 |
77846282240 |
80 |
64 |
8192 |
128256 |
silu |
rotary |
False |
LlamaForCausalLM |
RMS |
meta-llama/Meta-Llama-3-70B |
PreTrainedTokenizerFast |
128000.0 |
RnzNv9w_ITBp6b2dcibKR7_l85I= |
|
meta-llama/Meta-Llama-3-70B-Instruct |
78B |
77846282240 |
77846282240 |
80 |
64 |
8192 |
128256 |
silu |
rotary |
False |
LlamaForCausalLM |
RMS |
meta-llama/Meta-Llama-3-70B-Instruct |
PreTrainedTokenizerFast |
128000.0 |
RnzNv9w_ITBp6b2dcibKR7_l85I= |
|
meta-llama/Llama-3.1-70B |
78B |
77846282240 |
77846282240 |
80 |
64 |
8192 |
128256 |
silu |
rotary |
False |
LlamaForCausalLM |
RMS |
meta-llama/Llama-3.1-70B |
PreTrainedTokenizerFast |
128000.0 |
j9N50ddC7mjCgS4GseU9LmKZDKk= |
|
meta-llama/Llama-3.1-8B |
7.8B |
7784628224 |
7784628224 |
32 |
32 |
4096 |
128256 |
silu |
rotary |
False |
LlamaForCausalLM |
RMS |
meta-llama/Llama-3.1-8B |
PreTrainedTokenizerFast |
128000.0 |
j9N50ddC7mjCgS4GseU9LmKZDKk= |
|
meta-llama/Llama-3.1-8B-Instruct |
7.8B |
7784628224 |
7784628224 |
32 |
32 |
4096 |
128256 |
silu |
rotary |
False |
LlamaForCausalLM |
RMS |
meta-llama/Llama-3.1-8B-Instruct |
PreTrainedTokenizerFast |
128000.0 |
j9N50ddC7mjCgS4GseU9LmKZDKk= |
|
meta-llama/Llama-3.1-70B-Instruct |
78B |
77846282240 |
77846282240 |
80 |
64 |
8192 |
128256 |
silu |
rotary |
False |
LlamaForCausalLM |
RMS |
meta-llama/Llama-3.1-70B-Instruct |
PreTrainedTokenizerFast |
128000.0 |
j9N50ddC7mjCgS4GseU9LmKZDKk= |
|
meta-llama/Llama-3.2-1B |
1.1B |
1073741824 |
1073741824 |
16 |
32 |
2048 |
128256 |
silu |
rotary |
False |
LlamaForCausalLM |
RMS |
meta-llama/Llama-3.2-1B |
PreTrainedTokenizerFast |
128000.0 |
j9N50ddC7mjCgS4GseU9LmKZDKk= |
|
meta-llama/Llama-3.2-3B |
3.2B |
3170893824 |
3170893824 |
28 |
24 |
3072 |
128256 |
silu |
rotary |
False |
LlamaForCausalLM |
RMS |
meta-llama/Llama-3.2-3B |
PreTrainedTokenizerFast |
128000.0 |
j9N50ddC7mjCgS4GseU9LmKZDKk= |
|
meta-llama/Llama-3.2-1B-Instruct |
1.1B |
1073741824 |
1073741824 |
16 |
32 |
2048 |
128256 |
silu |
rotary |
False |
LlamaForCausalLM |
RMS |
meta-llama/Llama-3.2-1B-Instruct |
PreTrainedTokenizerFast |
128000.0 |
j9N50ddC7mjCgS4GseU9LmKZDKk= |
|
meta-llama/Llama-3.2-3B-Instruct |
3.2B |
3170893824 |
3170893824 |
28 |
24 |
3072 |
128256 |
silu |
rotary |
False |
LlamaForCausalLM |
RMS |
meta-llama/Llama-3.2-3B-Instruct |
PreTrainedTokenizerFast |
128000.0 |
j9N50ddC7mjCgS4GseU9LmKZDKk= |
|
meta-llama/Llama-3.3-70B-Instruct |
78B |
77846282240 |
77846282240 |
80 |
64 |
8192 |
128256 |
silu |
rotary |
False |
LlamaForCausalLM |
RMS |
meta-llama/Llama-3.3-70B-Instruct |
PreTrainedTokenizerFast |
128000.0 |
j9N50ddC7mjCgS4GseU9LmKZDKk= |
|
othello-gpt |
25M |
25165824 |
25165824 |
8 |
8 |
512 |
61 |
gelu |
standard |
False |
mingpt |
LN |
|||||
bert-base-cased |
85M |
84934656 |
84934656 |
12 |
12 |
768 |
28996 |
gelu |
standard |
False |
BertForMaskedLM |
LN |
google-bert/bert-base-cased |
BertTokenizerFast |
28996.0 |
SSKvHuFYtPbvgwMSLSIhfFE_kF8= |
|
bert-base-uncased |
85M |
84934656 |
84934656 |
12 |
12 |
768 |
30522 |
gelu |
standard |
False |
BertForMaskedLM |
LN |
google-bert/bert-base-uncased |
BertTokenizerFast |
30522.0 |
G9iEWgpI_JY73i8Lym9gBVhq4BI= |
|
bert-large-cased |
302M |
301989888 |
301989888 |
24 |
16 |
1024 |
28996 |
gelu |
standard |
False |
BertForMaskedLM |
LN |
google-bert/bert-large-cased |
BertTokenizerFast |
28996.0 |
SSKvHuFYtPbvgwMSLSIhfFE_kF8= |
|
bert-large-uncased |
302M |
301989888 |
301989888 |
24 |
16 |
1024 |
30522 |
gelu |
standard |
False |
BertForMaskedLM |
LN |
google-bert/bert-large-uncased |
BertTokenizerFast |
30522.0 |
G9iEWgpI_JY73i8Lym9gBVhq4BI= |
|
tiny-stories-1M |
393K |
393216 |
393216 |
8 |
16 |
64 |
50257 |
gelu_new |
standard |
False |
GPTNeoForCausalLM |
LN |
roneneldan/TinyStories-1M |
GPT2TokenizerFast |
50257.0 |
v8xfIj5kwZX5RwgLU66lZNZUlE4= |
|
tiny-stories-3M |
1.6M |
1572864 |
1572864 |
8 |
16 |
128 |
50257 |
gelu_new |
standard |
False |
GPTNeoForCausalLM |
LN |
roneneldan/TinyStories-3M |
GPT2TokenizerFast |
50257.0 |
v8xfIj5kwZX5RwgLU66lZNZUlE4= |
|
tiny-stories-8M |
6.3M |
6291456 |
6291456 |
8 |
16 |
256 |
50257 |
gelu_new |
standard |
False |
GPTNeoForCausalLM |
LN |
roneneldan/TinyStories-8M |
GPT2TokenizerFast |
50257.0 |
v8xfIj5kwZX5RwgLU66lZNZUlE4= |
|
tiny-stories-28M |
25M |
25165824 |
25165824 |
8 |
16 |
512 |
50257 |
gelu_new |
standard |
False |
GPTNeoForCausalLM |
LN |
roneneldan/TinyStories-28M |
GPT2TokenizerFast |
50257.0 |
v8xfIj5kwZX5RwgLU66lZNZUlE4= |
|
tiny-stories-33M |
28M |
28311552 |
28311552 |
4 |
16 |
768 |
50257 |
gelu_new |
standard |
False |
GPTNeoForCausalLM |
LN |
roneneldan/TinyStories-33M |
GPT2TokenizerFast |
50257.0 |
v8xfIj5kwZX5RwgLU66lZNZUlE4= |
|
tiny-stories-instruct-1M |
393K |
393216 |
393216 |
8 |
16 |
64 |
50257 |
gelu_new |
standard |
False |
GPTNeoForCausalLM |
LN |
roneneldan/TinyStories-Instruct-1M |
GPT2TokenizerFast |
50257.0 |
v8xfIj5kwZX5RwgLU66lZNZUlE4= |
|
tiny-stories-instruct-3M |
1.6M |
1572864 |
1572864 |
8 |
16 |
128 |
50257 |
gelu_new |
standard |
False |
GPTNeoForCausalLM |
LN |
roneneldan/TinyStories-Instruct-3M |
GPT2TokenizerFast |
50257.0 |
v8xfIj5kwZX5RwgLU66lZNZUlE4= |
|
tiny-stories-instruct-8M |
6.3M |
6291456 |
6291456 |
8 |
16 |
256 |
50257 |
gelu_new |
standard |
False |
GPTNeoForCausalLM |
LN |
roneneldan/TinyStories-Instruct-8M |
GPT2TokenizerFast |
50257.0 |
v8xfIj5kwZX5RwgLU66lZNZUlE4= |
|
tiny-stories-instruct-28M |
25M |
25165824 |
25165824 |
8 |
16 |
512 |
50257 |
gelu_new |
standard |
False |
GPTNeoForCausalLM |
LN |
roneneldan/TinyStories-Instruct-28M |
GPT2TokenizerFast |
50257.0 |
v8xfIj5kwZX5RwgLU66lZNZUlE4= |
|
tiny-stories-instruct-33M |
28M |
28311552 |
28311552 |
4 |
16 |
768 |
50257 |
gelu_new |
standard |
False |
GPTNeoForCausalLM |
LN |
roneneldan/TinyStories-Instruct-33M |
GPT2TokenizerFast |
50257.0 |
v8xfIj5kwZX5RwgLU66lZNZUlE4= |
|
tiny-stories-1L-21M |
13M |
12582912 |
12582912 |
1 |
16 |
1024 |
50257 |
gelu_new |
standard |
False |
GPTNeoForCausalLM |
LN |
roneneldan/TinyStories-1Layer-21M |
GPT2TokenizerFast |
50257.0 |
v8xfIj5kwZX5RwgLU66lZNZUlE4= |
|
tiny-stories-2L-33M |
25M |
25165824 |
25165824 |
2 |
16 |
1024 |
50257 |
gelu_new |
standard |
False |
GPTNeoForCausalLM |
LN |
roneneldan/TinyStories-2Layers-33M |
GPT2TokenizerFast |
50257.0 |
v8xfIj5kwZX5RwgLU66lZNZUlE4= |
|
tiny-stories-instruct-1L-21M |
13M |
12582912 |
12582912 |
1 |
16 |
1024 |
50257 |
gelu_new |
standard |
False |
GPTNeoForCausalLM |
LN |
roneneldan/TinyStories-Instuct-1Layer-21M |
GPT2TokenizerFast |
50257.0 |
v8xfIj5kwZX5RwgLU66lZNZUlE4= |
|
tiny-stories-instruct-2L-33M |
25M |
25165824 |
25165824 |
2 |
16 |
1024 |
50257 |
gelu_new |
standard |
False |
GPTNeoForCausalLM |
LN |
roneneldan/TinyStories-Instruct-2Layers-33M |
GPT2TokenizerFast |
50257.0 |
v8xfIj5kwZX5RwgLU66lZNZUlE4= |
|
stablelm-base-alpha-3b |
3.2B |
3221225472 |
3221225472 |
16 |
32 |
4096 |
50688 |
gelu |
rotary |
True |
GPTNeoXForCausalLM |
LN |
stabilityai/stablelm-base-alpha-3b |
GPTNeoXTokenizerFast |
50254.0 |
96EawM8Lij99W7OBTk0KW2ELUrQ= |
|
stablelm-base-alpha-7b |
7.2B |
7247757312 |
7247757312 |
16 |
48 |
6144 |
50432 |
gelu |
rotary |
True |
GPTNeoXForCausalLM |
LN |
stabilityai/stablelm-base-alpha-7b |
GPTNeoXTokenizerFast |
50254.0 |
96EawM8Lij99W7OBTk0KW2ELUrQ= |
|
stablelm-tuned-alpha-3b |
3.2B |
3221225472 |
3221225472 |
16 |
32 |
4096 |
50688 |
gelu |
rotary |
True |
GPTNeoXForCausalLM |
LN |
stabilityai/stablelm-tuned-alpha-3b |
GPTNeoXTokenizerFast |
50254.0 |
RD3vcWSd_TiTpqo5dHyICzaXtGQ= |
|
stablelm-tuned-alpha-7b |
7.2B |
7247757312 |
7247757312 |
16 |
48 |
6144 |
50432 |
gelu |
rotary |
True |
GPTNeoXForCausalLM |
LN |
stabilityai/stablelm-tuned-alpha-7b |
GPTNeoXTokenizerFast |
50254.0 |
RD3vcWSd_TiTpqo5dHyICzaXtGQ= |
|
mistral-7b |
7.8B |
7784628224 |
7784628224 |
32 |
32 |
4096 |
32000 |
silu |
rotary |
False |
MistralForCausalLM |
RMS |
mistralai/Mistral-7B-v0.1 |
LlamaTokenizerFast |
32000.0 |
kkCQxUk-PF9Ay_ZKDdKCh02YaGQ= |
|
mistral-7b-instruct |
7.8B |
7784628224 |
7784628224 |
32 |
32 |
4096 |
32000 |
silu |
rotary |
False |
MistralForCausalLM |
RMS |
mistralai/Mistral-7B-Instruct-v0.1 |
LlamaTokenizerFast |
32000.0 |
kkCQxUk-PF9Ay_ZKDdKCh02YaGQ= |
|
mistralai/Mistral-Small-24B-Base-2501 |
23B |
23488102400 |
23488102400 |
40 |
32 |
5120 |
131072 |
silu |
rotary |
False |
MistralForCausalLM |
RMS |
mistralai/Mistral-Small-24B-Base-2501 |
LlamaTokenizerFast |
131072.0 |
GEwgZayWxpmhQxtMq-WrVFJEfqM= |
|
mistral-nemo-base-2407 |
12B |
12163481600 |
12163481600 |
40 |
32 |
5120 |
131072 |
silu |
rotary |
False |
MistralForCausalLM |
RMS |
mistralai/Mistral-Nemo-Base-2407 |
PreTrainedTokenizerFast |
131072.0 |
0xs_eSvVgsyZGbcLSIpGUn4Gdms= |
|
mixtral |
47B |
47245688832 |
47245688832 |
32 |
32 |
4096 |
32000 |
silu |
rotary |
False |
MixtralForCausalLM |
RMS |
mistralai/Mixtral-8x7B-v0.1 |
LlamaTokenizerFast |
32000.0 |
kkCQxUk-PF9Ay_ZKDdKCh02YaGQ= |
|
mixtral-instruct |
47B |
47245688832 |
47245688832 |
32 |
32 |
4096 |
32000 |
silu |
rotary |
False |
MixtralForCausalLM |
RMS |
mistralai/Mixtral-8x7B-Instruct-v0.1 |
LlamaTokenizerFast |
32000.0 |
kkCQxUk-PF9Ay_ZKDdKCh02YaGQ= |
|
bloom-560m |
302M |
301989888 |
301989888 |
24 |
16 |
1024 |
250880 |
gelu_fast |
alibi |
False |
BloomForCausalLM |
LN |
bigscience/bloom-560m |
BloomTokenizerFast |
250680.0 |
OO9NZoesMCpWsijo1O2DAbq9GqI= |
|
bloom-1b1 |
679M |
679477248 |
679477248 |
24 |
16 |
1536 |
250880 |
gelu_fast |
alibi |
False |
BloomForCausalLM |
LN |
bigscience/bloom-1b1 |
BloomTokenizerFast |
250680.0 |
OO9NZoesMCpWsijo1O2DAbq9GqI= |
|
bloom-1b7 |
1.2B |
1207959552 |
1207959552 |
24 |
16 |
2048 |
250880 |
gelu_fast |
alibi |
False |
BloomForCausalLM |
LN |
bigscience/bloom-1b7 |
BloomTokenizerFast |
250680.0 |
OO9NZoesMCpWsijo1O2DAbq9GqI= |
|
bloom-3b |
2.4B |
2359296000 |
2359296000 |
30 |
32 |
2560 |
250880 |
gelu_fast |
alibi |
False |
BloomForCausalLM |
LN |
bigscience/bloom-3b |
BloomTokenizerFast |
250680.0 |
OO9NZoesMCpWsijo1O2DAbq9GqI= |
|
bloom-7b1 |
6.0B |
6039797760 |
6039797760 |
30 |
32 |
4096 |
250880 |
gelu_fast |
alibi |
False |
BloomForCausalLM |
LN |
bigscience/bloom-7b1 |
BloomTokenizerFast |
250680.0 |
OO9NZoesMCpWsijo1O2DAbq9GqI= |
|
santacoder |
1.2B |
1207959552 |
1207959552 |
24 |
16 |
2048 |
49280 |
gelu_fast |
standard |
False |
GPT2LMHeadCustomModel |
LN |
bigcode/santacoder |
GPT2TokenizerFast |
49152.0 |
GiKC-dU7fpR4sGNkpwn7JKK6qys= |
|
qwen-1.8b |
1.2B |
1214251008 |
1214251008 |
24 |
16 |
2048 |
151936 |
silu |
rotary |
False |
QWenLMHeadModel |
RMS |
Qwen/Qwen-1_8B |
QWenTokenizer |
151851.0 |
LXUZBV-DGPX2Ty50XH848Cn_umU= |
|
qwen-7b |
6.5B |
6476005376 |
6476005376 |
32 |
32 |
4096 |
151936 |
silu |
rotary |
False |
QWenLMHeadModel |
RMS |
Qwen/Qwen-7B |
QWenTokenizer |
151851.0 |
LXUZBV-DGPX2Ty50XH848Cn_umU= |
|
qwen-14b |
13B |
12609126400 |
12609126400 |
40 |
40 |
5120 |
152064 |
silu |
rotary |
False |
QWenLMHeadModel |
RMS |
Qwen/Qwen-14B |
QWenTokenizer |
151851.0 |
LXUZBV-DGPX2Ty50XH848Cn_umU= |
|
qwen-1.8b-chat |
1.2B |
1214251008 |
1214251008 |
24 |
16 |
2048 |
151936 |
silu |
rotary |
False |
QWenLMHeadModel |
RMS |
Qwen/Qwen-1_8B-Chat |
QWenTokenizer |
151851.0 |
LXUZBV-DGPX2Ty50XH848Cn_umU= |
|
qwen-7b-chat |
6.5B |
6476005376 |
6476005376 |
32 |
32 |
4096 |
151936 |
silu |
rotary |
False |
QWenLMHeadModel |
RMS |
Qwen/Qwen-7B-Chat |
QWenTokenizer |
151851.0 |
LXUZBV-DGPX2Ty50XH848Cn_umU= |
|
qwen-14b-chat |
13B |
12609126400 |
12609126400 |
40 |
40 |
5120 |
152064 |
silu |
rotary |
False |
QWenLMHeadModel |
RMS |
Qwen/Qwen-14B-Chat |
QWenTokenizer |
151851.0 |
LXUZBV-DGPX2Ty50XH848Cn_umU= |
|
qwen1.5-0.5b |
308M |
308281344 |
308281344 |
24 |
16 |
1024 |
151936 |
silu |
rotary |
False |
Qwen2ForCausalLM |
RMS |
Qwen/Qwen1.5-0.5B |
Qwen2TokenizerFast |
151643.0 |
vakQOjPaHpZ23kxcqX0tTXi2EzQ= |
|
qwen1.5-0.5b-chat |
308M |
308281344 |
308281344 |
24 |
16 |
1024 |
151936 |
silu |
rotary |
False |
Qwen2ForCausalLM |
RMS |
Qwen/Qwen1.5-0.5B-Chat |
Qwen2TokenizerFast |
151643.0 |
vakQOjPaHpZ23kxcqX0tTXi2EzQ= |
|
qwen1.5-1.8b |
1.2B |
1214251008 |
1214251008 |
24 |
16 |
2048 |
151936 |
silu |
rotary |
False |
Qwen2ForCausalLM |
RMS |
Qwen/Qwen1.5-1.8B |
Qwen2TokenizerFast |
151643.0 |
vakQOjPaHpZ23kxcqX0tTXi2EzQ= |
|
qwen1.5-1.8b-chat |
1.2B |
1214251008 |
1214251008 |
24 |
16 |
2048 |
151936 |
silu |
rotary |
False |
Qwen2ForCausalLM |
RMS |
Qwen/Qwen1.5-1.8B-Chat |
Qwen2TokenizerFast |
151643.0 |
vakQOjPaHpZ23kxcqX0tTXi2EzQ= |
|
qwen1.5-4b |
3.2B |
3171942400 |
3171942400 |
40 |
20 |
2560 |
151936 |
silu |
rotary |
False |
Qwen2ForCausalLM |
RMS |
Qwen/Qwen1.5-4B |
Qwen2TokenizerFast |
151643.0 |
vakQOjPaHpZ23kxcqX0tTXi2EzQ= |
|
qwen1.5-4b-chat |
3.2B |
3171942400 |
3171942400 |
40 |
20 |
2560 |
151936 |
silu |
rotary |
False |
Qwen2ForCausalLM |
RMS |
Qwen/Qwen1.5-4B-Chat |
Qwen2TokenizerFast |
151643.0 |
vakQOjPaHpZ23kxcqX0tTXi2EzQ= |
|
qwen1.5-7b |
6.5B |
6476005376 |
6476005376 |
32 |
32 |
4096 |
151936 |
silu |
rotary |
False |
Qwen2ForCausalLM |
RMS |
Qwen/Qwen1.5-7B |
Qwen2TokenizerFast |
151643.0 |
vakQOjPaHpZ23kxcqX0tTXi2EzQ= |
|
qwen1.5-7b-chat |
6.5B |
6476005376 |
6476005376 |
32 |
32 |
4096 |
151936 |
silu |
rotary |
False |
Qwen2ForCausalLM |
RMS |
Qwen/Qwen1.5-7B-Chat |
Qwen2TokenizerFast |
151643.0 |
vakQOjPaHpZ23kxcqX0tTXi2EzQ= |
|
qwen1.5-14b |
13B |
12609126400 |
12609126400 |
40 |
40 |
5120 |
152064 |
silu |
rotary |
False |
Qwen2ForCausalLM |
RMS |
Qwen/Qwen1.5-14B |
Qwen2TokenizerFast |
151643.0 |
vakQOjPaHpZ23kxcqX0tTXi2EzQ= |
|
qwen1.5-14b-chat |
13B |
12609126400 |
12609126400 |
40 |
40 |
5120 |
152064 |
silu |
rotary |
False |
Qwen2ForCausalLM |
RMS |
Qwen/Qwen1.5-14B-Chat |
Qwen2TokenizerFast |
151643.0 |
vakQOjPaHpZ23kxcqX0tTXi2EzQ= |
|
qwen2-0.5b |
391M |
390856704 |
390856704 |
24 |
14 |
896 |
151936 |
silu |
rotary |
False |
Qwen2ForCausalLM |
RMS |
Qwen/Qwen2-0.5B |
Qwen2TokenizerFast |
151643.0 |
vakQOjPaHpZ23kxcqX0tTXi2EzQ= |
|
qwen2-0.5b-instruct |
391M |
390856704 |
390856704 |
24 |
14 |
896 |
151936 |
silu |
rotary |
False |
Qwen2ForCausalLM |
RMS |
Qwen/Qwen2-0.5B-Instruct |
Qwen2TokenizerFast |
151643.0 |
vakQOjPaHpZ23kxcqX0tTXi2EzQ= |
|
qwen2-1.5b |
1.4B |
1420296192 |
1420296192 |
28 |
12 |
1536 |
151936 |
silu |
rotary |
False |
Qwen2ForCausalLM |
RMS |
Qwen/Qwen2-1.5B |
Qwen2TokenizerFast |
151643.0 |
vakQOjPaHpZ23kxcqX0tTXi2EzQ= |
|
qwen2-1.5b-instruct |
1.4B |
1420296192 |
1420296192 |
28 |
12 |
1536 |
151936 |
silu |
rotary |
False |
Qwen2ForCausalLM |
RMS |
Qwen/Qwen2-1.5B-Instruct |
Qwen2TokenizerFast |
151643.0 |
vakQOjPaHpZ23kxcqX0tTXi2EzQ= |
|
qwen2-7b |
7.1B |
7141851136 |
7141851136 |
28 |
28 |
3584 |
152064 |
silu |
rotary |
False |
Qwen2ForCausalLM |
RMS |
Qwen/Qwen2-7B |
Qwen2TokenizerFast |
151643.0 |
vakQOjPaHpZ23kxcqX0tTXi2EzQ= |
|
qwen2-7b-instruct |
7.1B |
7141851136 |
7141851136 |
28 |
28 |
3584 |
152064 |
silu |
rotary |
False |
Qwen2ForCausalLM |
RMS |
Qwen/Qwen2-7B-Instruct |
Qwen2TokenizerFast |
151643.0 |
vakQOjPaHpZ23kxcqX0tTXi2EzQ= |
|
qwen2.5-0.5b |
391M |
390856704 |
390856704 |
24 |
14 |
896 |
151936 |
silu |
rotary |
False |
Qwen2ForCausalLM |
RMS |
Qwen/Qwen2.5-0.5B |
Qwen2TokenizerFast |
151643.0 |
NI384GYDfJidzgXg_-9habj8lOk= |
|
qwen2.5-0.5b-instruct |
391M |
390856704 |
390856704 |
24 |
14 |
896 |
151936 |
silu |
rotary |
False |
Qwen2ForCausalLM |
RMS |
Qwen/Qwen2.5-0.5B-Instruct |
Qwen2TokenizerFast |
151643.0 |
NI384GYDfJidzgXg_-9habj8lOk= |
|
qwen2.5-1.5b |
1.4B |
1420296192 |
1420296192 |
28 |
12 |
1536 |
151936 |
silu |
rotary |
False |
Qwen2ForCausalLM |
RMS |
Qwen/Qwen2.5-1.5B |
Qwen2TokenizerFast |
151643.0 |
NI384GYDfJidzgXg_-9habj8lOk= |
|
qwen2.5-1.5b-instruct |
1.4B |
1420296192 |
1420296192 |
28 |
12 |
1536 |
151936 |
silu |
rotary |
False |
Qwen2ForCausalLM |
RMS |
Qwen/Qwen2.5-1.5B-Instruct |
Qwen2TokenizerFast |
151643.0 |
NI384GYDfJidzgXg_-9habj8lOk= |
|
qwen2.5-3b |
3.0B |
3038773248 |
3038773248 |
36 |
16 |
2048 |
151936 |
silu |
rotary |
False |
Qwen2ForCausalLM |
RMS |
Qwen/Qwen2.5-3B |
Qwen2TokenizerFast |
151643.0 |
NI384GYDfJidzgXg_-9habj8lOk= |
|
qwen2.5-3b-instruct |
3.0B |
3038773248 |
3038773248 |
36 |
16 |
2048 |
151936 |
silu |
rotary |
False |
Qwen2ForCausalLM |
RMS |
Qwen/Qwen2.5-3B-Instruct |
Qwen2TokenizerFast |
151643.0 |
NI384GYDfJidzgXg_-9habj8lOk= |
|
qwen2.5-7b |
7.1B |
7141851136 |
7141851136 |
28 |
28 |
3584 |
152064 |
silu |
rotary |
False |
Qwen2ForCausalLM |
RMS |
Qwen/Qwen2.5-7B |
Qwen2TokenizerFast |
151643.0 |
NI384GYDfJidzgXg_-9habj8lOk= |
|
qwen2.5-7b-instruct |
7.1B |
7141851136 |
7141851136 |
28 |
28 |
3584 |
152064 |
silu |
rotary |
False |
Qwen2ForCausalLM |
RMS |
Qwen/Qwen2.5-7B-Instruct |
Qwen2TokenizerFast |
151643.0 |
NI384GYDfJidzgXg_-9habj8lOk= |
|
qwen2.5-14b |
15B |
15225323520 |
15225323520 |
48 |
40 |
5120 |
152064 |
silu |
rotary |
False |
Qwen2ForCausalLM |
RMS |
Qwen/Qwen2.5-14B |
Qwen2TokenizerFast |
151643.0 |
NI384GYDfJidzgXg_-9habj8lOk= |
|
qwen2.5-14b-instruct |
15B |
15225323520 |
15225323520 |
48 |
40 |
5120 |
152064 |
silu |
rotary |
False |
Qwen2ForCausalLM |
RMS |
Qwen/Qwen2.5-14B-Instruct |
Qwen2TokenizerFast |
151643.0 |
NI384GYDfJidzgXg_-9habj8lOk= |
|
qwen2.5-32b |
34B |
33889976320 |
33889976320 |
64 |
40 |
5120 |
152064 |
silu |
rotary |
False |
Qwen2ForCausalLM |
RMS |
Qwen/Qwen2.5-32B |
Qwen2TokenizerFast |
151643.0 |
NI384GYDfJidzgXg_-9habj8lOk= |
|
qwen2.5-32b-instruct |
34B |
33889976320 |
33889976320 |
64 |
40 |
5120 |
152064 |
silu |
rotary |
False |
Qwen2ForCausalLM |
RMS |
Qwen/Qwen2.5-32B-Instruct |
Qwen2TokenizerFast |
151643.0 |
NI384GYDfJidzgXg_-9habj8lOk= |
|
qwen2.5-72b |
80B |
79607889920 |
79607889920 |
80 |
64 |
8192 |
152064 |
silu |
rotary |
False |
Qwen2ForCausalLM |
RMS |
Qwen/Qwen2.5-72B |
Qwen2TokenizerFast |
151643.0 |
NI384GYDfJidzgXg_-9habj8lOk= |
|
qwen2.5-72b-instruct |
80B |
79607889920 |
79607889920 |
80 |
64 |
8192 |
152064 |
silu |
rotary |
False |
Qwen2ForCausalLM |
RMS |
Qwen/Qwen2.5-72B-Instruct |
Qwen2TokenizerFast |
151643.0 |
NI384GYDfJidzgXg_-9habj8lOk= |
|
qwen-32b-preview |
34B |
33889976320 |
33889976320 |
64 |
40 |
5120 |
152064 |
silu |
rotary |
False |
Qwen2ForCausalLM |
RMS |
Qwen/QwQ-32B-Preview |
Qwen2TokenizerFast |
151643.0 |
NI384GYDfJidzgXg_-9habj8lOk= |
|
qwen3-0.6b |
499M |
499122176 |
499122176 |
28 |
16 |
1024 |
151936 |
silu |
rotary |
False |
Qwen3ForCausalLM |
RMS |
Qwen/Qwen3-0.6B |
Qwen2TokenizerFast |
151643.0 |
OfOK7SjmrN4KFUqlDylX-Up77SM= |
|
qwen3-1.7b |
1.5B |
1526726656 |
1526726656 |
28 |
16 |
2048 |
151936 |
silu |
rotary |
False |
Qwen3ForCausalLM |
RMS |
Qwen/Qwen3-1.7B |
Qwen2TokenizerFast |
151643.0 |
OfOK7SjmrN4KFUqlDylX-Up77SM= |
|
qwen3-4b |
4.2B |
4199546880 |
4199546880 |
36 |
32 |
2560 |
151936 |
silu |
rotary |
False |
Qwen3ForCausalLM |
RMS |
Qwen/Qwen3-4B |
Qwen2TokenizerFast |
151643.0 |
OfOK7SjmrN4KFUqlDylX-Up77SM= |
|
qwen3-8b |
7.9B |
7851737088 |
7851737088 |
36 |
32 |
4096 |
151936 |
silu |
rotary |
False |
Qwen3ForCausalLM |
RMS |
Qwen/Qwen3-8B |
Qwen2TokenizerFast |
151643.0 |
OfOK7SjmrN4KFUqlDylX-Up77SM= |
|
qwen3-14b |
15B |
14889779200 |
14889779200 |
40 |
40 |
5120 |
151936 |
silu |
rotary |
False |
Qwen3ForCausalLM |
RMS |
Qwen/Qwen3-14B |
Qwen2TokenizerFast |
151643.0 |
OfOK7SjmrN4KFUqlDylX-Up77SM= |
|
phi-1 |
1.2B |
1207959552 |
1207959552 |
24 |
32 |
2048 |
51200 |
gelu_new |
rotary |
True |
PhiForCausalLM |
LN |
microsoft/phi-1 |
CodeGenTokenizer |
50257.0 |
TYk6J3OrqdU2F7JYiSfFXtd-vB4= |
|
phi-1_5 |
1.2B |
1207959552 |
1207959552 |
24 |
32 |
2048 |
51200 |
gelu_new |
rotary |
True |
PhiForCausalLM |
LN |
microsoft/phi-1_5 |
CodeGenTokenizer |
50257.0 |
TYk6J3OrqdU2F7JYiSfFXtd-vB4= |
|
phi-2 |
2.5B |
2516582400 |
2516582400 |
32 |
32 |
2560 |
51200 |
gelu_new |
rotary |
True |
PhiForCausalLM |
LN |
microsoft/phi-2 |
CodeGenTokenizer |
50257.0 |
TYk6J3OrqdU2F7JYiSfFXtd-vB4= |
|
phi-3 |
3.6B |
3623878656 |
3623878656 |
32 |
32 |
3072 |
32064 |
silu |
rotary |
False |
Phi3ForCausalLM |
RMS |
microsoft/Phi-3-mini-4k-instruct |
LlamaTokenizer |
32000.0 |
2BcGXsWoZjuOkMtb6uTbGL68fbc= |
|
phi-4 |
15B |
15204352000 |
15204352000 |
40 |
40 |
5120 |
100352 |
silu |
rotary |
False |
Phi3ForCausalLM |
RMS |
microsoft/phi-4 |
GPT2Tokenizer |
100352.0 |
uJZqWk6gqn6tO_nlSJEZsP9MITQ= |
|
gemma-2b |
2.1B |
2113929216 |
2113929216 |
18 |
8 |
2048 |
256000 |
gelu_new |
rotary |
False |
Gemma2ForCausalLM |
RMS |
google/gemma-2b |
GemmaTokenizerFast |
256000.0 |
87mmm7o-5SoGMD05LzhcJdB_XBk= |
|
gemma-7b |
7.8B |
7751073792 |
7751073792 |
28 |
16 |
3072 |
256000 |
gelu_new |
rotary |
False |
GemmaForCausalLM |
RMS |
google/gemma-7b |
GemmaTokenizerFast |
256000.0 |
87mmm7o-5SoGMD05LzhcJdB_XBk= |
|
gemma-2b-it |
2.1B |
2113929216 |
2113929216 |
18 |
8 |
2048 |
256000 |
gelu_new |
rotary |
False |
Gemma2ForCausalLM |
RMS |
google/gemma-2b-it |
GemmaTokenizerFast |
256000.0 |
87mmm7o-5SoGMD05LzhcJdB_XBk= |
|
gemma-7b-it |
7.8B |
7751073792 |
7751073792 |
28 |
16 |
3072 |
256000 |
gelu_new |
rotary |
False |
GemmaForCausalLM |
RMS |
google/gemma-7b-it |
GemmaTokenizerFast |
256000.0 |
87mmm7o-5SoGMD05LzhcJdB_XBk= |
|
gemma-2-2b |
2.1B |
2146959360 |
2146959360 |
26 |
8 |
2304 |
256000 |
gelu_pytorch_tanh |
rotary |
False |
Gemma2ForCausalLM |
RMS |
google/gemma-2-2b |
GemmaTokenizerFast |
256000.0 |
87mmm7o-5SoGMD05LzhcJdB_XBk= |
|
gemma-2-2b-it |
2.1B |
2146959360 |
2146959360 |
26 |
8 |
2304 |
256000 |
gelu_pytorch_tanh |
rotary |
False |
Gemma2ForCausalLM |
RMS |
google/gemma-2-2b-it |
GemmaTokenizerFast |
256000.0 |
87mmm7o-5SoGMD05LzhcJdB_XBk= |
|
gemma-2-9b |
8.9B |
8940158976 |
8940158976 |
42 |
16 |
3584 |
256000 |
gelu_pytorch_tanh |
rotary |
False |
Gemma2ForCausalLM |
RMS |
google/gemma-2-9b |
GemmaTokenizerFast |
256000.0 |
87mmm7o-5SoGMD05LzhcJdB_XBk= |
|
gemma-2-9b-it |
8.9B |
8940158976 |
8940158976 |
42 |
16 |
3584 |
256000 |
gelu_pytorch_tanh |
rotary |
False |
Gemma2ForCausalLM |
RMS |
google/gemma-2-9b-it |
GemmaTokenizerFast |
256000.0 |
87mmm7o-5SoGMD05LzhcJdB_XBk= |
|
gemma-2-27b |
27B |
26914848768 |
26914848768 |
46 |
32 |
4608 |
256000 |
gelu_pytorch_tanh |
rotary |
False |
Gemma2ForCausalLM |
RMS |
google/gemma-2-27b |
GemmaTokenizerFast |
256000.0 |
87mmm7o-5SoGMD05LzhcJdB_XBk= |
|
gemma-2-27b-it |
27B |
26914848768 |
26914848768 |
46 |
32 |
4608 |
256000 |
gelu_pytorch_tanh |
rotary |
False |
Gemma2ForCausalLM |
RMS |
google/gemma-2-27b-it |
GemmaTokenizerFast |
256000.0 |
87mmm7o-5SoGMD05LzhcJdB_XBk= |
|
yi-6b |
6.5B |
6476005376 |
6476005376 |
32 |
32 |
4096 |
64000 |
silu |
rotary |
False |
LlamaForCausalLM |
RMS |
01-ai/Yi-6B |
LlamaTokenizerFast |
63992.0 |
VGXAFrTzytwGdUlX6AWH0NacncM= |
|
yi-34b |
39B |
38755368960 |
38755368960 |
60 |
56 |
7168 |
64000 |
silu |
rotary |
False |
LlamaForCausalLM |
RMS |
01-ai/Yi-34B |
LlamaTokenizerFast |
64000.0 |
VBBPi7l7j0Xrv93YNq1tizlalWw= |
|
yi-6b-chat |
6.5B |
6476005376 |
6476005376 |
32 |
32 |
4096 |
64000 |
silu |
rotary |
False |
LlamaForCausalLM |
RMS |
01-ai/Yi-6B-Chat |
LlamaTokenizerFast |
63992.0 |
VGXAFrTzytwGdUlX6AWH0NacncM= |
|
yi-34b-chat |
39B |
38755368960 |
38755368960 |
60 |
56 |
7168 |
64000 |
silu |
rotary |
False |
LlamaForCausalLM |
RMS |
01-ai/Yi-34B-Chat |
LlamaTokenizerFast |
63992.0 |
VGXAFrTzytwGdUlX6AWH0NacncM= |
|
t5-small |
19M |
18874368 |
18874368 |
6 |
8 |
512 |
32128 |
relu |
relative_positional_bias |
False |
T5ForConditionalGeneration |
LN |
google-t5/t5-small |
T5TokenizerFast |
32100.0 |
jQeywCyCMVL_vza2wKfpuwjNVys= |
|
t5-base |
85M |
84934656 |
84934656 |
12 |
12 |
768 |
32128 |
relu |
relative_positional_bias |
False |
T5ForConditionalGeneration |
LN |
google-t5/t5-base |
T5TokenizerFast |
32100.0 |
jQeywCyCMVL_vza2wKfpuwjNVys= |
|
t5-large |
302M |
301989888 |
301989888 |
24 |
16 |
1024 |
32128 |
relu |
relative_positional_bias |
False |
T5ForConditionalGeneration |
LN |
google-t5/t5-large |
T5TokenizerFast |
32100.0 |
jQeywCyCMVL_vza2wKfpuwjNVys= |
|
mGPT |
1.2B |
1207959552 |
1207959552 |
24 |
16 |
2048 |
100000 |
gelu_new |
standard |
False |
GPT2LMHeadModel |
LN |
ai-forever/mGPT |
GPT2TokenizerFast |
100000.0 |
8j6CU_p3zgyeEBZ1Z3lu358tiy0= |