Model Properties Table

Also see the interactive model table.

name.default_alias

name.huggingface

n_params.as_str

n_params.as_int

cfg.n_params

cfg.n_layers

cfg.n_heads

cfg.d_model

cfg.d_vocab

cfg.act_fn

cfg.positional_embedding_type

cfg.parallel_attn_mlp

cfg.original_architecture

cfg.normalization_type

tokenizer.name

tokenizer.class

tokenizer.vocab_size

tokenizer.vocab_hash

gpt2-small

gpt2

85M

84934656

84934656

12

12

768

50257

gelu_new

standard

False

GPT2LMHeadModel

LN

gpt2

GPT2TokenizerFast

50257.0

v8xfIj5kwZX5RwgLU66lZNZUlE4=

gpt2-medium

gpt2-medium

302M

301989888

301989888

24

16

1024

50257

gelu_new

standard

False

GPT2LMHeadModel

LN

gpt2-medium

GPT2TokenizerFast

50257.0

v8xfIj5kwZX5RwgLU66lZNZUlE4=

gpt2-large

gpt2-large

708M

707788800

707788800

36

20

1280

50257

gelu_new

standard

False

GPT2LMHeadModel

LN

gpt2-large

GPT2TokenizerFast

50257.0

v8xfIj5kwZX5RwgLU66lZNZUlE4=

gpt2-xl

gpt2-xl

1.5B

1474560000

1474560000

48

25

1600

50257

gelu_new

standard

False

GPT2LMHeadModel

LN

gpt2-xl

GPT2TokenizerFast

50257.0

v8xfIj5kwZX5RwgLU66lZNZUlE4=

distillgpt2

distilgpt2

42M

42467328

42467328

6

12

768

50257

gelu_new

standard

False

GPT2LMHeadModel

LN

distilgpt2

GPT2TokenizerFast

50257.0

v8xfIj5kwZX5RwgLU66lZNZUlE4=

opt-125m

facebook/opt-125m

85M

84934656

84934656

12

12

768

50272

relu

standard

False

OPTForCausalLM

LN

facebook/opt-125m

GPT2TokenizerFast

50265.0

f1FIzqnRiMYzke1CU0hp8TDxq7k=

opt-1.3b

facebook/opt-1.3b

1.2B

1207959552

1207959552

24

32

2048

50272

relu

standard

False

OPTForCausalLM

LN

facebook/opt-1.3b

GPT2TokenizerFast

50265.0

f1FIzqnRiMYzke1CU0hp8TDxq7k=

opt-2.7b

facebook/opt-2.7b

2.5B

2516582400

2516582400

32

32

2560

50272

relu

standard

False

OPTForCausalLM

LN

facebook/opt-2.7b

GPT2TokenizerFast

50265.0

f1FIzqnRiMYzke1CU0hp8TDxq7k=

opt-6.7b

facebook/opt-6.7b

6.4B

6442450944

6442450944

32

32

4096

50272

relu

standard

False

OPTForCausalLM

LN

facebook/opt-6.7b

GPT2TokenizerFast

50265.0

f1FIzqnRiMYzke1CU0hp8TDxq7k=

opt-13b

facebook/opt-13b

13B

12582912000

12582912000

40

40

5120

50272

relu

standard

False

OPTForCausalLM

LN

facebook/opt-13b

GPT2TokenizerFast

50265.0

f1FIzqnRiMYzke1CU0hp8TDxq7k=

opt-30b

facebook/opt-30b

30B

29595009024

29595009024

48

56

7168

50272

relu

standard

False

OPTForCausalLM

LN

facebook/opt-30b

GPT2TokenizerFast

50265.0

f1FIzqnRiMYzke1CU0hp8TDxq7k=

opt-66b

facebook/opt-66b

65B

65229815808

65229815808

64

72

9216

50272

relu

standard

False

OPTForCausalLM

LN

facebook/opt-66b

GPT2TokenizerFast

50265.0

f1FIzqnRiMYzke1CU0hp8TDxq7k=

gpt-neo-125M

85M

84934656

84934656

12

12

768

50257

gelu_new

standard

False

GPTNeoForCausalLM

LN

EleutherAI/gpt-neo-125M

GPT2TokenizerFast

50257.0

v8xfIj5kwZX5RwgLU66lZNZUlE4=

gpt-neo-1.3B

1.2B

1207959552

1207959552

24

16

2048

50257

gelu_new

standard

False

GPTNeoForCausalLM

LN

EleutherAI/gpt-neo-1.3B

GPT2TokenizerFast

50257.0

v8xfIj5kwZX5RwgLU66lZNZUlE4=

gpt-neo-2.7B

2.5B

2516582400

2516582400

32

20

2560

50257

gelu_new

standard

False

GPTNeoForCausalLM

LN

EleutherAI/gpt-neo-2.7B

GPT2TokenizerFast

50257.0

v8xfIj5kwZX5RwgLU66lZNZUlE4=

gpt-j-6B

5.6B

5637144576

5637144576

28

16

4096

50400

gelu_new

rotary

True

GPTJForCausalLM

LN

EleutherAI/gpt-j-6B

GPT2TokenizerFast

50257.0

aKfp-BCA9d3W27qknxFiS0DGC5s=

gpt-neox-20b

EleutherAI/gpt-neox-20b

20B

19931332608

19931332608

44

64

6144

50432

gelu_fast

rotary

True

GPTNeoXForCausalLM

LN

EleutherAI/gpt-neox-20b

GPTNeoXTokenizerFast

50254.0

96EawM8Lij99W7OBTk0KW2ELUrQ=

stanford-gpt2-small-a

stanford-crfm/alias-gpt2-small-x21

85M

84934656

84934656

12

12

768

50257

gelu_new

standard

False

GPT2LMHeadModel

LN

stanford-crfm/alias-gpt2-small-x21

GPT2TokenizerFast

50257.0

v8xfIj5kwZX5RwgLU66lZNZUlE4=

stanford-gpt2-small-b

stanford-crfm/battlestar-gpt2-small-x49

85M

84934656

84934656

12

12

768

50257

gelu_new

standard

False

GPT2LMHeadModel

LN

stanford-crfm/battlestar-gpt2-small-x49

GPT2TokenizerFast

50257.0

v8xfIj5kwZX5RwgLU66lZNZUlE4=

stanford-gpt2-small-c

stanford-crfm/caprica-gpt2-small-x81

85M

84934656

84934656

12

12

768

50257

gelu_new

standard

False

GPT2LMHeadModel

LN

stanford-crfm/caprica-gpt2-small-x81

GPT2TokenizerFast

50257.0

v8xfIj5kwZX5RwgLU66lZNZUlE4=

stanford-gpt2-small-d

stanford-crfm/darkmatter-gpt2-small-x343

85M

84934656

84934656

12

12

768

50257

gelu_new

standard

False

GPT2LMHeadModel

LN

stanford-crfm/darkmatter-gpt2-small-x343

GPT2TokenizerFast

50257.0

v8xfIj5kwZX5RwgLU66lZNZUlE4=

stanford-gpt2-small-e

stanford-crfm/expanse-gpt2-small-x777

85M

84934656

84934656

12

12

768

50257

gelu_new

standard

False

GPT2LMHeadModel

LN

stanford-crfm/expanse-gpt2-small-x777

GPT2TokenizerFast

50257.0

v8xfIj5kwZX5RwgLU66lZNZUlE4=

stanford-gpt2-medium-a

stanford-crfm/arwen-gpt2-medium-x21

302M

301989888

301989888

24

16

1024

50257

gelu_new

standard

False

GPT2LMHeadModel

LN

stanford-crfm/arwen-gpt2-medium-x21

GPT2TokenizerFast

50257.0

v8xfIj5kwZX5RwgLU66lZNZUlE4=

stanford-gpt2-medium-b

stanford-crfm/beren-gpt2-medium-x49

302M

301989888

301989888

24

16

1024

50257

gelu_new

standard

False

GPT2LMHeadModel

LN

stanford-crfm/beren-gpt2-medium-x49

GPT2TokenizerFast

50257.0

v8xfIj5kwZX5RwgLU66lZNZUlE4=

stanford-gpt2-medium-c

stanford-crfm/celebrimbor-gpt2-medium-x81

302M

301989888

301989888

24

16

1024

50257

gelu_new

standard

False

GPT2LMHeadModel

LN

stanford-crfm/celebrimbor-gpt2-medium-x81

GPT2TokenizerFast

50257.0

v8xfIj5kwZX5RwgLU66lZNZUlE4=

stanford-gpt2-medium-d

stanford-crfm/durin-gpt2-medium-x343

302M

301989888

301989888

24

16

1024

50257

gelu_new

standard

False

GPT2LMHeadModel

LN

stanford-crfm/durin-gpt2-medium-x343

GPT2TokenizerFast

50257.0

v8xfIj5kwZX5RwgLU66lZNZUlE4=

stanford-gpt2-medium-e

stanford-crfm/eowyn-gpt2-medium-x777

302M

301989888

301989888

24

16

1024

50257

gelu_new

standard

False

GPT2LMHeadModel

LN

stanford-crfm/eowyn-gpt2-medium-x777

GPT2TokenizerFast

50257.0

v8xfIj5kwZX5RwgLU66lZNZUlE4=

pythia-14m

EleutherAI/pythia-14m

1.2M

1179648

1179648

6

4

128

50304

gelu

rotary

True

GPTNeoXForCausalLM

LN

EleutherAI/pythia-14m

GPTNeoXTokenizerFast

50254.0

96EawM8Lij99W7OBTk0KW2ELUrQ=

pythia-31m

EleutherAI/pythia-31m

4.7M

4718592

4718592

6

8

256

50304

gelu

rotary

True

GPTNeoXForCausalLM

LN

EleutherAI/pythia-31m

GPTNeoXTokenizerFast

50254.0

96EawM8Lij99W7OBTk0KW2ELUrQ=

pythia-70m

EleutherAI/pythia-70m

19M

18874368

18874368

6

8

512

50304

gelu

rotary

True

GPTNeoXForCausalLM

LN

EleutherAI/pythia-70m

GPTNeoXTokenizerFast

50254.0

96EawM8Lij99W7OBTk0KW2ELUrQ=

pythia-160m

EleutherAI/pythia-160m

85M

84934656

84934656

12

12

768

50304

gelu

rotary

True

GPTNeoXForCausalLM

LN

EleutherAI/pythia-160m

GPTNeoXTokenizerFast

50254.0

96EawM8Lij99W7OBTk0KW2ELUrQ=

pythia-410m

EleutherAI/pythia-410m

302M

301989888

301989888

24

16

1024

50304

gelu

rotary

True

GPTNeoXForCausalLM

LN

EleutherAI/pythia-410m

GPTNeoXTokenizerFast

50254.0

96EawM8Lij99W7OBTk0KW2ELUrQ=

pythia-1b

EleutherAI/pythia-1b

805M

805306368

805306368

16

8

2048

50304

gelu

rotary

True

GPTNeoXForCausalLM

LN

EleutherAI/pythia-1b

GPTNeoXTokenizerFast

50254.0

96EawM8Lij99W7OBTk0KW2ELUrQ=

pythia-1.4b

EleutherAI/pythia-1.4b

1.2B

1207959552

1207959552

24

16

2048

50304

gelu

rotary

True

GPTNeoXForCausalLM

LN

EleutherAI/pythia-1.4b

GPTNeoXTokenizerFast

50254.0

96EawM8Lij99W7OBTk0KW2ELUrQ=

pythia-2.8b

EleutherAI/pythia-2.8b

2.5B

2516582400

2516582400

32

32

2560

50304

gelu

rotary

True

GPTNeoXForCausalLM

LN

EleutherAI/pythia-2.8b

GPTNeoXTokenizerFast

50254.0

96EawM8Lij99W7OBTk0KW2ELUrQ=

pythia-6.9b

EleutherAI/pythia-6.9b

6.4B

6442450944

6442450944

32

32

4096

50432

gelu

rotary

True

GPTNeoXForCausalLM

LN

EleutherAI/pythia-6.9b

GPTNeoXTokenizerFast

50254.0

96EawM8Lij99W7OBTk0KW2ELUrQ=

pythia-12b

EleutherAI/pythia-12b

11B

11324620800

11324620800

36

40

5120

50688

gelu

rotary

True

GPTNeoXForCausalLM

LN

EleutherAI/pythia-12b

GPTNeoXTokenizerFast

50254.0

96EawM8Lij99W7OBTk0KW2ELUrQ=

pythia-70m-deduped

EleutherAI/pythia-70m-deduped

19M

18874368

18874368

6

8

512

50304

gelu

rotary

True

GPTNeoXForCausalLM

LN

EleutherAI/pythia-70m-deduped

GPTNeoXTokenizerFast

50254.0

96EawM8Lij99W7OBTk0KW2ELUrQ=

pythia-160m-deduped

EleutherAI/pythia-160m-deduped

85M

84934656

84934656

12

12

768

50304

gelu

rotary

True

GPTNeoXForCausalLM

LN

EleutherAI/pythia-160m-deduped

GPTNeoXTokenizerFast

50254.0

96EawM8Lij99W7OBTk0KW2ELUrQ=

pythia-410m-deduped

EleutherAI/pythia-410m-deduped

302M

301989888

301989888

24

16

1024

50304

gelu

rotary

True

GPTNeoXForCausalLM

LN

EleutherAI/pythia-410m-deduped

GPTNeoXTokenizerFast

50254.0

96EawM8Lij99W7OBTk0KW2ELUrQ=

pythia-1b-deduped

EleutherAI/pythia-1b-deduped

805M

805306368

805306368

16

8

2048

50304

gelu

rotary

True

GPTNeoXForCausalLM

LN

EleutherAI/pythia-1b-deduped

GPTNeoXTokenizerFast

50254.0

96EawM8Lij99W7OBTk0KW2ELUrQ=

pythia-1.4b-deduped

EleutherAI/pythia-1.4b-deduped

1.2B

1207959552

1207959552

24

16

2048

50304

gelu

rotary

True

GPTNeoXForCausalLM

LN

EleutherAI/pythia-1.4b-deduped

GPTNeoXTokenizerFast

50254.0

96EawM8Lij99W7OBTk0KW2ELUrQ=

pythia-2.8b-deduped

EleutherAI/pythia-2.8b-deduped

2.5B

2516582400

2516582400

32

32

2560

50304

gelu

rotary

True

GPTNeoXForCausalLM

LN

EleutherAI/pythia-2.8b-deduped

GPTNeoXTokenizerFast

50254.0

96EawM8Lij99W7OBTk0KW2ELUrQ=

pythia-6.9b-deduped

EleutherAI/pythia-6.9b-deduped

6.4B

6442450944

6442450944

32

32

4096

50432

gelu

rotary

True

GPTNeoXForCausalLM

LN

EleutherAI/pythia-6.9b-deduped

GPTNeoXTokenizerFast

50254.0

96EawM8Lij99W7OBTk0KW2ELUrQ=

pythia-12b-deduped

EleutherAI/pythia-12b-deduped

11B

11324620800

11324620800

36

40

5120

50688

gelu

rotary

True

GPTNeoXForCausalLM

LN

EleutherAI/pythia-12b-deduped

GPTNeoXTokenizerFast

50254.0

96EawM8Lij99W7OBTk0KW2ELUrQ=

pythia-70m-v0

EleutherAI/pythia-70m-v0

19M

18874368

18874368

6

8

512

50304

gelu

rotary

True

GPTNeoXForCausalLM

LN

EleutherAI/pythia-70m-v0

GPTNeoXTokenizerFast

50254.0

96EawM8Lij99W7OBTk0KW2ELUrQ=

pythia-160m-v0

EleutherAI/pythia-160m-v0

85M

84934656

84934656

12

12

768

50304

gelu

rotary

True

GPTNeoXForCausalLM

LN

EleutherAI/pythia-160m-v0

GPTNeoXTokenizerFast

50254.0

96EawM8Lij99W7OBTk0KW2ELUrQ=

pythia-410m-v0

EleutherAI/pythia-410m-v0

302M

301989888

301989888

24

16

1024

50304

gelu

rotary

True

GPTNeoXForCausalLM

LN

EleutherAI/pythia-410m-v0

GPTNeoXTokenizerFast

50254.0

96EawM8Lij99W7OBTk0KW2ELUrQ=

pythia-1b-v0

EleutherAI/pythia-1b-v0

805M

805306368

805306368

16

8

2048

50304

gelu

rotary

True

GPTNeoXForCausalLM

LN

EleutherAI/pythia-1b-v0

GPTNeoXTokenizerFast

50254.0

96EawM8Lij99W7OBTk0KW2ELUrQ=

pythia-1.4b-v0

EleutherAI/pythia-1.4b-v0

1.2B

1207959552

1207959552

24

16

2048

50304

gelu

rotary

True

GPTNeoXForCausalLM

LN

EleutherAI/pythia-1.4b-v0

GPTNeoXTokenizerFast

50254.0

96EawM8Lij99W7OBTk0KW2ELUrQ=

pythia-2.8b-v0

EleutherAI/pythia-2.8b-v0

2.5B

2516582400

2516582400

32

32

2560

50304

gelu

rotary

True

GPTNeoXForCausalLM

LN

EleutherAI/pythia-2.8b-v0

GPTNeoXTokenizerFast

50254.0

96EawM8Lij99W7OBTk0KW2ELUrQ=

pythia-6.9b-v0

EleutherAI/pythia-6.9b-v0

6.4B

6442450944

6442450944

32

32

4096

50432

gelu

rotary

True

GPTNeoXForCausalLM

LN

EleutherAI/pythia-6.9b-v0

GPTNeoXTokenizerFast

50254.0

96EawM8Lij99W7OBTk0KW2ELUrQ=

pythia-12b-v0

EleutherAI/pythia-12b-v0

11B

11324620800

11324620800

36

40

5120

50688

gelu

rotary

True

GPTNeoXForCausalLM

LN

EleutherAI/pythia-12b-v0

GPTNeoXTokenizerFast

50254.0

96EawM8Lij99W7OBTk0KW2ELUrQ=

pythia-70m-deduped-v0

EleutherAI/pythia-70m-deduped-v0

19M

18874368

18874368

6

8

512

50304

gelu

rotary

True

GPTNeoXForCausalLM

LN

EleutherAI/pythia-70m-deduped-v0

GPTNeoXTokenizerFast

50254.0

96EawM8Lij99W7OBTk0KW2ELUrQ=

pythia-160m-deduped-v0

EleutherAI/pythia-160m-deduped-v0

85M

84934656

84934656

12

12

768

50304

gelu

rotary

True

GPTNeoXForCausalLM

LN

EleutherAI/pythia-160m-deduped-v0

GPTNeoXTokenizerFast

50254.0

96EawM8Lij99W7OBTk0KW2ELUrQ=

pythia-410m-deduped-v0

EleutherAI/pythia-410m-deduped-v0

302M

301989888

301989888

24

16

1024

50304

gelu

rotary

True

GPTNeoXForCausalLM

LN

EleutherAI/pythia-410m-deduped-v0

GPTNeoXTokenizerFast

50254.0

96EawM8Lij99W7OBTk0KW2ELUrQ=

pythia-1b-deduped-v0

EleutherAI/pythia-1b-deduped-v0

805M

805306368

805306368

16

8

2048

50304

gelu

rotary

True

GPTNeoXForCausalLM

LN

EleutherAI/pythia-1b-deduped-v0

GPTNeoXTokenizerFast

50254.0

96EawM8Lij99W7OBTk0KW2ELUrQ=

pythia-1.4b-deduped-v0

EleutherAI/pythia-1.4b-deduped-v0

1.2B

1207959552

1207959552

24

16

2048

50304

gelu

rotary

True

GPTNeoXForCausalLM

LN

EleutherAI/pythia-1.4b-deduped-v0

GPTNeoXTokenizerFast

50254.0

96EawM8Lij99W7OBTk0KW2ELUrQ=

pythia-2.8b-deduped-v0

EleutherAI/pythia-2.8b-deduped-v0

2.5B

2516582400

2516582400

32

32

2560

50304

gelu

rotary

True

GPTNeoXForCausalLM

LN

EleutherAI/pythia-2.8b-deduped-v0

GPTNeoXTokenizerFast

50254.0

96EawM8Lij99W7OBTk0KW2ELUrQ=

pythia-6.9b-deduped-v0

EleutherAI/pythia-6.9b-deduped-v0

6.4B

6442450944

6442450944

32

32

4096

50432

gelu

rotary

True

GPTNeoXForCausalLM

LN

EleutherAI/pythia-6.9b-deduped-v0

GPTNeoXTokenizerFast

50254.0

96EawM8Lij99W7OBTk0KW2ELUrQ=

pythia-12b-deduped-v0

EleutherAI/pythia-12b-deduped-v0

11B

11324620800

11324620800

36

40

5120

50688

gelu

rotary

True

GPTNeoXForCausalLM

LN

EleutherAI/pythia-12b-deduped-v0

GPTNeoXTokenizerFast

50254.0

96EawM8Lij99W7OBTk0KW2ELUrQ=

pythia-160m-seed1

EleutherAI/pythia-160m-seed1

85M

84934656

84934656

12

12

768

50304

gelu

rotary

True

GPTNeoXForCausalLM

LN

EleutherAI/pythia-160m-seed1

GPTNeoXTokenizerFast

50254.0

96EawM8Lij99W7OBTk0KW2ELUrQ=

pythia-160m-seed2

EleutherAI/pythia-160m-seed2

85M

84934656

84934656

12

12

768

50304

gelu

rotary

True

GPTNeoXForCausalLM

LN

EleutherAI/pythia-160m-seed2

GPTNeoXTokenizerFast

50254.0

96EawM8Lij99W7OBTk0KW2ELUrQ=

pythia-160m-seed3

EleutherAI/pythia-160m-seed3

85M

84934656

84934656

12

12

768

50304

gelu

rotary

True

GPTNeoXForCausalLM

LN

EleutherAI/pythia-160m-seed3

GPTNeoXTokenizerFast

50254.0

96EawM8Lij99W7OBTk0KW2ELUrQ=

solu-1l-pile

NeelNanda/SoLU_1L_v9_old

13M

12582912

12582912

1

16

1024

50278

solu_ln

standard

False

neel-solu-old

LN

EleutherAI/gpt-neox-20b

GPTNeoXTokenizerFast

50254.0

96EawM8Lij99W7OBTk0KW2ELUrQ=

solu-2l-pile

NeelNanda/SoLU_2L_v10_old

13M

12812288

12812288

2

11

736

50278

solu_ln

standard

False

neel-solu-old

LNPre

EleutherAI/gpt-neox-20b

GPTNeoXTokenizerFast

50254.0

96EawM8Lij99W7OBTk0KW2ELUrQ=

solu-4l-pile

NeelNanda/SoLU_4L_v11_old

13M

12582912

12582912

4

8

512

50278

solu_ln

standard

False

neel-solu-old

LNPre

EleutherAI/gpt-neox-20b

GPTNeoXTokenizerFast

50254.0

96EawM8Lij99W7OBTk0KW2ELUrQ=

solu-6l-pile

NeelNanda/SoLU_6L_v13_old

42M

42467328

42467328

6

12

768

50278

solu_ln

standard

False

neel-solu-old

LNPre

EleutherAI/gpt-neox-20b

GPTNeoXTokenizerFast

50254.0

96EawM8Lij99W7OBTk0KW2ELUrQ=

solu-8l-pile

NeelNanda/SoLU_8L_v21_old

101M

100663296

100663296

8

16

1024

50278

solu_ln

standard

False

neel-solu-old

LNPre

EleutherAI/gpt-neox-20b

GPTNeoXTokenizerFast

50254.0

96EawM8Lij99W7OBTk0KW2ELUrQ=

solu-10l-pile

NeelNanda/SoLU_10L_v22_old

197M

196608000

196608000

10

20

1280

50278

solu_ln

standard

False

neel-solu-old

LNPre

EleutherAI/gpt-neox-20b

GPTNeoXTokenizerFast

50254.0

96EawM8Lij99W7OBTk0KW2ELUrQ=

solu-12l-pile

NeelNanda/SoLU_12L_v23_old

340M

339738624

339738624

12

24

1536

50278

solu_ln

standard

False

neel-solu-old

LN

EleutherAI/gpt-neox-20b

GPTNeoXTokenizerFast

50254.0

96EawM8Lij99W7OBTk0KW2ELUrQ=

solu-1l

NeelNanda/SoLU_1L512W_C4_Code

3.1M

3145728

3145728

1

8

512

48262

solu_ln

standard

False

neel

LN

NeelNanda/gpt-neox-tokenizer-digits

PreTrainedTokenizerFast

48262.0

AsGo9tS8Sq4-rlVHM2o3-GyDkJU=

solu-2l

NeelNanda/SoLU_2L512W_C4_Code

6.3M

6291456

6291456

2

8

512

48262

solu_ln

standard

False

neel

LN

NeelNanda/gpt-neox-tokenizer-digits

PreTrainedTokenizerFast

48262.0

AsGo9tS8Sq4-rlVHM2o3-GyDkJU=

solu-3l

NeelNanda/SoLU_3L512W_C4_Code

9.4M

9437184

9437184

3

8

512

48262

solu_ln

standard

False

neel

LN

NeelNanda/gpt-neox-tokenizer-digits

PreTrainedTokenizerFast

48262.0

AsGo9tS8Sq4-rlVHM2o3-GyDkJU=

solu-4l

NeelNanda/SoLU_4L512W_C4_Code

13M

12582912

12582912

4

8

512

48262

solu_ln

standard

False

neel

LN

NeelNanda/gpt-neox-tokenizer-digits

PreTrainedTokenizerFast

48262.0

AsGo9tS8Sq4-rlVHM2o3-GyDkJU=

solu-6l

NeelNanda/SoLU_6L768W_C4_Code

42M

42467328

42467328

6

12

768

48262

solu_ln

standard

False

neel

LN

NeelNanda/gpt-neox-tokenizer-digits

PreTrainedTokenizerFast

48262.0

AsGo9tS8Sq4-rlVHM2o3-GyDkJU=

solu-8l

NeelNanda/SoLU_8L1024W_C4_Code

101M

100663296

100663296

8

16

1024

48262

solu_ln

standard

False

neel

LN

NeelNanda/gpt-neox-tokenizer-digits

PreTrainedTokenizerFast

48262.0

AsGo9tS8Sq4-rlVHM2o3-GyDkJU=

solu-10l

NeelNanda/SoLU_10L1280W_C4_Code

197M

196608000

196608000

10

20

1280

48262

solu_ln

standard

False

neel

LN

NeelNanda/gpt-neox-tokenizer-digits

PreTrainedTokenizerFast

48262.0

AsGo9tS8Sq4-rlVHM2o3-GyDkJU=

solu-12l

NeelNanda/SoLU_12L1536W_C4_Code

340M

339738624

339738624

12

24

1536

48262

solu_ln

standard

False

neel

LN

NeelNanda/gpt-neox-tokenizer-digits

PreTrainedTokenizerFast

48262.0

AsGo9tS8Sq4-rlVHM2o3-GyDkJU=

gelu-1l

NeelNanda/GELU_1L512W_C4_Code

3.1M

3145728

3145728

1

8

512

48262

gelu

standard

False

neel

LN

NeelNanda/gpt-neox-tokenizer-digits

PreTrainedTokenizerFast

48262.0

AsGo9tS8Sq4-rlVHM2o3-GyDkJU=

gelu-2l

NeelNanda/GELU_2L512W_C4_Code

6.3M

6291456

6291456

2

8

512

48262

gelu

standard

False

neel

LN

NeelNanda/gpt-neox-tokenizer-digits

PreTrainedTokenizerFast

48262.0

AsGo9tS8Sq4-rlVHM2o3-GyDkJU=

gelu-3l

NeelNanda/GELU_3L512W_C4_Code

9.4M

9437184

9437184

3

8

512

48262

gelu

standard

False

neel

LN

NeelNanda/gpt-neox-tokenizer-digits

PreTrainedTokenizerFast

48262.0

AsGo9tS8Sq4-rlVHM2o3-GyDkJU=

gelu-4l

NeelNanda/GELU_4L512W_C4_Code

13M

12582912

12582912

4

8

512

48262

gelu

standard

False

neel

LN

NeelNanda/gpt-neox-tokenizer-digits

PreTrainedTokenizerFast

48262.0

AsGo9tS8Sq4-rlVHM2o3-GyDkJU=

attn-only-1l

NeelNanda/Attn_Only_1L512W_C4_Code

1.0M

1048576

1048576

1

8

512

48262

solu_ln

standard

False

neel

LN

NeelNanda/gpt-neox-tokenizer-digits

PreTrainedTokenizerFast

48262.0

AsGo9tS8Sq4-rlVHM2o3-GyDkJU=

attn-only-2l

NeelNanda/Attn_Only_2L512W_C4_Code

2.1M

2097152

2097152

2

8

512

48262

solu_ln

standard

False

neel

LN

NeelNanda/gpt-neox-tokenizer-digits

PreTrainedTokenizerFast

48262.0

AsGo9tS8Sq4-rlVHM2o3-GyDkJU=

attn-only-3l

NeelNanda/Attn_Only_3L512W_C4_Code

3.1M

3145728

3145728

3

8

512

48262

solu_ln

standard

False

neel

LN

NeelNanda/gpt-neox-tokenizer-digits

PreTrainedTokenizerFast

48262.0

AsGo9tS8Sq4-rlVHM2o3-GyDkJU=

attn-only-4l

NeelNanda/Attn_Only_4L512W_C4_Code

4.2M

4194304

4194304

4

8

512

48262

solu_ln

standard

False

neel

LN

NeelNanda/gpt-neox-tokenizer-digits

PreTrainedTokenizerFast

48262.0

AsGo9tS8Sq4-rlVHM2o3-GyDkJU=

attn-only-2l-demo

NeelNanda/Attn-Only-2L512W-Shortformer-6B-big-lr

2.1M

2097152

2097152

2

8

512

50277

solu_ln

shortformer

False

neel

EleutherAI/gpt-neox-20b

GPTNeoXTokenizerFast

50254.0

96EawM8Lij99W7OBTk0KW2ELUrQ=

solu-1l-wiki

NeelNanda/SoLU_1L512W_Wiki_Finetune

3.1M

3145728

3145728

1

8

512

48262

solu_ln

standard

False

neel

LN

NeelNanda/gpt-neox-tokenizer-digits

PreTrainedTokenizerFast

48262.0

AsGo9tS8Sq4-rlVHM2o3-GyDkJU=

solu-4l-wiki

NeelNanda/SoLU_4L512W_Wiki_Finetune

13M

12582912

12582912

4

8

512

48262

solu_ln

standard

False

neel

LN

NeelNanda/gpt-neox-tokenizer-digits

PreTrainedTokenizerFast

48262.0

AsGo9tS8Sq4-rlVHM2o3-GyDkJU=

redwood_attn_2l

ArthurConmy/redwood_attn_2l

524K

524288

524288

2

8

256

50259

gelu_new

shortformer

False

neel

LN

ArthurConmy/redwood_tokenizer

GPT2TokenizerFast

50257.0

J8auoAiqFanHN7mOtkTrFA9voRk=

llama-7b

llama-7b-hf

6.5B

6476005376

6476005376

32

32

4096

32000

silu

rotary

False

LlamaForCausalLM

RMS

huggyllama/llama-7b

LlamaTokenizerFast

32000.0

e3A7wYziNQPAWcJ15GMAQY8qZqw=

llama-13b

llama-13b-hf

13B

12687769600

12687769600

40

40

5120

32000

silu

rotary

False

LlamaForCausalLM

RMS

huggyllama/llama-13b

LlamaTokenizerFast

32000.0

e3A7wYziNQPAWcJ15GMAQY8qZqw=

llama-30b

llama-30b-hf

32B

32102154240

32102154240

60

52

6656

32000

silu

rotary

False

LlamaForCausalLM

RMS

huggyllama/llama-30b

LlamaTokenizerFast

32000.0

e3A7wYziNQPAWcJ15GMAQY8qZqw=

llama-65b

llama-65b-hf

65B

64760053760

64760053760

80

64

8192

32000

silu

rotary

False

LlamaForCausalLM

RMS

huggyllama/llama-65b

LlamaTokenizerFast

32000.0

e3A7wYziNQPAWcJ15GMAQY8qZqw=

Llama-2-7b

6.5B

6476005376

6476005376

32

32

4096

32000

silu

rotary

False

LlamaForCausalLM

RMS

meta-llama/Llama-2-7b-hf

LlamaTokenizerFast

32000.0

e3A7wYziNQPAWcJ15GMAQY8qZqw=

Llama-2-7b-chat

6.5B

6476005376

6476005376

32

32

4096

32000

silu

rotary

False

LlamaForCausalLM

RMS

meta-llama/Llama-2-7b-chat-hf

LlamaTokenizerFast

32000.0

e3A7wYziNQPAWcJ15GMAQY8qZqw=

Llama-2-13b

13B

12687769600

12687769600

40

40

5120

32000

silu

rotary

False

LlamaForCausalLM

RMS

meta-llama/Llama-2-13b-hf

LlamaTokenizerFast

32000.0

e3A7wYziNQPAWcJ15GMAQY8qZqw=

Llama-2-13b-chat

13B

12687769600

12687769600

40

40

5120

32000

silu

rotary

False

LlamaForCausalLM

RMS

meta-llama/Llama-2-13b-chat-hf

LlamaTokenizerFast

32000.0

e3A7wYziNQPAWcJ15GMAQY8qZqw=

Llama-2-70b-chat

78B

77846282240

77846282240

80

64

8192

32000

silu

rotary

False

LlamaForCausalLM

RMS

meta-llama/Llama-2-70b-chat-hf

LlamaTokenizerFast

32000.0

e3A7wYziNQPAWcJ15GMAQY8qZqw=

CodeLlamallama-2-7b

6.5B

6476005376

6476005376

32

32

4096

32016

silu

rotary

False

LlamaForCausalLM

RMS

codellama/CodeLlama-7b-hf

CodeLlamaTokenizerFast

32016.0

Tq7bUWJcm1X5kj9R-2uR1o7lSq8=

CodeLlama-7b-python

6.5B

6476005376

6476005376

32

32

4096

32000

silu

rotary

False

LlamaForCausalLM

RMS

codellama/CodeLlama-7b-Python-hf

CodeLlamaTokenizerFast

32000.0

8UzKES6KUWi5kIvXOLgDHqiLXZk=

CodeLlama-7b-instruct

6.5B

6476005376

6476005376

32

32

4096

32016

silu

rotary

False

LlamaForCausalLM

RMS

codellama/CodeLlama-7b-Instruct-hf

CodeLlamaTokenizerFast

32016.0

Tq7bUWJcm1X5kj9R-2uR1o7lSq8=

meta-llama/Meta-Llama-3-8B

7.8B

7784628224

7784628224

32

32

4096

128256

silu

rotary

False

LlamaForCausalLM

RMS

meta-llama/Meta-Llama-3-8B

PreTrainedTokenizerFast

128000.0

RnzNv9w_ITBp6b2dcibKR7_l85I=

meta-llama/Meta-Llama-3-8B-Instruct

7.8B

7784628224

7784628224

32

32

4096

128256

silu

rotary

False

LlamaForCausalLM

RMS

meta-llama/Meta-Llama-3-8B-Instruct

PreTrainedTokenizerFast

128000.0

RnzNv9w_ITBp6b2dcibKR7_l85I=

meta-llama/Meta-Llama-3-70B

78B

77846282240

77846282240

80

64

8192

128256

silu

rotary

False

LlamaForCausalLM

RMS

meta-llama/Meta-Llama-3-70B

PreTrainedTokenizerFast

128000.0

RnzNv9w_ITBp6b2dcibKR7_l85I=

meta-llama/Meta-Llama-3-70B-Instruct

78B

77846282240

77846282240

80

64

8192

128256

silu

rotary

False

LlamaForCausalLM

RMS

meta-llama/Meta-Llama-3-70B-Instruct

PreTrainedTokenizerFast

128000.0

RnzNv9w_ITBp6b2dcibKR7_l85I=

meta-llama/Llama-3.1-70B

78B

77846282240

77846282240

80

64

8192

128256

silu

rotary

False

LlamaForCausalLM

RMS

meta-llama/Llama-3.1-70B

PreTrainedTokenizerFast

128000.0

j9N50ddC7mjCgS4GseU9LmKZDKk=

meta-llama/Llama-3.1-8B

7.8B

7784628224

7784628224

32

32

4096

128256

silu

rotary

False

LlamaForCausalLM

RMS

meta-llama/Llama-3.1-8B

PreTrainedTokenizerFast

128000.0

j9N50ddC7mjCgS4GseU9LmKZDKk=

meta-llama/Llama-3.1-8B-Instruct

7.8B

7784628224

7784628224

32

32

4096

128256

silu

rotary

False

LlamaForCausalLM

RMS

meta-llama/Llama-3.1-8B-Instruct

PreTrainedTokenizerFast

128000.0

j9N50ddC7mjCgS4GseU9LmKZDKk=

meta-llama/Llama-3.1-70B-Instruct

78B

77846282240

77846282240

80

64

8192

128256

silu

rotary

False

LlamaForCausalLM

RMS

meta-llama/Llama-3.1-70B-Instruct

PreTrainedTokenizerFast

128000.0

j9N50ddC7mjCgS4GseU9LmKZDKk=

meta-llama/Llama-3.2-1B

1.1B

1073741824

1073741824

16

32

2048

128256

silu

rotary

False

LlamaForCausalLM

RMS

meta-llama/Llama-3.2-1B

PreTrainedTokenizerFast

128000.0

j9N50ddC7mjCgS4GseU9LmKZDKk=

meta-llama/Llama-3.2-3B

3.2B

3170893824

3170893824

28

24

3072

128256

silu

rotary

False

LlamaForCausalLM

RMS

meta-llama/Llama-3.2-3B

PreTrainedTokenizerFast

128000.0

j9N50ddC7mjCgS4GseU9LmKZDKk=

meta-llama/Llama-3.2-1B-Instruct

1.1B

1073741824

1073741824

16

32

2048

128256

silu

rotary

False

LlamaForCausalLM

RMS

meta-llama/Llama-3.2-1B-Instruct

PreTrainedTokenizerFast

128000.0

j9N50ddC7mjCgS4GseU9LmKZDKk=

meta-llama/Llama-3.2-3B-Instruct

3.2B

3170893824

3170893824

28

24

3072

128256

silu

rotary

False

LlamaForCausalLM

RMS

meta-llama/Llama-3.2-3B-Instruct

PreTrainedTokenizerFast

128000.0

j9N50ddC7mjCgS4GseU9LmKZDKk=

meta-llama/Llama-3.3-70B-Instruct

78B

77846282240

77846282240

80

64

8192

128256

silu

rotary

False

LlamaForCausalLM

RMS

meta-llama/Llama-3.3-70B-Instruct

PreTrainedTokenizerFast

128000.0

j9N50ddC7mjCgS4GseU9LmKZDKk=

othello-gpt

Baidicoot/Othello-GPT-Transformer-Lens

25M

25165824

25165824

8

8

512

61

gelu

standard

False

mingpt

LN

bert-base-cased

google-bert/bert-base-cased

85M

84934656

84934656

12

12

768

28996

gelu

standard

False

BertForMaskedLM

LN

google-bert/bert-base-cased

BertTokenizerFast

28996.0

SSKvHuFYtPbvgwMSLSIhfFE_kF8=

bert-base-uncased

google-bert/bert-base-uncased

85M

84934656

84934656

12

12

768

30522

gelu

standard

False

BertForMaskedLM

LN

google-bert/bert-base-uncased

BertTokenizerFast

30522.0

G9iEWgpI_JY73i8Lym9gBVhq4BI=

bert-large-cased

google-bert/bert-large-cased

302M

301989888

301989888

24

16

1024

28996

gelu

standard

False

BertForMaskedLM

LN

google-bert/bert-large-cased

BertTokenizerFast

28996.0

SSKvHuFYtPbvgwMSLSIhfFE_kF8=

bert-large-uncased

google-bert/bert-large-uncased

302M

301989888

301989888

24

16

1024

30522

gelu

standard

False

BertForMaskedLM

LN

google-bert/bert-large-uncased

BertTokenizerFast

30522.0

G9iEWgpI_JY73i8Lym9gBVhq4BI=

tiny-stories-1M

393K

393216

393216

8

16

64

50257

gelu_new

standard

False

GPTNeoForCausalLM

LN

roneneldan/TinyStories-1M

GPT2TokenizerFast

50257.0

v8xfIj5kwZX5RwgLU66lZNZUlE4=

tiny-stories-3M

1.6M

1572864

1572864

8

16

128

50257

gelu_new

standard

False

GPTNeoForCausalLM

LN

roneneldan/TinyStories-3M

GPT2TokenizerFast

50257.0

v8xfIj5kwZX5RwgLU66lZNZUlE4=

tiny-stories-8M

6.3M

6291456

6291456

8

16

256

50257

gelu_new

standard

False

GPTNeoForCausalLM

LN

roneneldan/TinyStories-8M

GPT2TokenizerFast

50257.0

v8xfIj5kwZX5RwgLU66lZNZUlE4=

tiny-stories-28M

25M

25165824

25165824

8

16

512

50257

gelu_new

standard

False

GPTNeoForCausalLM

LN

roneneldan/TinyStories-28M

GPT2TokenizerFast

50257.0

v8xfIj5kwZX5RwgLU66lZNZUlE4=

tiny-stories-33M

28M

28311552

28311552

4

16

768

50257

gelu_new

standard

False

GPTNeoForCausalLM

LN

roneneldan/TinyStories-33M

GPT2TokenizerFast

50257.0

v8xfIj5kwZX5RwgLU66lZNZUlE4=

tiny-stories-instruct-1M

393K

393216

393216

8

16

64

50257

gelu_new

standard

False

GPTNeoForCausalLM

LN

roneneldan/TinyStories-Instruct-1M

GPT2TokenizerFast

50257.0

v8xfIj5kwZX5RwgLU66lZNZUlE4=

tiny-stories-instruct-3M

1.6M

1572864

1572864

8

16

128

50257

gelu_new

standard

False

GPTNeoForCausalLM

LN

roneneldan/TinyStories-Instruct-3M

GPT2TokenizerFast

50257.0

v8xfIj5kwZX5RwgLU66lZNZUlE4=

tiny-stories-instruct-8M

6.3M

6291456

6291456

8

16

256

50257

gelu_new

standard

False

GPTNeoForCausalLM

LN

roneneldan/TinyStories-Instruct-8M

GPT2TokenizerFast

50257.0

v8xfIj5kwZX5RwgLU66lZNZUlE4=

tiny-stories-instruct-28M

25M

25165824

25165824

8

16

512

50257

gelu_new

standard

False

GPTNeoForCausalLM

LN

roneneldan/TinyStories-Instruct-28M

GPT2TokenizerFast

50257.0

v8xfIj5kwZX5RwgLU66lZNZUlE4=

tiny-stories-instruct-33M

28M

28311552

28311552

4

16

768

50257

gelu_new

standard

False

GPTNeoForCausalLM

LN

roneneldan/TinyStories-Instruct-33M

GPT2TokenizerFast

50257.0

v8xfIj5kwZX5RwgLU66lZNZUlE4=

tiny-stories-1L-21M

13M

12582912

12582912

1

16

1024

50257

gelu_new

standard

False

GPTNeoForCausalLM

LN

roneneldan/TinyStories-1Layer-21M

GPT2TokenizerFast

50257.0

v8xfIj5kwZX5RwgLU66lZNZUlE4=

tiny-stories-2L-33M

25M

25165824

25165824

2

16

1024

50257

gelu_new

standard

False

GPTNeoForCausalLM

LN

roneneldan/TinyStories-2Layers-33M

GPT2TokenizerFast

50257.0

v8xfIj5kwZX5RwgLU66lZNZUlE4=

tiny-stories-instruct-1L-21M

13M

12582912

12582912

1

16

1024

50257

gelu_new

standard

False

GPTNeoForCausalLM

LN

roneneldan/TinyStories-Instuct-1Layer-21M

GPT2TokenizerFast

50257.0

v8xfIj5kwZX5RwgLU66lZNZUlE4=

tiny-stories-instruct-2L-33M

25M

25165824

25165824

2

16

1024

50257

gelu_new

standard

False

GPTNeoForCausalLM

LN

roneneldan/TinyStories-Instruct-2Layers-33M

GPT2TokenizerFast

50257.0

v8xfIj5kwZX5RwgLU66lZNZUlE4=

stablelm-base-alpha-3b

stabilityai/stablelm-base-alpha-3b

3.2B

3221225472

3221225472

16

32

4096

50688

gelu

rotary

True

GPTNeoXForCausalLM

LN

stabilityai/stablelm-base-alpha-3b

GPTNeoXTokenizerFast

50254.0

96EawM8Lij99W7OBTk0KW2ELUrQ=

stablelm-base-alpha-7b

stabilityai/stablelm-base-alpha-7b

7.2B

7247757312

7247757312

16

48

6144

50432

gelu

rotary

True

GPTNeoXForCausalLM

LN

stabilityai/stablelm-base-alpha-7b

GPTNeoXTokenizerFast

50254.0

96EawM8Lij99W7OBTk0KW2ELUrQ=

stablelm-tuned-alpha-3b

stabilityai/stablelm-tuned-alpha-3b

3.2B

3221225472

3221225472

16

32

4096

50688

gelu

rotary

True

GPTNeoXForCausalLM

LN

stabilityai/stablelm-tuned-alpha-3b

GPTNeoXTokenizerFast

50254.0

RD3vcWSd_TiTpqo5dHyICzaXtGQ=

stablelm-tuned-alpha-7b

stabilityai/stablelm-tuned-alpha-7b

7.2B

7247757312

7247757312

16

48

6144

50432

gelu

rotary

True

GPTNeoXForCausalLM

LN

stabilityai/stablelm-tuned-alpha-7b

GPTNeoXTokenizerFast

50254.0

RD3vcWSd_TiTpqo5dHyICzaXtGQ=

mistral-7b

mistralai/Mistral-7B-v0.1

7.8B

7784628224

7784628224

32

32

4096

32000

silu

rotary

False

MistralForCausalLM

RMS

mistralai/Mistral-7B-v0.1

LlamaTokenizerFast

32000.0

kkCQxUk-PF9Ay_ZKDdKCh02YaGQ=

mistral-7b-instruct

mistralai/Mistral-7B-Instruct-v0.1

7.8B

7784628224

7784628224

32

32

4096

32000

silu

rotary

False

MistralForCausalLM

RMS

mistralai/Mistral-7B-Instruct-v0.1

LlamaTokenizerFast

32000.0

kkCQxUk-PF9Ay_ZKDdKCh02YaGQ=

mistralai/Mistral-Small-24B-Base-2501

23B

23488102400

23488102400

40

32

5120

131072

silu

rotary

False

MistralForCausalLM

RMS

mistralai/Mistral-Small-24B-Base-2501

LlamaTokenizerFast

131072.0

GEwgZayWxpmhQxtMq-WrVFJEfqM=

mistral-nemo-base-2407

mistralai/Mistral-Nemo-Base-2407

12B

12163481600

12163481600

40

32

5120

131072

silu

rotary

False

MistralForCausalLM

RMS

mistralai/Mistral-Nemo-Base-2407

PreTrainedTokenizerFast

131072.0

0xs_eSvVgsyZGbcLSIpGUn4Gdms=

mixtral

mistralai/Mixtral-8x7B-v0.1

47B

47245688832

47245688832

32

32

4096

32000

silu

rotary

False

MixtralForCausalLM

RMS

mistralai/Mixtral-8x7B-v0.1

LlamaTokenizerFast

32000.0

kkCQxUk-PF9Ay_ZKDdKCh02YaGQ=

mixtral-instruct

mistralai/Mixtral-8x7B-Instruct-v0.1

47B

47245688832

47245688832

32

32

4096

32000

silu

rotary

False

MixtralForCausalLM

RMS

mistralai/Mixtral-8x7B-Instruct-v0.1

LlamaTokenizerFast

32000.0

kkCQxUk-PF9Ay_ZKDdKCh02YaGQ=

bloom-560m

bigscience/bloom-560m

302M

301989888

301989888

24

16

1024

250880

gelu_fast

alibi

False

BloomForCausalLM

LN

bigscience/bloom-560m

BloomTokenizerFast

250680.0

OO9NZoesMCpWsijo1O2DAbq9GqI=

bloom-1b1

bigscience/bloom-1b1

679M

679477248

679477248

24

16

1536

250880

gelu_fast

alibi

False

BloomForCausalLM

LN

bigscience/bloom-1b1

BloomTokenizerFast

250680.0

OO9NZoesMCpWsijo1O2DAbq9GqI=

bloom-1b7

bigscience/bloom-1b7

1.2B

1207959552

1207959552

24

16

2048

250880

gelu_fast

alibi

False

BloomForCausalLM

LN

bigscience/bloom-1b7

BloomTokenizerFast

250680.0

OO9NZoesMCpWsijo1O2DAbq9GqI=

bloom-3b

bigscience/bloom-3b

2.4B

2359296000

2359296000

30

32

2560

250880

gelu_fast

alibi

False

BloomForCausalLM

LN

bigscience/bloom-3b

BloomTokenizerFast

250680.0

OO9NZoesMCpWsijo1O2DAbq9GqI=

bloom-7b1

bigscience/bloom-7b1

6.0B

6039797760

6039797760

30

32

4096

250880

gelu_fast

alibi

False

BloomForCausalLM

LN

bigscience/bloom-7b1

BloomTokenizerFast

250680.0

OO9NZoesMCpWsijo1O2DAbq9GqI=

santacoder

bigcode/santacoder

1.2B

1207959552

1207959552

24

16

2048

49280

gelu_fast

standard

False

GPT2LMHeadCustomModel

LN

bigcode/santacoder

GPT2TokenizerFast

49152.0

GiKC-dU7fpR4sGNkpwn7JKK6qys=

qwen-1.8b

Qwen/Qwen-1_8B

1.2B

1214251008

1214251008

24

16

2048

151936

silu

rotary

False

QWenLMHeadModel

RMS

Qwen/Qwen-1_8B

QWenTokenizer

151851.0

LXUZBV-DGPX2Ty50XH848Cn_umU=

qwen-7b

Qwen/Qwen-7B

6.5B

6476005376

6476005376

32

32

4096

151936

silu

rotary

False

QWenLMHeadModel

RMS

Qwen/Qwen-7B

QWenTokenizer

151851.0

LXUZBV-DGPX2Ty50XH848Cn_umU=

qwen-14b

Qwen/Qwen-14B

13B

12609126400

12609126400

40

40

5120

152064

silu

rotary

False

QWenLMHeadModel

RMS

Qwen/Qwen-14B

QWenTokenizer

151851.0

LXUZBV-DGPX2Ty50XH848Cn_umU=

qwen-1.8b-chat

Qwen/Qwen-1_8B-Chat

1.2B

1214251008

1214251008

24

16

2048

151936

silu

rotary

False

QWenLMHeadModel

RMS

Qwen/Qwen-1_8B-Chat

QWenTokenizer

151851.0

LXUZBV-DGPX2Ty50XH848Cn_umU=

qwen-7b-chat

Qwen/Qwen-7B-Chat

6.5B

6476005376

6476005376

32

32

4096

151936

silu

rotary

False

QWenLMHeadModel

RMS

Qwen/Qwen-7B-Chat

QWenTokenizer

151851.0

LXUZBV-DGPX2Ty50XH848Cn_umU=

qwen-14b-chat

Qwen/Qwen-14B-Chat

13B

12609126400

12609126400

40

40

5120

152064

silu

rotary

False

QWenLMHeadModel

RMS

Qwen/Qwen-14B-Chat

QWenTokenizer

151851.0

LXUZBV-DGPX2Ty50XH848Cn_umU=

qwen1.5-0.5b

Qwen/Qwen1.5-0.5B

308M

308281344

308281344

24

16

1024

151936

silu

rotary

False

Qwen2ForCausalLM

RMS

Qwen/Qwen1.5-0.5B

Qwen2TokenizerFast

151643.0

vakQOjPaHpZ23kxcqX0tTXi2EzQ=

qwen1.5-0.5b-chat

Qwen/Qwen1.5-0.5B-Chat

308M

308281344

308281344

24

16

1024

151936

silu

rotary

False

Qwen2ForCausalLM

RMS

Qwen/Qwen1.5-0.5B-Chat

Qwen2TokenizerFast

151643.0

vakQOjPaHpZ23kxcqX0tTXi2EzQ=

qwen1.5-1.8b

Qwen/Qwen1.5-1.8B

1.2B

1214251008

1214251008

24

16

2048

151936

silu

rotary

False

Qwen2ForCausalLM

RMS

Qwen/Qwen1.5-1.8B

Qwen2TokenizerFast

151643.0

vakQOjPaHpZ23kxcqX0tTXi2EzQ=

qwen1.5-1.8b-chat

Qwen/Qwen1.5-1.8B-Chat

1.2B

1214251008

1214251008

24

16

2048

151936

silu

rotary

False

Qwen2ForCausalLM

RMS

Qwen/Qwen1.5-1.8B-Chat

Qwen2TokenizerFast

151643.0

vakQOjPaHpZ23kxcqX0tTXi2EzQ=

qwen1.5-4b

Qwen/Qwen1.5-4B

3.2B

3171942400

3171942400

40

20

2560

151936

silu

rotary

False

Qwen2ForCausalLM

RMS

Qwen/Qwen1.5-4B

Qwen2TokenizerFast

151643.0

vakQOjPaHpZ23kxcqX0tTXi2EzQ=

qwen1.5-4b-chat

Qwen/Qwen1.5-4B-Chat

3.2B

3171942400

3171942400

40

20

2560

151936

silu

rotary

False

Qwen2ForCausalLM

RMS

Qwen/Qwen1.5-4B-Chat

Qwen2TokenizerFast

151643.0

vakQOjPaHpZ23kxcqX0tTXi2EzQ=

qwen1.5-7b

Qwen/Qwen1.5-7B

6.5B

6476005376

6476005376

32

32

4096

151936

silu

rotary

False

Qwen2ForCausalLM

RMS

Qwen/Qwen1.5-7B

Qwen2TokenizerFast

151643.0

vakQOjPaHpZ23kxcqX0tTXi2EzQ=

qwen1.5-7b-chat

Qwen/Qwen1.5-7B-Chat

6.5B

6476005376

6476005376

32

32

4096

151936

silu

rotary

False

Qwen2ForCausalLM

RMS

Qwen/Qwen1.5-7B-Chat

Qwen2TokenizerFast

151643.0

vakQOjPaHpZ23kxcqX0tTXi2EzQ=

qwen1.5-14b

Qwen/Qwen1.5-14B

13B

12609126400

12609126400

40

40

5120

152064

silu

rotary

False

Qwen2ForCausalLM

RMS

Qwen/Qwen1.5-14B

Qwen2TokenizerFast

151643.0

vakQOjPaHpZ23kxcqX0tTXi2EzQ=

qwen1.5-14b-chat

Qwen/Qwen1.5-14B-Chat

13B

12609126400

12609126400

40

40

5120

152064

silu

rotary

False

Qwen2ForCausalLM

RMS

Qwen/Qwen1.5-14B-Chat

Qwen2TokenizerFast

151643.0

vakQOjPaHpZ23kxcqX0tTXi2EzQ=

qwen2-0.5b

Qwen/Qwen2-0.5B

391M

390856704

390856704

24

14

896

151936

silu

rotary

False

Qwen2ForCausalLM

RMS

Qwen/Qwen2-0.5B

Qwen2TokenizerFast

151643.0

vakQOjPaHpZ23kxcqX0tTXi2EzQ=

qwen2-0.5b-instruct

Qwen/Qwen2-0.5B-Instruct

391M

390856704

390856704

24

14

896

151936

silu

rotary

False

Qwen2ForCausalLM

RMS

Qwen/Qwen2-0.5B-Instruct

Qwen2TokenizerFast

151643.0

vakQOjPaHpZ23kxcqX0tTXi2EzQ=

qwen2-1.5b

Qwen/Qwen2-1.5B

1.4B

1420296192

1420296192

28

12

1536

151936

silu

rotary

False

Qwen2ForCausalLM

RMS

Qwen/Qwen2-1.5B

Qwen2TokenizerFast

151643.0

vakQOjPaHpZ23kxcqX0tTXi2EzQ=

qwen2-1.5b-instruct

Qwen/Qwen2-1.5B-Instruct

1.4B

1420296192

1420296192

28

12

1536

151936

silu

rotary

False

Qwen2ForCausalLM

RMS

Qwen/Qwen2-1.5B-Instruct

Qwen2TokenizerFast

151643.0

vakQOjPaHpZ23kxcqX0tTXi2EzQ=

qwen2-7b

Qwen/Qwen2-7B

7.1B

7141851136

7141851136

28

28

3584

152064

silu

rotary

False

Qwen2ForCausalLM

RMS

Qwen/Qwen2-7B

Qwen2TokenizerFast

151643.0

vakQOjPaHpZ23kxcqX0tTXi2EzQ=

qwen2-7b-instruct

Qwen/Qwen2-7B-Instruct

7.1B

7141851136

7141851136

28

28

3584

152064

silu

rotary

False

Qwen2ForCausalLM

RMS

Qwen/Qwen2-7B-Instruct

Qwen2TokenizerFast

151643.0

vakQOjPaHpZ23kxcqX0tTXi2EzQ=

qwen2.5-0.5b

Qwen/Qwen2.5-0.5B

391M

390856704

390856704

24

14

896

151936

silu

rotary

False

Qwen2ForCausalLM

RMS

Qwen/Qwen2.5-0.5B

Qwen2TokenizerFast

151643.0

NI384GYDfJidzgXg_-9habj8lOk=

qwen2.5-0.5b-instruct

Qwen/Qwen2.5-0.5B-Instruct

391M

390856704

390856704

24

14

896

151936

silu

rotary

False

Qwen2ForCausalLM

RMS

Qwen/Qwen2.5-0.5B-Instruct

Qwen2TokenizerFast

151643.0

NI384GYDfJidzgXg_-9habj8lOk=

qwen2.5-1.5b

Qwen/Qwen2.5-1.5B

1.4B

1420296192

1420296192

28

12

1536

151936

silu

rotary

False

Qwen2ForCausalLM

RMS

Qwen/Qwen2.5-1.5B

Qwen2TokenizerFast

151643.0

NI384GYDfJidzgXg_-9habj8lOk=

qwen2.5-1.5b-instruct

Qwen/Qwen2.5-1.5B-Instruct

1.4B

1420296192

1420296192

28

12

1536

151936

silu

rotary

False

Qwen2ForCausalLM

RMS

Qwen/Qwen2.5-1.5B-Instruct

Qwen2TokenizerFast

151643.0

NI384GYDfJidzgXg_-9habj8lOk=

qwen2.5-3b

Qwen/Qwen2.5-3B

3.0B

3038773248

3038773248

36

16

2048

151936

silu

rotary

False

Qwen2ForCausalLM

RMS

Qwen/Qwen2.5-3B

Qwen2TokenizerFast

151643.0

NI384GYDfJidzgXg_-9habj8lOk=

qwen2.5-3b-instruct

Qwen/Qwen2.5-3B-Instruct

3.0B

3038773248

3038773248

36

16

2048

151936

silu

rotary

False

Qwen2ForCausalLM

RMS

Qwen/Qwen2.5-3B-Instruct

Qwen2TokenizerFast

151643.0

NI384GYDfJidzgXg_-9habj8lOk=

qwen2.5-7b

Qwen/Qwen2.5-7B

7.1B

7141851136

7141851136

28

28

3584

152064

silu

rotary

False

Qwen2ForCausalLM

RMS

Qwen/Qwen2.5-7B

Qwen2TokenizerFast

151643.0

NI384GYDfJidzgXg_-9habj8lOk=

qwen2.5-7b-instruct

Qwen/Qwen2.5-7B-Instruct

7.1B

7141851136

7141851136

28

28

3584

152064

silu

rotary

False

Qwen2ForCausalLM

RMS

Qwen/Qwen2.5-7B-Instruct

Qwen2TokenizerFast

151643.0

NI384GYDfJidzgXg_-9habj8lOk=

qwen2.5-14b

Qwen/Qwen2.5-14B

15B

15225323520

15225323520

48

40

5120

152064

silu

rotary

False

Qwen2ForCausalLM

RMS

Qwen/Qwen2.5-14B

Qwen2TokenizerFast

151643.0

NI384GYDfJidzgXg_-9habj8lOk=

qwen2.5-14b-instruct

Qwen/Qwen2.5-14B-Instruct

15B

15225323520

15225323520

48

40

5120

152064

silu

rotary

False

Qwen2ForCausalLM

RMS

Qwen/Qwen2.5-14B-Instruct

Qwen2TokenizerFast

151643.0

NI384GYDfJidzgXg_-9habj8lOk=

qwen2.5-32b

Qwen/Qwen2.5-32B

34B

33889976320

33889976320

64

40

5120

152064

silu

rotary

False

Qwen2ForCausalLM

RMS

Qwen/Qwen2.5-32B

Qwen2TokenizerFast

151643.0

NI384GYDfJidzgXg_-9habj8lOk=

qwen2.5-32b-instruct

Qwen/Qwen2.5-32B-Instruct

34B

33889976320

33889976320

64

40

5120

152064

silu

rotary

False

Qwen2ForCausalLM

RMS

Qwen/Qwen2.5-32B-Instruct

Qwen2TokenizerFast

151643.0

NI384GYDfJidzgXg_-9habj8lOk=

qwen2.5-72b

Qwen/Qwen2.5-72B

80B

79607889920

79607889920

80

64

8192

152064

silu

rotary

False

Qwen2ForCausalLM

RMS

Qwen/Qwen2.5-72B

Qwen2TokenizerFast

151643.0

NI384GYDfJidzgXg_-9habj8lOk=

qwen2.5-72b-instruct

Qwen/Qwen2.5-72B-Instruct

80B

79607889920

79607889920

80

64

8192

152064

silu

rotary

False

Qwen2ForCausalLM

RMS

Qwen/Qwen2.5-72B-Instruct

Qwen2TokenizerFast

151643.0

NI384GYDfJidzgXg_-9habj8lOk=

qwen-32b-preview

Qwen/QwQ-32B-Preview

34B

33889976320

33889976320

64

40

5120

152064

silu

rotary

False

Qwen2ForCausalLM

RMS

Qwen/QwQ-32B-Preview

Qwen2TokenizerFast

151643.0

NI384GYDfJidzgXg_-9habj8lOk=

qwen3-0.6b

Qwen/Qwen3-0.6B

499M

499122176

499122176

28

16

1024

151936

silu

rotary

False

Qwen3ForCausalLM

RMS

Qwen/Qwen3-0.6B

Qwen2TokenizerFast

151643.0

OfOK7SjmrN4KFUqlDylX-Up77SM=

qwen3-1.7b

Qwen/Qwen3-1.7B

1.5B

1526726656

1526726656

28

16

2048

151936

silu

rotary

False

Qwen3ForCausalLM

RMS

Qwen/Qwen3-1.7B

Qwen2TokenizerFast

151643.0

OfOK7SjmrN4KFUqlDylX-Up77SM=

qwen3-4b

Qwen/Qwen3-4B

4.2B

4199546880

4199546880

36

32

2560

151936

silu

rotary

False

Qwen3ForCausalLM

RMS

Qwen/Qwen3-4B

Qwen2TokenizerFast

151643.0

OfOK7SjmrN4KFUqlDylX-Up77SM=

qwen3-8b

Qwen/Qwen3-8B

7.9B

7851737088

7851737088

36

32

4096

151936

silu

rotary

False

Qwen3ForCausalLM

RMS

Qwen/Qwen3-8B

Qwen2TokenizerFast

151643.0

OfOK7SjmrN4KFUqlDylX-Up77SM=

qwen3-14b

Qwen/Qwen3-14B

15B

14889779200

14889779200

40

40

5120

151936

silu

rotary

False

Qwen3ForCausalLM

RMS

Qwen/Qwen3-14B

Qwen2TokenizerFast

151643.0

OfOK7SjmrN4KFUqlDylX-Up77SM=

phi-1

microsoft/phi-1

1.2B

1207959552

1207959552

24

32

2048

51200

gelu_new

rotary

True

PhiForCausalLM

LN

microsoft/phi-1

CodeGenTokenizer

50257.0

TYk6J3OrqdU2F7JYiSfFXtd-vB4=

phi-1_5

microsoft/phi-1_5

1.2B

1207959552

1207959552

24

32

2048

51200

gelu_new

rotary

True

PhiForCausalLM

LN

microsoft/phi-1_5

CodeGenTokenizer

50257.0

TYk6J3OrqdU2F7JYiSfFXtd-vB4=

phi-2

microsoft/phi-2

2.5B

2516582400

2516582400

32

32

2560

51200

gelu_new

rotary

True

PhiForCausalLM

LN

microsoft/phi-2

CodeGenTokenizer

50257.0

TYk6J3OrqdU2F7JYiSfFXtd-vB4=

phi-3

microsoft/Phi-3-mini-4k-instruct

3.6B

3623878656

3623878656

32

32

3072

32064

silu

rotary

False

Phi3ForCausalLM

RMS

microsoft/Phi-3-mini-4k-instruct

LlamaTokenizer

32000.0

2BcGXsWoZjuOkMtb6uTbGL68fbc=

phi-4

microsoft/phi-4

15B

15204352000

15204352000

40

40

5120

100352

silu

rotary

False

Phi3ForCausalLM

RMS

microsoft/phi-4

GPT2Tokenizer

100352.0

uJZqWk6gqn6tO_nlSJEZsP9MITQ=

gemma-2b

google/gemma-2b

2.1B

2113929216

2113929216

18

8

2048

256000

gelu_new

rotary

False

Gemma2ForCausalLM

RMS

google/gemma-2b

GemmaTokenizerFast

256000.0

87mmm7o-5SoGMD05LzhcJdB_XBk=

gemma-7b

google/gemma-7b

7.8B

7751073792

7751073792

28

16

3072

256000

gelu_new

rotary

False

GemmaForCausalLM

RMS

google/gemma-7b

GemmaTokenizerFast

256000.0

87mmm7o-5SoGMD05LzhcJdB_XBk=

gemma-2b-it

google/gemma-2b-it

2.1B

2113929216

2113929216

18

8

2048

256000

gelu_new

rotary

False

Gemma2ForCausalLM

RMS

google/gemma-2b-it

GemmaTokenizerFast

256000.0

87mmm7o-5SoGMD05LzhcJdB_XBk=

gemma-7b-it

google/gemma-7b-it

7.8B

7751073792

7751073792

28

16

3072

256000

gelu_new

rotary

False

GemmaForCausalLM

RMS

google/gemma-7b-it

GemmaTokenizerFast

256000.0

87mmm7o-5SoGMD05LzhcJdB_XBk=

gemma-2-2b

google/gemma-2-2b

2.1B

2146959360

2146959360

26

8

2304

256000

gelu_pytorch_tanh

rotary

False

Gemma2ForCausalLM

RMS

google/gemma-2-2b

GemmaTokenizerFast

256000.0

87mmm7o-5SoGMD05LzhcJdB_XBk=

gemma-2-2b-it

google/gemma-2-2b-it

2.1B

2146959360

2146959360

26

8

2304

256000

gelu_pytorch_tanh

rotary

False

Gemma2ForCausalLM

RMS

google/gemma-2-2b-it

GemmaTokenizerFast

256000.0

87mmm7o-5SoGMD05LzhcJdB_XBk=

gemma-2-9b

google/gemma-2-9b

8.9B

8940158976

8940158976

42

16

3584

256000

gelu_pytorch_tanh

rotary

False

Gemma2ForCausalLM

RMS

google/gemma-2-9b

GemmaTokenizerFast

256000.0

87mmm7o-5SoGMD05LzhcJdB_XBk=

gemma-2-9b-it

google/gemma-2-9b-it

8.9B

8940158976

8940158976

42

16

3584

256000

gelu_pytorch_tanh

rotary

False

Gemma2ForCausalLM

RMS

google/gemma-2-9b-it

GemmaTokenizerFast

256000.0

87mmm7o-5SoGMD05LzhcJdB_XBk=

gemma-2-27b

google/gemma-2-27b

27B

26914848768

26914848768

46

32

4608

256000

gelu_pytorch_tanh

rotary

False

Gemma2ForCausalLM

RMS

google/gemma-2-27b

GemmaTokenizerFast

256000.0

87mmm7o-5SoGMD05LzhcJdB_XBk=

gemma-2-27b-it

google/gemma-2-27b-it

27B

26914848768

26914848768

46

32

4608

256000

gelu_pytorch_tanh

rotary

False

Gemma2ForCausalLM

RMS

google/gemma-2-27b-it

GemmaTokenizerFast

256000.0

87mmm7o-5SoGMD05LzhcJdB_XBk=

yi-6b

01-ai/Yi-6B

6.5B

6476005376

6476005376

32

32

4096

64000

silu

rotary

False

LlamaForCausalLM

RMS

01-ai/Yi-6B

LlamaTokenizerFast

63992.0

VGXAFrTzytwGdUlX6AWH0NacncM=

yi-34b

01-ai/Yi-34B

39B

38755368960

38755368960

60

56

7168

64000

silu

rotary

False

LlamaForCausalLM

RMS

01-ai/Yi-34B

LlamaTokenizerFast

64000.0

VBBPi7l7j0Xrv93YNq1tizlalWw=

yi-6b-chat

01-ai/Yi-6B-Chat

6.5B

6476005376

6476005376

32

32

4096

64000

silu

rotary

False

LlamaForCausalLM

RMS

01-ai/Yi-6B-Chat

LlamaTokenizerFast

63992.0

VGXAFrTzytwGdUlX6AWH0NacncM=

yi-34b-chat

01-ai/Yi-34B-Chat

39B

38755368960

38755368960

60

56

7168

64000

silu

rotary

False

LlamaForCausalLM

RMS

01-ai/Yi-34B-Chat

LlamaTokenizerFast

63992.0

VGXAFrTzytwGdUlX6AWH0NacncM=

t5-small

google-t5/t5-small

19M

18874368

18874368

6

8

512

32128

relu

relative_positional_bias

False

T5ForConditionalGeneration

LN

google-t5/t5-small

T5TokenizerFast

32100.0

jQeywCyCMVL_vza2wKfpuwjNVys=

t5-base

google-t5/t5-base

85M

84934656

84934656

12

12

768

32128

relu

relative_positional_bias

False

T5ForConditionalGeneration

LN

google-t5/t5-base

T5TokenizerFast

32100.0

jQeywCyCMVL_vza2wKfpuwjNVys=

t5-large

google-t5/t5-large

302M

301989888

301989888

24

16

1024

32128

relu

relative_positional_bias

False

T5ForConditionalGeneration

LN

google-t5/t5-large

T5TokenizerFast

32100.0

jQeywCyCMVL_vza2wKfpuwjNVys=

mGPT

ai-forever/mGPT

1.2B

1207959552

1207959552

24

16

2048

100000

gelu_new

standard

False

GPT2LMHeadModel

LN

ai-forever/mGPT

GPT2TokenizerFast

100000.0

8j6CU_p3zgyeEBZ1Z3lu358tiy0=