Model Properties Table

n_params

n_layers

d_model

n_heads

act_fn

n_ctx

d_vocab

d_head

d_mlp

n_key_value_heads

gpt2-small

85M

12

768

12

gelu

1024

50257

64

3072

gpt2-medium

302M

24

1024

16

gelu

1024

50257

64

4096

gpt2-large

708M

36

1280

20

gelu

1024

50257

64

5120

gpt2-xl

1.5B

48

1600

25

gelu

1024

50257

64

6400

distilgpt2

42M

6

768

12

gelu

1024

50257

64

3072

opt-125m

85M

12

768

12

relu

2048

50272

64

3072

opt-1.3b

1.2B

24

2048

32

relu

2048

50272

64

8192

opt-2.7b

2.5B

32

2560

32

relu

2048

50272

80

10240

opt-6.7b

6.4B

32

4096

32

relu

2048

50272

128

16384

opt-13b

13B

40

5120

40

relu

2048

50272

128

20480

opt-30b

30B

48

7168

56

relu

2048

50272

128

28672

opt-66b

65B

64

9216

72

relu

2048

50272

128

36864

gpt-neo-125M

85M

12

768

12

gelu

2048

50257

64

3072

gpt-neo-1.3B

1.2B

24

2048

16

gelu

2048

50257

128

8192

gpt-neo-2.7B

2.5B

32

2560

20

gelu

2048

50257

128

10240

gpt-j-6B

5.6B

28

4096

16

gelu

2048

50400

256

16384

gpt-neox-20b

20B

44

6144

64

gelu

2048

50432

96

24576

stanford-gpt2-small-a

85M

12

768

12

gelu

1024

50257

64

3072

stanford-gpt2-small-b

85M

12

768

12

gelu

1024

50257

64

3072

stanford-gpt2-small-c

85M

12

768

12

gelu

1024

50257

64

3072

stanford-gpt2-small-d

85M

12

768

12

gelu

1024

50257

64

3072

stanford-gpt2-small-e

85M

12

768

12

gelu

1024

50257

64

3072

stanford-gpt2-medium-a

302M

24

1024

16

gelu

1024

50257

64

4096

stanford-gpt2-medium-b

302M

24

1024

16

gelu

1024

50257

64

4096

stanford-gpt2-medium-c

302M

24

1024

16

gelu

1024

50257

64

4096

stanford-gpt2-medium-d

302M

24

1024

16

gelu

1024

50257

64

4096

stanford-gpt2-medium-e

302M

24

1024

16

gelu

1024

50257

64

4096

pythia-14m

1.2M

6

128

4

gelu

2048

50304

32

512

pythia-31m

4.7M

6

256

8

gelu

2048

50304

32

1024

pythia-70m

19M

6

512

8

gelu

2048

50304

64

2048

pythia-160m

85M

12

768

12

gelu

2048

50304

64

3072

pythia-410m

302M

24

1024

16

gelu

2048

50304

64

4096

pythia-1b

805M

16

2048

8

gelu

2048

50304

256

8192

pythia-1.4b

1.2B

24

2048

16

gelu

2048

50304

128

8192

pythia-2.8b

2.5B

32

2560

32

gelu

2048

50304

80

10240

pythia-6.9b

6.4B

32

4096

32

gelu

2048

50432

128

16384

pythia-12b

11B

36

5120

40

gelu

2048

50688

128

20480

pythia-70m-deduped

19M

6

512

8

gelu

2048

50304

64

2048

pythia-160m-deduped

85M

12

768

12

gelu

2048

50304

64

3072

pythia-410m-deduped

302M

24

1024

16

gelu

2048

50304

64

4096

pythia-1b-deduped

805M

16

2048

8

gelu

2048

50304

256

8192

pythia-1.4b-deduped

1.2B

24

2048

16

gelu

2048

50304

128

8192

pythia-2.8b-deduped

2.5B

32

2560

32

gelu

2048

50304

80

10240

pythia-6.9b-deduped

6.4B

32

4096

32

gelu

2048

50432

128

16384

pythia-12b-deduped

11B

36

5120

40

gelu

2048

50688

128

20480

pythia-70m-v0

19M

6

512

8

gelu

2048

50304

64

2048

pythia-160m-v0

85M

12

768

12

gelu

2048

50304

64

3072

pythia-410m-v0

302M

24

1024

16

gelu

2048

50304

64

4096

pythia-1b-v0

805M

16

2048

8

gelu

2048

50304

256

8192

pythia-1.4b-v0

1.2B

24

2048

16

gelu

2048

50304

128

8192

pythia-2.8b-v0

2.5B

32

2560

32

gelu

2048

50304

80

10240

pythia-6.9b-v0

6.4B

32

4096

32

gelu

2048

50432

128

16384

pythia-12b-v0

11B

36

5120

40

gelu

2048

50688

128

20480

pythia-70m-deduped-v0

19M

6

512

8

gelu

2048

50304

64

2048

pythia-160m-deduped-v0

85M

12

768

12

gelu

2048

50304

64

3072

pythia-410m-deduped-v0

302M

24

1024

16

gelu

2048

50304

64

4096

pythia-1b-deduped-v0

805M

16

2048

8

gelu

2048

50304

256

8192

pythia-1.4b-deduped-v0

1.2B

24

2048

16

gelu

2048

50304

128

8192

pythia-2.8b-deduped-v0

2.5B

32

2560

32

gelu

2048

50304

80

10240

pythia-6.9b-deduped-v0

6.4B

32

4096

32

gelu

2048

50432

128

16384

pythia-12b-deduped-v0

11B

36

5120

40

gelu

2048

50688

128

20480

pythia-160m-seed1

85M

12

768

12

gelu

2048

50304

64

3072

pythia-160m-seed2

85M

12

768

12

gelu

2048

50304

64

3072

pythia-160m-seed3

85M

12

768

12

gelu

2048

50304

64

3072

solu-1l-pile

13M

1

1024

16

solu

1024

50278

64

4096

solu-2l-pile

13M

2

736

11

solu

1024

50278

64

2944

solu-4l-pile

13M

4

512

8

solu

1024

50278

64

2048

solu-6l-pile

42M

6

768

12

solu

1024

50278

64

3072

solu-8l-pile

101M

8

1024

16

solu

1024

50278

64

4096

solu-10l-pile

197M

10

1280

20

solu

1024

50278

64

5120

solu-12l-pile

340M

12

1536

24

solu

1024

50278

64

6144

solu-1l

3.1M

1

512

8

solu

1024

48262

64

2048

solu-2l

6.3M

2

512

8

solu

1024

48262

64

2048

solu-3l

9.4M

3

512

8

solu

1024

48262

64

2048

solu-4l

13M

4

512

8

solu

1024

48262

64

2048

solu-6l

42M

6

768

12

solu

1024

48262

64

3072

solu-8l

101M

8

1024

16

solu

1024

48262

64

4096

solu-10l

197M

10

1280

20

solu

1024

48262

64

5120

solu-12l

340M

12

1536

24

solu

1024

48262

64

6144

gelu-1l

3.1M

1

512

8

gelu

1024

48262

64

2048

gelu-2l

6.3M

2

512

8

gelu

1024

48262

64

2048

gelu-3l

9.4M

3

512

8

gelu

1024

48262

64

2048

gelu-4l

13M

4

512

8

gelu

1024

48262

64

2048

attn-only-1l

1.0M

1

512

8

attn_only

1024

48262

64

2048

attn-only-2l

2.1M

2

512

8

attn_only

1024

48262

64

2048

attn-only-3l

3.1M

3

512

8

attn_only

1024

48262

64

2048

attn-only-4l

4.2M

4

512

8

attn_only

1024

48262

64

2048

attn-only-2l-demo

2.1M

2

512

8

attn_only

1024

50277

64

2048

solu-1l-wiki

3.1M

1

512

8

solu

1024

48262

64

2048

solu-4l-wiki

13M

4

512

8

solu

1024

48262

64

2048

redwood_attn_2l

524K

2

256

8

attn_only

2048

50259

32

-1

llama-7b

6.5B

32

4096

32

silu

2048

32000

128

11008

llama-13b

13B

40

5120

40

silu

2048

32000

128

13824

llama-30b

32B

60

6656

52

silu

2048

32000

128

17920

llama-65b

65B

80

8192

64

silu

2048

32000

128

22016

Llama-2-7b

6.5B

32

4096

32

silu

4096

32000

128

11008

Llama-2-7b-chat

6.5B

32

4096

32

silu

4096

32000

128

11008

Llama-2-13b

13B

40

5120

40

silu

4096

32000

128

13824

Llama-2-13b-chat

13B

40

5120

40

silu

4096

32000

128

13824

Llama-2-70b-chat

78B

80

8192

64

silu

4096

32000

128

28672

8

CodeLlama-7b

6.5B

32

4096

32

silu

4096

32016

128

11008

CodeLlama-7b-python

6.5B

32

4096

32

silu

4096

32000

128

11008

CodeLlama-7b-instruct

6.5B

32

4096

32

silu

4096

32016

128

11008

meta-llama/Meta-Llama-3-8B

7.8B

32

4096

32

silu

8192

128256

128

14336

8

meta-llama/Meta-Llama-3-8B-Instruct

7.8B

32

4096

32

silu

8192

128256

128

14336

8

meta-llama/Meta-Llama-3-70B

78B

80

8192

64

silu

8192

128256

128

28672

8

meta-llama/Meta-Llama-3-70B-Instruct

78B

80

8192

64

silu

8192

128256

128

28672

8

meta-llama/Llama-3.2-1B

1.1B

16

2048

32

silu

2048

128256

64

8192

8

meta-llama/Llama-3.2-3B

3.2B

28

3072

24

silu

2048

128256

128

8192

8

meta-llama/Llama-3.2-1B-Instruct

1.1B

16

2048

32

silu

2048

128256

64

8192

8

meta-llama/Llama-3.2-3B-Instruct

3.2B

28

3072

24

silu

2048

128256

128

8192

8

meta-llama/Llama-3.1-70B

78B

80

8192

64

silu

2048

128256

128

28672

8

meta-llama/Llama-3.1-8B

7.8B

32

4096

32

silu

2048

128256

128

14336

8

meta-llama/Llama-3.1-8B-Instruct

7.8B

32

4096

32

silu

2048

128256

128

14336

8

meta-llama/Llama-3.1-70B-Instruct

78B

80

8192

64

silu

2048

128256

128

28672

8

othello-gpt

25M

8

512

8

gelu

59

61

64

2048

bert-base-cased

85M

12

768

12

gelu

512

28996

64

3072

tiny-stories-1M

393K

8

64

16

gelu

2048

50257

4

256

tiny-stories-3M

1.6M

8

128

16

gelu

2048

50257

8

512

tiny-stories-8M

6.3M

8

256

16

gelu

2048

50257

16

1024

tiny-stories-28M

25M

8

512

16

gelu

2048

50257

32

2048

tiny-stories-33M

28M

4

768

16

gelu

2048

50257

48

3072

tiny-stories-instruct-1M

393K

8

64

16

gelu

2048

50257

4

256

tiny-stories-instruct-3M

1.6M

8

128

16

gelu

2048

50257

8

512

tiny-stories-instruct-8M

6.3M

8

256

16

gelu

2048

50257

16

1024

tiny-stories-instruct-28M

25M

8

512

16

gelu

2048

50257

32

2048

tiny-stories-instruct-33M

28M

4

768

16

gelu

2048

50257

48

3072

tiny-stories-1L-21M

13M

1

1024

16

gelu

2048

50257

64

4096

tiny-stories-2L-33M

25M

2

1024

16

gelu

2048

50257

64

4096

tiny-stories-instruct-1L-21M

13M

1

1024

16

gelu

2048

50257

64

4096

tiny-stories-instruct-2L-33M

25M

2

1024

16

gelu

2048

50257

64

4096

stablelm-base-alpha-3b

3.2B

16

4096

32

gelu

4096

50688

128

16384

stablelm-base-alpha-7b

7.2B

16

6144

48

gelu

4096

50432

128

24576

stablelm-tuned-alpha-3b

3.2B

16

4096

32

gelu

4096

50688

128

16384

stablelm-tuned-alpha-7b

7.2B

16

6144

48

gelu

4096

50432

128

24576

mistral-7b

7.8B

32

4096

32

silu

2048

32000

128

14336

8

mistral-7b-instruct

7.8B

32

4096

32

silu

2048

32000

128

14336

8

mistral-nemo-base-2407

12B

40

5120

32

silu

2048

131072

128

14336

8

mixtral

47B

32

4096

32

silu

32768

32000

128

14336

8

mixtral-instruct

47B

32

4096

32

silu

32768

32000

128

14336

8

bloom-560m

302M

24

1024

16

gelu

2048

250880

64

4096

bloom-1b1

679M

24

1536

16

gelu

2048

250880

96

6144

bloom-1b7

1.2B

24

2048

16

gelu

2048

250880

128

8192

bloom-3b

2.4B

30

2560

32

gelu

2048

250880

80

10240

bloom-7b1

6.0B

30

4096

32

gelu

2048

250880

128

16384

santacoder

1.2B

24

2048

16

gelu

2048

49280

128

8192

qwen-1.8b

1.2B

24

2048

16

silu

2048

151936

128

5504

qwen-7b

6.5B

32

4096

32

silu

2048

151936

128

11008

qwen-14b

13B

40

5120

40

silu

2048

152064

128

13696

qwen-1.8b-chat

1.2B

24

2048

16

silu

2048

151936

128

5504

qwen-7b-chat

6.5B

32

4096

32

silu

2048

151936

128

11008

qwen-14b-chat

13B

40

5120

40

silu

2048

152064

128

13696

qwen1.5-0.5b

308M

24

1024

16

silu

2048

151936

64

2816

16

qwen1.5-0.5b-chat

308M

24

1024

16

silu

2048

151936

64

2816

16

qwen1.5-1.8b

1.2B

24

2048

16

silu

2048

151936

128

5504

16

qwen1.5-1.8b-chat

1.2B

24

2048

16

silu

2048

151936

128

5504

16

qwen1.5-4b

3.2B

40

2560

20

silu

2048

151936

128

6912

20

qwen1.5-4b-chat

3.2B

40

2560

20

silu

2048

151936

128

6912

20

qwen1.5-7b

6.5B

32

4096

32

silu

2048

151936

128

11008

32

qwen1.5-7b-chat

6.5B

32

4096

32

silu

2048

151936

128

11008

32

qwen1.5-14b

13B

40

5120

40

silu

2048

152064

128

13696

40

qwen1.5-14b-chat

13B

40

5120

40

silu

2048

152064

128

13696

40

Qwen/Qwen2-0.5B

391M

24

896

14

silu

2048

151936

64

4864

2

Qwen/Qwen2-0.5B-Instruct

391M

24

896

14

silu

2048

151936

64

4864

2

Qwen/Qwen2-1.5B

1.4B

28

1536

12

silu

2048

151936

128

8960

2

Qwen/Qwen2-1.5B-Instruct

1.4B

28

1536

12

silu

2048

151936

128

8960

2

Qwen/Qwen2-7B

7.1B

28

3584

28

silu

2048

152064

128

18944

4

Qwen/Qwen2-7B-Instruct

7.1B

28

3584

28

silu

2048

152064

128

18944

4

phi-1

1.2B

24

2048

32

gelu

2048

51200

64

8192

phi-1_5

1.2B

24

2048

32

gelu

2048

51200

64

8192

phi-2

2.5B

32

2560

32

gelu

2048

51200

80

10240

phi-3

3.6B

32

3072

32

silu

4096

32064

96

8192

gemma-2b

2.1B

18

2048

8

gelu

8192

256000

256

16384

1

gemma-7b

7.8B

28

3072

16

gelu

8192

256000

256

24576

16

gemma-2b-it

2.1B

18

2048

8

gelu

8192

256000

256

16384

1

gemma-7b-it

7.8B

28

3072

16

gelu

8192

256000

256

24576

16

gemma-2-2b

2.1B

26

2304

8

gelu_pytorch_tanh

8192

256000

256

9216

4

gemma-2-2b-it

2.1B

26

2304

8

gelu_pytorch_tanh

8192

256000

256

9216

4

gemma-2-9b

8.9B

42

3584

16

gelu_pytorch_tanh

8192

256000

256

14336

8

gemma-2-9b-it

8.9B

42

3584

16

gelu_pytorch_tanh

8192

256000

256

14336

8

gemma-2-27b

27B

46

4608

32

gelu_pytorch_tanh

8192

256000

128

36864

16

gemma-2-27b-it

27B

46

4608

32

gelu_pytorch_tanh

8192

256000

128

36864

16

yi-6b

6.5B

32

4096

32

silu

4096

64000

128

11008

4

yi-34b

39B

60

7168

56

silu

4096

64000

128

20480

8

yi-6b-chat

6.5B

32

4096

32

silu

4096

64000

128

11008

4

yi-34b-chat

39B

60

7168

56

silu

4096

64000

128

20480

8

t5-small

19M

6

512

8

relu

20

32128

64

2048

t5-base

85M

12

768

12

relu

20

32128

64

3072

t5-large

302M

24

1024

16

relu

20

32128

64

4096

mGPT

1.2B

24

2048

16

gelu

2048

100000

128

8192