Skip to content

Commit 0a4ffea

Browse files
committed
Merge branch 'feat/internlm2' into v0.2.3-internlm2
2 parents a3260a8 + bfbc10f commit 0a4ffea

12 files changed

Lines changed: 1406 additions & 85 deletions

File tree

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,7 @@ aim_logs/
128128
nvmelogs/
129129
run_backup/
130130
runs/
131+
RUN/
131132
runs_bak/
132133
LLM_ALERT
133134
small_demo/

configs/_base_/default_runtime.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
# Copyright (c) InternLM. All rights reserved.
"""Base runtime configuration: cuDNN determinism, TensorBoard, gradient
profiling, and the fp16 dynamic loss-scaler settings shared by model configs."""

# Trade reproducibility for speed: deterministic kernels off, autotuner off.
cudnn_deterministic = False
cudnn_benchmark = False

# Enable TensorBoard logging.
enable_tb = True

grad_profiling = dict(
    # calculate layer norms and parameter norms, and show them on tensorboard
    grad_norm_profiling=False,
    # count zero gradients, and show them on tensorboard
    zero_grad_profiling=False,
    # [optional] layers displayed on tensorboard, default: layers=["ScaleColumnParallelLinear"]
    # if not set, display all layers
    layers=["ScaleColumnParallelLinear"],
    vocab_grad_norm_profiling=False,
    interval_steps=5,
)

grad_scaler = dict(
    fp16=dict(
        # the initial loss scale, defaults to 2**16
        initial_scale=2**16,
        # the minimum loss scale, defaults to None
        min_scale=1,
        # the number of steps to increase loss scale when no overflow occurs
        growth_interval=1000,
    ),
    # the multiplication factor for increasing loss scale, defaults to 2
    growth_factor=2,
    # the multiplication factor for decreasing loss scale, defaults to 0.5
    backoff_factor=0.5,
    # the maximum loss scale, defaults to None
    max_scale=2**24,
    # the number of overflows before decreasing loss scale, defaults to 2
    hysteresis=2,
)
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
# Copyright (c) InternLM. All rights reserved.
"""InternLM2 20B training configuration: model hyperparameters, hybrid
ZeRO optimizer settings, and the parallelism layout (zero1/tensor/pipeline)."""

model_type = "INTERNLM2"

VOCAB_SIZE = 92544
HIDDEN_SIZE = 6144
NUM_ATTENTION_HEAD = 48
NUM_KV_ATTENTION_HEAD = 8  # grouped-query attention: 8 KV heads shared by 48 query heads
MLP_RATIO = 8 / 3
NUM_LAYER = 48

model = dict(
    num_chunks=1,
    # 1.0 => activation checkpointing on every layer (fraction of layers checkpointed)
    checkpoint=1.0,
    dtype="torch.bfloat16",
    embed_split_hidden=True,
    num_layers=NUM_LAYER,
    hidden_size=HIDDEN_SIZE,
    vocab_size=VOCAB_SIZE,
    embed_grad_scale=1,
    parallel_output=True,
    num_attention_heads=NUM_ATTENTION_HEAD,
    num_kv_attention_heads=NUM_KV_ATTENTION_HEAD,
    mlp_ratio=MLP_RATIO,
    norm_type="rmsnorm",
    adapt_hf=True,
    apply_post_layer_norm=False,
    no_bias=True,
    layer_norm_epsilon=1e-5,
    rope_base=1000000,
)

hybrid_zero_optimizer = dict(
    # Enable low_level_optimizer overlap_communication
    overlap_sync_grad=True,
    overlap_sync_param=False,
    # bucket size for nccl communication params
    reduce_bucket_size=512 * 1024 * 1024,
    # grad clipping
    clip_grad_norm=1.0,
)

# zero1 parallel:
# 1. if zero1 <= 0, The size of the zero process group is equal to the size of the dp process group,
# so parameters will be divided within the range of dp.
# 2. if zero1 == 1, zero is not used, and all dp groups retain the full amount of model parameters.
# 3. zero1 > 1 and zero1 <= dp world size, the world size of zero is a subset of dp world size.
# For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
# 4. fsdp: bool, whether to use fsdp in pytorch, which can be a substitution of ZeRO1.
# pipeline parallel (dict):
# 1. size: int, the size of pipeline parallel.
# 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler.
# tensor parallel: tensor parallel size, usually the number of GPUs per node.
parallel = dict(
    zero1=dict(size=16, fsdp=False),
    tensor=2,
    pipeline=dict(size=1, interleaved_overlap=True),
    sequence_parallel=True,
)
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
# Copyright (c) InternLM. All rights reserved.
"""InternLM2 7B training configuration: model hyperparameters, hybrid
ZeRO optimizer settings, and the parallelism layout (zero1/tensor/pipeline)."""

model_type = "INTERNLM2"

VOCAB_SIZE = 92544
HIDDEN_SIZE = 4096
NUM_ATTENTION_HEAD = 32
NUM_KV_ATTENTION_HEAD = 8  # grouped-query attention: 8 KV heads shared by 32 query heads
MLP_RATIO = 3.5
NUM_LAYER = 32

model = dict(
    num_chunks=1,
    # 0.2 => activation checkpointing on 20% of the layers
    checkpoint=0.2,
    dtype="torch.bfloat16",
    embed_split_hidden=True,
    num_layers=NUM_LAYER,
    hidden_size=HIDDEN_SIZE,
    vocab_size=VOCAB_SIZE,
    embed_grad_scale=1,
    parallel_output=True,
    num_attention_heads=NUM_ATTENTION_HEAD,
    num_kv_attention_heads=NUM_KV_ATTENTION_HEAD,
    mlp_ratio=MLP_RATIO,
    norm_type="rmsnorm",
    adapt_hf=False,
    apply_post_layer_norm=False,
    no_bias=True,
    layer_norm_epsilon=1e-5,
    rope_base=1000000,
)

hybrid_zero_optimizer = dict(
    # Enable low_level_optimizer overlap_communication
    overlap_sync_grad=True,
    overlap_sync_param=False,
    # bucket size for nccl communication params
    reduce_bucket_size=512 * 1024 * 1024,
    # grad clipping
    clip_grad_norm=1.0,
)

# zero1 parallel:
# 1. if zero1 <= 0, The size of the zero process group is equal to the size of the dp process group,
# so parameters will be divided within the range of dp.
# 2. if zero1 == 1, zero is not used, and all dp groups retain the full amount of model parameters.
# 3. zero1 > 1 and zero1 <= dp world size, the world size of zero is a subset of dp world size.
# For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
# 4. fsdp: bool, whether to use fsdp in pytorch, which can be a substitution of ZeRO1.
# pipeline parallel (dict):
# 1. size: int, the size of pipeline parallel.
# 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler.
# tensor parallel: tensor parallel size, usually the number of GPUs per node.
parallel = dict(
    zero1=dict(size=8, fsdp=False),
    tensor=1,
    pipeline=dict(size=1, interleaved_overlap=True),
    sequence_parallel=False,
)
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
# Copyright (c) InternLM. All rights reserved.
"""InternLM (v1) 20B training configuration: model hyperparameters, hybrid
ZeRO optimizer settings, and the parallelism layout (zero1/tensor/pipeline)."""

model_type = "INTERNLM"

VOCAB_SIZE = 103168
HIDDEN_SIZE = 5120
NUM_ATTENTION_HEAD = 40
MLP_RATIO = 8 / 3
NUM_LAYER = 60

model = dict(
    num_chunks=1,
    # activation checkpointing disabled
    checkpoint=False,
    dtype="torch.bfloat16",
    embed_split_hidden=True,
    num_layers=NUM_LAYER,
    hidden_size=HIDDEN_SIZE,
    vocab_size=VOCAB_SIZE,
    embed_grad_scale=1,
    parallel_output=True,
    num_attention_heads=NUM_ATTENTION_HEAD,
    mlp_ratio=MLP_RATIO,
    norm_type="rmsnorm",
    apply_post_layer_norm=False,
    layer_norm_epsilon=1e-5,
)

hybrid_zero_optimizer = dict(
    # Enable overlap_communication
    overlap_sync_grad=True,
    overlap_sync_param=False,
    # bucket size for nccl communication params
    reduce_bucket_size=512 * 1024 * 1024,
    # grad clipping
    clip_grad_norm=1.0,
)

# zero1 parallel:
# 1. if zero1 <= 0, The size of the zero process group is equal to the size of the dp process group,
# so parameters will be divided within the range of dp.
# 2. if zero1 == 1, zero is not used, and all dp groups retain the full amount of model parameters.
# 3. zero1 > 1 and zero1 <= dp world size, the world size of zero is a subset of dp world size.
# For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
# 4. fsdp: bool, whether to use fsdp in pytorch, which can be a substitution of ZeRO1.
# pipeline parallel (dict):
# 1. size: int, the size of pipeline parallel.
# 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler.
# tensor parallel: tensor parallel size, usually the number of GPUs per node.
parallel = dict(
    zero1=dict(size=8, fsdp=False),
    tensor=4,
    pipeline=dict(size=1, interleaved_overlap=True),
    sequence_parallel=False,
)
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
# Copyright (c) InternLM. All rights reserved.
"""InternLM (v1) 7B training configuration: model hyperparameters, hybrid
ZeRO optimizer settings, and the parallelism layout (zero1/tensor/pipeline)."""

model_type = "INTERNLM"

VOCAB_SIZE = 103168
HIDDEN_SIZE = 4096
NUM_ATTENTION_HEAD = 32
MLP_RATIO = 8 / 3
NUM_LAYER = 32

model = dict(
    num_chunks=1,
    # activation checkpointing disabled
    checkpoint=False,
    dtype="torch.bfloat16",
    embed_split_hidden=True,
    num_layers=NUM_LAYER,
    hidden_size=HIDDEN_SIZE,
    vocab_size=VOCAB_SIZE,
    embed_grad_scale=1,
    parallel_output=True,
    num_attention_heads=NUM_ATTENTION_HEAD,
    mlp_ratio=MLP_RATIO,
    norm_type="rmsnorm",
    apply_post_layer_norm=False,
    layer_norm_epsilon=1e-5,
)

hybrid_zero_optimizer = dict(
    # Enable overlap_communication
    overlap_sync_grad=True,
    overlap_sync_param=False,
    # bucket size for nccl communication params
    reduce_bucket_size=512 * 1024 * 1024,
    # grad clipping
    clip_grad_norm=1.0,
)

# zero1 parallel:
# 1. if zero1 <= 0, The size of the zero process group is equal to the size of the dp process group,
# so parameters will be divided within the range of dp.
# 2. if zero1 == 1, zero is not used, and all dp groups retain the full amount of model parameters.
# 3. zero1 > 1 and zero1 <= dp world size, the world size of zero is a subset of dp world size.
# For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
# 4. fsdp: bool, whether to use fsdp in pytorch, which can be a substitution of ZeRO1.
# pipeline parallel (dict):
# 1. size: int, the size of pipeline parallel.
# 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler.
# tensor parallel: tensor parallel size, usually the number of GPUs per node.
parallel = dict(
    zero1=dict(size=8, fsdp=False),
    tensor=1,
    pipeline=dict(size=1, interleaved_overlap=True),
    sequence_parallel=False,
)

0 commit comments

Comments
 (0)