# Copyright (c) InternLM. All rights reserved.

# Architecture constants shared by the model dict below.
model_type = "INTERNLM2"

VOCAB_SIZE = 92544            # tokenizer vocabulary size
HIDDEN_SIZE = 6144            # transformer hidden dimension
NUM_ATTENTION_HEAD = 48       # attention (query) heads
NUM_KV_ATTENTION_HEAD = 8     # key/value heads — fewer than query heads, i.e. grouped-query attention
MLP_RATIO = 8 / 3             # feed-forward width as a multiple of HIDDEN_SIZE
NUM_LAYER = 48                # number of transformer layers

# Model hyper-parameters consumed by the trainer's model builder.
# Values are wired from the architecture constants defined above.
model = dict(
    num_chunks=1,                             # pipeline model chunks per stage
    checkpoint=1.0,                           # activation-checkpointing ratio (1.0 = all layers) — TODO confirm semantics
    dtype="torch.bfloat16",                   # parameter/computation dtype
    embed_split_hidden=True,
    num_layers=NUM_LAYER,
    hidden_size=HIDDEN_SIZE,
    vocab_size=VOCAB_SIZE,
    embed_grad_scale=1,                       # gradient scale applied to the embedding — 1 means no scaling
    parallel_output=True,                     # keep logits sharded across tensor-parallel ranks
    num_attention_heads=NUM_ATTENTION_HEAD,
    num_kv_attention_heads=NUM_KV_ATTENTION_HEAD,
    mlp_ratio=MLP_RATIO,
    norm_type="rmsnorm",
    adapt_hf=True,                            # presumably HuggingFace weight-layout compatibility — verify
    apply_post_layer_norm=False,
    no_bias=True,                             # linear layers without bias terms
    layer_norm_epsilon=1e-5,
    rope_base=1000000,                        # RoPE theta base
)

# Settings for the hybrid ZeRO-1 low-level optimizer.
hybrid_zero_optimizer = dict(
    # Overlap gradient synchronization with backward computation.
    overlap_sync_grad=True,
    # Do not overlap parameter synchronization.
    overlap_sync_param=False,
    # NCCL reduce bucket size in bytes (512 MiB).
    reduce_bucket_size=512 * 1024 * 1024,
    # Global gradient-norm clipping threshold.
    clip_grad_norm=1.0,
)

# Parallelism layout.
#
# zero1 (dict):
#   1. size <= 0: the ZeRO process group equals the data-parallel group, so
#      optimizer states are partitioned across the full dp range.
#   2. size == 1: ZeRO is disabled; every dp rank keeps full optimizer states.
#   3. 1 < size <= dp world size: the ZeRO group is a subset of the dp group.
#      For smaller models, partitioning within a node (size <= 8) is usually best.
#   4. fsdp (bool): use PyTorch FSDP as a substitution for ZeRO-1.
# pipeline (dict):
#   1. size: pipeline-parallel degree.
#   2. interleaved_overlap (bool): communication overlap for the interleaved
#      pipeline scheduler.
# tensor (int): tensor-parallel degree, usually the number of GPUs per node.
parallel = dict(
    zero1=dict(size=16, fsdp=False),
    tensor=2,
    pipeline=dict(size=1, interleaved_overlap=True),
    sequence_parallel=True,
)