Compare commits
2 Commits
723082f3ab
...
70404b8e87
Author | SHA1 | Date | |
---|---|---|---|
![]() |
70404b8e87 | ||
![]() |
4505546641 |
@ -18,30 +18,30 @@ CUDA_VISIBLE_DEVICES=0,1,2,3 accelerate launch \
|
||||
--out_dir "out" \
|
||||
--epochs 3 \
|
||||
--embedding_epoch 2 \
|
||||
--batch_size 72 \
|
||||
--learning_rate 1e-4 \
|
||||
--dtype "bfloat16" \
|
||||
--batch_size 48 \
|
||||
--learning_rate 2e-4 \
|
||||
--dtype bfloat16 \
|
||||
--use_swanlab \
|
||||
--swanlab_project "MiniMind-Pretrain" \
|
||||
--num_workers 1 \
|
||||
--accumulation_steps 16 \
|
||||
--grad_clip 0.5 \
|
||||
--accumulation_steps 32 \
|
||||
--grad_clip 1.0 \
|
||||
--warmup_iters 0 \
|
||||
--log_interval 100 \
|
||||
--save_interval 10000 \
|
||||
--dim 1024 \
|
||||
--n_layers 32 \
|
||||
--n_layers 18 \
|
||||
--max_seq_len 512 \
|
||||
--use_moe False \
|
||||
--data_path "./dataset/stable/merged_pretrain.jsonl" \
|
||||
--profile \
|
||||
--profile_interval 10 \
|
||||
--use_flash_attn \
|
||||
--knowledge_num 960400 \
|
||||
--knowledge_num 1048576 \
|
||||
--knowledge_length 32 \
|
||||
--database_init_path "./dataset/stable/sentence_trex_data.json" \
|
||||
--fast_clustering \
|
||||
--cluster_cache_path "./cache/cluster_tokens_single.pt" \
|
||||
--memory_monitor_interval 10 \
|
||||
--model_type "model_original" \
|
||||
--model_size "0.5B"
|
||||
--model_size 814.724
|
||||
|
47
run_file/experiment_1.3.1.sh
Normal file
47
run_file/experiment_1.3.1.sh
Normal file
@ -0,0 +1,47 @@
|
||||
#!/bin/bash
|
||||
|
||||
# 激活conda环境
|
||||
source $(conda info --base)/etc/profile.d/conda.sh
|
||||
conda activate ycz_accelerate
|
||||
|
||||
# 设置环境变量以帮助调试
|
||||
export NCCL_DEBUG=INFO
|
||||
export PYTHONFAULTHANDLER=1
|
||||
|
||||
# 实验1.3.0 - 使用命令行参数直接配置accelerate
|
||||
CUDA_VISIBLE_DEVICES=0,1,2,3 accelerate launch \
|
||||
--multi_gpu \
|
||||
--num_processes=4 \
|
||||
--mixed_precision=bf16 \
|
||||
--main_process_port=29500 \
|
||||
train_pretrain_accelerate.py \
|
||||
--out_dir "out" \
|
||||
--epochs 3 \
|
||||
--embedding_epoch 2 \
|
||||
--batch_size 48 \
|
||||
--learning_rate 2e-4 \
|
||||
--dtype bfloat16 \
|
||||
--use_swanlab \
|
||||
--swanlab_project "MiniMind-Pretrain" \
|
||||
--num_workers 1 \
|
||||
--accumulation_steps 32 \
|
||||
--grad_clip 1.0 \
|
||||
--warmup_iters 0 \
|
||||
--log_interval 100 \
|
||||
--save_interval 10000 \
|
||||
--dim 1024 \
|
||||
--n_layers 18 \
|
||||
--max_seq_len 512 \
|
||||
--use_moe False \
|
||||
--data_path "./dataset/stable/merged_pretrain.jsonl" \
|
||||
--profile \
|
||||
--profile_interval 10 \
|
||||
--use_flash_attn \
|
||||
--knowledge_num 1048576 \
|
||||
--knowledge_length 32 \
|
||||
--database_init_path "./dataset/stable/sentence_trex_data.json" \
|
||||
--fast_clustering \
|
||||
--cluster_cache_path "./cache/cluster_tokens_single.pt" \
|
||||
--memory_monitor_interval 10 \
|
||||
--model_type "model" \
|
||||
--model_size 814.724
|
@ -334,7 +334,7 @@ def train_epoch(epoch, accelerator, model, train_loader, optimizer, scheduler, a
|
||||
optimizer_end = torch.cuda.Event(enable_timing=True)
|
||||
|
||||
# 预取数据
|
||||
prefetch_factor = 4 # 预取的批次数
|
||||
prefetch_factor = 8 # 预取的批次数
|
||||
data_iter = iter(train_loader)
|
||||
prefetch_batches = []
|
||||
|
||||
@ -632,8 +632,8 @@ def main():
|
||||
parser.add_argument("--memory_monitor", action="store_true", default=False, help="启用内存监控")
|
||||
parser.add_argument("--memory_monitor_interval", type=int, default=10, help="内存监控间隔(步数)")
|
||||
parser.add_argument("--model_type", type=str, default="model", help="使用什么模型训练") #model,model_original
|
||||
parser.add_argument("--model_size", type=str, default="0.05B", help="模型大小")
|
||||
parser.add_argument("--swanlab_online", type=bool, default=True, help="是否使用在线SwanLab服务")
|
||||
parser.add_argument("--model_size", type=float, default=50.0, help="模型大小")
|
||||
parser.add_argument("--swanlab_online", type=bool, default=False, help="是否使用在线SwanLab服务")
|
||||
args = parser.parse_args()
|
||||
|
||||
#########################################################
|
||||
@ -709,15 +709,15 @@ def main():
|
||||
project=args.swanlab_project,
|
||||
experiment_name=args.swanlab_run_name,
|
||||
description="MiniMind预训练实验,使用本地部署的SwanLab进行可视化",
|
||||
config=config_dict,
|
||||
mode="offline"
|
||||
config=config_dict
|
||||
)
|
||||
else:
|
||||
swanlab_run = swanlab.init(
|
||||
project=args.swanlab_project,
|
||||
experiment_name=args.swanlab_run_name,
|
||||
description="MiniMind预训练实验,使用本地部署的SwanLab进行可视化",
|
||||
config=config_dict
|
||||
config=config_dict,
|
||||
mode="offline"
|
||||
)
|
||||
else:
|
||||
swanlab_run = None
|
||||
|
23
uv.lock
generated
23
uv.lock
generated
@ -12,6 +12,15 @@ resolution-markers = [
|
||||
"python_full_version < '3.11' and sys_platform != 'linux'",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "absl-py"
|
||||
version = "2.3.1"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/10/2a/c93173ffa1b39c1d0395b7e842bbdc62e556ca9d8d3b5572926f3e4ca752/absl_py-2.3.1.tar.gz", hash = "sha256:a97820526f7fbfd2ec1bce83f3f25e3a14840dac0d8e02a0b71cd75db3f77fc9", size = 116588, upload-time = "2025-07-03T09:31:44.05Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/8f/aa/ba0014cc4659328dc818a28827be78e6d97312ab0cb98105a770924dc11e/absl_py-2.3.1-py3-none-any.whl", hash = "sha256:eeecf07f0c2a93ace0772c92e596ace6d3d3996c042b2128459aaae2a76de11d", size = 135811, upload-time = "2025-07-03T09:31:42.253Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "accelerate"
|
||||
version = "1.7.0"
|
||||
@ -1794,6 +1803,7 @@ dependencies = [
|
||||
{ name = "regex" },
|
||||
{ name = "requests" },
|
||||
{ name = "rich" },
|
||||
{ name = "rouge-score" },
|
||||
{ name = "rpds-py" },
|
||||
{ name = "s3transfer" },
|
||||
{ name = "safetensors" },
|
||||
@ -1965,6 +1975,7 @@ requires-dist = [
|
||||
{ name = "regex", specifier = "==2024.11.6" },
|
||||
{ name = "requests", specifier = "==2.32.3" },
|
||||
{ name = "rich", specifier = "==13.7.1" },
|
||||
{ name = "rouge-score", specifier = ">=0.1.2" },
|
||||
{ name = "rpds-py", specifier = "==0.24.0" },
|
||||
{ name = "s3transfer", specifier = "==0.13.0" },
|
||||
{ name = "safetensors", specifier = "==0.5.3" },
|
||||
@ -3496,6 +3507,18 @@ wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/87/67/a37f6214d0e9fe57f6ae54b2956d550ca8365857f42a1ce0392bb21d9410/rich-13.7.1-py3-none-any.whl", hash = "sha256:4edbae314f59eb482f54e9e30bf00d33350aaa94f4bfcd4e9e3110e64d0d7222", size = 240681, upload-time = "2024-02-28T14:51:14.353Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rouge-score"
|
||||
version = "0.1.2"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "absl-py" },
|
||||
{ name = "nltk" },
|
||||
{ name = "numpy" },
|
||||
{ name = "six" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/e2/c5/9136736c37022a6ad27fea38f3111eb8f02fe75d067f9a985cc358653102/rouge_score-0.1.2.tar.gz", hash = "sha256:c7d4da2683e68c9abf0135ef915d63a46643666f848e558a1b9f7ead17ff0f04", size = 17400, upload-time = "2022-07-22T22:46:22.909Z" }
|
||||
|
||||
[[package]]
|
||||
name = "rpds-py"
|
||||
version = "0.24.0"
|
||||
|
Loading…
x
Reference in New Issue
Block a user