Fixed some bugs
parent d701003f8a
commit d9d281967e
.vscode/launch.json (vendored): 126 lines changed
@@ -2,39 +2,123 @@
    "version": "0.2.0",
    "configurations": [
        {
            "name": "DynamicKV-LLM Mini Minimind Debug",
            "name": "MiniMind Training (Direct Python)",
            "type": "python",
            "request": "launch",
            "program": "${workspaceFolder}/train_pretrain_accelerate.py",
            "args": [
                "--out_dir", "out",
                "--epochs", "3",
                "--embedding_epoch", "2",
                "--batch_size", "128",
                "--learning_rate", "8e-5",
                "--dtype", "bfloat16",
                "--use_swanlab",
                "--swanlab_project", "MiniMind-Pretrain",
                "--num_workers", "1",
                "--accumulation_steps", "16",
                "--grad_clip", "0.5",
                "--warmup_iters", "0",
                "--log_interval", "1",
                "--save_interval", "10000",
                "--dim", "512",
                "--n_layers", "8",
                "--max_seq_len", "512",
                "--data_path", "./dataset/stable/merged_pretrain.jsonl",
                "--profile",
                "--profile_interval", "10",
                "--use_flash_attn",
                "--knowledge_num", "1048576",
                "--knowledge_length", "32",
                "--database_init_path", "./dataset/stable/sentence_trex_data.json",
                "--fast_clustering",
                "--cluster_cache_path", "./cache/cluster_tokens_single.pt",
                "--memory_monitor_interval", "10",
                "--model_type", "model",
                "--model_size", "538"
            ],
            "env": {
                "CUDA_VISIBLE_DEVICES": "0",
                "NCCL_DEBUG": "INFO",
                "PYTHONFAULTHANDLER": "1"
            },
            "cwd": "${workspaceFolder}",
            "console": "integratedTerminal",
            "justMyCode": false,
            "stopOnEntry": false,
            "python": "${workspaceFolder}/.venv/bin/python"
        },
        {
            "name": "MiniMind Training (Direct Python - Simple)",
            "type": "python",
            "request": "launch",
            "program": "${workspaceFolder}/train_pretrain_accelerate.py",
            "args": [
                "--epochs", "1",
                "--batch_size", "32",
                "--learning_rate", "1e-4",
                "--log_interval", "10",
                "--profile_interval", "2",
                "--model_type", "model_original"
            ],
            "env": {
                "CUDA_VISIBLE_DEVICES": "0"
            },
            "cwd": "${workspaceFolder}",
            "console": "integratedTerminal",
            "justMyCode": false,
            "stopOnEntry": false,
            "python": "${workspaceFolder}/.venv/bin/python"
        },
        {
            "name": "MiniMind Test (Direct Python)",
            "type": "python",
            "request": "launch",
            "program": "${workspaceFolder}/test.py",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0"
            },
            "cwd": "${workspaceFolder}",
            "console": "integratedTerminal",
            "justMyCode": false,
            "python": "${workspaceFolder}/.venv/bin/python"
        },
        {
            "name": "MiniMind Training Debug (Accelerate)",
            "type": "python",
            "request": "launch",
            "module": "accelerate.commands.launch",
            "args": [
                "--num_processes=1",
                "--mixed_precision=bf16",
                "--main_process_port=29500",
                "train_pretrain_accelerate.py",
                "--batch_size", "16",
                "--knowledge_num", "48020",
                "--num_workers", "1",
                "--epochs", "4",
                "--learning_rate", "2e-4",
                "--dtype", "bfloat16",
                "--accumulation_steps", "32",
                "--grad_clip", "1.0",
                "--log_interval", "50",
                "--save_interval", "10000",
                "--dim", "512",
                "--n_layers", "8",
                "--max_seq_len", "512",
                "--use_flash_attn",
                "--profile",
                "--profile_interval", "10"
                "${workspaceFolder}/train_pretrain_accelerate.py",
                "--epochs", "1",
                "--batch_size", "32",
                "--learning_rate", "1e-4",
                "--log_interval", "10",
                "--profile_interval", "2",
                "--model_type", "model_original"
            ],
            "env": {
                "CUDA_VISIBLE_DEVICES": "0"
            },
            "console": "integratedTerminal",
            "cwd": "${workspaceFolder}",
            "console": "integratedTerminal",
            "justMyCode": false,
            "stopOnEntry": false
            "stopOnEntry": false,
            "python": "${workspaceFolder}/.venv/bin/python"
        },
        {
            "name": "MiniMind Test Only",
            "type": "python",
            "request": "launch",
            "program": "${workspaceFolder}/test.py",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0"
            },
            "cwd": "${workspaceFolder}",
            "console": "integratedTerminal",
            "justMyCode": false
        }
    ]
}
run_file/experiment_1.4.0.sh (new file): 45 lines added
@@ -0,0 +1,45 @@
+#!/bin/bash
+
+# Activate the conda environment
+# source $(conda info --base)/etc/profile.d/conda.sh
+# conda activate ycz_accelerate
+
+# Set environment variables to help with debugging
+export NCCL_DEBUG=INFO
+export PYTHONFAULTHANDLER=1
+
+# Experiment 1.3.0 - configure accelerate directly via command-line arguments
+CUDA_VISIBLE_DEVICES=0 accelerate launch \
+    --num_processes=1 \
+    --mixed_precision=bf16 \
+    --main_process_port=29500 \
+    train_pretrain_accelerate.py \
+    --out_dir "out" \
+    --epochs 3 \
+    --embedding_epoch 2 \
+    --batch_size 128 \
+    --learning_rate 8e-5 \
+    --dtype bfloat16 \
+    --use_swanlab \
+    --swanlab_project "MiniMind-Pretrain" \
+    --num_workers 1 \
+    --accumulation_steps 16 \
+    --grad_clip 0.5 \
+    --warmup_iters 0 \
+    --log_interval 100 \
+    --save_interval 10000 \
+    --dim 512 \
+    --n_layers 8 \
+    --max_seq_len 512 \
+    --data_path "./dataset/stable/merged_pretrain.jsonl" \
+    --profile \
+    --profile_interval 10 \
+    --use_flash_attn \
+    --knowledge_num 1048576 \
+    --knowledge_length 32 \
+    --database_init_path "./dataset/stable/sentence_trex_data.json" \
+    --fast_clustering \
+    --cluster_cache_path "./cache/cluster_tokens_single.pt" \
+    --memory_monitor_interval 10 \
+    --model_type "model" \
+    --model_size 538
train_pretrain_accelerate.py
@@ -506,7 +506,7 @@ def init_model(lm_config, pretrained_embedding_path=None, database_init_path=Non
 
     return model, tokenizer
 
-def train_epoch(epoch, accelerator, model, train_loader, optimizer, scheduler, args, ctx, overall_start_time, swanlab_run):
+def train_epoch(epoch, accelerator, model, train_loader, optimizer, scheduler, args, ctx, overall_start_time, swanlab_run, tokenizer):
    loss_fct = nn.CrossEntropyLoss(reduction='none')
    epoch_start_time = time.time()
    total_steps_in_epoch = len(train_loader)
@@ -691,11 +691,15 @@ def train_epoch(epoch, accelerator, model, train_loader, optimizer, scheduler, a
 # Randomly pick one sample
 random_idx = torch.randint(0, X.size(0), (1,)).item()
 sample_input = X[random_idx:random_idx+1]  # [1, seq_len]
 sample_target = Y[random_idx:random_idx+1]  # [1, seq_len]
 
-# Take the leading part as the prompt (e.g. the first half)
-prompt_len = min(sample_input.size(1) // 2, sample_input.size(1) - 10)
+# Take the leading part as the prompt, keeping 10 tokens after it as ground truth
+prompt_len = sample_input.size(1) // 2
 prompt_input = sample_input[:, :prompt_len]
+
+# Get the true next 10 tokens
+true_next_tokens = sample_target[:, prompt_len-1:prompt_len-1+10]  # the true next 10 tokens
 
 # Generate 10 tokens
 unwrapped_model = accelerator.unwrap_model(model)
 unwrapped_model.eval()  # switch to eval mode
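The prompt_len-1 offset used for true_next_tokens relies on Y being X shifted left by one position, the usual next-token pretraining setup. A minimal sketch of that alignment with made-up token IDs, assuming exactly this X/Y convention:

import torch

# Hypothetical toy ids; the assumption is that Y[i] is the token that follows X[i].
X = torch.tensor([[11, 12, 13, 14, 15, 16, 17, 18]])  # [1, seq_len]
Y = torch.tensor([[12, 13, 14, 15, 16, 17, 18, 19]])  # X shifted left by one

prompt_len = X.size(1) // 2                     # 4, so the prompt is [11, 12, 13, 14]
true_next = Y[:, prompt_len-1:prompt_len-1+2]   # the 2 tokens that follow the prompt (10 in the real code)

assert true_next.tolist() == [[15, 16]]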
@ -711,15 +715,23 @@ def train_epoch(epoch, accelerator, model, train_loader, optimizer, scheduler, a
|
||||
)
|
||||
|
||||
# 转换为人类可读文本
|
||||
original_text = tokenizer.decode(sample_input[0], skip_special_tokens=True)
|
||||
prompt_text = tokenizer.decode(prompt_input[0], skip_special_tokens=True)
|
||||
generated_text = tokenizer.decode(generated[0], skip_special_tokens=True)
|
||||
new_tokens_text = generated_text[len(prompt_text):]
|
||||
true_text = tokenizer.decode(true_next_tokens[0], skip_special_tokens=True)
|
||||
|
||||
Logger(f"生成文本示例:", accelerator)
|
||||
Logger(f" 原始文本: {original_text[:100]}...", accelerator)
|
||||
Logger(f" 输入提示: {prompt_text[-50:]}", accelerator)
|
||||
Logger(f" 生成续写: {new_tokens_text}", accelerator)
|
||||
# 获取新生成的token
|
||||
prompt_tokens = prompt_input[0].tolist()
|
||||
generated_tokens = generated[0].tolist()
|
||||
|
||||
if len(generated_tokens) > len(prompt_tokens):
|
||||
new_tokens = generated_tokens[len(prompt_tokens):len(prompt_tokens)+10] # 只取前10个
|
||||
generated_text = tokenizer.decode(new_tokens, skip_special_tokens=True)
|
||||
else:
|
||||
generated_text = "[未生成新token]"
|
||||
|
||||
Logger(f"文本生成对比:", accelerator)
|
||||
Logger(f" 输入提示: {prompt_text}", accelerator)
|
||||
Logger(f" 真实续写: {true_text}", accelerator)
|
||||
Logger(f" 模型生成: {generated_text}", accelerator)
|
||||
|
||||
unwrapped_model.train() # 恢复训练模式
|
||||
|
||||
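Pulled out of the training loop, the comparison in this hunk boils down to the helper below. The function name and signature are illustrative only; it assumes a tokenizer exposing decode(..., skip_special_tokens=True) and a model whose generate() accepts input ids plus a max_new_tokens argument, which may differ from the actual MiniMind generate signature.

import torch

def log_generation_sample(model, tokenizer, prompt_input, true_next_tokens, num_new=10):
    # prompt_input: [1, prompt_len] token ids; true_next_tokens: [1, num_new] ground-truth ids
    model.eval()
    with torch.no_grad():
        generated = model.generate(prompt_input, max_new_tokens=num_new)  # assumed kwarg
    model.train()

    prompt_tokens = prompt_input[0].tolist()
    generated_tokens = generated[0].tolist()
    if len(generated_tokens) > len(prompt_tokens):
        new_tokens = generated_tokens[len(prompt_tokens):len(prompt_tokens) + num_new]
        generated_text = tokenizer.decode(new_tokens, skip_special_tokens=True)
    else:
        generated_text = "[no new tokens generated]"

    print("Input prompt:      ", tokenizer.decode(prompt_input[0], skip_special_tokens=True))
    print("True continuation: ", tokenizer.decode(true_next_tokens[0], skip_special_tokens=True))
    print("Model generation:  ", generated_text)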
@@ -841,7 +853,7 @@ def main():
    parser.add_argument("--accumulation_steps", type=int, default=32)
    parser.add_argument("--grad_clip", type=float, default=1.0)
    parser.add_argument("--warmup_iters", type=int, default=0)
-   parser.add_argument("--log_interval", type=int, default=100)
+   parser.add_argument("--log_interval", type=int, default=1)
    parser.add_argument("--save_interval", type=int, default=10000)
    parser.add_argument('--dim', default=512, type=int)
    parser.add_argument('--n_layers', default=8, type=int)
@@ -1033,7 +1045,7 @@ def main():
    overall_start_time = time.time()  # Record overall start time
    for epoch in range(args.epochs):
        Logger(f"Starting epoch {epoch+1}", accelerator)
-       train_epoch(epoch, accelerator, model, train_loader, optimizer, scheduler, args, ctx, overall_start_time, swanlab_run) # Pass overall start time
+       train_epoch(epoch, accelerator, model, train_loader, optimizer, scheduler, args, ctx, overall_start_time, swanlab_run, tokenizer) # Pass tokenizer
 
        # Memory cleanup after each epoch
        Logger(f"Epoch {epoch+1} finished, running memory cleanup", accelerator)