Fixed some bugs

Yu Chengzhang 2025-07-17 12:06:28 +08:00
parent d701003f8a
commit d9d281967e
3 changed files with 175 additions and 34 deletions

.vscode/launch.json

@@ -2,39 +2,123 @@
     "version": "0.2.0",
     "configurations": [
         {
-            "name": "DynamicKV-LLM Mini Minimind Debug",
+            "name": "MiniMind Training (Direct Python)",
+            "type": "python",
+            "request": "launch",
+            "program": "${workspaceFolder}/train_pretrain_accelerate.py",
+            "args": [
+                "--out_dir", "out",
+                "--epochs", "3",
+                "--embedding_epoch", "2",
+                "--batch_size", "128",
+                "--learning_rate", "8e-5",
+                "--dtype", "bfloat16",
+                "--use_swanlab",
+                "--swanlab_project", "MiniMind-Pretrain",
+                "--num_workers", "1",
+                "--accumulation_steps", "16",
+                "--grad_clip", "0.5",
+                "--warmup_iters", "0",
+                "--log_interval", "1",
+                "--save_interval", "10000",
+                "--dim", "512",
+                "--n_layers", "8",
+                "--max_seq_len", "512",
+                "--data_path", "./dataset/stable/merged_pretrain.jsonl",
+                "--profile",
+                "--profile_interval", "10",
+                "--use_flash_attn",
+                "--knowledge_num", "1048576",
+                "--knowledge_length", "32",
+                "--database_init_path", "./dataset/stable/sentence_trex_data.json",
+                "--fast_clustering",
+                "--cluster_cache_path", "./cache/cluster_tokens_single.pt",
+                "--memory_monitor_interval", "10",
+                "--model_type", "model",
+                "--model_size", "538"
+            ],
+            "env": {
+                "CUDA_VISIBLE_DEVICES": "0",
+                "NCCL_DEBUG": "INFO",
+                "PYTHONFAULTHANDLER": "1"
+            },
+            "cwd": "${workspaceFolder}",
+            "console": "integratedTerminal",
+            "justMyCode": false,
+            "stopOnEntry": false,
+            "python": "${workspaceFolder}/.venv/bin/python"
+        },
+        {
+            "name": "MiniMind Training (Direct Python - Simple)",
+            "type": "python",
+            "request": "launch",
+            "program": "${workspaceFolder}/train_pretrain_accelerate.py",
+            "args": [
+                "--epochs", "1",
+                "--batch_size", "32",
+                "--learning_rate", "1e-4",
+                "--log_interval", "10",
+                "--profile_interval", "2",
+                "--model_type", "model_original"
+            ],
+            "env": {
+                "CUDA_VISIBLE_DEVICES": "0"
+            },
+            "cwd": "${workspaceFolder}",
+            "console": "integratedTerminal",
+            "justMyCode": false,
+            "stopOnEntry": false,
+            "python": "${workspaceFolder}/.venv/bin/python"
+        },
+        {
+            "name": "MiniMind Test (Direct Python)",
+            "type": "python",
+            "request": "launch",
+            "program": "${workspaceFolder}/test.py",
+            "env": {
+                "CUDA_VISIBLE_DEVICES": "0"
+            },
+            "cwd": "${workspaceFolder}",
+            "console": "integratedTerminal",
+            "justMyCode": false,
+            "python": "${workspaceFolder}/.venv/bin/python"
+        },
+        {
+            "name": "MiniMind Training Debug (Accelerate)",
             "type": "python",
             "request": "launch",
             "module": "accelerate.commands.launch",
             "args": [
                 "--num_processes=1",
                 "--mixed_precision=bf16",
-                "--main_process_port=29500",
-                "train_pretrain_accelerate.py",
-                "--batch_size", "16",
-                "--knowledge_num", "48020",
-                "--num_workers", "1",
-                "--epochs", "4",
-                "--learning_rate", "2e-4",
-                "--dtype", "bfloat16",
-                "--accumulation_steps", "32",
-                "--grad_clip", "1.0",
-                "--log_interval", "50",
-                "--save_interval", "10000",
-                "--dim", "512",
-                "--n_layers", "8",
-                "--max_seq_len", "512",
-                "--use_flash_attn",
-                "--profile",
-                "--profile_interval", "10"
+                "${workspaceFolder}/train_pretrain_accelerate.py",
+                "--epochs", "1",
+                "--batch_size", "32",
+                "--learning_rate", "1e-4",
+                "--log_interval", "10",
+                "--profile_interval", "2",
+                "--model_type", "model_original"
             ],
             "env": {
                 "CUDA_VISIBLE_DEVICES": "0"
             },
-            "console": "integratedTerminal",
             "cwd": "${workspaceFolder}",
+            "console": "integratedTerminal",
             "justMyCode": false,
-            "stopOnEntry": false
+            "stopOnEntry": false,
+            "python": "${workspaceFolder}/.venv/bin/python"
+        },
+        {
+            "name": "MiniMind Test Only",
+            "type": "python",
+            "request": "launch",
+            "program": "${workspaceFolder}/test.py",
+            "env": {
+                "CUDA_VISIBLE_DEVICES": "0"
+            },
+            "cwd": "${workspaceFolder}",
+            "console": "integratedTerminal",
+            "justMyCode": false
         }
     ]
 }
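For a quick sanity check of what each configuration launches outside VS Code, a throwaway script along these lines works (an illustrative sketch, assuming the file stays plain JSON without JSONC comments):

import json

# Print every debug configuration with its entry point and arguments
with open(".vscode/launch.json") as f:
    launch = json.load(f)

for cfg in launch["configurations"]:
    entry = cfg.get("program") or cfg.get("module")
    print(f'{cfg["name"]}: {entry} {" ".join(cfg.get("args", []))}')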


@@ -0,0 +1,45 @@
#!/bin/bash
# Activate the conda environment
# source $(conda info --base)/etc/profile.d/conda.sh
# conda activate ycz_accelerate
# Set environment variables to help with debugging
export NCCL_DEBUG=INFO
export PYTHONFAULTHANDLER=1
# Experiment 1.3.0 - configure accelerate directly via command-line arguments
CUDA_VISIBLE_DEVICES=0 accelerate launch \
--num_processes=1 \
--mixed_precision=bf16 \
--main_process_port=29500 \
train_pretrain_accelerate.py \
--out_dir "out" \
--epochs 3 \
--embedding_epoch 2 \
--batch_size 128 \
--learning_rate 8e-5 \
--dtype bfloat16 \
--use_swanlab \
--swanlab_project "MiniMind-Pretrain" \
--num_workers 1 \
--accumulation_steps 16 \
--grad_clip 0.5 \
--warmup_iters 0 \
--log_interval 100 \
--save_interval 10000 \
--dim 512 \
--n_layers 8 \
--max_seq_len 512 \
--data_path "./dataset/stable/merged_pretrain.jsonl" \
--profile \
--profile_interval 10 \
--use_flash_attn \
--knowledge_num 1048576 \
--knowledge_length 32 \
--database_init_path "./dataset/stable/sentence_trex_data.json" \
--fast_clustering \
--cluster_cache_path "./cache/cluster_tokens_single.pt" \
--memory_monitor_interval 10 \
--model_type "model" \
--model_size 538
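The same run can also be driven from Python rather than bash, for example when scripting parameter sweeps. This is only an illustrative wrapper around the accelerate CLI with a subset of the flags above, not part of this commit:

import os
import subprocess

# Mirror the environment variables set in the shell script
env = dict(os.environ, CUDA_VISIBLE_DEVICES="0", NCCL_DEBUG="INFO", PYTHONFAULTHANDLER="1")

cmd = [
    "accelerate", "launch",
    "--num_processes=1", "--mixed_precision=bf16", "--main_process_port=29500",
    "train_pretrain_accelerate.py",
    "--epochs", "3",
    "--batch_size", "128",
    "--learning_rate", "8e-5",
    "--data_path", "./dataset/stable/merged_pretrain.jsonl",
    "--model_type", "model",
    "--model_size", "538",
]
subprocess.run(cmd, env=env, check=True)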


@@ -506,7 +506,7 @@ def init_model(lm_config, pretrained_embedding_path=None, database_init_path=None
     return model, tokenizer

-def train_epoch(epoch, accelerator, model, train_loader, optimizer, scheduler, args, ctx, overall_start_time, swanlab_run):
+def train_epoch(epoch, accelerator, model, train_loader, optimizer, scheduler, args, ctx, overall_start_time, swanlab_run, tokenizer):
     loss_fct = nn.CrossEntropyLoss(reduction='none')
     epoch_start_time = time.time()
     total_steps_in_epoch = len(train_loader)
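The reduction='none' loss above is the usual setup for per-token losses that are masked before averaging; the masking itself happens later in train_epoch and is not shown in this hunk, so the following is only a minimal sketch of that common pattern, with hypothetical names:

import torch
import torch.nn as nn

loss_fct = nn.CrossEntropyLoss(reduction='none')

def masked_lm_loss(logits, targets, loss_mask):
    # logits: [batch, seq_len, vocab], targets/loss_mask: [batch, seq_len]
    per_token = loss_fct(logits.view(-1, logits.size(-1)), targets.view(-1))
    per_token = per_token.view(targets.size())
    # Zero out ignored positions and average over the tokens that count
    return (per_token * loss_mask).sum() / loss_mask.sum().clamp(min=1)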
@@ -691,11 +691,15 @@ def train_epoch(epoch, accelerator, model, train_loader, optimizer, scheduler, args
                 # Randomly pick one sample from the batch
                 random_idx = torch.randint(0, X.size(0), (1,)).item()
                 sample_input = X[random_idx:random_idx+1]  # [1, seq_len]
+                sample_target = Y[random_idx:random_idx+1]  # [1, seq_len]

-                # Take the leading part as the prompt (e.g. the first half)
-                prompt_len = min(sample_input.size(1) // 2, sample_input.size(1) - 10)
+                # Take the leading part as the prompt, keeping 10 tokens after it as ground truth
+                prompt_len = sample_input.size(1) // 2
                 prompt_input = sample_input[:, :prompt_len]

+                # Ground-truth next 10 tokens
+                true_next_tokens = sample_target[:, prompt_len-1:prompt_len-1+10]  # the actual next 10 tokens
+
                 # Generate 10 tokens
                 unwrapped_model = accelerator.unwrap_model(model)
                 unwrapped_model.eval()  # switch to eval mode
@@ -711,15 +711,23 @@ def train_epoch(epoch, accelerator, model, train_loader, optimizer, scheduler, args
                 )

                 # Convert to human-readable text
-                original_text = tokenizer.decode(sample_input[0], skip_special_tokens=True)
                 prompt_text = tokenizer.decode(prompt_input[0], skip_special_tokens=True)
-                generated_text = tokenizer.decode(generated[0], skip_special_tokens=True)
-                new_tokens_text = generated_text[len(prompt_text):]
+                true_text = tokenizer.decode(true_next_tokens[0], skip_special_tokens=True)

-                Logger(f"Text generation sample:", accelerator)
-                Logger(f"  Original text: {original_text[:100]}...", accelerator)
-                Logger(f"  Input prompt: {prompt_text[-50:]}", accelerator)
-                Logger(f"  Generated continuation: {new_tokens_text}", accelerator)
+                # Get the newly generated tokens
+                prompt_tokens = prompt_input[0].tolist()
+                generated_tokens = generated[0].tolist()
+
+                if len(generated_tokens) > len(prompt_tokens):
+                    new_tokens = generated_tokens[len(prompt_tokens):len(prompt_tokens)+10]  # keep only the first 10
+                    generated_text = tokenizer.decode(new_tokens, skip_special_tokens=True)
+                else:
+                    generated_text = "[no new tokens generated]"
+
+                Logger(f"Text generation comparison:", accelerator)
+                Logger(f"  Input prompt: {prompt_text}", accelerator)
+                Logger(f"  Ground-truth continuation: {true_text}", accelerator)
+                Logger(f"  Model generation: {generated_text}", accelerator)

                 unwrapped_model.train()  # restore training mode
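The slicing in this hunk relies on the standard next-token-prediction layout where Y is X shifted left by one position, which is why the ground-truth continuation starts at index prompt_len-1 of Y and the model's continuation starts right after the prompt tokens in the generate() output. A self-contained toy example of that indexing (illustrative names, not the project's code):

import torch

# Toy (X, Y) pair: Y is X shifted left by one token
X = torch.arange(0, 20).unsqueeze(0)   # [1, seq_len]
Y = torch.arange(1, 21).unsqueeze(0)   # targets

prompt_len = X.size(1) // 2            # first half of X is the prompt
prompt_input = X[:, :prompt_len]

# Y[t] is the token that follows X[t], so the 10 tokens after the prompt
# live at Y[prompt_len-1 : prompt_len-1+10]
true_next_tokens = Y[:, prompt_len-1:prompt_len-1+10]

# Stand-in for a generate() call that returns prompt + new tokens
generated = torch.cat([prompt_input, true_next_tokens], dim=1)
new_tokens = generated[0].tolist()[prompt_len:prompt_len+10]
assert new_tokens == true_next_tokens[0].tolist()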
@@ -841,7 +853,7 @@ def main():
     parser.add_argument("--accumulation_steps", type=int, default=32)
     parser.add_argument("--grad_clip", type=float, default=1.0)
     parser.add_argument("--warmup_iters", type=int, default=0)
-    parser.add_argument("--log_interval", type=int, default=100)
+    parser.add_argument("--log_interval", type=int, default=1)
     parser.add_argument("--save_interval", type=int, default=10000)
     parser.add_argument('--dim', default=512, type=int)
     parser.add_argument('--n_layers', default=8, type=int)
@@ -1033,7 +1045,7 @@ def main():
     overall_start_time = time.time()  # Record overall start time
     for epoch in range(args.epochs):
         Logger(f"Starting training epoch {epoch+1}", accelerator)
-        train_epoch(epoch, accelerator, model, train_loader, optimizer, scheduler, args, ctx, overall_start_time, swanlab_run)  # Pass overall start time
+        train_epoch(epoch, accelerator, model, train_loader, optimizer, scheduler, args, ctx, overall_start_time, swanlab_run, tokenizer)  # Pass tokenizer

         # Clean up memory after each epoch
         Logger(f"Epoch {epoch+1} training finished, cleaning up memory", accelerator)