diff --git a/.vscode/launch.json b/.vscode/launch.json
index b954801..217806d 100644
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -2,39 +2,123 @@
     "version": "0.2.0",
     "configurations": [
         {
-            "name": "DynamicKV-LLM Mini Minimind Debug",
+            "name": "MiniMind Training (Direct Python)",
+            "type": "python",
+            "request": "launch",
+            "program": "${workspaceFolder}/train_pretrain_accelerate.py",
+            "args": [
+                "--out_dir", "out",
+                "--epochs", "3",
+                "--embedding_epoch", "2",
+                "--batch_size", "128",
+                "--learning_rate", "8e-5",
+                "--dtype", "bfloat16",
+                "--use_swanlab",
+                "--swanlab_project", "MiniMind-Pretrain",
+                "--num_workers", "1",
+                "--accumulation_steps", "16",
+                "--grad_clip", "0.5",
+                "--warmup_iters", "0",
+                "--log_interval", "1",
+                "--save_interval", "10000",
+                "--dim", "512",
+                "--n_layers", "8",
+                "--max_seq_len", "512",
+                "--data_path", "./dataset/stable/merged_pretrain.jsonl",
+                "--profile",
+                "--profile_interval", "10",
+                "--use_flash_attn",
+                "--knowledge_num", "1048576",
+                "--knowledge_length", "32",
+                "--database_init_path", "./dataset/stable/sentence_trex_data.json",
+                "--fast_clustering",
+                "--cluster_cache_path", "./cache/cluster_tokens_single.pt",
+                "--memory_monitor_interval", "10",
+                "--model_type", "model",
+                "--model_size", "538"
+            ],
+            "env": {
+                "CUDA_VISIBLE_DEVICES": "0",
+                "NCCL_DEBUG": "INFO",
+                "PYTHONFAULTHANDLER": "1"
+            },
+            "cwd": "${workspaceFolder}",
+            "console": "integratedTerminal",
+            "justMyCode": false,
+            "stopOnEntry": false,
+            "python": "${workspaceFolder}/.venv/bin/python"
+        },
+        {
+            "name": "MiniMind Training (Direct Python - Simple)",
+            "type": "python",
+            "request": "launch",
+            "program": "${workspaceFolder}/train_pretrain_accelerate.py",
+            "args": [
+                "--epochs", "1",
+                "--batch_size", "32",
+                "--learning_rate", "1e-4",
+                "--log_interval", "10",
+                "--profile_interval", "2",
+                "--model_type", "model_original"
+            ],
+            "env": {
+                "CUDA_VISIBLE_DEVICES": "0"
+            },
+            "cwd": "${workspaceFolder}",
+            "console": "integratedTerminal",
+            "justMyCode": false,
+            "stopOnEntry": false,
+            "python": "${workspaceFolder}/.venv/bin/python"
+        },
+        {
+            "name": "MiniMind Test (Direct Python)",
+            "type": "python",
+            "request": "launch",
+            "program": "${workspaceFolder}/test.py",
+            "env": {
+                "CUDA_VISIBLE_DEVICES": "0"
+            },
+            "cwd": "${workspaceFolder}",
+            "console": "integratedTerminal",
+            "justMyCode": false,
+            "python": "${workspaceFolder}/.venv/bin/python"
+        },
+        {
+            "name": "MiniMind Training Debug (Accelerate)",
             "type": "python",
             "request": "launch",
             "module": "accelerate.commands.launch",
             "args": [
                 "--num_processes=1",
                 "--mixed_precision=bf16",
-                "--main_process_port=29500",
-                "train_pretrain_accelerate.py",
-                "--batch_size", "16",
-                "--knowledge_num", "48020",
-                "--num_workers", "1",
-                "--epochs", "4",
-                "--learning_rate", "2e-4",
-                "--dtype", "bfloat16",
-                "--accumulation_steps", "32",
-                "--grad_clip", "1.0",
-                "--log_interval", "50",
-                "--save_interval", "10000",
-                "--dim", "512",
-                "--n_layers", "8",
-                "--max_seq_len", "512",
-                "--use_flash_attn",
-                "--profile",
-                "--profile_interval", "10"
+                "${workspaceFolder}/train_pretrain_accelerate.py",
+                "--epochs", "1",
+                "--batch_size", "32",
+                "--learning_rate", "1e-4",
+                "--log_interval", "10",
+                "--profile_interval", "2",
+                "--model_type", "model_original"
             ],
             "env": {
                 "CUDA_VISIBLE_DEVICES": "0"
             },
-            "console": "integratedTerminal",
             "cwd": "${workspaceFolder}",
+            "console": "integratedTerminal",
             "justMyCode": false,
-            "stopOnEntry": false
+            "stopOnEntry": false,
"${workspaceFolder}/.venv/bin/python" + }, + { + "name": "MiniMind Test Only", + "type": "python", + "request": "launch", + "program": "${workspaceFolder}/test.py", + "env": { + "CUDA_VISIBLE_DEVICES": "0" + }, + "cwd": "${workspaceFolder}", + "console": "integratedTerminal", + "justMyCode": false } ] -} +} \ No newline at end of file diff --git a/run_file/experiment_1.4.0.sh b/run_file/experiment_1.4.0.sh new file mode 100644 index 0000000..077801b --- /dev/null +++ b/run_file/experiment_1.4.0.sh @@ -0,0 +1,45 @@ +#!/bin/bash + +# 激活conda环境 +# source $(conda info --base)/etc/profile.d/conda.sh +# conda activate ycz_accelerate + +# 设置环境变量以帮助调试 +export NCCL_DEBUG=INFO +export PYTHONFAULTHANDLER=1 + +# 实验1.3.0 - 使用命令行参数直接配置accelerate +CUDA_VISIBLE_DEVICES=0 accelerate launch \ + --num_processes=1 \ + --mixed_precision=bf16 \ + --main_process_port=29500 \ + train_pretrain_accelerate.py \ + --out_dir "out" \ + --epochs 3 \ + --embedding_epoch 2 \ + --batch_size 128 \ + --learning_rate 8e-5 \ + --dtype bfloat16 \ + --use_swanlab \ + --swanlab_project "MiniMind-Pretrain" \ + --num_workers 1 \ + --accumulation_steps 16 \ + --grad_clip 0.5 \ + --warmup_iters 0 \ + --log_interval 100 \ + --save_interval 10000 \ + --dim 512 \ + --n_layers 8 \ + --max_seq_len 512 \ + --data_path "./dataset/stable/merged_pretrain.jsonl" \ + --profile \ + --profile_interval 10 \ + --use_flash_attn \ + --knowledge_num 1048576 \ + --knowledge_length 32 \ + --database_init_path "./dataset/stable/sentence_trex_data.json" \ + --fast_clustering \ + --cluster_cache_path "./cache/cluster_tokens_single.pt" \ + --memory_monitor_interval 10 \ + --model_type "model" \ + --model_size 538 diff --git a/train_pretrain_accelerate.py b/train_pretrain_accelerate.py index 74413be..9edb298 100644 --- a/train_pretrain_accelerate.py +++ b/train_pretrain_accelerate.py @@ -506,7 +506,7 @@ def init_model(lm_config, pretrained_embedding_path=None, database_init_path=Non return model, tokenizer -def train_epoch(epoch, accelerator, model, train_loader, optimizer, scheduler, args, ctx, overall_start_time, swanlab_run): +def train_epoch(epoch, accelerator, model, train_loader, optimizer, scheduler, args, ctx, overall_start_time, swanlab_run, tokenizer): loss_fct = nn.CrossEntropyLoss(reduction='none') epoch_start_time = time.time() total_steps_in_epoch = len(train_loader) @@ -691,11 +691,15 @@ def train_epoch(epoch, accelerator, model, train_loader, optimizer, scheduler, a # 随机选择一个样本 random_idx = torch.randint(0, X.size(0), (1,)).item() sample_input = X[random_idx:random_idx+1] # [1, seq_len] + sample_target = Y[random_idx:random_idx+1] # [1, seq_len] - # 取前面的部分作为prompt(例如前一半) - prompt_len = min(sample_input.size(1) // 2, sample_input.size(1) - 10) + # 取前面的部分作为prompt,确保后面有10个token作为真实值 + prompt_len = sample_input.size(1) // 2 prompt_input = sample_input[:, :prompt_len] + # 获取真实的后10个token + true_next_tokens = sample_target[:, prompt_len-1:prompt_len-1+10] # 真实的接下来10个token + # 生成10个token unwrapped_model = accelerator.unwrap_model(model) unwrapped_model.eval() # 设置为评估模式 @@ -711,15 +715,23 @@ def train_epoch(epoch, accelerator, model, train_loader, optimizer, scheduler, a ) # 转换为人类可读文本 - original_text = tokenizer.decode(sample_input[0], skip_special_tokens=True) prompt_text = tokenizer.decode(prompt_input[0], skip_special_tokens=True) - generated_text = tokenizer.decode(generated[0], skip_special_tokens=True) - new_tokens_text = generated_text[len(prompt_text):] + true_text = tokenizer.decode(true_next_tokens[0], skip_special_tokens=True) - 
Logger(f"生成文本示例:", accelerator) - Logger(f" 原始文本: {original_text[:100]}...", accelerator) - Logger(f" 输入提示: {prompt_text[-50:]}", accelerator) - Logger(f" 生成续写: {new_tokens_text}", accelerator) + # 获取新生成的token + prompt_tokens = prompt_input[0].tolist() + generated_tokens = generated[0].tolist() + + if len(generated_tokens) > len(prompt_tokens): + new_tokens = generated_tokens[len(prompt_tokens):len(prompt_tokens)+10] # 只取前10个 + generated_text = tokenizer.decode(new_tokens, skip_special_tokens=True) + else: + generated_text = "[未生成新token]" + + Logger(f"文本生成对比:", accelerator) + Logger(f" 输入提示: {prompt_text}", accelerator) + Logger(f" 真实续写: {true_text}", accelerator) + Logger(f" 模型生成: {generated_text}", accelerator) unwrapped_model.train() # 恢复训练模式 @@ -841,7 +853,7 @@ def main(): parser.add_argument("--accumulation_steps", type=int, default=32) parser.add_argument("--grad_clip", type=float, default=1.0) parser.add_argument("--warmup_iters", type=int, default=0) - parser.add_argument("--log_interval", type=int, default=100) + parser.add_argument("--log_interval", type=int, default=1) parser.add_argument("--save_interval", type=int, default=10000) parser.add_argument('--dim', default=512, type=int) parser.add_argument('--n_layers', default=8, type=int) @@ -1033,7 +1045,7 @@ def main(): overall_start_time = time.time() # Record overall start time for epoch in range(args.epochs): Logger(f"开始第{epoch+1}轮训练", accelerator) - train_epoch(epoch, accelerator, model, train_loader, optimizer, scheduler, args, ctx, overall_start_time, swanlab_run) # Pass overall start time + train_epoch(epoch, accelerator, model, train_loader, optimizer, scheduler, args, ctx, overall_start_time, swanlab_run, tokenizer) # Pass tokenizer # 每个epoch结束后进行内存清理 Logger(f"第{epoch+1}轮训练完成,进行内存清理", accelerator)