diff --git a/run_file/experiment_1.3.0.sh b/run_file/experiment_1.3.0.sh
new file mode 100644
index 0000000..6ad154e
--- /dev/null
+++ b/run_file/experiment_1.3.0.sh
@@ -0,0 +1,47 @@
+#!/bin/bash
+
+# Activate the conda environment
+source $(conda info --base)/etc/profile.d/conda.sh
+conda activate ycz_accelerate
+
+# Set environment variables to aid debugging
+export NCCL_DEBUG=INFO
+export PYTHONFAULTHANDLER=1
+
+# Experiment 1.3.0 - configure accelerate directly via command-line arguments
+CUDA_VISIBLE_DEVICES=0,1,2,3 accelerate launch \
+    --multi_gpu \
+    --num_processes=4 \
+    --mixed_precision=bf16 \
+    --main_process_port=29500 \
+    train_pretrain_accelerate.py \
+    --out_dir "out" \
+    --epochs 3 \
+    --embedding_epoch 2 \
+    --batch_size 72 \
+    --learning_rate 1e-4 \
+    --dtype "bfloat16" \
+    --use_swanlab \
+    --swanlab_project "MiniMind-Pretrain" \
+    --num_workers 1 \
+    --accumulation_steps 16 \
+    --grad_clip 0.5 \
+    --warmup_iters 0 \
+    --log_interval 100 \
+    --save_interval 10000 \
+    --dim 1024 \
+    --n_layers 32 \
+    --max_seq_len 512 \
+    --use_moe False \
+    --data_path "./dataset/stable/merged_pretrain.jsonl" \
+    --profile \
+    --profile_interval 10 \
+    --use_flash_attn \
+    --knowledge_num 960400 \
+    --knowledge_length 32 \
+    --database_init_path "./dataset/stable/sentence_trex_data.json" \
+    --fast_clustering \
+    --cluster_cache_path "./cache/cluster_tokens_single.pt" \
+    --memory_monitor_interval 10 \
+    --model_type "model_original" \
+    --model_size "0.5B"
diff --git a/train_pretrain_accelerate.py b/train_pretrain_accelerate.py
index 5435c28..b10585c 100644
--- a/train_pretrain_accelerate.py
+++ b/train_pretrain_accelerate.py
@@ -200,14 +200,20 @@ def init_model(lm_config, pretrained_embedding_path=None, database_init_path=Non
         with open(database_init_path, 'r', encoding='utf-8') as f:
             database_data = json.load(f)
 
+        sentences_data = []
+        for data in database_data:
+            sentences_data.append(data['target'][0]['sentence'])
+
         # Extract the sentences list
-        sentences_data = database_data.get('sentences', [])
+        # sentences_data = database_data.get('sentences', [])
         Logger(f"Loaded {len(sentences_data)} sentences from database")
 
         # 2. Sort by importance_score (highest to lowest)
-        sorted_sentences = sorted(sentences_data, key=lambda x: x.get('importance_score', 0.0), reverse=True)
-        Logger(f"Sorted sentences by importance score (highest: {sorted_sentences[0].get('importance_score', 0.0)}, lowest: {sorted_sentences[-1].get('importance_score', 0.0)})")
-
+        try:
+            sorted_sentences = sorted(sentences_data, key=lambda x: x.get('importance_score', 0.0), reverse=True)
+            Logger(f"Sorted sentences by importance score (highest: {sorted_sentences[0].get('importance_score', 0.0)}, lowest: {sorted_sentences[-1].get('importance_score', 0.0)})")
+        except:
+            sorted_sentences = sentences_data
         # 3. Process each entry individually, without clustering
         Logger("Processing individual sentences...")
         processed_rows = []
@@ -218,16 +224,25 @@ def init_model(lm_config, pretrained_embedding_path=None, database_init_path=Non
         # Process the required number of sentences
         num_to_process = min(knowledge_num, len(sorted_sentences))
 
+        # Counters for truncation statistics
+        total_sentences = 0
+        truncated_sentences = 0
+
         for i in range(num_to_process):
             sentence_data = sorted_sentences[i]
-            sentence = sentence_data.get('corrected_sentence', '')
+            try:
+                sentence = sentence_data.get('corrected_sentence')
+            except:
+                sentence = sentence_data
 
             # Convert the sentence to tokens
             sentence_tokens = tokenizer.encode(sentence, add_special_tokens=False)
 
             # Truncate or pad to knowledge_length
+            total_sentences += 1
             if len(sentence_tokens) > knowledge_length:
                 # Truncate if it exceeds the length
+                truncated_sentences += 1
                 sentence_tokens = sentence_tokens[:knowledge_length]
                 Logger(f"Sentence {i+1} truncated from {len(tokenizer.encode(sentence, add_special_tokens=False))} to {knowledge_length} tokens")
             else:
@@ -254,6 +269,13 @@ def init_model(lm_config, pretrained_embedding_path=None, database_init_path=Non
         # Convert to a tensor
         processed_tensor = torch.tensor(processed_rows, dtype=torch.long)
 
+        # Compute and log the share of truncated sentences
+        truncation_ratio = truncated_sentences / total_sentences if total_sentences > 0 else 0.0
+        Logger(f"Truncation statistics:")
+        Logger(f"  - Total sentences: {total_sentences}")
+        Logger(f"  - Truncated sentences: {truncated_sentences}")
+        Logger(f"  - Truncation ratio: {truncation_ratio:.4f} ({truncation_ratio*100:.2f}%)")
+
         Logger(f"Data processing completed:")
         Logger(f"  - Processed {num_to_process} sentences")
         Logger(f"  - Added {knowledge_num - num_to_process} empty entries")
@@ -312,7 +334,7 @@ def train_epoch(epoch, accelerator, model, train_loader, optimizer, scheduler, a
         optimizer_end = torch.cuda.Event(enable_timing=True)
 
     # Prefetch data
-    prefetch_factor = 2  # number of batches to prefetch
+    prefetch_factor = 4  # number of batches to prefetch
     data_iter = iter(train_loader)
     prefetch_batches = []
 
@@ -603,13 +625,15 @@ def main():
     parser.add_argument("--use_flash_attn", action="store_true", default=True, help="Enable FlashAttention")
     parser.add_argument("--knowledge_num", type=int, default=960400, help="Number of entries in the knowledge base")
     parser.add_argument("--knowledge_length", type=int, default=32, help="Sentence length (in tokens) in the knowledge base")
-    parser.add_argument("--database_init_path", type=str, default="./dataset/combined_prepare.json", help="Database initialization path")
+    parser.add_argument("--database_init_path", type=str, default="./dataset/stable/sentence_trex_data.json", help="Database initialization path")
     parser.add_argument("--fast_clustering", action="store_true", default=True, help="Use the fast approximate clustering algorithm (for large datasets)")
     parser.add_argument("--cluster_cache_path", type=str, default="./cache/cluster_tokens_single.pt", help="Path to the clustering result cache file")
    parser.add_argument("--recompute_clusters", action="store_true", default=False, help="Force recomputation of clusters, ignoring the cache file")
     parser.add_argument("--memory_monitor", action="store_true", default=False, help="Enable memory monitoring")
     parser.add_argument("--memory_monitor_interval", type=int, default=10, help="Memory monitoring interval (in steps)")
     parser.add_argument("--model_type", type=str, default="model", help="Which model to train")  # model, model_original
+    parser.add_argument("--model_size", type=str, default="0.05B", help="Model size")
+    parser.add_argument("--swanlab_online", type=bool, default=True, help="Whether to use the online SwanLab service")
     args = parser.parse_args()
 
     #########################################################
@@ -678,16 +702,23 @@ def main():
     # Initialize the SwanLab experiment instance
     swanlab_run = None
     if args.use_swanlab and accelerator.is_main_process:
-        # Initialize SwanLab
-        swanlab_run = swanlab.init(
-            project=args.swanlab_project,
-            experiment_name=args.swanlab_run_name,
-            description="MiniMind pretraining experiment, using a locally deployed SwanLab for visualization",
config=config_dict - # 设置SwanLab服务器地址和API Key - # host="http://100.123.118.114:11071", - # api_key="LesBT7HRq23HNBrOPKP8S" - ) + if args.swanlab_online: + # 使用在线SwanLab服务 + # 初始化SwanLab + swanlab_run = swanlab.init( + project=args.swanlab_project, + experiment_name=args.swanlab_run_name, + description="MiniMind预训练实验,使用本地部署的SwanLab进行可视化", + config=config_dict, + mode="offline" + ) + else: + swanlab_run = swanlab.init( + project=args.swanlab_project, + experiment_name=args.swanlab_run_name, + description="MiniMind预训练实验,使用本地部署的SwanLab进行可视化", + config=config_dict + ) else: swanlab_run = None