From 7fcc46b39a650dbda938c066f348082477e72eef Mon Sep 17 00:00:00 2001
From: jingyaogong
Date: Fri, 4 Apr 2025 11:39:41 +0800
Subject: [PATCH] update seed set

---
 train_distill_reason.py | 9 ++++++++-
 train_distillation.py   | 9 ++++++++-
 train_dpo.py            | 9 ++++++++-
 train_full_sft.py       | 9 ++++++++-
 train_lora.py           | 9 ++++++++-
 train_pretrain.py       | 9 ++++++++-
 6 files changed, 48 insertions(+), 6 deletions(-)

diff --git a/train_distill_reason.py b/train_distill_reason.py
index 1fe9ad1..93effde 100644
--- a/train_distill_reason.py
+++ b/train_distill_reason.py
@@ -163,7 +163,6 @@ if __name__ == "__main__":
     os.makedirs(args.save_dir, exist_ok=True)
     os.makedirs(args.out_dir, exist_ok=True)
     tokens_per_iter = args.batch_size * lm_config.max_seq_len
-    torch.manual_seed(1337)
     device_type = "cuda" if "cuda" in args.device else "cpu"
 
     args.wandb_run_name = f"MiniMind-Distill-Reasoning-Epoch-{args.epochs}-BatchSize-{args.batch_size}-LearningRate-{args.learning_rate}"
@@ -171,9 +170,17 @@ if __name__ == "__main__":
     ctx = nullcontext() if device_type == "cpu" else torch.cuda.amp.autocast()
     ddp = int(os.environ.get("RANK", -1)) != -1  # is this a ddp run?
     ddp_local_rank, DEVICE = 0, "cuda:0"
+    base_seed = 1337
+    torch.manual_seed(base_seed)
+    torch.cuda.manual_seed(base_seed)
+
     if ddp:
         init_distributed_mode()
         args.device = torch.device(DEVICE)
+        rank = dist.get_rank()
+        torch.manual_seed(base_seed + rank)
+        # Also set the CUDA random seed
+        torch.cuda.manual_seed(base_seed + rank)
 
     if args.use_wandb and (not ddp or ddp_local_rank == 0):
         import wandb
diff --git a/train_distillation.py b/train_distillation.py
index 5f5f9f6..985e037 100644
--- a/train_distillation.py
+++ b/train_distillation.py
@@ -209,7 +209,6 @@ if __name__ == "__main__":
     os.makedirs(args.save_dir, exist_ok=True)
     os.makedirs(args.out_dir, exist_ok=True)
     tokens_per_iter = args.batch_size * max_seq_len
-    torch.manual_seed(1337)
     device_type = "cuda" if "cuda" in args.device else "cpu"
 
     args.wandb_run_name = f"MiniMind-Dist-SFT-Epoch-{args.epochs}-BatchSize-{args.batch_size}-LearningRate-{args.learning_rate}"
@@ -217,9 +216,17 @@ if __name__ == "__main__":
     ctx = nullcontext() if device_type == "cpu" else torch.cuda.amp.autocast()
     ddp = int(os.environ.get("RANK", -1)) != -1  # is this a ddp run?
     ddp_local_rank, DEVICE = 0, "cuda:0"
+    base_seed = 1337
+    torch.manual_seed(base_seed)
+    torch.cuda.manual_seed(base_seed)
+
     if ddp:
         init_distributed_mode()
         args.device = torch.device(DEVICE)
+        rank = dist.get_rank()
+        torch.manual_seed(base_seed + rank)
+        # Also set the CUDA random seed
+        torch.cuda.manual_seed(base_seed + rank)
 
     if args.use_wandb and (not ddp or ddp_local_rank == 0):
         import wandb
diff --git a/train_dpo.py b/train_dpo.py
index e0b67af..e79dfb5 100644
--- a/train_dpo.py
+++ b/train_dpo.py
@@ -195,7 +195,6 @@ if __name__ == "__main__":
     os.makedirs(args.save_dir, exist_ok=True)
     os.makedirs(args.out_dir, exist_ok=True)
     tokens_per_iter = args.batch_size * lm_config.max_seq_len
-    torch.manual_seed(1337)
     device_type = "cuda" if "cuda" in args.device else "cpu"
 
     args.wandb_run_name = f"MiniMind-Full-DPO-Epoch-{args.epochs}-BatchSize-{args.batch_size}-LearningRate-{args.learning_rate}"
@@ -203,9 +202,17 @@ if __name__ == "__main__":
     ctx = nullcontext() if device_type == "cpu" else torch.cuda.amp.autocast()
     ddp = int(os.environ.get("RANK", -1)) != -1  # is this a ddp run?
     ddp_local_rank, DEVICE = 0, "cuda:0"
+    base_seed = 1337
+    torch.manual_seed(base_seed)
+    torch.cuda.manual_seed(base_seed)
+
     if ddp:
         init_distributed_mode()
         args.device = torch.device(DEVICE)
+        rank = dist.get_rank()
+        torch.manual_seed(base_seed + rank)
+        # Also set the CUDA random seed
+        torch.cuda.manual_seed(base_seed + rank)
 
     if args.use_wandb and (not ddp or ddp_local_rank == 0):
         import wandb
diff --git a/train_full_sft.py b/train_full_sft.py
index 5185861..49bd3a8 100644
--- a/train_full_sft.py
+++ b/train_full_sft.py
@@ -150,7 +150,6 @@ if __name__ == "__main__":
     os.makedirs(args.save_dir, exist_ok=True)
     os.makedirs(args.out_dir, exist_ok=True)
     tokens_per_iter = args.batch_size * lm_config.max_seq_len
-    torch.manual_seed(1337)
     device_type = "cuda" if "cuda" in args.device else "cpu"
 
     args.wandb_run_name = f"MiniMind-Full-SFT-Epoch-{args.epochs}-BatchSize-{args.batch_size}-LearningRate-{args.learning_rate}"
@@ -158,9 +157,17 @@ if __name__ == "__main__":
     ctx = nullcontext() if device_type == "cpu" else torch.cuda.amp.autocast()
     ddp = int(os.environ.get("RANK", -1)) != -1  # is this a ddp run?
     ddp_local_rank, DEVICE = 0, "cuda:0"
+    base_seed = 1337
+    torch.manual_seed(base_seed)
+    torch.cuda.manual_seed(base_seed)
+
     if ddp:
         init_distributed_mode()
         args.device = torch.device(DEVICE)
+        rank = dist.get_rank()
+        torch.manual_seed(base_seed + rank)
+        # Also set the CUDA random seed
+        torch.cuda.manual_seed(base_seed + rank)
 
     if args.use_wandb and (not ddp or ddp_local_rank == 0):
         import wandb
diff --git a/train_lora.py b/train_lora.py
index 2549cb5..6f373dd 100644
--- a/train_lora.py
+++ b/train_lora.py
@@ -137,15 +137,22 @@ if __name__ == "__main__":
     os.makedirs(args.save_dir, exist_ok=True)
     os.makedirs(args.out_dir, exist_ok=True)
     tokens_per_iter = args.batch_size * lm_config.max_seq_len
-    torch.manual_seed(1337)
     device_type = "cuda" if "cuda" in args.device else "cpu"
 
     ctx = nullcontext() if device_type == "cpu" else torch.cuda.amp.autocast()
     ddp = int(os.environ.get("RANK", -1)) != -1  # is this a ddp run?
     ddp_local_rank, DEVICE = 0, "cuda:0"
+    base_seed = 1337
+    torch.manual_seed(base_seed)
+    torch.cuda.manual_seed(base_seed)
+
     if ddp:
         init_distributed_mode()
         args.device = torch.device(DEVICE)
+        rank = dist.get_rank()
+        torch.manual_seed(base_seed + rank)
+        # Also set the CUDA random seed
+        torch.cuda.manual_seed(base_seed + rank)
 
     args.wandb_run_name = f"MiniMind-Lora-SFT-Epoch-{args.epochs}-BatchSize-{args.batch_size}-LearningRate-{args.learning_rate}"
     if args.use_wandb and (not ddp or ddp_local_rank == 0):
diff --git a/train_pretrain.py b/train_pretrain.py
index a4465b6..780905e 100644
--- a/train_pretrain.py
+++ b/train_pretrain.py
@@ -146,7 +146,6 @@ if __name__ == "__main__":
     os.makedirs(args.save_dir, exist_ok=True)
     os.makedirs(args.out_dir, exist_ok=True)
     tokens_per_iter = args.batch_size * lm_config.max_seq_len
-    torch.manual_seed(1337)
     device_type = "cuda" if "cuda" in args.device else "cpu"
 
     args.wandb_run_name = f"MiniMind-Pretrain-Epoch-{args.epochs}-BatchSize-{args.batch_size}-LearningRate-{args.learning_rate}"
@@ -156,9 +155,17 @@
 
     ddp = int(os.environ.get("RANK", -1)) != -1  # is this a ddp run?
     ddp_local_rank, DEVICE = 0, "cuda:0"
+    base_seed = 1337
+    torch.manual_seed(base_seed)
+    torch.cuda.manual_seed(base_seed)
+
     if ddp:
         init_distributed_mode()
         args.device = torch.device(DEVICE)
+        rank = dist.get_rank()
+        torch.manual_seed(base_seed + rank)
+        # Also set the CUDA random seed
+        torch.cuda.manual_seed(base_seed + rank)
 
     if args.use_wandb and (not ddp or ddp_local_rank == 0):
         import wandb
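
Note (not part of the commit above): the patch applies the same per-rank seeding pattern to all six training scripts. Below is a minimal, self-contained sketch of that pattern; setup_seed is an illustrative name rather than a function in the repository, and the DDP branch assumes the process was launched with torchrun so that the RANK environment variable is set.

    import os
    import torch
    import torch.distributed as dist

    def setup_seed(base_seed: int = 1337):
        # Single-process case: every run starts from the same base seed.
        torch.manual_seed(base_seed)
        torch.cuda.manual_seed(base_seed)
        if int(os.environ.get("RANK", -1)) != -1:  # DDP run (e.g. launched via torchrun)
            dist.init_process_group(backend="nccl")
            rank = dist.get_rank()
            # Offset the seed by the rank so data shuffling and dropout differ across
            # workers, while each rank stays reproducible from run to run.
            torch.manual_seed(base_seed + rank)
            torch.cuda.manual_seed(base_seed + rank)

In the patch itself the process group is created by the scripts' existing init_distributed_mode() helper; the sketch calls init_process_group directly only to stay self-contained.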