update seed set

2025-04-04 11:39:41 +08:00 · 2025-04-04 11:39:41 +08:00 · 7fcc46b39a
commit 7fcc46b39a
parent 08e9a22a25
6 changed files with 48 additions and 6 deletions
--- a/train_distill_reason.py
+++ b/train_distill_reason.py
@ -163,7 +163,6 @@ if __name__ == "__main__":
    os.makedirs(args.save_dir, exist_ok=True)
    os.makedirs(args.out_dir, exist_ok=True)
    tokens_per_iter = args.batch_size * lm_config.max_seq_len
-    torch.manual_seed(1337)
    device_type = "cuda" if "cuda" in args.device else "cpu"

    args.wandb_run_name = f"MiniMind-Distill-Reasoning-Epoch-{args.epochs}-BatchSize-{args.batch_size}-LearningRate-{args.learning_rate}"
@ -171,9 +170,17 @@ if __name__ == "__main__":
    ctx = nullcontext() if device_type == "cpu" else torch.cuda.amp.autocast()
    ddp = int(os.environ.get("RANK", -1)) != -1  # is this a ddp run?
    ddp_local_rank, DEVICE = 0, "cuda:0"
+    base_seed = 1337
+    torch.manual_seed(base_seed)
+    torch.cuda.manual_seed(base_seed)
+
    if ddp:
        init_distributed_mode()
        args.device = torch.device(DEVICE)
+        rank = dist.get_rank()
+        torch.manual_seed(base_seed + rank)
+        # 同时设置 CUDA 的随机种子
+        torch.cuda.manual_seed(base_seed + rank)

    if args.use_wandb and (not ddp or ddp_local_rank == 0):
        import wandb
--- a/train_distillation.py
+++ b/train_distillation.py
@ -209,7 +209,6 @@ if __name__ == "__main__":
    os.makedirs(args.save_dir, exist_ok=True)
    os.makedirs(args.out_dir, exist_ok=True)
    tokens_per_iter = args.batch_size * max_seq_len
-    torch.manual_seed(1337)
    device_type = "cuda" if "cuda" in args.device else "cpu"

    args.wandb_run_name = f"MiniMind-Dist-SFT-Epoch-{args.epochs}-BatchSize-{args.batch_size}-LearningRate-{args.learning_rate}"
@ -217,9 +216,17 @@ if __name__ == "__main__":
    ctx = nullcontext() if device_type == "cpu" else torch.cuda.amp.autocast()
    ddp = int(os.environ.get("RANK", -1)) != -1  # is this a ddp run?
    ddp_local_rank, DEVICE = 0, "cuda:0"
+    base_seed = 1337
+    torch.manual_seed(base_seed)
+    torch.cuda.manual_seed(base_seed)
+
    if ddp:
        init_distributed_mode()
        args.device = torch.device(DEVICE)
+        rank = dist.get_rank()
+        torch.manual_seed(base_seed + rank)
+        # 同时设置 CUDA 的随机种子
+        torch.cuda.manual_seed(base_seed + rank)

    if args.use_wandb and (not ddp or ddp_local_rank == 0):
        import wandb
--- a/train_dpo.py
+++ b/train_dpo.py
@ -195,7 +195,6 @@ if __name__ == "__main__":
    os.makedirs(args.save_dir, exist_ok=True)
    os.makedirs(args.out_dir, exist_ok=True)
    tokens_per_iter = args.batch_size * lm_config.max_seq_len
-    torch.manual_seed(1337)
    device_type = "cuda" if "cuda" in args.device else "cpu"

    args.wandb_run_name = f"MiniMind-Full-DPO-Epoch-{args.epochs}-BatchSize-{args.batch_size}-LearningRate-{args.learning_rate}"
@ -203,9 +202,17 @@ if __name__ == "__main__":
    ctx = nullcontext() if device_type == "cpu" else torch.cuda.amp.autocast()
    ddp = int(os.environ.get("RANK", -1)) != -1  # is this a ddp run?
    ddp_local_rank, DEVICE = 0, "cuda:0"
+    base_seed = 1337
+    torch.manual_seed(base_seed)
+    torch.cuda.manual_seed(base_seed)
+
    if ddp:
        init_distributed_mode()
        args.device = torch.device(DEVICE)
+        rank = dist.get_rank()
+        torch.manual_seed(base_seed + rank)
+        # 同时设置 CUDA 的随机种子
+        torch.cuda.manual_seed(base_seed + rank)

    if args.use_wandb and (not ddp or ddp_local_rank == 0):
        import wandb
--- a/train_full_sft.py
+++ b/train_full_sft.py
@ -150,7 +150,6 @@ if __name__ == "__main__":
    os.makedirs(args.save_dir, exist_ok=True)
    os.makedirs(args.out_dir, exist_ok=True)
    tokens_per_iter = args.batch_size * lm_config.max_seq_len
-    torch.manual_seed(1337)
    device_type = "cuda" if "cuda" in args.device else "cpu"

    args.wandb_run_name = f"MiniMind-Full-SFT-Epoch-{args.epochs}-BatchSize-{args.batch_size}-LearningRate-{args.learning_rate}"
@ -158,9 +157,17 @@ if __name__ == "__main__":
    ctx = nullcontext() if device_type == "cpu" else torch.cuda.amp.autocast()
    ddp = int(os.environ.get("RANK", -1)) != -1  # is this a ddp run?
    ddp_local_rank, DEVICE = 0, "cuda:0"
+    base_seed = 1337
+    torch.manual_seed(base_seed)
+    torch.cuda.manual_seed(base_seed)
+
    if ddp:
        init_distributed_mode()
        args.device = torch.device(DEVICE)
+        rank = dist.get_rank()
+        torch.manual_seed(base_seed + rank)
+        # 同时设置 CUDA 的随机种子
+        torch.cuda.manual_seed(base_seed + rank)

    if args.use_wandb and (not ddp or ddp_local_rank == 0):
        import wandb
--- a/train_lora.py
+++ b/train_lora.py
@ -137,15 +137,22 @@ if __name__ == "__main__":
    os.makedirs(args.save_dir, exist_ok=True)
    os.makedirs(args.out_dir, exist_ok=True)
    tokens_per_iter = args.batch_size * lm_config.max_seq_len
-    torch.manual_seed(1337)
    device_type = "cuda" if "cuda" in args.device else "cpu"

    ctx = nullcontext() if device_type == "cpu" else torch.cuda.amp.autocast()
    ddp = int(os.environ.get("RANK", -1)) != -1  # is this a ddp run?
    ddp_local_rank, DEVICE = 0, "cuda:0"
+    base_seed = 1337
+    torch.manual_seed(base_seed)
+    torch.cuda.manual_seed(base_seed)
+
    if ddp:
        init_distributed_mode()
        args.device = torch.device(DEVICE)
+        rank = dist.get_rank()
+        torch.manual_seed(base_seed + rank)
+        # 同时设置 CUDA 的随机种子
+        torch.cuda.manual_seed(base_seed + rank)

    args.wandb_run_name = f"MiniMind-Lora-SFT-Epoch-{args.epochs}-BatchSize-{args.batch_size}-LearningRate-{args.learning_rate}"
    if args.use_wandb and (not ddp or ddp_local_rank == 0):
--- a/train_pretrain.py
+++ b/train_pretrain.py
@ -146,7 +146,6 @@ if __name__ == "__main__":
    os.makedirs(args.save_dir, exist_ok=True)
    os.makedirs(args.out_dir, exist_ok=True)
    tokens_per_iter = args.batch_size * lm_config.max_seq_len
-    torch.manual_seed(1337)
    device_type = "cuda" if "cuda" in args.device else "cpu"

    args.wandb_run_name = f"MiniMind-Pretrain-Epoch-{args.epochs}-BatchSize-{args.batch_size}-LearningRate-{args.learning_rate}"
@ -156,9 +155,17 @@ if __name__ == "__main__":
    ddp = int(os.environ.get("RANK", -1)) != -1  # is this a ddp run?
    ddp_local_rank, DEVICE = 0, "cuda:0"

+    base_seed = 1337
+    torch.manual_seed(base_seed)
+    torch.cuda.manual_seed(base_seed)
+
    if ddp:
        init_distributed_mode()
        args.device = torch.device(DEVICE)
+        rank = dist.get_rank()
+        torch.manual_seed(base_seed + rank)
+        # 同时设置 CUDA 的随机种子
+        torch.cuda.manual_seed(base_seed + rank)

    if args.use_wandb and (not ddp or ddp_local_rank == 0):
        import wandb