Compare commits
2 Commits
6590cef358
...
57138a5a79
| Author | SHA1 | Date | |
|---|---|---|---|
| 57138a5a79 | |||
| 1ca2f10e65 |
@ -22,15 +22,6 @@
|
|||||||
"bf16": {
|
"bf16": {
|
||||||
"enabled": true
|
"enabled": true
|
||||||
},
|
},
|
||||||
"optimizer": {
|
|
||||||
"type": "AdamW",
|
|
||||||
"params": {
|
|
||||||
"lr": "auto",
|
|
||||||
"betas": "auto",
|
|
||||||
"eps": "auto",
|
|
||||||
"weight_decay": "auto"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"scheduler": {
|
"scheduler": {
|
||||||
"type": "WarmupLR",
|
"type": "WarmupLR",
|
||||||
"params": {
|
"params": {
|
||||||
|
|||||||
@ -577,7 +577,7 @@ class MiniMindLM(PreTrainedModel):
|
|||||||
config_class = LMConfig
|
config_class = LMConfig
|
||||||
|
|
||||||
def __init__(self, params: LMConfig = None):
|
def __init__(self, params: LMConfig = None):
|
||||||
self.params = params or LMConfig()
|
self.params = params
|
||||||
super().__init__(self.params)
|
super().__init__(self.params)
|
||||||
self.vocab_size, self.n_layers = params.vocab_size, params.n_layers
|
self.vocab_size, self.n_layers = params.vocab_size, params.n_layers
|
||||||
self.tok_embeddings = nn.Embedding(params.vocab_size, params.dim)
|
self.tok_embeddings = nn.Embedding(params.vocab_size, params.dim)
|
||||||
|
|||||||
@ -51,14 +51,14 @@ MAIN_PROCESS_PORT="29500"
|
|||||||
MODEL_TYPE="model_memory" # 🔥 使用Token-based Memory模型
|
MODEL_TYPE="model_memory" # 🔥 使用Token-based Memory模型
|
||||||
MODEL_SIZE="50.0"
|
MODEL_SIZE="50.0"
|
||||||
DIM="512"
|
DIM="512"
|
||||||
N_LAYERS="16"
|
N_LAYERS="8"
|
||||||
N_HEADS="32"
|
N_HEADS="16"
|
||||||
MAX_SEQ_LEN="512"
|
MAX_SEQ_LEN="512"
|
||||||
USE_MOE="false"
|
USE_MOE="false"
|
||||||
|
|
||||||
# 🔥 知识库配置(四损失系统优化)
|
# 🔥 知识库配置(四损失系统优化)
|
||||||
KNOWLEDGE_NUM="1048576" # 1M entries
|
KNOWLEDGE_NUM="1048576" # 1M entries
|
||||||
KNOWLEDGE_LENGTH="8" # 🔥 增加到32个token提升表达能力
|
KNOWLEDGE_LENGTH="16" # 🔥 增加到16个token提升表达能力
|
||||||
KNOWLEDGE_DIM="128" # 保留兼容性
|
KNOWLEDGE_DIM="128" # 保留兼容性
|
||||||
DISABLE_DB="false"
|
DISABLE_DB="false"
|
||||||
|
|
||||||
@ -67,7 +67,7 @@ DISABLE_DB="false"
|
|||||||
# ----------------------------------------------------------------------------
|
# ----------------------------------------------------------------------------
|
||||||
EPOCHS="3"
|
EPOCHS="3"
|
||||||
EMBEDDING_EPOCH="2"
|
EMBEDDING_EPOCH="2"
|
||||||
BATCH_SIZE="42" # 🔥 降低批次大小以适应更复杂的计算
|
BATCH_SIZE="64" # 🔥 降低批次大小以适应更复杂的计算
|
||||||
ACCUMULATION_STEPS="8" # 🔥 增加累积步数保持有效批次大小
|
ACCUMULATION_STEPS="8" # 🔥 增加累积步数保持有效批次大小
|
||||||
LEARNING_RATE="2e-4" # 🔥 适度降低学习率提升稳定性
|
LEARNING_RATE="2e-4" # 🔥 适度降低学习率提升稳定性
|
||||||
DTYPE="bfloat16"
|
DTYPE="bfloat16"
|
||||||
@ -76,12 +76,14 @@ WARMUP_ITERS="0"
|
|||||||
|
|
||||||
# 🔥 四损失系统配置
|
# 🔥 四损失系统配置
|
||||||
BALANCE_LOSS_COEF="0.01" # 平衡损失系数
|
BALANCE_LOSS_COEF="0.01" # 平衡损失系数
|
||||||
SIMILARITY_LOSS_COEF="0.15" # 🔥 相似度损失系数(核心损失)
|
SIMILARITY_LOSS_COEF="0.8" # 🔥 相似度损失系数(核心损失)
|
||||||
DIVERSITY_LOSS_COEF="0.08" # 🔥 多样性损失系数(避免候选重复)
|
DIVERSITY_LOSS_COEF="0.2" # 🔥 多样性损失系数(避免候选重复)
|
||||||
|
|
||||||
# 数据和缓存路径
|
# 数据和缓存路径
|
||||||
DATA_PATH="dataset/stable/merged_pretrain.jsonl"
|
DATA_PATH="dataset/stable/merged_pretrain.jsonl"
|
||||||
DATABASE_INIT_PATH="dataset/stable/sentence_trex_data.json"
|
DATABASE_INIT_PATH="dataset/stable/sentence_trex_data.json"
|
||||||
|
DATA_PATH="dataset/stable/merged_pretrain.jsonl"
|
||||||
|
DATABASE_INIT_PATH="dataset/stable/sentence_trex_data.json"
|
||||||
CLUSTER_CACHE_PATH="None" # 禁用聚类缓存
|
CLUSTER_CACHE_PATH="None" # 禁用聚类缓存
|
||||||
VAL_DATA_PATH="dataset/stable/eval_data.json"
|
VAL_DATA_PATH="dataset/stable/eval_data.json"
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user