Compare commits
5 Commits
bed6faa379
...
da5ac6a5c0
Author | SHA1 | Date | |
---|---|---|---|
da5ac6a5c0 | |||
8dd7cfaf72 | |||
cb286d26d1 | |||
0c8c6e5d1a | |||
b6bd97aaaa |
@ -20,6 +20,10 @@ class LMConfig(PretrainedConfig):
|
|||||||
dropout: float = 0.0,
|
dropout: float = 0.0,
|
||||||
flash_attn: bool = True,
|
flash_attn: bool = True,
|
||||||
####################################################
|
####################################################
|
||||||
|
# DB related configurations
|
||||||
|
####################################################
|
||||||
|
disable_db: bool = False, # 特殊模式:禁用数据库功能
|
||||||
|
####################################################
|
||||||
# Here are the specific configurations of MOE
|
# Here are the specific configurations of MOE
|
||||||
# When use_moe is false, the following is invalid
|
# When use_moe is false, the following is invalid
|
||||||
####################################################
|
####################################################
|
||||||
@ -47,6 +51,10 @@ class LMConfig(PretrainedConfig):
|
|||||||
self.dropout = dropout
|
self.dropout = dropout
|
||||||
self.flash_attn = flash_attn
|
self.flash_attn = flash_attn
|
||||||
####################################################
|
####################################################
|
||||||
|
# DB related configurations
|
||||||
|
####################################################
|
||||||
|
self.disable_db = disable_db # 设置是否禁用数据库
|
||||||
|
####################################################
|
||||||
# Here are the specific configurations of MOE
|
# Here are the specific configurations of MOE
|
||||||
# When use_moe is false, the following is invalid
|
# When use_moe is false, the following is invalid
|
||||||
####################################################
|
####################################################
|
||||||
|
@ -94,7 +94,7 @@ class Attention(nn.Module):
|
|||||||
x: torch.Tensor,
|
x: torch.Tensor,
|
||||||
pos_cis: torch.Tensor,
|
pos_cis: torch.Tensor,
|
||||||
past_key_value: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
|
past_key_value: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
|
||||||
use_cache=False,
|
use_cache=True,
|
||||||
db_value=None):
|
db_value=None):
|
||||||
bsz, seq_len, _ = x.shape #bsz: 批量大小, seq_len: 序列长度, _: 隐藏维度
|
bsz, seq_len, _ = x.shape #bsz: 批量大小, seq_len: 序列长度, _: 隐藏维度
|
||||||
xq, xk, xv = self.wq(x), self.wk(x), self.wv(x) #将输入张量x分别通过线性层wq, wk, wv进行变换,得到查询、键和值。
|
xq, xk, xv = self.wq(x), self.wk(x), self.wv(x) #将输入张量x分别通过线性层wq, wk, wv进行变换,得到查询、键和值。
|
||||||
@ -174,12 +174,12 @@ class CrossAttention(nn.Module):
|
|||||||
super().__init__()
|
super().__init__()
|
||||||
self.config = config
|
self.config = config
|
||||||
self.num_heads = 8
|
self.num_heads = 8
|
||||||
self.head_dim = 768 // self.num_heads
|
self.head_dim = self.config.dim // self.num_heads
|
||||||
self.to_q = nn.Linear(768, 768, bias=False)
|
self.to_q = nn.Linear(self.config.dim, self.config.dim, bias=False)
|
||||||
self.to_k = nn.Linear(768, 768, bias=False)
|
self.to_k = nn.Linear(self.config.dim, self.config.dim, bias=False)
|
||||||
self.to_v = nn.Linear(768, 768, bias=False)
|
self.to_v = nn.Linear(self.config.dim, self.config.dim, bias=False)
|
||||||
|
|
||||||
self.to_out = nn.Linear(768, 768, bias=False)
|
self.to_out = nn.Linear(self.config.dim, self.config.dim, bias=False)
|
||||||
|
|
||||||
def forward(self, x, db, context_mask=None, pos_emb=None):
|
def forward(self, x, db, context_mask=None, pos_emb=None):
|
||||||
batch_size = x.size(0)
|
batch_size = x.size(0)
|
||||||
@ -205,7 +205,7 @@ class CrossAttention(nn.Module):
|
|||||||
|
|
||||||
context = torch.matmul(attn_weights, v)
|
context = torch.matmul(attn_weights, v)
|
||||||
|
|
||||||
context = context.transpose(1, 2).contiguous().view(batch_size, -1, 768)
|
context = context.transpose(1, 2).contiguous().view(batch_size, -1, self.config.dim)
|
||||||
|
|
||||||
context = self.to_out(context)
|
context = self.to_out(context)
|
||||||
|
|
||||||
@ -373,7 +373,7 @@ class MiniMindBlock(nn.Module):
|
|||||||
# self.product_key_topk = min(16, self.num_keys) # 确保不超过num_keys
|
# self.product_key_topk = min(16, self.num_keys) # 确保不超过num_keys
|
||||||
# self.num_experts_per_head_topk = 1 # 最终每个头选取的专家数
|
# self.num_experts_per_head_topk = 1 # 最终每个头选取的专家数
|
||||||
|
|
||||||
def forward(self, x,db_value, pos_cis, past_key_value=None, use_cache=False):
|
def forward(self, x, db_value, pos_cis, past_key_value=None, use_cache=True):
|
||||||
# import pdb;pdb.set_trace()
|
# import pdb;pdb.set_trace()
|
||||||
# db_value = None
|
# db_value = None
|
||||||
|
|
||||||
@ -426,7 +426,7 @@ class MiniMindBlock(nn.Module):
|
|||||||
db_value=db_value
|
db_value=db_value
|
||||||
)
|
)
|
||||||
|
|
||||||
h_attn = self.cross_att(h_attn,db_value)
|
h_attn = self.cross_att(h_attn, db_value)
|
||||||
|
|
||||||
# 残差连接
|
# 残差连接
|
||||||
h = x + h_attn
|
h = x + h_attn
|
||||||
@ -523,19 +523,36 @@ class MiniMindLM(PreTrainedModel):
|
|||||||
self.norm = RMSNorm(params.dim, eps=params.norm_eps)
|
self.norm = RMSNorm(params.dim, eps=params.norm_eps)
|
||||||
self.output = nn.Linear(params.dim, params.vocab_size, bias=False)
|
self.output = nn.Linear(params.dim, params.vocab_size, bias=False)
|
||||||
self.tok_embeddings.weight = self.output.weight
|
self.tok_embeddings.weight = self.output.weight
|
||||||
self.downsample_v = nn.Sequential(
|
|
||||||
nn.Conv1d(511*8,128*8,kernel_size=1,padding='same'),
|
# Calculate input dimension
|
||||||
nn.Conv1d(128*8,128,kernel_size=1,padding='same'),
|
input_dim = (self.params.max_seq_len-1)*self.params.n_layers
|
||||||
nn.Conv1d(128,8,kernel_size=1,padding='same')
|
# Use a bottleneck architecture to reduce parameters
|
||||||
|
bottleneck_dim = 256 # Significantly smaller bottleneck dimension
|
||||||
|
|
||||||
|
# Factorized shared downsampling using two smaller convolutions
|
||||||
|
self.shared_downsample = nn.Sequential(
|
||||||
|
# First reduce input dimension to bottleneck
|
||||||
|
nn.Conv1d(input_dim, bottleneck_dim, kernel_size=1, padding='same'),
|
||||||
|
nn.ReLU(), # Non-linearity to improve representation capacity
|
||||||
|
# Then expand to target dimension
|
||||||
|
nn.Conv1d(bottleneck_dim, 128*8, kernel_size=1, padding='same')
|
||||||
)
|
)
|
||||||
self.downsample_q = nn.Sequential(
|
|
||||||
nn.Conv1d(511*8,128*8,kernel_size=1,padding='same'),
|
# Specific layers for v path
|
||||||
nn.Conv1d(128*8,512,kernel_size=1,padding='same')
|
self.downsample_v_specific = nn.Sequential(
|
||||||
|
nn.Conv1d(128*8, 128, kernel_size=1, padding='same'),
|
||||||
|
nn.Conv1d(128, 8, kernel_size=1, padding='same')
|
||||||
|
)
|
||||||
|
|
||||||
|
# Specific layers for q path
|
||||||
|
self.downsample_q_specific = nn.Sequential(
|
||||||
|
nn.Conv1d(128*8, 512, kernel_size=1, padding='same')
|
||||||
)
|
)
|
||||||
self.register_buffer("pos_cis",
|
self.register_buffer("pos_cis",
|
||||||
precompute_pos_cis(dim=params.dim // params.n_heads, theta=params.rope_theta),
|
precompute_pos_cis(dim=params.dim // params.n_heads, theta=params.rope_theta),
|
||||||
persistent=False)
|
persistent=False)
|
||||||
self.OUT = CausalLMOutputWithPast()
|
self.OUT = CausalLMOutputWithPast()
|
||||||
|
self.params = params
|
||||||
|
|
||||||
def forward(self,
|
def forward(self,
|
||||||
input_ids: Optional[torch.Tensor] = None,
|
input_ids: Optional[torch.Tensor] = None,
|
||||||
@ -551,10 +568,19 @@ class MiniMindLM(PreTrainedModel):
|
|||||||
h_list = []
|
h_list = []
|
||||||
|
|
||||||
for l, layer in enumerate(self.layers):
|
for l, layer in enumerate(self.layers):
|
||||||
|
# 禁用数据库模式,使用固定值替代数据库查询
|
||||||
|
if self.params.disable_db:
|
||||||
|
# 创建一个形状为[batch_size, n_layers, dim]的tensor,所有元素值为1e-4
|
||||||
|
batch_size = h.size(0)
|
||||||
|
db_value = torch.full((batch_size, self.n_layers, self.params.dim), 1e-4,
|
||||||
|
dtype=h.dtype, device=h.device)
|
||||||
|
else:
|
||||||
|
# 正常模式,使用数据库查询
|
||||||
index = self.extract_db.q_to_k(h)
|
index = self.extract_db.q_to_k(h)
|
||||||
db_value = self.extract_db.get_data(index)
|
db_value = self.extract_db.get_data(index)
|
||||||
|
|
||||||
h, past_kv = layer(
|
h, past_kv = layer(
|
||||||
h,db_value, pos_cis,
|
h, db_value, pos_cis,
|
||||||
past_key_value=past_key_values[l],
|
past_key_value=past_key_values[l],
|
||||||
use_cache=use_cache
|
use_cache=use_cache
|
||||||
)
|
)
|
||||||
@ -562,17 +588,22 @@ class MiniMindLM(PreTrainedModel):
|
|||||||
past_kvs.append(past_kv)
|
past_kvs.append(past_kv)
|
||||||
h_list.append(h.unsqueeze(0))
|
h_list.append(h.unsqueeze(0))
|
||||||
|
|
||||||
|
h_tensor = torch.cat(h_list, dim=0).permute(1, 0, 2, 3)
|
||||||
|
|
||||||
|
# 只在非禁用数据库模式下执行数据库更新逻辑
|
||||||
|
if not self.params.disable_db:
|
||||||
# 使用detach()分离计算图,避免多次反向传播
|
# 使用detach()分离计算图,避免多次反向传播
|
||||||
h_tensor = torch.cat(h_list,dim=0).permute(1,0,2,3)
|
|
||||||
h_tensor_detached = h_tensor.detach()
|
h_tensor_detached = h_tensor.detach()
|
||||||
h_tensor_detached = h_tensor_detached.reshape(h_tensor_detached.shape[0],-1,768)
|
h_tensor_detached = h_tensor_detached.reshape(h_tensor_detached.shape[0], -1, self.params.dim)
|
||||||
|
|
||||||
# 数据库更新逻辑与主计算图分离
|
# 数据库更新逻辑与主计算图分离
|
||||||
with torch.no_grad():
|
with torch.no_grad():
|
||||||
z_v = self.downsample_v(h_tensor_detached)
|
# Compute shared downsampling layer once
|
||||||
z_q = self.downsample_q(h_tensor_detached)
|
shared_features = self.shared_downsample(h_tensor_detached)
|
||||||
|
z_v = self.downsample_v_specific(shared_features)
|
||||||
|
z_q = self.downsample_q_specific(shared_features)
|
||||||
z_k = self.extract_db.q_to_k(z_q)
|
z_k = self.extract_db.q_to_k(z_q)
|
||||||
self.extract_db.updata_value(z_k,z_v)
|
self.extract_db.updata_value(z_k, z_v)
|
||||||
|
|
||||||
slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
|
slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
|
||||||
logits = self.output(self.norm(h)[:, slice_indices, :])
|
logits = self.output(self.norm(h)[:, slice_indices, :])
|
||||||
|
@ -197,11 +197,18 @@ if __name__ == "__main__":
|
|||||||
parser.add_argument('--n_layers', default=24, type=int) #层数,用于控制模型层数。
|
parser.add_argument('--n_layers', default=24, type=int) #层数,用于控制模型层数。
|
||||||
parser.add_argument('--max_seq_len', default=1024, type=int) #最大序列长度,用于控制输入序列的最大长度。
|
parser.add_argument('--max_seq_len', default=1024, type=int) #最大序列长度,用于控制输入序列的最大长度。
|
||||||
parser.add_argument('--use_moe', default=False, type=bool) #是否使用MOE,用于控制是否使用MOE。
|
parser.add_argument('--use_moe', default=False, type=bool) #是否使用MOE,用于控制是否使用MOE。
|
||||||
|
parser.add_argument('--disable_db', action='store_true', help="禁用数据库功能,使用固定值1e-4替代") #禁用数据库功能,启用特殊模式
|
||||||
parser.add_argument("--data_path", type=str, default="./dataset/pretrain_hq.jsonl") #数据路径,用于控制数据集的路径。
|
parser.add_argument("--data_path", type=str, default="./dataset/pretrain_hq.jsonl") #数据路径,用于控制数据集的路径。
|
||||||
parser.add_argument("--pretrained_embedding_path", type=str, default=None, help="Path to pretrained token embedding weights (.pth file)")
|
parser.add_argument("--pretrained_embedding_path", type=str, default=None, help="Path to pretrained token embedding weights (.pth file)")
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
lm_config = LMConfig(dim=args.dim, n_layers=args.n_layers, max_seq_len=args.max_seq_len, use_moe=args.use_moe) #创建LMConfig对象,用于控制模型配置。
|
lm_config = LMConfig(
|
||||||
|
dim=args.dim,
|
||||||
|
n_layers=args.n_layers,
|
||||||
|
max_seq_len=args.max_seq_len,
|
||||||
|
use_moe=args.use_moe,
|
||||||
|
disable_db=args.disable_db # 添加禁用数据库参数
|
||||||
|
) #创建LMConfig对象,用于控制模型配置。
|
||||||
args.save_dir = os.path.join(args.out_dir) #创建保存目录。
|
args.save_dir = os.path.join(args.out_dir) #创建保存目录。
|
||||||
os.makedirs(args.save_dir, exist_ok=True) #创建保存目录。
|
os.makedirs(args.save_dir, exist_ok=True) #创建保存目录。
|
||||||
os.makedirs(args.out_dir, exist_ok=True) #创建输出目录。
|
os.makedirs(args.out_dir, exist_ok=True) #创建输出目录。
|
||||||
@ -234,10 +241,9 @@ if __name__ == "__main__":
|
|||||||
if args.use_wandb and (not ddp or ddp_local_rank == 0):
|
if args.use_wandb and (not ddp or ddp_local_rank == 0):
|
||||||
import wandb
|
import wandb
|
||||||
|
|
||||||
# Merge args and lm_config into a single config dictionary
|
# Merge args and lm_config parameters for wandb config
|
||||||
config = vars(args)
|
config = vars(args).copy()
|
||||||
for key, value in vars(lm_config).items():
|
config.update(lm_config.__dict__)
|
||||||
config[f"lm_{key}"] = value
|
|
||||||
|
|
||||||
wandb.init(project=args.wandb_project, name=args.wandb_run_name, config=config)
|
wandb.init(project=args.wandb_project, name=args.wandb_run_name, config=config)
|
||||||
else:
|
else:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user