update minimind-v1
This commit is contained in:
parent
1752b85535
commit
90e8601c30
17
2-eval.py
17
2-eval.py
@ -12,12 +12,11 @@ warnings.filterwarnings('ignore')
|
|||||||
|
|
||||||
|
|
||||||
def count_parameters(model):
|
def count_parameters(model):
|
||||||
return sum(p.numel() for p in model.parameters())
|
return sum(p.numel() for p in model.parameters() if p.requires_grad)
|
||||||
|
|
||||||
|
|
||||||
def init_model(lm_config):
|
def init_model(lm_config):
|
||||||
tokenizer = AutoTokenizer.from_pretrained('./model',
|
tokenizer = AutoTokenizer.from_pretrained('./model/minimind_tokenizer')
|
||||||
trust_remote_code=True, use_fast=False)
|
|
||||||
model_from = 1 # 1从权重,2用transformers
|
model_from = 1 # 1从权重,2用transformers
|
||||||
|
|
||||||
if model_from == 1:
|
if model_from == 1:
|
||||||
@ -40,10 +39,7 @@ def init_model(lm_config):
|
|||||||
# 加载到模型中
|
# 加载到模型中
|
||||||
model.load_state_dict(state_dict, strict=False)
|
model.load_state_dict(state_dict, strict=False)
|
||||||
else:
|
else:
|
||||||
model = AutoModelForCausalLM.from_pretrained("minimind", trust_remote_code=True)
|
model = AutoModelForCausalLM.from_pretrained('minimind', trust_remote_code=True)
|
||||||
|
|
||||||
tokenizer = AutoTokenizer.from_pretrained('minimind',
|
|
||||||
trust_remote_code=True, use_fast=False)
|
|
||||||
model = model.to(device)
|
model = model.to(device)
|
||||||
|
|
||||||
print(f'模型参数: {count_parameters(model) / 1e6} 百万 = {count_parameters(model) / 1e9} B (Billion)')
|
print(f'模型参数: {count_parameters(model) / 1e6} 百万 = {count_parameters(model) / 1e9} B (Billion)')
|
||||||
@ -65,7 +61,7 @@ if __name__ == "__main__":
|
|||||||
out_dir = 'out'
|
out_dir = 'out'
|
||||||
start = ""
|
start = ""
|
||||||
temperature = 0.7
|
temperature = 0.7
|
||||||
top_k = 8
|
top_k = 16
|
||||||
setup_seed(1337)
|
setup_seed(1337)
|
||||||
# device = 'cpu'
|
# device = 'cpu'
|
||||||
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
|
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
|
||||||
@ -89,6 +85,11 @@ if __name__ == "__main__":
|
|||||||
stream = True
|
stream = True
|
||||||
|
|
||||||
prompt_datas = [
|
prompt_datas = [
|
||||||
|
'你叫什么名字啊?',
|
||||||
|
'你叫什么名字?',
|
||||||
|
'中国有哪些比较好的大学?',
|
||||||
|
'全世界最好的大学是什么?',
|
||||||
|
'你知道光速是多少吗?',
|
||||||
'你知道长江吗?',
|
'你知道长江吗?',
|
||||||
'人类的血液主要由哪些成分组成?',
|
'人类的血液主要由哪些成分组成?',
|
||||||
'第一颗人造卫星是哪个国家发射的?',
|
'第一颗人造卫星是哪个国家发射的?',
|
||||||
|
38
README.md
38
README.md
@ -1,6 +1,7 @@
|
|||||||

|

|
||||||
<div align="center">
|
<div align="center">
|
||||||
|
|
||||||
|

|
||||||
[](https://github.com/jingyaogong/minimind/stargazers)
|
[](https://github.com/jingyaogong/minimind/stargazers)
|
||||||
[](LICENSE)
|
[](LICENSE)
|
||||||
[](https://github.com/jingyaogong/minimind/commits/master)
|
[](https://github.com/jingyaogong/minimind/commits/master)
|
||||||
@ -38,14 +39,15 @@
|
|||||||
因此,本项目的目标是把上手LLM的门槛无限降低,
|
因此,本项目的目标是把上手LLM的门槛无限降低,
|
||||||
直接从0开始训练一个极其轻量的语言模型。
|
直接从0开始训练一个极其轻量的语言模型。
|
||||||
|
|
||||||
(截至2024.8.28)MiniMind首发包含4个型号模型,最小仅需26M(0.02B),即可具备Amazing的对话能力!
|
(截至2024.09.01)MiniMind包含5个型号模型,最小仅需26M(0.02B),即可具备Amazing的对话能力!
|
||||||
|
|
||||||
| 模型 (大小) | 速度 (Tokens/s) | 推理占用 | 训练占用(`batch_size=8`) |
|
| 模型 (大小) | 速度 (Tokens/s) | 推理占用 | 训练占用(`batch_size=8`) | release | 主观评分(/100) |
|
||||||
|------------------------|---------------|--------|----------------------|
|
|------------------------|---------------|--------|----------------------|--------------------|------------|
|
||||||
| MiniMind-small-T (26M) | 91.9 | 0.5 GB | 3.6 GB |
|
| MiniMind-small-T (26M) | 91.9 | 0.5 GB | 3.6 GB | 2024.08.28 | 55' |
|
||||||
| MiniMind-small (56M) | 85.2 | 0.7 GB | 4.5 GB |
|
| MiniMind-small (56M) | 85.2 | 0.7 GB | 4.5 GB | 2024.08.28 | 55' |
|
||||||
| MiniMind (218M) | 57.6 | 2.1 GB | 10.4 GB |
|
| MiniMind (218M) | 57.6 | 2.1 GB | 10.4 GB | 2024.08.28 | 75' |
|
||||||
| MiniMind-MoE (166M) | 64.9 | 1.6 GB | 7.4 GB |
|
| MiniMind-MoE (166M) | 64.9 | 1.6 GB | 7.4 GB | 2024.08.28 | 40' |
|
||||||
|
| MiniMind-V1 (108M) | 78.3 | 1.0 GB | 6.4 GB | 2024.09.01 (new🎉) | 80' |
|
||||||
|
|
||||||
> 该分析在一个带有Torch 2.1.2、CUDA 12.2和Flash Attention 2的RTX 3090 GPU上运行。
|
> 该分析在一个带有Torch 2.1.2、CUDA 12.2和Flash Attention 2的RTX 3090 GPU上运行。
|
||||||
|
|
||||||
@ -64,6 +66,8 @@
|
|||||||
👉**最近更新**
|
👉**最近更新**
|
||||||
|
|
||||||
<details close>
|
<details close>
|
||||||
|
<summary> <b>2024-09-01 (new🎉)</b> </summary>
|
||||||
|
- 更新MiniMind-V1 (108M)模型,采用minimind_tokenizer,预训练轮次3 + SFT轮次10,更充分训练,性能更强。
|
||||||
<summary> <b>2024-08-27</b> </summary>
|
<summary> <b>2024-08-27</b> </summary>
|
||||||
- 项目首次开源
|
- 项目首次开源
|
||||||
</details>
|
</details>
|
||||||
@ -179,7 +183,9 @@ python 2-eval.py
|
|||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
- 📙【Pretrain数据】:[seq-monkey通用文本数据集](https://github.com/mobvoi/seq-monkey-data/blob/main/docs/pretrain_open_corpus.md)
|
-
|
||||||
|
|
||||||
|
📙【Pretrain数据】:[seq-monkey通用文本数据集](https://github.com/mobvoi/seq-monkey-data/blob/main/docs/pretrain_open_corpus.md)
|
||||||
是由多种公开来源的数据(如网页、百科、博客、开源代码、书籍等)汇总清洗而成。
|
是由多种公开来源的数据(如网页、百科、博客、开源代码、书籍等)汇总清洗而成。
|
||||||
整理成统一的JSONL格式,并经过了严格的筛选和去重,确保数据的全面性、规模、可信性和高质量。
|
整理成统一的JSONL格式,并经过了严格的筛选和去重,确保数据的全面性、规模、可信性和高质量。
|
||||||
总量大约在10B token,适合中文大语言模型的预训练。
|
总量大约在10B token,适合中文大语言模型的预训练。
|
||||||
@ -252,7 +258,8 @@ MiniMind的整体结构一致,只是在RoPE计算、推理函数和FFN层的
|
|||||||
| minimind-small-T | 26M | 6400 | 8 | 512 | 8 | 16 | - | - |
|
| minimind-small-T | 26M | 6400 | 8 | 512 | 8 | 16 | - | - |
|
||||||
| minimind-small | 56M | 32000 | 8 | 640 | 8 | 16 | - | - |
|
| minimind-small | 56M | 32000 | 8 | 640 | 8 | 16 | - | - |
|
||||||
| minimind | 218M | 32000 | 16 | 1024 | 8 | 16 | - | - |
|
| minimind | 218M | 32000 | 16 | 1024 | 8 | 16 | - | - |
|
||||||
| minimind-MoE | 166M | 32000 | 8 | 640 | 8 | 16 | 2+4 | 2 |
|
| minimind-MoE | 162M | 32000 | 8 | 640 | 8 | 16 | 2+4 | 2 |
|
||||||
|
| minimind-V1 | 108M | 6400 | 16 | 768 | 8 | 16 | - | - |
|
||||||
|
|
||||||
此外作为参考,GPT3的层数和维度参数见下表:
|
此外作为参考,GPT3的层数和维度参数见下表:
|
||||||

|

|
||||||
@ -272,6 +279,7 @@ CPU: Intel(R) Core(TM) i9-10980XE CPU @ 3.00GHz
|
|||||||
| minimind-small | 56M | 32000 | 24 | ≈6 hour (1 epoch) | ≈2 hour (1 epoch) | ≈0.5 hour (1 epoch) |
|
| minimind-small | 56M | 32000 | 24 | ≈6 hour (1 epoch) | ≈2 hour (1 epoch) | ≈0.5 hour (1 epoch) |
|
||||||
| minimind | 218M | 32000 | 16 | ≈15 hour (1 epoch) | ≈5 hour (1 epoch) | ≈1 hour (1 epoch) |
|
| minimind | 218M | 32000 | 16 | ≈15 hour (1 epoch) | ≈5 hour (1 epoch) | ≈1 hour (1 epoch) |
|
||||||
| minimind-MoE | 166M | 32000 | 16 | ≈13 hour (1 epoch) | ≈5 hour (1 epoch) | ≈1 hour (1 epoch) |
|
| minimind-MoE | 166M | 32000 | 16 | ≈13 hour (1 epoch) | ≈5 hour (1 epoch) | ≈1 hour (1 epoch) |
|
||||||
|
| minimind-V1 | 108M | 6400 | 16 | ≈8 hour (1 epoch) | ≈3 hour (1 epoch) | ≈1 hour (1 epoch) |
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
@ -323,6 +331,7 @@ CPU: Intel(R) Core(TM) i9-10980XE CPU @ 3.00GHz
|
|||||||
| minimind-small | 56M | d_model=640<br/>n_layers=8 | [链接](https://pan.baidu.com/s/1nJuOpnu5115FDuz6Ewbeqg?pwd=6666) | [链接](https://pan.baidu.com/s/1lRX0IcpjNFSySioeCfifRQ?pwd=6666) | [链接](https://pan.baidu.com/s/1LzVxBpL0phtGUH267Undqw?pwd=6666) |
|
| minimind-small | 56M | d_model=640<br/>n_layers=8 | [链接](https://pan.baidu.com/s/1nJuOpnu5115FDuz6Ewbeqg?pwd=6666) | [链接](https://pan.baidu.com/s/1lRX0IcpjNFSySioeCfifRQ?pwd=6666) | [链接](https://pan.baidu.com/s/1LzVxBpL0phtGUH267Undqw?pwd=6666) |
|
||||||
| minimind | 218M | d_model=1024<br/>n_layers=16 | [链接](https://pan.baidu.com/s/1jzA7uLEi-Jen2fW5olCmEg?pwd=6666) | [链接](https://pan.baidu.com/s/1Hvt0Q_UB_uW2sWTw6w1zRQ?pwd=6666) | [链接](https://pan.baidu.com/s/1fau9eat3lXilnrG3XNhG5Q?pwd=6666) |
|
| minimind | 218M | d_model=1024<br/>n_layers=16 | [链接](https://pan.baidu.com/s/1jzA7uLEi-Jen2fW5olCmEg?pwd=6666) | [链接](https://pan.baidu.com/s/1Hvt0Q_UB_uW2sWTw6w1zRQ?pwd=6666) | [链接](https://pan.baidu.com/s/1fau9eat3lXilnrG3XNhG5Q?pwd=6666) |
|
||||||
| minimind-MoE | 166M | d_model=1024<br/>n_layers=8<br/>share+route=2+4 | [链接](https://pan.baidu.com/s/11CneDVTkw2Y6lNilQX5bWw?pwd=6666) | [链接](https://pan.baidu.com/s/1fRq4MHZec3z-oLK6sCzj_A?pwd=6666) | [链接](https://pan.baidu.com/s/1HC2KSM_-RHRtgv7ZDkKI9Q?pwd=6666) |
|
| minimind-MoE | 166M | d_model=1024<br/>n_layers=8<br/>share+route=2+4 | [链接](https://pan.baidu.com/s/11CneDVTkw2Y6lNilQX5bWw?pwd=6666) | [链接](https://pan.baidu.com/s/1fRq4MHZec3z-oLK6sCzj_A?pwd=6666) | [链接](https://pan.baidu.com/s/1HC2KSM_-RHRtgv7ZDkKI9Q?pwd=6666) |
|
||||||
|
| minimind-V1 | 108M | d_model=768<br/>n_layers=16 | - | [链接](https://pan.baidu.com/s/1p713loS7EfwHQf3G9eYI3Q?pwd=6666) | [链接](https://pan.baidu.com/s/12iHGpAs6R0kqsOnGtgK6vQ?pwd=6666) |
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
@ -349,6 +358,8 @@ MobileLLM提出架构的深度比宽度更重要,「深而窄」的「瘦长
|
|||||||
|
|
||||||
# 📌 Eval
|
# 📌 Eval
|
||||||
|
|
||||||
|
> 【注】以下测试于2024.8.28完成,此日期后发布的(例如MiniMind-V1)新模型,无特殊需要时将不加入测试。
|
||||||
|
|
||||||
[A] [minimind-small-T(0.02B)](https://pan.baidu.com/s/1_COe0FQRDmeapSsvArahCA?pwd=6666)<br/>
|
[A] [minimind-small-T(0.02B)](https://pan.baidu.com/s/1_COe0FQRDmeapSsvArahCA?pwd=6666)<br/>
|
||||||
[B] [minimind-small(0.05B)](https://pan.baidu.com/s/1lRX0IcpjNFSySioeCfifRQ?pwd=6666)<br/>
|
[B] [minimind-small(0.05B)](https://pan.baidu.com/s/1lRX0IcpjNFSySioeCfifRQ?pwd=6666)<br/>
|
||||||
[C] [minimind-MoE(0.16B)](https://pan.baidu.com/s/1fRq4MHZec3z-oLK6sCzj_A?pwd=6666)<br/>
|
[C] [minimind-MoE(0.16B)](https://pan.baidu.com/s/1fRq4MHZec3z-oLK6sCzj_A?pwd=6666)<br/>
|
||||||
@ -498,7 +509,9 @@ MobileLLM提出架构的深度比宽度更重要,「深而窄」的「瘦长
|
|||||||
* minimind-MoE(0.16B)表现很差,甚至不如它同配置的dense模型minimind(0.05B)
|
* minimind-MoE(0.16B)表现很差,甚至不如它同配置的dense模型minimind(0.05B)
|
||||||
,其实这并非MoE的锅。同样是因为偷懒提前kill腾出资源给小模型,但是MoE模型多专家模式需要的训练轮次本来就需要酌情更高,在epochs设置为2时训练的极其不充分。minimind不久前实验阶段在Yi
|
,其实这并非MoE的锅。同样是因为偷懒提前kill腾出资源给小模型,但是MoE模型多专家模式需要的训练轮次本来就需要酌情更高,在epochs设置为2时训练的极其不充分。minimind不久前实验阶段在Yi
|
||||||
tokenizer上试验过MoE的充分训练版本,可以做到比dense表现肉眼可见的好。现在先这样了hh,日后腾出服务器再训练更新v2 v3版本。
|
tokenizer上试验过MoE的充分训练版本,可以做到比dense表现肉眼可见的好。现在先这样了hh,日后腾出服务器再训练更新v2 v3版本。
|
||||||
* F模型的回答看起来是这里最完美的,尽管存在些许幻觉瞎编的情况。但GPT-4o和kimi的评分都一致认为它“信息过度冗长,且有重复内容,存在幻觉”。其实这种评价太严格了,100个字中有10个字是幻觉,就很容易把它归到0分。由于F模型训练文本默认长度更长,数据集大得多,所以回答的看起来很完备,在体积近似的情况下,数据比模型更重要得多。
|
*
|
||||||
|
|
||||||
|
F模型的回答看起来是这里最完美的,尽管存在些许幻觉瞎编的情况。但GPT-4o和kimi的评分都一致认为它“信息过度冗长,且有重复内容,存在幻觉”。其实这种评价太严格了,100个字中有10个字是幻觉,就很容易把它归到0分。由于F模型训练文本默认长度更长,数据集大得多,所以回答的看起来很完备,在体积近似的情况下,数据比模型更重要得多。
|
||||||
|
|
||||||
> 🙋♂️个人主观评价:F>D>A≈B>C>E
|
> 🙋♂️个人主观评价:F>D>A≈B>C>E
|
||||||
|
|
||||||
@ -516,7 +529,7 @@ minimind模型本身没有使用较大的数据集训练,也没有针对回答
|
|||||||
> 例如minimind-small的结果细项:
|
> 例如minimind-small的结果细项:
|
||||||
|
|
||||||
| 类别 | 正确数量/总题数 | 正确率 |
|
| 类别 | 正确数量/总题数 | 正确率 |
|
||||||
|---------------------------------|----------------|------------|
|
|----------------------------------------------|----------|--------|
|
||||||
| probability_and_statistics_val | 3/18 | 16.67% |
|
| probability_and_statistics_val | 3/18 | 16.67% |
|
||||||
| law_val | 5/24 | 20.83% |
|
| law_val | 5/24 | 20.83% |
|
||||||
| middle_school_biology_val | 4/21 | 19.05% |
|
| middle_school_biology_val | 4/21 | 19.05% |
|
||||||
@ -617,6 +630,7 @@ minimind模型本身没有使用较大的数据集训练,也没有针对回答
|
|||||||
|
|
||||||
* [./export_model.py](./export_model.py)可以导出模型到transformers格式,推送到huggingface
|
* [./export_model.py](./export_model.py)可以导出模型到transformers格式,推送到huggingface
|
||||||
*
|
*
|
||||||
|
|
||||||
MiniMind的huggingface集合地址:[MiniMind](https://huggingface.co/collections/jingyaogong/minimind-66caf8d999f5c7fa64f399e5)
|
MiniMind的huggingface集合地址:[MiniMind](https://huggingface.co/collections/jingyaogong/minimind-66caf8d999f5c7fa64f399e5)
|
||||||
|
|
||||||
---
|
---
|
||||||
@ -682,6 +696,7 @@ MiniMind的huggingface集合地址:[MiniMind](https://huggingface.co/collectio
|
|||||||
* [Zero-Chatgpt](https://github.com/AI-Study-Han/Zero-Chatgpt/tree/main)
|
* [Zero-Chatgpt](https://github.com/AI-Study-Han/Zero-Chatgpt/tree/main)
|
||||||
|
|
||||||
## ✨Top contributors
|
## ✨Top contributors
|
||||||
|
|
||||||
<a href="https://github.com/jingyaogong/minimind/graphs/contributors">
|
<a href="https://github.com/jingyaogong/minimind/graphs/contributors">
|
||||||
<img src="https://contrib.rocks/image?repo=jingyaogong/minimind" />
|
<img src="https://contrib.rocks/image?repo=jingyaogong/minimind" />
|
||||||
</a>
|
</a>
|
||||||
@ -690,7 +705,6 @@ MiniMind的huggingface集合地址:[MiniMind](https://huggingface.co/collectio
|
|||||||
|
|
||||||
本项目不承担开源模型和代码导致的数据安全、舆情风险或发生任何模型被误导、滥用、传播、不当利用而产生的风险和责任。
|
本项目不承担开源模型和代码导致的数据安全、舆情风险或发生任何模型被误导、滥用、传播、不当利用而产生的风险和责任。
|
||||||
|
|
||||||
|
|
||||||
## License
|
## License
|
||||||
|
|
||||||
This repository is licensed under the [Apache-2.0 License](LICENSE).
|
This repository is licensed under the [Apache-2.0 License](LICENSE).
|
@ -1,6 +1,7 @@
|
|||||||

|

|
||||||
<div align="center">
|
<div align="center">
|
||||||
|
|
||||||
|

|
||||||
[](https://github.com/jingyaogong/minimind/stargazers)
|
[](https://github.com/jingyaogong/minimind/stargazers)
|
||||||
[](LICENSE)
|
[](LICENSE)
|
||||||
[](https://github.com/jingyaogong/minimind/commits/master)
|
[](https://github.com/jingyaogong/minimind/commits/master)
|
||||||
|
@ -10,7 +10,7 @@ conversation_history_origin = []
|
|||||||
conversation_history = conversation_history_origin.copy()
|
conversation_history = conversation_history_origin.copy()
|
||||||
while True:
|
while True:
|
||||||
conversation_history = conversation_history_origin.copy()
|
conversation_history = conversation_history_origin.copy()
|
||||||
query = input('Enter Your Q: ')
|
query = input('[Q]:')
|
||||||
|
|
||||||
# 将用户的问题添加到对话历史中
|
# 将用户的问题添加到对话历史中
|
||||||
conversation_history.append({"role": "user", "content": query})
|
conversation_history.append({"role": "user", "content": query})
|
||||||
@ -22,7 +22,7 @@ while True:
|
|||||||
stream=True
|
stream=True
|
||||||
)
|
)
|
||||||
|
|
||||||
print('minimind: ', end='')
|
print('[A]: ', end='')
|
||||||
assistant_res = ''
|
assistant_res = ''
|
||||||
for chunk in stream:
|
for chunk in stream:
|
||||||
# 将生成的回复实时打印出来
|
# 将生成的回复实时打印出来
|
||||||
|
Loading…
x
Reference in New Issue
Block a user