Minimind/main.py
2025-09-06 16:16:12 +08:00

84 lines
2.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import logging
import langextract as lx
import os
# NOTE(review): this credential is committed to source control; rotate it and
# supply LANGEXTRACT_API_KEY from the environment or a secret store instead.
# setdefault keeps the script runnable as-is, while a key that is already
# exported in the environment now takes precedence over this fallback.
os.environ.setdefault(
    "LANGEXTRACT_API_KEY",
    "gpustack_d402860477878812_9ec494a501497d25b565987754f4db8c",
)

# Optional: enable INFO-level logging to make debugging easier.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Input text to run the extraction on. The leading and trailing newlines are
# part of the value and are kept explicitly.
input_text = (
    "\n"
    "这篇文章介绍了张华他今年32岁是一位经验丰富的工程师。\n"
    "他的同事李明今年28岁是一位充满活力的设计师。\n"
    "他们一起在一家科技公司工作。\n"
)
# Few-shot example (used for prompt alignment): one sample sentence paired
# with the "person" extraction, and its attributes, expected from it.
_sample_person = lx.data.Extraction(
    extraction_class="person",
    extraction_text="王小明",
    attributes={"age": "25", "occupation": "软件开发者"},
)
examples = [
    lx.data.ExampleData(
        text="王小明25岁是一名软件开发者。",
        extractions=[_sample_person],
    ),
]
# HTTP API configuration: build a ModelConfig that explicitly selects the
# OpenAI provider so that requests are sent to the vLLM endpoint below.
# NOTE(review): the previous comment here said the URL must NOT end in /v1,
# yet the configured base_url does include it — confirm which form the
# server actually expects.
model_config = lx.factory.ModelConfig(
    model_id="gpt-oss",  # name of the model as deployed on the vLLM server
    provider="OpenAILanguageModel",  # force the OpenAI provider
    provider_kwargs={
        "base_url": "http://192.168.31.127:19090/v1",  # vLLM API endpoint
        # Read the key from LANGEXTRACT_API_KEY (populated near the top of
        # this script) instead of repeating the secret as a second literal.
        "api_key": os.environ["LANGEXTRACT_API_KEY"],
        "model_id": "gpt-oss",  # repeats the model ID given above
    },
)
# General LangExtract settings; unpacked as keyword arguments into
# lx.extract() further down.
extract_config = {
    "config": model_config,
    # Reduced concurrency so the vLLM service is not overloaded.
    "max_workers": 5,
    # NOTE(review): originally annotated as a context length suited to
    # medical papers, although this script feeds a short sample text —
    # presumably a leftover from another project; confirm the value.
    "max_char_buffer": 6000,
    # Single extraction pass to avoid extra API calls.
    "extraction_passes": 1,
    # Low temperature for more consistent output.
    "temperature": 0.1,
    # Expect the model output to be wrapped in a code fence.
    "fence_output": True,
    # vLLM may not support strict schema constraints.
    "use_schema_constraints": False,
    "debug": False,
}
# Run the extraction. Only the lx.extract() call sits inside the try body so
# that the "extract failed" log message is emitted solely for failures of
# that call, not for errors raised while printing the results.
# Alternative seen in an earlier revision: pass model_id / api_key /
# model_url directly to lx.extract() instead of a ModelConfig via `config`.
try:
    result = lx.extract(
        text_or_documents=input_text,
        prompt_description="从文本中提取人物姓名、年龄等信息",
        examples=examples,
        **extract_config,
    )
except Exception:
    # Top-level boundary: record the full traceback, then re-raise so the
    # script still terminates with the original error.
    logger.exception("调用 langextract.extract 失败,详情:")
    raise
else:
    # Print every extraction returned for the input text.
    print("提取结果:")
    # `extractions` may be None when nothing was extracted; treat that the
    # same as an empty list instead of failing with a TypeError.
    for extraction in result.extractions or []:
        print(f"类别: {extraction.extraction_class}")
        print(f"文本: {extraction.extraction_text}")
        print(f"属性: {extraction.attributes}")
        print("---")