main
HuangHai 3 weeks ago
parent 3a9fbad0ff
commit 682cad6910

@ -18,7 +18,7 @@ Office文档解析测试脚本 - RAG-Anything项目
import argparse
import sys
from pathlib import Path
from raganything import RAGAnything
from raganything import RAGAnything, RAGAnythingConfig
def check_libreoffice_installation():
@ -28,6 +28,8 @@ def check_libreoffice_installation():
返回:
bool: 如果LibreOffice可用返回True否则返回False
"""
"""
import subprocess
# 尝试不同的LibreOffice命令名称
@ -55,6 +57,8 @@ def check_libreoffice_installation():
print(" - Ubuntu/Debian: sudo apt-get install libreoffice")
print(" - CentOS/RHEL: sudo yum install libreoffice")
return False
"""
return True
def test_office_document_parsing(file_path: str):
@ -67,6 +71,9 @@ def test_office_document_parsing(file_path: str):
返回:
bool: 解析成功返回True否则返回False
"""
# 在test_office_document_parsing函数中添加
import os
os.environ["LIBREOFFICE_PATH"] = "C:\\Program Files\\LibreOffice\\program"
print(f"🧪 测试Office文档解析: {file_path}")
# 检查文件是否存在且是支持的Office格式
@ -92,15 +99,43 @@ def test_office_document_parsing(file_path: str):
rag = RAGAnything(config=config)
try:
# 添加MinerU安装检查
from raganything.mineru_parser import MineruParser
if not MineruParser.check_installation():
print("❌ MinerU未正确安装")
return False
# 确保output_dir已定义
output_dir = "./test_output"
Path(output_dir).mkdir(exist_ok=True)
# 添加PDF转换检查
pdf_path = Path(output_dir) / f"{file_path.stem}.pdf"
print(f"PDF转换路径: {pdf_path}")
if pdf_path.exists():
print(f"✅ PDF已生成大小: {pdf_path.stat().st_size}字节")
else:
print("❌ PDF转换失败")
# 使用MinerU测试文档解析
print("\n🔄 使用MinerU测试文档解析...")
# 使用绝对路径确保输出目录位置明确
output_dir = Path("d:/dsWork/dsProject/dsRagAnything/Tools/test_output").absolute()
output_dir.mkdir(exist_ok=True)
content_list, md_content = rag.parse_document(
file_path=str(file_path),
output_dir="./test_output",
output_dir=str(output_dir),
parse_method="auto",
display_stats=True,
)
# 检查输出目录内容
print(f"\n📂 输出目录内容({output_dir}):")
for f in output_dir.glob("*"):
print(f" - {f.name}")
print("✅ 解析成功!")
print(f" 📊 内容块数量: {len(content_list)}")
print(f" 📝 Markdown长度: {len(md_content)} 字符")
@ -167,6 +202,10 @@ def test_office_document_parsing(file_path: str):
print("📁 输出文件保存到: ./test_output")
return True
# 在test_office_document_parsing函数中添加
pdf_path = Path(output_dir) / f"{file_path.stem}.pdf"
print(f"PDF文件内容预览(前100字符): {pdf_path.read_text()[:100]}")
except Exception as e:
print(f"\n❌ Office文档解析失败: {str(e)}")
import traceback
@ -203,3 +242,5 @@ def main():
if __name__ == "__main__":
sys.exit(main())

@ -0,0 +1,80 @@
import asyncio
from raganything import RAGAnything
from lightrag.llm.openai import openai_complete_if_cache, openai_embed
from lightrag.utils import EmbeddingFunc
async def main():
    """Run the RAG-Anything quick-start demo end to end.

    Builds a ``RAGAnything`` instance wired to OpenAI chat, vision and
    embedding models, processes a single document, then issues one
    text-only query and one multimodal query and prints both results.
    """
    # Single place to configure credentials; every model call below reads
    # these, so the chat, vision and embedding calls cannot drift apart.
    api_key = "your-api-key"
    # NOTE(review): None is assumed to select the client's default endpoint;
    # set a URL here when using an OpenAI-compatible proxy — confirm.
    base_url = None

    def llm_model_func(prompt, system_prompt=None, history_messages=None, **kwargs):
        """Answer a text-only prompt with the small chat model."""
        return openai_complete_if_cache(
            "gpt-4o-mini",
            prompt,
            system_prompt=system_prompt,
            # A fresh list per call avoids sharing one mutable default.
            history_messages=history_messages or [],
            api_key=api_key,
            **kwargs,
        )

    def vision_model_func(
        prompt, system_prompt=None, history_messages=None, image_data=None, **kwargs
    ):
        """Answer a prompt, attaching ``image_data`` (base64 JPEG) when given."""
        if not image_data:
            # No image: behave exactly like the text-only model.
            return llm_model_func(
                prompt,
                system_prompt=system_prompt,
                history_messages=history_messages,
                **kwargs,
            )

        # Only include a system message when one was supplied, so the
        # message list never contains a None entry.
        messages = []
        if system_prompt:
            messages.append({"role": "system", "content": system_prompt})
        messages.append(
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{image_data}"},
                    },
                ],
            }
        )
        # The prompt travels inside ``messages``; the positional prompt,
        # system prompt and history are intentionally left empty.
        return openai_complete_if_cache(
            "gpt-4o",
            "",
            system_prompt=None,
            history_messages=[],
            messages=messages,
            api_key=api_key,
            **kwargs,
        )

    def embed_texts(texts):
        """Embed ``texts`` with the large OpenAI embedding model."""
        return openai_embed(
            texts,
            model="text-embedding-3-large",
            api_key=api_key,
            base_url=base_url,
        )

    # Initialize RAGAnything
    rag = RAGAnything(
        working_dir="./rag_storage",
        llm_model_func=llm_model_func,
        vision_model_func=vision_model_func,
        embedding_func=EmbeddingFunc(
            embedding_dim=3072,
            max_token_size=8192,
            func=embed_texts,
        ),
    )

    # Process a document
    await rag.process_document_complete(
        file_path="path/to/your/document.pdf",
        output_dir="./output",
        parse_method="auto",
    )

    # Query the processed content
    # Pure text query - for basic knowledge base search
    text_result = await rag.aquery(
        "What are the main findings shown in the figures and tables?",
        mode="hybrid",
    )
    print("Text query result:", text_result)

    # Multimodal query with specific multimodal content
    multimodal_result = await rag.aquery_with_multimodal(
        "Explain this formula and its relevance to the document content",
        multimodal_content=[
            {
                "type": "equation",
                "latex": "P(d|q) = \\frac{P(q|d) \\cdot P(d)}{P(q)}",
                "equation_caption": "Document relevance probability",
            }
        ],
        mode="hybrid",
    )
    print("Multimodal query result:", multimodal_result)


if __name__ == "__main__":
    # Script entry point: run the async demo to completion.
    asyncio.run(main())
Loading…
Cancel
Save