diff --git a/dsRag/Start.py b/dsRag/Start.py
index ba49b059..6e56680e 100644
--- a/dsRag/Start.py
+++ b/dsRag/Start.py
@@ -15,6 +15,10 @@ from Config.Config import MS_MODEL_PATH, MS_MODEL_LIMIT, MS_HOST, MS_PORT, MS_MA
 from Milvus.Utils.MilvusCollectionManager import MilvusCollectionManager
 from Milvus.Utils.MilvusConnectionPool import *
 from Milvus.Utils.MilvusConnectionPool import MilvusConnectionPool
+from docx import Document
+from docx.shared import Inches
+from io import BytesIO
+import html2text
 
 # 初始化日志
 logger = logging.getLogger(__name__)
@@ -156,6 +160,64 @@ http://10.10.21.22:8000/static/ai.html
 class QueryRequest(BaseModel):
     query: str = Field(..., description="用户查询的问题")
 
+class SaveWordRequest(BaseModel):
+    # HTML fragment that the client wants exported as a Word document.
+    html: str = Field(..., description="要保存为Word的HTML内容")
+
+@app.post("/api/save-word")
+async def save_to_word(request: Request):
+    """Convert posted HTML to text and return it as a .docx download.
+
+    The JSON request body is validated against SaveWordRequest.
+
+    Raises:
+        HTTPException: 422 when the body fails validation, 400 when it
+            cannot be parsed at all.
+    """
+    # Function-scope import so this handler does not rely on an import
+    # elsewhere in the module.
+    from urllib.parse import quote
+
+    try:
+        data = await request.json()
+        save_request = SaveWordRequest(**data)
+    except ValidationError as e:
+        logger.error(f"请求体验证失败: {e.errors()}")
+        raise HTTPException(status_code=422, detail=e.errors())
+    except Exception as e:
+        logger.error(f"请求解析失败: {str(e)}")
+        raise HTTPException(status_code=400, detail="无效的请求格式")
+
+    # Convert the HTML to text; link formatting is switched off.
+    h = html2text.HTML2Text()
+    h.ignore_links = True
+    plain_text = h.handle(save_request.html)
+
+    # Build the Word document: a level-0 heading plus the converted text.
+    doc = Document()
+    doc.add_heading('小学数学问答', 0)
+    doc.add_paragraph(plain_text)
+
+    # Serialize into memory and rewind so the response reads from the start.
+    file_stream = BytesIO()
+    doc.save(file_stream)
+    file_stream.seek(0)
+
+    # Starlette encodes header values as latin-1, so a raw Chinese file name
+    # raises UnicodeEncodeError. Send it percent-encoded through RFC 5987
+    # `filename*`, with an ASCII `filename` fallback for older clients.
+    encoded_name = quote("小学数学问答.docx")
+    content_disposition = (
+        'attachment; filename="math_qa.docx"; '
+        f"filename*=UTF-8''{encoded_name}"
+    )
+
+    return StreamingResponse(
+        file_stream,
+        media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+        headers={"Content-Disposition": content_disposition}
+    )
+
 @app.post("/api/rag")
 async def rag_stream(request: Request):
     try:
diff --git a/dsRag/Test/TestWriteDoc.py b/dsRag/Test/TestWriteDoc.py
new file mode 100644
index 00000000..dc8469e1
--- /dev/null
+++ b/dsRag/Test/TestWriteDoc.py
@@ -0,0 +1,44 @@
+from bs4 import BeautifulSoup
+from docx import Document
+"""
+pip install python-docx html2text beautifulsoup4
+"""
+
+def html_to_word(html_content, word_path):
+    """Write the text of an HTML string into a new Word document.
+
+    Each innermost block element (p, h1-h6, div) that has text becomes one
+    paragraph. A block that contains other block elements is skipped, because
+    its children are visited on their own and writing both would duplicate
+    their text; text placed directly inside such a wrapper is not exported.
+
+    Args:
+        html_content: HTML markup to convert.
+        word_path: Destination passed to Document.save().
+    """
+    block_tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'div']
+
+    # Parse the HTML.
+    soup = BeautifulSoup(html_content, 'html.parser')
+
+    # Create the Word document.
+    doc = Document()
+
+    for element in soup.find_all(block_tags):
+        # Skip wrappers; find() returns None when no nested block exists.
+        if element.find(block_tags) is not None:
+            continue
+        text = element.get_text(strip=True)
+        if text:
+            doc.add_paragraph(text)
+
+    # Save the document and report where it went.
+    doc.save(word_path)
+    print(f"HTML content saved to {word_path}")
+
+
+# Sample HTML content
+html_content = """
+
+
+
+This is a paragraph.
+