Files
QingLong/AI/Text2Sql/Util/MarkdownToDocxUtil.py
2025-08-15 09:13:13 +08:00

82 lines
3.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from docx import Document
from docx.shared import Pt
from docx.oxml.ns import qn
import re
def markdown_to_docx(markdown_text, output_file="report.docx"):
    """Convert a Markdown string into a Word document (.docx).

    The text is processed line by line. Recognised constructs are:

    * headings: a line starting with one or more ``#``;
    * unordered list items: a line starting with ``- `` or ``* ``;
    * ordered list items: a line starting with digits, a dot and a space;
    * anything else that is not blank becomes a normal paragraph.

    List items and paragraphs are passed through ``add_formatted_text`` so
    inline bold markers are rendered; heading text is inserted verbatim.
    The document is saved to *output_file* and a confirmation line is
    printed.

    Args:
        markdown_text (str): Markdown source.
        output_file (str): Path of the .docx file to write
            (default ``"report.docx"``).
    """
    # Deepest level accepted by Document.add_heading (valid range is 0-9).
    max_heading_level = 9

    doc = Document()

    # Use SimSun as the default font. The w:eastAsia attribute is set
    # explicitly as well, since that is the font slot used for CJK text.
    doc.styles['Normal'].font.name = '宋体'
    doc.styles['Normal']._element.rPr.rFonts.set(qn('w:eastAsia'), '宋体')

    for line in markdown_text.split("\n"):
        if line.startswith("#"):
            text = line.lstrip("#").strip()
            # Count only the leading run of '#'. Counting every '#' in the
            # line would misread titles such as "# C# notes".
            hashes = len(line) - len(line.lstrip("#"))
            # "#" maps to the Title style (level 0), "##" to Heading 1, and
            # so on. Deeper headings are clamped instead of being dropped.
            doc.add_heading(text, level=min(hashes - 1, max_heading_level))
            continue

        if line.startswith(("- ", "* ")):
            # Drop exactly the two-character bullet prefix. Stripping the
            # character set "-* " would also eat a leading "**" or "*" that
            # belongs to the item text.
            paragraph = doc.add_paragraph(style='List Bullet')
            add_formatted_text(paragraph, line[2:].strip())
            continue

        ordered = re.match(r"^\d+\. ", line)
        if ordered:
            paragraph = doc.add_paragraph(style='List Number')
            add_formatted_text(paragraph, line[ordered.end():].strip())
            continue

        stripped = line.strip()
        if stripped:  # blank lines are ignored
            paragraph = doc.add_paragraph()
            add_formatted_text(paragraph, stripped)

    doc.save(output_file)
    print(f"Word 文档已生成: {output_file}")
def add_formatted_text(paragraph, text):
    """Append *text* to a Word paragraph, rendering Markdown bold (``**xx**``).

    The text is cut at every ``**`` marker, so the resulting pieces alternate
    between plain and bold. With balanced markers the pieces at odd positions
    are bold. Unbalanced input is tolerated:

    * an unpaired opening marker (``**xx``) makes the rest of the text bold;
    * if the marker count is odd and the text does not start with ``**``,
      the first marker is treated as a closing marker whose opener is
      missing, so ``xx** rest`` renders ``xx`` in bold.

    Empty pieces produce no run. Plain runs keep their inherited ``bold``
    setting; only bold runs have ``bold`` set to ``True``.

    Args:
        paragraph: Object exposing ``add_run(text)`` whose return value has a
            writable ``bold`` attribute (e.g. a python-docx paragraph).
        text (str): Text that may contain ``**`` bold markers.
    """
    marker = "**"
    pieces = text.split(marker)
    marker_count = len(pieces) - 1

    # An odd marker count means one marker has no partner. If the text does
    # not begin with a marker, assume the first one closes a bold span that
    # started at the beginning of the text.
    closer_first = marker_count % 2 == 1 and not text.startswith(marker)
    bold_parity = 0 if closer_first else 1

    for index, piece in enumerate(pieces):
        if not piece:
            # Adjacent or leading/trailing markers yield empty pieces.
            continue
        run = paragraph.add_run(piece)
        if index % 2 == bold_parity:
            run.bold = True