parent
e6c0c618a9
commit
6a888b8cb8
@ -0,0 +1,44 @@
|
||||
from bs4 import BeautifulSoup
|
||||
from docx import Document
|
||||
"""
Install the required packages before running this script:

    pip install python-docx html2text beautifulsoup4
"""
|
||||
|
||||
def html_to_word(html_content, word_path):
    """Write the text of an HTML document into a Word (.docx) file.

    Every ``p``, ``h1``-``h6`` and ``div`` element is visited in document
    order. ``h1``-``h6`` become Word headings of the matching level; ``p``
    and ``div`` become plain paragraphs. Elements with no text are skipped.

    Args:
        html_content: HTML markup to convert, as a string.
        word_path: Destination path of the ``.docx`` file to create.

    Returns:
        None. The document is saved to ``word_path`` and a confirmation
        message is printed.
    """
    block_tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'div']
    heading_levels = {f'h{level}': level for level in range(1, 7)}

    # Parse the HTML.
    soup = BeautifulSoup(html_content, 'html.parser')

    # Create the Word document.
    doc = Document()

    # Walk every block-level element of interest.
    for element in soup.find_all(block_tags):
        # find_all() returns a container *and* the blocks nested inside it,
        # so taking the container's full text would write nested text twice.
        # Keep only the strings whose nearest block-level ancestor is this
        # element; nested blocks contribute their own text on their own turn.
        own_strings = [
            str(fragment)
            for fragment in element.strings
            if fragment.find_parent(block_tags) is element
        ]

        # Collapse runs of whitespace instead of stripping each fragment, so
        # inline markup such as "Hello <b>world</b>" keeps its word spacing.
        text = ' '.join(''.join(own_strings).split())
        if not text:
            continue

        level = heading_levels.get(element.name)
        if level is None:
            doc.add_paragraph(text)
        else:
            # Preserve the heading structure rather than flattening it.
            doc.add_heading(text, level=level)

    # Save the Word document.
    doc.save(word_path)
    print(f"HTML content saved to {word_path}")
|
||||
|
||||
|
||||
# Sample HTML used to demonstrate the conversion.
html_content = """
<!DOCTYPE html>
<html>
<head>
<title>Sample HTML</title>
</head>
<body>
<h1>Heading 1</h1>
<p>This is a paragraph.</p>
<h2>Heading 2</h2>
<div>Content inside a div.</div>
</body>
</html>
"""

# Convert the sample markup; the result is written to "output.docx",
# resolved against the current working directory.
html_to_word(html_content=html_content, word_path="output.docx")
|
Binary file not shown.
Loading…
Reference in new issue