You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
44 lines
1008 B
44 lines
1008 B
from bs4 import BeautifulSoup
|
|
from docx import Document
|
|
"""
|
|
pip install python-docx html2text beautifulsoup4
|
|
"""
|
|
|
|
def html_to_word(html_content, word_path) -> None:
    """Write the block-level text of an HTML string into a Word document.

    The markup is parsed with BeautifulSoup's ``html.parser``. Every ``<p>``,
    ``<h1>``-``<h6>`` and ``<div>`` element that owns non-blank text becomes
    one entry in a new python-docx ``Document``: ``<hN>`` elements are added
    with ``add_heading`` at level ``N``, the others with ``add_paragraph``.
    Text that is not inside any of those tags is ignored.

    Args:
        html_content: HTML markup to convert.
        word_path: Destination passed unchanged to ``Document.save``.

    Returns:
        None. The document is saved to ``word_path`` and a confirmation line
        is printed to stdout.
    """
    block_tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'div']

    # Parse the HTML.
    soup = BeautifulSoup(html_content, 'html.parser')

    # Attribute each text node to its *nearest* enclosing block element only.
    # Calling get_text() on every matched element would emit the text of a
    # <p> nested in a <div> twice: once for the <div>, once for the <p>.
    # Keys are id(tag) because bs4 tags hash/compare by their markup, so two
    # distinct elements with identical content would otherwise be merged.
    own_strings = {}
    for string in soup.strings:
        owner = string.find_parent(block_tags)
        if owner is not None:
            own_strings.setdefault(id(owner), []).append(string)

    # Create the Word document.
    doc = Document()

    # Walk the block elements in document order.
    for element in soup.find_all(block_tags):
        # Join the raw pieces first and normalise whitespace afterwards, so
        # words separated by inline tags ("Hello <b>world</b>") keep the
        # space between them instead of being glued together.
        text = " ".join("".join(own_strings.get(id(element), ())).split())
        if not text:
            continue

        if element.name in ('p', 'div'):
            doc.add_paragraph(text)
        else:
            # Remaining names are 'h1'..'h6'; the digit is the heading level.
            doc.add_heading(text, level=int(element.name[1]))

    # Save the Word document.
    doc.save(word_path)
    print(f"HTML content saved to {word_path}")
|
|
|
|
|
|
# Sample HTML content used by the demo run below.
html_content = """
<!DOCTYPE html>
<html>
<head>
<title>Sample HTML</title>
</head>
<body>
<h1>Heading 1</h1>
<p>This is a paragraph.</p>
<h2>Heading 2</h2>
<div>Content inside a div.</div>
</body>
</html>
"""

# Run the demo conversion only when this file is executed as a script, so
# that importing the module does not write output.docx as a side effect.
if __name__ == "__main__":
    html_to_word(html_content, "output.docx")