commit
831637bb8f
@ -1,4 +0,0 @@
|
||||
from typing import List
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
@ -1,34 +0,0 @@
|
||||
import PyPDF2
|
||||
import os
|
||||
|
||||
|
||||
def read_pdf_file(file_path):
    """
    Read the text content of a PDF file.

    :param file_path: path to the PDF file (``str`` or ``os.PathLike``)
    :return: the text of every page joined by newlines, with leading and
        trailing whitespace stripped; ``None`` if the file does not exist,
        does not have a ``.pdf`` suffix, or cannot be read (the error is
        printed rather than raised)
    """
    try:
        # Normalise path-like objects so the suffix check below works on them too
        path = os.fspath(file_path)

        # Make sure the file exists
        if not os.path.exists(path):
            raise FileNotFoundError(f"文件 {file_path} 不存在")

        # Only files with a .pdf suffix are accepted
        if not path.lower().endswith('.pdf'):
            raise ValueError("仅支持.pdf格式的文件")

        # PDFs must be opened in binary mode; pages are read while the
        # file handle is still open
        with open(path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)

            # A page that yields no text is treated as empty instead of
            # aborting the whole document
            page_texts = [page.extract_text() or "" for page in reader.pages]

        return "\n".join(page_texts).strip()

    except Exception as e:
        # Best-effort contract: report the problem and signal failure with None
        print(f"读取PDF文件时出错: {str(e)}")
        return None
|
@ -1,13 +0,0 @@
|
||||
import docx
|
||||
|
||||
|
||||
class SplitDocxUtil:
    """Utility namespace for reading .docx documents."""

    @staticmethod
    def read_docx(file_path):
        """
        Read the text of a .docx file.

        :param file_path: path handed to ``docx.Document``
        :return: the text of all non-empty paragraphs joined by newlines,
            or an empty string if anything goes wrong (the error is printed)
        """
        try:
            document = docx.Document(file_path)

            # Keep only paragraphs that actually carry text
            lines = []
            for paragraph in document.paragraphs:
                if paragraph.text:
                    lines.append(paragraph.text)

            return "\n".join(lines)
        except Exception as e:
            print(f"读取docx文件出错: {str(e)}")
            return ""
|
@ -1,72 +0,0 @@
|
||||
import os
|
||||
import shutil
|
||||
import uuid
|
||||
import zipfile
|
||||
|
||||
from docx import Document
|
||||
from docx.oxml.ns import nsmap
|
||||
|
||||
|
||||
def _read_relationship_targets(rels_path):
    """Return a dict mapping relationship Id -> Target parsed from a .rels file."""
    import xml.etree.ElementTree as ET

    targets = {}
    for rel in ET.parse(rels_path).getroot().iter():
        # Compare the local tag name so the XML namespace does not matter
        if rel.tag.rsplit('}', 1)[-1] != 'Relationship':
            continue
        rel_id = rel.get('Id')
        target = rel.get('Target')
        if rel_id is not None and target is not None:
            # Keep the first entry for an Id
            targets.setdefault(rel_id, target)
    return targets


def extract_images_from_docx(docx_path, output_folder):
    """
    Extract the images embedded in the paragraphs of a docx file.

    The images are copied into ``output_folder`` under freshly generated
    uuid-based file names, in the order their drawings appear while walking
    the document's paragraphs and runs.

    :param docx_path: path to the Word document
    :param output_folder: folder the images are written to (created if missing)
    :return: list of the generated image file names (names only, relative
        to ``output_folder``)
    :raises FileNotFoundError: if the document or its relationship file is missing
    :raises zipfile.BadZipFile: if ``docx_path`` is not a valid zip archive
    """
    import tempfile

    image_data = []
    os.makedirs(output_folder, exist_ok=True)

    # A uniquely named scratch directory avoids clashes between concurrent
    # calls and is removed even when extraction fails part-way through
    with tempfile.TemporaryDirectory(prefix="temp_docx_", dir=output_folder) as temp_dir:
        # Unpack the docx (a zip archive)
        with zipfile.ZipFile(docx_path, 'r') as zip_ref:
            zip_ref.extractall(temp_dir)

        # Parse the main document relationships with a real XML parser so
        # attribute order and encoding do not matter
        targets = _read_relationship_targets(
            os.path.join(temp_dir, 'word', '_rels', 'document.xml.rels')
        )

        # Load the main document
        doc = Document(docx_path)
        embed_attr = '{%s}embed' % nsmap['r']

        # Walk every run of every paragraph looking for drawings
        for paragraph in doc.paragraphs:
            for run in paragraph.runs:
                for element in run._element:
                    if not element.tag.endswith('drawing'):
                        continue

                    # The blip element carries the relationship id of the image
                    blip = element.find('.//a:blip', namespaces=nsmap)
                    if blip is None:
                        continue

                    image_path = targets.get(blip.get(embed_attr))
                    if image_path is None:
                        continue

                    # Resolve the target inside the unpacked "word" folder
                    src_path = os.path.join(
                        temp_dir, 'word', image_path.replace('..', '').lstrip('/')
                    )
                    if not os.path.exists(src_path):
                        continue

                    # The output name is a uuid plus the original extension
                    ext = os.path.splitext(src_path)[1]
                    img_name = f"{uuid.uuid4().hex}{ext}"
                    shutil.copy(src_path, os.path.join(output_folder, img_name))
                    image_data.append(img_name)

    return image_data
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading…
Reference in new issue