|
|
|
@ -49,17 +49,17 @@ class MineruParser:
|
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
|
def _run_mineru_command(
|
|
|
|
|
input_path: Union[str, Path],
|
|
|
|
|
output_dir: Union[str, Path],
|
|
|
|
|
method: str = "auto",
|
|
|
|
|
lang: Optional[str] = None,
|
|
|
|
|
backend: str = "pipeline",
|
|
|
|
|
start_page: Optional[int] = None,
|
|
|
|
|
end_page: Optional[int] = None,
|
|
|
|
|
formula: bool = True,
|
|
|
|
|
table: bool = True,
|
|
|
|
|
device: Optional[str] = None,
|
|
|
|
|
source: str = "huggingface",
|
|
|
|
|
input_path: Union[str, Path],
|
|
|
|
|
output_dir: Union[str, Path],
|
|
|
|
|
method: str = "auto",
|
|
|
|
|
lang: Optional[str] = None,
|
|
|
|
|
backend: str = "pipeline",
|
|
|
|
|
start_page: Optional[int] = None,
|
|
|
|
|
end_page: Optional[int] = None,
|
|
|
|
|
formula: bool = True,
|
|
|
|
|
table: bool = True,
|
|
|
|
|
device: Optional[str] = None,
|
|
|
|
|
source: str = "huggingface",
|
|
|
|
|
) -> None:
|
|
|
|
|
"""
|
|
|
|
|
Run mineru command line tool
|
|
|
|
@ -77,6 +77,34 @@ class MineruParser:
|
|
|
|
|
device: Inference device
|
|
|
|
|
source: Model source
|
|
|
|
|
"""
|
|
|
|
|
# [Huang Hai] MinerU needs to download its models; they can be fetched from a mirror source inside China:
|
|
|
|
|
# https://github.com/opendatalab/MinerU/blob/master/README_zh-CN.md#本地部署
|
|
|
|
|
# mineru-models-download
|
|
|
|
|
# Note: it is actually downloading OCR/paddleocr_torch/ as well - a genuinely nice thing to have!
|
|
|
|
|
"""
|
|
|
|
|
(raganything) PS D:\dsWork\dsProject\dsRagAnything> mineru-models-download
|
|
|
|
|
Please select the model download source: (huggingface, modelscope) [huggingface]: modelscope
|
|
|
|
|
Please select the model type to download: (pipeline, vlm, all) [all]: all
|
|
|
|
|
Downloading all model from modelscope...
|
|
|
|
|
Downloading model: models/Layout/YOLO/doclayout_yolo_docstructbench_imgsz1280_2501.pt
|
|
|
|
|
Download failed: Missing dependencies for SOCKS support.
|
|
|
|
|
(raganything) PS D:\dsWork\dsProject\dsRagAnything> mineru-models-download
|
|
|
|
|
Please select the model download source: (huggingface, modelscope) [huggingface]: modelscope
|
|
|
|
|
Please select the model type to download: (pipeline, vlm, all) [all]: all
|
|
|
|
|
Downloading all model from modelscope...
|
|
|
|
|
Downloading model: models/Layout/YOLO/doclayout_yolo_docstructbench_imgsz1280_2501.pt
|
|
|
|
|
Downloading Model from https://www.modelscope.cn to directory: C:\Users\Administrator\.cache\modelscope\hub\models\OpenDataLab\PDF-Extract-Kit-1.0
|
|
|
|
|
2025-07-04 21:46:23,860 - modelscope - INFO - Got 1 files, start to download ...
|
|
|
|
|
Downloading [models/Layout/YOLO/doclayout_yolo_docstructbench_imgsz1280_2501.pt]: 100%|█| 37.9M/37.9M [00:02<00:00, 15.
|
|
|
|
|
Processing 1 items: 100%|███████████████████████████████████████████████████████████| 1.00/1.00 [00:02<00:00, 2.64s/it]
|
|
|
|
|
2025-07-04 21:46:26,507 - modelscope - INFO - Download model 'OpenDataLab/PDF-Extract-Kit-1.0' successfully.
|
|
|
|
|
2025-07-04 21:46:26,507 - modelscope - INFO - Creating symbolic link [C:\Users\Administrator\.cache\modelscope\hub\models\OpenDataLab\PDF-Extract-Kit-1.0].
|
|
|
|
|
Downloading model: models/MFD/YOLO/yolo_v8_ft.pt
|
|
|
|
|
Downloading Model from https://www.modelscope.cn to directory: C:\Users\Administrator\.cache\modelscope\hub\models\OpenDataLab\PDF-Extract-Kit-1.0
|
|
|
|
|
2025-07-04 21:46:29,616 - modelscope - INFO - Got 1 files, start to download ...
|
|
|
|
|
Processing 1 items: 0%| | 0.00/1.00 [00:00<?, ?it/s]
|
|
|
|
|
Downloading [models/MFD/YOLO/yolo_v8_ft.pt]: 31%|██████████▌ | 104M/334M [00:06<00:13, 17.7MB/s]
|
|
|
|
|
"""
|
|
|
|
|
cmd = [
|
|
|
|
|
"mineru",
|
|
|
|
|
"-p",
|
|
|
|
@ -129,7 +157,7 @@ class MineruParser:
|
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
|
def _read_output_files(
|
|
|
|
|
output_dir: Path, file_stem: str
|
|
|
|
|
output_dir: Path, file_stem: str
|
|
|
|
|
) -> Tuple[List[Dict[str, Any]], str]:
|
|
|
|
|
"""
|
|
|
|
|
Read the output files generated by mineru
|
|
|
|
@ -197,11 +225,11 @@ class MineruParser:
|
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
|
def parse_pdf(
|
|
|
|
|
pdf_path: Union[str, Path],
|
|
|
|
|
output_dir: Optional[str] = None,
|
|
|
|
|
method: str = "auto",
|
|
|
|
|
lang: Optional[str] = None,
|
|
|
|
|
**kwargs,
|
|
|
|
|
pdf_path: Union[str, Path],
|
|
|
|
|
output_dir: Optional[str] = None,
|
|
|
|
|
method: str = "auto",
|
|
|
|
|
lang: Optional[str] = None,
|
|
|
|
|
**kwargs,
|
|
|
|
|
) -> Tuple[List[Dict[str, Any]], str]:
|
|
|
|
|
"""
|
|
|
|
|
Parse PDF document using MinerU 2.0
|
|
|
|
@ -254,10 +282,10 @@ class MineruParser:
|
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
|
def parse_image(
|
|
|
|
|
image_path: Union[str, Path],
|
|
|
|
|
output_dir: Optional[str] = None,
|
|
|
|
|
lang: Optional[str] = None,
|
|
|
|
|
**kwargs,
|
|
|
|
|
image_path: Union[str, Path],
|
|
|
|
|
output_dir: Optional[str] = None,
|
|
|
|
|
lang: Optional[str] = None,
|
|
|
|
|
**kwargs,
|
|
|
|
|
) -> Tuple[List[Dict[str, Any]], str]:
|
|
|
|
|
"""
|
|
|
|
|
Parse image document using MinerU 2.0
|
|
|
|
@ -402,7 +430,7 @@ class MineruParser:
|
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
|
def parse_office_doc(
|
|
|
|
|
doc_path: Union[str, Path], output_dir: Optional[str] = None, **kwargs
|
|
|
|
|
doc_path: Union[str, Path], output_dir: Optional[str] = None, **kwargs
|
|
|
|
|
) -> Tuple[List[Dict[str, Any]], str]:
|
|
|
|
|
"""
|
|
|
|
|
Parse office document by first converting to PDF, then parsing with MinerU 2.0
|
|
|
|
@ -437,64 +465,64 @@ class MineruParser:
|
|
|
|
|
if doc_path.suffix.lower() not in supported_office_formats:
|
|
|
|
|
raise ValueError(f"Unsupported office format: {doc_path.suffix}")
|
|
|
|
|
|
|
|
|
|
# Check if LibreOffice is available
|
|
|
|
|
libreoffice_available = False
|
|
|
|
|
working_libreoffice_cmd = None
|
|
|
|
|
try:
|
|
|
|
|
result = subprocess.run(
|
|
|
|
|
["libreoffice", "--version"],
|
|
|
|
|
capture_output=True,
|
|
|
|
|
check=True,
|
|
|
|
|
timeout=10,
|
|
|
|
|
encoding="utf-8",
|
|
|
|
|
errors="ignore",
|
|
|
|
|
)
|
|
|
|
|
libreoffice_available = True
|
|
|
|
|
working_libreoffice_cmd = "libreoffice"
|
|
|
|
|
print(f"LibreOffice detected: {result.stdout.strip()}")
|
|
|
|
|
except (
|
|
|
|
|
subprocess.CalledProcessError,
|
|
|
|
|
FileNotFoundError,
|
|
|
|
|
subprocess.TimeoutExpired,
|
|
|
|
|
):
|
|
|
|
|
pass
|
|
|
|
|
|
|
|
|
|
# Try alternative commands for LibreOffice
|
|
|
|
|
if not libreoffice_available:
|
|
|
|
|
for cmd in ["soffice", "libreoffice"]:
|
|
|
|
|
try:
|
|
|
|
|
result = subprocess.run(
|
|
|
|
|
[cmd, "--version"],
|
|
|
|
|
capture_output=True,
|
|
|
|
|
check=True,
|
|
|
|
|
timeout=10,
|
|
|
|
|
encoding="utf-8",
|
|
|
|
|
errors="ignore",
|
|
|
|
|
)
|
|
|
|
|
libreoffice_available = True
|
|
|
|
|
working_libreoffice_cmd = cmd
|
|
|
|
|
print(
|
|
|
|
|
f"LibreOffice detected with command '{cmd}': {result.stdout.strip()}"
|
|
|
|
|
)
|
|
|
|
|
break
|
|
|
|
|
except (
|
|
|
|
|
subprocess.CalledProcessError,
|
|
|
|
|
FileNotFoundError,
|
|
|
|
|
subprocess.TimeoutExpired,
|
|
|
|
|
):
|
|
|
|
|
continue
|
|
|
|
|
|
|
|
|
|
if not libreoffice_available:
|
|
|
|
|
raise RuntimeError(
|
|
|
|
|
"LibreOffice is required for Office document conversion but was not found.\n"
|
|
|
|
|
"Please install LibreOffice:\n"
|
|
|
|
|
"- Windows: Download from https://www.libreoffice.org/download/download/\n"
|
|
|
|
|
"- macOS: brew install --cask libreoffice\n"
|
|
|
|
|
"- Ubuntu/Debian: sudo apt-get install libreoffice\n"
|
|
|
|
|
"- CentOS/RHEL: sudo yum install libreoffice\n"
|
|
|
|
|
"Alternatively, convert the document to PDF manually.\n"
|
|
|
|
|
"MinerU 2.0 no longer includes built-in Office document conversion."
|
|
|
|
|
)
|
|
|
|
|
# # Check if LibreOffice is available
|
|
|
|
|
# libreoffice_available = False
|
|
|
|
|
working_libreoffice_cmd = 'soffice'
|
|
|
|
|
# try:
|
|
|
|
|
# result = subprocess.run(
|
|
|
|
|
# ["libreoffice", "--version"],
|
|
|
|
|
# capture_output=True,
|
|
|
|
|
# check=True,
|
|
|
|
|
# timeout=10,
|
|
|
|
|
# encoding="utf-8",
|
|
|
|
|
# errors="ignore",
|
|
|
|
|
# )
|
|
|
|
|
# libreoffice_available = True
|
|
|
|
|
# working_libreoffice_cmd = "libreoffice"
|
|
|
|
|
# print(f"LibreOffice detected: {result.stdout.strip()}")
|
|
|
|
|
# except (
|
|
|
|
|
# subprocess.CalledProcessError,
|
|
|
|
|
# FileNotFoundError,
|
|
|
|
|
# subprocess.TimeoutExpired,
|
|
|
|
|
# ):
|
|
|
|
|
# pass
|
|
|
|
|
#
|
|
|
|
|
# # Try alternative commands for LibreOffice
|
|
|
|
|
# if not libreoffice_available:
|
|
|
|
|
# for cmd in ["soffice", "libreoffice"]:
|
|
|
|
|
# try:
|
|
|
|
|
# result = subprocess.run(
|
|
|
|
|
# [cmd, "--version"],
|
|
|
|
|
# capture_output=True,
|
|
|
|
|
# check=True,
|
|
|
|
|
# timeout=10,
|
|
|
|
|
# encoding="utf-8",
|
|
|
|
|
# errors="ignore",
|
|
|
|
|
# )
|
|
|
|
|
# libreoffice_available = True
|
|
|
|
|
# working_libreoffice_cmd = cmd
|
|
|
|
|
# print(
|
|
|
|
|
# f"LibreOffice detected with command '{cmd}': {result.stdout.strip()}"
|
|
|
|
|
# )
|
|
|
|
|
# break
|
|
|
|
|
# except (
|
|
|
|
|
# subprocess.CalledProcessError,
|
|
|
|
|
# FileNotFoundError,
|
|
|
|
|
# subprocess.TimeoutExpired,
|
|
|
|
|
# ):
|
|
|
|
|
# continue
|
|
|
|
|
#
|
|
|
|
|
# if not libreoffice_available:
|
|
|
|
|
# raise RuntimeError(
|
|
|
|
|
# "LibreOffice is required for Office document conversion but was not found.\n"
|
|
|
|
|
# "Please install LibreOffice:\n"
|
|
|
|
|
# "- Windows: Download from https://www.libreoffice.org/download/download/\n"
|
|
|
|
|
# "- macOS: brew install --cask libreoffice\n"
|
|
|
|
|
# "- Ubuntu/Debian: sudo apt-get install libreoffice\n"
|
|
|
|
|
# "- CentOS/RHEL: sudo yum install libreoffice\n"
|
|
|
|
|
# "Alternatively, convert the document to PDF manually.\n"
|
|
|
|
|
# "MinerU 2.0 no longer includes built-in Office document conversion."
|
|
|
|
|
# )
|
|
|
|
|
|
|
|
|
|
# Create temporary directory for PDF conversion
|
|
|
|
|
with tempfile.TemporaryDirectory() as temp_dir:
|
|
|
|
@ -535,6 +563,7 @@ class MineruParser:
|
|
|
|
|
if result.returncode == 0:
|
|
|
|
|
conversion_successful = True
|
|
|
|
|
print(f"Successfully converted {doc_path.name} to PDF")
|
|
|
|
|
print(convert_cmd)
|
|
|
|
|
break
|
|
|
|
|
else:
|
|
|
|
|
print(
|
|
|
|
@ -572,6 +601,7 @@ class MineruParser:
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
# Parse the converted PDF
|
|
|
|
|
# TODO
|
|
|
|
|
return MineruParser.parse_pdf(
|
|
|
|
|
pdf_path=pdf_path, output_dir=output_dir, **kwargs
|
|
|
|
|
)
|
|
|
|
@ -582,7 +612,7 @@ class MineruParser:
|
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
|
def parse_text_file(
|
|
|
|
|
text_path: Union[str, Path], output_dir: Optional[str] = None, **kwargs
|
|
|
|
|
text_path: Union[str, Path], output_dir: Optional[str] = None, **kwargs
|
|
|
|
|
) -> Tuple[List[Dict[str, Any]], str]:
|
|
|
|
|
"""
|
|
|
|
|
Parse text file by first converting to PDF, then parsing with MinerU 2.0
|
|
|
|
@ -752,9 +782,9 @@ class MineruParser:
|
|
|
|
|
|
|
|
|
|
# Handle tables
|
|
|
|
|
if (
|
|
|
|
|
"|" in line
|
|
|
|
|
and line.strip().startswith("|")
|
|
|
|
|
and line.strip().endswith("|")
|
|
|
|
|
"|" in line
|
|
|
|
|
and line.strip().startswith("|")
|
|
|
|
|
and line.strip().endswith("|")
|
|
|
|
|
):
|
|
|
|
|
if not in_table:
|
|
|
|
|
in_table = True
|
|
|
|
@ -766,15 +796,15 @@ class MineruParser:
|
|
|
|
|
# End of table
|
|
|
|
|
in_table = False
|
|
|
|
|
if (
|
|
|
|
|
len(table_lines) >= 2
|
|
|
|
|
len(table_lines) >= 2
|
|
|
|
|
): # Need at least header and separator
|
|
|
|
|
try:
|
|
|
|
|
# Parse table
|
|
|
|
|
table_data = []
|
|
|
|
|
for table_line in table_lines:
|
|
|
|
|
if (
|
|
|
|
|
"---" in table_line
|
|
|
|
|
or "===" in table_line
|
|
|
|
|
"---" in table_line
|
|
|
|
|
or "===" in table_line
|
|
|
|
|
):
|
|
|
|
|
continue # Skip separator line
|
|
|
|
|
cells = [
|
|
|
|
@ -1112,11 +1142,11 @@ class MineruParser:
|
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
|
def parse_document(
|
|
|
|
|
file_path: Union[str, Path],
|
|
|
|
|
method: str = "auto",
|
|
|
|
|
output_dir: Optional[str] = None,
|
|
|
|
|
lang: Optional[str] = None,
|
|
|
|
|
**kwargs,
|
|
|
|
|
file_path: Union[str, Path],
|
|
|
|
|
method: str = "auto",
|
|
|
|
|
output_dir: Optional[str] = None,
|
|
|
|
|
lang: Optional[str] = None,
|
|
|
|
|
**kwargs,
|
|
|
|
|
) -> Tuple[List[Dict[str, Any]], str]:
|
|
|
|
|
"""
|
|
|
|
|
Parse document using MinerU 2.0 based on file extension
|
|
|
|
|