|
|
|
@ -1,131 +0,0 @@
|
|
|
|
|
# 创建虚拟环境
|
|
|
|
|
conda create -n py310 python=3.10
|
|
|
|
|
|
|
|
|
|
# 查看当前存在哪些虚拟环境
|
|
|
|
|
conda env list
|
|
|
|
|
conda info -e
|
|
|
|
|
|
|
|
|
|
# 激活虚拟环境
|
|
|
|
|
conda activate py310
|
|
|
|
|
|
|
|
|
|
# RAG-Anything 官网
|
|
|
|
|
https://github.com/HKUDS/RAG-Anything
|
|
|
|
|
|
|
|
|
|
# LibreOffice 官网
|
|
|
|
|
https://zh-cn.libreoffice.org/
|
|
|
|
|
|
|
|
|
|
# GitHub
|
|
|
|
|
https://github.com/opendatalab/MinerU
|
|
|
|
|
https://github.com/papayalove/Magic-PDF/blob/master/README_zh-CN.md
|
|
|
|
|
https://github.com/opendatalab/PDF-Extract-Kit/blob/main/README_zh-CN.md
|
|
|
|
|
|
|
|
|
|
# MinerU 官网
|
|
|
|
|
https://mineru.net/
|
|
|
|
|
|
|
|
|
|
# MinerU v2.0:VLM模型捅破解析效果天花板!
|
|
|
|
|
https://blog.csdn.net/qq1198768105/article/details/148678967
|
|
|
|
|
|
|
|
|
|
# MinerU、Magic-PDF、Magic-Doc
|
|
|
|
|
https://blog.csdn.net/lovechris00/article/details/140584728
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# 安装
|
|
|
|
|
pip install raganything
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Office documents (.doc, .docx, .ppt, .pptx, .xls, .xlsx) require LibreOffice installation
|
|
|
|
|
Download from LibreOffice official website
|
|
|
|
|
Windows: Download installer from official website
|
|
|
|
|
macOS: brew install --cask libreoffice
|
|
|
|
|
Ubuntu/Debian: sudo apt-get install libreoffice
|
|
|
|
|
CentOS/RHEL: sudo yum install libreoffice
|
|
|
|
|
|
|
|
|
|
# MinerU教程第二弹丨MinerU 本地部署保姆级“喂饭”教程
|
|
|
|
|
https://zhuanlan.zhihu.com/p/1908942870666282723
|
|
|
|
|
|
|
|
|
|
pip install modelscope
|
|
|
|
|
curl -o download_models.py https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/scripts/download_models.py
|
|
|
|
|
python download_models.py
|
|
|
|
|
|
|
|
|
|
# MinerU本地化部署教程——一款AI知识库建站的必备工具
|
|
|
|
|
https://blog.csdn.net/mzl87/article/details/147904238
|
|
|
|
|
|
|
|
|
|
pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com -i https://mirrors.aliyun.com/pypi/simple
|
|
|
|
|
|
|
|
|
|
magic-pdf --version
|
|
|
|
|
|
|
|
|
|
pip install modelscope
|
|
|
|
|
|
|
|
|
|
cd C:\Users\Administrator\PycharmProjects\PythonProject\MinerU
|
|
|
|
|
python
|
|
|
|
|
|
|
|
|
|
magic-pdf -p D:\python\小乔证件\黄琬乔2023蓝桥杯省赛准考证.pdf -o ./output
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(py310) PS C:\Users\Administrator> magic-pdf -p D:\python\小乔证件\黄琬乔2023蓝桥杯省赛准考证.pdf -o ./output
|
|
|
|
|
2025-07-05 19:09:59.132 | ERROR | magic_pdf.tools.cli:parse_doc:134 - C:\Users\Administrator\magic-pdf.json not found
|
|
|
|
|
Traceback (most recent call last):
|
|
|
|
|
|
|
|
|
|
File "D:\anaconda3\envs\py310\lib\runpy.py", line 196, in _run_module_as_main
|
|
|
|
|
return _run_code(code, main_globals, None,
|
|
|
|
|
|
|
|
|
|
# 新版本的模型下载命令
|
|
|
|
|
mineru-models-download
|
|
|
|
|
|
|
|
|
|
pip install pycocotools timm
|
|
|
|
|
pip install detectron2 --extra-index-url https://myhloli.github.io/wheels/
|
|
|
|
|
|
|
|
|
|
# https://github.com/opendatalab/MinerU/blob/master/README_zh-CN.md
|
|
|
|
|
https://github.com/opendatalab/MinerU/issues/2357
|
|
|
|
|
|
|
|
|
|
https://gitee.com/bibi100/MinerU/blob/master/README_zh-CN.md#Magic-PDF
|
|
|
|
|
|
|
|
|
|
# OSError: We couldn't connect to 'https://huggingface.co' to load this file
|
|
|
|
|
https://blog.csdn.net/qq_38683460/article/details/145661150
|
|
|
|
|
|
|
|
|
|
D:\anaconda3\envs\py310\Lib\site-packages\huggingface_hub\constants.py
|
|
|
|
|
|
|
|
|
|
修改上面的 constants.py 文件,把下面两个常量改为 hf-mirror 镜像地址(也可以不改文件,直接设置环境变量 HF_ENDPOINT=https://hf-mirror.com):
|
|
|
|
|
HUGGINGFACE_CO_URL_HOME = "https://hf-mirror.com/"
|
|
|
|
|
_HF_DEFAULT_ENDPOINT = "https://hf-mirror.com"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
mineru -p D:\\python\\小乔证件\\黄琬乔2023蓝桥杯省赛准考证.pdf -o output -m auto -b pipeline --source modelscope
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(base) PS C:\Users\Administrator> conda activate py310
|
|
|
|
|
(py310) PS C:\Users\Administrator> mineru -p D:\\python\\小乔证件\\黄琬乔2023蓝桥杯省赛准考证.pdf -o output -m auto -b pipeline --source modelscope
|
|
|
|
|
2025-07-05 20:51:56.963 | WARNING | mineru.backend.vlm.predictor:<module>:35 - sglang is not installed. If you are not using sglang, you can ignore this warning.
|
|
|
|
|
2025-07-05 20:52:02.387 | INFO | mineru.backend.pipeline.pipeline_analyze:doc_analyze:124 - Batch 1/1: 2 pages/2 pages
|
|
|
|
|
2025-07-05 20:52:02.388 | INFO | mineru.backend.pipeline.model_init:__init__:137 - DocAnalysis init, this may take some times......
|
|
|
|
|
Downloading Model from https://www.modelscope.cn to directory: C:\Users\Administrator\.cache\modelscope\hub\models\OpenDataLab\PDF-Extract-Kit-1.0
|
|
|
|
|
2025-07-05 20:52:06,696 - modelscope - INFO - Target directory already exists, skipping creation.
|
|
|
|
|
Downloading Model from https://www.modelscope.cn to directory: C:\Users\Administrator\.cache\modelscope\hub\models\OpenDataLab\PDF-Extract-Kit-1.0
|
|
|
|
|
2025-07-05 20:52:10,003 - modelscope - INFO - Target directory already exists, skipping creation.
|
|
|
|
|
Downloading Model from https://www.modelscope.cn to directory: C:\Users\Administrator\.cache\modelscope\hub\models\OpenDataLab\PDF-Extract-Kit-1.0
|
|
|
|
|
2025-07-05 20:52:14,257 - modelscope - INFO - Target directory already exists, skipping creation.
|
|
|
|
|
Downloading Model from https://www.modelscope.cn to directory: C:\Users\Administrator\.cache\modelscope\hub\models\OpenDataLab\PDF-Extract-Kit-1.0
|
|
|
|
|
2025-07-05 20:52:17,799 - modelscope - INFO - Target directory already exists, skipping creation.
|
|
|
|
|
Downloading Model from https://www.modelscope.cn to directory: C:\Users\Administrator\.cache\modelscope\hub\models\OpenDataLab\PDF-Extract-Kit-1.0
|
|
|
|
|
2025-07-05 20:52:20,709 - modelscope - INFO - Target directory already exists, skipping creation.
|
|
|
|
|
Downloading Model from https://www.modelscope.cn to directory: C:\Users\Administrator\.cache\modelscope\hub\models\OpenDataLab\PDF-Extract-Kit-1.0
|
|
|
|
|
2025-07-05 20:52:24,269 - modelscope - INFO - Target directory already exists, skipping creation.
|
|
|
|
|
2025-07-05 20:52:24.405 | INFO | mineru.backend.pipeline.model_init:__init__:182 - DocAnalysis init done!
|
|
|
|
|
2025-07-05 20:52:24.405 | INFO | mineru.backend.pipeline.pipeline_analyze:custom_model_init:64 - model init cost: 22.017439603805542
|
|
|
|
|
Layout Predict: 100%|████████████████████████████████████████████████████████████████████| 2/2 [00:02<00:00, 1.05s/it]
|
|
|
|
|
MFD Predict: 100%|███████████████████████████████████████████████████████████████████████| 2/2 [00:05<00:00, 2.63s/it]
|
|
|
|
|
MFR Predict: 100%|███████████████████████████████████████████████████████████████████████| 4/4 [00:02<00:00, 1.50it/s]
|
|
|
|
|
Downloading Model from https://www.modelscope.cn to directory: C:\Users\Administrator\.cache\modelscope\hub\models\OpenDataLab\PDF-Extract-Kit-1.0
|
|
|
|
|
2025-07-05 20:52:37,273 - modelscope - INFO - Target directory already exists, skipping creation.
|
|
|
|
|
Downloading Model from https://www.modelscope.cn to directory: C:\Users\Administrator\.cache\modelscope\hub\models\OpenDataLab\PDF-Extract-Kit-1.0
|
|
|
|
|
2025-07-05 20:52:40,451 - modelscope - INFO - Target directory already exists, skipping creation.
|
|
|
|
|
OCR-det ch: 100%|██████████████████████████████████████████████████████████████████████| 16/16 [00:02<00:00, 5.88it/s]
|
|
|
|
|
Table Predict: 0%| | 0/1 [00:00<?, ?it/s]Downloading Model from https://www.modelscope.cn to directory: C:\Users\Administrator\.cache\modelscope\hub\models\OpenDataLab\PDF-Extract-Kit-1.0
|
|
|
|
|
2025-07-05 20:52:46,921 - modelscope - INFO - Target directory already exists, skipping creation.
|
|
|
|
|
Table Predict: 100%|█████████████████████████████████████████████████████████████████████| 1/1 [00:04<00:00, 4.51s/it]
|
|
|
|
|
Processing pages: 0%| | 0/2 [00:00<?, ?it/s]Downloading Model from https://www.modelscope.cn to directory: C:\Users\Administrator\.cache\modelscope\hub\models\OpenDataLab\PDF-Extract-Kit-1.0
|
|
|
|
|
2025-07-05 20:52:51,389 - modelscope - INFO - Target directory already exists, skipping creation.
|
|
|
|
|
Processing pages: 100%|██████████████████████████████████████████████████████████████████| 2/2 [00:04<00:00, 2.06s/it]
|
|
|
|
|
OCR-rec Predict: 100%|███████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 22.94it/s]
|
|
|
|
|
2025-07-05 20:52:52.566 | INFO | mineru.cli.common:_process_output:156 - local output dir is output\黄琬乔2023蓝桥杯省赛准考证\auto
|
|
|
|
|
(py310) PS C:\Users\Administrator>
|