|
|
|
|
#### 一、官网
|
|
|
|
|
|
|
|
|
|
[RAG-Anything 官网](https://github.com/HKUDS/RAG-Anything)
|
|
|
|
|
|
|
|
|
|
[LightRAG 官网](https://github.com/HKUDS/LightRAG)
|
|
|
|
|
|
|
|
|
|
#### 二、环境配置
|
|
|
|
|
|
|
|
|
|
```cmd
|
|
|
|
|
# 删除虚拟环境
|
|
|
|
|
conda remove -n py310 --all
|
|
|
|
|
|
|
|
|
|
# 创建虚拟环境
|
|
|
|
|
conda create -n py310 python=3.10
|
|
|
|
|
|
|
|
|
|
# 查看当前存在哪些虚拟环境
|
|
|
|
|
conda env list
|
|
|
|
|
|
|
|
|
|
# 激活虚拟环境
|
|
|
|
|
conda activate py310
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
#### 三、依赖环境
|
|
|
|
|
|
|
|
|
|
- 1、$LibreOffice$
|
|
|
|
|
|
|
|
|
|
https://zh-cn.libreoffice.org/
|
|
|
|
|
|
|
|
|
|
```sh
|
|
|
|
|
# 下载的版本:
|
|
|
|
|
https://mirrors.nju.edu.cn/tdf/libreoffice/stable/25.2.4/win/x86_64/LibreOffice_25.2.4_Win_x86-64.msi
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
> **注**:后面要使用的$MinerU$的能力是将$PDF$转为$markdown$,因此需要借助$LibreOffice$先把$Office$文档转成$PDF$。
|
|
|
|
|
|
|
|
|
|
下载完成后,安装即可。
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
- 2、$RAGAnything$
|
|
|
|
|
|
|
|
|
|
```cmd
|
|
|
|
|
# 安装RagAnything
|
|
|
|
|
pip install raganything pycocotools timm detectron2 sse_starlette
|
|
|
|
|
|
|
|
|
|
# 备选:从预编译 wheel 源安装 detectron2(按需取消注释)
|
|
|
|
|
# pip install detectron2 --extra-index-url https://myhloli.github.io/wheels/
|
|
|
|
|
|
|
|
|
|
# 下载模型
|
|
|
|
|
mineru-models-download
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|

|
|
|
|
|
|
|
|
|
|
#### 四、代码调试
|
|
|
|
|
|
|
|
|
|
- 将$soffice.exe$所在目录(以及虚拟环境的$Scripts$目录)添加到环境变量$PATH$
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
C:\Program Files\LibreOffice\program
|
|
|
|
|
D:\anaconda3\envs\py310\Scripts
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|

|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
- **注意**:需要先配置好环境变量,再打开$PyCharm$进行调试。因为我发现,如果是在$PyCharm$已经打开的情况下才添加环境变量,$PyCharm$里运行的代码是检测不到新变量的,需要重启$PyCharm$后才能生效。
|
|
|
|
|
|
|
|
|
|
- 因为原版程序在检测$soffice.exe$版本时会弹出窗口,不能直接用于生产环境,所以需要手动修改以下文件中的代码:
|
|
|
|
|
|
|
|
|
|
```cmd
|
|
|
|
|
D:\anaconda3\envs\py310\Lib\site-packages\raganything\mineru_parser.py
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
修改内容:
|
|
|
|
|
|
|
|
|
|
```python
|
|
|
|
|
# 62行
|
|
|
|
|
@staticmethod
|
|
|
|
|
def _run_mineru_command(
|
|
|
|
|
input_path: Union[str, Path],
|
|
|
|
|
output_dir: Union[str, Path],
|
|
|
|
|
method: str = "auto",
|
|
|
|
|
lang: Optional[str] = None,
|
|
|
|
|
backend: str = "pipeline",
|
|
|
|
|
start_page: Optional[int] = None,
|
|
|
|
|
end_page: Optional[int] = None,
|
|
|
|
|
formula: bool = True,
|
|
|
|
|
table: bool = True,
|
|
|
|
|
device: Optional[str] = None,
|
|
|
|
|
# source: str = "huggingface", # 模型来源,默认 huggingface
|
|
|
|
|
# source: str = "modelscope", # 魔搭下载模型
|
|
|
|
|
source: str = "local" # 使用本地模型
|
|
|
|
|
) -> None:
|
|
|
|
|
|
|
|
|
|
# 107行
|
|
|
|
|
try:
|
|
|
|
|
result = subprocess.run(
|
|
|
|
|
cmd,
|
|
|
|
|
#capture_output=True, #注释掉这句,可以把输出打印出来
|
|
|
|
|
text=True,
|
|
|
|
|
check=True,
|
|
|
|
|
encoding="utf-8",
|
|
|
|
|
errors="ignore",
|
|
|
|
|
)
|
|
|
|
|
print("MinerU command executed successfully")
|
|
|
|
|
|
|
|
|
|
# 442行
|
|
|
|
|
# Check if LibreOffice is available
|
|
|
|
|
#libreoffice_available = False
|
|
|
|
|
working_libreoffice_cmd = 'soffice'
|
|
|
|
|
# try:
|
|
|
|
|
# result = subprocess.run(
|
|
|
|
|
# ["libreoffice", "--version"],
|
|
|
|
|
# capture_output=True,
|
|
|
|
|
# check=True,
|
|
|
|
|
# timeout=10,
|
|
|
|
|
# encoding="utf-8",
|
|
|
|
|
# errors="ignore",
|
|
|
|
|
# )
|
|
|
|
|
# libreoffice_available = True
|
|
|
|
|
# working_libreoffice_cmd = "libreoffice"
|
|
|
|
|
# print(f"LibreOffice detected: {result.stdout.strip()}")
|
|
|
|
|
# except (
|
|
|
|
|
# subprocess.CalledProcessError,
|
|
|
|
|
# FileNotFoundError,
|
|
|
|
|
# subprocess.TimeoutExpired,
|
|
|
|
|
# ):
|
|
|
|
|
# pass
|
|
|
|
|
#
|
|
|
|
|
# # Try alternative commands for LibreOffice
|
|
|
|
|
# if not libreoffice_available:
|
|
|
|
|
# for cmd in ["soffice", "libreoffice"]:
|
|
|
|
|
# try:
|
|
|
|
|
# result = subprocess.run(
|
|
|
|
|
# [cmd, "--version"],
|
|
|
|
|
# capture_output=True,
|
|
|
|
|
# check=True,
|
|
|
|
|
# timeout=10,
|
|
|
|
|
# encoding="utf-8",
|
|
|
|
|
# errors="ignore",
|
|
|
|
|
# )
|
|
|
|
|
# libreoffice_available = True
|
|
|
|
|
# working_libreoffice_cmd = cmd
|
|
|
|
|
# print(
|
|
|
|
|
# f"LibreOffice detected with command '{cmd}': {result.stdout.strip()}"
|
|
|
|
|
# )
|
|
|
|
|
# break
|
|
|
|
|
# except (
|
|
|
|
|
# subprocess.CalledProcessError,
|
|
|
|
|
# FileNotFoundError,
|
|
|
|
|
# subprocess.TimeoutExpired,
|
|
|
|
|
# ):
|
|
|
|
|
# continue
|
|
|
|
|
#
|
|
|
|
|
# if not libreoffice_available:
|
|
|
|
|
# raise RuntimeError(
|
|
|
|
|
# "LibreOffice is required for Office document conversion but was not found.\n"
|
|
|
|
|
# "Please install LibreOffice:\n"
|
|
|
|
|
# "- Windows: Download from https://www.libreoffice.org/download/download/\n"
|
|
|
|
|
# "- macOS: brew install --cask libreoffice\n"
|
|
|
|
|
# "- Ubuntu/Debian: sudo apt-get install libreoffice\n"
|
|
|
|
|
# "- CentOS/RHEL: sudo yum install libreoffice\n"
|
|
|
|
|
# "Alternatively, convert the document to PDF manually.\n"
|
|
|
|
|
# "MinerU 2.0 no longer includes built-in Office document conversion."
|
|
|
|
|
# )
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#### 五、相关资料
|
|
|
|
|
|
|
|
|
|
```sh
|
|
|
|
|
# 转换PDF到Markdown的工具
|
|
|
|
|
https://github.com/opendatalab/MinerU
|
|
|
|
|
|
|
|
|
|
# MinerU依赖的Magic-PDF
|
|
|
|
|
https://github.com/papayalove/Magic-PDF/blob/master/README_zh-CN.md
|
|
|
|
|
|
|
|
|
|
# MinerU依赖的PDF-Extract-Kit
|
|
|
|
|
https://github.com/opendatalab/PDF-Extract-Kit/blob/main/README_zh-CN.md
|
|
|
|
|
|
|
|
|
|
# mineru 官网
|
|
|
|
|
https://mineru.net/
|
|
|
|
|
|
|
|
|
|
# MinerU v2.0:VLM模型捅破解析效果天花板!
|
|
|
|
|
https://blog.csdn.net/qq1198768105/article/details/148678967
|
|
|
|
|
|
|
|
|
|
# MinerU、Magic-PDF、Magic-Doc
|
|
|
|
|
https://blog.csdn.net/lovechris00/article/details/140584728
|
|
|
|
|
|
|
|
|
|
# MinerU教程第二弹丨MinerU 本地部署保姆级“喂饭”教程
|
|
|
|
|
https://zhuanlan.zhihu.com/p/1908942870666282723
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
- 解决在国内网络无法下载$huggingface$的问题
|
|
|
|
|
|
|
|
|
|
```sh
|
|
|
|
|
# 要修改的文件
|
|
|
|
|
D:\anaconda3\envs\py310\Lib\site-packages\huggingface_hub\constants.py
|
|
|
|
|
|
|
|
|
|
# 修改内容(将以下两个常量改为镜像站地址)
|
|
|
|
|
HUGGINGFACE_CO_URL_HOME = "https://hf-mirror.com/"
|
|
|
|
|
_HF_DEFAULT_ENDPOINT = "https://hf-mirror.com"
|
|
|
|
|
```
|
|
|
|
|
|