|
|
|
@ -73,11 +73,11 @@ _HF_DEFAULT_ENDPOINT = "https://hf-mirror.com"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
- **注意**:需要先配置好环境变量,再打开PyCharm进行调试。因为我发现,如果是在PyCharm已经打开的情况下才添加环境变量,PyCharm里面的代码是检测不到的。
|
|
|
|
|
- **注意**:需要先配置好环境变量,再打开$PyCharm$进行调试。因为我发现,如果是在$PyCharm$已经打开的情况下才添加环境变量,$PyCharm$里面的代码是检测不到的。
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
- 因为原版的程序有soffice.exe版本检测框弹出,不能直接用于生产环境,我只好手动修改了下代码:
|
|
|
|
|
- 因为原版的程序有$soffice.exe$版本检测框弹出,不能直接用于生产环境,因此需要手动修改代码:
|
|
|
|
|
|
|
|
|
|
```cmd
|
|
|
|
|
D:\anaconda3\envs\raganything\Lib\site-packages\raganything\mineru_parser.py
|
|
|
|
@ -86,6 +86,37 @@ _HF_DEFAULT_ENDPOINT = "https://hf-mirror.com"
|
|
|
|
|
修改内容:
|
|
|
|
|
|
|
|
|
|
```python
|
|
|
|
|
# 62行
|
|
|
|
|
@staticmethod
|
|
|
|
|
def _run_mineru_command(
|
|
|
|
|
input_path: Union[str, Path],
|
|
|
|
|
output_dir: Union[str, Path],
|
|
|
|
|
method: str = "auto",
|
|
|
|
|
lang: Optional[str] = None,
|
|
|
|
|
backend: str = "pipeline",
|
|
|
|
|
start_page: Optional[int] = None,
|
|
|
|
|
end_page: Optional[int] = None,
|
|
|
|
|
formula: bool = True,
|
|
|
|
|
table: bool = True,
|
|
|
|
|
device: Optional[str] = None,
|
|
|
|
|
# source: str = "huggingface", # 模型来源,默认 huggingface
|
|
|
|
|
# source: str = "modelscope", # 因为第一次手动从魔搭下载了模型,所以可以直接使用local模式
|
|
|
|
|
source: str = "local"
|
|
|
|
|
) -> None:
|
|
|
|
|
|
|
|
|
|
# 107行
|
|
|
|
|
try:
|
|
|
|
|
result = subprocess.run(
|
|
|
|
|
cmd,
|
|
|
|
|
#capture_output=True, #注释掉这句,可以把输出打印出来
|
|
|
|
|
text=True,
|
|
|
|
|
check=True,
|
|
|
|
|
encoding="utf-8",
|
|
|
|
|
errors="ignore",
|
|
|
|
|
)
|
|
|
|
|
print("MinerU command executed successfully")
|
|
|
|
|
|
|
|
|
|
# 442行
|
|
|
|
|
# Check if LibreOffice is available
|
|
|
|
|
#libreoffice_available = False
|
|
|
|
|
working_libreoffice_cmd = 'soffice'
|
|
|
|
@ -143,56 +174,10 @@ _HF_DEFAULT_ENDPOINT = "https://hf-mirror.com"
|
|
|
|
|
# "- CentOS/RHEL: sudo yum install libreoffice\n"
|
|
|
|
|
# "Alternatively, convert the document to PDF manually.\n"
|
|
|
|
|
# "MinerU 2.0 no longer includes built-in Office document conversion."
|
|
|
|
|
# )
|
|
|
|
|
# )
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
- 首次运行时,代码会执行类似下面的命令:
|
|
|
|
|
|
|
|
|
|
```cmd
|
|
|
|
|
mineru -p C:\\Users\\ADMINI~1\\AppData\\Local\\Temp\\tmpt2sl2vd1\\驿来特平台安全.pdf -o output -m auto -b pipeline --source modelscope
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|

|
|
|
|
|
|
|
|
|
|
下载需要等待,但程序本身不显示进度,我一直以为卡住了,后来跟踪代码,才知道它是在下载模型。
|
|
|
|
|
|
|
|
|
|
- 修改源码:
|
|
|
|
|
|
|
|
|
|
```cmd
|
|
|
|
|
D:\anaconda3\envs\raganything\Lib\site-packages\raganything\mineru_parser.py
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
```python
|
|
|
|
|
# 62行
|
|
|
|
|
@staticmethod
|
|
|
|
|
def _run_mineru_command(
|
|
|
|
|
input_path: Union[str, Path],
|
|
|
|
|
output_dir: Union[str, Path],
|
|
|
|
|
method: str = "auto",
|
|
|
|
|
lang: Optional[str] = None,
|
|
|
|
|
backend: str = "pipeline",
|
|
|
|
|
start_page: Optional[int] = None,
|
|
|
|
|
end_page: Optional[int] = None,
|
|
|
|
|
formula: bool = True,
|
|
|
|
|
table: bool = True,
|
|
|
|
|
device: Optional[str] = None,
|
|
|
|
|
source: str = "local", # 'huggingface' --> 'local'
|
|
|
|
|
) -> None:
|
|
|
|
|
|
|
|
|
|
# 107行
|
|
|
|
|
try:
|
|
|
|
|
result = subprocess.run(
|
|
|
|
|
cmd,
|
|
|
|
|
#capture_output=True, #注释掉这句,可以把输出打印出来
|
|
|
|
|
text=True,
|
|
|
|
|
check=True,
|
|
|
|
|
encoding="utf-8",
|
|
|
|
|
errors="ignore",
|
|
|
|
|
)
|
|
|
|
|
print("MinerU command executed successfully")
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#### 五、相关资料
|
|
|
|
|
|
|
|
|
|