@ -1,12 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<module type="PYTHON_MODULE" version="4">
|
||||
<component name="NewModuleRootManager">
|
||||
<content url="file://$MODULE_DIR$" />
|
||||
<orderEntry type="jdk" jdkName="D:\anaconda3\envs\raganything" jdkType="Python SDK" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
</component>
|
||||
<component name="PyDocumentationSettings">
|
||||
<option name="format" value="PLAIN" />
|
||||
<option name="myDocStringFormat" value="Plain" />
|
||||
</component>
|
||||
</module>
|
@ -1,10 +1,6 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="Black">
|
||||
<option name="sdkName" value="D:\anaconda3\envs\raganything" />
|
||||
</component>
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="D:\anaconda3\envs\raganything" project-jdk-type="Python SDK" />
|
||||
<component name="PythonCompatibilityInspectionAdvertiser">
|
||||
<option name="version" value="3" />
|
||||
<option name="sdkName" value="D:\anaconda3\envs\py310" />
|
||||
</component>
|
||||
</project>
|
@ -1,6 +1,6 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="VcsDirectoryMappings">
|
||||
<mapping directory="$PROJECT_DIR$/.." vcs="Git" />
|
||||
<mapping directory="$PROJECT_DIR$/MinerU" vcs="Git" />
|
||||
</component>
|
||||
</project>
|
@ -1,28 +0,0 @@
|
||||
repos:
|
||||
- repo: https://github.com/pre-commit/pre-commit-hooks
|
||||
rev: v5.0.0
|
||||
hooks:
|
||||
- id: trailing-whitespace
|
||||
exclude: ^lightrag/api/webui/
|
||||
- id: end-of-file-fixer
|
||||
exclude: ^lightrag/api/webui/
|
||||
- id: requirements-txt-fixer
|
||||
exclude: ^lightrag/api/webui/
|
||||
|
||||
|
||||
- repo: https://github.com/astral-sh/ruff-pre-commit
|
||||
rev: v0.6.4
|
||||
hooks:
|
||||
- id: ruff-format
|
||||
exclude: ^lightrag/api/webui/
|
||||
- id: ruff
|
||||
args: [--fix, --ignore=E402]
|
||||
exclude: ^lightrag/api/webui/
|
||||
|
||||
|
||||
- repo: https://github.com/mgedmin/check-manifest
|
||||
rev: "0.49"
|
||||
hooks:
|
||||
- id: check-manifest
|
||||
stages: [manual]
|
||||
exclude: ^lightrag/api/webui/
|
@ -1,21 +0,0 @@
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2025 ✨Data Intelligence Lab@HKU✨
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
@ -1,9 +0,0 @@
|
||||
include requirements.txt
|
||||
include README.md
|
||||
include README_zh.md
|
||||
include LICENSE
|
||||
recursive-include raganything *.py
|
||||
recursive-include examples *.py
|
||||
global-exclude *.pyc
|
||||
global-exclude __pycache__
|
||||
global-exclude *.egg-info
|
@ -1,106 +0,0 @@
|
||||
import sys
|
||||
from raganything import RAGAnything
|
||||
|
||||
|
||||
def office_document_parsing(file_path: str):
    """Parse an Office document with RAGAnything and print a summary of the result.

    Args:
        file_path: Location of the document to parse, given as a string
            (any ``os.PathLike`` is accepted as well).

    Returns:
        bool: True when parsing completed and the summary was printed; False
        when the extension is not supported, the file does not exist, or an
        exception was raised while parsing or summarizing.
    """
    from pathlib import Path

    # The argument arrives as a plain string, which has no .suffix or .stat();
    # convert once so the path helpers used below actually exist.
    file_path = Path(file_path)

    supported_extensions = {".doc", ".docx", ".ppt", ".pptx", ".xls", ".xlsx"}
    if file_path.suffix.lower() not in supported_extensions:
        print(f"❌ Unsupported file format: {file_path.suffix}")
        # Sorted so the message is stable across runs (set order is arbitrary).
        print(f" Supported formats: {', '.join(sorted(supported_extensions))}")
        return False

    # stat() below is outside the try block; report a missing file as a normal
    # failure instead of letting FileNotFoundError escape.
    if not file_path.is_file():
        print(f"❌ File not found: {file_path}")
        return False

    print(f"📄 File format: {file_path.suffix.upper()}")
    print(f"📏 File size: {file_path.stat().st_size / 1024:.1f} KB")

    rag = RAGAnything()

    try:
        print("\n🔄 Testing document parsing with MinerU...")
        content_list, md_content = rag.parse_document(
            file_path=str(file_path),
            output_dir="./test_output",
            parse_method="auto",
            display_stats=True,
        )

        print("✅ Parsing successful!")
        print(f" 📊 Content blocks: {len(content_list)}")
        print(f" 📝 Markdown length: {len(md_content)} characters")

        def items_of_type(kind):
            # Only dict entries are inspected; anything else is skipped.
            return [
                item
                for item in content_list
                if isinstance(item, dict) and item.get("type") == kind
            ]

        # Analyze content types
        content_types = {}
        for item in content_list:
            if isinstance(item, dict):
                content_type = item.get("type", "unknown")
                content_types[content_type] = content_types.get(content_type, 0) + 1

        if content_types:
            print(" 📋 Content distribution:")
            for content_type, count in sorted(content_types.items()):
                print(f" • {content_type}: {count}")

        # Display some parsed content preview
        if md_content.strip():
            print("\n📄 Parsed content preview (first 500 characters):")
            preview = md_content.strip()[:500]
            print(f" {preview}{'...' if len(md_content) > 500 else ''}")

        # Display some structured content examples
        text_items = items_of_type("text")
        if text_items:
            print("\n📝 Sample text blocks:")
            for i, item in enumerate(text_items[:3], 1):
                text_content = item.get("text", "")
                if text_content.strip():
                    preview = text_content.strip()[:200]
                    print(
                        f" {i}. {preview}{'...' if len(text_content) > 200 else ''}"
                    )

        # Check for images
        image_items = items_of_type("image")
        if image_items:
            print(f"\n🖼️ Found {len(image_items)} image(s):")
            for i, item in enumerate(image_items, 1):
                print(f" {i}. Image path: {item.get('img_path', 'N/A')}")

        # Check for tables
        table_items = items_of_type("table")
        if table_items:
            print(f"\n📊 Found {len(table_items)} table(s):")
            for i, item in enumerate(table_items, 1):
                table_body = item.get("table_body", "")
                row_count = len(table_body.split("\n"))
                print(f" {i}. Table with {row_count} rows")

        print("\n🎉 Office document parsing test completed successfully!")
        print("📁 Output files saved to: ./test_output")
        return True

    except Exception as e:
        # Best-effort diagnostic script: report the failure with its traceback
        # and signal it through the return value rather than re-raising.
        print(f"\n❌ Office document parsing failed: {str(e)}")
        import traceback

        print(f" Full error: {traceback.format_exc()}")
        return False
|
||||
|
||||
|
||||
def main():
    """Run the Office parsing check on one document and return an exit code.

    The document path is taken from the first command-line argument when one
    is supplied; otherwise the built-in sample path is used.

    Returns:
        int: 0 when office_document_parsing reported success, 1 otherwise, so
        that sys.exit(main()) reflects the outcome instead of always exiting 0.
    """
    default_file = r"D:\dsWork\dsProject\dsRagAnything\Txt\小学数学教学中的若干问题_MATH_1.docx"
    file = sys.argv[1] if len(sys.argv) > 1 else default_file

    return 0 if office_document_parsing(file) else 1


if __name__ == "__main__":
    sys.exit(main())
|
Before Width: | Height: | Size: 3.8 KiB |
Before Width: | Height: | Size: 18 KiB |
Before Width: | Height: | Size: 3.4 KiB |
Before Width: | Height: | Size: 3.4 KiB |
Before Width: | Height: | Size: 10 KiB |
Before Width: | Height: | Size: 9.4 KiB |
Before Width: | Height: | Size: 17 KiB |
Before Width: | Height: | Size: 3.6 KiB |
Before Width: | Height: | Size: 1.6 KiB |
Before Width: | Height: | Size: 2.4 KiB |
Before Width: | Height: | Size: 2.7 KiB |
Before Width: | Height: | Size: 4.9 KiB |
Before Width: | Height: | Size: 1.8 KiB |
Before Width: | Height: | Size: 3.2 KiB |
Before Width: | Height: | Size: 2.4 KiB |
Before Width: | Height: | Size: 1.5 KiB |
Before Width: | Height: | Size: 4.7 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 5.3 KiB |
Before Width: | Height: | Size: 2.1 KiB |
Before Width: | Height: | Size: 4.3 KiB |
Before Width: | Height: | Size: 8.6 KiB |
Before Width: | Height: | Size: 3.0 KiB |
Before Width: | Height: | Size: 3.0 KiB |
Before Width: | Height: | Size: 10 KiB |
Before Width: | Height: | Size: 3.1 KiB |
Before Width: | Height: | Size: 23 KiB |
Before Width: | Height: | Size: 1.9 KiB |
Before Width: | Height: | Size: 11 KiB |
Before Width: | Height: | Size: 28 KiB |
Before Width: | Height: | Size: 22 KiB |
Before Width: | Height: | Size: 4.2 KiB |
Before Width: | Height: | Size: 3.5 KiB |
Before Width: | Height: | Size: 25 KiB |
Before Width: | Height: | Size: 5.0 KiB |
Before Width: | Height: | Size: 1.9 KiB |
Before Width: | Height: | Size: 20 KiB |
Before Width: | Height: | Size: 60 KiB |
Before Width: | Height: | Size: 1.8 KiB |
Before Width: | Height: | Size: 7.0 KiB |
Before Width: | Height: | Size: 2.4 KiB |
Before Width: | Height: | Size: 5.4 KiB |
Before Width: | Height: | Size: 2.6 KiB |
Before Width: | Height: | Size: 3.0 KiB |
Before Width: | Height: | Size: 7.8 KiB |
Before Width: | Height: | Size: 8.9 KiB |
Before Width: | Height: | Size: 1.9 KiB |
Before Width: | Height: | Size: 30 KiB |
Before Width: | Height: | Size: 14 KiB |
Before Width: | Height: | Size: 8.2 KiB |
Before Width: | Height: | Size: 2.4 KiB |
Before Width: | Height: | Size: 5.1 KiB |
Before Width: | Height: | Size: 7.9 KiB |
Before Width: | Height: | Size: 14 KiB |
Before Width: | Height: | Size: 3.3 KiB |
Before Width: | Height: | Size: 11 KiB |
Before Width: | Height: | Size: 3.3 KiB |
Before Width: | Height: | Size: 25 KiB |
Before Width: | Height: | Size: 1.6 KiB |
Before Width: | Height: | Size: 2.0 KiB |
Before Width: | Height: | Size: 5.0 KiB |
Before Width: | Height: | Size: 2.8 KiB |
Before Width: | Height: | Size: 2.1 KiB |
Before Width: | Height: | Size: 2.1 KiB |
Before Width: | Height: | Size: 23 KiB |
Before Width: | Height: | Size: 4.1 KiB |
Before Width: | Height: | Size: 24 KiB |
Before Width: | Height: | Size: 17 KiB |
Before Width: | Height: | Size: 2.7 KiB |
Before Width: | Height: | Size: 1.3 KiB |
Before Width: | Height: | Size: 16 KiB |
Before Width: | Height: | Size: 4.2 KiB |
Before Width: | Height: | Size: 2.3 KiB |
Before Width: | Height: | Size: 82 KiB |
Before Width: | Height: | Size: 2.2 KiB |
Before Width: | Height: | Size: 2.8 KiB |
Before Width: | Height: | Size: 7.9 KiB |
Before Width: | Height: | Size: 9.1 KiB |
Before Width: | Height: | Size: 2.3 KiB |
Before Width: | Height: | Size: 2.6 KiB |
Before Width: | Height: | Size: 13 KiB |
Before Width: | Height: | Size: 1.9 KiB |
Before Width: | Height: | Size: 19 KiB |
Before Width: | Height: | Size: 3.7 KiB |
Before Width: | Height: | Size: 1.4 KiB |
Before Width: | Height: | Size: 21 KiB |