diff --git a/dsLightRag/.idea/dsLightRag.iml b/dsLightRag/.idea/dsLightRag.iml index 538d1892..880d61c1 100644 --- a/dsLightRag/.idea/dsLightRag.iml +++ b/dsLightRag/.idea/dsLightRag.iml @@ -2,7 +2,7 @@ - + diff --git a/dsLightRag/.idea/misc.xml b/dsLightRag/.idea/misc.xml index 2cad77d0..0f9b3bc1 100644 --- a/dsLightRag/.idea/misc.xml +++ b/dsLightRag/.idea/misc.xml @@ -3,5 +3,5 @@ - + \ No newline at end of file diff --git a/dsLightRag/ShiTi/Docx/Source.jpg b/dsLightRag/ShiTi/Docx/Source.jpg new file mode 100644 index 00000000..a84c0151 Binary files /dev/null and b/dsLightRag/ShiTi/Docx/Source.jpg differ diff --git a/dsLightRag/ShiTi/Docx/《功和功率问题》巩固练习模板.docx b/dsLightRag/ShiTi/Docx/《功和功率问题》巩固练习模板.docx new file mode 100644 index 00000000..5847fa81 Binary files /dev/null and b/dsLightRag/ShiTi/Docx/《功和功率问题》巩固练习模板.docx differ diff --git a/dsLightRag/ShiTi/Docx/《动能定理》巩固练习.docx b/dsLightRag/ShiTi/Docx/《动能定理》巩固练习.docx new file mode 100644 index 00000000..63d953bb Binary files /dev/null and b/dsLightRag/ShiTi/Docx/《动能定理》巩固练习.docx differ diff --git a/dsLightRag/ShiTi/Docx/实验方案的设计与评价习题.doc b/dsLightRag/ShiTi/Docx/实验方案的设计与评价习题.doc new file mode 100644 index 00000000..0432791d Binary files /dev/null and b/dsLightRag/ShiTi/Docx/实验方案的设计与评价习题.doc differ diff --git a/dsLightRag/ShiTi/Docx/实验装置的组装与连接习题.docx b/dsLightRag/ShiTi/Docx/实验装置的组装与连接习题.docx new file mode 100644 index 00000000..37b79e2b Binary files /dev/null and b/dsLightRag/ShiTi/Docx/实验装置的组装与连接习题.docx differ diff --git a/dsLightRag/ShiTi/Docx/物质的制备习题.doc b/dsLightRag/ShiTi/Docx/物质的制备习题.doc new file mode 100644 index 00000000..a5349618 Binary files /dev/null and b/dsLightRag/ShiTi/Docx/物质的制备习题.doc differ diff --git a/dsLightRag/ShiTi/Res/1、源图.jpg b/dsLightRag/ShiTi/Res/1、源图.jpg new file mode 100644 index 00000000..a84c0151 Binary files /dev/null and b/dsLightRag/ShiTi/Res/1、源图.jpg differ diff --git a/dsLightRag/ShiTi/Res/2、识别出结果.md b/dsLightRag/ShiTi/Res/2、识别出结果.md new file mode 100644 index 00000000..9492305a --- 
/dev/null +++ b/dsLightRag/ShiTi/Res/2、识别出结果.md @@ -0,0 +1,28 @@ +一、单项选择题:本题共 8 小题,每小题 5 分,共 40 分。在每小题给出的四个选项中,只有一项是符合题目要求的。 + +1. 已知集合 \(A=\left\{x \mid -5 < x^{3} < 5\right\}, B=\left\{-3,-1,0,2,3\right\}\),则 \(A \cap B=\) 【答案】A +A. \(\{-1,0\}\) B. \(\{2,3\}\) C. \(\{-3,-1,0\}\) D. \(\{-1,0,2\}\) + +【解析】\(A \cap B=\{-1,0\}\),选 A。 + +2. 若 \(\frac{2}{z-1}=1+i\),则 \(z=\) 【答案】C +A. \(-1-i\) B. \(-1+i\) C. \(1-i\) D. \(1+i\) + +3. 已知向量 \(\vec{a}=(0,1)\),\(\vec{b}=(2,x)\),若 \(\vec{b} \perp (\vec{b}-4\vec{a})\),则 \(x=\) 【答案】D +A. \(-2\) B. \(-1\) C. \(1\) D. \(2\) + +【解析】\(\vec{b}-4\vec{a}=(2,x-4)\),\(\vec{b} \perp (\vec{b}-4\vec{a})\),\(\therefore \vec{b}(\vec{b}-4\vec{a})=0\), + +\(\therefore 4+x(x-4)=0\),\(\therefore x=2\),选 D。 + +4. 已知 \(\cos(\alpha+\beta)=m\),\(\tan \alpha \tan \beta=2\),则 \(\cos(\alpha-\beta)=\) 【答案】A +A. \(-3m\) B. \(-\frac{m}{3}\) C. \(\frac{m}{3}\) D. \(3m\) + +【解析】\(\left\{\begin{array}{l}\cos \alpha \cos \beta-\sin \alpha \sin \beta=m \\\frac{\sin \alpha \sin \beta}{\cos \alpha \cos \beta}=2\end{array}\right.\),\(\therefore \left\{\begin{array}{l}\sin \alpha \sin \beta=-2m \\\cos \alpha \cos \beta=-m\end{array}\right.\) + +\(\cos(\alpha-\beta)=\cos \alpha \cos \beta+\sin \alpha \sin \beta=-m-2m=-3m\),选 A。 + +5. 已知圆柱和圆锥的底面半径相等,侧面积相等,且它们的高均为 \(\sqrt{3}\),则圆锥的体积为 【答案】B +A. \(2\sqrt{3}\pi\) B. \(3\sqrt{3}\pi\) C. \(6\sqrt{3}\pi\) D. \(9\sqrt{3}\pi\) + +【解析】设它们底面半径为 \(r\),圆锥母线 \(l\),\(\therefore 2\pi r\sqrt{3}=\pi rl\),\(\therefore l=\sqrt{3}\),则圆锥的体积为 \(\frac{1}{3}\pi r^{2}h\)。 \ No newline at end of file diff --git a/dsLightRag/ShiTi/Res/3、整理后的结果.md b/dsLightRag/ShiTi/Res/3、整理后的结果.md new file mode 100644 index 00000000..b6ec30ec --- /dev/null +++ b/dsLightRag/ShiTi/Res/3、整理后的结果.md @@ -0,0 +1,61 @@ +### 题目1 +**题目序号**: 1 +**题目内容**: 已知集合 $A=\left\{x \mid -5 < x^{3} < 5\right\}, B=\left\{-3,-1,0,2,3\right\}$,则 $A \cap B=$ +**选项**: +A. $\{-1,0\}$ +B. $\{2,3\}$ +C. $\{-3,-1,0\}$ +D. 
$\{-1,0,2\}$ +**答案**: A +**解析**: $A \cap B=\{-1,0\}$,选 A。 + +--- + +### 题目2 +**题目序号**: 2 +**题目内容**: 若 $\frac{2}{z-1}=1+i$,则 $z=$ +**选项**: +A. $-1-i$ +B. $-1+i$ +C. $1-i$ +D. $1+i$ +**答案**: C + +--- + +### 题目3 +**题目序号**: 3 +**题目内容**: 已知向量 $\vec{a}=(0,1)$,$\vec{b}=(2,x)$,若 $\vec{b} \perp (\vec{b}-4\vec{a})$,则 $x=$ +**选项**: +A. $-2$ +B. $-1$ +C. $1$ +D. $2$ +**答案**: D +**解析**: $\vec{b}-4\vec{a}=(2,x-4)$,$\vec{b} \perp (\vec{b}-4\vec{a})$,$\therefore \vec{b}(\vec{b}-4\vec{a})=0$,$\therefore 4+x(x-4)=0$,$\therefore x=2$,选 D。 + +--- + +### 题目4 +**题目序号**: 4 +**题目内容**: 已知 $\cos(\alpha+\beta)=m$,$\tan \alpha \tan \beta=2$,则 $\cos(\alpha-\beta)=$ +**选项**: +A. $-3m$ +B. $-\frac{m}{3}$ +C. $\frac{m}{3}$ +D. $3m$ +**答案**: A +**解析**: $\left\{\begin{array}{l}\cos \alpha \cos \beta-\sin \alpha \sin \beta=m \\\frac{\sin \alpha \sin \beta}{\cos \alpha \cos \beta}=2\end{array}\right.$,$\therefore \left\{\begin{array}{l}\sin \alpha \sin \beta=-2m \\\cos \alpha \cos \beta=-m\end{array}\right.$ $\cos(\alpha-\beta)=\cos \alpha \cos \beta+\sin \alpha \sin \beta=-m-2m=-3m$,选 A。 + +--- + +### 题目5 +**题目序号**: 5 +**题目内容**: 已知圆柱和圆锥的底面半径相等,侧面积相等,且它们的高均为 $\sqrt{3}$,则圆锥的体积为 +**选项**: +A. $2\sqrt{3}\pi$ +B. $3\sqrt{3}\pi$ +C. $6\sqrt{3}\pi$ +D. 
"""Upload a local image to an Alibaba Cloud OSS bucket and print its URL.

Credentials are read from the ``ALY_ACCESS_KEY_ID`` and
``ALY_ACCESS_KEY_SECRET`` environment variables.
"""
import argparse
import os
from pathlib import Path
from urllib.parse import urlsplit

# SECURITY: access keys must never be committed to the repository. Any key
# pair that was previously hard-coded here has to be treated as leaked and
# rotated in the Alibaba Cloud console.
ALY_ACCESS_KEY_ID = os.environ.get("ALY_ACCESS_KEY_ID", "")
ALY_ACCESS_KEY_SECRET = os.environ.get("ALY_ACCESS_KEY_SECRET", "")
ALY_REGION = 'cn-hangzhou'
ALY_ENDPOINT = 'https://oss-cn-hangzhou.aliyuncs.com'
ALY_BUCKET_NAME = 'ylt'

# Target object key inside the bucket.
key = "HuangHai/Test/Source.jpg"

# Default file to upload, resolved relative to this script so the upload
# works on any machine and from any working directory.
LOCAL_FILE = Path(__file__).resolve().parent / "Docx" / "Source.jpg"


def upload_percent(written, total):
    """Return the upload progress as an integer percentage.

    Args:
        written: Number of bytes transferred so far.
        total: Total number of bytes to transfer; may be 0 or None.

    Returns:
        The truncated percentage. When ``total`` is 0 or None (for example
        an empty file) the transfer is reported as 100 instead of raising
        ``ZeroDivisionError``.
    """
    if not total:
        return 100
    return int(100 * (float(written) / float(total)))


def object_url(bucket, endpoint, key):
    """Build the virtual-hosted style URL of an uploaded object.

    Args:
        bucket: Bucket name.
        endpoint: OSS endpoint, with or without a scheme and trailing slash.
        key: Object key inside the bucket.

    Returns:
        ``<scheme>://<bucket>.<endpoint host>/<key>``; the scheme defaults to
        ``https`` when the endpoint does not carry one.
    """
    parts = urlsplit(endpoint)
    scheme = parts.scheme or "https"
    # Without a scheme, urlsplit() puts the host name into ``path``.
    host = (parts.netloc or parts.path).strip("/")
    return f"{scheme}://{bucket}.{host}/{key}"


def _parse_args(argv=None):
    """Parse optional overrides; every default reproduces the old constants."""
    parser = argparse.ArgumentParser(description="Upload a file to OSS.")
    parser.add_argument("--region", default=ALY_REGION)
    parser.add_argument("--bucket", default=ALY_BUCKET_NAME)
    parser.add_argument("--endpoint", default=ALY_ENDPOINT)
    parser.add_argument("--key", default=key)
    parser.add_argument("--file", type=Path, default=LOCAL_FILE)
    return parser.parse_args(argv)


def main(argv=None):
    """Upload the configured file to OSS and print the request result.

    Args:
        argv: Optional argument list; ``None`` means ``sys.argv[1:]``.
            Running without arguments uploads ``Docx/Source.jpg`` next to
            this script to the default bucket and key.

    Raises:
        SystemExit: If the credentials are missing from the environment.
    """
    # Imported here so the pure helpers above stay importable on machines
    # that do not have the OSS SDK installed.
    import alibabacloud_oss_v2 as oss

    args = _parse_args(argv)

    if not (ALY_ACCESS_KEY_ID and ALY_ACCESS_KEY_SECRET):
        raise SystemExit(
            "Set the ALY_ACCESS_KEY_ID and ALY_ACCESS_KEY_SECRET "
            "environment variables before running this script."
        )

    # Start from the SDK defaults, then plug in credentials, region and
    # (when given) an explicit endpoint.
    cfg = oss.config.load_default()
    cfg.credentials_provider = oss.credentials.StaticCredentialsProvider(
        ALY_ACCESS_KEY_ID,
        ALY_ACCESS_KEY_SECRET,
    )
    cfg.region = args.region
    if args.endpoint:
        cfg.endpoint = args.endpoint

    client = oss.Client(cfg)

    def _progress_fn(n, written, total):
        # '\r' returns to the start of the line so the percentage is
        # refreshed in place; flush because no newline is printed.
        print(f'\r上传进度:{upload_percent(written, total)}% ', end='', flush=True)

    result = client.put_object_from_file(
        oss.PutObjectRequest(
            bucket=args.bucket,
            key=args.key,
            progress_fn=_progress_fn,
        ),
        str(args.file),
    )
    # Finish the in-place progress line before printing the summary.
    print()

    # Status code, request id, MD5, ETag, CRC64 and version id of the request.
    print(f'status code: {result.status_code},'
          f' request id: {result.request_id},'
          f' content md5: {result.content_md5},'
          f' etag: {result.etag},'
          f' hash crc64: {result.hash_crc64},'
          f' version id: {result.version_id},'
          )

    print(f'文件已上传至:{object_url(args.bucket, args.endpoint, args.key)}')


if __name__ == "__main__":
    main()
"""Extract exam questions from an image with the Qwen-VL OCR model.

The image must already be reachable over HTTP (see ``IMAGE_URL``). The
recognised text is printed and saved to ``Res/2、识别出结果.md`` next to this
script. The DashScope API key is read from ``DASHSCOPE_API_KEY``.
"""
import os
from pathlib import Path

from openai import OpenAI

BASE_URL = "https://dashscope.aliyuncs.com/compatible-mode/v1"
MODEL = "qwen-vl-ocr-latest"
IMAGE_URL = "https://ylt.oss-cn-hangzhou.aliyuncs.com/HuangHai/Test/Source.jpg"
# Anchored to the script directory so the result does not depend on the
# current working directory.
OUTPUT_FILE = Path(__file__).resolve().parent / "Res" / "2、识别出结果.md"

prompt = "请提取图片中的试题"


def main():
    """Run OCR on ``IMAGE_URL`` and store the recognised text.

    Raises:
        SystemExit: If the API key is missing or the model returns no text.
    """
    # SECURITY: the key is taken from the environment; a key that was ever
    # committed to the repository must be revoked.
    api_key = os.getenv("DASHSCOPE_API_KEY")
    if not api_key:
        raise SystemExit("Set the DASHSCOPE_API_KEY environment variable.")

    client = OpenAI(api_key=api_key, base_url=BASE_URL)

    completion = client.chat.completions.create(
        model=MODEL,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": IMAGE_URL,
                        # Images with fewer pixels are upscaled (keeping the
                        # aspect ratio) until they exceed min_pixels.
                        "min_pixels": 28 * 28 * 4,
                        # Images with more pixels are downscaled (keeping the
                        # aspect ratio) until they fall below max_pixels.
                        "max_pixels": 28 * 28 * 8192
                    },
                    # qwen-vl-ocr-latest accepts a custom prompt in this text
                    # field and falls back to its default prompt otherwise;
                    # qwen-vl-ocr-1028 always uses a fixed prompt and ignores
                    # this field.
                    {"type": "text",
                     "text": prompt},
                ]
            }
        ])

    # ``content`` can be None (e.g. filtered or empty reply); writing None
    # to the file would raise TypeError.
    content = completion.choices[0].message.content
    if not content:
        raise SystemExit("The OCR model returned no text.")

    print(content)

    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        f.write(content)

    print("保存成功!")


if __name__ == "__main__":
    main()
"""Re-format the OCR output into one structured Markdown entry per question.

Reads ``Res/2、识别出结果.md``, asks the LLM to split it into number, content,
options, answer and explanation, and writes ``Res/3、整理后的结果.md``. The
DashScope API key is read from ``DASHSCOPE_API_KEY``.
"""
import os
from pathlib import Path

from openai import OpenAI, OpenAIError

BASE_URL = "https://dashscope.aliyuncs.com/compatible-mode/v1"
MODEL = "deepseek-v3"

# Anchored to the script directory so the script can be started from anywhere.
_RES_DIR = Path(__file__).resolve().parent / "Res"
SOURCE_FILE = _RES_DIR / "2、识别出结果.md"
RESULT_FILE = _RES_DIR / "3、整理后的结果.md"

PROMPT_HEADER = """
我将提供一份markdown格式的试卷,请帮我整理出每道题的以下内容:
1. 题目序号
2. 题目内容(自动识别并添加$或$$包裹数学公式)
3. 选项(如果有)
4. 答案
5. 解析

要求:
- 一道题一道题输出,不要使用表格
- 自动检测数学表达式并用$或$$正确包裹
- 确保公式中的特殊字符正确转义
- 除题目内容外,不要输出其它无关信息

内容如下:
"""


def main():
    """Send the OCR Markdown to the LLM and save the structured result.

    Raises:
        SystemExit: If the API key is missing, the API call fails, or the
            model returns no content. File errors propagate unchanged so
            they are not mistaken for API errors.
    """
    # SECURITY: the key is taken from the environment; a key that was ever
    # committed to the repository must be revoked.
    api_key = os.getenv("DASHSCOPE_API_KEY")
    if not api_key:
        raise SystemExit("Set the DASHSCOPE_API_KEY environment variable.")

    client = OpenAI(api_key=api_key, base_url=BASE_URL)

    with open(SOURCE_FILE, "r", encoding="utf-8") as f:
        text = f.read()

    # Only the API call is guarded: the error-code documentation below is
    # meaningful for API failures, not for local file problems.
    try:
        completion = client.chat.completions.create(
            model=MODEL,
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user",
                 "content": PROMPT_HEADER + text},
            ],
        )
    except OpenAIError as e:
        print(f"错误信息:{e}")
        print("请参考文档:https://help.aliyun.com/zh/model-studio/developer-reference/error-code")
        # Exit non-zero so callers and pipelines notice the failure.
        raise SystemExit(1) from e

    content = completion.choices[0].message.content
    if not content:
        raise SystemExit("The model returned no content.")

    print(content)
    with open(RESULT_FILE, "w", encoding="utf-8") as f:
        f.write(content)
    print("保存成功!")


if __name__ == "__main__":
    main()
"""Convert a .docx exercise sheet to Markdown with RAG-Anything / MinerU.

After parsing, the generated Markdown is patched so that circled numbers
render in viewers that do not support ``\\textcircled``.
"""
import asyncio
import logging
import os
import shutil
from pathlib import Path

# Root logging configuration for the whole script.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)

# Explicit handler and level for the 'lightrag' logger.
logger = logging.getLogger('lightrag')
logger.setLevel(logging.INFO)
handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
logger.addHandler(handler)

# All paths are anchored to this script's directory: the script deletes
# directories with shutil.rmtree, which must never depend on whatever the
# current working directory happens to be.
BASE_DIR = Path(__file__).resolve().parent
PARSE_METHOD = "auto"


def fix_circled_numbers(text):
    r"""Replace ``\textcircled`` with ``\enclose{circle}`` in LaTeX text.

    MinerU emits circled numbers as ``\textcircled{1}``, which does not
    render in Typora or PyCharm; ``\enclose{circle}{1}`` does.

    Args:
        text: Markdown/LaTeX source.

    Returns:
        The text with every ``\textcircled`` command renamed.
    """
    return text.replace(r'\textcircled', r'\enclose{circle}')


def markdown_output_path(output_dir, file_path, parse_method=PARSE_METHOD):
    """Return where the Markdown generated for ``file_path`` is expected.

    The layout is ``<output_dir>/<stem>/<parse_method>/<stem>.md``.

    Args:
        output_dir: Directory passed to the parser as output directory.
        file_path: Path of the parsed source document.
        parse_method: Parse method sub-directory name.

    Returns:
        A ``pathlib.Path`` to the expected Markdown file.
    """
    stem = Path(file_path).stem
    return Path(output_dir) / stem / parse_method / f"{stem}.md"


async def main():
    """Parse the document, build the index and patch the Markdown output."""
    # Imported here so the helpers above can be used without the heavy
    # RAG stack being installed.
    from raganything import RAGAnything, RAGAnythingConfig
    from Util.RagUtil import create_llm_model_func, create_vision_model_func, create_embedding_func

    # Document to process.
    file_path = BASE_DIR / "Docx" / "《动能定理》巩固练习.docx"
    working_dir = (BASE_DIR / "../../Topic/MathOcr").resolve()
    output_dir = (BASE_DIR / "../../output").resolve()

    # Start from empty output and working directories.
    shutil.rmtree(output_dir, ignore_errors=True)
    os.makedirs(output_dir, exist_ok=True)
    shutil.rmtree(working_dir, ignore_errors=True)
    os.makedirs(working_dir, exist_ok=True)

    # Index location and the content types to process.
    config = RAGAnythingConfig(
        working_dir=str(working_dir),
        mineru_parse_method=PARSE_METHOD,
        enable_image_processing=True,  # images
        enable_table_processing=True,  # tables
        enable_equation_processing=True,  # equations
    )
    # Project-specific LLM, vision and embedding callables.
    llm_model_func = create_llm_model_func()
    vision_model_func = create_vision_model_func(llm_model_func)
    embedding_func = create_embedding_func()
    rag = RAGAnything(
        config=config,
        llm_model_func=llm_model_func,
        vision_model_func=vision_model_func,
        embedding_func=embedding_func,
    )

    await rag.process_document_complete(
        file_path=str(file_path),
        output_dir=str(output_dir),
        parse_method=PARSE_METHOD,
        # MinerU-specific keyword arguments:
        lang="ch",  # document language hint (e.g. "ch", "en", "ja")
        # device="cuda:0",  # inference device: "cpu", "cuda", "cuda:0", "npu", "mps"
        # start_page=0,  # first page (0-based, PDF only)
        # end_page=10,  # last page (0-based, PDF only)
        formula=True,  # enable formula parsing
        table=True,  # enable table parsing
        backend="pipeline",  # parser backend: "pipeline", "vlm-transformers", ...
        source="local",  # model source: "huggingface", "modelscope", "local"

        # RAGAnything standard arguments:
        display_stats=True,  # print content statistics
        split_by_character=None,  # optional text split character
        doc_id=None,  # optional document id
    )

    # Patch circled numbers in the generated Markdown. The path is derived
    # from the directories used above rather than a machine-specific
    # absolute path.
    md_path = markdown_output_path(output_dir, file_path, PARSE_METHOD)
    if not md_path.is_file():
        logger.warning("Expected Markdown output not found: %s", md_path)
        return

    with open(md_path, 'r', encoding='utf-8') as f:
        content = f.read()
    fixed = fix_circled_numbers(content)
    if fixed != content:
        with open(md_path, 'w', encoding='utf-8') as f:
            f.write(fixed)


if __name__ == "__main__":
    asyncio.run(main())