# dsProject/dsLightRag/Dsideal/YunXiao/Fetch.py
# Install the required libraries first: pip install tqdm requests
import os
import json
import threading
import requests
from tqdm import tqdm


def fetch_lesson_data(page_num=1, page_size=10, search_keyword=""):
    """
    Send a POST request to fetch on-demand lesson data from the cloud classroom.

    Args:
        page_num: page number, defaults to 1
        page_size: number of records per page, defaults to 10
        search_keyword: search keyword, defaults to an empty string

    Returns:
        The parsed JSON data, or None if the request fails.
    """
    # API URL
    url = "https://yx.ccsjy.cn/api/cloud-school/v1/cloudLesson/getOnDemandLessonPage"
    # Request parameters
    payload = {
        "pageNum": page_num,
        "pageSize": page_size,
        "businessBookId": "3029115d4afa424f8160adb04bd10e6a",
        "businessEditionId": "19F93E4A7C9B4B589EB001FBFEE6230A",
        "excellentFlag": "",
        "nodeId": None,
        "nodeType": 1,
        "stageCode": "3",
        "subjectCode": "314",
        "sortType": 2,
        "source": "",
        "searchKeyword": search_keyword
    }
    try:
        # Send the POST request
        response = requests.post(url, json=payload)
        # Check the response status
        if response.status_code == 200:
            # Parse and return the JSON response
            return response.json()
        else:
            print(f"Request failed, status code: {response.status_code}")
            return None
    except Exception as e:
        print(f"Request error: {str(e)}")
        return None
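
# Note: the code below assumes the response JSON has the shape
#   {"data": {"total": <int>, "rows": [{"lessonId": ..., "lessonName": ...,
#    "lessonResources": [{"fileUrl": ...}, ...]}, ...]}}
# This shape is inferred from how the result is consumed in the __main__ block,
# not from official API documentation.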


# Multi-threaded download helper
class MultiThreadDownloader:
    def __init__(self, url, file_path, num_threads=4):
        self.url = url
        self.file_path = file_path
        self.num_threads = num_threads
        self.total_size = 0
        self.progress = None

    def get_file_size(self):
        response = requests.head(self.url, allow_redirects=True)
        if 'content-length' in response.headers:
            self.total_size = int(response.headers['content-length'])
            return True
        return False

    def download_chunk(self, start, end, thread_id):
        headers = {'Range': f'bytes={start}-{end}'}
        response = requests.get(self.url, headers=headers, stream=True)
        chunk_size = 1024
        with open(f'{self.file_path}.part{thread_id}', 'wb') as f:
            for data in response.iter_content(chunk_size=chunk_size):
                f.write(data)
                if self.progress:
                    self.progress.update(len(data))

    def merge_chunks(self):
        with open(self.file_path, 'wb') as f:
            for i in range(self.num_threads):
                part_file = f'{self.file_path}.part{i}'
                if os.path.exists(part_file):
                    with open(part_file, 'rb') as pf:
                        f.write(pf.read())
                    os.remove(part_file)

    def start(self):
        # Make sure the target directory exists
        os.makedirs(os.path.dirname(self.file_path), exist_ok=True)
        if not self.get_file_size():
            print(f"Unable to determine file size, falling back to single-threaded download: {self.url}")
            # Fall back to a single-threaded download
            try:
                response = requests.get(self.url, stream=True)
                with open(self.file_path, 'wb') as f:
                    total_size = int(response.headers.get('content-length', 0))
                    with tqdm(total=total_size, unit='B', unit_scale=True,
                              desc=os.path.basename(self.file_path)) as pbar:
                        for data in response.iter_content(chunk_size=1024):
                            f.write(data)
                            pbar.update(len(data))
                return True
            except Exception as e:
                print(f"Single-threaded download failed: {str(e)}")
                return False
        # Multi-threaded download: split the file into byte ranges, one per thread
        chunk_size = self.total_size // self.num_threads
        self.progress = tqdm(total=self.total_size, unit='B', unit_scale=True,
                             desc=os.path.basename(self.file_path))
        threads = []
        for i in range(self.num_threads):
            start = i * chunk_size
            end = self.total_size - 1 if i == self.num_threads - 1 else (i + 1) * chunk_size - 1
            thread = threading.Thread(target=self.download_chunk, args=(start, end, i))
            threads.append(thread)
            thread.start()
        # Wait for all threads to finish
        for thread in threads:
            thread.join()
        self.progress.close()
        self.merge_chunks()
        return True
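

# Optional sketch (not used above): the chunked download assumes the server honors
# HTTP Range requests; if the server ignores the Range header and returns the full
# body for every part, the merged file would be corrupted. A quick heuristic check,
# using only the standard requests API:
def server_supports_ranges(url):
    """Return True if a HEAD request advertises byte-range support."""
    try:
        response = requests.head(url, allow_redirects=True)
        return response.headers.get('Accept-Ranges', '').lower() == 'bytes'
    except Exception:
        return False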


# Script entry point
if __name__ == "__main__":
    # Create the download directory
    down_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "Down")
    os.makedirs(down_dir, exist_ok=True)
    # Get the total record count
    first_page = fetch_lesson_data(page_num=1, page_size=1)
    if first_page:
        total = first_page['data']['total']
        page_size = 10  # fetch 10 records per page
        total_pages = (total + page_size - 1) // page_size
        print(f"Fetch succeeded, {total} records in total")
        # Iterate over all pages
        all_lessons = []
        for page_num in range(1, total_pages + 1):
            print(f"Fetching page {page_num}/{total_pages}...")
            result = fetch_lesson_data(page_num=page_num, page_size=page_size)
            if result and result['data']['rows']:
                all_lessons.extend(result['data']['rows'])
            else:
                print(f"Failed to fetch page {page_num}")
        print(f"Successfully fetched {len(all_lessons)} lesson records")
        # Prepare to record download information
        download_info = []
        success_count = 0
        fail_count = 0
        # Iterate over all lessons
        for idx, lesson in enumerate(all_lessons, 1):
            lesson_id = lesson.get('lessonId')
            lesson_name = lesson.get('lessonName')
            print(f"\nProcessing lesson {idx}/{len(all_lessons)}: {lesson_name}")
            # Extract the video URL
            lesson_resources = lesson.get('lessonResources', [])
            video_url = None
            safe_name = None
            if lesson_resources:
                # Assume the first resource is the video
                file_url = lesson_resources[0].get('fileUrl')
                if file_url:
                    video_url = "https://ccschool.edusoa.com/" + file_url
                    # Use the last path segment as a safe file name
                    safe_name = file_url.split('/')[-1]
            if video_url:
                file_path = os.path.join(down_dir, safe_name)
                # Skip the download if the file already exists
                if os.path.exists(file_path):
                    print(f"File already exists, skipping download: {file_path}")
                    download_info.append({
                        "course_name": lesson_name,
                        "course_id": lesson_id,
                        "file_path": file_path,
                        "status": "already exists"
                    })
                    continue
                # Start the download
                print(f"Starting video download: {video_url}")
                downloader = MultiThreadDownloader(video_url, file_path)
                if downloader.start():
                    print(f"Video downloaded successfully: {file_path}")
                    download_info.append({
                        "course_name": lesson_name,
                        "course_id": lesson_id,
                        "file_path": file_path,
                        "status": "success"
                    })
                    success_count += 1
                else:
                    print(f"Video download failed: {video_url}")
                    download_info.append({
                        "course_name": lesson_name,
                        "course_id": lesson_id,
                        "file_path": file_path,
                        "status": "failed"
                    })
                    fail_count += 1
            else:
                print("No video URL found")
                download_info.append({
                    "course_name": lesson_name,
                    "course_id": lesson_id,
                    "file_path": "",
                    "status": "no video URL"
                })
                fail_count += 1
            # Save the download information every 5 lessons
            if idx % 5 == 0 or idx == len(all_lessons):
                info_file = os.path.join(down_dir, "download_info.json")
                with open(info_file, 'w', encoding='utf-8') as f:
                    json.dump(download_info, f, ensure_ascii=False, indent=2)
                print(f"Download information saved to: {info_file}")
        print(f"\nDownload finished! Success: {success_count}, failed: {fail_count}")
    else:
        print("Failed to fetch lesson data")