'commit'

9 months ago · 8b20fdc4cd
parent a15751c9ce
commit 8b20fdc4cd
16 changed files with 0 additions and 533 deletions
--- a/Py/.idea/.gitignore
+++ b/Py/.idea/.gitignore
@@ -1,8 +0,0 @@
-# 默认忽略的文件
-/shelf/
-/workspace.xml
-# Editor-based HTTP Client requests
-/httpRequests/
-# Datasource local storage ignored files
-/dataSources/
-/dataSources.local.xml
--- a/Py/.idea/Py.iml
+++ b/Py/.idea/Py.iml
@@ -1,8 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<module type="PYTHON_MODULE" version="4">
-  <component name="NewModuleRootManager">
-    <content url="file://$MODULE_DIR$" />
-    <orderEntry type="jdk" jdkName="Python 3.10 (py310)" jdkType="Python SDK" />
-    <orderEntry type="sourceFolder" forTests="false" />
-  </component>
-</module>
--- a/Py/.idea/encodings.xml
+++ b/Py/.idea/encodings.xml
@@ -1,6 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<project version="4">
-  <component name="Encoding">
-    <file url="file://$PROJECT_DIR$/app.log" charset="GBK" />
-  </component>
-</project>
--- a/Py/.idea/inspectionProfiles/Project_Default.xml
+++ b/Py/.idea/inspectionProfiles/Project_Default.xml
@@ -1,14 +0,0 @@
-<component name="InspectionProjectProfileManager">
-  <profile version="1.0">
-    <option name="myName" value="Project Default" />
-    <inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
-      <option name="ignoredPackages">
-        <value>
-          <list size="1">
-            <item index="0" class="java.lang.String" itemvalue="st-chat" />
-          </list>
-        </value>
-      </option>
-    </inspection_tool>
-  </profile>
-</component>
--- a/Py/.idea/inspectionProfiles/profiles_settings.xml
+++ b/Py/.idea/inspectionProfiles/profiles_settings.xml
@@ -1,6 +0,0 @@
-<component name="InspectionProjectProfileManager">
-  <settings>
-    <option name="USE_PROJECT_PROFILE" value="false" />
-    <version value="1.0" />
-  </settings>
-</component>
--- a/Py/.idea/misc.xml
+++ b/Py/.idea/misc.xml
@@ -1,7 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<project version="4">
-  <component name="Black">
-    <option name="sdkName" value="Python 3.10 (JianYingApi-main)" />
-  </component>
-  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (py310)" project-jdk-type="Python SDK" />
-</project>
--- a/Py/.idea/modules.xml
+++ b/Py/.idea/modules.xml
@@ -1,8 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<project version="4">
-  <component name="ProjectModuleManager">
-    <modules>
-      <module fileurl="file://$PROJECT_DIR$/.idea/Py.iml" filepath="$PROJECT_DIR$/.idea/Py.iml" />
-    </modules>
-  </component>
-</project>
--- a/Py/.idea/vcs.xml
+++ b/Py/.idea/vcs.xml
@@ -1,6 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<project version="4">
-  <component name="VcsDirectoryMappings">
-    <mapping directory="$PROJECT_DIR$/.." vcs="Git" />
-  </component>
-</project>
--- a/Py/DoAreaDocx.py
+++ b/Py/DoAreaDocx.py
@@ -1,150 +0,0 @@
-import os
-import time
-
-import win32com
-from win32com.client import Dispatch
-import re
-import logging
-
-# pip install pywin32 openpyxl
-# pip install pywin32
-
-logging.basicConfig(
-    level=logging.DEBUG,  # 设置日志级别
-    filename='app.log',  # 设置日志文件名
-    filemode='w',  # 文件模式，'w'表示写模式，每次运行都会覆盖旧文件；'a'表示追加模式
-    format='%(name)s - %(levelname)s - %(message)s'  # 设置日志格式
-)
-
-working_dir = r"D:\dsWork\YunNanDsBase\Doc\全省及州市县区人口与教育报告集20241023\133个县区报告2022\县区研究报告"
-import openpyxl
-
-# 声明Word应用程序
-docApp = win32com.client.Dispatch('Word.Application')
-docApp.Visible = True
-docApp.DisplayAlerts = 0
-
-# 有问题的县区列表
-errorArea = []
-# 读取ErrorArea.txt,将每一行的县区名称读入列表中
-with open('ErrorArea.txt', 'r', encoding='utf-8') as f:
-    for line in f:
-        # 去除每行前后的空白字符，包括空格、制表符和换行符
-        line = line.strip()
-        # 将文本中的关键字替换为空字符串
-        errorArea.append(line)
-
-# 在工作目录下创建Excel目录
-excel_dir = r'D:\dsWork\YunNanDsBase\Doc\全省及州市县区人口与教育报告集20241023\133个县区报告2022\Excel'
-if not os.path.exists(excel_dir):
-    os.mkdir(excel_dir)
-# 遍历working_dir目录下的所有子文件夹
-for root, dirs, files in os.walk(working_dir):
-    for dir in dirs:
-        # 获取县区名称
-        county_name = dir
-
-        # 获取县区文件夹路径
-        county_dir = os.path.join(root, dir)
-        # 遍历县区文件夹下的所有文件
-        for file in os.listdir(county_dir):
-            # 获取文件路径
-            file_path = os.path.join(county_dir, file)
-            cityName = ""
-            # 判断文件是否是Word文档
-            if file_path.endswith('.docx') and not file.startswith('~'):
-                cityName = file_path.replace(working_dir, '')[1:].split("各县")[0]
-                areaName = file
-
-                areaName = re.sub(r'[^\u4e00-\u9fa5]', '', areaName)
-                if '市' not in areaName and '县' not in areaName and '区' not in areaName:
-                    continue
-                # 打开文件文件，按行读取
-                with open('replaceBlank.txt', 'r', encoding='utf-8') as f:
-                    for line in f:
-                        # 去除每行前后的空白字符，包括空格、制表符和换行符
-                        line = line.strip()
-                        # 将文本中的关键字替换为空字符串
-                        areaName = areaName.replace(line, '')
-                # 打开文件文件，按行读取
-                with open('replaceText.txt', 'r', encoding='utf-8') as f:
-                    for line in f:
-                        # 去除每行前后的空白字符，包括空格、制表符和换行符
-                        line = line.strip()
-                        # 将文本中的关键字替换为空字符串
-                        areaName = areaName.replace(line.split(' ')[0], line.split(' ')[1])
-                # 检查Excel目录下是不是存在这个城市的文件夹，如果不存在，则创建
-                city_dir = os.path.join(excel_dir, cityName)
-                if not os.path.exists(city_dir):
-                    os.mkdir(city_dir)
-
-                # 在城市文件夹下，查看是不是存在县区的子文件夹，如果不存在则创建
-                county_sub_dir = os.path.join(city_dir, areaName)
-                if not os.path.exists(county_sub_dir):
-                    os.mkdir(county_sub_dir)
-                else:  # 如果存在，就跳过
-                    # 查看一下这个文件夹下有多少个文件
-                    file_count = len([name for name in os.listdir(county_sub_dir)])
-                    if file_count > 10:
-                        print(county_sub_dir + " 文件夹下有超过10个文件，跳过")
-                        continue
-                # 跳过错误县区
-                flag = False
-                for e in errorArea:
-                    if e in areaName:
-                        flag = True
-                if flag:
-                    print(county_sub_dir + " 跳过")
-                    continue
-                print("正在处理" + cityName + "-" + areaName + "...")
-                # 使用word读取图表的技术，保存EXCEL文件到城市的目录下
-                # 休息3秒，防止WORD打开频繁造成错误
-                time.sleep(3)
-                doc = docApp.Documents.Open(file_path)
-                # 遍历文档中所有的文字段落,判断是不是以 图+数字开头
-                idx = 1
-                # 图表的名称列表
-                tb_list = []
-                for para in doc.Paragraphs:
-                    x = para.Range.Text.strip().replace("图 ", "图").replace("  ", " ")
-                    if x.startswith("图"):
-                        tb_list.append(x)
-                        idx = idx + 1
-
-                # 遍历文档中的所有内嵌形状
-                idx = 1
-                for inline_shape in doc.InlineShapes:
-                    if inline_shape.Type == win32com.client.constants.wdInlineShapeChart:
-                        shape = doc.InlineShapes(idx)
-                        sheet = shape.Chart.ChartData.Workbook.Worksheets(1)
-                        # 创建一个新的Excel工作簿
-                        wb = openpyxl.Workbook()
-                        ws = wb.active
-
-                        # 遍历Excel工作表中的所有单元格，并将其写入新的工作簿
-                        for row in range(1, sheet.UsedRange.Rows.Count + 1):
-                            for col in range(1, sheet.UsedRange.Columns.Count + 1):
-                                cell_value = sheet.Cells(row, col).Value
-                                ws.cell(row=row, column=col, value=cell_value)
-                        # 保存新的Excel文件
-                        try:
-                            original_string = tb_list[idx - 1]
-                            # 使用正则表达式过滤，只保留中文、英文和数字
-                            original_string = original_string[1:]
-                            if ' ' in original_string:
-                                original_string = original_string.split(" ")[1]
-                            filtered_string = re.sub(r'[^\u4e00-\u9fa5a-zA-Z0-9]', '', original_string)
-                            fileName = '【' + str(idx) + '】' + filtered_string + ".xlsx"
-
-                            wb.save(county_sub_dir + '/' + fileName)
-                        except:
-                            pass
-                        wb.close()
-                        print("保存文件：" + fileName)
-                        # 下一个图表的索引号
-                        idx = idx + 1
-                # 关闭文档和Word应用
-                doc.Close()
-                print(f"县区处理完成：{cityName}{areaName}")
-docApp.Quit()
-print("恭喜，所有县区数据整理工作成功完成！")
--- a/Py/DoCityDocx.py
+++ b/Py/DoCityDocx.py
@@ -1,84 +0,0 @@
-import os
-import win32com
-from win32com.client import Dispatch
-import re
-
-# pip install pywin32 openpyxl
-# pip install pywin32
-working_dir = r"D:/dsWork/YunNanDsBase/Doc/全省及州市县区人口与教育报告集20241023/16个州市报告2022/分析报告20240510/"
-import openpyxl
-
-# 在工作目录下创建Excel目录
-excel_dir = working_dir + 'Excel'
-if not os.path.exists(excel_dir):
-    os.mkdir(excel_dir)
-
-# 关键词
-keyword = '人口变化及其对教育的影响'
-
-# 是不是打Word显示
-docApp = win32com.client.Dispatch('Word.Application')
-# 是不是打Word显示
-docApp.Visible = False
-docApp.DisplayAlerts = 0
-
-# 遍历工作目录下所有的docx文件，将文件名用keyword进行分隔，前一半是州市名称，后一半是上报的时间，我们取前一半的州市名称
-for file in os.listdir(working_dir):
-    if file.endswith('.docx') and not file.startswith('~'):
-        file_name = file.split('.')[0]
-        # 判断一下file_name中是不是存在keyword,如果不存在，则输出错误，并结束程序
-        if keyword not in file_name:
-            print('Error: ' + file_name + ' 文件名称中并不包含:' + keyword)
-            exit()
-        # 确认包含后，提取出前半部分作为城市名称
-        city_name = file_name.split(keyword)[0]
-        # 在excel_dir目录下创建这个城市的子目录，准备将生成的excel文件放在这个子目录下
-        city_dir = excel_dir + '/' + city_name
-        if not os.path.exists(city_dir):
-            os.mkdir(city_dir)
-        # 将当前docx进行读取其中的每一个段落，要求以 "图"+数字开头，这是图例的意思
-        doc_path = working_dir + '/' + file
-        # print(doc_path)
-        doc = docApp.Documents.Open(doc_path)
-        # 遍历文档中所有的文字段落,判断是不是以 图+数字开头
-        idx = 1
-        # 图表的名称列表
-        tb_list = []
-        for para in doc.Paragraphs:
-            x = para.Range.Text.strip().replace("图 ", "图").replace("  ", " ")
-            if x.startswith("图"):
-                tb_list.append(x)
-                idx = idx + 1
-
-        # 遍历文档中的所有内嵌形状
-        idx = 1
-        for inline_shape in doc.InlineShapes:
-            if inline_shape.Type == win32com.client.constants.wdInlineShapeChart:  # 检查是否为内嵌图表
-                shape = doc.InlineShapes(idx)
-                sheet = shape.Chart.ChartData.Workbook.Worksheets(1)
-                # 创建一个新的Excel工作簿
-                wb = openpyxl.Workbook()
-                ws = wb.active
-
-                # 遍历Excel工作表中的所有单元格，并将其写入新的工作簿
-                for row in range(1, sheet.UsedRange.Rows.Count + 1):
-                    for col in range(1, sheet.UsedRange.Columns.Count + 1):
-                        cell_value = sheet.Cells(row, col).Value
-                        ws.cell(row=row, column=col, value=cell_value)
-                # 保存新的Excel文件
-                original_string = tb_list[idx - 1]
-                # 使用正则表达式过滤，只保留中文、英文和数字
-                original_string = original_string[1:]
-                if ' ' in original_string:
-                    original_string = original_string.split(" ")[1]
-                filtered_string = re.sub(r'[^\u4e00-\u9fa5a-zA-Z0-9]', '', original_string)
-                fileName = '【' + str(idx) + '】' + filtered_string + ".xlsx"
-                wb.save(city_dir + '/' + fileName)
-                print("保存文件：" + fileName)
-                # 下一个图表的索引号
-                idx = idx + 1
-        # print(idx - 1)
-        # 关闭文档和Word应用
-        doc.Close()
-docApp.Quit()
-print("恭喜，所有市州数据整理工作成功完成！")
--- a/Py/ErrorArea.txt
+++ b/Py/ErrorArea.txt
@@ -1,12 +0,0 @@
-寻甸县
-禄劝县
-嵩明县
-富民县
-宁蒗县
-永胜县
-洱源县
-文山市
-西山区
-昭阳区
-鲁甸县
-宁洱县
--- a/Py/Test/DocxTuBiaoAdd.py
+++ b/Py/Test/DocxTuBiaoAdd.py
@@ -1,61 +0,0 @@
-# pip install pywin32
-# https://blog.csdn.net/weixin_42927998/article/details/115086797
-import win32com
-from win32com.client import Dispatch
-
-docApp = win32com.client.Dispatch('Word.Application')
-docApp.Visible = True
-docApp.DisplayAlerts = 0
-doc = docApp.Documents.Open('c:/1.docx')
-
-# 创建图表，图表的插入位置为预先在word文档中插入的书签，书签名为“插入图表位置”
-shape_chart = doc.Shapes.AddChart2(Style=201, Type=51, Top=doc.Bookmarks("插入图表位置").Select())
-shape_chart.WrapFormat.Type = 7  # 设置图表为嵌入型
-
-# 设置Word中的图表
-chart = shape_chart.Chart
-# 图表数据对应的工作表
-worksheet = chart.ChartData.Workbook.Worksheets(1)
-chart.SetSourceData("Sheet1!$A$1:$C$4")  # 设置数据源范围
-
-# 簇状柱形图测试数据
-chart_data = [["", "系列A", "系列B", "系列C", "系列D"],
-              [2020, 2, 4, 2, 3],
-              [2019, 4, 5, 3, 2]]
-
-# 清空工作表默认数据
-worksheet.Range("A1:D5").value = None
-
-# 填入测试数据
-for row_index, row in enumerate(chart_data):
-    for column_index, value in enumerate(row):
-        worksheet.Cells(row_index + 1, column_index + 1).Value = value
-
-chart.SetSourceData("Sheet1!$A$1:$E$3")  # 设置数据源范围
-
-# 设置图表样式示例
-chart.ChartTitle.Text = '测试标题'  # 设置标题
-chart.FullSeriesCollection(2).Format.Fill.ForeColor.ObjectThemeColor = 10  # 设置系列2的填充颜色
-
-chart.ChartData.Workbook.Close()  # 关闭workbook窗口
-
-doc.Save()
-doc.Close()
-docApp.Quit()
-
-
-'''
-Type
-1：柱形图（Column）
-2：折线图（Line）
-3：饼图（Pie）
-51：堆叠柱形图（Stacked Column）
-52：堆叠线图（Stacked Line）
-53：堆叠区域图（Stacked Area）
-55：雷达图（Radar）
-65：树状图（Treemap）
-73：旭日图（Sunburst）
-77：水桶图（Funnel）
-109：散点图（Scatter）
-183：气泡图（Bubble）
-'''
--- a/Py/Test/DocxTuBiaoRead.py
+++ b/Py/Test/DocxTuBiaoRead.py
@@ -1,51 +0,0 @@
-# pip install pywin32
-# https://blog.csdn.net/weixin_42927998/article/details/115086797
-import win32com
-from win32com.client import Dispatch
-
-docApp = win32com.client.Dispatch('Word.Application')
-# 是不是打Word显示
-docApp.Visible = False
-docApp.DisplayAlerts = 0
-working_dir = r"D:/dsWork/YunNanDsBase/Doc/全省及州市县区人口与教育报告集20241023/16个州市报告2022/分析报告20240510/"
-
-# doc = docApp.Documents.Open('c:/1.docx')
-# doc = docApp.Documents.Open('c:/昭通市人口变化及其对教育的影响20240416.docx')
-# doc = docApp.Documents.Open('c:/昆明市人口变化及其对教育的影响20240419.docx')
-doc = docApp.Documents.Open(working_dir+'红河哈尼族彝族自治州人口变化及其对教育的影响20240419.docx')
-
-
-# 遍历文档中所有的文字段落,判断是不是以 图+数字开头
-idx = 1
-for para in doc.Paragraphs:
-    x = para.Range.Text.strip().replace("图 ", "图").replace("  ", " ")
-    if x.startswith("图"):
-        print(x)
-        idx = idx + 1
-
-# 遍历文档中的所有内嵌形状
-idx = 1
-for inline_shape in doc.InlineShapes:
-    if inline_shape.Type == win32com.client.constants.wdInlineShapeChart:  # 检查是否为内嵌图表
-        shape = doc.InlineShapes(idx)
-        # 获取图表的标题,此项目中图表没有标题
-        # print(shape.Chart.ChartTitle.Text)
-        sheet = shape.Chart.ChartData.Workbook.Worksheets("Sheet1")
-        # 行数
-        row_size = sheet.UsedRange.rows.Count
-        # 列数
-        col_size = sheet.UsedRange.columns.Count
-        # 遍历获取表格中的数据
-        for i in range(1, row_size + 1):
-            for j in range(1, col_size + 1):
-                print(sheet.Cells(i, j).Value, end=" ")
-            print("")
-        print("")
-        # 下一个图表的索引号
-        idx = idx + 1
-print(idx-1)
-
-
-# 关闭文档和Word应用
-doc.Close()
-docApp.Quit()
--- a/Py/YunNan.py
+++ b/Py/YunNan.py
@@ -1,68 +0,0 @@
-# pip install pymysql
-# pip install requests beautifulsoup4
-
-# 查看结果
-# select * from t_dm_area where province_id='FD61813E-70A1-42AB-9A8E-141ED4D47B98' order by level_id;
-
-import time
-
-import pymysql
-import requests
-from bs4 import BeautifulSoup
-import re
-
-if __name__ == '__main__':
-    # 遍历 mysql数据库，然后开启爬虫
-    # 建立数据库连接
-    conn = pymysql.connect(
-        host='10.10.14.203',  # 主机名（或IP地址）
-        port=3306,  # 端口号，默认为3306
-        user='root',  # 用户名
-        password='Password123@mysql',  # 密码
-        charset='utf8mb4'  # 设置字符编码
-    )
-
-    # 创建游标对象
-    cursor = conn.cursor()
-    # 选择数据库
-    conn.select_db("ds_db")
-    # 执行查询操作
-    cursor.execute(
-        "SELECT id,full_name FROM t_dm_area where province_id='FD61813E-70A1-42AB-9A8E-141ED4D47B98' order by level_id")
-
-    # 获取查询结果，返回元组
-    result: tuple = cursor.fetchall()
-
-    for e in result:
-        id = e[0]
-        area_name = e[1]
-        url = "https://baike.baidu.com/item/" + area_name + "?fromModule=lemma_search-box"
-
-        print(url)
-        # 发送HTTP GET请求
-        response = requests.get(url)
-        # 检查请求是否成功
-        if response.status_code == 200:
-            # 使用BeautifulSoup解析HTML内容
-            soup = BeautifulSoup(response.text, 'html.parser')
-            # 假设我们要抓取的是<h1>标签中的文字
-            # 你可以根据需要修改选择器来抓取不同的内容
-            specific_divs = soup.select('div.para_YYuCh.summary_nfAdr.MARK_MODULE')
-            # 遍历找到的所有特定div标签，并打印它们的文本内容
-            for div in specific_divs:
-                text = div.get_text(strip=True)  # 使用get_text()方法获取文本，并去除
-                # 使用正则表达式移除所有形如[数字]和[数字-数字]的字符串
-                cleaned_text = re.sub(r'\[\d+(?:-\d+)?\]', '', text)
-                sql = "update t_dm_area set memo=%s where id=%s"
-                cursor.execute(sql, (cleaned_text, id))
-                conn.commit()
-                print("更新"+area_name+"数据成功")
-                break
-        else:
-            print('Failed to retrieve the webpage')
-
-        time.sleep(2)
-    # 关闭游标和连接
-    cursor.close()
-    conn.close()
-    print("结束")
--- a/Py/replaceBlank.txt
+++ b/Py/replaceBlank.txt
@@ -1,33 +0,0 @@
-人口变化及其对教育的影响
-辖区人口变化趋势对基础教育的影响
-样稿
-市教育数据统计
-区报告
-人口变化趋势对基础教育的影响修改
-人口变化趋势对基础教育的影响审稿
-人口变化趋势对基础教育的影响陈副改终稿
-县区最终版
-人口变化趋势对基础教育的影响
-人口变化及其对基础教育影响的报告
-修改终稿
-人口变化及其对教育影响的报告
-正确
-附件
-定稿
-省级课题
-人口变化及其对基础教育的影响报告
-县区
-人口变化对教育的影响
-报告
-研究报告
-文本
-修改稿
-已审核
-已经审核
-报告
-总人口数常住人口数统计局提供
-初稿
-人口变化及其对教育影响的研究
-人口变化趋势对基础教育影响的研究
-研究
-的
--- a/Py/replaceText.txt
+++ b/Py/replaceText.txt
@@ -1,11 +0,0 @@
-县县 县
-曲靖市马龙区 马龙区
-曲靖市麒麟区 麒麟区
-曲靖市沾益区 沾益区
-江城江城县 江城县
-墨江 墨江县
-盐津 盐津县
-盈江 盈江县
-芒市 芒市县
-宾川 宾川县
-镇康 镇康县