main
kgdxpr 9 months ago
parent ec1753f56e
commit 9693c6e245

@ -0,0 +1,9 @@
<component name="libraryTable">
<library name="aspose-words-24.10-jdk17">
<CLASSES>
<root url="jar://C:/aspose-words-24.10-jdk17.jar!/" />
</CLASSES>
<JAVADOC />
<SOURCES />
</library>
</component>

@ -0,0 +1,8 @@
# 默认忽略的文件
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="jdk" jdkName="Python 3.10 (py310)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="Encoding">
<file url="file://$PROJECT_DIR$/app.log" charset="GBK" />
</component>
</project>

@ -0,0 +1,14 @@
<component name="InspectionProjectProfileManager">
<profile version="1.0">
<option name="myName" value="Project Default" />
<inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
<option name="ignoredPackages">
<value>
<list size="1">
<item index="0" class="java.lang.String" itemvalue="st-chat" />
</list>
</value>
</option>
</inspection_tool>
</profile>
</component>

@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

@ -0,0 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="Black">
<option name="sdkName" value="Python 3.10 (JianYingApi-main)" />
</component>
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (py310)" project-jdk-type="Python SDK" />
</project>

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/Py.iml" filepath="$PROJECT_DIR$/.idea/Py.iml" />
</modules>
</component>
</project>

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$/.." vcs="Git" />
</component>
</project>

@ -0,0 +1,51 @@
# pip install pywin32
# https://blog.csdn.net/weixin_42927998/article/details/115086797
import os
import win32com
from win32com.client import Dispatch
# 工作目录
workingPath = r'D:\dsWork\YunNanDsBase\Doc\全省及州市县区人口与教育报告集20241023\16个州市报告2022\分析报告20240510'
# 修复Word文档
# 经过反复测试发现WORD文档中的图表有些POI是无法正确读取的本来是Sheet1,结果它不认识说只有一个Sheet0,此时就无法正确读取数据了。
# 而我通过python+win32com.client.Dispatch可以读取到直接保存就修复了这个BUG真是太神奇了
def repairWord(docPath):
    """Open a Word document and touch the data sheet of every embedded chart.

    For each inline shape whose type is ``wdInlineShapeChart`` the chart's
    data workbook is asked for its worksheet named "Sheet1".  The document is
    then closed and Word is shut down.

    Args:
        docPath: Path of the document handed to ``Documents.Open``.

    Any COM error propagates to the caller, but the document and the Word
    process are released first so no hidden WINWORD instance is left behind.
    """
    docApp = win32com.client.Dispatch('Word.Application')
    try:
        # Keep the Word window hidden and suppress its dialogs.
        docApp.Visible = False
        docApp.DisplayAlerts = 0
        doc = docApp.Documents.Open(docPath)
        try:
            # NOTE(review): win32com.client.constants is only populated when
            # type-library support has been generated (makepy / gencache) —
            # confirm on the machine running this.
            for inline_shape in doc.InlineShapes:
                if inline_shape.Type == win32com.client.constants.wdInlineShapeChart:
                    # Use the shape being iterated instead of re-fetching it
                    # through a separately maintained index, which can point at
                    # a different shape when non-chart inline shapes exist.
                    inline_shape.Chart.ChartData.Workbook.Worksheets("Sheet1")
        finally:
            doc.Close()
    finally:
        docApp.Quit()
if __name__ == '__main__':
    # Step 1: fix file names that carry the extension twice (".docx.docx").
    for file in os.listdir(workingPath):
        if file.endswith('.docx.docx'):
            docPath = os.path.join(workingPath, file)
            print("文件名有误,已修复:" + docPath)
            # Drop exactly one trailing ".docx" from the file name; a plain
            # str.replace on the full path could also alter directory names.
            os.rename(docPath, os.path.join(workingPath, file[:-len('.docx')]))
    # Step 2: run the chart repair on every document in the working folder.
    for file in os.listdir(workingPath):
        # Names starting with "~" are skipped, as the other scripts do.
        if file.endswith('.docx') and not file.startswith('~'):
            # Build the path of the file being visited; reusing the variable
            # left over from step 1 would repair the wrong (or a missing) file.
            docPath = os.path.join(workingPath, file)
            repairWord(docPath)
    print("修复完成")

@ -0,0 +1,150 @@
import os
import time
import win32com
from win32com.client import Dispatch
import re
import logging
# pip install pywin32 openpyxl
# pip install pywin32
logging.basicConfig(
    level=logging.DEBUG,  # log level
    filename='app.log',  # log file name
    filemode='w',  # 'w' overwrites the file on every run; 'a' would append
    format='%(name)s - %(levelname)s - %(message)s'  # record layout
)
# Root folder that is walked for county report documents.
working_dir = r"D:\dsWork\YunNanDsBase\Doc\全省及州市县区人口与教育报告集20241023\133个县区报告2022\县区研究报告"
import openpyxl

# Output root; one sub-folder per city, one sub-sub-folder per county.
excel_dir = r'D:\dsWork\YunNanDsBase\Doc\全省及州市县区人口与教育报告集20241023\133个县区报告2022\Excel'


def _read_lines(path):
    """Return the stripped, non-empty lines of a UTF-8 text file."""
    with open(path, 'r', encoding='utf-8') as f:
        return [text for text in (line.strip() for line in f) if text]


# County names to skip.  Empty lines are dropped: an empty string is contained
# in every name and would make the script skip every county.
errorArea = _read_lines('ErrorArea.txt')

# Substrings removed from the cleaned file name, applied in file order.
# The list is loaded once instead of re-reading the file for every document.
blank_keywords = _read_lines('replaceBlank.txt')

# (old, new) substitutions applied after the removals, in file order.
replace_pairs = []
for line in _read_lines('replaceText.txt'):
    parts = line.split(' ')
    if len(parts) < 2:
        logging.warning("replaceText.txt: ignoring line without a replacement: %r", line)
        continue
    replace_pairs.append((parts[0], parts[1]))

os.makedirs(excel_dir, exist_ok=True)

# One Word instance is shared by all documents and always shut down at the end.
docApp = win32com.client.Dispatch('Word.Application')
try:
    docApp.Visible = True
    docApp.DisplayAlerts = 0
    for root, dirs, files in os.walk(working_dir):
        for dir_name in dirs:
            county_dir = os.path.join(root, dir_name)
            for file in os.listdir(county_dir):
                # Only .docx files whose name does not start with "~".
                if not file.endswith('.docx') or file.startswith('~'):
                    continue
                file_path = os.path.join(county_dir, file)
                # City name: path relative to working_dir, up to "各县".
                cityName = file_path.replace(working_dir, '')[1:].split("各县")[0]
                # County name: the file name reduced to CJK characters only.
                areaName = re.sub(r'[^\u4e00-\u9fa5]', '', file)
                # NOTE(review): the three literals below are empty in this copy
                # of the file, so the condition is never true; they look like
                # characters lost in transit — restore the intended markers.
                if '' not in areaName and '' not in areaName and '' not in areaName:
                    continue
                for keyword in blank_keywords:
                    areaName = areaName.replace(keyword, '')
                for old, new in replace_pairs:
                    areaName = areaName.replace(old, new)
                # Make sure <excel_dir>/<city>/<county> exists.
                city_dir = os.path.join(excel_dir, cityName)
                os.makedirs(city_dir, exist_ok=True)
                county_sub_dir = os.path.join(city_dir, areaName)
                if not os.path.exists(county_sub_dir):
                    os.mkdir(county_sub_dir)
                elif len(os.listdir(county_sub_dir)) > 10:
                    # More than 10 entries already present: skip this county.
                    print(county_sub_dir + " 文件夹下有超过10个文件跳过")
                    continue
                # Skip counties listed in ErrorArea.txt.
                if any(e in areaName for e in errorArea):
                    print(county_sub_dir + " 跳过")
                    continue
                print("正在处理" + cityName + "-" + areaName + "...")
                # Pause between documents so Word is not opened too frequently.
                time.sleep(3)
                doc = docApp.Documents.Open(file_path)
                try:
                    # Collect candidate figure captions from the paragraphs.
                    # NOTE(review): the empty literals in replace()/startswith()
                    # match everything as written, so every paragraph is kept;
                    # the original markers appear to be missing.
                    tb_list = []
                    for para in doc.Paragraphs:
                        x = para.Range.Text.strip().replace("", "").replace(" ", " ")
                        if x.startswith(""):
                            tb_list.append(x)
                    # Export the data sheet of every embedded chart.
                    chart_no = 0  # 1-based running number of charts found
                    for inline_shape in doc.InlineShapes:
                        if inline_shape.Type != win32com.client.constants.wdInlineShapeChart:
                            continue
                        chart_no += 1
                        # Work on the shape being iterated; looking it up again
                        # by chart number can hit a different shape when the
                        # document also holds non-chart inline shapes.
                        sheet = inline_shape.Chart.ChartData.Workbook.Worksheets(1)
                        wb = openpyxl.Workbook()
                        ws = wb.active
                        used = sheet.UsedRange
                        row_count = used.Rows.Count
                        col_count = used.Columns.Count
                        for row in range(1, row_count + 1):
                            for col in range(1, col_count + 1):
                                ws.cell(row=row, column=col, value=sheet.Cells(row, col).Value)
                        try:
                            # Derive the file name from the matching caption.
                            original_string = tb_list[chart_no - 1]
                            original_string = original_string[1:]
                            if ' ' in original_string:
                                original_string = original_string.split(" ")[1]
                            # Keep only CJK characters, letters and digits.
                            filtered_string = re.sub(r'[^\u4e00-\u9fa5a-zA-Z0-9]', '', original_string)
                            fileName = '' + str(chart_no) + '' + filtered_string + ".xlsx"
                            wb.save(county_sub_dir + '/' + fileName)
                        except Exception:
                            # Saving stays best-effort, but the failure is
                            # recorded instead of being silently discarded.
                            logging.exception("could not save chart %d of %s", chart_no, file_path)
                        else:
                            # Report only files that were really written.
                            print("保存文件:" + fileName)
                        finally:
                            wb.close()
                finally:
                    doc.Close()
                print(f"县区处理完成:{cityName}{areaName}")
finally:
    docApp.Quit()
print("恭喜,所有县区数据整理工作成功完成!")

@ -0,0 +1,84 @@
import os
import win32com
from win32com.client import Dispatch
import re
# pip install pywin32 openpyxl
# pip install pywin32
# Folder holding the city/prefecture report documents.
working_dir = r"D:/dsWork/YunNanDsBase/Doc/全省及州市县区人口与教育报告集20241023/16个州市报告2022/分析报告20240510/"
import openpyxl

# Output folder "Excel" inside the working folder.
excel_dir = working_dir + 'Excel'
os.makedirs(excel_dir, exist_ok=True)
# Every report file name must contain this keyword; the text before it is the
# city name.
keyword = '人口变化及其对教育的影响'

docApp = win32com.client.Dispatch('Word.Application')
try:
    # Keep the Word window hidden and suppress its dialogs.
    docApp.Visible = False
    docApp.DisplayAlerts = 0
    for file in os.listdir(working_dir):
        if not file.endswith('.docx') or file.startswith('~'):
            continue
        file_name = file.split('.')[0]
        # A file name without the keyword is treated as fatal.
        if keyword not in file_name:
            print('Error: ' + file_name + ' 文件名称中并不包含:' + keyword)
            # SystemExit (what exit() raises) unwinds through the finally
            # below, so the hidden Word process is closed before exiting.
            raise SystemExit
        city_name = file_name.split(keyword)[0]
        # Per-city output folder.
        city_dir = os.path.join(excel_dir, city_name)
        os.makedirs(city_dir, exist_ok=True)
        # working_dir already ends with a separator; join + abspath avoids the
        # doubled slash that plain concatenation produced.
        doc_path = os.path.abspath(os.path.join(working_dir, file))
        doc = docApp.Documents.Open(doc_path)
        try:
            # Collect candidate figure captions from the paragraphs.
            # NOTE(review): the empty literals in replace()/startswith() match
            # everything as written, so every paragraph is kept; the original
            # markers appear to be missing from this copy of the file.
            tb_list = []
            for para in doc.Paragraphs:
                x = para.Range.Text.strip().replace("", "").replace(" ", " ")
                if x.startswith(""):
                    tb_list.append(x)
            # Export the data sheet of every embedded chart.
            chart_no = 0  # 1-based running number of charts found
            for inline_shape in doc.InlineShapes:
                if inline_shape.Type != win32com.client.constants.wdInlineShapeChart:
                    continue
                chart_no += 1
                # Use the iterated shape directly rather than re-indexing
                # doc.InlineShapes with the chart counter.
                sheet = inline_shape.Chart.ChartData.Workbook.Worksheets(1)
                wb = openpyxl.Workbook()
                try:
                    ws = wb.active
                    used = sheet.UsedRange
                    row_count = used.Rows.Count
                    col_count = used.Columns.Count
                    for row in range(1, row_count + 1):
                        for col in range(1, col_count + 1):
                            ws.cell(row=row, column=col, value=sheet.Cells(row, col).Value)
                    # Derive the file name from the matching caption; a missing
                    # caption raises IndexError and stops the run.
                    original_string = tb_list[chart_no - 1]
                    original_string = original_string[1:]
                    if ' ' in original_string:
                        original_string = original_string.split(" ")[1]
                    # Keep only CJK characters, letters and digits.
                    filtered_string = re.sub(r'[^\u4e00-\u9fa5a-zA-Z0-9]', '', original_string)
                    fileName = '' + str(chart_no) + '' + filtered_string + ".xlsx"
                    wb.save(os.path.join(city_dir, fileName))
                    print("保存文件:" + fileName)
                finally:
                    wb.close()
        finally:
            doc.Close()
finally:
    docApp.Quit()
print("恭喜,所有市州数据整理工作成功完成!")

@ -0,0 +1,61 @@
# pip install pywin32
# https://blog.csdn.net/weixin_42927998/article/details/115086797
import win32com
from win32com.client import Dispatch
# Demo: insert a clustered column chart at a bookmark of c:/1.docx, fill its
# data sheet with test values, style it and save the document.
docApp = win32com.client.Dispatch('Word.Application')
try:
    docApp.Visible = True
    docApp.DisplayAlerts = 0
    doc = docApp.Documents.Open('c:/1.docx')
    try:
        # The document must already contain a bookmark named "插入图表位置";
        # it is selected while the chart is added.
        # NOTE(review): the return value of Select() is what gets passed as
        # Top — confirm this is intended.
        shape_chart = doc.Shapes.AddChart2(Style=201, Type=51, Top=doc.Bookmarks("插入图表位置").Select())
        shape_chart.WrapFormat.Type = 7  # 7 = wdWrapInline (in line with text)
        chart = shape_chart.Chart
        # Worksheet that backs the chart data.
        worksheet = chart.ChartData.Workbook.Worksheets(1)
        chart.SetSourceData("Sheet1!$A$1:$C$4")  # initial data range
        # Test data for the clustered column chart.
        chart_data = [["", "系列A", "系列B", "系列C", "系列D"],
                      [2020, 2, 4, 2, 3],
                      [2019, 4, 5, 3, 2]]
        # Clear the range A1:D5 before writing the test data.
        # "Value" is spelled as in the cell assignment below: generated
        # (early-bound) COM wrappers are case-sensitive.
        worksheet.Range("A1:D5").Value = None
        for row_index, row in enumerate(chart_data):
            for column_index, value in enumerate(row):
                worksheet.Cells(row_index + 1, column_index + 1).Value = value
        chart.SetSourceData("Sheet1!$A$1:$E$3")  # range covering the test data
        # Styling examples.
        chart.ChartTitle.Text = '测试标题'  # chart title
        chart.FullSeriesCollection(2).Format.Fill.ForeColor.ObjectThemeColor = 10  # fill colour of series 2
        chart.ChartData.Workbook.Close()  # close the data workbook window
        doc.Save()
    except Exception:
        # Discard the half-edited document (0 = wdDoNotSaveChanges).
        doc.Close(SaveChanges=0)
        raise
    else:
        doc.Close()
finally:
    # Always shut Word down, also when a step above failed.
    docApp.Quit()

# Common XlChartType values accepted by the Type argument of AddChart2
# (see the XlChartType enumeration in the Office VBA reference):
#      51  xlColumnClustered   clustered column (used above)
#      52  xlColumnStacked     stacked column
#      57  xlBarClustered      clustered bar
#       4  xlLine              line
#       5  xlPie               pie
#       1  xlArea              area
#      76  xlAreaStacked       stacked area
#   -4169  xlXYScatter         scatter
#      15  xlBubble            bubble
#   -4151  xlRadar             radar
#     117  xlTreemap           treemap
#     120  xlSunburst          sunburst
#     123  xlFunnel            funnel

@ -0,0 +1,25 @@
# pip install pywin32
# https://blog.csdn.net/weixin_42927998/article/details/115086797
import win32com
from win32com.client import Dispatch
# Opens c:/b.docx and asks every embedded chart for the worksheet named
# "Sheet1" of its data workbook, then closes the document and quits Word.
docApp = win32com.client.Dispatch('Word.Application')
try:
    # Keep the Word window hidden and suppress its dialogs.
    docApp.Visible = False
    docApp.DisplayAlerts = 0
    doc = docApp.Documents.Open("c:/b.docx")
    try:
        for inline_shape in doc.InlineShapes:
            if inline_shape.Type == win32com.client.constants.wdInlineShapeChart:
                # Use the iterated shape itself; re-fetching it through a
                # separate counter can address a different shape when the
                # document also contains non-chart inline shapes.
                inline_shape.Chart.ChartData.Workbook.Worksheets("Sheet1")
    finally:
        doc.Close()
finally:
    # Runs on failure too, so no hidden Word process is left behind.
    docApp.Quit()

@ -0,0 +1,12 @@
寻甸县
禄劝县
嵩明县
富民县
宁蒗县
永胜县
洱源县
文山市
西山区
昭阳区
鲁甸县
宁洱县

@ -0,0 +1,68 @@
# pip install pymysql
# pip install requests beautifulsoup4
# 查看结果
# select * from t_dm_area where province_id='FD61813E-70A1-42AB-9A8E-141ED4D47B98' order by level_id;
import time
import pymysql
import requests
from bs4 import BeautifulSoup
import re
if __name__ == '__main__':
    # For every area of one province stored in t_dm_area, fetch its Baidu
    # Baike page and store the summary paragraph in the row's memo column.

    # NOTE(review): host and credentials are hard-coded in source; move them
    # to configuration or environment variables.
    conn = pymysql.connect(
        host='10.10.14.203',  # host name or IP address
        port=3306,  # port
        user='root',  # user name
        password='Password123@mysql',  # password
        charset='utf8mb4'  # connection character set
    )
    try:
        cursor = conn.cursor()
        try:
            conn.select_db("ds_db")
            cursor.execute(
                "SELECT id,full_name FROM t_dm_area where province_id='FD61813E-70A1-42AB-9A8E-141ED4D47B98' order by level_id")
            result: tuple = cursor.fetchall()
            for area_id, area_name in result:
                url = "https://baike.baidu.com/item/" + area_name + "?fromModule=lemma_search-box"
                print(url)
                try:
                    # A timeout keeps one stalled request from hanging the run.
                    response = requests.get(url, timeout=10)
                except requests.RequestException as err:
                    # A network failure on one page must not abort the others.
                    print('Failed to retrieve the webpage: ' + str(err))
                else:
                    if response.status_code == 200:
                        soup = BeautifulSoup(response.text, 'html.parser')
                        # Summary paragraphs of the page; only the first match
                        # is used.
                        specific_divs = soup.select('div.para_YYuCh.summary_nfAdr.MARK_MODULE')
                        if specific_divs:
                            text = specific_divs[0].get_text(strip=True)
                            # Remove citation markers such as [3] or [3-5].
                            cleaned_text = re.sub(r'\[\d+(?:-\d+)?\]', '', text)
                            sql = "update t_dm_area set memo=%s where id=%s"
                            cursor.execute(sql, (cleaned_text, area_id))
                            conn.commit()
                            print("更新"+area_name+"数据成功")
                    else:
                        print('Failed to retrieve the webpage')
                # Pause between requests.
                time.sleep(2)
        finally:
            cursor.close()
    finally:
        conn.close()
    print("结束")

@ -0,0 +1,33 @@
人口变化及其对教育的影响
辖区人口变化趋势对基础教育的影响
样稿
市教育数据统计
区报告
人口变化趋势对基础教育的影响修改
人口变化趋势对基础教育的影响审稿
人口变化趋势对基础教育的影响陈副改终稿
县区最终版
人口变化趋势对基础教育的影响
人口变化及其对基础教育影响的报告
修改终稿
人口变化及其对教育影响的报告
正确
附件
定稿
省级课题
人口变化及其对基础教育的影响报告
县区
人口变化对教育的影响
报告
研究报告
文本
修改稿
已审核
已经审核
报告
总人口数常住人口数统计局提供
初稿
人口变化及其对教育影响的研究
人口变化趋势对基础教育影响的研究
研究

@ -0,0 +1,11 @@
县县 县
曲靖市马龙区 马龙区
曲靖市麒麟区 麒麟区
曲靖市沾益区 沾益区
江城江城县 江城县
墨江 墨江县
盐津 盐津县
盈江 盈江县
芒市 芒市县
宾川 宾川县
镇康 镇康县

@ -0,0 +1,124 @@
package com.dsideal.base.Tools.FillData.City;
import cn.hutool.core.io.FileUtil;
import com.dsideal.base.Tools.FillData.ExcelKit.ExcelKit;
import com.dsideal.base.Tools.Util.LocalMysqlConnectUtil;
import com.dsideal.base.Tools.Util.ReadDocxUtil;
import com.jfinal.kit.StrKit;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.openxml4j.util.ZipSecureFile;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.xssf.usermodel.XSSFCellStyle;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.poi.xwpf.usermodel.XWPFChart;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
 * Builds the result workbook for template 10 (urban and rural population,
 * history and forecast) from the charts embedded in the city-level Word
 * reports.
 */
public class C10 {
    // Folder holding the city/prefecture Word reports
    static String parentPath = "D:\\dsWork\\YunNanDsBase\\Doc\\全省及州市县区人口与教育报告集20241023\\16个州市报告2022\\分析报告20240510";
    // Template workbook whose header is copied into the result
    static String sampleExcelPath = "D:\\dsWork\\YunNanDsBase\\Doc\\待处理\\市\\【10】城镇&乡村人口变化及预测-双\\城镇&乡村人口变化及预测-双.xlsx";

    /**
     * Reads chart 2 and chart 3 of every .docx file under {@link #parentPath}
     * and writes one output row per year and city.
     *
     * @param args unused
     * @throws IOException            if a report or the result file cannot be read or written
     * @throws InvalidFormatException if the data workbook of a chart cannot be opened
     */
    public static void main(String[] args) throws IOException, InvalidFormatException {
        // Initialise the database connection
        LocalMysqlConnectUtil.Init();
        ReadDocxUtil ru = new ReadDocxUtil();
        // The result file is the template name with "【成果】" appended; an old copy is removed first
        String excelPath = sampleExcelPath.replace(".xlsx", "【成果】.xlsx");
        ExcelKit.delExcel(excelPath);
        // Result workbook, sheet and cell styles
        XSSFWorkbook outWorkbook = new XSSFWorkbook();
        XSSFSheet outSheet = ExcelKit.CreateSheet(outWorkbook);
        XSSFCellStyle headerStyle = ExcelKit.getHeaderStyle(outWorkbook);
        XSSFCellStyle dataStyle = ExcelKit.getDataStyle(outWorkbook);
        // Copy the header from the template
        ExcelKit.CopyHead(sampleExcelPath, outSheet, headerStyle);
        // Process-wide POI setting: needs to be set only once, not per file
        ZipSecureFile.setMinInflateRatio(-1.0d);
        List<File> files = FileUtil.loopFiles(parentPath, file -> true);
        int rowIndex = 0;
        if (files != null) {
            for (File file : files) {
                if (file.isDirectory()) continue;
                String fileName = file.getName();
                // Only .docx files whose name does not start with "~"
                if (!fileName.endsWith(".docx") || fileName.startsWith("~")) continue;
                String cityName = ru.getCityOrAreaName(fileName);
                System.out.println("正在处理" + cityName + "市州文件:" + fileName);
                String inputUrl = file.getAbsolutePath();
                // try-with-resources closes the stream and the document for every file
                try (InputStream is = new FileInputStream(inputUrl);
                     XWPFDocument doc = new XWPFDocument(is)) {
                    // Charts in sorted order
                    List<XWPFChart> charts = ExcelKit.getSortListForXWPFChart(doc.getCharts());
                    // The data lives in chart 2 and chart 3
                    int firstChartNumber = 2, secondChartNumber = 3;
                    XSSFWorkbook workbook = charts.get(firstChartNumber - 1).getWorkbook();
                    // per the original comment: starts at 2017
                    List<List<String>> source1 = ExcelKit.readSheet(workbook, 6);
                    workbook = charts.get(secondChartNumber - 1).getWorkbook();
                    // per the original comment: starts at 2023
                    List<List<String>> source3 = ExcelKit.readSheet(workbook, 2);
                    // Output columns: parent region, region, year, urban change,
                    // urban forecast, rural change, rural forecast.
                    // Years before 2023 come from source1 and fill the "change" columns.
                    for (List<String> r : source1) {
                        int year = Integer.parseInt(r.getFirst());
                        // Check the year first so values of discarded rows are never parsed
                        if (year >= 2023) continue;
                        double cvalue = Double.parseDouble(r.get(1)); // urban
                        double xvalue = Double.parseDouble(r.get(2)); // rural
                        Row outRow = outSheet.createRow(++rowIndex);
                        ExcelKit.putData(outRow, new ArrayList<>(Arrays.asList("云南省", cityName, r.getFirst(), String.format("%.2f", cvalue), "", String.format("%.2f", xvalue), "")), dataStyle);
                    }
                    // Years from 2023 on come from source3 and fill the "forecast" columns.
                    for (List<String> r : source3) {
                        int year = Integer.parseInt(r.getFirst());
                        if (year < 2023) continue;
                        // Blank cells count as 0
                        double cvalue = 0; // urban
                        if (!StrKit.isBlank(r.get(1))) cvalue = Double.parseDouble(r.get(1));
                        double xvalue = 0; // rural
                        if (!StrKit.isBlank(r.get(2))) xvalue = Double.parseDouble(r.get(2));
                        Row outRow = outSheet.createRow(++rowIndex);
                        ExcelKit.putData(outRow, new ArrayList<>(Arrays.asList("云南省", cityName, r.getFirst(), "", String.format("%.2f", cvalue), "", String.format("%.2f", xvalue))), dataStyle);
                    }
                }
            }
        }
        // Save the result workbook
        ExcelKit.saveExcel(excelPath, outWorkbook);
        System.out.println("市州所有文件处理完成!");
    }
}

@ -0,0 +1,131 @@
package com.dsideal.base.Tools.FillData.City;
import cn.hutool.core.io.FileUtil;
import com.dsideal.base.Tools.FillData.ExcelKit.ExcelKit;
import com.dsideal.base.Tools.Util.LocalMysqlConnectUtil;
import com.dsideal.base.Tools.Util.ReadDocxUtil;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.openxml4j.util.ZipSecureFile;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.xssf.usermodel.XSSFCellStyle;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.poi.xwpf.usermodel.XWPFChart;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
 * Builds the result workbook for template 11 (education resource forecast)
 * from chart 36 and chart 37 of the city-level Word reports.
 */
public class C11 {
    // Folder holding the city/prefecture Word reports
    static String parentPath = "D:\\dsWork\\YunNanDsBase\\Doc\\全省及州市县区人口与教育报告集20241023\\16个州市报告2022\\分析报告20240510";
    // Template workbook whose header is copied into the result
    static String sampleExcelPath = "D:\\dsWork\\YunNanDsBase\\Doc\\待处理\\市\\【11】教育资源配置发展预测\\教育资源配置发展预测(人).xlsx";

    /**
     * Reads chart 36 and chart 37 of every .docx file under {@link #parentPath}
     * and writes one output row per data row of chart 36.
     *
     * @param args unused
     * @throws IOException            if a report or the result file cannot be read or written
     * @throws InvalidFormatException if the data workbook of a chart cannot be opened
     */
    public static void main(String[] args) throws IOException, InvalidFormatException {
        // Initialise the database connection
        LocalMysqlConnectUtil.Init();
        ReadDocxUtil ru = new ReadDocxUtil();
        // The result file is the template name with "【成果】" appended; an old copy is removed first
        String excelPath = sampleExcelPath.replace(".xlsx", "【成果】.xlsx");
        ExcelKit.delExcel(excelPath);
        // Result workbook, sheet and cell styles
        XSSFWorkbook outWorkbook = new XSSFWorkbook();
        XSSFSheet outSheet = ExcelKit.CreateSheet(outWorkbook);
        XSSFCellStyle headerStyle = ExcelKit.getHeaderStyle(outWorkbook);
        XSSFCellStyle dataStyle = ExcelKit.getDataStyle(outWorkbook);
        // Copy the header from the template
        ExcelKit.CopyHead(sampleExcelPath, outSheet, headerStyle);
        // Process-wide POI setting: needs to be set only once, not per file
        ZipSecureFile.setMinInflateRatio(-1.0d);
        List<File> files = FileUtil.loopFiles(parentPath, file -> true);
        int rowIndex = 0;
        if (files != null) {
            for (File file : files) {
                if (file.isDirectory()) continue;
                String fileName = file.getName();
                // Only .docx files whose name does not start with "~"
                if (!fileName.endsWith(".docx") || fileName.startsWith("~")) continue;
                String cityName = ru.getCityOrAreaName(fileName);
                System.out.println("正在处理" + cityName + "市州文件...");
                String inputUrl = file.getAbsolutePath();
                // try-with-resources closes the stream and the document for every file
                try (InputStream is = new FileInputStream(inputUrl);
                     XWPFDocument doc = new XWPFDocument(is)) {
                    // Charts in sorted order
                    List<XWPFChart> charts = ExcelKit.getSortListForXWPFChart(doc.getCharts());
                    // Chart 36: staff totals; chart 37: land area (per the original comments)
                    int firstChartNumber = 36, secondChartNumber = 37;
                    XSSFWorkbook workbook = charts.get(firstChartNumber - 1).getWorkbook();
                    List<List<String>> source1 = ExcelKit.readSheet(workbook, 1);
                    // Columns: 0 year, 1 preschool, 2 primary, 3 junior secondary, 4 senior secondary.
                    // The first row of each sheet is the baseline for all rows
                    // (the original comments call it the 2022 baseline for staff).
                    List<String> firRow = source1.getFirst();
                    String xqjzg = firRow.get(1); // preschool staff baseline
                    String xxjzg = firRow.get(2); // primary staff baseline
                    String zxjzg = firRow.get(3); // junior secondary staff baseline
                    String gxjzg = firRow.get(4); // senior secondary staff baseline
                    workbook = charts.get(secondChartNumber - 1).getWorkbook();
                    List<List<String>> source2 = ExcelKit.readSheet(workbook, 1);
                    // Same column layout as source1
                    // (the original comments call this row the 2023 baseline for area).
                    List<String> secRow = source2.getFirst();
                    String xqjzgMJ = secRow.get(1); // preschool area baseline
                    String xxjzgMJ = secRow.get(2); // primary area baseline
                    String zxjzgMJ = secRow.get(3); // junior secondary area baseline
                    String gxjzgMJ = secRow.get(4); // senior secondary area baseline
                    // NOTE(review): source2 is indexed with source1's row numbers,
                    // so it must have at least as many rows — confirm.
                    for (int i = 0; i < source1.size(); i++) {
                        List<String> r1 = source1.get(i);
                        List<String> r2 = source2.get(i);
                        Row outRow = outSheet.createRow(++rowIndex);
                        // Per stage: value, baseline, baseline minus value — staff first, then area
                        ExcelKit.putData(outRow, new ArrayList<>(
                                Arrays.asList(cityName, r1.getFirst(),
                                        r1.get(1), xqjzg, gap(xqjzg, r1.get(1)),
                                        r2.get(1), xqjzgMJ, gap(xqjzgMJ, r2.get(1)),
                                        r1.get(2), xxjzg, gap(xxjzg, r1.get(2)),
                                        r2.get(2), xxjzgMJ, gap(xxjzgMJ, r2.get(2)),
                                        r1.get(3), zxjzg, gap(zxjzg, r1.get(3)),
                                        r2.get(3), zxjzgMJ, gap(zxjzgMJ, r2.get(3)),
                                        r1.get(4), gxjzg, gap(gxjzg, r1.get(4)),
                                        r2.get(4), gxjzgMJ, gap(gxjzgMJ, r2.get(4)),
                                        "", "", "", "", "", "", "云南省")), dataStyle);
                    }
                }
            }
        }
        // Save the result workbook
        ExcelKit.saveExcel(excelPath, outWorkbook);
        System.out.println("市州所有文件处理完成!");
    }

    /**
     * Returns {@code base - current}, both parsed as doubles, formatted with two decimals.
     */
    private static String gap(String base, String current) {
        return String.format("%.2f", Double.parseDouble(base) - Double.parseDouble(current));
    }
}

@ -0,0 +1,239 @@
package com.dsideal.base.Tools.FillData.ExcelKit;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.ss.usermodel.*;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
/**
 * Copies sheets, rows, cell styles and fonts from one POI workbook into
 * another; used to convert an .xls file into an .xlsx file.
 *
 * <p>An instance caches the fonts and styles it has already cloned, keyed by
 * the hash code of the source object. The cache is not keyed by destination
 * workbook, so one instance should be used with a single destination.</p>
 */
public class ExcelCoreUtil {
    /** Number of leading columns whose width and hidden flag are always copied. */
    private static final int MIN_COPIED_COLUMNS = 20;

    /**
     * Converts an .xls workbook into an .xlsx workbook.
     *
     * @param inputFilePath  path of the .xls file to read
     * @param outputFilePath path of the .xlsx file to write
     * @throws IOException if reading or writing fails
     */
    public static void xlsChangeXlsx(String inputFilePath, String outputFilePath) throws IOException {
        // try-with-resources closes the stream and both workbooks
        try (FileInputStream fis = new FileInputStream(inputFilePath);
             HSSFWorkbook wb = new HSSFWorkbook(fis);
             XSSFWorkbook swb = new XSSFWorkbook()) {
            // One builder for all sheets, so a style or font that occurs on
            // several sheets is cloned into the target workbook only once
            ExcelCoreUtil builder = new ExcelCoreUtil(wb);
            for (int i = 0; i < wb.getNumberOfSheets(); ++i) {
                HSSFSheet sheet = wb.getSheetAt(i);
                // Sheet-level settings, merged regions and pictures
                Sheet sheet1 = copySheet(wb, swb, i);
                // Cell values and styles
                builder.copyRows(swb, sheet, sheet1, 0, sheet.getLastRowNum() + 1, 0);
            }
            try (FileOutputStream fos = new FileOutputStream(outputFilePath)) {
                swb.write(fos);
                fos.flush();
                System.out.println("转换成功,文件已保存至:" + outputFilePath);
            }
        }
    }

    /** Source workbook; fonts referenced by source styles are looked up here. */
    private final Workbook template;
    /** Cloned fonts, keyed by the hash code of the source font. */
    private final Map<Integer, Font> fonts = new HashMap<>();
    /** Cloned styles, keyed by the hash code of the source style. */
    private final Map<Integer, CellStyle> styles = new HashMap<>();

    /**
     * @param template the source workbook the styles to be copied belong to
     */
    public ExcelCoreUtil(Workbook template) {
        this.template = template;
    }

    /**
     * Replaces every element of {@code styles} in place with its clone in {@code dstWorkbook}.
     */
    public void switchStyles(Workbook dstWorkbook, CellStyle[] styles) {
        for (int i = 0; i < styles.length; i++) {
            styles[i] = getStyle(dstWorkbook, styles[i]);
        }
    }

    /** Returns the cached clone of {@code font}, creating it on first use. */
    private Font getFont(Workbook dstWorkbook, Font font) {
        return fonts.computeIfAbsent(font.hashCode(), k -> cloneFont(dstWorkbook, font));
    }

    /** Returns the cached clone of {@code style}, creating it on first use. */
    private CellStyle getStyle(Workbook dstWorkbook, CellStyle style) {
        Font font = getFont(dstWorkbook, template.getFontAt(style.getFontIndexAsInt()));
        return styles.computeIfAbsent(style.hashCode(), k -> cloneStyle(dstWorkbook, style, dstWorkbook.createDataFormat(), font));
    }

    /**
     * Copies rows {@code from} (inclusive) to {@code to} (exclusive) of {@code srcSheet}
     * into {@code dstSheet}, shifting the row index by {@code offset}. Missing source
     * rows and cells are skipped.
     */
    public void copyRows(Workbook dstWorkbook, Sheet srcSheet, Sheet dstSheet, int from, int to, int offset) {
        for (int r = from; r < to; r++) {
            Row srcRow = srcSheet.getRow(r);
            if (srcRow == null) {
                continue;
            }
            Row dstRow = dstSheet.createRow(r + offset);
            dstRow.setHeight(srcRow.getHeight());
            CellStyle rowStyle = srcRow.getRowStyle();
            if (rowStyle != null) {
                dstRow.setRowStyle(getStyle(dstWorkbook, rowStyle));
            }
            for (int c = 0; c < srcRow.getLastCellNum(); c++) {
                Cell srcCell = srcRow.getCell(c);
                if (srcCell == null) {
                    continue;
                }
                CellType type = getCellType(srcCell);
                Object value = getCellValue(srcCell);
                Cell newCell = dstRow.createCell(c, type);
                setCellValue(newCell, value, type);
                newCell.setCellStyle(getStyle(dstWorkbook, srcCell.getCellStyle()));
            }
        }
    }

    /**
     * Creates a sheet with the same name in {@code dstWorkbook} and copies display
     * and print settings, margins, column widths and hidden flags, merged regions
     * and pictures. Cell content is not copied here.
     *
     * @return the newly created destination sheet
     */
    public static Sheet copySheet(Workbook srcWorkbook, Workbook dstWorkbook, int sheetIndex) {
        Sheet srcSheet = srcWorkbook.getSheetAt(sheetIndex);
        Sheet dstSheet = dstWorkbook.createSheet(srcSheet.getSheetName());
        dstSheet.setDisplayFormulas(srcSheet.isDisplayFormulas());
        dstSheet.setDisplayGridlines(srcSheet.isDisplayGridlines());
        dstSheet.setDisplayGuts(srcSheet.getDisplayGuts());
        dstSheet.setDisplayRowColHeadings(srcSheet.isDisplayRowColHeadings());
        dstSheet.setDisplayZeros(srcSheet.isDisplayZeros());
        dstSheet.setFitToPage(srcSheet.getFitToPage());
        dstSheet.setForceFormulaRecalculation(srcSheet.getForceFormulaRecalculation());
        dstSheet.setHorizontallyCenter(srcSheet.getHorizontallyCenter());
        dstSheet.setMargin(Sheet.BottomMargin, srcSheet.getMargin(Sheet.BottomMargin));
        dstSheet.setMargin(Sheet.FooterMargin, srcSheet.getMargin(Sheet.FooterMargin));
        dstSheet.setMargin(Sheet.HeaderMargin, srcSheet.getMargin(Sheet.HeaderMargin));
        dstSheet.setMargin(Sheet.LeftMargin, srcSheet.getMargin(Sheet.LeftMargin));
        dstSheet.setMargin(Sheet.RightMargin, srcSheet.getMargin(Sheet.RightMargin));
        dstSheet.setMargin(Sheet.TopMargin, srcSheet.getMargin(Sheet.TopMargin));
        dstSheet.setPrintGridlines(srcSheet.isPrintGridlines());
        dstSheet.setRightToLeft(srcSheet.isRightToLeft());
        dstSheet.setRowSumsBelow(srcSheet.getRowSumsBelow());
        dstSheet.setRowSumsRight(srcSheet.getRowSumsRight());
        dstSheet.setVerticallyCenter(srcSheet.getVerticallyCenter());
        // Copy column settings for every column that holds cells, and for at
        // least the first MIN_COPIED_COLUMNS columns as before
        int columnCount = MIN_COPIED_COLUMNS;
        for (Row row : srcSheet) {
            columnCount = Math.max(columnCount, row.getLastCellNum());
        }
        for (int i = 0; i < columnCount; i++) {
            dstSheet.setColumnWidth(i, srcSheet.getColumnWidth(i));
            dstSheet.setColumnHidden(i, srcSheet.isColumnHidden(i));
        }
        srcSheet.getMergedRegions().forEach(dstSheet::addMergedRegion);
        // Pictures: re-register the image data and recreate the anchor
        Drawing<?> d1 = srcSheet.getDrawingPatriarch();
        if (d1 != null) {
            Drawing<?> d2 = dstSheet.getDrawingPatriarch();
            if (d2 == null) {
                d2 = dstSheet.createDrawingPatriarch();
            }
            for (Shape shape : d1) {
                if (shape instanceof Picture) {
                    Picture p = (Picture) shape;
                    ClientAnchor a1 = p.getClientAnchor();
                    int pictureId = dstWorkbook.addPicture(p.getPictureData().getData(), p.getPictureData().getPictureType());
                    ClientAnchor a2 = d2.createAnchor(a1.getDx1(), a1.getDy1(), a1.getDx2(), a1.getDy2(), a1.getCol1(), a1.getRow1(), a1.getCol2(), a1.getRow2());
                    d2.createPicture(a2, pictureId);
                }
            }
        }
        return dstSheet;
    }

    /**
     * Creates a font in {@code dstWorkbook} with the attributes of {@code font}.
     */
    public static Font cloneFont(Workbook dstWorkbook, Font font) {
        Font clone = dstWorkbook.createFont();
        clone.setBold(font.getBold());
        clone.setCharSet(font.getCharSet());
        clone.setColor(font.getColor());
        clone.setFontHeight(font.getFontHeight());
        clone.setFontName(font.getFontName());
        clone.setItalic(font.getItalic());
        clone.setStrikeout(font.getStrikeout());
        clone.setTypeOffset(font.getTypeOffset());
        clone.setUnderline(font.getUnderline());
        return clone;
    }

    /**
     * Creates a cell style in {@code dstWorkbook} with the attributes of {@code style}.
     *
     * @param formatter data format table used to resolve the format string of {@code style}
     * @param font      font (already belonging to {@code dstWorkbook}) to attach to the clone
     */
    public static CellStyle cloneStyle(Workbook dstWorkbook, CellStyle style, DataFormat formatter, Font font) {
        CellStyle clone = dstWorkbook.createCellStyle();
        clone.setAlignment(style.getAlignment());
        clone.setBorderBottom(style.getBorderBottom());
        clone.setBorderLeft(style.getBorderLeft());
        clone.setBorderRight(style.getBorderRight());
        clone.setBorderTop(style.getBorderTop());
        // The data format is resolved by its format string, once
        clone.setDataFormat(formatter.getFormat(style.getDataFormatString()));
        clone.setFillBackgroundColor(style.getFillBackgroundColor());
        clone.setFillForegroundColor(style.getFillForegroundColor());
        clone.setFillPattern(style.getFillPattern());
        clone.setFont(font);
        clone.setHidden(style.getHidden());
        clone.setIndention(style.getIndention());
        clone.setLocked(style.getLocked());
        clone.setVerticalAlignment(style.getVerticalAlignment());
        clone.setWrapText(style.getWrapText());
        return clone;
    }

    /**
     * Returns the cell type; for a formula cell the formula is evaluated and the
     * type of its result is returned.
     */
    protected static CellType getCellType(Cell cell) {
        CellType cellType = cell.getCellType();
        if (cellType == CellType.FORMULA) {
            cellType = cell.getSheet().getWorkbook().getCreationHelper().createFormulaEvaluator()
                    .evaluateFormulaCell(cell);
        }
        return cellType;
    }

    /**
     * Returns the cell value as String, Boolean, Byte (error code) or Double,
     * chosen by {@link #getCellType(Cell)}; {@code null} for any other type.
     */
    protected static Object getCellValue(Cell cell) {
        switch (getCellType(cell)) {
            case BLANK:
            case STRING:
                return cell.getStringCellValue();
            case BOOLEAN:
                return cell.getBooleanCellValue();
            case ERROR:
                return cell.getErrorCellValue();
            case NUMERIC:
                return cell.getNumericCellValue();
            default:
                return null;
        }
    }

    /**
     * Writes {@code value} into {@code cell} according to {@code type}. Blank and
     * unsupported types leave the cell untouched.
     */
    protected static void setCellValue(Cell cell, Object value, CellType type) {
        switch (type) {
            case BLANK:
                return;
            case STRING:
                cell.setCellValue((String) value);
                return;
            case BOOLEAN:
                cell.setCellValue((Boolean) value);
                return;
            case ERROR:
                cell.setCellErrorValue((Byte) value);
                return;
            case NUMERIC:
                if (value instanceof Double) {
                    Double d = (Double) value;
                    if (d == Math.floor(d)) {
                        // Whole numbers are written as text without the ".0" suffix
                        cell.setCellValue(String.valueOf(d.longValue()));
                    } else {
                        // Fractional values stay numeric
                        cell.setCellValue(d);
                    }
                }
                return;
            default:
                break;
        }
    }
}
Loading…
Cancel
Save