kgdxpr 9 months ago
commit 030fa75bf0

@ -0,0 +1,68 @@
# pip install pymysql
# pip install requests beautifulsoup4
# 查看结果
# select * from t_dm_area where province_id='FD61813E-70A1-42AB-9A8E-141ED4D47B98' order by level_id;
import time
import pymysql
import requests
from bs4 import BeautifulSoup
import re
if __name__ == '__main__':
    # Purpose: for every area row of one province in ds_db.t_dm_area, fetch the
    # matching Baidu Baike entry, extract its summary paragraph, strip the
    # citation markers and store the result in the row's `memo` column.

    # Seconds to wait for Baidu Baike before giving up on a single page;
    # without a timeout one stalled connection would hang the whole run.
    request_timeout = 10
    # Matches citation markers such as "[1]" or "[2-3]"; compiled once because
    # it is applied to every page.
    citation_pattern = re.compile(r'\[\d+(?:-\d+)?\]')

    # NOTE(review): host and credentials are hard-coded in source; consider
    # moving them to configuration or environment variables.
    conn = pymysql.connect(
        host='10.10.14.203',  # host name or IP address
        port=3306,  # MySQL default port
        user='root',  # user name
        password='Password123@mysql',  # password
        charset='utf8mb4'  # connection character set
    )
    try:
        cursor = conn.cursor()
        try:
            # Select the working database.
            conn.select_db("ds_db")
            cursor.execute(
                "SELECT id,full_name FROM t_dm_area where province_id='FD61813E-70A1-42AB-9A8E-141ED4D47B98' order by level_id")
            # fetchall() returns every matching row as (id, full_name).
            rows = cursor.fetchall()
            for area_id, area_name in rows:
                if not area_name:
                    # A NULL/empty name cannot be turned into an entry URL.
                    continue
                url = "https://baike.baidu.com/item/" + area_name + "?fromModule=lemma_search-box"
                print(url)
                try:
                    response = requests.get(url, timeout=request_timeout)
                except requests.RequestException as exc:
                    # One unreachable page must not abort the remaining rows.
                    print('Failed to retrieve the webpage', exc)
                else:
                    if response.status_code == 200:
                        soup = BeautifulSoup(response.text, 'html.parser')
                        # Only the first summary div is used; select_one
                        # returns None when the page has no such element.
                        summary_div = soup.select_one('div.para_YYuCh.summary_nfAdr.MARK_MODULE')
                        if summary_div is not None:
                            text = summary_div.get_text(strip=True)
                            cleaned_text = citation_pattern.sub('', text)
                            sql = "update t_dm_area set memo=%s where id=%s"
                            cursor.execute(sql, (cleaned_text, area_id))
                            conn.commit()
                            print("更新"+area_name+"数据成功")
                    else:
                        print('Failed to retrieve the webpage')
                # Throttle requests so the site is not hammered.
                time.sleep(2)
        finally:
            cursor.close()
    finally:
        # Always release the connection, even when a query or fetch fails.
        conn.close()
    print("结束")

@ -3,6 +3,7 @@ package com.dsideal.base.DataEase.Controller;
import com.dsideal.base.Base.Model.BaseModel;
import com.dsideal.base.DataEase.Model.DataEaseModel;
import com.dsideal.base.Interceptor.IsLoginInterface;
import com.dsideal.base.Interceptor.IsNumericInterface;
import com.dsideal.base.Util.CommonUtil;
import com.dsideal.base.Util.CookieUtil;
import com.jfinal.aop.Before;
@ -40,6 +41,7 @@ public class DataEaseController extends Controller {
*/
@Before(GET.class)
@IsLoginInterface({})
@IsNumericInterface({"dataset_id"})
public void getDataSetTable(int dataset_id) {
List<Record> list = dm.getDataSetTableContent(dataset_id);
renderJson(CommonUtil.renderJsonForLayUI(list));
@ -63,7 +65,7 @@ public class DataEaseController extends Controller {
JSONArray jsonArray = JSONArray.fromObject(data);
for (Object o : jsonArray) {
net.sf.json.JSONObject jo = (net.sf.json.JSONObject) o;
if (jo.containsKey("行政区划") && !jo.getString("行政区划").equals(xmqh)) {
if (identity_id>1 && jo.containsKey("行政区划") && !jo.getString("行政区划").equals(xmqh)) {
renderJson(CommonUtil.returnMessageJson(false, "数据集数据与当前登录人员所属行政区划不一致,请重新选择数据集!"));
return;
}

@ -55,6 +55,9 @@ public class DataEaseModel {
if (identity_id > 1) {
String sql = "delete from dataease.`" + tableName + "` where `行政区划`=?";
Db.update(sql, xmqh);
} else {
String sql = "delete from dataease.`" + tableName + "`";
Db.update(sql);
}
List<com.jfinal.plugin.activerecord.Record> list = new ArrayList<>();
for (int i = 0; i < ja.size(); i++) {
@ -68,6 +71,6 @@ public class DataEaseModel {
list.add(record);
}
}
Db.use("dataease").batchSave("`" + tableName + "`", list, 100);
Db.use("dataease").batchSave(tableName, list, 100);
}
}

Loading…
Cancel
Save