You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

124 lines
3.7 KiB

import json

from selenium import webdriver # 用来驱动浏览器的
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait # 等待页面加载某些元素

from Util import MysqlUtil
from Util.CommonUtil import printf


# 保存职业
def saveZhiYe(ZhiYeCode, zhiYeName, zyjs):
    """Insert one occupation row into ``t_university_zhiye`` and commit.

    Uses the module-level ``db`` connection. The three arguments are bound
    as query parameters to the ``ZhiYeCode``, ``zhiYeName`` and ``zyjs``
    columns, in that order.
    """
    row = (ZhiYeCode, zhiYeName, zyjs)
    db.execute(
        'insert into t_university_zhiye(ZhiYeCode,zhiYeName,zyjs) values(%s,%s,%s)',
        row,
    )
    db.commit()
def saveZhiYeGW(ZhiYeCode, gw_name, gw_salary, gw_area):
    """Insert one job post belonging to an occupation and commit.

    Writes a row to ``t_university_zhiye_gw`` through the module-level
    ``db`` connection. The arguments are bound as query parameters to the
    ``ZhiYeCode``, ``gw_name``, ``gw_salary`` and ``gw_area`` columns, in
    that order.
    """
    row = (ZhiYeCode, gw_name, gw_salary, gw_area)
    db.execute(
        'insert into t_university_zhiye_gw(ZhiYeCode,gw_name,gw_salary,gw_area) values(%s,%s,%s,%s)',
        row,
    )
    db.commit()
def saveZhiYeZY(ZhiYeCode, ZhuanYeCode, ZhuanYeName, XueZhi, Nvbl):
    """Insert one major belonging to an occupation and commit.

    Writes a row to ``t_university_zhiye_zy`` through the module-level
    ``db`` connection. The arguments are bound as query parameters to the
    ``ZhiYeCode``, ``ZhuanYeCode``, ``ZhuanYeName``, ``XueZhi`` and ``Nvbl``
    columns, in that order.
    """
    row = (ZhiYeCode, ZhuanYeCode, ZhuanYeName, XueZhi, Nvbl)
    db.execute(
        'insert into t_university_zhiye_zy(ZhiYeCode,ZhuanYeCode,ZhuanYeName,XueZhi,Nvbl) values(%s,%s,%s,%s,%s)',
        row,
    )
    db.commit()
def getZhiYe(code):
    """Scrape one occupation detail page and persist what it shows.

    Loads the detail page for ``code`` in the module-level ``driver`` and
    stores, in this order: the occupation itself (``saveZhiYe``), each
    related job post (``saveZhiYeGW``) and each related major
    (``saveZhiYeZY``). Every save commits on its own, so rows written
    before a failure stay in the database.

    Elements are located with ``find_element(By.CSS_SELECTOR, ...)``; the
    ``find_element_by_*`` helpers no longer exist in Selenium 4.3+.

    Args:
        code: Occupation code, concatenated onto the detail-page URL
            (must be a ``str``).

    Raises:
        IndexError: If an expected section is absent from the page, or a
            major entry has fewer than two text lines.
        Exception: Errors from the Selenium driver or the database layer
            are not caught here and propagate to the caller.
    """
    url = 'https://pv4y-pc-tob.youzy.cn/careers/detail/index?code=' + code
    driver.get(url)

    # Occupation name.
    zhiYeName = driver.find_element(By.CSS_SELECTOR, "[class='f28 lh40 text-333']").text
    print(zhiYeName)
    # Occupation introduction: the first element carrying this class list.
    zyjs = driver.find_elements(By.CSS_SELECTOR, "[class='f14 text-666']")[0].text
    saveZhiYe(code, zhiYeName, zyjs)

    # Related job posts live in the second 'mb40' section of the page.
    sections = driver.find_elements(By.CSS_SELECTOR, "[class='mb40']")
    posts = sections[1].find_elements(By.CSS_SELECTOR, "[class='post-item mb20']")
    for post in posts:
        lines = post.text.split('\n')
        gw = lines[0]
        salary = ''
        rmdq = ''
        # Two lines: name + area. Three lines: name + salary + area.
        # Any other line count stores the name only.
        if len(lines) == 2:
            rmdq = lines[1]
        elif len(lines) == 3:
            salary, rmdq = lines[1], lines[2]
        saveZhiYeGW(code, gw, salary, rmdq)

    # Related majors live in the first 'pb60' section of the page.
    sections = driver.find_elements(By.CSS_SELECTOR, "[class='pb60']")
    majors = sections[0].find_elements(By.CSS_SELECTOR, "[class='post-item mb20']")
    for major in majors:
        lines = major.text.split('\n')
        # Line 0 is the major name and line 1 the code line (its label
        # prefix is stripped below). Lines 2 and 3 are optional and stored
        # as XueZhi / Nvbl -- presumably schooling length and female
        # ratio; confirm against the page.
        nvbl = lines[3] if len(lines) == 4 else ''
        xz = lines[2] if len(lines) > 2 else ''
        saveZhiYeZY(code, lines[1].replace("国标代码:", ""), lines[0], xz, nvbl)
if __name__ == '__main__':
    # Script entry point: rebuild the three occupation tables by scraping
    # the detail page of every 8-character occupation code found in
    # t_university_zhiye_tree.

    # Load the database connection settings.
    with open("Config.json", 'r') as load_f:
        connect = json.load(load_f)

    # ``db`` and ``driver`` must remain module-level globals: the save*
    # functions and getZhiYe read them by name.
    db = MysqlUtil.MySQLConnect(connect)
    driver = None
    try:
        # Start Chrome; lookups wait up to 2 seconds for elements to appear.
        driver = webdriver.Chrome()
        driver.implicitly_wait(2)
        driver.maximize_window()
        # NOTE(review): this page is opened before any detail page,
        # presumably to establish a site session -- confirm.
        authPage = 'https://www.youzy.cn/tzy/search/colleges/collegeList'
        driver.get(authPage)

        # Empty the target tables so the run starts from a clean slate.
        for table in ('t_university_zhiye',
                      't_university_zhiye_gw',
                      't_university_zhiye_zy'):
            db.execute("truncate table " + table)
            db.commit()

        # Scrape every occupation whose code is 8 characters long.
        sql = 'select zhiyecode from t_university_zhiye_tree where length(zhiyecode)=8'
        rows = db.fetchall(sql)
        total = str(len(rows))
        for num, row in enumerate(rows, start=1):
            try:
                getZhiYe(row['zhiyecode'])
                printf("正在爬取第" + str(num) + "个职业信息,共" + total + "个职业信息。")
            except Exception as err:
                # Best effort: report the failing code and move on to the next.
                printf(row['zhiyecode'] + "发生异常,请检查!")
                print(err)
    finally:
        # Always release both resources, even when start-up or truncation
        # failed; quit() closes every window and ends the browser session.
        try:
            db.close()
        finally:
            if driver is not None:
                driver.quit()
    printf("恭喜,所有操作成功完成!")