You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
124 lines
3.7 KiB
124 lines
3.7 KiB
import json

from selenium import webdriver  # drives the browser
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait  # waits for page elements to load

from Util import MysqlUtil
from Util.CommonUtil import printf


# Save a career record
|
|
|
|
|
|
def saveZhiYe(ZhiYeCode, zhiYeName, zyjs):
    """Insert one career row into ``t_university_zhiye`` and commit it.

    Runs against the module-level ``db`` connection.

    Args:
        ZhiYeCode: Career code.
        zhiYeName: Career name.
        zyjs: Career introduction text.
    """
    row = (ZhiYeCode, zhiYeName, zyjs)
    insert_stmt = (
        'insert into t_university_zhiye(ZhiYeCode,zhiYeName,zyjs) '
        'values(%s,%s,%s)'
    )
    db.execute(insert_stmt, row)
    db.commit()
|
|
|
|
|
|
# Positions associated with a career
|
|
def saveZhiYeGW(ZhiYeCode, gw_name, gw_salary, gw_area):
    """Insert one career-to-position row into ``t_university_zhiye_gw`` and commit it.

    Runs against the module-level ``db`` connection.

    Args:
        ZhiYeCode: Code of the career the position belongs to.
        gw_name: Position name.
        gw_salary: Salary text for the position (may be an empty string).
        gw_area: Area text for the position (may be an empty string).
    """
    row = (ZhiYeCode, gw_name, gw_salary, gw_area)
    insert_stmt = (
        'insert into t_university_zhiye_gw(ZhiYeCode,gw_name,gw_salary,gw_area) '
        'values(%s,%s,%s,%s)'
    )
    db.execute(insert_stmt, row)
    db.commit()
|
|
|
|
|
|
# Majors associated with a career
|
|
def saveZhiYeZY(ZhiYeCode, ZhuanYeCode, ZhuanYeName, XueZhi, Nvbl):
    """Insert one career-to-major row into ``t_university_zhiye_zy`` and commit it.

    Runs against the module-level ``db`` connection.

    Args:
        ZhiYeCode: Code of the career the major is related to.
        ZhuanYeCode: Major code.
        ZhuanYeName: Major name.
        XueZhi: Presumably the length of schooling shown on the page -- confirm.
        Nvbl: Presumably the gender-ratio text shown on the page -- confirm.
    """
    row = (ZhiYeCode, ZhuanYeCode, ZhuanYeName, XueZhi, Nvbl)
    insert_stmt = (
        'insert into t_university_zhiye_zy(ZhiYeCode,ZhuanYeCode,ZhuanYeName,XueZhi,Nvbl) '
        'values(%s,%s,%s,%s,%s)'
    )
    db.execute(insert_stmt, row)
    db.commit()
|
|
|
|
|
|
def getZhiYe(code):
    """Scrape one career detail page and persist everything it lists.

    Loads the detail page for ``code`` in the module-level ``driver``, then
    stores the career name and introduction via ``saveZhiYe``, every related
    position via ``saveZhiYeGW`` and every related major via ``saveZhiYeZY``.
    Each save commits on its own, so a failure part-way through leaves the
    rows saved so far in the database.

    Args:
        code: Career code as a string; it is concatenated onto the detail URL.

    Raises:
        selenium.common.exceptions.NoSuchElementException: If the career-name
            element is not found on the page.
        IndexError: If an expected page section is missing, or a major entry
            has fewer than two lines of text.
    """
    url = 'https://pv4y-pc-tob.youzy.cn/careers/detail/index?code=' + code
    driver.get(url)

    # Selenium 4.3 removed the find_element(s)_by_* helpers; the
    # find_element(By, value) form works on both Selenium 3 and 4.

    # Career name
    zhiYeName = driver.find_element(By.CSS_SELECTOR, "[class='f28 lh40 text-333']").text
    print(zhiYeName)

    # Career introduction: first element matching the selector
    zyjs = driver.find_elements(By.CSS_SELECTOR, "[class='f14 text-666']")[0].text

    # Save the career itself
    saveZhiYe(code, zhiYeName, zyjs)

    # Related positions live in the second 'mb40' section
    position_sections = driver.find_elements(By.CSS_SELECTOR, "[class='mb40']")
    position_items = position_sections[1].find_elements(By.CSS_SELECTOR, "[class='post-item mb20']")
    for item in position_items:
        lines = item.text.split('\n')
        position_name = lines[0]
        salary = ''
        area = ''

        # Two lines: name + area. Three lines: name + salary + area.
        # Any other line count leaves salary and area empty.
        if len(lines) == 2:
            area = lines[1]
        elif len(lines) == 3:
            salary = lines[1]
            area = lines[2]

        saveZhiYeGW(code, position_name, salary, area)

    # Related majors live in the first 'pb60' section
    major_sections = driver.find_elements(By.CSS_SELECTOR, "[class='pb60']")
    major_items = major_sections[0].find_elements(By.CSS_SELECTOR, "[class='post-item mb20']")
    for item in major_items:
        lines = item.text.split('\n')
        # The 4th line is only used when the entry has exactly four lines
        nvbl = lines[3] if len(lines) == 4 else ''
        xz = lines[2] if len(lines) > 2 else ''
        # Line 0 is the major name; line 1 is the code with a label prefix to strip
        major_code = lines[1].replace("国标代码:", "")
        saveZhiYeZY(code, major_code, lines[0], xz, nvbl)
|
|
|
|
|
|
if __name__ == '__main__':
    # Script entry point: rebuild the three career tables by scraping every
    # 8-character career code found in t_university_zhiye_tree.
    #
    # NOTE: db and driver are deliberately assigned at module level here;
    # saveZhiYe/saveZhiYeGW/saveZhiYeZY/getZhiYe read them as globals.

    # Load the database connection settings
    with open("Config.json", 'r') as load_f:
        connect = json.load(load_f)
    db = MysqlUtil.MySQLConnect(connect)

    try:
        # Start Chrome with default options
        driver = webdriver.Chrome()
        try:
            driver.implicitly_wait(2)
            driver.maximize_window()

            # Visit the main site first -- presumably to establish a
            # session/cookies before hitting detail pages; confirm.
            authPage = 'https://www.youzy.cn/tzy/search/colleges/collegeList'
            driver.get(authPage)

            # Empty the target tables so the run starts from scratch
            for table in ("t_university_zhiye",
                          "t_university_zhiye_gw",
                          "t_university_zhiye_zy"):
                sql = "truncate table " + table
                db.execute(sql)
                db.commit()

            # Scrape every career whose code is 8 characters long
            sql = 'select zhiyecode from t_university_zhiye_tree where length(zhiyecode)=8'
            s = db.fetchall(sql)
            total = len(s)
            for num, x in enumerate(s, 1):
                try:
                    getZhiYe(x['zhiyecode'])
                    printf("正在爬取第" + str(num) + "个职业信息,共" + str(total) + "个职业信息。")
                except Exception as err:
                    # Best effort: report the failing code and move on to the next career
                    printf(x['zhiyecode'] + "发生异常,请检查!")
                    print(err)

            # To debug a single career: getZhiYe('06051001')
        finally:
            # quit() closes every window and ends the session, so a separate
            # close() beforehand is unnecessary
            driver.quit()
    finally:
        # Always release the database connection, even if the run aborted
        db.close()

    printf("恭喜,所有操作成功完成!")
|