"""Scrape career ("zhiye") detail pages from youzy.cn into MySQL.

For every 8-character career code found in ``t_university_zhiye_tree`` the
script opens the career detail page in Chrome, reads the career name,
introduction, related positions and related majors, and stores them in
``t_university_zhiye``, ``t_university_zhiye_gw`` and
``t_university_zhiye_zy``.  All three target tables are truncated first.

The module-level names ``db`` and ``driver`` are created in the
``__main__`` block and are used as globals by the functions below.
"""
import json

from selenium import webdriver  # drives the browser
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait  # wait for page elements to load

from Util import MysqlUtil
from Util.CommonUtil import printf


def saveZhiYe(ZhiYeCode, zhiYeName, zyjs):
    """Insert one career row (code, name, introduction) and commit."""
    sql = 'insert into t_university_zhiye(ZhiYeCode,zhiYeName,zyjs) values(%s,%s,%s)'
    db.execute(sql, (ZhiYeCode, zhiYeName, zyjs))
    db.commit()


def saveZhiYeGW(ZhiYeCode, gw_name, gw_salary, gw_area):
    """Insert one position row belonging to a career and commit."""
    sql = 'insert into t_university_zhiye_gw(ZhiYeCode,gw_name,gw_salary,gw_area) values(%s,%s,%s,%s)'
    db.execute(sql, (ZhiYeCode, gw_name, gw_salary, gw_area))
    db.commit()


def saveZhiYeZY(ZhiYeCode, ZhuanYeCode, ZhuanYeName, XueZhi, Nvbl):
    """Insert one major row belonging to a career and commit."""
    sql = 'insert into t_university_zhiye_zy(ZhiYeCode,ZhuanYeCode,ZhuanYeName,XueZhi,Nvbl) values(%s,%s,%s,%s,%s)'
    db.execute(sql, (ZhiYeCode, ZhuanYeCode, ZhuanYeName, XueZhi, Nvbl))
    db.commit()


def _parse_gw(text):
    """Split a position card's text into ``(name, salary, area)``.

    The first line is always the position name.  With two lines the second
    is stored as the area; with three lines the second is the salary and
    the third the area.  Any other line count leaves both fields empty.
    """
    lines = text.split('\n')
    salary = ''
    area = ''
    if len(lines) == 2:
        area = lines[1]
    elif len(lines) == 3:
        salary = lines[1]
        area = lines[2]
    return lines[0], salary, area


def _parse_zy(text):
    """Split a major card's text into ``(code, name, xue_zhi, nvbl)``.

    Line 1 is the major name and line 2 the code (with the "国标代码:"
    label stripped).  Line 3, when present, is stored as XueZhi; line 4 is
    stored as Nvbl only when the card has exactly four lines.
    NOTE(review): XueZhi / Nvbl presumably mean study length and gender
    ratio -- confirm against the page.

    Raises:
        IndexError: if the card has fewer than two lines.
    """
    lines = text.split('\n')
    nvbl = lines[3] if len(lines) == 4 else ''
    xue_zhi = lines[2] if len(lines) > 2 else ''
    return lines[1].replace("国标代码:", ""), lines[0], xue_zhi, nvbl


def getZhiYe(code):
    """Scrape one career detail page and persist everything found on it.

    Args:
        code: career code appended to the detail-page URL.

    Raises:
        IndexError: if an expected page section is missing.
        Selenium exceptions from element lookups propagate to the caller.
    """
    url = 'https://pv4y-pc-tob.youzy.cn/careers/detail/index?code=' + code
    driver.get(url)

    # Career name.  find_element(By...) replaces the find_element_by_*
    # helpers, which no longer exist in current Selenium releases.
    zhiYeName = driver.find_element(By.CSS_SELECTOR, "[class='f28 lh40 text-333']").text
    print(zhiYeName)

    # Career introduction: first element matching the selector.
    zyjs = driver.find_elements(By.CSS_SELECTOR, "[class='f14 text-666']")[0].text

    # Save the career itself before its child rows.
    saveZhiYe(code, zhiYeName, zyjs)

    # Related positions live in the second 'mb40' section.
    sections = driver.find_elements(By.CSS_SELECTOR, "[class='mb40']")
    for item in sections[1].find_elements(By.CSS_SELECTOR, "[class='post-item mb20']"):
        gw, salary, rmdq = _parse_gw(item.text)
        saveZhiYeGW(code, gw, salary, rmdq)

    # Related majors live in the first 'pb60' section.
    sections = driver.find_elements(By.CSS_SELECTOR, "[class='pb60']")
    for item in sections[0].find_elements(By.CSS_SELECTOR, "[class='post-item mb20']"):
        zy_code, zy_name, xz, nvbl = _parse_zy(item.text)
        saveZhiYeZY(code, zy_code, zy_name, xz, nvbl)


if __name__ == '__main__':
    # Initialise the database connection from the local config file.
    with open("Config.json", 'r') as load_f:
        connect = json.load(load_f)
    db = MysqlUtil.MySQLConnect(connect)
    try:
        # Open the webdriver.
        driver = webdriver.Chrome()
        try:
            driver.implicitly_wait(2)
            driver.maximize_window()

            authPage = 'https://www.youzy.cn/tzy/search/colleges/collegeList'
            driver.get(authPage)

            # Clear the target tables so the run starts from scratch.
            for table in ("t_university_zhiye",
                          "t_university_zhiye_gw",
                          "t_university_zhiye_zy"):
                db.execute("truncate table " + table)
                db.commit()

            # Crawl every career whose code is 8 characters long.
            sql = 'select zhiyecode from t_university_zhiye_tree where length(zhiyecode)=8'
            s = db.fetchall(sql)
            total = len(s)
            for num, x in enumerate(s, start=1):
                try:
                    getZhiYe(x['zhiyecode'])
                    printf("正在爬取第" + str(num) + "个职业信息,共" + str(total) + "个职业信息。")
                except Exception as err:
                    # Best effort: report the failing code and keep crawling.
                    printf(x['zhiyecode'] + "发生异常,请检查!")
                    print(err)
            # To debug a single career, call e.g. getZhiYe('06051001').
        finally:
            # Shut down selenium even when the crawl aborts; quit() ends the
            # session and closes every window, so no separate close() call.
            driver.quit()
    finally:
        # Close the database even when the crawl aborts.
        db.close()
    printf("恭喜,所有操作成功完成!")