"""Scrape major ("zhuanye") detail pages from youzy.cn into MySQL.

For every major code selected from the database, the script opens the major's
detail page in Chrome, reads the descriptive fields and replaces the matching
row in ``t_university_zhuanye``.
"""
import json
import time

from selenium import webdriver  # drives the browser
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait  # explicit waits (currently unused)

from Util import MysqlUtil

# Detail page of one major; the major code is appended as the query value.
_MAJOR_PAGE_URL = 'https://pv4y-pc-tob.youzy.cn/majors/small/index?code='

# Absolute XPath of the "career directions" paragraph on the job-prospects tab.
_JOB_DIRECTION_XPATH = "/html/body/div/div[1]/section/div/div/div[2]/div[4]/div/div[1]/p"

# Index of each descriptive field among the <p> elements inside #gaikuang.
_OVERVIEW_PARAGRAPH_INDEX = {
    'zyjs': 0,   # major introduction
    'pymb': 1,   # training objectives
    'pyyq': 2,   # training requirements
    'xkyq': 3,   # subject requirements
    'zsnl': 6,   # knowledge and abilities
    'zykc': 9,   # main courses
    'shmr': 10,  # notable people
}


def getZhuanYe(code):
    """Scrape one major's detail page and replace its row in the database.

    Uses the module-level ``driver`` (Selenium WebDriver) and ``db``
    (``MysqlUtil`` connection) created in the ``__main__`` section.

    Args:
        code: Major code appended to the detail-page URL. Converted with
            ``str()`` so non-string values coming from the database work.

    Raises:
        Exception: Failures while reading the mandatory fields (name,
            national code, the four overview values) or while writing to the
            database propagate to the caller. Only a failure to read the
            career directions is tolerated; that field is stored empty.
    """
    code = str(code)
    driver.get(_MAJOR_PAGE_URL + code)

    # Major name
    zhuanYeName = driver.find_element(By.CSS_SELECTOR, "[class='f28 lh40 text-333']").text
    print(zhuanYeName)

    # National standard code: keep only the digits of the displayed text
    gbCode = driver.find_element(By.CSS_SELECTOR, "[class='mt15 lh20 f14 text-999']").text
    zhuanYeCode = "".join(filter(str.isdigit, gbCode))

    # Overview values, in page order: education level, length of study,
    # degree awarded, gender ratio
    overview = driver.find_elements(By.CSS_SELECTOR, "[class='f18']")
    xlcc = overview[0].text
    xynx = overview[1].text
    syxw = overview[2].text
    nvbl = overview[3].text

    # Descriptive paragraphs; a field stays '' when the page has fewer
    # paragraphs than the field's index.
    paragraphs = driver.find_element(By.ID, 'gaikuang').find_elements(By.TAG_NAME, "p")
    details = {
        field: paragraphs[index].text if index < len(paragraphs) else ''
        for field, index in _OVERVIEW_PARAGRAPH_INDEX.items()
    }

    # Career directions live on the job-prospects tab; best effort only.
    jyfx = ''
    try:
        driver.find_element(By.ID, "tab-second").click()
        time.sleep(3)  # give the tab content time to render
        jyfx = driver.find_element(By.XPATH, _JOB_DIRECTION_XPATH).text
    except Exception:
        # Not a bare ``except`` so Ctrl-C / SystemExit still stop the crawl.
        print("没有取到就业方向:" + code)

    # Store: delete any existing row for this code first, then insert.
    sql = "delete from t_university_zhuanye where zhuanYeCode=%s"
    db.execute(sql, zhuanYeCode)
    db.commit()

    sql = "insert into t_university_zhuanye(zhuanYeName,zhuanYeCode,xlcc,xynx,syxw,nvbl,zyjs,pymb,pyyq,xkyq,zsnl,zykc,shmr,jyfx) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
    db.execute(sql, (zhuanYeName, zhuanYeCode, xlcc, xynx, syxw, nvbl,
                     details['zyjs'], details['pymb'], details['pyyq'],
                     details['xkyq'], details['zsnl'], details['zykc'],
                     details['shmr'], jyfx))
    db.commit()


if __name__ == '__main__':
    # Initialise the database connection from the JSON config file.
    with open("Config.json", 'r') as load_f:
        connect = json.load(load_f)
    db = MysqlUtil.MySQLConnect(connect)

    try:
        # Start Chrome through webdriver.
        option = Options()
        # option.add_argument('--headless')
        # option.add_argument('--disable-gpu')
        driver = webdriver.Chrome(options=option)
        try:
            driver.implicitly_wait(2)
            driver.maximize_window()

            # Open the college list page first, before visiting major pages.
            authPage = 'https://www.youzy.cn/tzy/search/colleges/collegeList'
            driver.get(authPage)

            # Full crawl:
            # sql = 'select level3_code from t_university_major'
            # Backfill: only majors whose career directions are still empty.
            sql = 'select zhuanYeCode as level3_code from t_university_zhuanye where length(jyfx)=0'
            s = db.fetchall(sql)
            total = len(s)

            for cnt, x in enumerate(s, start=1):
                major_code = x['level3_code']
                try:
                    getZhuanYe(major_code)
                    print("正在爬取第" + str(cnt) + "个专业信息,共" + str(total) + "个专业信息。")
                except Exception as exc:
                    # One failing major must not abort the whole crawl, but
                    # Ctrl-C still does because only Exception is caught.
                    print(str(major_code) + "发生异常,请检查!")
                    print(repr(exc))
        finally:
            # quit() closes every window and ends the browser session.
            driver.quit()
    finally:
        # Close the database connection even if the crawl failed.
        db.close()