You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

134 lines
3.9 KiB

import json
import time
from selenium import webdriver # 用来驱动浏览器的
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.wait import WebDriverWait # 等待页面加载某些元素
from Util import MysqlUtil
def getZhuanYe(code):
    """Scrape one major's detail page and store it in ``t_university_zhuanye``.

    Relies on the module-level ``driver`` (Selenium WebDriver) and ``db``
    (database wrapper) objects that the script entry point creates.

    Args:
        code: Major code as a string; it is appended to the detail-page URL.

    Raises:
        Any exception raised by the page lookups outside the job-prospects
        step, or by the database calls, propagates to the caller. A failure
        while reading the job-prospects tab is reported on stdout and leaves
        ``jyfx`` empty instead.
    """
    # Local import: the ``find_element_by_*`` helpers were removed in
    # Selenium 4.3, while ``find_element(By.…, …)`` works on Selenium 3 and 4.
    from selenium.webdriver.common.by import By

    url = 'https://pv4y-pc-tob.youzy.cn/majors/small/index?code=' + code
    driver.get(url)

    # Major name
    zhuanYeName = driver.find_element(By.CSS_SELECTOR, "[class='f28 lh40 text-333']").text
    print(zhuanYeName)

    # National-standard code: keep only the digits of the label text.
    gbCode = driver.find_element(By.CSS_SELECTOR, "[class='mt15 lh20 f14 text-999']").text
    zhuanYeCode = "".join(filter(str.isdigit, gbCode))

    # Overview values, in page order: education level, length of study,
    # degree awarded, male/female ratio.
    overview = driver.find_elements(By.CSS_SELECTOR, "[class='f18']")
    xlcc = overview[0].text
    xynx = overview[1].text
    syxw = overview[2].text
    nvbl = overview[3].text

    # Free-text sections are identified by the position of their <p> element
    # inside the "gaikuang" container; positions not listed here are ignored.
    paragraph_fields = {
        0: 'zyjs',   # major introduction
        1: 'pymb',   # training objectives
        2: 'pyyq',   # training requirements
        3: 'xkyq',   # subject requirements
        6: 'zsnl',   # knowledge and abilities
        9: 'zykc',   # main courses
        10: 'shmr',  # notable people
    }
    # Every section defaults to an empty string when its paragraph is absent.
    sections = dict.fromkeys(paragraph_fields.values(), '')
    paragraphs = driver.find_element(By.ID, 'gaikuang').find_elements(By.TAG_NAME, "p")
    for position, paragraph in enumerate(paragraphs):
        field = paragraph_fields.get(position)
        if field is not None:
            sections[field] = paragraph.text

    # Employment directions live behind the job-prospects tab; this step is
    # best-effort, so a failure only leaves the field empty.
    jyfx = ''
    try:
        driver.find_element(By.ID, "tab-second").click()
        time.sleep(3)  # fixed pause before reading the tab content
        jyfx = driver.find_element(
            By.XPATH,
            "/html/body/div/div[1]/section/div/div/div[2]/div[4]/div/div[1]/p",
        ).text
    except Exception:
        # ``Exception`` rather than a bare ``except`` so Ctrl-C and
        # interpreter shutdown are not swallowed here.
        print("没有取到就业方向:" + code)

    # Persist: delete any existing row for this code, then insert the new one.
    sql = "delete from t_university_zhuanye where zhuanYeCode=%s"
    db.execute(sql, zhuanYeCode)
    db.commit()

    sql = "insert into t_university_zhuanye(zhuanYeName,zhuanYeCode,xlcc,xynx,syxw,nvbl,zyjs,pymb,pyyq,xkyq,zsnl,zykc,shmr,jyfx) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
    db.execute(
        sql,
        (
            zhuanYeName, zhuanYeCode, xlcc, xynx, syxw, nvbl,
            sections['zyjs'], sections['pymb'], sections['pyyq'],
            sections['xkyq'], sections['zsnl'], sections['zykc'],
            sections['shmr'], jyfx,
        ),
    )
    db.commit()
if __name__ == '__main__':
    # Script entry point: open the database and a Chrome session, crawl every
    # major code returned by the query below, then release both resources.
    # ``db`` and ``driver`` are deliberately module-level because getZhuanYe
    # reads them as globals.

    # Load the connection settings and open the database connection.
    with open("Config.json", 'r') as load_f:
        connect = json.load(load_f)
    db = MysqlUtil.MySQLConnect(connect)
    try:
        # Start Chrome; uncomment the arguments below to run without a window.
        option = Options()
        # option.add_argument('--headless')
        # option.add_argument('--disable-gpu')
        driver = webdriver.Chrome(options=option)
        try:
            # Element lookups wait up to 2 seconds before failing.
            driver.implicitly_wait(2)
            driver.maximize_window()
            # Open the site's college list first.
            # NOTE(review): presumably this establishes a session before the
            # detail pages are requested — confirm.
            authPage = 'https://www.youzy.cn/tzy/search/colleges/collegeList'
            driver.get(authPage)

            # Full crawl would use:
            # sql = 'select level3_code from t_university_major'
            # Backfill run: only majors whose stored jyfx is still empty.
            sql = 'select zhuanYeCode as level3_code from t_university_zhuanye where length(jyfx)=0'
            s = db.fetchall(sql)
            total = len(s)
            for cnt, x in enumerate(s, start=1):
                try:
                    getZhuanYe(x['level3_code'])
                    print("正在爬取第" + str(cnt) + "个专业信息,共" + str(total) + "个专业信息。")
                except Exception as exc:
                    # One failing major must not stop the run, but Ctrl-C
                    # still can: only Exception subclasses are caught.
                    # str() keeps the handler itself from failing when the
                    # code is not a string.
                    print(str(x['level3_code']) + "发生异常,请检查!")
                    print(repr(exc))
        finally:
            # quit() closes every window and ends the browser session, even
            # when the crawl above raised.
            driver.quit()
    finally:
        # Always release the database connection.
        db.close()