You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
134 lines
3.9 KiB
134 lines
3.9 KiB
import json
|
|
import time
|
|
|
|
from selenium import webdriver # 用来驱动浏览器的
|
|
from selenium.webdriver.chrome.options import Options
|
|
from selenium.webdriver.support.wait import WebDriverWait # 等待页面加载某些元素
|
|
|
|
from Util import MysqlUtil
|
|
|
|
|
|
def getZhuanYe(code):
    """Scrape one major's detail page and store it in ``t_university_zhuanye``.

    Uses the module-level ``driver`` (Selenium WebDriver) and ``db``
    (database helper) that the script entry point creates.

    Args:
        code: Major code appended to the detail-page URL as the ``code``
            query parameter.

    Raises:
        Exception: Errors from the required page lookups (a missing element,
            or fewer than four overview fields) and from the database calls
            are not handled here and propagate to the caller. Only the
            employment-direction lookup is best-effort.
    """
    # Imported locally so this function is self-contained. The locator-based
    # find_element(By.X, ...) API works on Selenium 3 and 4, whereas the
    # find_element_by_* helpers were removed in Selenium 4.3.
    from selenium.webdriver.common.by import By

    url = 'https://pv4y-pc-tob.youzy.cn/majors/small/index?code=' + code
    driver.get(url)

    # Major name
    zhuanYeName = driver.find_element(By.CSS_SELECTOR, "[class='f28 lh40 text-333']").text
    print(zhuanYeName)

    # National standard code: keep only the digits of the displayed text.
    gbCode = driver.find_element(By.CSS_SELECTOR, "[class='mt15 lh20 f14 text-999']").text
    zhuanYeCode = "".join(filter(str.isdigit, gbCode))

    # Major overview: the first four "f18" elements, in page order.
    overview = driver.find_elements(By.CSS_SELECTOR, "[class='f18']")
    xlcc = overview[0].text  # education level
    xynx = overview[1].text  # length of study
    syxw = overview[2].text  # degree awarded
    nvbl = overview[3].text  # male/female ratio

    # Descriptive sections are read by position from the <p> elements inside
    # the "gaikuang" container; a missing position yields an empty string.
    paragraphs = driver.find_element(By.ID, 'gaikuang').find_elements(By.TAG_NAME, "p")

    def paragraph_text(index):
        """Return the text of the index-th paragraph, or '' if it is absent."""
        return paragraphs[index].text if index < len(paragraphs) else ''

    zyjs = paragraph_text(0)   # major introduction
    pymb = paragraph_text(1)   # training objectives
    pyyq = paragraph_text(2)   # training requirements
    xkyq = paragraph_text(3)   # subject requirements
    zsnl = paragraph_text(6)   # knowledge and skills
    zykc = paragraph_text(9)   # main courses
    shmr = paragraph_text(10)  # notable people

    # Employment direction lives behind the second tab. This lookup is
    # best-effort: on failure the field stays empty and the row is still saved.
    jyfx = ''
    try:
        driver.find_element(By.ID, "tab-second").click()
        time.sleep(3)  # give the tab content time to render
        jyfx = driver.find_element(
            By.XPATH,
            "/html/body/div/div[1]/section/div/div/div[2]/div[4]/div/div[1]/p").text
    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still stop the run.
        print("没有取到就业方向:" + code)

    # Store the record: delete any existing row for this code, then insert.
    # NOTE(review): the delete is committed before the insert runs, so a
    # failing insert leaves the major without a row — confirm this is acceptable.
    sql = "delete from t_university_zhuanye where zhuanYeCode=%s"
    db.execute(sql, zhuanYeCode)
    db.commit()

    sql = "insert into t_university_zhuanye(zhuanYeName,zhuanYeCode,xlcc,xynx,syxw,nvbl,zyjs,pymb,pyyq,xkyq,zsnl,zykc,shmr,jyfx) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
    db.execute(sql,
               (zhuanYeName, zhuanYeCode, xlcc, xynx, syxw, nvbl, zyjs, pymb, pyyq, xkyq, zsnl, zykc, shmr, jyfx))
    db.commit()
|
|
|
|
|
|
if __name__ == '__main__':
    # Script entry point: load the DB settings, start Chrome, and scrape every
    # major returned by the selection query. ``db`` and ``driver`` are left as
    # module-level names because getZhuanYe() reads them as globals.

    # Initialise the database connection from the JSON config file.
    with open("Config.json", 'r') as load_f:
        connect = json.load(load_f)
    db = MysqlUtil.MySQLConnect(connect)

    driver = None
    try:
        # Start the browser.
        option = Options()
        # Uncomment to run Chrome without a visible window:
        # option.add_argument('--headless')
        # option.add_argument('--disable-gpu')

        driver = webdriver.Chrome(options=option)
        driver.implicitly_wait(2)
        driver.maximize_window()
        authPage = 'https://www.youzy.cn/tzy/search/colleges/collegeList'
        driver.get(authPage)

        # Query for crawling every major:
        # sql = 'select level3_code from t_university_major'

        # Backfill run: only majors whose employment direction is still empty.
        sql = 'select zhuanYeCode as level3_code from t_university_zhuanye where length(jyfx)=0'
        s = db.fetchall(sql)
        total = len(s)
        for cnt, x in enumerate(s, start=1):
            try:
                getZhuanYe(x['level3_code'])
                print("正在爬取第" + str(cnt) + "个专业信息,共" + str(total) + "个专业信息。")
            except Exception as err:
                # One failing major must not abort the run, but report why it
                # failed. Catching Exception (not a bare except) keeps Ctrl-C
                # and SystemExit able to stop the script.
                print(str(x['level3_code']) + "发生异常,请检查!", repr(err))
    finally:
        # Always release the browser and the database connection, even when
        # start-up or the query fails part-way through.
        try:
            if driver is not None:
                driver.quit()  # quit() closes every window and ends the session
        finally:
            db.close()
|