python爬虫获取学信网学校与对应专业信息

最新推荐文章于 2024-02-23 11:56:33 发布

JevonK

最新推荐文章于 2024-02-23 11:56:33 发布

阅读量3.5k

点赞数

分类专栏： python 文章标签： python

本文链接：https://blog.csdn.net/zanj0525/article/details/84749327

版权

python 专栏收录该内容

3 篇文章 0 订阅

订阅专栏

python 学习有一段时间，于是开始着手写了一个入门爬虫试试，话不多说直接上源码了。

import requests 
import bs4
import pymysql
import json
from xpinyin import Pinyin

host = "localhost"
user = "root"
password = "123456"
database = "test"
# 打开数据库连接
db = pymysql.connect(host, user, password, database)
# 使用 cursor() 方法创建一个游标对象 cursor
cursor = db.cursor()
#判断是否存在学校表（school）和专业表（major）
cursor.execute("drop table if exists school")
cursor.execute("drop table if exists major") 
# 使用 cursor() 方法创建一个游标对象 cursor
cursor = db.cursor()
#第一次打开网页
res = requests.get('https://gaokao.chsi.com.cn/sch/search.do?start=0') 

res.raise_for_status() 
soup = bs4.BeautifulSoup(res.text, 'html.parser') 
#获取总页数
page_num = soup.select('.ch-page li')[7].string
th = soup.select('th')
th_herd = []
#申明汉子转换拼音对象
pin = Pinyin() 
field = ""
for item in th:
	field += (","+pin.get_pinyin(item.get_text(), '_')+" varchar(100)")
	th_herd.append(pin.get_pinyin(item.get_text(), '_'))
#学校表创建sql语句
sql = """CREATE TABLE school (id int NOT NULL AUTO_INCREMENT """+field+""",PRIMARY KEY ( id ))"""
sql1 = """CREATE TABLE major (s_id int , name varchar(100))"""
#执行语句
cursor.execute(sql);
cursor.execute(sql1);
# 转化数组为逗号分隔的字符串
delimiter = ','
field = delimiter.join(th_herd)
j = 0
m = 1
while j<int(page_num):
	#输出数据量
	print(j*20) 
	res = requests.get('https://gaokao.chsi.com.cn/sch/search.do?start=%s'%(j*20)) 
	res.raise_for_status() 
	soup = bs4.BeautifulSoup(res.text, 'html.parser') 
	tr = soup.select("table tr")
	i = 0
	a = ""
	a1 = ""
	while i<len(tr):
		if i == 0:
			i += 1
			continue
		b = ''
		for x in tr[i].select('td')[5].select('span'):
			b = b + "    " + str(x.get_text())
		c = (1 if (tr[i].select('td')[6].string) is None else 0)
		d = (";" if (i+1) is len(tr) else ",")
		a += "('%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')%s" % (tr[i].select('td')[0].get_text().strip(), tr[i].select('td')[1].get_text().strip(), tr[i].select('td')[2].get_text().strip(),tr[i].select('td')[3].get_text().strip(), tr[i].select('td')[4].get_text().strip(), b, c, tr[i].select('td')[7].get_text().strip(), d)
		#获取学校专业
		##查询学校
		res1 = requests.get('https://gaokao.chsi.com.cn/zyk/pub/myd/specAppraisalTop.action?yxmc=%s'%(tr[i].select('td')[0].get_text().strip())) 
		res1.raise_for_status() 
		soup1 = bs4.BeautifulSoup(res1.text, 'html.parser') 
		url = soup1.select('.check_detail')
		##查看全部专业
		u = 0
		for x in url:
			if u > 1:
				break
			u = u + 1
			res1 = requests.get('https://gaokao.chsi.com.cn/%s'%(x['href'])) 
			res1.raise_for_status() 
			soup1 = bs4.BeautifulSoup(res1.text, 'html.parser') 
			td = soup1.select(".first_td")
			j1 = 0
			while j1<len(td):
				if j1 == 0:
					j1 = j1 + 1
					continue
				d1 = ','
				a1 = a1 + "(%s, '%s')%s" % (m, td[j1].get_text().strip(), d1)
				j1 = j1 + 1
		i = i + 1
		m = m + 1
	#执行新增语句
	sql = """INSERT INTO school("""+field+""") VALUES """ + a
	sql1 = """INSERT INTO major(s_id,name) VALUES """ + a1
	j = j + 1
	try:
	   	# 执行sql语句	
	   	cursor.execute(sql)
	   	cursor.execute(sql1[:-1])
	   	# 提交到数据库执行
	   	db.commit()
	except:
	   	# 如果发生错误则回滚
	   	print(sql1)
	   	exit()
	   	db.rollback()
db.close()

如有不同理解，欢迎评论沟通。获取以运行好的数据库文件请点击这里。

JevonK

关注

0
点赞
踩
15

收藏

觉得还不错? 一键收藏
2
评论
python爬虫获取学信网学校与对应专业信息

python 学习有一段时间，于是开始着手写了一个入门爬虫试试，话不多说直接上源码了。 import requests import bs4import pymysqlimport jsonfrom xpinyin import Pinyinhost = "localhost"user = "root"password = "123456"database = "tes...
复制链接

扫一扫