中国大学排行榜相关信息爬取,Python实现
目录
网页信息爬取
import requests
# Connectivity check: fetch the 2020 ranking page and preview its HTML.
url = "https://www.shanghairanking.cn/rankings/bcur/2020.html"
try:
    # A timeout keeps the script from hanging forever on a stalled connection.
    r = requests.get(url, timeout=30)
    # Turn HTTP 4xx/5xx responses into an exception.
    r.raise_for_status()
    # Decode using the encoding detected from the body.
    r.encoding = r.apparent_encoding
    # Only the first 1000 characters are printed as a preview.
    print(r.text[:1000])
except requests.RequestException:
    # Catch network/HTTP failures only, so real programming errors and
    # Ctrl-C are no longer hidden behind the "failed" message.
    print("爬取失败")
爬取主榜及医药类排行榜数据,并保存在文件中
from typing import List
import requests
from bs4 import BeautifulSoup
import bs4
import pandas as pd
# Input: the URL to fetch. Output: the text content of that URL.
def getHTMLText(url):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded response text, or the sentinel string "出现异常" when
        the request fails (connection error, timeout, or HTTP error status).
    """
    try:
        r = requests.get(url, timeout=30)  # give up after 30 seconds
        r.raise_for_status()  # raise on HTTP 4xx/5xx
        r.encoding = r.apparent_encoding  # decode with the detected encoding
    except requests.RequestException:
        # Catch request failures only; a bare ``except`` would also swallow
        # KeyboardInterrupt and genuine programming errors.
        return "出现异常"
    return r.text
# Put the rows of the ranking table into a list.
def fillUnivList(ulist, html):
    """Append the first six cells of every ranking-table row to *ulist*.

    Args:
        ulist: List that receives one 6-element list of cell texts per row.
            It is modified in place.
        html: HTML source of the ranking page.

    Returns:
        None. If the document has no ``<tbody>`` (for example when the
        download failed and *html* is not a real page), *ulist* is left
        unchanged instead of raising AttributeError.
    """
    soup = BeautifulSoup(html, "html.parser")
    tbody = soup.find('tbody')
    if tbody is None:
        return
    for tr in tbody.children:
        # ``children`` also yields whitespace text nodes; keep real tags only.
        if not isinstance(tr, bs4.element.Tag):
            continue
        tds = tr.find_all('td')
        # Skip rows that do not carry all six expected columns.
        if len(tds) < 6:
            continue
        ulist.append([td.text for td in tds[:6]])
# Print the contents of ulist as a table.
def printUnivList(ulist):
    """Print a header line followed by one formatted line per row of *ulist*.

    Args:
        ulist: Iterable of rows; the first six fields of each row are
            printed. Fields are converted with ``str()``, newlines are
            replaced by spaces and surrounding whitespace is stripped.

    Raises:
        IndexError: If a row has fewer than six fields.
    """
    tplt = "{0:^10}\t{1:{6}^10}\t{2:^10}\t{3:^10}\t{4:^9}\t{5:^10}"
    # chr(12288) is the full-width (ideographic) space; it is used as the
    # fill character of the school-name column (placeholder {6}).
    fill = chr(12288)
    print(tplt.format("排名", "学校名称", "省市", "类型", "总分", "办学层次", fill))
    for u in ulist:
        # Every field goes through str(), including the school name, so a
        # non-string value there no longer raises AttributeError.
        cells = [str(field).replace("\n", " ").strip() for field in u[:6]]
        print(tplt.format(*cells, fill))
def format_list(l) -> list:
    """Normalise every cell of a list of rows to a clean string, in place.

    Each cell is converted with ``str()``, stripped of surrounding
    whitespace, and then has any remaining newline characters removed.

    Args:
        l: List of mutable rows (lists). Both the outer list and each row
            object are kept; only the row contents are replaced.

    Returns:
        The same list object *l*, for call-site convenience.
    """
    for row in l:
        # Slice assignment keeps the row object itself, matching the
        # original cell-by-cell in-place update.
        row[:] = [str(cell).strip().replace("\n", "") for cell in row]
    return l
def China_main():
    """Scrape the main 2020 ranking page, print it and save it as CSV.

    The page is downloaded with ``getHTMLText``, parsed with
    ``fillUnivList``, printed with ``printUnivList``, cleaned with
    ``format_list`` and written to "China University.csv" in UTF-8 with a
    leading index column.
    """
    url = 'https://www.shanghairanking.cn/rankings/bcur/2020.html'  # main ranking page
    columns = ["排名", "学校名称", "省市", "类型", "总分", "办学层次"]

    uinfo = []  # collects one row per university
    html = getHTMLText(url)  # download the page source
    fillUnivList(uinfo, html)
    printUnivList(uinfo)
    uinfo = format_list(uinfo)

    # Naming the columns up front yields the header line
    # ",排名,学校名称,省市,类型,总分,办学层次" directly, so the file no longer
    # has to be reopened and its first line patched through unmanaged handles.
    data = pd.DataFrame(uinfo, columns=columns)
    data.to_csv("China University.csv", encoding="utf-8")
def Medical_main():
    """Scrape the medical-university ranking page, print it and save it as CSV.

    The page is downloaded with ``getHTMLText``, parsed with
    ``fillUnivList``, printed with ``printUnivList``, cleaned with
    ``format_list`` and written to "medical university.csv" in UTF-8 with a
    leading index column.
    """
    url = 'https://www.shanghairanking.cn/rankings/bcur/202021.html'  # medical ranking page
    columns = ["排名", "学校名称", "省市", "类型", "总分", "办学层次"]

    uinfo = []  # collects one row per university
    html = getHTMLText(url)  # download the page source
    fillUnivList(uinfo, html)
    printUnivList(uinfo)
    uinfo = format_list(uinfo)

    # The previous version rewrote the header by reopening the file and then
    # closed the already-closed read handle instead of the write handle, so
    # the output file was never explicitly closed. Naming the columns here
    # produces the same header with no manual file handling.
    data = pd.DataFrame(uinfo, columns=columns)
    data.to_csv("medical university.csv", encoding="utf-8")
主榜数据
# Run the main-ranking scrape: prints the table and writes "China University.csv".
China_main()
分析每个地区上榜大学的数量,保存在文件中
import pandas as pd
import warnings
import seaborn as sns
import matplotlib.pyplot as plt
# Silence all warnings for the rest of the script.
warnings.filterwarnings('ignore')

# Load the scraped ranking table written by the scraping step.
path = pd.read_csv('China University.csv')

# Count how many ranked universities each province has.
_counts = path["省市"].value_counts()

# Convert the counts to a two-column DataFrame and save it.
province_dict = {"省市": _counts.index, "高校数量": _counts.values}
province_num = pd.DataFrame(province_dict)
province_num.to_csv('province.csv')

# Show the per-province counts between two separator lines.
print('*' * 20)
print('省市\t高校数量')
print(_counts)
print('*' * 25)

# Top 10 rows of the saved per-province table.
data = pd.read_csv('province.csv')
top_10 = data.iloc[:10]
分析前十名的地区的大学数量,绘制柱状图
import pandas as pd
import warnings
import seaborn as sns
import matplotlib.pyplot as plt
# Silence all warnings for the rest of the script.
warnings.filterwarnings('ignore')

# Load the scraped ranking table and count universities per province.
path = pd.read_csv(
    'China University.csv')
province_num = path["省市"].value_counts()

# Convert the counts to a DataFrame and save it.
province_dict = {"省市": province_num.index, "高校数量": province_num.values}
province_num = pd.DataFrame(province_dict)
province_num.to_csv('province.csv')

print('*' * 20)
print('省市\t高校数量')
print(path["省市"].value_counts())
print('*' * 25)

# Top 10 rows of the saved per-province table.
data = pd.read_csv('province.csv')
top_10 = data.head(10)

# Draw the bar chart this section is about; the step was missing even though
# seaborn and matplotlib are imported for it.
# SimHei is selected as the sans-serif font for the Chinese labels, and
# axes.unicode_minus is turned off alongside it.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
plt.figure()
fig = sns.barplot(data=top_10, x="省市", y="高校数量").get_figure()
plt.show()
绘制各省市3D热力地图
3D地图的绘制采用了pyecharts库,官方示例提供了现成的代码,只需修改相应的参数即可。
pyecharts网页链接
from pyecharts import options as opts
from pyecharts.charts import Map3D
from pyecharts.globals import ChartType
from pyecharts.commons.utils import JsCode
# Per-province input for the 3D bar map below.
# Each entry is (province name, [x, y, value]); the formatter in the chart
# displays value[2] next to the name.
# NOTE(review): x/y look like longitude/latitude and value like the number of
# ranked universities — confirm. The numbers are hard-coded here rather than
# read from province.csv, so they must be updated by hand if the data changes.
example_data = [
("黑龙江", [127.9688, 45.368, 19]),
("内蒙古", [110.3467, 41.4899, 12]),
("吉林", [125.8154, 44.2584, 18]),
("辽宁", [123.1238, 42.1216, 27]),
("河北", [114.4995, 38.1006, 25]),
("天津", [117.4219, 39.4189, 11]),
("山西", [112.3352, 37.9413, 17]),
("陕西", [109.1162, 34.2004, 26]),
("甘肃", [103.5901, 36.3043, 12]),
("宁夏", [106.3586, 38.1775, 2]),
("青海", [101.4038, 36.8207, 2]),
("新疆", [87.9236, 43.5883, 10]),
("西藏", [91.11, 29.97, 2]),
("四川", [103.9526, 30.7617, 25]),
("重庆", [108.384366, 30.439702, 11]),
("山东", [117.1582, 36.8701, 32]),
("河南", [113.4668, 34.6234, 31]),
("江苏", [118.8062, 31.9208, 35]),
("安徽", [117.29, 32.0581, 24]),
("湖北", [114.3896, 30.6628, 25]),
("浙江", [119.5313, 29.8773, 21]),
("福建", [119.4543, 25.9222, 17]),
("江西", [116.0046, 28.6633, 20]),
("湖南", [113.0823, 28.2568, 25]),
("贵州", [106.6992, 26.7682, 13]),
("广西", [108.479, 23.1152, 14]),
("海南", [110.3893, 19.8516, 4]),
("上海", [121.4648, 31.2891, 18]),
("北京", [116.4600, 39.9200, 28]),
("广东", [113.2300, 23.1600, 25]),
("云南", [102.7300, 25.0400, 16]),
]
# Build the 3D bar map of universities per province and render it to
# "map3d_with_bar3d.html". ``c`` holds the return value of ``render``.
c = (
    Map3D()
    .add_schema(
        itemstyle_opts=opts.ItemStyleOpts(
            color="rgb(5,101,123)",
            opacity=1,  # opacity
            border_width=0.8,
            border_color="rgb(62,215,213)",
        ),
        map3d_label=opts.Map3DLabelOpts(
            is_show=False,
            # The JS string literal must use single quotes: with inner double
            # quotes Python split this into two adjacent literals and the
            # " " separator between name and value was silently dropped.
            formatter=JsCode("function(data){return data.name + ' ' + data.value[2];}"),
        ),
        emphasis_label_opts=opts.LabelOpts(
            is_show=False,
            color="#fff",
            font_size=10,
            background_color="rgba(0,23,11,0)",
        ),
        light_opts=opts.Map3DLightOpts(
            main_color="#fff",
            main_intensity=1.2,
            main_shadow_quality="high",
            is_main_shadow=False,
            main_beta=10,
            ambient_intensity=0.3,
        ),
    )
    .add(
        series_name="高校数量_3D",
        data_pair=example_data,
        type_=ChartType.BAR3D,
        bar_size=1,
        shading="lambert",
        label_opts=opts.LabelOpts(
            is_show=False,
            formatter=JsCode("function(data){return data.name + ' ' + data.value[2];}"),
        ),
    )
    .set_global_opts(title_opts=opts.TitleOpts(title="中国高校分布3D热力地图"))
    .render("map3d_with_bar3d.html")
)
医药类大学排行榜
# Run the medical-ranking scrape: prints the table and writes "medical university.csv".
Medical_main()
医药类前十分布
import pandas as pd
import warnings
import seaborn as sns
import matplotlib.pyplot as plt
# Silence all warnings for the rest of the script.
warnings.filterwarnings('ignore')

# Load the scraped medical ranking table.
path = pd.read_csv('medical university.csv')

# Count how many ranked medical universities each province has.
_counts = path["省市"].value_counts()

# Convert the counts to a two-column DataFrame and save it.
province_dict = {"省市": _counts.index, "高校数量": _counts.values}
province_num = pd.DataFrame(province_dict)
province_num.to_csv('medical_province.csv')

# Show the per-province counts between two separator lines.
print('*' * 20)
print('省市\t高校数量')
print(_counts)
print('*' * 25)

# Top 10 rows of the saved per-province table.
data = pd.read_csv('medical_province.csv')
top_10 = data.iloc[:10]

# Bar chart of the top 10; SimHei is selected as the sans-serif font for the
# Chinese labels.
plt.figure()
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
_axes = sns.barplot(data=top_10, x="省市", y="高校数量")
fig = _axes.get_figure()
plt.show()
医药类3D热力地图
from pyecharts import options as opts
from pyecharts.charts import Map3D
from pyecharts.globals import ChartType
from pyecharts.commons.utils import JsCode
# Per-province input for the medical 3D bar map below.
# Each entry is (province name, [x, y, value]); the formatter in the chart
# displays value[2] next to the name.
# NOTE(review): x/y look like longitude/latitude and value like the number of
# ranked medical universities — confirm. The numbers are hard-coded here
# rather than read from medical_province.csv.
example_data = [
("黑龙江", [127.9688, 45.368,4]),
("辽宁", [123.1238, 42.1216, 6]),
("河北", [114.4995, 38.1006, 4]),
("四川", [103.9526, 30.7617, 4]),
("山东", [117.1582, 36.8701, 5]),
("江苏", [118.8062, 31.9208, 4]),
("安徽", [117.29, 32.0581, 4]),
("广西", [108.479, 23.1152, 4]),
("海南", [110.3893, 19.8516, 4]),
("广东", [113.2300, 23.1600, 5])
]
# Build the 3D bar map of medical universities per province and render it to
# "medical.html". ``c`` holds the return value of ``render``.
c = (
    Map3D()
    .add_schema(
        itemstyle_opts=opts.ItemStyleOpts(
            color="rgb(5,101,123)",
            opacity=1,  # opacity
            border_width=0.8,
            border_color="rgb(62,215,213)",
        ),
        map3d_label=opts.Map3DLabelOpts(
            is_show=False,
            # The JS string literal must use single quotes: with inner double
            # quotes Python split this into two adjacent literals and the
            # " " separator between name and value was silently dropped.
            formatter=JsCode("function(data){return data.name + ' ' + data.value[2];}"),
        ),
        emphasis_label_opts=opts.LabelOpts(
            is_show=False,
            color="#fff",
            font_size=10,
            background_color="rgba(0,23,11,0)",
        ),
        light_opts=opts.Map3DLightOpts(
            main_color="#fff",
            main_intensity=1.2,
            main_shadow_quality="high",
            is_main_shadow=False,
            main_beta=10,
            ambient_intensity=0.3,
        ),
    )
    .add(
        series_name="医药类高校数量_3D",
        data_pair=example_data,
        type_=ChartType.BAR3D,
        bar_size=1,
        shading="lambert",
        label_opts=opts.LabelOpts(
            is_show=False,
            formatter=JsCode("function(data){return data.name + ' ' + data.value[2];}"),
        ),
    )
    # NOTE(review): the title says "前九" (top nine) while example_data lists
    # ten provinces — confirm which is intended.
    .set_global_opts(title_opts=opts.TitleOpts(title="医药类高校前九分布3D热力地图"))
    .render("medical.html")
)
后记
这是笔者大二时期Python课程设计的题目之一,爬取中国大学排行榜的相关内容。
在这个过程中遇到过不少疑难杂症和Bug,得到过大佬的帮助,也有幸帮助过别人。
这是题目所有的代码,作为一个记录,也希望帮助到需要的人。
代码能力依然有待加强,笔者也会继续努力。
欢迎看到这篇文章的读者朋友们,加以指正。
谨以此作为笔者的第一篇博客。