导入模块
from bs4 import BeautifulSoup as bfs
import matplotlib.pyplot as plt
import requests
import pandas as pd
import seaborn as sns
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# NOTE(review): called without a style name or rc overrides, so no particular
# seaborn style is requested here -- confirm whether a named style such as
# "darkgrid" was intended.
sns.set_style()
直接爬取网页源代码
# Admission-list page to scrape.
url = "http://master.pbcsf.tsinghua.edu.cn/content/details303_14172.html"
# A timeout keeps the script from hanging forever on an unresponsive server.
response = requests.get(url, timeout=10)
# Fail here on an HTTP error instead of trying to parse an error page later.
response.raise_for_status()
# Decode the raw bytes as UTF-8 directly. Round-tripping response.text through
# ISO-8859-1 only works when requests happened to fall back to that encoding
# and raises UnicodeEncodeError otherwise.
html = response.content.decode('utf-8')
BS4解析
# Parse the downloaded markup with the built-in "html.parser" backend.
soup = bfs(html, 'html.parser')
# Take the first <table> on the page and flatten its text into a list of
# whitespace-separated tokens (str.split() with no argument already treats
# newlines as separators).
first_table = soup.select("table")[0]
datas = first_table.get_text().split()
保存数据
# The flattened token list is laid out row by row: the first N_COLUMNS tokens
# are the header cells and every following group of N_COLUMNS tokens is one row.
N_COLUMNS = 7
n = len(datas)
# A token count that is not a whole number of rows would misalign the columns
# (presumably because a cell was empty or contained whitespace), so fail with
# a clear message instead of an opaque pandas length error.
if n < N_COLUMNS or n % N_COLUMNS:
    raise ValueError(
        f"expected a header plus whole rows of {N_COLUMNS} tokens, got {n} tokens"
    )
# Header cells become the column names.
keys = datas[:N_COLUMNS]
# Column c is every N_COLUMNS-th token after the header, starting at offset c.
values = [datas[N_COLUMNS + col::N_COLUMNS] for col in range(N_COLUMNS)]
df = dict(zip(keys, values))
data_frame = pd.DataFrame(df)
# Persist the table so the analysis below can reload it from disk.
data_frame.to_excel("THU2018.xlsx")
读取数据
# Load the saved spreadsheet back into a DataFrame.
data = pd.read_excel(io="THU2018.xlsx")
# Preview the first five rows (displayed when this is the last expression of a
# notebook cell).
data.head(n=5)
提取目标数据
# Keep only the three score columns. Selecting by label keeps one dtype per
# column (building from a list of Series and transposing forces a single common
# dtype), and pd.to_numeric turns scores that were stored as text into numbers
# (unparseable entries become NaN) so the statistics and plots are numeric.
score_columns = ['PreExam', 'ReExam', 'Total']
df = data[score_columns].apply(pd.to_numeric, errors='coerce')
# Preview the first rows of the score table.
df.head()
# Summary statistics (count, mean, std, min, quartiles, max) per score column.
df.describe()
查看数据统计信息
总共录取114名考生,初试平均分387分,最低分370分,最高分422分,中位数(50%分位数)为386分。数据分析与前面一样,这里不再重复。
# Draw one kernel-density subplot per column of df on a 2x2 grid, each with its
# own x-axis range.
df.plot(
    kind="density",
    subplots=True,
    layout=(2, 2),
    sharex=False,
    figsize=(15, 10),
)