- 今天闲来无事,写了个小爬虫,爬取了最好大学网的排名数据。
- [最好大学网](http://www.zuihaodaxue.cn/)
- 主要使用 requests 库抓取页面,用 lxml 的 XPath 和正则表达式提取数据,然后将数据保存为 txt 或 csv 文件。
- 源码
from lxml import etree
import requests
import re
import csv
class RankSpider:
    """Scrape a ranking table from a web page and save it as txt or csv.

    Typical use is ``load_page()`` -> ``get_content()`` -> ``save()``.
    """

    # File-name stem used when the page title cannot be determined.
    DEFAULT_TITLE = 'file'

    def __init__(self, url, timeout=10):
        """Remember the page to scrape.

        Args:
            url: Address of the ranking page.
            timeout: Seconds to wait for the server before giving up.
        """
        self.url = url
        self.timeout = timeout

    def load_page(self):
        """Download the page and return its body decoded as UTF-8 text.

        Raises:
            requests.RequestException: On connection problems, timeouts, or
                a 4xx/5xx response.
        """
        # Without a timeout requests may block forever on a stalled server.
        res = requests.get(self.url, timeout=self.timeout)
        # Fail loudly instead of parsing an error page as if it were data.
        res.raise_for_status()
        return res.content.decode()

    def get_content(self, res):
        """Extract the title, column headers and table rows from the HTML.

        Args:
            res: HTML source of the ranking page.

        Returns:
            A dict with the keys ``'title'`` (str), ``'head'`` (list of
            column names) and ``'body'`` (list of rows, each a list of str).
        """
        # Non-greedy so that a second </h3> on the same line is not swallowed.
        match = re.search(r'<h3 class="post-title">(.*?)</h3>', res)
        title = match.group(1).strip() if match else ''

        html = etree.HTML(res)
        # The first four header cells are plain text; the fifth holds a
        # <select> whose options supply the remaining column names.
        head = html.xpath('//thead/tr/th/text()')[:4]
        select_head = html.xpath('//thead/tr/th[5]/select/option/text()')
        rows = html.xpath('//tbody/tr')

        return {
            'title': title or self.DEFAULT_TITLE,
            'head': head + select_head,
            'body': self.get_body(rows),
        }

    def get_body(self, body):
        """Turn the ``<tr>`` elements of the table body into lists of text.

        Args:
            body: Iterable of row elements exposing an ``xpath`` method.

        Returns:
            One list of cell strings per row, with the school name (taken
            from the ``<div>`` inside a cell) inserted as the second column.
        """
        data_list = []
        for tr in body:
            data = tr.xpath('./td/text()')
            school = tr.xpath('./td/div/text()')
            # Keep the columns aligned with the header even when a row has
            # no <div> holding the school name.
            data.insert(1, school[0] if school else '')
            data_list.append(data)
        return data_list

    def save(self, data, format_="txt"):
        """Write the scraped table to ``<title>.txt`` or ``<title>.csv``.

        Args:
            data: Dict as returned by ``get_content``.
            format_: Either ``"txt"`` (comma-joined lines) or ``"csv"``.

        Returns:
            The string ``"success"`` once the file has been written.

        Raises:
            ValueError: If ``format_`` is not a supported format.
        """
        if format_ not in ("txt", "csv"):
            # Reporting success without writing anything would be misleading.
            raise ValueError(
                "unsupported format {!r}; expected 'txt' or 'csv'".format(format_)
            )

        title = data.get('title') or self.DEFAULT_TITLE
        head = data.get('head') or []
        body = data.get('body') or []

        if format_ == "txt":
            # Explicit UTF-8: the locale default may not encode Chinese text.
            with open('{}.txt'.format(title), 'w', encoding='utf-8') as f:
                f.write(','.join(head) + '\n')
                f.writelines(','.join(row) + '\n' for row in body)
        else:
            # newline='' lets the csv module control line endings; otherwise
            # Windows output contains a blank line after every row.
            with open('{}.csv'.format(title), 'w',
                      encoding='utf-8', newline='') as f:
                writer = csv.writer(f)
                writer.writerow(head)
                writer.writerows(body)
        return "success"
if __name__ == "__main__":
    # Fetch the 2019 ranking page, parse it, and export the table as CSV.
    spider = RankSpider('http://www.zuihaodaxue.cn/zuihaodaxuepaiming2019.html')
    page_source = spider.load_page()
    ranking = spider.get_content(page_source)
    spider.save(ranking, "csv")
排名,学校名称,省市,总分,生源质量(新生高考成绩得分),培养结果(毕业生就业率),社会声誉(社会捐赠收入·千元),科研规模(论文数量·篇),科研质量(论文质量·FWCI),顶尖成果(高被引论文·篇),顶尖人才(高被引学者·人),科技服务(企业科研经费·千元),成果转化(技术转让收入·千元),学生国际化(留学生比例)
1,清华大学,北京,94.6,100.0,98.30%,1589319,48698,1.512,1810,126,1697330,302898,6.81%
2,北京大学,北京,76.5,95.2,98.07%,570497,47161,1.409,1402,100,554680,14445,6.15%
3,浙江大学,浙江,72.9,84.2,96.05%,352880,52249,1.197,1140,91,1179094,101208,5.71%
4,上海交通大学,上海,72.1,91.1,97.92%,275235,54447,1.201,1108,81,875715,12574,6.05%
5,复旦大学,上海,65.6,91.6,96.09%,251272,35028,1.384,828,57,348338,65034,6.77%
6,中国科学技术大学,安徽,60.9,91.1,93.40%,71038,25036,1.560,992,43,0,6400,2.32%