python爬取机械专业考研大学排名
今天写了一个简单的爬虫,用来爬取自己专业的考研大学排名。思路比较简单,但由于我还是初学者,写的过程比较繁琐,希望大家理解。下面看一下代码:
# -*- coding:utf-8 -*-
# -Author-= JamesBen
# Email: 1597757775@qq.com
import requests
import re
import time
def get_HTMLText(url):
    """Download *url* and return the response body as text.

    Parameters:
        url: Address of the page to fetch.

    Returns:
        The decoded page text, or the sentinel string "产生异常" when the
        request fails (connection error, timeout, or an error status code).
    """
    # Send a browser-like User-Agent; some sites reject the default one.
    use = {'User-Agent': 'Mozilla/5.0'}
    try:
        r = requests.get(url, timeout=30, headers=use)
        r.raise_for_status()  # turn 4xx/5xx responses into HTTPError
        # Decode with the encoding detected from the body rather than the
        # one announced in the headers, to avoid garbled text.
        r.encoding = r.apparent_encoding
        print(r.encoding)
        return r.text
    except requests.RequestException:
        # Only request failures map to the sentinel; KeyboardInterrupt,
        # SystemExit and programming errors are no longer swallowed.
        return "产生异常"
#获取网页中学校的排名函数
def get_U_num(html):
    """Extract the ranking column from the first ranking table in *html*.

    Parameters:
        html: Page source as a string.

    Returns:
        A list with the text of every ranking cell found in the first
        matching table, or an empty list when no such table is present.

    Note:
        The patterns are used without ``re.DOTALL``, so a table (and each
        cell) is only matched when it sits on a single line of *html*.
    """
    tables = re.findall(
        r'<table border="1" cellpadding="0" cellspacing="0" style="width:100%;">(.*?)</table>',
        html,
    )
    if not tables:
        # Nothing to parse (e.g. the download failed); avoid IndexError.
        return []
    return re.findall(
        r'<td height="23" width="172" style="text-align:center;">(.*?)</td>',
        tables[0],
    )
#获取网页中的学校名字
def get_U_name(html):
    """Extract the school-name column from *html*.

    Some name cells wrap part (or all) of the name in one or more
    ``<a ... class='keyWord' ...>`` links. For those cells every tag is
    removed so only the visible text remains; cells without the marker are
    returned exactly as they appear in the page.

    Parameters:
        html: Page source as a string.

    Returns:
        A list of school names in page order; empty when no cell matches.
    """
    marker = "class='keyWord'"
    cells = re.findall(
        r'<td width="138" style="text-align:center;">(.*?)</td>', html
    )
    names = []
    for cell in cells:
        if marker in cell:
            # Dropping the tags keeps the text before, between and after
            # any number of links, in order.
            cell = re.sub(r'<[^>]*>', '', cell)
        names.append(cell)
    return names
def get_type(html):
    """Extract the school-type column from *html*.

    Parameters:
        html: Page source as a string.

    Returns:
        A list with the text of every cell whose opening tag is exactly
        ``<td width="72" style="text-align:center;">``, in page order;
        empty when none is found.
    """
    return re.findall(
        r'<td width="72" style="text-align:center;">(.*?)</td>', html
    )
def Print_U(nums=None, names=None, types=None,
            path='Ranking of Mechanical University.txt'):
    """Print the ranking table and append it to a text file.

    Parameters:
        nums: Ranking values (without a header); defaults to the
            module-level ``U_num``.
        names: School names; defaults to the module-level ``U_name``.
        types: School types; defaults to the module-level ``U_type``.
        path: File the rows are appended to, encoded as GB2312.

    The header "排名" is placed in front of the ranking values on a local
    copy, so the caller's list is left untouched and repeated calls do not
    stack headers. Rows are produced until the shortest column runs out.
    """
    # Fall back to the lists prepared by the script's __main__ section.
    nums = U_num if nums is None else nums
    names = U_name if names is None else names
    types = U_type if types is None else types

    ranks = ["排名"] + list(nums)
    border = "+{:-^80}".format("-")

    print(border, "+")
    # Open the file once for the whole table instead of once per row.
    with open(path, 'a', encoding='GB2312') as f:
        for rank, name, kind in zip(ranks, names, types):
            # chr(12288) is the full-width space, used as the fill
            # character so CJK names line up.
            row = "{0:^10}\t{1:{3}^20}\t{2:^6}".format(
                rank, name, kind, chr(12288)
            )
            print(row)
            f.write(row + "\n")
    print(border, "+")
if __name__ == "__main__":
    # Start from an empty output file; Print_U opens it in append mode.
    with open('Ranking of Mechanical University.txt', 'w'):
        pass
    url = "https://www.dxsbb.com/news/2851.html"
    html = get_HTMLText(url)
    if html == "产生异常":
        # get_HTMLText returns this sentinel when the download fails;
        # there is no page to parse, so stop with a clear message.
        raise SystemExit("failed to download " + url)
    # Print_U reads these three module-level lists.
    U_num = get_U_num(html)
    U_name = get_U_name(html)
    U_type = get_type(html)
    Print_U()
以上即为全部代码,代码运行如下:
希望上文对你有用,感谢你的阅读。