# Python web crawler: scrape teacher pages and save the results to Excel automatically
import requests
import matplotlib as plt
import re
import pandas as pd
import time
import random
# HTTP request headers: identify as a desktop Chrome browser.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36',
}

# Fetch the index page and collect the links of all teachers listed under this title.
firsturl = 'https://bme.scu.edu.cn/szdw1/gccrc.htm'
# A 1-second timeout aborts the whole script on any slow response; allow 10 seconds.
webdata = requests.get(firsturl, headers=headers, timeout=10)
# Fail loudly on 4xx/5xx so an error page is never parsed as the index.
webdata.raise_for_status()
webdata.encoding = "utf-8"
MyHtml = webdata.text
print(MyHtml)

# Capture the path fragment between "info" and ".htm" of every link on the page.
# The dot is escaped so that only a literal ".htm" suffix matches.
pat = re.compile(r'info(.*?)\.htm')
fujiaoshou = pat.findall(MyHtml)
print(fujiaoshou)
# Exploratory probe: fetch one known teacher page and try the e-mail pattern on it.
url2 = 'https://bme.scu.edu.cn/info/1145/1329.htm'
data = requests.get(url2, headers=headers, timeout=10)
# Fail loudly on 4xx/5xx instead of probing an error page.
data.raise_for_status()
data.encoding = "utf-8"
MyHtml = data.text
print(MyHtml)

# Result lists, one per extracted field; only type_6 is filled in this probe.
type_1 = []
type_2 = []
type_3 = []
type_4 = []
type_5 = []
type_6 = []
type_7 = []

# Show how a detail-page URL is assembled from a scraped link fragment.
# Guard the index: the index page may have yielded fewer than 11 links.
if len(fujiaoshou) > 10:
    url2 = 'https://bme.scu.edu.cn/info' + str(fujiaoshou[10]) + '.htm'
print(url2)

# Everything between the e-mail label and the next closing tag.
pat6 = re.compile('邮箱(.*?)</')
type_6.extend(pat6.findall(MyHtml))
print(type_6)
print(len(fujiaoshou))
# Final crawl: visit every teacher page and record exactly one value per field,
# so that position k in every list always refers to the same teacher.

# Patterns are compiled once, outside the loop.
pat1 = re.compile(r'ArticleTitle" Content="(.*?)"')  # name
pat2 = re.compile(r'SiteName" Content="(.*?)"')  # college
pat3 = re.compile(r'ColumnName" Content="(.*?)"')  # degree / title
pat4 = re.compile(r'主要研究方向:(.*?)</span></p>')  # main research direction
# NOTE(review): the earlier probe ends the e-mail capture at '</' while this
# pattern ends at '"' -- confirm which delimiter matches the real pages.
pat5 = re.compile(r'邮箱(.*?)"')  # e-mail

type_1 = []
type_2 = []
type_3 = []
type_4 = []
type_5 = []
field_patterns = (
    (pat1, type_1),
    (pat2, type_2),
    (pat3, type_3),
    (pat4, type_4),
    (pat5, type_5),
)

for link in fujiaoshou:
    url2 = 'https://bme.scu.edu.cn/info' + str(link) + '.htm'
    # time.sleep(random.randint(10, 12))  # enable on big sites to stay under anti-crawler limits
    try:
        data = requests.get(url2, headers=headers, timeout=10)
        data.raise_for_status()
    except requests.RequestException as err:
        # One unreachable page must not abort the crawl; keep a blank entry
        # so the remaining teachers stay aligned with their links.
        print('request failed:', url2, err)
        MyHtml = ''
    else:
        data.encoding = "utf-8"
        MyHtml = data.text

    # Take the first match of each pattern, or '' when the page has none.
    # Extending with every match would shift later teachers' values.
    for pattern, values in field_patterns:
        found = pattern.search(MyHtml)
        values.append(found.group(1) if found else '')

# One row per field, one column per teacher (orient='index').
teacher_fields = {
    '姓名': type_1,
    '学院': type_2,
    '职称': type_3,
    '主要研究方向': type_4,
    '邮箱:': type_5,
}
new_frame = pd.DataFrame.from_dict(teacher_fields, orient='index')
print(new_frame)
import sys
import xlwt

# Create a workbook with UTF-8 encoding and a single worksheet.
workbook = xlwt.Workbook(encoding='utf-8')
worksheet = workbook.add_sheet('四川大学-生物医学工程学院')

# Header row; worksheet.write takes (row, column, value).
column_titles = ['姓名', '学院', '职称', '主要研究方向', '邮箱']
for col, title in enumerate(column_titles):
    worksheet.write(0, col, title)

# new_frame has one row per field and one column per scraped entry, so each
# frame column becomes one sheet row. Iterating over the frame's own columns
# writes every entry, including the last one.
field_count = min(len(column_titles), new_frame.shape[0])
for entry_idx in range(new_frame.shape[1]):
    for field_idx in range(field_count):
        cell = new_frame.iloc[field_idx, entry_idx]
        # Cells padded by pandas for missing values are written as empty text.
        worksheet.write(entry_idx + 1, field_idx, '' if pd.isna(cell) else cell)

# Save the workbook to disk.
workbook.save('四川大学-生物医学工程学院.xls')