python网页爬虫Excel自动保存

最新推荐文章于 2023-02-13 00:07:36 发布

CHARLIEVA

最新推荐文章于 2023-02-13 00:07:36 发布

阅读量821

点赞数

分类专栏：笔记　文章标签：python、爬虫

本文链接：https://blog.csdn.net/pigyellow98/article/details/124100853

版权

笔记专栏收录该内容

2 篇文章 0 订阅

订阅专栏

python网页爬虫Excel自动保存

import requests
import matplotlib as plt
import re
import pandas as pd
import time
import random

	
    headers = {
  	  'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36',
	}

# Fetch the index page and collect the links of all teachers listed under this title.
firsturl = 'https://bme.scu.edu.cn/szdw1/gccrc.htm'
webdata = requests.get(firsturl, headers=headers, timeout=10)
# Fail fast on an HTTP error instead of parsing an error page for links.
webdata.raise_for_status()
webdata.encoding = "utf-8"
MyHtml = webdata.text
print(MyHtml)

# Capture the path fragment between "info" and ".htm" of every link on the page.
# The dot is escaped so that only a literal ".htm" suffix matches.
pat = re.compile(r'info(.*?)\.htm')
fujiaoshou = pat.findall(MyHtml)
print(fujiaoshou)

	url2 = 'https://bme.scu.edu.cn/info/1145/1329.htm'
	data = requests.get(url2,headers = headers,timeout = 1)
	data.encoding = "utf-8"
	MyHtml = data.text
	print(MyHtml)

# Accumulator lists for the scraped fields.
type_1 = []
type_2 = []
type_3 = []
type_4 = []
type_5 = []
type_6 = []
type_7 = []

# Build and print the detail-page URL of one sample entry. The index is
# guarded so that a short link list no longer raises IndexError.
sample_index = 10
if len(fujiaoshou) > sample_index:
    url2 = 'https://bme.scu.edu.cn/info' + str(fujiaoshou[sample_index]) + '.htm'
    print(url2)

# Reference URL kept from the original notes:
# http://cres.ustb.edu.cn/n36/n77/?expert_id=534

# Try the e-mail pattern against the page currently held in MyHtml.
pat6 = re.compile('邮箱(.*?)</')
type_6 = pat6.findall(MyHtml)
print(type_6)

# Number of links extracted from the index page.
print(len(fujiaoshou))

# Final crawl: visit every teacher page and record exactly one value per field,
# so the five lists always stay the same length and line up by position.

# Patterns are compiled once, outside the loop.
pat1 = re.compile('ArticleTitle" Content="(.*?)"')   # name
pat2 = re.compile('SiteName" Content="(.*?)"')       # school
pat3 = re.compile('ColumnName" Content="(.*?)"')     # degree / title
pat4 = re.compile('主要研究方向：(.*?)</span></p>')    # main research direction
pat5 = re.compile('邮箱(.*?)"')                       # e-mail

type_1 = []
type_2 = []
type_3 = []
type_4 = []
type_5 = []

for link in fujiaoshou:
    # URL shape, e.g. https://www.bjut.edu.cn/info/1060/1537.htm
    url2 = 'https://bme.scu.edu.cn/info' + str(link) + '.htm'
    # time.sleep(random.randint(10, 12))  # enable on large sites to stay under anti-scraping limits
    try:
        data = requests.get(url2, headers=headers, timeout=10)
        data.raise_for_status()
    except requests.RequestException as exc:
        # Keep going: a failed page becomes a blank entry, so there is still
        # one entry per link in fujiaoshou.
        print('request failed:', url2, exc)
        MyHtml = ''
    else:
        data.encoding = "utf-8"
        MyHtml = data.text

    # Take the first match of each pattern, or '' when the page has none.
    # (The original extended the lists with all matches and then cut the
    # e-mail list down to its first element on every iteration.)
    for pattern, values in (
        (pat1, type_1),
        (pat2, type_2),
        (pat3, type_3),
        (pat4, type_4),
        (pat5, type_5),
    ):
        found = pattern.search(MyHtml)
        values.append(found.group(1) if found else '')

# Built once after the loop; the name no longer shadows the builtin ``dict``.
field_columns = {
    '姓名': type_1,
    '学院': type_2,
    '职称': type_3,
    '主要研究方向': type_4,
    '邮箱：': type_5,
}
# orient='index': one row per field label, one integer-labelled column per page.
new_frame = pd.DataFrame.from_dict(field_columns, orient='index')
print(new_frame)

import sys
import xlwt

# Create a workbook (UTF-8 encoded) with a single worksheet.
workbook = xlwt.Workbook(encoding='utf-8')
worksheet = workbook.add_sheet('四川大学-生物医学工程学院')

# Header row. worksheet.write takes (row, column, value).
header_titles = ('姓名', '学院', '职称', '主要研究方向', '邮箱')
for col, title in enumerate(header_titles):
    worksheet.write(0, col, title)

# new_frame is read as "row = field, column = page", matching how the original
# indexed it (new_frame[column][row]). Every frame column is written, so the
# last entry is no longer dropped, and access is purely positional via .iat.
for page in range(new_frame.shape[1]):
    for field in range(len(header_titles)):
        value = new_frame.iat[field, page]
        # Missing values (None/NaN padding) are written as empty cells.
        worksheet.write(page + 1, field, '' if pd.isna(value) else value)

# Save the workbook to disk.
workbook.save('四川大学-生物医学工程学院.xls')

CHARLIEVA

关注

0
点赞
踩
3

收藏

觉得还不错? 一键收藏
0
评论
python网页爬虫Excel自动保存

python网页爬虫Excel自动保存：import requests; import matplotlib as plt; import re; import pandas as pd; import time; import random; headers = { 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 ……
复制链接

扫一扫

专栏目录