requests库
用 requests 库爬取网站很简单,但现在很多网站都使用 HTTPS 协议,沿用之前的爬取方法可能会因为证书校验失败而报错:requests.exceptions.SSLError。解决方法也很简单:在请求时传入 verify=False 跳过证书校验即可(这会降低安全性,仅建议在学习和测试时使用),示例如下:
import requests
# Minimal demo: fetch an HTTPS page without certificate verification.
url = "https://www.baidu.com/"
# NOTE: verify=False disables TLS certificate checks (this is what avoids the
# SSLError); use it only for experiments. The timeout keeps the call from
# blocking forever if the server never answers.
res = requests.get(url, verify=False, timeout=10)
print(res.text)
代码
# coding=utf-8
from bs4 import BeautifulSoup
import requests
import xlrd
import xlwt
from xlutils.copy import copy
# Request headers sent with every page download; only a User-Agent is set.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/74.0.3729.131 Safari/537.36"
    ),
}
def prase(url):
    """Download one listing page and extract title, rating and quote per movie.

    Args:
        url: Address of the page to fetch.

    Returns:
        A list of ``(title, ranking, comments)`` tuples, one for each
        ``<div class="item">`` on the page. ``comments`` is ``''`` when the
        entry has no ``<span class="inq">``.

    Side effects:
        Rebinds the module-level ``outcome`` to the returned list (kept so
        that code reading the global keeps working) and prints the list.
    """
    global outcome
    # NOTE: verify=False disables TLS certificate verification. The timeout
    # prevents the request from hanging indefinitely on a stalled server.
    res = requests.get(url, headers=headers, verify=False, timeout=10).text
    soup = BeautifulSoup(res, 'lxml')
    outcome = []
    for item in soup.find_all('div', attrs={'class': 'item'}):
        # attrs must be a dict; a set such as {'class', 'title'} would be
        # interpreted as "CSS class is 'class' OR 'title'".
        title = item.find('span', attrs={'class': 'title'}).text
        ranking = item.find('span', attrs={'class': 'rating_num'}).text
        # Some movies have no quote: look it up once and fall back to ''.
        inq = item.find('span', attrs={'class': 'inq'})
        comments = inq.text if inq is not None else ''
        outcome.append((title, ranking, comments))
    print(outcome)
    return outcome
def write_excel_xls(path, sheet_name, value):
    """Create a new .xls workbook with a single sheet filled from *value*.

    Used by the script to write the header row.

    Args:
        path: File path the workbook is saved to.
        sheet_name: Name of the sheet that is created.
        value: Sequence of rows; ``value[i][j]`` is written to row ``i``,
            column ``j``.
    """
    book = xlwt.Workbook()
    table = book.add_sheet(sheet_name)
    for row_no, row in enumerate(value):
        for col_no, cell in enumerate(row):
            table.write(row_no, col_no, cell)
    book.save(path)
    print("xls格式表格写入数据成功!")
def write_excel_xls_append(path, value):
    """Append the rows in *value* below the existing rows of the first sheet.

    Args:
        path: Path of an existing .xls workbook; it is read, extended and
            saved back to the same path.
        value: Sequence of rows; ``value[i][j]`` is written to column ``j`` of
            the ``i``-th row after the rows already present.
    """
    source_book = xlrd.open_workbook(path)
    first_sheet = source_book.sheet_by_name(source_book.sheet_names()[0])
    # Number of rows already in the sheet; new data starts right after them.
    next_row = first_sheet.nrows
    # xlrd workbooks are read-only, so copy into a writable xlwt workbook.
    writable_book = copy(source_book)
    target = writable_book.get_sheet(0)
    for row_no, row in enumerate(value, start=next_row):
        for col_no, cell in enumerate(row):
            target.write(row_no, col_no, cell)
    writable_book.save(path)
    print("xls格式表格【追加】写入数据成功!")
if __name__ == "__main__":
    value_title = [['电影名', '评分', '评论']]
    book_name_xls = 'test.xls'
    sheet_name_xls = 'xls格式测试表'
    # Create the workbook containing only the header row.
    write_excel_xls(book_name_xls, sheet_name_xls, value_title)
    # Page through the list 25 entries at a time: start = 0, 25, ..., 225.
    for i in range(0, 226, 25):
        url = "https://movie.douban.com/top250?start={}&filter=".format(i)
        # Use the list prase() returns instead of reading the global it sets,
        # so the data flow is explicit.
        rows = prase(url)
        write_excel_xls_append(book_name_xls, rows)
参考文章
https://blog.csdn.net/u013250071/article/details/81911434