"""豆瓣读书爬虫:Excel 写入部分。

1. 把爬下来的内容存在一个 Excel 文件中。
2. 由于可能会出问题,每爬取一个网页就把内容追加到表格中。
3. 不同的类型(书籍)创建不同的 sheet 表格。
"""
import requests
import re
from bs4 import BeautifulSoup
import urllib.request
import urllib
# import xlsxwriter as xw
#
# import xlrd
# import xlwt
# from xlutils.copy import copy
from openpyxl import load_workbook
from openpyxl import Workbook
import os
# Bare note string (no-op expression): "want to scrape review data from Douban".
" 想爬一下豆瓣网站上的评论数据 "
# Landing page listing every book tag as a tag cloud.
baseUrl = "https://book.douban.com/tag/?view=cloud"
# Douban sort-mode query fragments. NOTE(review): the original labelled all
# three as "综合排序" (comprehensive sort); T/R/S are almost certainly three
# different sort orders — confirm what R and S actually select.
T = "type=T"  # comprehensive sort
R = "type=R"  # comprehensive sort (label copied from original; likely different)
S = "type=S"  # comprehensive sort (label copied from original; likely different)
# Paging: start=0 is page 1, start=20 is page 2, ...
tagUrl = "https://book.douban.com/tag/%E5%84%BF%E7%AB%A5%E6%96%87%E5%AD%A6?start=20&type=T"
# Chinese characters in URLs must be percent-encoded with urllib.parse.quote().
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/84.0.4147.105 Safari/537.36 '
}
def getHtmlText(url, encoding='utf-8'):
    """Download *url* and return the response body as text.

    Args:
        url: Absolute URL to fetch.
        encoding: Encoding forced onto the response before decoding.

    Returns:
        The page text, or "" when the request fails or the server returns
        a non-2xx status (callers test for the empty string).
    """
    try:
        # Bug fix: the request itself can raise (DNS failure, timeout,
        # connection reset), so it must live inside the try block too.
        # A timeout keeps a hung server from blocking the script forever.
        r = requests.get(url, headers=headers, timeout=10)
        r.raise_for_status()
        r.encoding = encoding
        return r.text
    except requests.RequestException:
        # Best-effort contract: any network/HTTP failure yields "".
        return ""
def getInfoFromHtml(uLIst, html):
    """Parse the tag-cloud page and collect one row per tag link.

    Args:
        uLIst: Output list; one entry ``[tag_name, quoted_href, count]``
               is appended per ``<td>`` that contains a link.
        html: Raw HTML of the tag-cloud page (baseUrl).
    """
    soup = BeautifulSoup(html, "html.parser")
    # Several <tbody> tables may appear on the page; walk each of them.
    # Bug fix: the original looped ``for i in range(len(it))`` but always
    # read ``it[0]``, so the first table was scanned repeatedly and the
    # others never at all.
    for tbody in soup.find_all("tbody"):
        for cell in tbody.find_all("td"):
            a = cell.find("a")
            if a is None:
                # No link means no tag name/href — nothing to record.
                continue
            row = [a.string, urllib.parse.quote(a["href"])]
            # <b> holds the book count, e.g. "(12345)"; default to 0 when absent.
            b = cell.find("b")
            if b is not None:
                row.append(int(re.search(r"\d+", b.string).group(0)))
            else:
                row.append(0)
            uLIst.append(row)
# Base URL prefixed to each tag's relative href when downloading detail pages.
urlBaseGetInfo = 'https://book.douban.com'
# Obtain the book data of a single result page for one tag.
def getInfoFromSubHtml(list, url, startpage=0):
    """Scrape one page of a tag's book listing into *list*.

    Args:
        list: Output list; one 6-element row is appended per book:
              ``[href, title, rating_count, score, pub_info, intro]``.
              (Name kept for backward compatibility; it shadows the builtin.)
        url: Tag path relative to ``urlBaseGetInfo`` (already percent-encoded).
        startpage: Paging offset (0, 20, 40, ... — 20 books per page).
    """
    page_url = urlBaseGetInfo + url + "?start=" + str(startpage)
    print("url = ", page_url)
    html = getHtmlText(page_url)  # download the listing page
    soup = BeautifulSoup(html, "lxml")
    containers = soup.find_all("ul", class_="subject-list")
    print("len it = ", len(containers))
    for container in containers:
        for itm in container.find_all("div", class_="info"):
            a = itm.find("a")
            if a is None:
                continue  # no link -> nothing usable in this entry
            row = [a["href"], a["title"]]
            # Rating count, e.g. "(1234人评价)"; default 0 when absent.
            b = itm.find("span", class_="pl")
            if b is not None:
                row.append(re.search(r"\d+", b.string.strip()).group(0))
            else:
                row.append(0)
            # Douban score; absent when the book has too few ratings.
            c = itm.find("span", class_="rating_nums")
            row.append(c.string if c is not None else 0)
            # Publisher / author / date line.
            d = itm.find("div", class_="pub")
            row.append(d.string.strip() if d is not None else "none")
            # Short introduction paragraph. Bug fix: the original appended
            # nothing when <p> was missing, producing 5-column rows that
            # misalign the Excel sheet; pad so every row has 6 cells.
            e = itm.find("p")
            row.append(e.string if e is not None else "none")
            list.append(row)
def creatExcelFile(fileName, sheetName="sheet1"):
    """Create a new empty workbook on disk at *fileName*.

    Bug fix: ``Workbook()`` takes no filename — the original call
    ``Workbook(fileName)`` passed the path as the ``write_only`` flag,
    creating a sheetless write-only workbook instead of a normal one.
    The filename belongs to ``save()`` only.

    Args:
        fileName: Path of the .xlsx file to create.
        sheetName: Title given to the default sheet.
    """
    workbook = Workbook()  # in-memory workbook with one default sheet
    ws = workbook.active
    ws.title = sheetName   # name the default sheet as requested
    print("创建工作表", fileName)
    workbook.save(fileName)
    workbook.close()
def saveListToExcel(LIst, fileName, sheetName="sheet1", lineNo=2):
    """Append the rows in *LIst* to sheet *sheetName* of workbook *fileName*.

    Creates the workbook and/or the sheet on demand, (re)writes the constant
    header row, then appends the data rows after the current last row.

    Args:
        LIst: Sequence of rows (each a list of cell values).
        fileName: Path of the .xlsx workbook.
        sheetName: Target sheet; created when missing.
        lineNo: Unused; kept only for backward compatibility.
    """
    if not os.path.exists(fileName):
        # load_workbook cannot create files, so the workbook must exist first.
        creatExcelFile(fileName, sheetName)
    workbook = load_workbook(fileName)
    print("打开工作表", fileName)
    # Explicit membership test replaces the old bare ``except`` around
    # ``workbook[sheetName]``, which also swallowed unrelated errors.
    if sheetName in workbook.sheetnames:
        sheet = workbook[sheetName]
    else:
        print("创建子表 = ", sheetName)
        sheet = workbook.create_sheet(sheetName)
    # Header row; rewritten on every call, harmless since it is constant.
    title = ["详情网址", "书籍名称", "评价人数", "豆瓣得分", "出版信息", "简介"]
    for col, text in enumerate(title, start=1):
        sheet.cell(row=1, column=col).value = text
    for row in LIst:
        sheet.append(row)  # appends after the last used row (A2 onward)
    workbook.save(fileName)
    workbook.close()
    print("------------------")
if __name__ == '__main__':
    infoList = []
    try:
        # Reuse a cached copy of the tag page to avoid re-hitting Douban.
        # ``with`` fixes the original's leaked file handle.
        with open("db_save", mode="r", encoding="utf-8") as fp:
            html = fp.read()
        if len(html) < 1000:
            # Cached page is suspiciously small — treat it as invalid.
            # (The original ``raise ()`` only "worked" by raising TypeError.)
            raise ValueError("cached page too small")
    except (OSError, ValueError):
        # Cache missing, unreadable, or too small: download and refresh it.
        print("文件打开失败,下载网页")
        html = getHtmlText(baseUrl)
        with open("db_save", mode="w", encoding="utf-8") as fp:
            fp.write(html)
    if len(html) > 1000:
        getInfoFromHtml(infoList, html)
        for it in infoList:  # it = [tag name, quoted href, book count]
            print(it)
            # Bug fix: the page offset must reset per tag. The original set
            # ``i = 0`` once before the loop, so state leaked from one tag
            # into the next and later tags were skipped or started mid-way.
            i = 0
            while i < it[2]:
                pageRows = []
                getInfoFromSubHtml(pageRows, it[1], i)
                saveListToExcel(pageRows, "豆瓣读书.xlsx", it[0])
                i += 20      # advance to the next page
                if i > 20:   # original cap: at most two pages per tag
                    break
    else:
        print("文件内容小于1000个字节,不做解析处理")
# 友情提示:豆瓣会封 IP,注意控制爬取频率。