"""豆瓣读书爬虫:Excel 写入部分。

1. 把爬下来的内容存在一个 Excel 文件中。
2. 由于可能会出问题,每爬取一个网页就把内容追加到表格中。
3. 不同的类型(书籍)创建不同的 sheet 表格。
"""
import requests
import re
from bs4 import BeautifulSoup
import urllib.request
import urllib
# import xlsxwriter as xw
#
# import xlrd
# import xlwt
# from xlutils.copy import copy
from openpyxl import load_workbook
from openpyxl import Workbook
import os
# Bare note string (no-op expression): "want to scrape review data from Douban".
" 想爬一下豆瓣网站上的评论数据 "
# Landing page listing every book tag as a tag cloud.
baseUrl = "https://book.douban.com/tag/?view=cloud"
# Douban sort-mode query fragments. NOTE(review): the original labelled all
# three as "综合排序" (comprehensive sort); T/R/S are almost certainly three
# different sort orders — confirm what R and S actually select.
T = "type=T"  # comprehensive sort
R = "type=R"  # comprehensive sort (label copied from original; likely different)
S = "type=S"  # comprehensive sort (label copied from original; likely different)
# Paging: start=0 is page 1, start=20 is page 2, ...
tagUrl = "https://book.douban.com/tag/%E5%84%BF%E7%AB%A5%E6%96%87%E5%AD%A6?start=20&type=T"
# Chinese characters in URLs must be percent-encoded with urllib.parse.quote().
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/84.0.4147.105 Safari/537.36 '
}
def getHtmlText(url, encoding='utf-8'):
    """Download *url* and return the response body as text.

    Args:
        url: Absolute URL to fetch.
        encoding: Encoding forced onto the response before decoding.

    Returns:
        The page text, or "" when the request fails or the server returns
        a non-2xx status (callers test for the empty string).
    """
    try:
        # Bug fix: the request itself can raise (DNS failure, timeout,
        # connection reset), so it must live inside the try block too.
        # A timeout keeps a hung server from blocking the script forever.
        r = requests.get(url, headers=headers, timeout=10)
        r.raise_for_status()
        r.encoding = encoding
        return r.text
    except requests.RequestException:
        # Best-effort contract: any network/HTTP failure yields "".
        return ""
def getInfoFromHtml(uLIst, html):
    """Parse the tag-cloud page and collect one row per tag link.

    Args:
        uLIst: Output list; one entry ``[tag_name, quoted_href, count]``
               is appended per ``<td>`` that contains a link.
        html: Raw HTML of the tag-cloud page (baseUrl).
    """
    soup = BeautifulSoup(html, "html.parser")
    # Several <tbody> tables may appear on the page; walk each of them.
    # Bug fix: the original looped ``for i in range(len(it))`` but always
    # read ``it[0]``, so the first table was scanned repeatedly and the
    # others never at all.
    for tbody in soup.find_all("tbody"):
        for cell in tbody.find_all("td"):
            a = cell.find("a")
            if a is None:
                # No link means no tag name/href — nothing to record.
                continue
            row = [a.string, urllib.parse.quote(a["href"])]
            # <b> holds the book count, e.g. "(12345)"; default to 0 when absent.
            b = cell.find("b")
            if b is not None:
                row.append(int(re.search(r"\d+", b.string).group(0)))
            else:
                row.append(0)
            uLIst.append(row)
# Base URL prefixed to each tag's relative href when downloading detail pages.
urlBaseGetInfo = 'https://book.douban.com'
# Obtain the book data of a single result page for one tag.
def getInfoFromSubHtml(list, url, startpage=0):
    """Scrape one page of a tag's book listing into *list*.

    Args:
        list: Output list; one 6-element row is appended per book:
              ``[href, title, rating_count, score, pub_info, intro]``.
              (Name kept for backward compatibility; it shadows the builtin.)
        url: Tag path relative to ``urlBaseGetInfo`` (already percent-encoded).
        startpage: Paging offset (0, 20, 40, ... — 20 books per page).
    """
    page_url = urlBaseGetInfo + url + "?start=" + str(startpage)
    print("url = ", page_url)
    html = getHtmlText(page_url)  # download the listing page
    soup = BeautifulSoup(html, "lxml")
    containers = soup.find_all("ul", class_="subject-list")
    print("len it = ", len(containers))
    for container in containers:
        for itm in container.find_all("div", class_="info"):
            a = itm.find("a")
            if a is None:
                continue  # no link -> nothing usable in this entry
            row = [a["href"], a["title"]]
            # Rating count, e.g. "(1234人评价)"; default 0 when absent.
            b = itm.find("span", class_="pl")
            if b is not None:
                row.append(re.search(r"\d+", b.string.strip()).group(0))
            else:
                row.append(0)
            # Douban score; absent when the book has too few ratings.
            c = itm.find("span", class_="rating_nums")
            row.append(c.string if c is not None else 0)
            # Publisher / author / date line.
            d = itm.find("div", class_="pub")
            row.append(d.string.strip() if d is not None else "none")
            # Short introduction paragraph. Bug fix: the original appended
            # nothing when <p> was missing, producing 5-column rows that
            # misalign the Excel sheet; pad so every row has 6 cells.
            e = itm.find("p")
            row.append(e.string if e is not None else "none")
            list.append(row)
def creatExcelFile(fileName, sheetName="sheet1"):
    """Create a new empty workbook on disk at *fileName*.

    Bug fix: ``Workbook()`` takes no filename — the original call
    ``Workbook(fileName)`` passed the path as the ``write_only`` flag,
    creating a sheetless write-only workbook instead of a normal one.
    The filename belongs to ``save()`` only.

    Args:
        fileName: Path of the .xlsx file to create.
        sheetName: Title given to the default sheet.
    """
    workbook = Workbook()  # in-memory workbook with one default sheet
    ws = workbook.active
    ws.title = sheetName   # name the default sheet as requested
    print("创建工作表", fileName)
    workbook.save(fileName)
    workbook.close()
def saveListToExcel(LIst, fileName, sheetName="sheet1", lineNo=2):
    """Append the rows in *LIst* to sheet *sheetName* of workbook *fileName*.

    Creates the workbook and/or the sheet on demand, (re)writes the constant
    header row, then appends the data rows after the current last row.

    Args:
        LIst: Sequence of rows (each a list of cell values).
        fileName: Path of the .xlsx workbook.
        sheetName: Target sheet; created when missing.
        lineNo: Unused; kept only for backward compatibility.
    """
    if not os.path.exists(fileName):
        # load_workbook cannot create files, so the workbook must exist first.
        creatExcelFile(fileName, sheetName)
    workbook = load_workbook(fileName)
    print("打开工作表", fileName)
    # Explicit membership test replaces the old bare ``except`` around
    # ``workbook[sheetName]``, which also swallowed unrelated errors.
    if sheetName in workbook.sheetnames:
        sheet = workbook[sheetName]
    else:
        print("创建子表 = ", sheetName)
        sheet = workbook.create_sheet(sheetName)
    # Header row; rewritten on every call, harmless since it is constant.
    title = ["详情网址", "书籍名称", "评价人数", "豆瓣得分", "出版信息", "简介"]
    for col, text in enumerate(title, start=1):
        sheet.cell(row=1, column=col).value = text
    for row in LIst:
        sheet.append(row)  # appends after the last used row (A2 onward)
    workbook.save(fileName)
    workbook.close()
    print("------------------")
if __name__ == '__main__':
    infoList = []
    try:
        # Reuse a cached copy of the tag page to avoid re-hitting Douban.
        # ``with`` fixes the original's leaked file handle.
        with open("db_save", mode="r", encoding="utf-8") as fp:
            html = fp.read()
        if len(html) < 1000:
            # Cached page is suspiciously small — treat it as invalid.
            # (The original ``raise ()`` only "worked" by raising TypeError.)
            raise ValueError("cached page too small")
    except (OSError, ValueError):
        # Cache missing, unreadable, or too small: download and refresh it.
        print("文件打开失败,下载网页")
        html = getHtmlText(baseUrl)
        with open("db_save", mode="w", encoding="utf-8") as fp:
            fp.write(html)
    if len(html) > 1000:
        getInfoFromHtml(infoList, html)
        for it in infoList:  # it = [tag name, quoted href, book count]
            print(it)
            # Bug fix: the page offset must reset per tag. The original set
            # ``i = 0`` once before the loop, so state leaked from one tag
            # into the next and later tags were skipped or started mid-way.
            i = 0
            while i < it[2]:
                pageRows = []
                getInfoFromSubHtml(pageRows, it[1], i)
                saveListToExcel(pageRows, "豆瓣读书.xlsx", it[0])
                i += 20      # advance to the next page
                if i > 20:   # original cap: at most two pages per tag
                    break
    else:
        print("文件内容小于1000个字节,不做解析处理")
# 友情提示:豆瓣会封 IP,注意控制爬取频率。