【python爬虫】读写、追加到excel文件中

爬取糗事百科热门

安装写入 Excel 的依赖:pip install xlwt
安装读取、追加 Excel 文件内容的依赖:pip install xlrd xlutils
安装网页请求与解析的依赖:pip install requests lxml

import csv
import requests
from lxml import etree
import time
import xlwt
import os
from xlutils.copy import copy
import xlrd

# Destination workbook: appended to when it already exists, created otherwise.
filename = 'C:\\Users\\Administrator\\Desktop\\qiubaibook.xls'

# Headers sent with every page request; the User-Agent imitates a desktop
# Chrome browser.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36'
    ),
}

# Rows gathered by get_info() and later flushed to the sheet by write_data().
# Each row is a list:
# [author, sex, rank, content, great, comment, time].
data_infos_list = []

# NOTE: an earlier variant streamed the same columns to a CSV file
# (qiubaibook.csv on the same desktop) through csv.writer instead of
# collecting them in memory; the Excel output replaced it.


def _first_text(nodes, default=''):
    """Return the first XPath text result, stripped, or *default* if none matched."""
    return nodes[0].strip() if nodes else default


def _parse_sex(icon_classes):
    """Map the class attribute of the author's badge to '男', '女' or '未知'.

    '未知' is returned both when the badge is absent and when its class
    carries neither the ``manIcon`` nor the ``womenIcon`` marker.
    """
    if icon_classes:
        if 'manIcon' in icon_classes[0]:
            return '男'
        if 'womenIcon' in icon_classes[0]:
            return '女'
    return '未知'


def get_info(url):
    """Fetch one listing page and append one row per post to ``data_infos_list``.

    Each appended row is
    ``[author, sex, rank, content, great, comment, timestamp]`` where every
    field is a string. Fields whose XPath matches nothing fall back to a
    default ('' for text and counts, '-1' for rank, '未知' for sex) instead of
    raising ``IndexError``.

    Args:
        url: Address of the listing page to download.

    Raises:
        requests.RequestException: If the page cannot be fetched, including
            when the request times out.
    """
    # A timeout keeps one stalled connection from hanging the whole crawl.
    res = requests.get(url, headers=headers, timeout=10)
    selector = etree.HTML(res.text)
    posts = selector.xpath('//div[contains(@class,"article block untagged mb15")]')
    # XPath layout observed on the page, relative to one post container:
    #   a[1]/div/span            content
    #   div[2]/span[1]/i         "funny" votes (div[3] on some posts)
    #   div[2]/span[2]/a/i       comment count (div[3] on some posts)
    #   div[1]/a[2]/h2           author (div[1]/span[2]/h2 as a fallback)
    #   div[1]/div               rank; its class holds womenIcon / manIcon
    for post in posts:
        author = (post.xpath('div[1]/a[2]/h2/text()')
                  or post.xpath('div[1]/span[2]/h2/text()'))
        rank = post.xpath('div[1]/div/text()')
        sex = _parse_sex(post.xpath('div[1]/div/@class'))
        contents = post.xpath('a[1]/div/span/text()')
        great = (post.xpath('div[2]/span[1]/i/text()')
                 or post.xpath('div[3]/span[1]/i/text()'))
        comment = (post.xpath('div[2]/span[2]/a/i/text()')
                   or post.xpath('div[3]/span[2]/a/i/text()'))
        data_infos_list.append([
            _first_text(author),
            sex,
            # The default is passed whole; indexing the string '-1' with [0]
            # would yield just '-'.
            _first_text(rank, '-1'),
            _first_text(contents),
            _first_text(great),
            _first_text(comment),
            time.strftime('%Y-%m-%d %H:%M:%S'),  # local time of the scrape
        ])


def write_data(sheet, row, rows=None):
    """Write records to *sheet*, one record per sheet row, starting at *row*.

    Args:
        sheet: Object exposing ``write(row_index, col_index, value)``.
        row: Zero-based index of the first sheet row to fill.
        rows: Iterable of records, each an iterable of cell values. When
            omitted, the module-level ``data_infos_list`` is written.

    Returns:
        The index of the row right after the last one written (equal to
        *row* when there was nothing to write).
    """
    if rows is None:
        rows = data_infos_list
    for record in rows:
        for col, value in enumerate(record):
            sheet.write(row, col, value)
        row += 1
    return row

if __name__ == '__main__':
    # Scrape listing pages 1-13, then write everything to the workbook once.
    urls = ['https://www.qiushibaike.com/8hr/page/{}/'.format(num) for num in range(1, 14)]
    for url in urls:
        print(url)
        get_info(url)
        time.sleep(2)  # pause between page requests
    # Append if the workbook already exists; otherwise create it with a header.
    if os.path.exists(filename):
        # formatting_info=True keeps existing fonts, colours and other styles.
        rb = xlrd.open_workbook(filename, formatting_info=True)
        # Number of rows already present, i.e. the index of the first free row.
        rn = rb.sheet_by_index(0).nrows
        # xlrd workbooks are read-only; xlutils.copy yields a writable xlwt copy.
        wb = copy(rb)
        sheet = wb.get_sheet(0)
        write_data(sheet, rn)
        # save() overwrites the target file, so the old workbook is not removed
        # first: deleting it up front would lose all data if the save failed.
        wb.save(filename)
    else:
        header = ['author', 'sex', 'rank', 'content', 'great', 'comment', 'time']
        book = xlwt.Workbook(encoding='utf-8')
        sheet = book.add_sheet('糗百')
        # Header goes in row 0; data starts at row 1.
        for col, title in enumerate(header):
            sheet.write(0, col, title)
        write_data(sheet, 1)
        book.save(filename)
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值