python爬取网站内容写入xls

木瓜星灵

已于 2024-02-26 11:51:34 修改

阅读量340

点赞数 4

文章标签： python 爬虫网络爬虫

于 2024-02-26 11:06:21 首次发布

本文链接：https://blog.csdn.net/qq_38312411/article/details/136293858

版权

目标

现需要对下面网站资源进行爬取，文学人物-名人明星网

获取人物名字
获取人物头像
获取人物简介

资源获取

通过requests库，我们可以让 Python 程序向浏览器一样向 Web 服务器发起请求，并接收服务器返回的响应，从响应中我们就可以提取出想要的数据。浏览器呈现给我们的网页是用HTML 编写的，浏览器相当于是 HTML 的解释器环境，我们看到的网页中的内容都包含在 HTML 的标签中。在获取到 HTML 代码后，就可以从标签的属性或标签体中提取内容。

for page in range(1, 3):
    resp = requests.get(
        url=f'https://www.hhwl88.com/type/literature.html?page={page}',
        # 如果不设置HTTP请求头中的User-Agent，网站可能会检测出不是浏览器而阻止我们的请求。
        # 通过get函数的headers参数设置User-Agent的值，具体的值可以在浏览器的开发者工具查看到。
        # 用爬虫访问大部分网站时，将爬虫伪装成来自浏览器的请求都是非常重要的一步。
        headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                               'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'}
    )
    text = resp.text

    # 匹配到人名并写入xls
    # a href="/people/suxun/" title="苏洵" rel="nofollow" target="_blank">
    # <img src="https://img.hhwl88.com/upload/people/2022/11/04/648b7d0604a8fd2edb778a2fbe79752d.jpg" alt="苏洵" /></a>
    pattern1 = re.compile(
        r'<a href=".*?" title="(.*?)" rel="nofollow" target="_blank"><img src=".*?" alt=".*?" /></a>')
    titles = pattern1.findall(text)

保存资源

    # 写入xls
    f = xlwt.Workbook('encoding = utf-8')
    sheet1 = f.add_sheet('文学', cell_overwrite_ok=True)
    for i in range(len(titles)):
        sheet1.write(20*i + (page-1)*50*20, 0, titles[i])
    f.save('C:\\Users\\Administrator\\Desktop\\test.xls')

除了文本内容，我们也可以使用requests库通过 URL 获取二进制资源。下面演示了如何获取头像链接并下载保存到本地文件中。

    for i in range(len(imgs)):
        # 图片下载到本地
        resp = requests.get(imgs[i])
        img_name = 'literature' + str(i) + '.jpg'
        with open(img_name, 'wb') as file:
            file.write(resp.content)

由于xlwt插入图片只支持bitmap插入，需要将jpg转bitmap

    for i in range(len(imgs)):
        # 图片下载到本地
        resp = requests.get(imgs[i])
        img_name = 'literature' + str(i) + '.jpg'
        with open(img_name, 'wb') as file:
            file.write(resp.content)
            try:
                # xlwt只支持bitmap插入，jpg转bitmap
                img = Image.open(img_name)
                image_parts = img.split()
                r = image_parts[0]
                g = image_parts[1]
                b = image_parts[2]
                img = Image.merge("RGB", (r, g, b))
                fo = BytesIO()
                img.save(fo, format='bmp')
                sheet1.insert_bitmap_data(fo.getvalue(), 1+20*i + (page-1)*50*20, 1, scale_x=0.5, scale_y=0.5)
                img.close()
            except:
                sheet1.write(20*i + 1 + (page-1)*50*20, 1, imgs[i])  # 写入数据参数对应 行, 列, 值
    f.save('C:\\Users\\Administrator\\Desktop\\test.xls')  # 保存.xls到当前工作目录

获取到的网络资源如下

爬虫代码

import random
import re
import time
import requests
import xlwt
from PIL import Image
from io import BytesIO

# xls新建sheet
f = xlwt.Workbook('encoding = utf-8')
sheet1 = f.add_sheet('business', cell_overwrite_ok=True)

for page in range(1, 3):
    resp = requests.get(
        url=f'https://www.hhwl88.com/type/business.html?page={page}',
        # 如果不设置HTTP请求头中的User-Agent，网站可能会检测出不是浏览器而阻止我们的请求。
        # 通过get函数的headers参数设置User-Agent的值，具体的值可以在浏览器的开发者工具查看到。
        # 用爬虫访问大部分网站时，将爬虫伪装成来自浏览器的请求都是非常重要的一步。
        headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                               'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'}
    )
    text = resp.text

    # 匹配到人名并写入xls
    # a href="/people/suxun/" title="苏洵" rel="nofollow" target="_blank">
    # <img src="https://img.hhwl88.com/upload/people/2022/11/04/648b7d0604a8fd2edb778a2fbe79752d.jpg" alt="苏洵" /></a>
    pattern1 = re.compile(
        r'<a href=".*?" title="(.*?)" rel="nofollow" target="_blank"><img src=".*?" alt=".*?" /></a>')
    titles = pattern1.findall(text)
    # # 写入xls
    # f = xlwt.Workbook('encoding = utf-8')
    # sheet1 = f.add_sheet('business', cell_overwrite_ok=True)
    for i in range(len(titles)):
        sheet1.write(20*i + (page-1)*50*20, 0, titles[i])
    # f.save('C:\\Users\\Administrator\\Desktop\\test.xls')

    # 匹配到头像并写入xls
    # a href="/people/suxun/" title="苏洵" rel="nofollow" target="_blank">
    # <img src="https://img.hhwl88.com/upload/people/2022/11/04/648b7d0604a8fd2edb778a2fbe79752d.jpg" alt="苏洵" /></a>
    pattern2 = re.compile(r'<a href=".*?" title=".*?" rel="nofollow" target="_blank"><img src="(.*?)" alt=".*?" /></a>')
    imgs = pattern2.findall(text)
    for i in range(len(imgs)):
        # 图片下载到本地
        resp = requests.get(imgs[i])
        img_name = 'business' + str(i) + '.jpg'
        with open(img_name, 'wb') as file:
            file.write(resp.content)
            try:
                # xlwt只支持bitmap插入，jpg转bitmap
                img = Image.open(img_name)
                image_parts = img.split()
                r = image_parts[0]
                g = image_parts[1]
                b = image_parts[2]
                img = Image.merge("RGB", (r, g, b))
                fo = BytesIO()
                img.save(fo, format='bmp')
                sheet1.insert_bitmap_data(fo.getvalue(), 1+20*i + (page-1)*50*20, 1, scale_x=0.5, scale_y=0.5)
                img.close()
            except:
                sheet1.write(20*i + 1 + (page-1)*50*20, 1, imgs[i])  # 写入数据参数对应 行, 列, 值
                print('faile')
    # f.save('C:\\Users\\Administrator\\Desktop\\test.xls')  # 保存.xls到当前工作目录

    # 人物描述写入xls
    pattern3 = re.compile(r'<span class="desc">(.*?)</span>')
    decs = pattern3.findall(text)
    for i in range(len(decs)):
        sheet1.write(20*i + (page-1)*50*20, 1, decs[i])
    # f.save('C:\\Users\\Administrator\\Desktop\\test.xls')

    # 随机休眠1-5秒，避免爬取页面过于频繁
    time.sleep(random.random() * 4 + 1)

f.save('C:\\Users\\Administrator\\Desktop\\test.xls')