爬虫数据生成htmltabl表格

说明:

1、爬虫是违法行为,请谨慎,个人是为了学习研究爬虫原理

2、生成htmltable表格,并保存图片发送企业微信群

3、设置表格字体、边框、单元格等样式

#! /usr/bin/python3
# -*- coding: utf-8 -*-

import requests
import base64
import json
import hashlib
import random
from bs4 import BeautifulSoup
import imgkit
from HTMLTable import HTMLTable

def get_agent():
    # 客户端身份认证
    useragent1 = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
    useragent2 = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36"
    useragent3 = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.57"
    useragent4 = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"
    useragent5 = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.82 Safari/537.36"
    useragent6 = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/110.0"
    useragent7 = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36 Edg/101.0.1210.39"

    ulist = [useragent1, useragent2, useragent3, useragent4, useragent5, useragent6, useragent7]
    useragent = random.choice(ulist)

    return useragent

def get_data(num):
    url = f"http://www.mucaihome.com/quotation/price?page={num}"
    useragent = get_agent()
    headers = {
        "User-Agent": useragent,
        "Host": "www.mucaihome.com"
    }
    print(url)
    req = requests.request('GET', url=url, headers=headers)
    return req

def send_image(key, image):
    wx_url = f"https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key={key}"
    with open(image, 'rb') as fp:
        # 获取base64位数据
        data = fp.read()
        get_base64 = base64.b64encode(data)
        base64_data = str(get_base64, 'utf-8')
        # 获取md5值
        md = hashlib.md5()
        md.update(data)
        image_md5 = md.hexdigest()
    headers = {
        "Content-Type": "application/json",
    }
    body = {
        "msgtype": "image",
        "image": {
            "base64": base64_data,
            "md5": image_md5
        }
    }
    req = requests.post(url=wx_url, data=json.dumps(body), headers=headers)
    print(req.text)
    return req

def htmltabl(page):
    # 标题内容
    table = HTMLTable(caption=f"木材价格表{page}")
    # 表头内容
    table.append_header_rows((
        ("产品", "规格", "价格", "产地", "时间"),
    ))

    # 添加数据
    req = get_data(page)
    if req.status_code == 200:
        html = req.text
        soup = BeautifulSoup(html, 'html.parser')
        titles = soup.find_all('td')
        data_list = []
        for items in titles:
           data = items.text
           data_list.append(data)
        for num in range(0, len(data_list), 5):
            row_data = tuple(data_list[num:num+5])
            # print(row_data)
            table.append_data_rows((
                row_data,
            ))
    # 设置标题样式
    table.caption.set_style({
        'color': '#871F78',
        # 设置字体并加粗
        'font-weight': 'bold',
        'font-size': '30px',
    })
    # 设置表头样式
    table.set_header_row_style({
        'color': '#fff',
        'background-color': '#48a6fb',
        'font-weight': 'bold',
        'font-size': '24px',
    })
    # 设置表格样式
    table.set_style({
        'border-collapse': 'collapse',
        'word-break': 'keep-all',
        'white-space': 'nowrap',
        # 单元格字体并加粗
        'font-weight': 'bold',
        'font-size': '21px',
    })
    # 设置单元格样式
    table.set_cell_style({
        # 居中
        'text-align': 'center',
        'width': "200px",
        'border-color': '#000',
        'border-width': '1px',
        'border-style': 'solid',
        'padding': '10px',
    })

    # 设置边框样式
    border_style = {
        'border-color': '#FF0000',
        'border-width': '2px',
        'border-style': 'solid',
        'border-collapse': 'collapse',
        # 实现表格居中
        'margin': 'auto',
    }
    
    # 设置外边框
    table.set_style(border_style)

    # 设置单元格边框
    table.set_cell_style(border_style)

    # 价格低于3500元, 标红色
    for row in table.iter_data_rows():
        data = row[2].value
        res = data.strip('¥')
        if res <= "3500":
            row.set_style({
                'background-color': '#D9D9F3',
            })
    html = table.to_html()
    imgkit.from_string(html, f"fruit/2023木材价格表{page}.jpg")

    # 定义变量并发送企业微信群
    image_path = f"fruit/2023木材价格表{page}.jpg"
    weixin_key = "企业微信keyID"
    send_image(weixin_key, image_path)

if __name__ == '__main__':
    for page in range(3):
        print("爬虫第{}页数据".format(page + 1))
        page = page + 1
        htmltabl(page)

在这里插入图片描述

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

秋天枫叶35

希望能帮到你,谢谢你能阅读~

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值