说明:
1、爬虫是违法行为,请谨慎,个人是为了学习研究爬虫原理
2、生成htmltable表格,并保存图片发送企业微信群
3、设置表格字体、边框、单元格等样式
#! /usr/bin/python3
# -*- coding: utf-8 -*-
import requests
import base64
import json
import hashlib
import random
from bs4 import BeautifulSoup
import imgkit
from HTMLTable import HTMLTable
def get_agent():
# 客户端身份认证
useragent1 = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
useragent2 = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36"
useragent3 = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.57"
useragent4 = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"
useragent5 = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.82 Safari/537.36"
useragent6 = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/110.0"
useragent7 = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36 Edg/101.0.1210.39"
ulist = [useragent1, useragent2, useragent3, useragent4, useragent5, useragent6, useragent7]
useragent = random.choice(ulist)
return useragent
def get_data(num):
url = f"http://www.mucaihome.com/quotation/price?page={num}"
useragent = get_agent()
headers = {
"User-Agent": useragent,
"Host": "www.mucaihome.com"
}
print(url)
req = requests.request('GET', url=url, headers=headers)
return req
def send_image(key, image):
wx_url = f"https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key={key}"
with open(image, 'rb') as fp:
# 获取base64位数据
data = fp.read()
get_base64 = base64.b64encode(data)
base64_data = str(get_base64, 'utf-8')
# 获取md5值
md = hashlib.md5()
md.update(data)
image_md5 = md.hexdigest()
headers = {
"Content-Type": "application/json",
}
body = {
"msgtype": "image",
"image": {
"base64": base64_data,
"md5": image_md5
}
}
req = requests.post(url=wx_url, data=json.dumps(body), headers=headers)
print(req.text)
return req
def htmltabl(page):
# 标题内容
table = HTMLTable(caption=f"木材价格表{page}")
# 表头内容
table.append_header_rows((
("产品", "规格", "价格", "产地", "时间"),
))
# 添加数据
req = get_data(page)
if req.status_code == 200:
html = req.text
soup = BeautifulSoup(html, 'html.parser')
titles = soup.find_all('td')
data_list = []
for items in titles:
data = items.text
data_list.append(data)
for num in range(0, len(data_list), 5):
row_data = tuple(data_list[num:num+5])
# print(row_data)
table.append_data_rows((
row_data,
))
# 设置标题样式
table.caption.set_style({
'color': '#871F78',
# 设置字体并加粗
'font-weight': 'bold',
'font-size': '30px',
})
# 设置表头样式
table.set_header_row_style({
'color': '#fff',
'background-color': '#48a6fb',
'font-weight': 'bold',
'font-size': '24px',
})
# 设置表格样式
table.set_style({
'border-collapse': 'collapse',
'word-break': 'keep-all',
'white-space': 'nowrap',
# 单元格字体并加粗
'font-weight': 'bold',
'font-size': '21px',
})
# 设置单元格样式
table.set_cell_style({
# 居中
'text-align': 'center',
'width': "200px",
'border-color': '#000',
'border-width': '1px',
'border-style': 'solid',
'padding': '10px',
})
# 设置边框样式
border_style = {
'border-color': '#FF0000',
'border-width': '2px',
'border-style': 'solid',
'border-collapse': 'collapse',
# 实现表格居中
'margin': 'auto',
}
# 设置外边框
table.set_style(border_style)
# 设置单元格边框
table.set_cell_style(border_style)
# 价格低于3500元, 标红色
for row in table.iter_data_rows():
data = row[2].value
res = data.strip('¥')
if res <= "3500":
row.set_style({
'background-color': '#D9D9F3',
})
html = table.to_html()
imgkit.from_string(html, f"fruit/2023木材价格表{page}.jpg")
# 定义变量并发送企业微信群
image_path = f"fruit/2023木材价格表{page}.jpg"
weixin_key = "企业微信keyID"
send_image(weixin_key, image_path)
if __name__ == '__main__':
for page in range(3):
print("爬虫第{}页数据".format(page + 1))
page = page + 1
htmltabl(page)