以下为爬虫脚本 resuqsts_page.py 的代码(脚本在容器中的存放目录已按 Linux 系统路径做了相应修改):
import datetime
import requests
import openpyxl
import pymysql
import time
from lxml import etree
from dbutils.pooled_db import PooledDB
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# create_table = """
#
# CREATE TABLE res_info.res_info (
# id int(11) auto_increment NOT NULL COMMENT 'ID',
# title varchar(30) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL COMMENT '标题',
# content varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL COMMENT '详情',
# price varchar(30) NOT NULL COMMENT '价格',
# image varchar(30) NOT NULL COMMENT '图片',
# created_at timestamp DEFAULT CURRENT_TIMESTAMP NULL COMMENT '创建时间',
# CONSTRAINT `PRIMARY` PRIMARY KEY (id)
# )
# ENGINE=InnoDB
# DEFAULT CHARSET=utf8
# COLLATE=utf8_general_ci
# COMMENT='数据表';
#
# """
# Module-level MySQL connection pool (DBUtils PooledDB over PyMySQL),
# capped at 5 concurrent connections to the `res_info` database.
# NOTE(review): host and credentials are hard-coded — consider moving them
# to environment variables / a config file before wider deployment.
pool = PooledDB(pymysql, maxconnections=5, host='192.168.14.93', user='root', password='abc123', database='res_info',
                charset='utf8')
def res_info(page):
    """Scrape one second-hand-housing listing page from cs.lianjia.com,
    export the rows to an Excel file and insert them into MySQL.

    :param page: 1-based page number; page 1 uses the bare listing URL.
    :return: list of dicts with keys title/address/content/price/image.
    """
    # Throttle so we do not hammer the site.
    time.sleep(2)
    if page == 1:
        url = 'https://cs.lianjia.com/ershoufang/'
    else:
        url = 'https://cs.lianjia.com/ershoufang/pg{}'.format(page)
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36 Edg/115.0.1901.188'}
    # verify=False: certificate validation is skipped (the matching urllib3
    # warning is silenced at module level).
    response = requests.get(url, headers=headers, verify=False)
    etree_html = etree.HTML(response.text)
    result = _parse_listings(etree_html)
    print('----------------正在采集第{}页数据----------------'.format(page))
    _export_excel(result, page)
    _insert_rows(result)
    print('----------------采集第{}页数据完成----------------'.format(page))
    return result


def _parse_listings(etree_html):
    """Extract the listing fields from a parsed page; one dict per listing."""
    # Listing title.
    titles = etree_html.xpath(r"//div[@class='info clear']/div[@class='title']/a/text()")
    # Addresses arrive as flat (community, area) pairs; join each pair with '-'.
    raw_address = etree_html.xpath(r"//div[@class='info clear']/div[@class='flood']/div[@class='positionInfo']/a/text()")
    addresses = [str(raw_address[i]).strip() + '-' + str(raw_address[i + 1]) for i in range(0, len(raw_address), 2)]
    # Listing details: normalise '|' separators to ',' and strip spaces.
    raw_content = etree_html.xpath("//div[@class='info clear']/div[@class='address']/div[@class='houseInfo']/text()")
    contents = [i.replace('|', ',').replace(" ", "") for i in raw_content]
    # Total price, suffixed with the unit '万' (10,000 CNY).
    raw_price = etree_html.xpath("//div[@class='info clear']/div[@class='priceInfo']/div[@class='totalPrice totalPrice2']/span/text()")
    prices = [str(i) + '万' for i in raw_price]
    # Lazy-loaded images keep the real URL in @data-original.
    images = etree_html.xpath("//img[@class='lj-lazy']/@data-original")
    # zip() truncates to the shortest list, so a listing missing one field
    # simply drops out instead of shifting the columns.
    return [{'title': t, 'address': a, 'content': c, 'price': p, 'image': m}
            for t, a, c, p, m in zip(titles, addresses, contents, prices, images)]


def _export_excel(result, page):
    """Write one page's rows to /image_docker/static/ as an .xlsx file."""
    workbook = openpyxl.Workbook()
    worksheet = workbook.active
    worksheet.append(['房源标题', '房源地址', '房源详情', '房源价格', '房源图片'])
    for row in result:
        worksheet.append([row['title'], row['address'], row['content'], row['price'], row['image']])
    workbook.save(r'/image_docker/static/房源第{}页数据.xlsx'.format(page))


def _insert_rows(result):
    """Insert the scraped rows into MySQL via the connection pool.

    Uses a parameterized query — the original interpolated the values
    straight into the SQL string, which breaks on quotes in titles and is
    vulnerable to SQL injection.
    """
    sql = ('INSERT INTO `res_info` (`title`, `address`, `content`, `price`, `image`) '
           'VALUES (%s, %s, %s, %s, %s)')
    conn = pool.connection()
    try:
        for row in result:
            try:
                with conn.cursor() as cursor:
                    cursor.execute(sql, (row['title'], row['address'], row['content'], row['price'], row['image']))
                conn.commit()
                print('数据插入成功!')
            except Exception as e:
                # Keep going with the remaining rows, but report the failure
                # instead of printing a success message (duplicate-key errors
                # are expected because the scraper re-runs in a loop).
                conn.rollback()
                print('数据插入失败!', str(e))
    finally:
        # Return the connection to the pool.
        conn.close()
import threading
def execute_res_info(start_page, end_page):
    """Scrape every listing page in the inclusive range [start_page, end_page]."""
    page = start_page
    while page <= end_page:
        res_info(page)
        page += 1
# Run the scraper forever: two worker threads split the 20 pages between
# them. The original ranges were (1, 10) and (2, 20), which overlap on
# pages 2-10 — those pages were scraped twice concurrently, racing on the
# same Excel files and duplicating DB inserts; the ranges are now disjoint.
while 1:
    thread1 = threading.Thread(target=execute_res_info, args=(1, 10))
    thread2 = threading.Thread(target=execute_res_info, args=(11, 20))
    # Start both workers.
    thread1.start()
    thread2.start()
    # Wait for both to finish before starting the next cycle.
    thread1.join()
    thread2.join()
    print("所有记录执行完成", datetime.datetime.now())
    time.sleep(60)
以下为部署所用的 Dockerfile:
# Base image; adjust the Python version as needed.
FROM python:3.7
# Set the container timezone to China Standard Time.
# (The Debian-based python image already ships tzdata, so ENV TZ is enough.)
ENV TZ=Asia/Shanghai
# Create the working directory /image_docker and copy in the program,
# its dependency list and the static output directory.
RUN mkdir /image_docker
COPY resuqsts_page.py /image_docker/
COPY requirements.txt /image_docker/
COPY static /image_docker/static/
# Set the working directory to /image_docker.
WORKDIR /image_docker
# Install the Python dependencies.
RUN pip install --no-cache-dir -r requirements.txt
# Run the scraper as the container entry point.
# NOTE: the original also declared CMD ["python", "resuqsts_page.py"];
# with an exec-form ENTRYPOINT that CMD is appended as extra arguments
# ("python resuqsts_page.py python resuqsts_page.py"), so it was removed.
ENTRYPOINT ["python", "resuqsts_page.py"]
以下为所依赖的 Python 包,保存在 requirements.txt 中:
PyMySQL~=1.0.3
DBUtils~=3.0.3
requests
openpyxl
lxml
部署根目录以及文件存放位置
docker 镜像构建及容器运行命令(先在 Dockerfile 所在目录构建镜像,再运行容器):
docker build -t res_info .
docker run --name res_info -d --dns=8.8.8.8 res_info
仅供参考!!!