Day 15 - Hands-on Web Crawler Project

This post shows how to write a multi-threaded crawler in Python. It is built from PageSpider, DetailSpider and DataParse modules that fetch listing pages from a target site, download and parse the detail pages, and save the results to an Excel file, covering request handling, HTML parsing with XPath, and concurrency control along the way.

Contents

1. Introduction

2. Code

1.main.py

2.PageSpider.py

3.DetailSpider.py

4.DataParse.py

5.Constant.py

6.HanderRequest.py


1. Introduction

1. Crawl the site with multiple threads

2. Save the crawled data to an Excel file

3. Target site (used for testing only), a site about online side projects: https://www.maomp.com/

4. Result: one Excel row per crawled article (URL, title, publish date, content). A sketch of the project layout follows this list.
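The project is split into six small modules, connected by three queues (listing-page URLs → detail-page URLs → raw detail HTML). A possible layout on disk (the directory name is just an example) and the role of each file:

maomp_spider/
    main.py           # entry point: builds the queues and runs the three thread groups
    PageSpider.py     # listing-page threads: extract the detail-page URLs
    DetailSpider.py   # detail-page threads: download the article HTML
    DataParse.py      # parser threads: XPath-parse the HTML and write Excel rows
    Constant.py       # shared row counter for the Excel sheet
    HanderRequest.py  # thin wrapper around requests.get

The only third-party packages used are requests, lxml and xlsxwriter.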

2. Code

1.main.py

# coding:utf-8
import threading

from queue import Queue
from PageSpider import PageSpider
from DetailSpider import DetailSpider
from DataParse import DataParse
import xlsxwriter
import time
"""
爬取网站:https://www.maomp.com/wzjc/
爬取信息,保存至Excel
"""

def start_page(threadsize, page_queue, detail_queue):
    # Start the worker threads that fetch the listing pages
    page_spider_list = []
    for i in range(1, threadsize + 1):
        pageSpiderThread = PageSpider(thread_name="PageSpider-" + str(i), page_queue=page_queue, detail_queue=detail_queue)
        # Start the thread
        pageSpiderThread.start()
        page_spider_list.append(pageSpiderThread)
    # Wait for every page thread to finish; join() blocks until the thread exits
    for page_spider in page_spider_list:
        if page_spider.is_alive():
            page_spider.join()


def start_detail(threadsize, detail_queue, data_queue):
    # Start the worker threads that fetch the detail pages
    detail_spider_list = []
    for i in range(1, threadsize + 1):
        detailSpiderThread = DetailSpider(thread_name="DetailSpider-" + str(i), detail_queue=detail_queue,
                                          data_queue=data_queue)
        # Start the thread
        detailSpiderThread.start()
        detail_spider_list.append(detailSpiderThread)
    # Wait for every detail thread to finish
    for detail_spider in detail_spider_list:
        if detail_spider.is_alive():
            detail_spider.join()

def start_data_parse(threadsize, data_queue, book):
    # Start the worker threads that parse detail pages and write rows to Excel
    lock = threading.Lock()
    sheet1 = book.add_worksheet("sheet1")
    title_data = ("URL", "Title", "Publish date", "Content")
    # Write the header row
    for index, title_datum in enumerate(title_data):
        sheet1.write(0, index, title_datum)

    spider_list = []
    for i in range(1, threadsize + 1):
        thread = DataParse(thread_name="DataParse-" + str(i), data_queue=data_queue, lock=lock, sheet=sheet1)
        # Start the thread
        thread.start()
        spider_list.append(thread)
    # Wait for every parser thread to finish
    for parse in spider_list:
        if parse.is_alive():
            parse.join()

def main():
    # Queue of listing-page URLs
    page_queue = Queue()
    # Queue of detail-page URLs
    detail_queue = Queue()
    # Queue of raw detail-page HTML
    data_queue = Queue()
    page_start = 1
    page_end = 1
    for i in range(page_start, page_end + 1):
        page_url = "https://www.maomp.com/wzjc/page/{}/".format(i)
        page_queue.put(page_url)
    print("Listing-page queue size:", page_queue.qsize())

    # Stage 1: collect the listing pages
    start_page(threadsize=3, page_queue=page_queue, detail_queue=detail_queue)
    # Stage 2: collect the detail pages
    start_detail(threadsize=3, detail_queue=detail_queue, data_queue=data_queue)
    # Stage 3: parse the detail pages and write to Excel
    # Create the Excel workbook, named with the current timestamp
    book = xlsxwriter.Workbook(time.strftime("%Y%m%d%H%M%S", time.gmtime()) + "_result.xlsx")
    start_data_parse(threadsize=5, data_queue=data_queue, book=book)
    book.close()
    print("Remaining listing-page queue size:", page_queue.qsize())
    print("Remaining detail-page queue size:", detail_queue.qsize())
    print("Remaining data queue size:", data_queue.qsize())

if __name__ == '__main__':
    main()
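To run it, install the third-party dependencies with pip (requests, lxml and xlsxwriter are the only ones used) and start the entry script with python main.py. With page_start = page_end = 1 only the first listing page is crawled; widen that range to fetch more pages. Note that the three stages run strictly one after another: the listing-page threads are joined before the detail threads start, so each queue is fully filled before the next stage begins to drain it. The workbook lands in the working directory under a timestamp-based file name.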

2.PageSpider.py

# coding:utf-8
import threading
from lxml import etree
import HanderRequest


class PageSpider(threading.Thread):
    """
    页面url,请求多线程类
    """

    def __init__(self,thread_name,page_queue,detail_queue):
        super(PageSpider,self).__init__()
        self.thread_name=thread_name
        self.page_queue=page_queue
        self.detail_queue=detail_queue

    def parse_detail_url(self, content):
        """
        Parse a listing page and extract the detail-page URLs.
        :param content: HTML text of the listing page
        :return: None; each detail-page URL is pushed onto detail_queue
        """
        # Build an lxml HTML tree from the listing-page response
        item_html = etree.HTML(content)
        # Extract the detail-page links from the article titles
        detail_urls = item_html.xpath("//h2[@class='entry-title']/a/@href")
        for url in detail_urls:
            # Push each detail-page URL onto the queue
            self.detail_queue.put(url)

    def run(self):
        # Fetch listing pages until the queue is drained
        print("{} started".format(self.thread_name))
        try:
            while not self.page_queue.empty():
                # Non-blocking get; raises queue.Empty if another thread drained the queue first
                page_url = self.page_queue.get(block=False)
                # Request the listing page
                response_text = HanderRequest.send_reqeust(page_url)
                if response_text:
                    # Extract the detail-page URLs
                    self.parse_detail_url(response_text)
        except Exception as e:
            print("{} raised an exception: {}".format(self.thread_name, e))

        print("{} finished".format(self.thread_name))

3.DetailSpider.py

# coding:utf-8
import threading
import HanderRequest


class DetailSpider(threading.Thread):
    """
    详情页url,请求详情页
    """

    def __init__(self,thread_name,detail_queue,data_queue):
        super(DetailSpider,self).__init__()
        self.thread_name=thread_name
        self.data_queue=data_queue
        self.detail_queue=detail_queue


    def run(self):
        # Fetch detail pages until the queue is drained
        print("{} started".format(self.thread_name))
        try:
            while not self.detail_queue.empty():
                # Non-blocking get; raises queue.Empty if another thread drained the queue first
                detail_url = self.detail_queue.get(block=False)
                # Request the detail page
                response_text = HanderRequest.send_reqeust(detail_url)
                if response_text:
                    data = {
                        "url": detail_url,
                        "html_content": response_text
                    }
                    # Push the raw page onto data_queue for the parser threads
                    self.data_queue.put(data)

        except Exception as e:
            print("{} raised an exception: {}".format(self.thread_name, e))

        print("{} finished".format(self.thread_name))

4.DataParse.py

# coding:utf-8
import threading
from lxml import etree
import Constant



class DataParse(threading.Thread):
    """
    详情页数据处理
    """

    def __init__(self,thread_name,data_queue,lock,sheet):
        super(DataParse,self).__init__()
        self.thread_name=thread_name
        self.data_queue=data_queue
        self.lock=lock
        self.sheet=sheet


    def __list_join(self, items):
        # XPath returns a list of text fragments; join them into one string
        return "".join(items)

    def __parse(self, data):
        """
        Parse one entry from data_queue and write it to the Excel sheet.
        :return: None
        """
        html = etree.HTML(data.get("html_content"))
        row_data = {
            "url": data.get("url"),
            "title": self.__list_join(html.xpath("//h1[@class='entry-title']/text()")),
            "put_date": self.__list_join(html.xpath("//span[@class='my-date']/text()")),
            "content_html": self.__list_join(html.xpath("//div[@class='single-content']//p/text()"))
        }
        # Several parser threads share the sheet and the row counter, so serialize access with the lock
        with self.lock:
            # Write one row to the Excel sheet
            for index, key in enumerate(row_data):
                self.sheet.write(Constant.CURR_EXCEL_COL, index, row_data.get(key))
            Constant.CURR_EXCEL_COL += 1

    def run(self):
        # Parse entries from data_queue until it is drained
        print("{} started".format(self.thread_name))
        try:
            while not self.data_queue.empty():
                # Non-blocking get; raises queue.Empty if another thread drained the queue first
                data_content = self.data_queue.get(block=False)
                # Parse the HTML and write the row
                self.__parse(data_content)

        except Exception as e:
            print("{} raised an exception: {}".format(self.thread_name, e))

        print("{} finished".format(self.thread_name))

5.Constant.py

# coding:utf-8

# Current Excel row to write to (row 0 holds the header); shared by every DataParse thread
CURR_EXCEL_COL = 1

6.HanderRequest.py

Note: replace the Cookie value ("xxx") with your own before running.

# coding:utf-8

import requests

def send_reqeust(url):
    # Send a GET request with a browser-like User-Agent and the site cookie
    headers = {
        "Cookie": "xxx",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36"
    }
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        return response.text
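send_reqeust passes no timeout, so a stalled connection can hang a worker thread indefinitely, and any non-200 response silently yields None. A slightly more defensive variant (a sketch; the function name, retry count and timeout are illustrative choices, not part of the original code):

# coding:utf-8
import time
import requests

def send_request_safe(url, retries=2, timeout=10):
    # Same headers as send_reqeust; remember to fill in a real Cookie value
    headers = {
        "Cookie": "xxx",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36"
    }
    for attempt in range(retries + 1):
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            if response.status_code == 200:
                return response.text
            return None  # non-200 response: give up instead of retrying
        except requests.RequestException:
            # Network error or timeout: wait briefly, then retry
            time.sleep(1)
    return None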
