Python crawler: a multi-threaded job that scrapes an entire site's data (a larger project; warning: many lines of easily confused code)
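
The whole script below applies one pattern three times: a Thread subclass that drains a shared Queue until it is empty, with each stage's output feeding the next stage's queue. Before reading the full listing, here is a minimal standalone sketch of that pattern; the names (Worker, task_queue) are illustrative and not part of the script:

from queue import Queue, Empty
from threading import Thread


class Worker(Thread):
    def __init__(self, task_queue):
        Thread.__init__(self)
        self.task_queue = task_queue

    def run(self):
        # Drain the shared queue until it is empty (the script below uses the same loop shape)
        while not self.task_queue.empty():
            try:
                task = self.task_queue.get_nowait()
            except Empty:
                break                        # another worker took the last item first
            print('processing', task)


if __name__ == '__main__':
    q = Queue()
    for i in range(10):
        q.put(i)
    workers = [Worker(q) for _ in range(3)]  # several threads share one queue
    for w in workers:
        w.start()
    for w in workers:
        w.join()                             # wait for every worker to finish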

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2020/11/26 18:33
# @Author  : huni
# @File    : 爬全站.py
# @Software: PyCharm

from threading import Thread
from queue import Queue
import requests
from lxml import etree
import os


class CrawlInfo(Thread):
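    # Stage 1 of the pipeline: fetch every listing-page URL collected in __main__
    # and push the raw HTML into the shared queues for the later stages.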

    # Override the constructor
    def __init__(self, url_queue_list, html_queue, html_queue_list, href_queue, href_queue_list):
        Thread.__init__(self)
        # Bind the shared queues as instance attributes
        self.url_queue_list = url_queue_list        # queue of queues: one inner queue of listing-page URLs per index entry
        self.html_queue = html_queue                # queue holding fetched listing-page HTML
        self.html_queue_list = html_queue_list      # queue of queues holding listing-page HTML
        self.href_queue = href_queue                # queue holding detail-page HTML
        self.href_queue_list = href_queue_list      # queue of queues holding detail-page HTML

    # Override run()
    def run(self):
        # Crawling code
        headers = {
            'Connection': 'close',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36'
        }
        while not self.url_queue_list.empty():          # keep going while there are URL queues left to crawl
            url_q = self.url_queue_list.get()            # take one inner queue of listing-page URLs
            while not url_q.empty():
                url = url_q.get()
                response = requests.get(url=url, headers=headers)
                # Fix garbled Chinese: requests falls back to ISO-8859-1 when the server sends
                # no charset, so re-encode the text and decode the original bytes as UTF-8
                page_text = response.text.encode('ISO-8859-1').decode('utf-8')
                if response.status_code == 200:
                    self.html_queue.put(page_text)       # on success, push the HTML into the listing-page queue
                    self.href_queue.put(page_text)       # and into the detail-page queue as well
            self.html_queue_list.put(self.html_queue)
            self.href_queue_list.put(self.href_queue)



class ParseInfo(Thread):
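    # Stage 2 of the pipeline: take listing-page HTML, extract the pagination links
    # under div.c_page, fetch those pages, and push their HTML into the detail queues.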
    def __init__(self,html_queue_list,href_queue,href_queue_list):
        Thread.__init__(self)
        self.html_queue_list = html_queue_list
        self.href_queue = href_queue
        self.href_queue_list = href_queue_list

    # Override run()
    def run(self):
        headers = {
            'Connection': 'close',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36'
        }
        while not self.html_queue_list.empty():
            html_q = self.html_queue_list.get()
            tree1 = etree.HTML(html_q.get())           # take one HTML document from the queue and parse it with etree
            # The parsing below can be adapted to whatever you want to scrape (jokes, audio, video, etc.);
            # just change the base_url before running
            page_list = tree1.xpath('//div[@class="c_page"]//@href')

            for page_g in page_list:
                page_g = 'http://www.quantuwang.cc' + page_g

                page_r = requests.get(url=page_g, headers=headers)
                # Fix garbled Chinese (same ISO-8859-1 -> UTF-8 round trip as above)
                page_re = page_r.text.encode('ISO-8859-1').decode('utf-8')

                if page_r.status_code == 200:
                    self.href_queue.put(page_re)
            self.href_queue_list.put(self.href_queue)

class GetDownLoad(Thread):
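    # Stage 3 of the pipeline: follow every photo-set link under ul.ul960c plus its
    # pagination, extract the image under div.c_img on each page, and save it to disk.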
    def __init__(self,href_queue_list):
        Thread.__init__(self)
        self.href_queue_list = href_queue_list

    # Override run()
    def run(self):
        s = requests.Session()          # reuse one connection pool for all downloads
        # Configure retries explicitly on the session: setting requests.adapters.DEFAULT_RETRIES
        # after import does not change new adapters, and Session has no keep_alive attribute
        s.mount('http://', requests.adapters.HTTPAdapter(max_retries=5))
        headers = {
            'Connection': 'keep-alive',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36'
        }
        while not self.href_queue_list.empty():
            href_q = self.href_queue_list.get()
            while not href_q.empty():
                tree_g = etree.HTML(href_q.get())
                href_list_li = tree_g.xpath('//ul[@class="ul960c"]//@href')
                for href_li in href_list_li:
                    count_href = []
                    href_li = 'http://www.quantuwang.cc' + href_li
                    count_href.append(href_li)

                    page2_g = s.get(url=href_li, headers=headers)

                    # Fix garbled Chinese (ISO-8859-1 -> UTF-8)
                    page_text2_g = page2_g.text.encode('ISO-8859-1').decode('utf-8')

                    tree2 = etree.HTML(page_text2_g)

                    href_list2 = tree2.xpath('//div[@class="c_page"]//@href')
                    title = tree2.xpath('/html/head/title/text()')[0]
                    title = title.replace("/",'')

                    title_path = f'./全图网/{title}'
                    if not os.path.exists(title_path):
                        os.makedirs(title_path)      # makedirs also creates ./全图网 itself on the first run

                    for href2 in href_list2:
                        href2 = 'http://www.quantuwang.cc' + href2
                        count_href.append(href2)

                    for href3 in count_href:
                        page_text3 = s.get(url=href3,headers=headers).text
                        tree3 = etree.HTML(page_text3)
                        src = tree3.xpath('//div[@class="c_img"]//@src')
                        if len(src) != 0:
                            jpg_data = s.get(url=src[0],headers=headers).content
                            jpg_name = src[0].split('/')[-1]

                            jpg_path = title_path + '/' + jpg_name
                            with open(jpg_path, 'wb') as fp:
                                fp.write(jpg_data)
                                print(jpg_name, 'downloaded')


if __name__ == '__main__':
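    # Entry point: collect every listing URL (the index li tags plus their pagination),
    # seed the queue of URL queues, then run 50 threads for each pipeline stage in turn.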
    # Request headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36'
    }
    # URL of the meinv section on 全图网 (quantuwang.cc)
    html = 'http://www.quantuwang.cc/meinv/'
    # Fetch this section's index page
    cata = requests.get(url=html, headers=headers)
    # Fix garbled Chinese (ISO-8859-1 -> UTF-8)
    cata = cata.text.encode('ISO-8859-1').decode('utf-8')
    # Parse it
    tree = etree.HTML(cata)
    # Grab every li tag on the index page
    li_list = tree.xpath('/html/body/div[10]/div[2]/div[1]/div/ul[1]/li')
    all_href_list = []
    # Iterate over the li tags
    for li in li_list:
        href0_list = []  # holds the page URLs of one detail listing
        href0 = 'http://www.quantuwang.cc' + li.xpath('./a/@href')[0]  # first page of the detail listing
        href0_list.append(href0)  # add it to the page-URL list

        # Fetch that detail page
        page_text0 = requests.get(url=href0, headers=headers)
        # Fix garbled Chinese (ISO-8859-1 -> UTF-8)
        page_text0 = page_text0.text.encode('ISO-8859-1').decode('utf-8')
        # Parse the detail page
        tree0 = etree.HTML(page_text0)
        # Extract the pagination links for the photo set from the detail page
        href0_0_list = tree0.xpath('//div[@class="c_page"]//@href')

        for href0_0 in href0_0_list:
            href0_0 = 'http://www.quantuwang.cc' + href0_0

            href0_list.append(href0_0)
        all_href_list.append(href0_list)

    # Create the containers that hold the URLs and HTML: queues
    url_queue_list = Queue()
    html_queue_list = Queue()
    href_queue_list = Queue()
    html_queue = Queue()
    href_queue = Queue()
    for i in range(len(all_href_list)):
        url_queue = Queue()
        for j in range(len(all_href_list[i])):
            url_queue.put(all_href_list[i][j])
        url_queue_list.put(url_queue)


    crawl_list = []             # create the crawl threads and keep them in a list
    for i in range(50):
        Crawl = CrawlInfo(url_queue_list,html_queue,html_queue_list,href_queue,href_queue_list)
        crawl_list.append(Crawl)
        Crawl.start()

    for crawl in crawl_list:
        # Wait for every crawl thread to finish, so the HTML queues are fully
        # populated before they are handed to the parsing stage
        crawl.join()

    parse_list = []
    for i in range(50):
        parse = ParseInfo(html_queue_list,href_queue,href_queue_list)
        parse_list.append(parse)
        parse.start()

    for parse in parse_list:
        parse.join()

    gdownl_list = []
    for i in range(50):
        downl = GetDownLoad(href_queue_list)
        gdownl_list.append(downl)
        downl.start()

    for downl in gdownl_list:
        downl.join()
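
A note on the encode('ISO-8859-1').decode('utf-8') lines repeated throughout the script: requests falls back to ISO-8859-1 when the response carries no charset in its Content-Type header, so the round trip recovers the original bytes and decodes them as UTF-8. An equivalent and slightly shorter fix is to set the response encoding before reading .text; a minimal sketch, not part of the original script:

import requests

resp = requests.get('http://www.quantuwang.cc/meinv/')
resp.encoding = 'utf-8'   # tell requests how to decode the body
page_text = resp.text     # now decoded as UTF-8, no re-encode round trip needed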

