Python crawler: multithreaded crawling of all of a blogger's works on a site, with optimization details

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2020/12/1 18:33
# @Author  : huni
# @File    : 爬壁纸(多线程+多页爬取).py
# @Software: PyCharm

from threading import Thread        # threading support
from queue import Queue             # thread-safe queue for passing work between threads
import requests
from lxml import etree
import os


class CrawlInfo(Thread):

    # override the constructor, taking the two shared queues
    def __init__(self,url_queue,html_queue):
        Thread.__init__(self)
        # keep references to the queues as instance attributes
        self.url_queue = url_queue
        self.html_queue = html_queue

    # override the run method
    def run(self):
        # crawling code
        headers = {
            'Referer': 'http://www.xiannvku.com/',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.93 Safari/537.36'
        }
        while not self.url_queue.empty():               # keep going while there are URLs left to crawl
            url = self.url_queue.get()                  # take one URL from the queue
            response = requests.get(url=url, headers=headers)
            if response.status_code == 200:
                self.html_queue.put(response.text)      # on success, put the page HTML into the html queue


class ParseInfo(Thread):
    def __init__(self,html_queue):
        Thread.__init__(self)
        self.html_queue = html_queue

    # override the run method
    def run(self):
        s = requests.Session()
        s.keep_alive = False  # try not to keep extra idle connections open
        headers = {
            'Referer': 'http://www.xiannvku.com/',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36'
        }
        while not self.html_queue.empty():
            e = etree.HTML(self.html_queue.get())           # take one HTML document from the queue and parse it with etree
            # the parsing below can be adapted to whatever you need to scrape (jokes, audio, video, ...);
            # just change base_url before running
            li_list = e.xpath('//ul[@class="img"]/li')
            for li in li_list:
                detailurl = li.xpath('./a[1]/@href')[0]
                detail_page = s.get(detailurl, headers=headers).text
                detail_tree = etree.HTML(detail_page)
                pagenum = int(detail_tree.xpath('//div[@id="pages"]/a')[-2].xpath('./text()')[0])   # the second-to-last pager link holds the last page number

                title = detail_tree.xpath('//title/text()')[0]
                title_path = search_path + f'/{title}'
                if not os.path.exists(title_path):
                    os.mkdir(title_path)

                for j in range(1, pagenum + 1):
                    rep = str(j) + '.html'
                    href = detailurl.replace(detailurl.split('-')[-1], rep)

                    page = s.get(url=href, headers=headers).text
                    tree = etree.HTML(page)

                    img_list = tree.xpath('//div[@class="content"]/center/img')
                    for img in img_list:
                        src = img.xpath('./@src')[0]
                        jpgname = src.split('/')[-1]
                        jpgpath = title_path + '/' + jpgname
                        jpgdata = s.get(url=src, headers=headers).content

                        with open(jpgpath, 'wb') as fp:
                            fp.write(jpgdata)
                            print(jpgname, 'saved')


if __name__ == '__main__':
    headers = {
        'Referer': 'http://www.xiannvku.com/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36'
    }
    # queues that hold the URLs to crawl and the fetched HTML pages
    url_queue = Queue()
    html_queue = Queue()

    key = 'XXX'     # replace with any search keyword you like

    base_url = 'http://www.xiannvku.com/index.php/pic/search'

    paradata = {
        'key': key
    }
    search_page = requests.post(url=base_url, headers=headers, data=paradata).text
    search_tree = etree.HTML(search_page)
    search_num = search_tree.xpath('//div[@class="text-c"]/a[1]/text()')[0]
    print('Search returned:', search_num)


    search_path = './xxx' + f'/{key}'
    os.makedirs(search_path, exist_ok=True)     # also creates ./xxx itself if it does not exist yet

    page_num = (int(search_num.replace('条', '')) // 28) + 1      # the search listing shows 28 results per page
    for i in range(1, page_num + 1):
        every_url = f'http://www.xiannvku.com/pic/search?key={key}&page={i}'
        url_queue.put(every_url)


    crawl_list = []             # create the crawler threads and keep them in a list
    for i in range(100):
        Crawl = CrawlInfo(url_queue,html_queue)
        crawl_list.append(Crawl)
        Crawl.start()

    for crawl in crawl_list:
        # wait until every crawler thread has finished, so the html queue is fully filled before the parser threads start
        crawl.join()

    parse_list = []
    for i in range(100):
        parse = ParseInfo(html_queue)
        parse_list.append(parse)
        parse.start()

    for parse in parse_list:
        parse.join()
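
One detail worth noting about the structure above: every CrawlInfo thread is joined before the first ParseInfo thread starts, so parsing cannot begin until all listing pages have been fetched, and the empty() check in ParseInfo.run is only safe because of that ordering. If you want the two stages to overlap, a common alternative is to let the parser threads block on get() and stop when they see a sentinel value. The following is a minimal, self-contained sketch of that pattern; parse_worker and the stand-in print are illustrative only and would be replaced by the lxml parsing from ParseInfo.run:

from queue import Queue
from threading import Thread

SENTINEL = None     # pushed once, after the last page has been fetched

def parse_worker(html_queue):
    # Blocking get() instead of polling empty(): the worker can be started
    # together with the crawler threads and simply waits for work to arrive.
    while True:
        html = html_queue.get()
        if html is SENTINEL:
            html_queue.put(SENTINEL)    # re-queue it so the other workers also stop
            break
        print('parsed a page of', len(html), 'characters')   # stand-in for the real parsing

if __name__ == '__main__':
    html_queue = Queue()
    workers = [Thread(target=parse_worker, args=(html_queue,)) for _ in range(4)]
    for w in workers:
        w.start()
    for page in ('<html>1</html>', '<html>2</html>'):    # stand-in for CrawlInfo output
        html_queue.put(page)
    html_queue.put(SENTINEL)            # signal that no more HTML is coming
    for w in workers:
        w.join()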

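Also, s.keep_alive = False is a widely copied workaround for connection errors during bulk downloads, but it is not a documented requests setting. If downloads keep dropping, a more reliable option is to mount an HTTPAdapter with a retry policy on the Session. This is only a sketch under that assumption; the retry counts are arbitrary examples and the Referer / User-Agent values are simply copied from the script above:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def make_session():
    # One Session per thread reuses TCP connections; the adapter retries
    # transient failures (connection resets, 5xx responses) with a backoff.
    session = requests.Session()
    retry = Retry(total=3, backoff_factor=0.5,
                  status_forcelist=[429, 500, 502, 503, 504])
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    session.headers.update({
        'Referer': 'http://www.xiannvku.com/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36',
    })
    return session

# Usage: build one session per ParseInfo thread and call session.get(url)
# wherever the script above calls requests.get / s.get.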
