Python crawler: quickly scraping high-quality content site-wide with multithreading (lots of moving parts; worth bookmarking)

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2020/12/7 15:06
# @Author  : huni
# @File    : 高质量多线程.py
# @Software: PyCharm
from threading import Thread
from queue import Queue
import requests
from lxml import etree
import os
from urllib import parse
import time
import re

# Fetch the list pages
class CrawlInfo(Thread):
    # Override Thread's constructor
    def __init__(self,url_queue,html_queue):
        Thread.__init__(self)
        # Keep references to the two shared queues
        self.url_queue = url_queue
        self.html_queue = html_queue

    def run(self):
        # Crawling code
        while not self.url_queue.empty():               # keep going while there are URLs left to fetch
            url = self.url_queue.get()                  # take one URL off the queue
            response = requests.get(url=url, headers=headers)
            if response.status_code == 200:
                self.html_queue.put(response.text)      # on success, push the HTML onto the HTML queue
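
# Note: the empty()-then-get() pattern used by these workers is racy once
# several threads share one queue: another worker can drain the queue between
# the two calls, and get() will then block forever. A minimal sketch of a
# safer drain loop, kept separate from the classes here for illustration:
from queue import Empty

def drain_queue(q, handle):
    """Consume items until the queue is exhausted, without ever blocking."""
    while True:
        try:
            item = q.get_nowait()    # raises queue.Empty instead of blocking
        except Empty:
            break                    # queue exhausted; let the worker exit
        handle(item)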

# Extract the album (topic) links from each list page
class ParseInfo(Thread):
    def __init__(self,html_queue,detail_queue):
        Thread.__init__(self)
        self.html_queue = html_queue
        self.detail_queue = detail_queue

    def run(self):
        while not self.html_queue.empty():
            tree1 = etree.HTML(self.html_queue.get())           # take one HTML page off the queue and parse it with etree
            # Only the extraction below is site-specific; adapt the XPath (and the
            # start URL in __main__) to scrape other content such as text, audio or video
            href_set = set(tree1.xpath('//ul[@class="photo_ul"]//@href'))
            for href1 in href_set:
                href2 = 'https://www.nvshens.org' + href1
                response2 = requests.get(url=href2, headers=headers).text
                self.detail_queue.put(response2)
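
# As the comment in run() notes, only the XPath and URL prefix are site
# specific. An illustrative variant for a different page layout (the selector
# and domain below are hypothetical placeholders, not this site's markup):
def parse_variant(html_queue, detail_queue):
    while not html_queue.empty():
        tree = etree.HTML(html_queue.get())
        for href in set(tree.xpath('//div[@class="post-list"]//a/@href')):
            detail_queue.put(requests.get('https://example.com' + href, headers=headers).text)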

# Fetch the page count from each album page and download its images
class DetailInfo(Thread):
    def __init__(self, detail_queue):
        Thread.__init__(self)
        self.detail_queue = detail_queue


    def run(self):
        # One Session per worker thread, so HTTP connections are reused
        # instead of being re-opened for every image request
        s = requests.Session()
        headers = {
            'Referer': 'https://www.nvshens.org/',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36'
        }
        while not self.detail_queue.empty():
            tree2 = etree.HTML(self.detail_queue.get())  # take one album page off the queue and parse it
            # Total photo count, taken as the first number in the '#dinfo' span text
            href_page_num = int((re.findall(r'(\d+)', tree2.xpath('//*[@id="dinfo"]/span/text()')[0]))[0])

            # An existing page link, reused below as a template for building each page URL
            hrefbefore = tree2.xpath('//*[@id="pages"]/a[2]/@href')[0]

            title = tree2.xpath('/html/head/title/text()')[0]
            title_path = search_path + '/' + title
            os.makedirs(title_path, exist_ok=True)       # exist_ok avoids a race between worker threads

            # The gallery appears to show 3 photos per page, so this slightly
            # overshoots the page count rather than risk missing the last page
            for j in range(1, href_page_num // 3 + 2):
                href3 = 'https://www.nvshens.org' + hrefbefore.replace(hrefbefore.split('/')[-1], f'{j}.html')

                response3 = s.get(url=href3, headers=headers).text
                tree3 = etree.HTML(response3)
                src_list = tree3.xpath('//ul[@id="hgallery"]//@src')
                for src in src_list:
                    # Rewrite the thumbnail URL into the full-size image URL
                    src1 = src.replace('img', 't1')
                    src2 = src1.replace('/s', '')

                    jpg_data = s.get(url=src2, headers=headers).content
                    jpg_name = src2.split('/')[-1]

                    jpg_path = title_path + '/' + jpg_name
                    with open(jpg_path, 'wb') as fp:
                        fp.write(jpg_data)
                        print(jpg_name, 'downloaded')
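
# The download calls above run with no timeout, so one stalled response can
# hang a worker thread indefinitely. A minimal sketch of a guarded fetch with
# retries (the parameter values are illustrative, not tuned for this site):
def fetch_bytes(url, tries=3, timeout=10):
    """Fetch url with a timeout, retrying a few times before giving up."""
    for attempt in range(tries):
        try:
            resp = requests.get(url, headers=headers, timeout=timeout)
            if resp.status_code == 200:
                return resp.content
        except requests.RequestException:
            time.sleep(1)            # brief back-off before the next attempt
    return None                      # caller decides how to handle a failure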


if __name__ == '__main__':
    start = time.time()
    headers = {
        'Connection': 'keep-alive',
        'Referer': 'https://www.nvshens.org/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36'
    }
    # Queues shared between the pipeline stages: URLs, list-page HTML, album-page HTML
    url_queue = Queue()
    html_queue = Queue()
    detail_queue = Queue()

    kw = 'xxx'    # search keyword (placeholder; replace before running)
    keyword = parse.quote(kw, encoding='utf-8')
    search_url = f'https://www.nvshens.org/girl/search.aspx?name={keyword}'
    resp = requests.get(url=search_url, headers=headers).text
    find_link = re.findall(r'<a style="line-height:19px;" href=\'(.*?)\' target=', resp)
    url = 'https://www.nvshens.org' + find_link[0] + 'album/'

    search_path = f'./搜索/{kw}'
    os.makedirs(search_path, exist_ok=True)    # makedirs also creates the ./搜索 parent directory

    # Fetch the album index and work out how many list pages to queue
    response = requests.get(url=url, headers=headers).text
    tree = etree.HTML(response)

    search_info = tree.xpath('//*[@id="post"]/div[2]/div/text()')[0]
    page_num = int(re.findall(r'共(.*?)部', search_info)[0])    # total number of albums
    # The index appears to list 30 albums per page; overshoot by one page to be safe
    for i in range(1, page_num // 30 + 2):
        href = url + f'{i}.html'
        url_queue.put(href)

    crawl_list = []
    for i in range(5):                  # 5 threads fetch the list pages
        crawl = CrawlInfo(url_queue, html_queue)
        crawl_list.append(crawl)
        crawl.start()

    for crawl in crawl_list:
        crawl.join()                    # wait until every list page has been fetched

    parse_list = []
    for i in range(10):                 # 10 threads extract the album links
        parser = ParseInfo(html_queue, detail_queue)
        parse_list.append(parser)
        parser.start()

    for parser in parse_list:
        parser.join()

    detail_list = []
    for i in range(10):                 # 10 threads download the images
        detail = DetailInfo(detail_queue)
        detail_list.append(detail)
        detail.start()

    for detail in detail_list:
        detail.join()

    print(time.time() - start)          # total elapsed seconds
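
Note that the three stages above run strictly one after another: every CrawlInfo thread is joined before the first ParseInfo thread starts, so the queues act as batch buffers rather than a live pipeline. For a fetch-heavy workload like this, a thread pool over the URL list gives the same concurrency with less wiring. A minimal sketch, assuming the same headers and list-page URLs as the script above:

from concurrent.futures import ThreadPoolExecutor

def fetch_html(url):
    # One fetch per task; the pool caps concurrency at max_workers
    resp = requests.get(url, headers=headers)
    return resp.text if resp.status_code == 200 else None

# urls would be the same list-page URLs that were seeded into url_queue:
# with ThreadPoolExecutor(max_workers=5) as pool:
#     pages = [html for html in pool.map(fetch_html, urls) if html]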