Python crawler: applying the same recipe to scrape the data you need via a site's search

The script below is a two-stage producer/consumer pipeline: CrawlInfo threads fetch the search-result pages into a queue, and ParseInfo threads parse each page and download every image set it links to.

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2020/12/5 13:18
# @Author  : huni
# @File    : 搜索爬取.py
# @Software: PyCharm

import requests
from lxml import etree
from urllib import parse
import os
import time
from queue import Queue
from threading import Thread

# Fetch the search-result list pages
class CrawlInfo(Thread):
    # Override the constructor to accept the shared queues
    def __init__(self, url_queue, html_queue):
        Thread.__init__(self)
        # keep both queues as instance attributes
        self.url_queue = url_queue
        self.html_queue = html_queue

    def run(self):
        # worker loop: pull a URL, fetch it, hand the HTML to the parse stage
        while not self.url_queue.empty():
            url = self.url_queue.get()
            response = requests.get(url=url, headers=headers)
            print(response.status_code)
            if response.status_code == 200:
                self.html_queue.put(response.text)
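
# Note: testing url_queue.empty() is only safe here because the main block
# fills the queue completely before any CrawlInfo thread starts; with a live
# producer the check would race with get(). A common alternative (a sketch,
# not part of the original script) is to stop on a sentinel value instead:
#
#     while True:
#         url = self.url_queue.get()
#         if url is None:          # push one None per worker when done
#             break
#         ...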


# Parse each result page: walk the topic list and download every image set
class ParseInfo(Thread):
    def __init__(self, html_queue, detail_queue):
        Thread.__init__(self)
        self.html_queue = html_queue
        # accepted for symmetry with CrawlInfo; this script never puts anything on it
        self.detail_queue = detail_queue

    def run(self):
        # one Session per worker; 'Connection: close' releases sockets eagerly
        # (the original set s.keep_alive = False, which requests does not act on)
        s = requests.Session()
        s.headers.update(headers)
        s.headers['Connection'] = 'close'

        while not self.html_queue.empty():
            tree2 = etree.HTML(self.html_queue.get())

            # every search hit is an <li> under ul#pins
            li_list = tree2.xpath('//ul[@id="pins"]/li')
            for li in li_list:
                href = li.xpath('./a/@href')[0]

                page3_text = s.get(url=href).text
                tree3 = etree.HTML(page3_text)

                # the page title names the download folder; the 5th link in
                # the pager is taken as the last page number of the set
                title = tree3.xpath('/html/head/title/text()')[0]
                pagenum = int(tree3.xpath('//div[@class="pagenavi"]/a[5]//text()')[0])

                title_path = kw_path + f'/{title}'
                if not os.path.exists(title_path):
                    os.mkdir(title_path)

                for pa in range(1, pagenum + 1):
                    everyhref = href + f'/{pa}'

                    page4_text = s.get(url=everyhref).text
                    tree4 = etree.HTML(page4_text)

                    src = tree4.xpath('/html/body/div[2]/div[1]/div[3]/p/a/img/@src')[0]

                    jpgdata = s.get(url=src).content
                    jpgname = src.split('/')[-1]

                    jpg_path = title_path + f'/{jpgname}'
                    with open(jpg_path, 'wb') as fp:
                        fp.write(jpgdata)
                        print(jpgname, 'downloaded')
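
# The folder name is taken straight from the page <title>, which can contain
# characters that are illegal in Windows paths (\ / : * ? " < > |). A minimal
# sanitizer (a sketch, not part of the original script) would be:
#
#     import re
#     def safe_name(name):
#         return re.sub(r'[\\/:*?"<>|]', '_', name).strip()
#
# and then: title_path = kw_path + f'/{safe_name(title)}'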


if __name__ == '__main__':
    start = time.time()
    headers = {
        'Referer': 'https://www.mzitu.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36'
    }

    kw = '关键词'   # the search keyword (placeholder); URL-encode it for the path
    keyword = parse.quote(kw, encoding='utf-8')
    url = f'https://www.mzitu.com/search/{keyword}/'

    kw_path = './搜索' + f'/{kw}'
    # makedirs also creates the parent ./搜索 folder if it is missing
    os.makedirs(kw_path, exist_ok=True)

    url_queue = Queue()
    html_queue = Queue()
    detail_queue = Queue()

    page_text = requests.get(url=url, headers=headers).text
    tree = etree.HTML(page_text)

    # the 4th <a> in the pager is taken as the total number of result pages
    search_page_num = int(tree.xpath('/html/body/div[2]/div[1]/div[3]/div/a[4]/text()')[0])
    for search_page in range(1, search_page_num + 1):
        everyurl = f'https://www.mzitu.com/search/{keyword}/page/{search_page}/'
        url_queue.put(everyurl)

    crawl_list = []
    for i in range(5):
        crawl = CrawlInfo(url_queue, html_queue)
        crawl_list.append(crawl)
        crawl.start()

    for crawl in crawl_list:
        crawl.join()  # wait until all fetch threads have drained the URL queue

    parse_list = []
    for i in range(5):
        # named 'parser' so it does not shadow urllib's 'parse' module
        parser = ParseInfo(html_queue, detail_queue)
        parse_list.append(parser)
        parser.start()

    for parser in parse_list:
        parser.join()

    print(time.time() - start)  # total run time in seconds
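
A closing note: the hand-rolled Thread/Queue pipeline above can be written more
compactly with the standard library's concurrent.futures. Here is a minimal
sketch of the fetch stage, assuming the same kind of headers dict (the keyword
and page count are placeholders, not the original values):

    from concurrent.futures import ThreadPoolExecutor
    import requests

    headers = {'User-Agent': 'Mozilla/5.0'}
    urls = [f'https://www.mzitu.com/search/kw/page/{p}/' for p in range(1, 4)]

    def fetch(url):
        # one GET per search page; return the HTML, or None on a non-200 reply
        r = requests.get(url, headers=headers, timeout=10)
        return r.text if r.status_code == 200 else None

    with ThreadPoolExecutor(max_workers=5) as pool:
        pages = [html for html in pool.map(fetch, urls) if html]

pool.map preserves input order and the executor joins its workers when the
with-block exits, so the explicit start()/join() bookkeeping disappears.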

