#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2020/12/5 13:18
# @Author : huni
# @File : 搜索爬取.py
# @Software: PyCharm
import requests
from lxml import etree
from urllib import parse
import os
import time
from queue import Queue
from threading import Thread
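# Overall structure: a two-stage producer/consumer pipeline. url_queue holds search-result
# page URLs, CrawlInfo threads fetch them and push the raw HTML into html_queue, and
# ParseInfo threads parse that HTML, follow each gallery link and download the images.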
# Stage 1: fetch the search-result list pages
class CrawlInfo(Thread):
    # Override the constructor to accept the two shared queues
    def __init__(self, url_queue, html_queue):
        Thread.__init__(self)
        # Two instance attributes: URLs to fetch, and fetched HTML handed to the parser stage
        self.url_queue = url_queue
        self.html_queue = html_queue

    def run(self):
        # Crawler loop: pull list-page URLs until the queue is drained
        while not self.url_queue.empty():
            url = self.url_queue.get()
            response = requests.get(url=url, headers=headers)
            print(response.status_code)
            if response.status_code == 200:
                self.html_queue.put(response.text)
# Stage 2: parse each list page for gallery links and download every image in each gallery
class ParseInfo(Thread):
    def __init__(self, html_queue, detail_queue):
        Thread.__init__(self)
        self.html_queue = html_queue
        self.detail_queue = detail_queue  # currently unused; kept for a possible later stage

    def run(self):
        s = requests.Session()
        s.keep_alive = False  # close extra connections
        while not self.html_queue.empty():
            tree2 = etree.HTML(self.html_queue.get())
            li_list = tree2.xpath('//ul[@id="pins"]/li')
            for li in li_list:
                # Each <li> links to one gallery page
                href = li.xpath('./a/@href')[0]
                page3_text = s.get(url=href, headers=headers).text
                tree3 = etree.HTML(page3_text)
                # xpath() returns a list, so take the first match as the gallery title
                title = tree3.xpath('/html/head/title/text()')[0]
                # The fifth link in the pagination bar holds the gallery's total page count
                pagenum = int(tree3.xpath('//div[@class="pagenavi"]/a[5]//text()')[0])
                title_path = kw_path + f'/{title}'
                if not os.path.exists(title_path):
                    os.mkdir(title_path)
                # Gallery pages are addressed as <href>/1, <href>/2, ...
                for pa in range(1, pagenum + 1):
                    everyhref = href + f'/{pa}'
                    page4_text = s.get(url=everyhref, headers=headers).text
                    tree4 = etree.HTML(page4_text)
                    src = tree4.xpath('/html/body/div[2]/div[1]/div[3]/p/a/img/@src')[0]
                    jpgdata = s.get(url=src, headers=headers).content
                    jpgname = src.split('/')[-1]
                    jpg_path = title_path + f'/{jpgname}'
                    with open(jpg_path, 'wb') as fp:
                        fp.write(jpgdata)
                    print(jpgname, 'downloaded')
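
# Optional robustness sketch (not wired into the classes above): the bare requests/session
# calls raise on a flaky connection and will kill the worker thread. A minimal retry wrapper,
# using a hypothetical helper name get_with_retry, could look like this:
def get_with_retry(url, retries=3, delay=2):
    # Hypothetical example helper: fetch a URL with a few retries, return None on failure
    for attempt in range(retries):
        try:
            resp = requests.get(url=url, headers=headers, timeout=10)
            if resp.status_code == 200:
                return resp
        except requests.exceptions.RequestException:
            time.sleep(delay)  # back off briefly before the next attempt
    return None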
if __name__ == '__main__':
    start = time.time()
    # Referer is included because image requests without it are typically blocked by anti-hotlinking checks
    headers = {
        'Referer': 'https://www.mzitu.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36'
    }
    kw = '关键词'  # search keyword; replace with the term you want to search for
    keyword = parse.quote(kw, encoding='utf-8')
    url = f'https://www.mzitu.com/search/{keyword}/'
    kw_path = './搜索' + f'/{kw}'
    # makedirs also creates the parent ./搜索 directory if it does not exist yet
    if not os.path.exists(kw_path):
        os.makedirs(kw_path)
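    # Three shared queues: list-page URLs, fetched HTML, and detail_queue (unused by the current pipeline)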
url_queue = Queue()
html_queue = Queue()
detail_queue = Queue()
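    # Fetch the first search page once, just to read the total number of result pages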
page_text = requests.get(url=url,headers=headers).text
tree = etree.HTML(page_text)
search_page_num = int(tree.xpath('/html/body/div[2]/div[1]/div[3]/div/a[4]/text()')[0])
for search_page in range(1,search_page_num+1):
everyurl = f'https://www.mzitu.com/search/{keyword}/page/{search_page}/'
url_queue.put(everyurl)
    # Five fetch threads pull the list pages concurrently
    crawl_list = []
    for i in range(5):
        crawl = CrawlInfo(url_queue, html_queue)
        crawl_list.append(crawl)
        crawl.start()
    for crawl in crawl_list:
        crawl.join()  # wait for every fetch thread to finish
    # Five parser threads then walk the galleries and download the images
    parse_list = []
    for i in range(5):
        parser_thread = ParseInfo(html_queue, detail_queue)
        parse_list.append(parser_thread)
        parser_thread.start()
    for parser_thread in parse_list:
        parser_thread.join()
    print(time.time() - start)  # total elapsed time in seconds