#!/usr/bin/env python
# coding=utf-8
"""
基于进程+线程实现多任务爬虫程序,爬取站长之家风景图片
"""
import time
import uuid
from multiprocessing import Process, Queue
from queue import Empty
from threading import Thread

import requests
from lxml import etree
from openpyxl import Workbook
# Browser-like User-Agent so the site serves normal pages instead of
# rejecting the default requests client fingerprint.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36",
}
# Running count of scraped items.
# NOTE(review): incremented from parse threads without a lock; `+=` on an
# int is not atomic, so the count may drift under contention — confirm.
num = 0
# Workbook collecting one row per scraped image.
wb = Workbook()
ws = wb.active
ws.title = '图片'
# Header row for the sheet.
ws.append(['id', 'name', 'image_url'])
class DownloadThread(Thread):
    """Worker thread that fetches a single URL and stores the response body.

    The fetched HTML is exposed via :meth:`get_content`; it stays ``None``
    when the request fails or returns a non-200 status.
    """

    def __init__(self, url, name):
        # ``name`` is a human-readable label for log output (e.g. "列表页"),
        # not the Thread name.
        self.url = url
        self.show_name = name
        self.content = None  # filled by run() on success
        super().__init__()

    def run(self):
        print("开始下载%s:%s" % (self.show_name, self.url))
        # A timeout keeps one stalled server from hanging the whole
        # crawler pipeline forever (the original call could block
        # indefinitely).
        resp = requests.get(self.url, headers=headers, timeout=30)
        if resp.status_code == 200:
            # The site serves UTF-8; set it explicitly so resp.text
            # decodes correctly.
            resp.encoding = 'utf-8'
            self.content = resp.text
            print("%s下载完成" % self.show_name)
        else:
            print("此页面异常,异常代码:", resp.status_code)

    def get_content(self):
        """Return the downloaded HTML text, or ``None`` on failure."""
        return self.content
class DownloadProcess(Process):
    """Process that consumes list-page URLs and produces (url, html) parse jobs.

    Exits after 30 seconds with no new URL in the download queue.
    """

    def __init__(self, url_q, html_q):
        self.url_q: Queue = url_q    # queue of page URLs to download
        self.html_q: Queue = html_q  # queue of (url, html) pairs to parse
        super().__init__()

    def run(self):
        while True:
            try:
                url = self.url_q.get(timeout=30)
            except Empty:
                # No new URL for 30s: assume the crawl is finished.
                # (The original bare ``except:`` also swallowed
                # KeyboardInterrupt and real errors.)
                break
            # Download the list page in a worker thread and wait for it.
            t = DownloadThread(url, name="列表页")
            t.start()
            t.join()
            html = t.get_content()
            if html is None:
                # Download failed; a None page would crash the parser,
                # so skip this URL instead of queueing it.
                continue
            # Hand the page over to the parse process.
            self.html_q.put((url, html))
        print("--下载进程DownloadProcess结束--")
class ParseThread(Thread):
    """Thread that parses one downloaded list page.

    For every picture link it downloads the detail page to find the
    full-size image URL (falling back to the thumbnail when the detail
    page cannot be fetched), appends a row to the shared worksheet, and
    pushes the next list-page URL back onto the download queue.
    """

    def __init__(self, url_q, html, base_url):
        self.url_q: Queue = url_q  # download queue; next-page URLs go here
        self.html: str = html      # raw HTML of the list page (was mis-annotated as Queue)
        self.base_url = base_url   # used to absolutize relative next-page links
        super().__init__()

    def run(self):
        html = etree.HTML(self.html)
        # Current page number, used only for progress output.
        page = html.xpath('//div[@class="fenye"]/a[@class="active"]/b/text()')[0]
        print("解析线程开启")
        # Walk the picture blocks; each <a> links to the detail page.
        # (A faster variant would read the thumbnail URLs directly from the
        # list page's <img> tags, skipping the per-image detail download.)
        images = html.xpath('//div[@id="container"]/div[contains(@class,"picblock")]/div/a')
        for img in images:
            item = {}
            item['id'] = uuid.uuid4().hex
            item['name'] = img.xpath('./@alt')[0]
            img_url = img.xpath('./@href')[0]
            # Fetch the detail page in a worker thread to get the big image.
            t = DownloadThread(img_url, name="图片")
            t.start()
            t.join()
            img_html = t.get_content()
            if img_html:
                img_html = etree.HTML(img_html)
                item['cover'] = img_html.xpath('//div[@class="imga"]/a/@href')[0]
            else:
                # Detail page failed: fall back to the thumbnail. ``src2``
                # is the lazy-load attribute; absent, use plain ``src``.
                try:
                    item['cover'] = img.xpath('./img/@src2')[0]
                except IndexError:
                    item['cover'] = img.xpath('./img/@src')[0]
            # NOTE(review): ``num`` and ``ws`` are shared by all parse
            # threads without a lock; counts/rows may interleave.
            global num
            num += 1
            print("已下载数据个数:%s,当前页数:%s" % (num, page))
            print(item)
            ws.append([item['id'], item['name'], item['cover']])
        try:
            # The last page has no "next page" link; IndexError ends the crawl.
            get_next_url = html.xpath('//div[@class="fenye"]/a[@class="nextpage"]/@href')[0]
        except IndexError:
            return
        if get_next_url.startswith("http"):
            next_url = get_next_url
        else:
            next_url = self.base_url + get_next_url
        self.url_q.put(next_url)  # schedule the next list page for download
class ParseProcess(Process):
    """Process that consumes (url, html) jobs and spawns a parse thread per page.

    Exits after 30 seconds without a new job, waits for the outstanding
    parse threads, then saves the workbook. The save must happen HERE:
    rows are appended to this process's copy of the module-level
    worksheet and would otherwise be lost (the module-level ``wb.save``
    runs at import time, before any row exists).
    """

    def __init__(self, url_q, html_q):
        self.url_q: Queue = url_q    # download queue (parse threads push next-page URLs)
        self.html_q: Queue = html_q  # incoming (url, html) parse jobs
        super().__init__()

    def run(self):
        workers = []
        while True:
            try:
                url, html = self.html_q.get(timeout=30)
            except Empty:
                # No parse job for 30s: the crawl is done. (Replaces the
                # original bare ``except:`` that hid real errors too.)
                break
            print("准备启动解析任务子线程", url)
            # e.g. 'http://sc.chinaz.com/tupian/x.html' -> 'http://sc.chinaz.com/tupian/'
            base_url = url[:url.rindex('/') + 1]
            t = ParseThread(self.url_q, html, base_url)
            t.start()
            workers.append(t)
        # Let in-flight parse threads finish their worksheet appends.
        for t in workers:
            t.join()
        # Persist the rows collected in this process.
        wb.save("图片表" + time.strftime("%Y-%m-%d", time.localtime()) + '.xlsx')
        print("-*-解析进程ParseProcess结束-*-")
# NOTE(review): these statements run at IMPORT time — immediately after the
# class definitions and before the ``__main__`` block below — so the file
# written here contains only the header row. Rows are appended inside the
# ParseProcess child process, whose copy of the workbook this save never
# sees; verify the child's data is persisted somewhere (e.g. save inside
# ParseProcess, or ship rows back through a queue).
file_end_name = time.strftime("%Y-%m-%d", time.localtime())
wb.save("图片表"+file_end_name + '.xlsx')
print("保存表格完毕")
if __name__ == '__main__':
    # Inter-process queues: one carries page URLs to download, the other
    # carries (url, html) pairs waiting to be parsed.
    task1 = Queue()
    task2 = Queue()
    # Seed the crawl with the first landscape-picture list page.
    task1.put('http://sc.chinaz.com/tupian/fengjingtupian.html')
    workers = [
        DownloadProcess(task1, task2),  # downloader
        ParseProcess(task1, task2),     # parser
    ]
    for proc in workers:
        proc.start()
    for proc in workers:
        proc.join()
    print("--全部完成--")