import os
import re
import threading
from queue import Empty, Queue

import requests
from lxml import etree
class Crawl_url(threading.Thread):
    """Producer thread.

    Pulls list-page URLs off ``page_queue``, scrapes each detail-page link,
    and pushes ``(image_url, title)`` pairs onto ``img_queue`` for the
    downloader threads (``Parse_url``) to consume.
    """

    # Shared request headers; a browser UA avoids the site's basic bot filter.
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
    }

    def __init__(self, page_queue, img_queue):
        super().__init__()  # Thread subclasses must run the parent initializer first
        self.page_queue = page_queue
        self.img_queue = img_queue

    def run(self):
        """Drain the page queue; exit when no list pages are left."""
        while True:
            try:
                # Non-blocking get fixes the empty()-then-get() race: with 5
                # workers, another thread could drain the queue between the
                # original empty() check and the blocking get(), hanging this
                # thread forever.
                url = self.page_queue.get(block=False)
            except Empty:
                break
            self.parse_page(url)

    def parse_page(self, url):
        """Fetch one list page and hand each detail-page link to parse_url."""
        res = requests.get(url, headers=self.HEADERS, timeout=10).content.decode()
        html = etree.HTML(res)
        anchors = html.xpath('//*[@id="pic-detail"]/div/div[2]/div[2]/ul/li/div/div/a')
        for anchor in anchors:
            # Anchors containing two <img> children are skipped (same rule as
            # the original `if len(imgs) == 2: pass` branch).
            if len(anchor.xpath('.//img')) == 2:
                continue
            hrefs = anchor.xpath('.//@href')
            if hrefs:
                # Index the result list directly; the old str()/strip("[]'")
                # round-trip broke on any quote character inside the URL.
                self.parse_url(hrefs[0])

    def parse_url(self, src3):
        """Fetch one detail page and queue its image URL and title.

        :param src3: absolute URL of the meme detail page.
        """
        res = requests.get(url=src3, headers=self.HEADERS, timeout=10).content.decode()
        # NOTE: the original also dumped every response to a shared
        # '表情1.txt' opened with mode 'w' — a data race across 5 threads
        # that clobbered itself on every request; removed.
        html = etree.HTML(res)
        # Image link ('@ src' normalized to '@src').
        srcs = html.xpath('//*[@id="detail"]/div/div[2]/li/div[3]/div/div/div/div[1]/table/tbody/tr[1]/td/img/@src')
        # Image title.
        titles = html.xpath('//*[@id="detail"]/div/div[2]/li/div[2]/h1/a/text()')
        if srcs and titles:
            # Hand (link, title) to the downloader threads via the queue.
            self.img_queue.put((srcs[0], titles[0]))
class Parse_url(threading.Thread):
    """Consumer thread.

    Pulls ``(image_url, title)`` pairs off ``img_queue`` and saves each
    image into the '斗图啦' directory.
    """

    # Shared request headers; a browser UA avoids the site's basic bot filter.
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
    }

    # Characters that are unsafe in filenames on common filesystems.
    _UNSAFE_CHARS = re.compile(r'[\\/\?\*:"<>\|\[\]!]')

    def __init__(self, page_queue, img_queue):
        super().__init__()  # Thread subclasses must run the parent initializer first
        self.page_queue = page_queue
        self.img_queue = img_queue

    def run(self):
        """Consume queued images; exit once both queues are truly drained."""
        while True:
            try:
                # A short timed get replaces the racy empty()-then-blocking-get():
                # the original could hang forever (or exit while producers were
                # still filling the queue) if the state changed between the
                # check and the get.
                hres, title = self.img_queue.get(timeout=1)
            except Empty:
                if self.page_queue.empty():
                    # No queued images and no list pages left to produce more.
                    break
                continue
            self.parse_hres(hres, title)

    def parse_hres(self, hres, title):
        """Download one image and write it to disk.

        :param hres: direct URL of the image file.
        :param title: page title used to build the saved filename.
        """
        res = requests.get(url=hres, headers=self.HEADERS, timeout=10).content
        wenjian = '斗图啦'
        # exist_ok avoids the exists()/mkdir() race between worker threads.
        os.makedirs(wenjian, exist_ok=True)
        # Clean the title into a usable filename. The original pattern
        # "A-Za-z0-9[...]" left the range OUTSIDE the character class, so it
        # matched almost nothing and unsafe characters leaked into filenames.
        title1 = self._UNSAFE_CHARS.sub('', title)[0:10]
        # Keep the real extension (the original forced '.jpg' even for GIFs).
        ext = os.path.splitext(hres)[1] or '.jpg'
        filename = title1 + ext
        file_path = os.path.join(wenjian, filename)
        with open(file_path, 'wb') as fp:
            fp.write(res)
def main():
    """Seed 10 list-page URLs, then run 5 crawler and 5 downloader threads.

    Joins every worker so main() only returns once all pages have been
    crawled and all queued images saved.
    """
    page_queue = Queue(10)
    img_queue = Queue(1000)
    for page in range(1, 11):
        page_queue.put('http://www.doutula.com/photo/list/?page=%d' % page)

    workers = []
    # 5 threads scrape image URLs and titles from the list pages.
    for _ in range(5):
        t = Crawl_url(page_queue, img_queue)
        t.start()
        workers.append(t)
    # 5 threads download and save the images.
    for _ in range(5):
        t = Parse_url(page_queue, img_queue)
        t.start()
        workers.append(t)

    # The original never joined its threads; wait here so the program's end
    # is well-defined instead of relying on non-daemon threads to keep the
    # process alive.
    for t in workers:
        t.join()
if __name__ == '__main__':
    main()
# Crawl static meme images with multiple threads (使用多线程爬取静态表情图)
# (Article banner from the source page, retained as a comment: latest recommended article published 2024-04-19 10:19:46)