import requests
from lxml import etree
from queue import Queue
from threading import Thread
# Threaded image crawler (demo script).
# HTTP request headers sent with every page fetch so the server treats us
# as a regular browser.  The original User-Agent string was malformed
# ("Mozilla /5.0(WindowsNT10.0;..." — stray/missing spaces), which some
# servers reject; this is the correctly formatted Firefox 73 UA.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:73.0) Gecko/20100101 Firefox/73.0'}
# Fetch a page URL and
# parse it for image links.
def Url_xpath(url):
    """Download *url* and return the list of image addresses found on it.

    Tries the regular ``//img/@src`` attribute first; some pages lazy-load
    their images and keep the real address in ``src2``, which is used as a
    fallback when no ``src`` links are found.

    :param url: page address to scrape
    :return: list of image URL strings (possibly empty)
    """
    # timeout keeps a dead/slow server from hanging a worker thread forever
    text = requests.get(url=url, headers=headers, timeout=10).text
    # parse the HTML and extract the image addresses
    html = etree.HTML(text)
    links = html.xpath('//img/@src')
    if not links:  # lazy-loaded pages store the address in src2 instead
        links = html.xpath('//img/@src2')
    return links
# with open('./ptoto/%d.jpg'%n,mode='wb') as fp:  # open the file (old numbered-name scheme, kept for reference)
# Images are stored under the current working directory.
# NOTE(review): this module-level n is never read before __main__ rebinds it
# with int(input(...)) — looks dead; confirm before removing.
n = 0
def Storage():
    """Worker-thread loop: pull page URLs from ``link_queue``, save their images.

    Runs until it receives the ``None`` sentinel.  For each page it collects
    the image links via :func:`Url_xpath` and writes every downloadable image
    into ``./test/test/`` under the file's original basename.
    """
    while True:
        # take one message (page URL) off the queue
        url = link_queue.get()
        if url is None:  # sentinel: shut this worker down
            break
        try:
            links = Url_xpath(url)
            for link in links:
                # accept both plain and TLS links — the original
                # startswith('http:') silently skipped every https image
                if link.startswith(('http:', 'https:')):
                    content = requests.get(link, headers=headers, timeout=10).content
                    name = link.split('/')[-1]
                    with open('./test/test/%s' % name, mode='wb') as fp:
                        fp.write(content)
                    print('============{:s}=================='.format(name))
        finally:
            # always acknowledge the item, even when the download raised —
            # otherwise link_queue.join() in __main__ deadlocks forever
            link_queue.task_done()
if __name__ == '__main__':
    # One constant for both the thread count and the sentinel count — the
    # original hard-coded 10 twice, and those two numbers MUST stay equal
    # or some workers never receive their shutdown sentinel.
    NUM_WORKERS = 10

    link_queue = Queue()
    workers = []
    n = int(input('please input number:'))
    if n <= 2:
        print('输入错误')
    else:
        # Page 1 lives at the site root; later pages use index_<i>.html.
        # NOTE(review): range(2, n-2) stops at page n-3, so n=3 or n=4 yields
        # only the homepage despite passing the n<=2 check — looks like an
        # off-by-two; confirm whether range(2, n+1) was intended.
        url1 = ['http://sc.chinaz.com/', ] + \
               ['http://sc.chinaz.com/tupian/index_{:d}.html'.format(i) for i in range(2, n - 2)]
        for url in url1:
            link_queue.put(url)
        print('====================================')
        # start the download workers
        for _ in range(NUM_WORKERS):
            t = Thread(target=Storage)
            t.start()
            workers.append(t)
        print('====================================')
        # block until every queued page has been task_done()'d
        link_queue.join()
        # one sentinel per worker so every Storage() loop exits
        for _ in range(NUM_WORKERS):
            link_queue.put(None)
        print('====================================')
        for t in workers:
            t.join()
        print('=================结束===================')