import os
import requests_html
import threading
from lxml import etree
class Spider:
    """Scrape wallpaper images from pic.netbian.com listing pages.

    Listing pages 11 through 50 are fetched one after another; every page is
    then handed to its own worker thread, which follows each thumbnail link
    to the detail page and saves the full-size image under
    ``images/彼岸图网/<page>/<n>.jpg``.
    """

    def __init__(self):
        # Listing-page URL template; ``{}`` is replaced by the page number.
        self.url = 'https://pic.netbian.com/index_{}.html'
        # One HTTP session reused for every request made by this spider.
        self.session = requests_html.HTMLSession()
        # NOTE(review): only an "http" entry is configured while every URL
        # requested here is https://, so this proxy is presumably never
        # applied -- confirm whether an "https" entry was intended.
        self.proxies = {
            "http": "203.78.142.125:8088"
        }

    def _absolute_url(self, path):
        """Return the site-absolute URL for a root-relative ``path``."""
        return self.url.replace('/index_{}.html', path)

    def get_html(self):
        """Fetch listing pages 11-50 and download each page's images.

        Every listing page is parsed in its own thread.  All workers are
        started before any of them is joined, so the downloads of different
        pages overlap instead of running strictly one after another.
        """
        workers = []
        for page in range(11, 51):
            url = self.url.format(page)
            response = self.session.get(url=url, proxies=self.proxies)
            # NOTE(review): the workers share ``self.session``; confirm that
            # concurrent use of one session is acceptable for this scraper.
            worker = threading.Thread(
                target=self.parse, args=(response.text, page)
            )
            worker.start()
            workers.append(worker)

        # Wait for every page to finish before returning to the caller.
        for worker in workers:
            worker.join()

    def parse(self, response, page):
        """Download every image referenced by one listing page.

        Args:
            response: HTML text of the listing page.
            page: Listing page number; used for the output directory name
                and for progress messages.
        """
        tree = etree.HTML(response)
        if tree is None:
            # Nothing parseable (e.g. an empty body): no images to fetch.
            return
        lis = tree.xpath('//*[@id="main"]/div[3]/ul/li')
        if not lis:
            return

        path = 'images/彼岸图网/' + str(page) + '/'
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(path, exist_ok=True)

        for count, li in enumerate(lis, start=1):
            hrefs = li.xpath('./a/@href')
            if not hrefs:
                print(f'第{page}页,第{count}张图片解析失败,已跳过')
                continue
            # URL of the detail page for this thumbnail.
            img_url = self._absolute_url(hrefs[0])
            response1 = self.session.get(url=img_url, proxies=self.proxies)
            tree1 = etree.HTML(response1.text)
            srcs = tree1.xpath('//*[@id="img"]/img/@src') if tree1 is not None else []
            if not srcs:
                print(f'第{page}页,第{count}张图片解析失败,已跳过')
                continue
            # URL the image itself is downloaded from.
            src1 = self._absolute_url(srcs[0])

            print(f'开始下载第{page}页,第{count}张图片')
            # Fetch before opening the file so a failed request does not
            # leave an empty .jpg behind.
            content = self.session.get(url=src1, proxies=self.proxies).content
            with open(path + str(count) + '.jpg', 'wb') as f:
                f.write(content)
            print(f'第{page}页,第{count}张图片下载完成')
if __name__ == '__main__':
    # Run the scraper only when this file is executed as a script.
    Spider().get_html()