# Packages used:
# - beautifulsoup4
# - urllib3
# - threading
# - os
# - time
# Target-site notes:
# Currently the crawl depth is set by a configured number of index pages;
# within a single photo set, "next page" links are followed automatically.
# TODO: also auto-paginate the model/tag index pages (see the
# commented-out code in Geturl).
# -*- coding: utf-8 -*-
"""
Created on Thu Sep 23 14:40:18 2021
@author: ZhaZhiCheng
"""
from bs4 import BeautifulSoup
from urllib3 import PoolManager
from time import sleep
import os
import threading as th
def GetPic(src, DirName, count):
    """Download one image and save it as ``<count>.jpg`` inside *DirName*.

    Parameters
    ----------
    src : str
        Absolute URL of the image to fetch.
    DirName : str
        Existing directory the image is written into.
    count : int
        Sequence number used as the file name.
    """
    http = PoolManager()
    # 'GET' is the canonical HTTP method name; urllib3 sends it verbatim.
    pic = http.request('GET', src)
    # os.path.join is portable, unlike manual '/' concatenation.
    target = os.path.join(DirName, str(count) + '.jpg')
    with open(target, 'wb') as f:
        f.write(pic.data)
def CrawPhoto(url):
    """Crawl every image of a single photo set, following its pagination.

    Creates a directory named after the page title, then walks the set's
    "next page" chain, downloading each page's images with one thread per
    image.

    Parameters
    ----------
    url : str
        URL of the photo set's first page.

    Returns
    -------
    0 when the output directory cannot be created; otherwise None.
    """
    http = PoolManager()
    CurrentUrl = url
    File = http.request('GET', CurrentUrl)
    html = BeautifulSoup(File.data, 'html.parser')
    Header = html.find(class_='weizhi').find('h1').text
    # Strip characters that are unsafe in a directory name.
    DirName = ''.join(Header.split(' ')).replace(r'@', '').replace(r'&', '').replace(r'/', '')
    try:
        # os.makedirs replaces os.system('mkdir ' + DirName): the old call
        # passed page-derived text straight to a shell (injection risk),
        # was non-portable, and never raised, so its except clause was dead.
        os.makedirs(DirName, exist_ok=True)
        print(DirName)
    except OSError:
        print('error')
        return 0
    NextPage = url
    CurrentUrl = ' '  # sentinel so the loop body runs at least once
    count = 0
    while CurrentUrl != NextPage:
        CurrentUrl = NextPage
        File = http.request('GET', CurrentUrl)
        html = BeautifulSoup(File.data, 'html.parser')
        PicSrc = html.find(class_='content').find_all(class_='tupian_img')
        # One download thread per image on this page.
        Thread_List = []
        for i in PicSrc:
            src = i['src']
            NewThread = th.Thread(target=GetPic, args=[src, DirName, count])
            NewThread.start()
            Thread_List.append(NewThread)
            count = count + 1
        for j in Thread_List:
            j.join()
        try:
            # 'string=' is the current bs4 spelling of the deprecated 'text='.
            # TypeError when no such link exists (find returns None),
            # KeyError when the tag lacks an href.
            NextPage = html.find(class_='a1', string='下一页')['href']
        except (TypeError, KeyError):
            break  # last page of the set
        sleep(0.1)  # be polite between page fetches
def Geturl(domain, url):
    """Crawl every photo set linked from one model/tag index page.

    Parameters
    ----------
    domain : str
        Site root. Currently unused — kept for interface compatibility;
        it was intended for auto-paginating the index pages (the removed
        commented-out code), which is still a TODO.
    url : str
        URL of the index page to scan.
    """
    PM = PoolManager()
    File = PM.request('GET', url)
    Html = BeautifulSoup(File.data, 'html.parser')
    PhotoSeries = Html.find(class_='hezi').find_all('li')
    for entry in PhotoSeries:
        # Each <li> wraps a link to one photo set.
        CrawPhoto(entry.find('a')['href'])
# Entry point: crawl the first index page, then any further index pages.
url = r'https://www.tujigu.net/t/5789/'
domain = r'https://www.tujigu.net'

# The first index page lives at the bare URL (no /index_N.html suffix).
Geturl(domain, url)

pages = 1  # set according to the target's number of index pages
# Remaining index pages are at <url>/index_<i>.html for i = 1 .. pages-1.
for page in range(1, pages):
    Geturl(domain, url + r'/index_' + str(page) + r'.html')