import requests
import os
import re
from bs4 import BeautifulSoup
def f(url_data):
    """Return the directory part of *url_data*: everything up to and
    including the last '/'.

    'a/b/c.html' -> 'a/b/'; a string with no '/' -> ''.

    The original built the result with quadratic `s += part + '/'`
    concatenation over `range(len(...) - 1)`; a single join is the
    idiomatic, linear-time equivalent.
    """
    parts = url_data.split("/")
    # Appending '' makes join emit the trailing '/' (and yields '' when
    # there was no '/' at all), matching the loop's behavior exactly.
    return '/'.join(parts[:-1] + [''])
# Browser-like User-Agent so the site serves pages to this script
# instead of rejecting it as a bot. Shared by every request below.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
}
def getsum(url, name):
    """Read the gallery's total page count from its pager, then delegate
    to getAll() to collect and download every page.

    url:  first page of a gallery.
    name: gallery title (used for the download folder).
    """
    response = requests.get(url=url, headers=headers)
    response.encoding = 'gb2312'
    page = BeautifulSoup(response.text, 'lxml')
    # The pager text (e.g. "1/30") sits in the second child of this div.
    pager_children = list(page.find('div', class_='wzfz tu-tit fix'))
    matches = re.findall(r'.*?\d/(\d{1,3})', pager_children[1].text, re.S)
    total_pages = int(matches[0])
    getAll(total_pages, url, name)
def getAll(img_sum, url, name):
    """Collect the URL of every page in a gallery by following its
    '.next' pager link img_sum - 1 times, then download them via down().

    img_sum: total page count reported by the pager (from getsum).
    url:     URL of the gallery's first page.
    name:    gallery title, used as the folder/file name.

    Fixes: removed the unused loop-local `index_url`, and renamed the
    result of select('.next') so it no longer shadows the builtin `next`.
    """
    page_all = []
    for _ in range(img_sum - 1):
        base = f(url)  # directory part of the current page's URL
        page_text = requests.get(url=url, headers=headers)
        page_text.encoding = 'gb2312'
        soup = BeautifulSoup(page_text.text, 'lxml')
        next_link = soup.select('.next')
        page_all.append(url)
        # '.next' hrefs are relative, so rejoin against the directory.
        url = base + next_link[0]['href']
    down(page_all, name)
def down_img(url, name, n):
    """Download one image and save it as ./壁纸/<name>/<name><n>.jpg.

    url:  direct image URL.
    name: gallery title (folder and filename prefix).
    n:    1-based index used in the filename.

    Fixes: the original assigned the same `path` in every branch of a
    redundant exists/mkdir ladder — os.makedirs(exist_ok=True) creates
    both directory levels in one call; the file handle is named
    `img_file` so it no longer shadows the module-level function `f`.
    """
    content = requests.get(url=url, headers=headers).content
    path = f'./壁纸/{name}'
    os.makedirs(path, exist_ok=True)
    with open(f'{path}/{name}{n}.jpg', 'wb') as img_file:
        img_file.write(content)
    print(f'{name}{n}.jpg 爬取成功!')
def down(page_all, name):
    """Fetch each collected gallery page, extract its image URL from the
    '.photo img' element, and download it.

    page_all: list of gallery page URLs (from getAll).
    name:     gallery title, forwarded to down_img.

    Fix: the manual `n = 0; n += 1` counter is replaced by
    enumerate(start=1), preserving the 1-based numbering.
    """
    for n, page_url in enumerate(page_all, start=1):
        page = requests.get(url=page_url, headers=headers)
        page.encoding = 'gb2312'
        soup = BeautifulSoup(page.text, 'lxml')
        img_url = soup.select('.photo img')[0]['src']
        down_img(img_url, name, n)
def main():
    """Crawl the jj20.com front page and download every gallery linked
    under '.picbz>li>a'.

    Fix: the original collected href/title into two parallel lists and
    re-walked them with range(len(...)); each anchor already carries
    both values, so one pass over the anchors is equivalent and simpler.
    """
    url = 'http://www.jj20.com'
    page_text = requests.get(url=url, headers=headers)
    page_text.encoding = 'gb2312'
    soup = BeautifulSoup(page_text.text, 'lxml')
    for anchor in soup.select('.picbz>li>a'):
        # hrefs are site-relative; titles become the folder names.
        getsum(url + anchor['href'], anchor['title'])
# Entry point: crawl only when executed directly, not on import.
if __name__ == '__main__':
    main()
更新：以下为重写后的第二版爬虫代码（改为抓取指定列表页，并在下载时逐页跟随“下一页”链接）。
import requests
import os
import lxml
from bs4 import BeautifulSoup
def f(url_data):
    """Return the directory part of *url_data*: everything up to and
    including the last '/'.

    'a/b/c.html' -> 'a/b/'; a string with no '/' -> ''.

    The original built the result with quadratic `s += part + '/'`
    concatenation over `range(len(...) - 1)`; a single join is the
    idiomatic, linear-time equivalent.
    """
    parts = url_data.split("/")
    # Appending '' makes join emit the trailing '/' (and yields '' when
    # there was no '/' at all), matching the loop's behavior exactly.
    return '/'.join(parts[:-1] + [''])
# Browser-like User-Agent so the site serves pages to this script
# instead of rejecting it as a bot. Shared by every request below.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
}
def getAll(url, name):
    """Seed the download with the gallery's first page, then hand off to
    down(), which follows the '.next' links itself via getnext().

    url:  URL of the gallery's first page.
    name: gallery title, used for the download folder.

    NOTE(review): unlike the first version there is no loop here — only
    the starting URL is appended, and the computed next-page URL is never
    read afterwards. Pagination happens in down(); the dead fetch below
    is kept to preserve the original's request behavior.

    Fixes: removed the unused `index_url`, renamed the select('.next')
    result so it no longer shadows builtin `next`, and replaced the bare
    `except: print(1)` with a targeted handler that reports the error.
    """
    page_all = []
    try:
        base = f(url)
        page_text = requests.get(url=url, headers=headers)
        page_text.encoding = 'gb2312'
        soup = BeautifulSoup(page_text.text, 'lxml')
        next_link = soup.select('.next')
        page_all.append(url)
        url = base + next_link[0]['href']
    except Exception as exc:
        print(f'getAll failed for {url}: {exc}')
    down(page_all, name)
def down_img(url, name, n):
    """Download one image (given a protocol-relative URL, e.g.
    '//img...jpg') and save it as ./壁纸/<name>/<name><n>.jpg.

    Fixes: the two exists/mkdir pairs collapse into one
    os.makedirs(exist_ok=True) call, and the file handle is named
    `img_file` so it no longer shadows the module-level function `f`.
    """
    content = requests.get(url=f"http:{url}", headers=headers).content
    path = f'./壁纸/{name}'
    os.makedirs(path, exist_ok=True)
    with open(f'{path}/{name}{n}.jpg', 'wb') as img_file:
        img_file.write(content)
    print(f'{name}{n}.jpg 爬取成功!')
def geturl(url):
    """Return the 'src' of the main photo ('.photo>a>img') on *url*."""
    response = requests.get(url=url, headers=headers)
    response.encoding = 'gb2312'
    page = BeautifulSoup(response.text, 'lxml')
    photo = page.select(".photo>a>img")[0]
    return photo['src']
def down(page_all, name):
    """For each starting page, keep following '.next' links and
    downloading the image on every page until the chain ends.

    The loop terminates when getnext()/geturl() raises (typically
    IndexError once a page has no '.next' element) or a request fails.

    NOTE(review): the category path 'bz/nxxz/shxz' is hard-coded, so
    this only walks galleries in that section — confirm against the
    list URL used in main().

    Fix: the bare `except:` is narrowed to `except Exception` so
    KeyboardInterrupt/SystemExit are no longer swallowed; the
    stop-on-error behavior is otherwise unchanged.
    """
    for start_url in page_all:
        n = 0
        current = start_url
        while True:
            try:
                n += 1
                if n >= 2:
                    # Follow the relative next-page href from the page
                    # we downloaded on the previous iteration.
                    next_href = getnext(current)
                    current = f"http://www.jj20.com/bz/nxxz/shxz/{next_href}"
                    down_img(geturl(current), name, n)
                else:
                    # First page: download directly from the seed URL.
                    down_img(geturl(start_url), name, n)
            except Exception:
                break
def getnext(url):
    """Return the relative href of the '.next' pager link on *url*.

    Raises IndexError when the page has no '.next' element; down()
    relies on that to end its per-gallery loop.

    Fix: the select('.next') result is named `next_links` so it no
    longer shadows the builtin `next`.
    """
    page_text = requests.get(url=url, headers=headers)
    page_text.encoding = 'gb2312'
    soup = BeautifulSoup(page_text.text, 'lxml')
    next_links = soup.select(".next")
    return next_links[0]['href']
def main():
    """Crawl one jj20.com list page and download every gallery on it.

    Bug fix: the original de-duplicated only the URL list (tt_url_1)
    while indexing the untouched name list with the same i, so after
    any dropped duplicate tt_url_1[i] and tt_url_name[i] referred to
    different galleries. URLs and titles are now paired before
    de-duplication, which keeps each URL with its own title.
    """
    url = 'http://www.jj20.com/bz/nxxz/list_7_cc_14.html'
    page_text = requests.get(url=url, headers=headers)
    page_text.encoding = 'gb2312'
    soup = BeautifulSoup(page_text.text, 'lxml')
    anchors = soup.select('.picbz>li>a')
    thumbs = soup.select('.picbz>li>a>img')
    # dict keeps first occurrence and preserves insertion order,
    # matching the original's "first seen wins" de-duplication.
    galleries = {}
    for anchor, thumb in zip(anchors, thumbs):
        full_url = f"http://www.jj20.com/{anchor['href']}"
        if full_url not in galleries:
            galleries[full_url] = thumb['alt']
    for gallery_url, gallery_name in galleries.items():
        getAll(gallery_url, gallery_name)
# Entry point: crawl only when executed directly, not on import.
if __name__ == '__main__':
    main()