Use Python to crawl the 小草 site: given a set of keywords, download the matching images to local disk, sorted into folders and named by keyword category.
The approach: first scrape every thread title in the target board together with its page path, then filter the titles by keyword. For each title that passes the filter, follow its path and download the images on that page.
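All of the methods below belong to one crawler class, referenced later through the instance name Spyder. The post never shows the class itself, so here is a minimal sketch of what it must look like, together with the imports the code depends on; the class name ImgSpyder and both paths are illustrative assumptions, while the attribute names src, newimg_src, and info_list are taken from the method bodies.

import os
import shutil
import time

import requests
from bs4 import BeautifulSoup

class ImgSpyder:
    def __init__(self):
        self.src = 'D:\\crawl\\'        # download root; placeholder assumption
        self.newimg_src = 'D:\\new\\'   # backup root for new files; placeholder assumption
        self.info_list = []             # local paths of files downloaded in this run

Spyder = ImgSpyder()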
def download_img(self, img_url, src, fileName):
    """Download one image to src + fileName, skipping files that already exist."""
    try:
        if not os.path.exists(src):
            os.makedirs(src)
        if not os.path.exists(src + fileName):
            header = {'user-agent': 'Mozilla/5.0'}
            r = requests.get(img_url, headers=header, stream=True)
            if r.status_code == 200:
                open(src + fileName, 'wb').write(r.content)
                print(src + fileName + ' downloaded')
                # Remember this run's new files for the backup step (bk_newImg).
                self.info_list.append(src + fileName)
            del r
    except Exception as e:
        print('Download error: ' + str(e))
        print('url: ' + img_url + ', file: ' + src + fileName)
def getHTMLText(self, url):
    """Fetch a page and return its text, or an empty string on failure."""
    try:
        kv = {'user-agent': 'Mozilla/5.0'}
        r = requests.get(url, timeout=30, headers=kv)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except Exception as e:
        print(url + ' page fetch failed: ' + str(e))
        return ''
def parseHTML(self, html):
    """Collect [title, url] pairs for every thread row whose title contains 'P]'."""
    ls = []
    try:
        soup = BeautifulSoup(html, 'html.parser')
        alltr = soup.find_all('tr', 'tr3 t_one tac')
        for tr in alltr:
            h3 = tr.find('h3')
            pageUrl = h3.find('a').get('href')
            name = h3.string.strip()
            if 'P]' in name:
                ls.append([name, pageUrl])
    except Exception as e:
        print('Page parse failed: ' + str(e))
    # Return the list even on error, so callers always get something iterable.
    return ls
def formatHTMLInfo(self, html, src):
    """Parse a thread page and download every image in its content div to src."""
    try:
        soup = BeautifulSoup(html, 'html.parser')
        div = soup.find('div', 'tpc_content do_not_catch')
        imgs = div.find_all('img')
        if not os.path.exists(src):
            os.makedirs(src)
        for i, img in enumerate(imgs, start=1):
            # The site lazy-loads images; the real URL sits in the 'ess-data' attribute.
            tail = os.path.splitext(img.get('ess-data'))[-1]
            # Zero-pad the index to three digits so files sort in page order.
            idx = str(i).zfill(3)
            self.download_img(img.get('ess-data'), src + '\\', idx + tail)
    except Exception as e:
        print('Page parse failed: ' + str(e))
# Walk the first `depth` pages of the board, filter titles by keyword, and
# download matching threads. `basic_url` is the board URL up to its page
# parameter; `Spyder` is the crawler instance.
depth = 100
keywords = {}
keywords['颜色'] = ['红', '橙', '黄', '绿']
keywords['学生'] = ['小学', '初中', '高中', '大学', '学生']
for i in range(depth):
    url = basic_url + str(i + 1)
    html = Spyder.getHTMLText(url)
    temp_list = Spyder.parseHTML(html)
    prefix = 'https://分享你我光圈下的最美'
    for page in temp_list:
        for key in keywords:
            for v in keywords[key]:
                if v in page[0]:
                    # Visit each matching thread twice: download_img skips files
                    # that already exist, so the second pass retries any failures.
                    flag = 2
                    while flag > 0:
                        InfoUrl = prefix + page[1]
                        print('page: ' + str(i) + ', category: ' + v + ', ' + page[0] + InfoUrl)
                        htmlInfo = Spyder.getHTMLText(InfoUrl)
                        Spyder.formatHTMLInfo(htmlInfo, Spyder.src + key + '\\' + page[0])
                        time.sleep(3)
                        flag -= 1
Save a fetched page to a local file:
def writeHTML(path, txt):
    with open(path, 'w+', encoding='utf-8') as f:
        f.write(txt)
Read a saved page back as text (one write, one read: handy for analyzing the target pages offline before writing the parser):
def readHTML(path):
    with open(path, 'r', encoding='utf-8') as f:
        return f.read()
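These two helpers pair up during early analysis: fetch a page once, save it, then iterate on the parser against the local copy instead of hammering the site. A short sketch, with 'page1.html' as a hypothetical file name:

html = Spyder.getHTMLText(basic_url + '1')
writeHTML('page1.html', html)            # one write...
offline = readHTML('page1.html')         # ...one read while tuning the parser
print(Spyder.parseHTML(offline)[:3])     # inspect the first few [title, url] pairs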
Newly downloaded files are collected in a list, and from that list each new file is automatically copied to a target path.
For example, crawling once a day: everything fetched on day one is stored under a. On day two, files already in a are skipped, but a newly posted thread b gets downloaded, and each of b's local paths is appended to info_list. When the batch finishes, every file in info_list is copied to the to_url path.
def bk_newImg(self):
    """Copy every image downloaded this run (recorded in info_list) to the backup folder."""
    for img_url in self.info_list:
        # Rebase the path from the download root (self.src) onto the backup root.
        to_url = self.newimg_src + img_url[len(self.src):]
        to_parUrl = os.path.dirname(to_url)
        if not os.path.exists(to_parUrl):
            os.makedirs(to_parUrl)
        shutil.copyfile(img_url, to_url)
        print('New image copied: ' + to_url)
    self.info_list.clear()
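Called once after the crawl loop finishes, bk_newImg turns each run into an incremental delivery: the archive under self.src keeps everything, while the backup folder receives only the files that were new this run. A minimal sketch:

# after the keyword crawl loop above completes:
Spyder.bk_newImg()
# info_list is cleared, so tomorrow's run starts with an empty delta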