斗图网斗图全站爬取（用正则表达式re）

最新推荐文章于 2021-08-13 21:12:04 发布

星星火_

最新推荐文章于 2021-08-13 21:12:04 发布

阅读量677

点赞数

文章标签： re 正则表达式 Python爬虫斗图

本文链接：https://blog.csdn.net/qq_42276808/article/details/84432061

版权

import re
import requests
import os

class doutu_spyder():
    """Crawl a listing page for meme albums and save each album's images.

    Typical use: call ``get_first_url()`` with a listing page URL to collect
    the album titles and album page URLs, then ``download_img()`` to fetch
    every image of every collected album into ``base_dir/<album title>/``.
    """

    # Album page URLs and album titles found by the last get_first_url() call.
    first_url = []
    first_name = []
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}

    # Root folder that receives one sub-folder per album.
    base_dir = 'D:/img'
    # Seconds to wait on the server; without it a stalled connection would
    # block the crawl forever.
    timeout = 10

    # Compiled once at class creation instead of on every call.
    _album_name_re = re.compile(r'<div class="thumbnail".*?<a .*?rel="bookmark" target="_blank" title="(.*?)[ \[]', re.S)
    _album_url_re = re.compile(r'<div class="thumbnail".*?<a href="(.*?)"', re.S)
    _image_url_re = re.compile(r'<img title=.*?src="(.*?)"', re.S)
    # Characters that Windows does not allow in a file or folder name.
    _bad_path_chars_re = re.compile(r'[\\/:*?"<>|]')

    def _album_dir(self, name):
        """Return the folder path used for the album titled *name*."""
        # Album titles come straight from the page, so replace anything that
        # cannot appear in a folder name; fall back to '_' for an empty title.
        safe_name = self._bad_path_chars_re.sub('_', name).strip() or '_'
        return os.path.join(self.base_dir, safe_name)

    def open_url(self, url):
        """Fetch *url* and return the response body decoded as text.

        Network errors raised by ``requests`` and decoding errors propagate
        to the caller.
        """
        response = requests.get(url, headers=self.headers, timeout=self.timeout)
        return response.content.decode()

    def get_first_url(self, url):
        """Collect album titles and album page URLs from the listing at *url*.

        Stores the results in ``self.first_name`` and ``self.first_url``,
        prints both lists, and creates one folder per album title.
        """
        html = self.open_url(url)
        self.first_name = self._album_name_re.findall(html)
        self.first_url = self._album_url_re.findall(html)
        print(self.first_name)
        print(self.first_url)
        for name in self.first_name:
            # makedirs also creates base_dir itself when it is missing, and
            # exist_ok makes a repeated crawl of the same album harmless.
            os.makedirs(self._album_dir(name), exist_ok=True)

    def download_img(self):
        """Download every image of every album collected by get_first_url().

        Images are saved as ``斗图_<n>.gif`` inside the album's folder, with
        ``n`` counting from 1 within each album. An image whose download
        fails is reported and skipped; a failure to load an album page
        propagates to the caller.
        """
        # zip pairs each album page with its title and stops at the shorter
        # list, so a mismatch between the two regex results cannot raise
        # IndexError.
        for album_url, album_name in zip(self.first_url, self.first_name):
            html = self.open_url(album_url)
            image_urls = self._image_url_re.findall(html)
            print(album_url)
            print(album_name)
            folder = self._album_dir(album_name)
            os.makedirs(folder, exist_ok=True)
            for number, image_url in enumerate(image_urls, start=1):
                print(image_url)
                image_name = '斗图_' + str(number) + '.gif'
                print(image_name)
                try:
                    img = requests.get(image_url, headers=self.headers, timeout=self.timeout)
                except requests.RequestException as err:
                    print('skipped %s: %s' % (image_url, err))
                    continue
                # 'wb' rather than 'ab': appending on a re-run would glue a
                # second copy of the bytes onto the file and corrupt it.
                with open(os.path.join(folder, image_name), 'wb') as f:
                    f.write(img.content)

# def download_(self,path,name):

if __name__ == '__main__':
    # Crawl the front page on its own first.
    spyder = doutu_spyder()
    spyder.get_first_url(url='http://www.bbsnet.com/')
    spyder.download_img()
    spyder.first_name = []
    spyder.first_url = []

    # Then crawl the numbered listing pages, starting at page 2.
    # NOTE(review): the original loop header gave no upper bound, so the
    # value below is a placeholder -- set it to the last page to crawl.
    last_page = 2
    for a in range(2, last_page + 1):
        url = 'http://www.bbsnet.com/page/' + str(a)
        spyder.get_first_url(url)
        spyder.download_img()
        # Clear the collected albums before moving on to the next page.
        spyder.first_name = []
        spyder.first_url = []

星星火_

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
斗图网斗图全站爬取（用正则表达式re）

import re import requests import os class doutu_spyder(): first_url=[] first_name=[] headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) ...
复制链接

扫一扫