A Python image download script for Baidu Tieba

Function: downloads the images posted in Baidu Tieba threads
Python version: Python 2.7
Libraries used: urllib, requests

Core idea

The urllib library is used to crawl each tieba page, pull the image links out of the HTML, and download them; requests is used to get the HTTP status code of the page currently being visited:

urllib.urlopen(url).read()
urllib.urlretrieve(pictures,Path_img)
requests.get(url).status_code
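
Put together, a minimal sketch of the idea looks like this (the thread URL, image URL, and file name below are made-up placeholders, not real links):

# minimal sketch: check a page, fetch its HTML, and save one image (Python 2)
import urllib
import requests

url = 'https://tieba.baidu.com/p/1234567890'  # hypothetical thread URL
if requests.get(url).status_code == 200:      # only continue if the page is reachable
	html = urllib.urlopen(url).read()         # raw HTML of the page
	# ... extract an image link from html with a regex, e.g.:
	img = 'https://imgsa.baidu.com/forum/pic/item/abc.jpg'  # hypothetical image link
	urllib.urlretrieve(img, 'abc.jpg')        # download it to the current directory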

The idea is simple, so let's go straight to the code.

Code

#!/usr/bin/python
# -*- coding: utf-8 -*-
__author__ = "Man_ge"

import urllib
import requests
import time,re,os,sys,random
import datetime

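# Python 2 only: force the default string encoding to utf-8 so that mixed
# str/unicode concatenation (e.g. building paths from page titles) does not
# raise UnicodeDecodeError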
reload(sys)
sys.setdefaultencoding('utf-8')

#path to save the downloaded images
LOCAL_PATH = "C:\\Users\\Administrator\\Desktop\\meinv4\\"

#basic function
class TB_get:
	def __init__(self):
		pass

	#fetch the raw HTML of a page
	def get_html(self,url):
		page = urllib.urlopen(url).read()
		return page

	#get the HTTP status code of a url
	def get_state(self,url):
		code=requests.get(url).status_code
		return code

	#get the page <title>
	def get_title(self,url):
		reg = r'<title>(.*?)</title>'
		reger = re.compile(reg)
		data = re.findall(reger, urllib.urlopen(url).read())
		return data[0].decode('UTF-8').encode('GBK')

	#get the reply count of a thread
	def get_Replypost(self,url):
		reg = r'l_reply_num.*?</li>'
		reger = re.compile(reg)
		data = re.findall(reger, urllib.urlopen(url).read())
		info = re.compile(r'<span .*?>(.*?)</span>')
		info_data = re.findall(info, str(data))
		return int(info_data[0])

	#get the number of pages of a thread
	def get_pagenumber(self,url):
		reg = r'l_reply_num.*?</li>'
		reger = re.compile(reg)
		data = re.findall(reger, urllib.urlopen(url).read())
		info = re.compile(r'<span .*?>(.*?)</span>')
		info_data = re.findall(info, str(data))
		return int(info_data[1])
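
	# Note: the two methods above scrape the l_reply_num <li> block in the thread
	# header; the regexes assume the first <span> inside it holds the reply count
	# and the second holds the page count.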

class TB_filter:
	def __init__(self,html_page):
		self.data=html_page

	#match all href attribute values
	def filter_href(self):
		reg = r"(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')"
		reger = re.compile(reg)
		data = re.findall(reger, self.data)
		return data

	#match the text of all <a> tags
	def filter_a(self):
		reg = r'<a .*?>(.*?)</a>'
		reger = re.compile(reg)
		data = re.findall(reger, self.data)
		return data

	#match all src attribute values
	def filter_src(self):
		reg = r"(?<=src=\").+?(?=\")|(?<=src=\').+?(?=\')"
		reger = re.compile(reg)
		data = re.findall(reger, self.data)
		return data
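
# Example (hypothetical HTML fragment) of what filter_src() would return:
#     sample = '<img src="https://imgsa.baidu.com/forum/pic/item/abc.jpg">'
#     TB_filter(sample).filter_src()  ->  ['https://imgsa.baidu.com/forum/pic/item/abc.jpg']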

#download function: saves the png/jpg images of one thread
def download_img(path_html):
	tb = TB_get()
	print "Title : ",tb.get_title(path_html)
	if 'page404' in tb.get_html(path_html):
		print u"很抱歉,该贴已被删除。"
	else:
		print "state : ",tb.get_state(path_html)
		save_path=LOCAL_PATH+tb.get_title(path_html)+"\\"
		isExists=os.path.exists(save_path)
		if not isExists:
			os.makedirs(save_path)
		page_number = tb.get_pagenumber(path_html)#number of pages in this thread
		print u"pages : ",page_number
		print u"replies : ",tb.get_Replypost(path_html)
		download_page = 0
		while download_page < page_number:
			download_html=path_html+'?pn='+str(download_page+1)#url of each page of the thread
			print "\n\nstart access : ",download_html
			state_code=tb.get_state(download_html)
			print "state : ",state_code
			if tb.get_state(download_html) == 200:#download only when the page returns 200
				page_data = tb.get_html(download_html)
				fl = TB_filter(page_data)
				data = fl.filter_src()
				pictures_number=0
				for pictures in data:
					pictures_number+=1
					if pictures.split(".")[-1] in ["png","jpg"]:#筛选出 png,jpg为后缀的图片格式进行下载
						http_1=str(pictures.split("/")[0])
						if http_1=="https:":
							name= str(pictures.split("/")[-1])
							tt= int(time.time())
							newname=str(tt)+".jpg"
							Path_img=save_path+newname
							imgname=str(name.split("_")[0])
							if imgname != "image" and '?' not in name:
								print "\nstart download ====> "+name
								print "loading......."
								urllib.urlretrieve(pictures,Path_img)
								print "download succees ====> "+newname
								time.sleep(1)
			else:
				print "access failed!! state : ",state_code
			download_page+=1
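
# Example (hypothetical thread URL): download every png/jpg image of a single thread:
#     download_img('https://tieba.baidu.com/p/1234567890')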
			
#downloader: takes a tieba keyword and a listing page number
def downloader(tb_path,tb_pg):
	tb_path='https://tieba.baidu.com/f?kw='+tb_path+'&ie=utf-8&pn='+str((tb_pg-1)*50)
	#print tb_path
	tb = TB_get()
	get_all_tb=tb.get_html(tb_path)
	if tb.get_state(tb_path) == 200:
		print "\n\nAccess : ",tb_path
		reg = r"(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')"
		reger = re.compile(reg)
		data = re.findall(reger, get_all_tb)
		for tb_link in data:
			reg1 = r'//tieba.baidu.com/p/.{0,}|/p/.{0,}'
			reger1 = re.compile(reg1)
			all_tb_link = re.findall(reger1, tb_link)
			if all_tb_link != []:#every thread link found on this listing page
				assign_link=str(all_tb_link).split("/p")[-1]
				assign_link=str(assign_link)[0:-2]
				donwload_link= "https://tieba.baidu.com/p"+assign_link
				print donwload_link
				download_img(donwload_link)
	else:
			print "access failed!! state : ",state_code


if __name__ == '__main__':
	n=0
	#download the images from every thread on pages 1 to 10 of the 美女 tieba (about 500 threads in total)
	while n<10:
		downloader('美女',n+1)
		n+=1
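
To point the script at a different tieba, change LOCAL_PATH at the top and pass another keyword to downloader(); for example (the keyword '壁纸' and the page count here are placeholders):

n=0
#download the images from listing pages 1 to 3 of the 壁纸 tieba
while n<3:
	downloader('壁纸',n+1)
	n+=1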

Run

(screenshot: console output while the script runs)

Output

(screenshots: the downloaded images in the save folder)

Copyright notice: this is an original article from the Man_ge blog; reposts must credit the source: https://mp.csdn.net/mdeditor/83040439

Author: Man_ge https://blog.csdn.net/Man_ge
