import requests
import os,re
import urllib.request as ub
#多线程库
from multiprocessing.dummy import Pool as ThreadPool
from lxml import etree
import time
def getArticleLinks(url):
    """Fetch one tieba forum listing page and return absolute thread URLs.

    Args:
        url: URL of a forum listing page (e.g. .../f?kw=...&pn=50).

    Returns:
        List of absolute thread URLs, built as
        'http://tieba.baidu.com' + each relative href found on the page.
    """
    html = requests.get(url)  # two ways exist to pass GET params with requests
    # html.text  -> page source, html.url -> final URL,
    # html.encoding -> e.g. 'UTF-8', html.status_code -> HTTP status code
    selector = etree.HTML(html.text)
    # XPath: the href suffix of every thread link in the thread list
    suffixes = selector.xpath('//div[@class="threadlist_lz clearfix"]/div/a/@href')
    # Prepend the tieba host to each relative href (comprehension replaces
    # the original index-based in-place mutation loop).
    return ['http://tieba.baidu.com' + suffix for suffix in suffixes]
def download_img(url_list):
    """Download the images of every thread in *url_list*.

    For each thread URL a directory ../image/downloads/<thread-id> is
    created and filled by get_img(); the working directory is restored
    afterwards. Prints the elapsed time once exactly 10 threads are done
    (progress report, not a total).

    Args:
        url_list: list of absolute thread URLs (see getArticleLinks).
    """
    done_count = 0
    start = time.time()
    # makedirs also creates a missing '../image' parent, and exist_ok
    # avoids the race of the original exists()-then-mkdir pair.
    os.makedirs('../image/downloads', exist_ok=True)
    root_path = os.getcwd()
    for each in url_list:
        # Thread id = URL tail after 'http://tieba.baidu.com' (index 23),
        # with '/' stripped so it is a valid single directory name.
        img_dir = 'downloads/' + each[23:].replace("/", '')
        os.makedirs('../image/' + img_dir, exist_ok=True)
        # get_img() saves into the current directory, so chdir into the
        # per-thread folder first; finally guarantees the cwd is restored
        # even if a download raises.
        os.chdir('../image/' + img_dir)
        try:
            get_img(each)
        finally:
            os.chdir(root_path)
        done_count += 1
        if done_count == 10:
            elapsed = (int(time.time() - start))
            print("Time used:", elapsed, "秒")
def get_img(url):
    """Download every image of one thread page into the current directory.

    Files are saved as pic_0.jpg, pic_1.jpg, ... in page order.

    Args:
        url: absolute URL of a tieba thread page.
    """
    html = requests.get(url)
    # Extract the image URL list with XPath (a regex would also work).
    selector = etree.HTML(html.text)
    # class="BDE_Image" marks user-posted in-thread images.
    img_url_list = selector.xpath('//*[@class="BDE_Image"]/@src')
    # enumerate replaces the original hand-maintained counter.
    for pic_name, img_url in enumerate(img_url_list):
        ub.urlretrieve(img_url, 'pic_%s.jpg' % pic_name)
if __name__ == '__main__':
    # The forum URL is hard-coded below (壁纸吧); an interactive-URL mode
    # existed here previously and was removed as dead code.
    # Ask for a positive page count; re-prompt until the input is valid.
    while True:
        print(u'请输入你要下载的页数:',)
        page = input('')
        if re.findall(r'^[0-9]*[1-9][0-9]*$', page):  # positive integers only
            page = int(page)
            break
    print(u'----------正在下载图片---------')
    pool = ThreadPool(4)  # 4 worker threads; downloads are I/O-bound
    # Tieba paginates with pn=0,50,100,... — build one listing URL per
    # requested page (single comprehension replaces the two-pass
    # pg0/pg construction of the original).
    base_url = 'https://tieba.baidu.com/f?kw=%E5%A3%81%E7%BA%B8&ie=utf-8&pn='
    pg = [base_url + str(i * 50) for i in range(page)]
    # Fetch all listing pages in parallel; each result is one page's list
    # of thread URLs, which download_img then processes.
    results = pool.map(getArticleLinks, pg)
    pool.map(download_img, results)
    pool.close()
    pool.join()
    print(u'-----------下载成功-----------')
    input('Press Enter to exit')
# 参考:https://www.cnblogs.com/Axi8/p/5773269.html
# 视频参考:http://www.jikexueyuan.com/course/902_2.html?ss=1