# 百度贴吧搜索关键字爬取图片批量下载:
# 思路流程图:
import os
import re
import threading
import urllib.request
from urllib import parse

from lxml import etree
class PostBar(object):
    """Crawl Baidu Tieba by keyword and download the images found in its posts.

    Flow: ``start`` prompts for a keyword and a page range, ``handleUrl``
    walks the listing pages, ``findLink`` follows every post link found on a
    listing page, ``findImage`` starts one thread per image, and
    ``loadImage`` saves each image into ``save_dir``.
    """

    # Characters that are not allowed in a Windows file name.
    _UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|]')

    def __init__(self, save_dir=r'F:\爬虫图片库\爬虫图片03'):
        """Set up the request headers, the base URL and the output directory.

        Args:
            save_dir: Directory the downloaded images are written to. It is
                created on first use if it does not exist.
        """
        self.ua_headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
        }
        self.url = 'http://tieba.baidu.com/f?'
        self.save_dir = save_dir

    def start(self):
        """Run the interactive prompt loop until the user enters ``quit``."""
        while True:
            word = input('请输入你要爬取的贴吧关键字:').strip()
            if word == 'quit':
                break
            beginPage = input('起始页:').strip()
            endPage = input('结束页:').strip()
            if not (word and beginPage and endPage):
                continue
            # Re-prompt instead of crashing when a page number is not numeric.
            try:
                first_page, last_page = int(beginPage), int(endPage)
            except ValueError:
                print('页码必须是整数,请重新输入')
                continue
            # urlencode requires a mapping; it percent-encodes the keyword.
            full_url = self.url + parse.urlencode({'kw': word})
            self.handleUrl(full_url, first_page, last_page)

    def handleUrl(self, url, beginPage, endPage):
        """Fetch and process every listing page from beginPage to endPage.

        Args:
            url: Base listing URL that already carries the ``kw`` parameter.
            beginPage: First page number (inclusive).
            endPage: Last page number (inclusive).
        """
        for page in range(beginPage, endPage + 1):
            # The ``pn`` offset advances by 50 per page.
            pn = (page - 1) * 50
            full_url = url + '&pn=' + str(pn)
            html = self.loadWed(full_url)
            new_html = self.handleWeb(html)
            self.findLink(new_html)

    def _fetch(self, url):
        """Return the raw response body of *url* as bytes.

        The response is used as a context manager so the connection is
        closed even when ``read`` raises.
        """
        request = urllib.request.Request(url, headers=self.ua_headers)
        with urllib.request.urlopen(request) as respond:
            return respond.read()

    def loadWed(self, url):
        """Download the source of a listing page and return it as bytes."""
        return self._fetch(url)

    def handleWeb(self, html):
        """Remove every ``<!--`` marker from a page.

        Dropping the comment openers exposes markup that would otherwise be
        hidden from the parser inside HTML comments.

        Args:
            html: Page source as bytes.

        Returns:
            The page source as bytes with all ``<!--`` occurrences removed.
        """
        # The marker is pure ASCII, so replacing on the raw bytes gives the
        # same result as decode -> replace -> encode for valid UTF-8, without
        # failing on pages that contain undecodable bytes.
        return html.replace(b'<!--', b'')

    def findLink(self, html):
        """Follow every post link on a listing page and harvest its images.

        Args:
            html: Listing page source, already passed through ``handleWeb``.
        """
        selector = etree.HTML(html)
        link_list = selector.xpath('//div[@class="t_con cleafix"]/div/div/div/a/@href')
        for link in link_list:
            # The extracted hrefs are site-relative; prefix the host.
            full_url = "http://tieba.baidu.com" + link
            comment_html = self.loadPage(full_url)
            self.findImage(comment_html)
        print('----解析提取完一页-----\n')

    def loadPage(self, url):
        """Download the source of a post (comment) page and return it as bytes."""
        return self._fetch(url)

    def findImage(self, html):
        """Start one download thread for each image found in a post page.

        Args:
            html: Post page source as bytes.
        """
        selector = etree.HTML(html)
        image_list = selector.xpath('//div[@id]/img[@class="BDE_Image"]/@src')
        for link in image_list:
            t = threading.Thread(target=self.loadImage, args=(link,))
            t.start()

    @classmethod
    def _image_filename(cls, url):
        """Derive a local file name from the last 15 characters of *url*.

        Characters that are illegal in Windows file names (path separators,
        ``:``, ``?`` and so on) are replaced with ``_`` so the result is
        always a plain file name and never a nested or invalid path.
        """
        return cls._UNSAFE_FILENAME_CHARS.sub('_', url[-15:])

    def loadImage(self, url):
        """Download one image and write it into ``save_dir``.

        Args:
            url: Absolute URL of the image.
        """
        image = self._fetch(url)
        # exist_ok makes this safe when several download threads race here.
        os.makedirs(self.save_dir, exist_ok=True)
        target = os.path.join(self.save_dir, self._image_filename(url))
        with open(target, 'wb') as f:
            f.write(image)
if __name__ == '__main__':
    # Script entry point: build the crawler and hand control to its prompt loop.
    PostBar().start()