# Note: the request header is intentionally not used — when it is attached,
# the site returns nothing at all, most likely an anti-scraping measure.
# Image spider
import os
import urllib
import urllib.parse
import urllib.request

from lxml import etree
class Spider(object):
    """Crawl the photo-album tab of a Baidu Tieba forum and save images locally.

    The crawl walks: forum album page -> thread pages -> every absolute-URL
    <img src> found in each thread, writing the bytes to sequentially numbered
    ``.jpg`` files under ``save_dir``.
    """

    def __init__(self, tieba_name="女明星", begin_page=1, end_page=3,
                 save_dir=r"E:\picture"):
        """Configure the crawler.

        Args:
            tieba_name: forum name used as the ``kw`` query parameter.
            begin_page: first page of the intended range (currently unused
                by the crawl — kept for future pagination support).
            end_page: last page of the intended range (currently unused).
            save_dir: directory where downloaded images are written.
        """
        self.tiebaName = tieba_name
        self.beginPage = begin_page
        self.endPage = end_page
        self.url = "http://tieba.baidu.com/f?"
        # NOTE(review): deliberately NOT sent with any request — per the
        # original author, attaching it makes the site return an empty
        # response (likely an anti-scraping measure). Kept for reference.
        self.ua_header = {
            'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; WOW64) '
                           'AppleWebKit/537.36 (KHTML, like Gecko) '
                           'Chrome/52.0.2743.116 Safari/537.36'),
            'Accept-Language': 'zh-CN,zh;q=0.8'
        }
        # Running counter used to number the saved image files.
        self.fileName = 0
        self.saveDir = save_dir

    # Build the album-tab URL for the configured forum and start the crawl.
    def tiebaSpider(self):
        """Encode the forum name into the query string and crawl the album page."""
        word_encode = urllib.parse.urlencode({'kw': self.tiebaName})
        word_url = self.url + word_encode + '&ie=utf-8&tab=album'
        self.loadPage(word_url)

    # Fetch the album listing page and visit every linked thread.
    def loadPage(self, url):
        """Download ``url``, extract thread links, and crawl each thread."""
        request_info = urllib.request.Request(url)
        read_data = urllib.request.urlopen(request_info).read()
        html = etree.HTML(read_data)
        # hrefs are site-relative (e.g. "/p/123..."), so prefix the host.
        links_half = html.xpath('//div[@class="gr_block_main"]/div/div/a/@href')
        for link in links_half:
            self.loadImages('http://tieba.baidu.com' + link)

    # Fetch one thread page and collect its image links.
    def loadImages(self, link_full):
        """Download the thread page and save every absolute-URL image in it."""
        request_info = urllib.request.Request(link_full)
        read_data = urllib.request.urlopen(request_info).read()
        html = etree.HTML(read_data)
        for image_link in html.xpath('//img/@src'):
            # Skip relative/scheme-less sources (sprites, data URIs, icons).
            if 'http' in image_link:
                self.writeImages(image_link)

    # Download a single image and persist it to disk.
    def writeImages(self, image_link):
        """Fetch ``image_link`` and write it to ``<saveDir>/<n>.jpg``."""
        image = urllib.request.urlopen(image_link).read()
        # os.path.join avoids the doubled-backslash path the old raw-string
        # concatenation produced; 'with' guarantees the handle is closed
        # even if the write raises.
        path = os.path.join(self.saveDir, str(self.fileName) + ".jpg")
        with open(path, "wb") as image_file:
            image_file.write(image)
        self.fileName += 1
        print("downloading......", self.fileName)
if __name__ == '__main__':
    # Script entry point: construct the crawler and kick off the crawl.
    spider = Spider()
    spider.tiebaSpider()