需求
进入某贴吧,爬取贴吧内每个帖子里的图片,不包含广告图片
流程
-
找某贴吧url规律,规律如下:
http://tieba.baidu.com/f?kw={}&pn={}
pn = (page-1)*50 -
获取某贴吧首页(一级页面)每个帖子的链接的xpath表达式,如下:
//li[@class="j_thread_list clearfix"]/div/div/div/div/a/@href -
获取帖子内部(二级页面)的图片,不包括广告图片,如下:
//div[@class="d_post_content j_d_post_content clearfix"]/img[@class="BDE_Image"]/@src -
写具体爬虫代码,先写通用功能函数
详细代码
import requests
from urllib import parse
import time
import random
from lxml import etree
class TiebaImageSpider:
    """Crawl every post of a Baidu Tieba forum and download its content images.

    Flow:
      1. Build the forum (level-1) page URL from the forum name and page number
         (``pn = (page - 1) * 50``).
      2. Extract each post link from the forum page via XPath.
      3. Extract content-image URLs from each post (level-2) page; only
         ``img`` tags with class ``BDE_Image`` are matched, which excludes
         ad images.
      4. Download each image into the current working directory, named after
         the last 10 characters of its URL.
    """

    def __init__(self):
        # Level-1 URL template: kw = URL-encoded forum name, pn = page offset.
        self.url = 'http://tieba.baidu.com/f?kw={}&pn={}'
        # FIX: the original User-Agent was a triple-quoted string spanning two
        # source lines, so the header value contained a literal newline —
        # an invalid HTTP header that requests refuses to send. It must be a
        # single line.
        self.headers = {
            'User-Agent': (
                'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; '
                'x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; '
                '.NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; '
                'InfoPath.3; .NET4.0C; Tablet PC 2.0; .NET4.0E)'
            )
        }

    # Helper 1: fetch a page.
    def get_html(self, url):
        """Fetch *url* and return its body decoded as UTF-8.

        Undecodable bytes are ignored. FIX: a 10-second timeout was added so
        a dead connection cannot hang the crawler forever (the original
        request had no timeout).
        """
        response = requests.get(url=url, headers=self.headers, timeout=10)
        return response.content.decode('utf-8', 'ignore')

    # Helper 2: evaluate an XPath expression against an HTML string.
    def xpath_func(self, html, xpath_bds):
        """Parse *html* and return the (possibly empty) list of matches
        for *xpath_bds*."""
        root = etree.HTML(html)
        return root.xpath(xpath_bds)

    # Main extraction routine; *one_url* is supplied by run().
    def parse_html(self, one_url):
        """Extract every post link from the forum page *one_url* and save
        each post's images.

        The matched hrefs are site-relative (no scheme/host), so the Tieba
        root is prepended to form the level-2 URL.
        """
        one_html = self.get_html(one_url)
        xpath_bds = '//li[@class="j_thread_list clearfix"]/div/div/div/div/a/@href'
        link_list = self.xpath_func(one_html, xpath_bds)
        for link in link_list:
            two_url = 'http://tieba.baidu.com' + link
            self.save_image(two_url)

    def save_image(self, two_url):
        """Download every content image of the post at *two_url*.

        Only ``BDE_Image``-classed images inside the post-content div are
        taken, which filters out advertisement images.
        """
        two_html = self.get_html(two_url)
        xpath_bds = ('//div[@class="d_post_content j_d_post_content clearfix"]'
                     '/img[@class="BDE_Image"]/@src')
        imglink_list = self.xpath_func(two_html, xpath_bds)
        for imglink in imglink_list:
            self.download_image(imglink)
            # Random 0-1 s pause between images to avoid hammering the site.
            time.sleep(random.uniform(0, 1))

    def download_image(self, imglink):
        """Download one image and write it to the working directory.

        The file is named after the last 10 characters of the image URL.
        FIX: a failed download is reported and skipped instead of crashing
        the whole crawl (the original let any network error propagate).
        """
        try:
            img_bytes = requests.get(url=imglink, headers=self.headers,
                                     timeout=10).content
        except requests.RequestException as err:
            print(imglink, 'download failed:', err)
            return
        filename = imglink[-10:]
        with open(filename, 'wb') as f:
            f.write(img_bytes)
        print(filename, '下载成功')

    # Entry point.
    def run(self):
        """Prompt for the forum name and page range, then crawl each
        forum page in turn."""
        name = input('请输入贴吧名:')
        begin = int(input('请输入起始页:'))
        end = int(input('请输入终止页:'))
        params = parse.quote(name)
        for page in range(begin, end + 1):
            pn = (page - 1) * 50
            url = self.url.format(params, pn)
            self.parse_html(url)
# Script entry: build a spider and hand control to its interactive run().
if __name__ == '__main__':
    TiebaImageSpider().run()