parse 下载内容时不报错,但是抓不到数据,返回的列表为空,不知道是什么原因。
import requests
from lxml import etree
import random
import time
from urllib import parse
class TiebaImageSpider(object):
    """Download images from the threads listed on Baidu Tieba forum pages.

    The spider fetches a forum list page, extracts thread links with an
    XPath expression, fetches every thread, extracts image links with a
    second XPath expression and writes each image into the current
    working directory.
    """

    # Seconds to wait for a single HTTP response; without a timeout a
    # stalled connection would block the crawl forever.
    TIMEOUT = 10

    # Characters that must not appear in a file name on common file
    # systems; each one is replaced by "_" when a file name is derived
    # from an image URL.
    _UNSAFE_CHARS = '\\/:*?"<>|'
    _FILENAME_TABLE = str.maketrans(_UNSAFE_CHARS, '_' * len(_UNSAFE_CHARS))

    def __init__(self):
        """Set up the list-page URL template and the request headers."""
        # Placeholders: URL-quoted forum name, then the "pn" offset.
        self.url = "http://tieba.baidu.com/f?kw={}&pn={}"
        self.headers = {
            'User-Agent': 'Mozilla/5.0(Windows NT 6.1; WOW64)AppleWebKit/535.1(KHTML, like Gecko)Chrome/14.0.835.163 Safari/535.1'
        }

    def get_html(self, url):
        """Fetch *url* with the spider's headers and return the raw body.

        Args:
            url: Absolute URL to request.

        Returns:
            The response body as bytes (callers decode it when they need
            text; image data is written out unchanged).

        Raises:
            requests.exceptions.RequestException: If the request fails or
                does not answer within ``TIMEOUT`` seconds.
        """
        res = requests.get(url=url, headers=self.headers, timeout=self.TIMEOUT)
        return res.content

    def xpath_func(self, html, xpath_bds):
        """Parse *html* and return the matches of the XPath *xpath_bds*.

        Args:
            html: HTML document to parse.
            xpath_bds: XPath expression to evaluate against the document.

        Returns:
            The list produced by the XPath query, or an empty list when
            the parser yields no document.
        """
        parse_obj = etree.HTML(html)
        # Guard so callers get an empty result instead of an
        # AttributeError when there is no tree to query.
        if parse_obj is None:
            return []
        return parse_obj.xpath(xpath_bds)

    def parse_html(self, url):
        """Extract thread links from the list page *url* and crawl each.

        Args:
            url: URL of one forum list page.
        """
        one_html = self.get_html(url).decode()
        xpath_bds = '//div[@class="t_con cleafix"]/div/div/div/a/@href'
        # NOTE(review): if this comes back empty although the request
        # succeeds, the served HTML presumably differs from what the
        # browser inspector shows (for example markup delivered inside
        # HTML comments, or varying with the User-Agent). Dump one_html
        # and confirm the XPath against it.
        tlink_list = self.xpath_func(one_html, xpath_bds)
        for tlink in tlink_list:
            # urljoin handles site-relative hrefs and leaves hrefs that
            # are already absolute untouched.
            self.get_image(parse.urljoin("http://tieba.baidu.com", tlink))

    def get_image(self, tilnk):
        """Fetch the thread at *tilnk* and save every image found in it.

        Args:
            tilnk: Absolute URL of one thread page.
        """
        html = self.get_html(tilnk).decode()
        xpath_bds = '//li/div/div/div/div/img[@class="j_retract"]/@src'
        # NOTE(review): an alternative expression selecting
        # img[@class="BDE_Image"] below the post-content div was sketched
        # here earlier; verify which one matches the live markup.
        img_link_list = self.xpath_func(html, xpath_bds)
        for img_link in img_link_list:
            self.save_image(img_link)

    @staticmethod
    def _safe_filename(img_link):
        """Return the last 10 characters of *img_link* as a usable file name."""
        return img_link[-10:].translate(TiebaImageSpider._FILENAME_TABLE)

    def save_image(self, img_link):
        """Download *img_link* and write it to the current directory.

        The file name is the last 10 characters of the link with
        characters that are unsafe in file names replaced by "_".

        Args:
            img_link: Absolute URL of the image.
        """
        html = self.get_html(img_link)
        filename = self._safe_filename(img_link)
        with open(filename, 'wb') as f:
            f.write(html)
        print(filename, '下载成功')

    def run(self):
        """Prompt for a forum name and page range, then crawl each page."""
        name = input('贴吧名:')
        start = int(input('起始页:'))
        end = int(input('终止页:'))
        # The forum name goes into the query string, so it must be
        # percent-encoded.
        name = parse.quote(name)
        for page in range(start, end + 1):
            # The "pn" offset advances by 50 for every list page.
            pn = (page - 1) * 50
            url = self.url.format(name, pn)
            self.parse_html(url)
# Run the crawler only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    spider = TiebaImageSpider()
    spider.run()