爬虫的基本格式上一篇已经介绍过,这一篇记录帖子图片链接的爬取。同样没有什么技术难点,直接上代码:
import requests
from lxml import etree
from pprint import pprint
class TiebaSpider(object):
    """Fetch Baidu Tieba thread-list pages and print each thread's image URLs.

    For every list page the spider downloads the HTML with a mobile
    User-Agent, selects the thread rows, and prints the thread title followed
    by the list of ``data-url`` values of its thumbnail images.

    Args:
        kw: Forum (tieba) name substituted into the ``kw`` query parameter.
        pages: Number of list pages to fetch, starting from the first one.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot block the spider forever.
    """

    # Offset added to the ``pn`` query parameter for each successive page
    # (the step the original ``range(0, 50, 50)`` used).
    PAGE_STEP = 50

    def __init__(self, kw="魔兽世界", pages=1, timeout=10):
        self.kw = kw
        self.pages = pages
        self.timeout = timeout
        self.base_url = "https://tieba.baidu.com/f?kw={}&ie=utf-8&pn={}"
        # NOTE(review): an iPhone User-Agent is sent; the XPath class names
        # used in run() presumably match the mobile page markup -- confirm
        # against a live response if the selectors stop matching.
        self.headers = {
            "User-Agent":"Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1"
        }

    def _page_urls(self):
        """Return the list-page URLs for ``self.kw``, one per requested page."""
        last_offset = self.pages * self.PAGE_STEP
        return [
            self.base_url.format(self.kw, pn)
            for pn in range(0, last_offset, self.PAGE_STEP)
        ]

    def run(self):
        """Download every list page and print the title and image URLs per row.

        Returns:
            None. Results are written to standard output.
        """
        for url in self._page_urls():
            response = requests.get(url, headers=self.headers, timeout=self.timeout)
            html = response.content.decode('utf-8')

            # etree.HTML() yields None for an empty document; skip the page
            # instead of failing on ``None.xpath``.
            eroot = etree.HTML(html)
            if eroot is None:
                continue

            # One <li> per thread row.
            rows = eroot.xpath('//li[@class="tl_shadow tl_shadow_new"]')
            for row in rows:
                # The leading "." is required: it makes the query relative to
                # the current row rather than the whole document.
                titles = row.xpath('.//div[@class="ti_title"]/span/text()')
                title = "".join(titles)
                if title:
                    print(title)

                # An "/@attr" XPath returns plain strings, not elements, so
                # the values are printed directly; passing them to
                # etree.tostring() raises TypeError.
                imgs = row.xpath('.//img[@class="j_media_thumb_holder medias_img medias_thumb_holder"]/@data-url')
                pprint(imgs)
if __name__ == "__main__":
    # Script entry point: build a spider with its defaults and crawl once.
    TiebaSpider().run()
我对富文本编辑器还不太熟练,如果上面代码的缩进显示不正确,复制使用时请注意自行调整缩进。