Learning summary:
1. Through experimentation, the data could be scraped without defining a request headers dict.
2. The site is encoded in utf-8. Find out the target site's encoding before scraping to avoid garbled data; the returned response can be given an explicit encoding with response.encoding = '<site encoding>'.
3. Image data must be saved as binary, i.e. response.content; a Python with open(...) block is enough to save an image (see the sketch after this list).
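A minimal sketch of points 2 and 3, assuming a hypothetical page and a hypothetical image URL (the https://example.com/... addresses are placeholders, not part of the crawler below):

import requests

page_response = requests.get('https://example.com/page')    # hypothetical page URL
page_response.encoding = 'utf-8'        # match the site's declared encoding to avoid mojibake
text = page_response.text               # decoded using the encoding set above

img_response = requests.get('https://example.com/pic.jpg')  # hypothetical image URL
with open('pic.jpg', 'wb') as f:        # 'wb': images must be written in binary mode
    f.write(img_response.content)       # .content is raw bytes; .text would corrupt the image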
# -*- coding: utf-8 -*-
import requests
from lxml import etree


class CrawlTieba:
    def __init__(self):
        # Request headers proved unnecessary for this site (see point 1 above).
        # self.header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36'}
        self.info_list = []
    # Build the request URLs; return them as a list.
    def get_url(self, page):
        default_url = "https://tieba.baidu.com/f?kw=猫&ie=utf-8&pn={}"
        url_list = []
        for i in range(page):
            url = default_url.format(i * 50)  # each list page holds 50 titles; pn starts at 0
            url_list.append(url)
        return url_list
    # Send the request; return the response.
    def request_url(self, url):
        response = requests.get(url=url)
        response.encoding = 'utf-8'  # the site uses utf-8 (see point 2 above)
        return response
    # Parse the response text into an lxml HTML element so xpath can be used on it.
    def handle_html(self, html):
        html = etree.HTML(html)
        return html
    # Extract the needed data with xpath; return a list of dicts.
    def parse_html(self, html):
        a_list = html.xpath('//ul[@id="thread_list"]/li//div[contains(@class,"threadlist_title")]/a')
        info_list = []
        for a in a_list:
            info_dict = {}
            info_dict['title'] = a.xpath('./@title')[0]
            info_dict['detail_url'] = 'https://tieba.baidu.com' + a.xpath('./@href')[0]
            info_list.append(info_dict)
        return info_list
    # Collect the image URLs from a detail page; return them as a list.
    def crawl_img_url(self, html_detail_page):
        img_url_list = html_detail_page.xpath('//img[@class="BDE_Image"]/@src')
        return img_url_list
    # Fetch one image URL; return its binary data.
    def crawl_img(self, img_url):
        img_response = self.request_url(img_url)
        return img_response.content
    # Main entry point.
    def run(self, page):
        url_list = self.get_url(page)
        for url in url_list:
            response = self.request_url(url)
            html = self.handle_html(response.text)
            info_list = self.parse_html(html)  # title and detail-page URL for each thread on the list page
            self.info_list.extend(info_list)   # accumulate across pages instead of overwriting
        # Visit each detail page and collect its image URLs.
        for i in range(len(self.info_list)):
            detail_url = self.info_list[i]['detail_url']
            response_detail = self.request_url(detail_url)
            html_detail_page = self.handle_html(response_detail.text)
            img_url_list = self.crawl_img_url(html_detail_page)  # image URLs found on the detail page
            self.info_list[i]['img_url_list'] = img_url_list
            print('done:{}'.format(i + 1))
            # Download the images (disabled here; see the standalone sketch after the script):
            # for img_url in img_url_list:
            #     img_info = self.crawl_img(img_url)
            #     with open(img_url.split('/')[-1], 'wb') as f:
            #         f.write(img_info)
        print(self.info_list)


if __name__ == '__main__':
    crawl_tieba = CrawlTieba()
    crawl_tieba.run(1)  # number of list pages to crawl
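To actually save the images, the commented-out block inside run can be enabled. A standalone sketch of the same idea, assuming the img_url_list collected above and a hypothetical output directory imgs/:

import os
import requests

def save_images(img_url_list, out_dir='imgs'):  # out_dir is a hypothetical choice
    os.makedirs(out_dir, exist_ok=True)         # create the folder if it does not exist
    for img_url in img_url_list:
        img_info = requests.get(img_url).content    # binary image data (point 3)
        filename = img_url.split('/')[-1]           # last URL segment as the filename
        with open(os.path.join(out_dir, filename), 'wb') as f:
            f.write(img_info)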