Super-Easy Introduction to Python Crawlers, Part 5
How to crawl Baidu Tieba (lol)
1. First, verify that the essential thread elements can be parsed; modify the body of run accordingly.
import requests
import lxml.html


class TiebaSpider(object):
    """
    Download the first N pages of a given tieba and store the results.
    """
    def __init__(self, name, pages):
        """
        Initializer.
        """
        self.tieba_name = name
        self.pages_download = pages
        self.base_url = "https://tieba.baidu.com/f?kw={}&ie=utf-8&pn={}"
        self.headers = {"User-Agent": "Python"}

    def make_url_list(self):
        """
        Build the list of URLs to download.
        :return: the generated URL list
        """
        url_lists = []
        for i in range(self.pages_download):
            # Each listing page shows 50 threads, so pn advances in steps of 50.
            download_url = self.base_url.format(self.tieba_name, i * 50)
            url_lists.append(download_url)
        return url_lists

    # PyCharm's "method may be static" warning appears because this method
    # uses no instance state; decorating it with @staticmethod removes it.
    def save_url(self, content, file_name):
        """
        Store the given content.
        :param content: the content to store; must be bytes
        :param file_name: the file name
        :return: None
        """
        with open(file_name, 'wb') as f:
            f.write(content)

    def download_url(self, url_str):
        """
        Download the content at the given URL.
        :param url_str: the address to download
        :return: the downloaded result
        """
        response = requests.get(url_str, headers=self.headers)
        return response.text  # response.content would be the raw bytes instead

    def run(self):
        """
        Main business logic; keeping it in run makes it easy to turn this
        into a multithreaded crawler later.
        :return:
        """
        url_lists = self.make_url_list()
        # for url_str in url_lists:  -- the full loop is added in step 2
        result_text = self.download_url(url_lists[0])
        parse_result = lxml.html.fromstring(result_text)
        result_elements = parse_result.cssselect("ul#thread_list > li.j_thread_list")
        print(result_elements[0].cssselect("div.threadlist_title > a.j_th_tit")[0].text)


tieba_spider = TiebaSpider('lol', 3)
tieba_spider.run()
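Before moving on, it can help to sanity-check the selectors against a static snippet, with no network involved. The sketch below uses a made-up HTML fragment that mimics the tieba thread-list markup; note that lxml's cssselect method requires the separate cssselect package (pip install cssselect).

import lxml.html

# Made-up fragment mimicking the thread-list markup (for illustration only).
html = """
<ul id="thread_list">
  <li class="j_thread_list">
    <div class="threadlist_title">
      <a class="j_th_tit" href="/p/123456">Sample thread title</a>
    </div>
  </li>
</ul>
"""

tree = lxml.html.fromstring(html)
for li in tree.cssselect("ul#thread_list > li.j_thread_list"):
    link = li.cssselect("div.threadlist_title > a.j_th_tit")[0]
    print(link.text)               # Sample thread title
    print(link.xpath(".//@href"))  # ['/p/123456']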
2. Update the run method as follows to parse and print the thread list and the next-page link.
def run(self):
    """
    Main business logic; keeping it in run makes it easy to turn this
    into a multithreaded crawler later.
    :return:
    """
    url_lists = self.make_url_list()
    for url_str in url_lists:
        result_text = self.download_url(url_str)
        parse_result = lxml.html.fromstring(result_text)
        result_elements = parse_result.cssselect("ul#thread_list > li.j_thread_list")
        next_element = parse_result.cssselect("div.thread_list_bottom a.next")[0]
        print(next_element.text)
        print(next_element.xpath(".//@href"))
        for result_element in result_elements:
            result_thread = result_element.cssselect("div.threadlist_title > a.j_th_tit")[0]
            print(result_thread.text)
            print(result_thread.xpath(".//@href"))
        print("*" * 20)
3. Add a URL type object class to tag each URL's type and complete the address.
class UrlType(object):
    """
    URL type object; holds a URL together with its type.
    """
    def __init__(self, url_type, url_str):
        """
        Initializer.
        :param url_type: URL type -- 0: seed page, 1: next page, 2: detail page
        :param url_str: the URL string
        """
        self.url_type = url_type
        self.url_str = url_str

    def complete_url(self):
        """
        Return the full URL according to url_type.
        :return:
        """
        if self.url_type == 1:
            # Next-page links are protocol-relative ("//tieba.baidu.com/...").
            return "https:" + self.url_str
        if self.url_type == 2:
            # Detail links are site-relative ("/p/...").
            return "https://tieba.baidu.com" + self.url_str
        return self.url_str
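A quick check of how complete_url behaves for each type (the sample paths are illustrative):

print(UrlType(1, "//tieba.baidu.com/f?kw=lol&ie=utf-8&pn=50").complete_url())
# https://tieba.baidu.com/f?kw=lol&ie=utf-8&pn=50
print(UrlType(2, "/p/123456").complete_url())
# https://tieba.baidu.com/p/123456
print(UrlType(0, "https://tieba.baidu.com/f?kw=lol&ie=utf-8&pn=0").complete_url())
# already a full URL, returned unchanged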
4. Add a Queue to the run method so that parsing and downloading continue until the queue drains.
while not self.crawl_queue.empty():
    url_type = self.crawl_queue.get()
    # Breadth-first traversal: visit the node itself...
    print("Downloading", url_type.complete_url())
    result_text = self.download_url(url_type.complete_url())
    parse_result = lxml.html.fromstring(result_text)
    # ...then enqueue its neighbors.
    if url_type.url_type == 1 or url_type.url_type == 0:
        result_elements = parse_result.cssselect("ul#thread_list > li.j_thread_list")
        next_elements = parse_result.cssselect("div.thread_list_bottom a.next")
        if next_elements:  # the last page has no "next" link
            url_str_next = next_elements[0].xpath(".//@href")[0]
            self.crawl_queue.put(UrlType(1, url_str_next))
            # print("next page is:", url_str_next)
            # print("*" * 50)
        for result_element in result_elements:
            result_thread = result_element.cssselect("div.threadlist_title > a.j_th_tit")[0]
            print(result_thread.text)
            url_str_detail = result_thread.xpath(".//@href")[0]
            self.crawl_queue.put(UrlType(2, url_str_detail))
            print(url_str_detail)
    else:
        pass  # detail pages are downloaded but not processed yet
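One thing this loop does not do is deduplicate: sticky threads appear on every listing page, so their detail URLs are enqueued again and again. A minimal sketch of a fix, assuming a module-level set (not part of the original code), is to remember which full URLs have already been enqueued:

seen = set()  # full URLs already enqueued (hypothetical addition)

def enqueue_once(queue, url_type_obj):
    """Put a UrlType on the queue only if its full URL is new."""
    full_url = url_type_obj.complete_url()
    if full_url not in seen:
        seen.add(full_url)
        queue.put(url_type_obj)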
Complete code:
import requests
import lxml.html
from queue import Queue


class TiebaSpider(object):
    def __init__(self, name, pages):
        self.tieba_name = name
        self.pages_download = pages
        self.base_url = "https://tieba.baidu.com/f?kw={}&ie=utf-8&pn={}"
        self.headers = {"User-Agent": "python"}
        self.crawl_queue = Queue()
        # Seed the queue with the first listing page (type 0).
        self.crawl_queue.put(UrlType(0, self.base_url.format(self.tieba_name, 0)))

    def make_url_list(self):
        url_lists = []
        for i in range(self.pages_download):
            download_url = self.base_url.format(self.tieba_name, i * 50)
            url_lists.append(download_url)
        return url_lists

    def save_url(self, content, file_name):
        with open(file_name, 'wb') as f:
            f.write(content)

    def download_url(self, url_str):
        response = requests.get(url_str, headers=self.headers)
        return response.content

    def run(self):
        while not self.crawl_queue.empty():
            url_type = self.crawl_queue.get()
            print('Downloading:', url_type.complete_url())
            result_text = self.download_url(url_type.complete_url())
            parse_result = lxml.html.fromstring(result_text)
            if url_type.url_type == 1 or url_type.url_type == 0:
                result_elements = parse_result.cssselect("ul#thread_list > li.j_thread_list")
                next_elements = parse_result.cssselect("div.thread_list_bottom a.next")
                if next_elements:  # the last page has no "next" link
                    url_str_next = next_elements[0].xpath(".//@href")[0]
                    self.crawl_queue.put(UrlType(1, url_str_next))
                    # print(url_str_next)
                for result_element in result_elements:
                    result_thread = result_element.cssselect("div.threadlist_title > a.j_th_tit")[0]
                    print(result_thread.text)
                    url_str_detail = result_thread.xpath(".//@href")[0]
                    self.crawl_queue.put(UrlType(2, url_str_detail))
                    print(url_str_detail)
            else:
                pass  # detail pages are downloaded but not stored yet


class UrlType(object):
    def __init__(self, url_type, url_str):
        self.url_type = url_type
        self.url_str = url_str

    def complete_url(self):
        if self.url_type == 1:
            return "https:" + self.url_str
        if self.url_type == 2:
            return "https://tieba.baidu.com" + self.url_str
        return self.url_str


tieba_spider = TiebaSpider('lol', 3)
tieba_spider.run()
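As written, detail pages (type 2) are downloaded and then discarded in the else branch. A natural next step is to wire them into save_url; the sketch below replaces the else branch, and the file-naming scheme (tieba name plus thread id) is an assumption for illustration, not part of the original code:

            else:
                # Hypothetical extension: persist the detail page. save_url
                # expects bytes, and download_url returns response.content here.
                thread_id = url_type.url_str.rstrip("/").split("/")[-1]
                file_name = "tieba_{}_{}.html".format(self.tieba_name, thread_id)
                self.save_url(result_text, file_name)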