A worked example of the two parsing methods crawlers most commonly use: xpath and BeautifulSoup
The target is a foreign (English-language) site (will be taken down on request if it infringes)
The xpath parsing method
import requests
from lxml import etree
import json


class BookSider(object):
    def __init__(self):
        self.base_url = "http://www.allitebooks.org/page/{}"
        self.headers = {
            'User-Agent': 'paste your own browser User-Agent here'
        }
        self.data_list = []
        # Note: replace the User-Agent value with the one from your own browser

    def get_url_list(self):
        url_list = []
        for i in range(1, 10):
            # fetch 9 pages of data (pages 1-9)
            url = self.base_url.format(i)
            url_list.append(url)
        return url_list

    # 1. send the request
    def send_request(self, url):
        data = requests.get(url, headers=self.headers).content.decode()
        return data

    # 2. parse the data
    def parse_xpath_data(self, data):
        parsed_data = etree.HTML(data)
        # select every book on the page
        book_list = parsed_data.xpath('//div[@class="main-content-inner clearfix"]/article')
        # extract the fields of each book
        for book in book_list:
            num = str(book)  # string repr of this <article> element, used as a record id
            # (1) book title
            book_name = book.xpath('.//h2[@class="entry-title"]/a/text()')
            # (2) cover image URL
            book_img_url = book.xpath('.//img[@class="attachment-post-thumbnail wp-post-image"]/@src')
            # (3) author
            book_author = book.xpath('.//h5[@class="entry-author"]/a/text()')
            # absolute-path alternative:
            # book_author = book.xpath('/html/body/div/div[2]/section/main/div/article[2]/div[2]/header/div/span/h5/a/text()')
            # (4) summary
            book_intro = book.xpath('.//div[@class="entry-summary"]/p/text()')
            book_data = {
                "num": num,
                "book_name": book_name,
                "book_img_url": book_img_url,
                "book_author": book_author,
                "book_intro": book_intro
            }
            self.data_list.append(book_data)
            print(num)
            # print(self.data_list)

    # 3. save to file
    def save_data(self):
        data_str = json.dumps(self.data_list)
        with open("book_xpath.json", "w", encoding="utf-8") as f:
            f.write(data_str)

    # 4. run
    def run(self):
        url_list = self.get_url_list()
        for url in url_list:
            data = self.send_request(url)
            print(url)
            self.parse_xpath_data(data)
        self.save_data()


BookSider().run()
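One thing worth noting about the listing above: xpath() always returns a list (possibly empty), so each field in book_data is stored as a list rather than a plain string. A minimal, self-contained sketch of that behavior, using a made-up HTML fragment shaped like one <article> from the listing page:

    from lxml import etree

    # a tiny fragment mimicking one <article> block (hypothetical sample data)
    html = etree.HTML('<article><h2 class="entry-title"><a>Sample Book</a></h2></article>')
    titles = html.xpath('//h2[@class="entry-title"]/a/text()')
    print(titles)                          # ['Sample Book'] -- always a list
    print(titles[0] if titles else None)   # take the first match safely

If you want plain strings in the JSON output, apply the same first-element pattern to each field before building book_data.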
The BeautifulSoup parsing method
import requests
from bs4 import BeautifulSoup
import json


class BookSider(object):
    def __init__(self):
        self.base_url = 'http://www.allitebooks.org/page/{}'
        self.headers = {
            'User-Agent': 'paste your own browser User-Agent here'
        }
        self.book_data_list = []
        self.i = 0
        # Note: replace the User-Agent value with the one from your own browser

    # 1. build the list of page URLs
    def get_pages(self):
        url_list = []
        for i in range(1, 3):
            # fetch 2 pages of data (pages 1 and 2)
            url = self.base_url.format(i)
            url_list.append(url)
        return url_list

    # 2. send the request
    def send_request(self, url):
        data = requests.get(url, headers=self.headers).content.decode()
        return data

    # 3. parse the data
    def parse_data(self, data):
        html = BeautifulSoup(data, 'lxml')
        book_list = html.find_all('article')
        for book in book_list:
            self.i = self.i + 1
            book_name = book.select_one('.entry-title').get_text()
            book_img_url = book.select_one('.attachment-post-thumbnail').get('src')
            book_author = book.select_one('.entry-author').get_text()[3:]  # strip the leading "By "
            book_intro = book.select_one('.entry-summary p').get_text()
            book_data = {
                "num": self.i,
                "book_name": book_name,
                "book_img_url": book_img_url,
                "book_author": book_author,
                "book_intro": book_intro
            }
            self.book_data_list.append(book_data)

    # 4. save to file
    def save_data(self, file_path):
        data_str = json.dumps(self.book_data_list)
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(data_str)

    # 5. run
    def run(self):
        url_list = self.get_pages()
        for url in url_list:
            data = self.send_request(url)
            print(url)
            self.parse_data(data)
        self.save_data("book_bs4.json")


BookSider().run()
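The select_one calls above use CSS class selectors instead of xpath expressions. A minimal sketch of the same pattern on a made-up inline fragment (note that select_one returns None when nothing matches, so production code may want a guard before calling get_text()):

    from bs4 import BeautifulSoup

    # hypothetical sample data shaped like one <article> block
    fragment = ('<article>'
                '<h2 class="entry-title">Sample Book</h2>'
                '<h5 class="entry-author">By Jane Doe</h5>'
                '</article>')
    soup = BeautifulSoup(fragment, 'lxml')
    print(soup.select_one('.entry-title').get_text())       # Sample Book
    print(soup.select_one('.entry-author').get_text()[3:])  # Jane Doe ([3:] strips "By ")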
How to find your User-Agent
Open the browser's developer tools (F12), switch to the Network tab, reload the page, click any request, and copy the User-Agent value from the Request Headers section.
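To double-check which User-Agent your script actually sends, you can point it at an echo service such as httpbin.org (an external service, assumed reachable from your machine):

    import requests

    headers = {'User-Agent': 'paste your own browser User-Agent here'}
    resp = requests.get('https://httpbin.org/headers', headers=headers)
    print(resp.json()['headers']['User-Agent'])  # should echo back the UA you set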
What the saved JSON data looks like:
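Once book_xpath.json has been written, loading it back is a quick way to inspect the result:

    import json

    with open('book_xpath.json', encoding='utf-8') as f:
        books = json.load(f)
    print(len(books))   # number of book records scraped
    print(books[0])     # first record: num, book_name, book_img_url, ...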
In response to the comments below:
xpath is not something installed in the browser; it lives in the Python environment (the lxml package). A program that calls xpath methods will only run if the interpreter your editor uses has that environment available.
Below is one way to add bs4 and lxml (which provides xpath) inside the editor (IDEA):
Another way is to install them from the command line (cmd) with pip and then confirm in a Python shell that the import succeeds (for xpath, the import is from lxml import etree).
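For example, after installing the packages from the command line (pip install requests lxml beautifulsoup4), a quick way to confirm the environment is ready:

    # run in a Python shell; all three imports must succeed
    import requests
    from lxml import etree          # provides the xpath support
    from bs4 import BeautifulSoup   # provides the bs4 support
    print(etree.LXML_VERSION)       # prints the installed lxml version, e.g. (4, 9, 3, 0)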
The distinction:
The XPath Helper plugin is a browser extension. It lets you test whether an xpath expression is correct right in the browser, instead of re-running the program in your editor every time to check the result. It simply makes things more convenient.
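For instance, the title expression from the spider above can be pasted straight into XPath Helper's query box on a listing page to see the matched text highlighted live:

    //div[@class="main-content-inner clearfix"]/article//h2[@class="entry-title"]/a/text()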