Crawl the books from http://www.allitebooks.org/ and save the book information to a CSV file. Parse the data with BeautifulSoup, regular expressions, and XPath respectively, giving two pieces of code in total.
import requests
from bs4 import BeautifulSoup
import json
import csv
import re
items = []
class BookCrawl(object):
    def __init__(self):
        self.base_url = 'http://www.allitebooks.org/page/{}/'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
        }

    # 1. Build all of the page URLs
    def get_url_list(self):
        url_list = []
        for i in range(1, 3):  # range of pages to crawl
            url = self.base_url.format(i)
            url_list.append(url)
        return url_list

    # 2. Send the request
    def send_request(self, url):
        # Get the page source; it can be fetched through a browser,
        # or by calling the requests library directly:
        # browser.get(url)
        # time.sleep(5)
        # data = browser.page_source
        data = requests.get(url, headers=self.headers).content.decode()
        return data

    # 3. Parse the data (with XPath and bs4 respectively)
    def parse_xpath_data(self, data):
        parse_data = BeautifulSoup(data, 'lxml')
        # 1. Parse out all of the books: each one is an <article> element
        book_list = parse_data.find_all('article')
        for book in book_list:
            book = str(book)
            # print(book)
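            # The lines from here on are an illustrative completion: the regex
            # and the field names (title, link) are assumptions about the
            # <article> markup, not taken from the original listing.
            pattern = re.compile(
                r'<h2 class="entry-title">\s*<a href="(.*?)"[^>]*>(.*?)</a>', re.S)
            match = pattern.search(book)
            if match:
                items.append({
                    'title': match.group(2),  # book title (assumed group order)
                    'link': match.group(1),   # detail-page URL (assumed group order)
                })

    # 4. Save the collected books to a CSV file (file name is an assumption)
    def save_csv(self):
        with open('books.csv', 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=['title', 'link'])
            writer.writeheader()
            writer.writerows(items)

    # 5. Wire the steps together
    def run(self):
        for url in self.get_url_list():
            data = self.send_request(url)
            self.parse_xpath_data(data)
        self.save_csv()


if __name__ == '__main__':
    BookCrawl().run()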
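The task statement also asks for a second version that parses with XPath instead of BeautifulSoup. A minimal sketch of what that parse step could look like follows; the XPath expressions (//article, .//h2[@class="entry-title"]/a) are assumptions about the page markup and may need adjusting, and the function name parse_with_xpath is only illustrative.

from lxml import etree

def parse_with_xpath(data):
    # Build an lxml HTML tree from the page source returned by send_request
    html = etree.HTML(data)
    books = []
    # Assumed markup: each book sits in an <article>, with the title link
    # inside an <h2 class="entry-title"> element
    for article in html.xpath('//article'):
        title = article.xpath('.//h2[@class="entry-title"]/a/text()')
        link = article.xpath('.//h2[@class="entry-title"]/a/@href')
        if title and link:
            books.append({'title': title[0], 'link': link[0]})
    return books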