爬取书名、评分、评论人数、作者/译者、出版社、出版日期、售价,并保存到Excel和mysql数据库。
技术:运用多线程(每一个书籍类型创建一个线程)加速处理
注意:爬取时,只在请求每个列表页之前进行了随机延时(0~5 秒);在爬取具体书籍的评论人数时,没有进行延时处理。别爬太快,防止 IP 被封。
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
'''
@File : douban_spider.py
@Contact : raogx.vip@hotmail.com
@License : (C)Copyright 2019-2020, Liugroup-NLPR-CASIA
@Modify Time @Author @Version @Description
------------ ------- -------- -----------
2020/4/2 16:00 ligang 1.0 None
加 V 交流:15188607997
'''
import time
import urllib.request
from concurrent.futures import ThreadPoolExecutor
import numpy
import numpy as np
import requests
from bs4 import BeautifulSoup
from openpyxl import Workbook
# Pool of browser-like request headers (Chrome / Edge / IE).
# The crawler functions below pick one entry per HTTP request.
User_Agents = [
{'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'},
{'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36 Edge/15.15063'},
{'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'},
{'User-Agent':'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)'},
{'User-Agent':'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'}
]
def _split_book_desc(desc):
    """Split a description line into (author, publisher, pub_date, price).

    Fields are read from the END of the '/'-separated description, so any
    number of leading author/translator parts is supported. A field that is
    not present is reported as ' 暂无'.
    """
    missing = ' 暂无'
    if not desc.strip():
        return missing, missing, missing, missing
    parts = desc.split('/')
    price = parts[-1]
    pub_date = parts[-2] if len(parts) >= 2 else missing
    pub_info = parts[-3] if len(parts) >= 3 else missing
    author_info = '/'.join(parts[:-3]) if len(parts) >= 4 else missing
    return author_info, pub_info, pub_date, price


def book_spider(book_tag, max_pages=5):
    """
    Crawl the Douban tag listing pages for one tag and collect book details.

    :param book_tag: tag (book category) to crawl
    :param max_pages: number of listing pages to request (15 books per page)
    :return: list of [title, rating, people_num, author_info, pub_info,
             pub_date, price] rows, all values as strings
    """
    book_list = list()
    for page_num in range(0, max_pages):
        url = 'https://www.douban.com/tag/' + \
            urllib.request.quote(book_tag) + '/book?start=' + str(page_num * 15)
        # Random 0-5 s pause before every listing request to reduce the
        # chance of being rate-limited.
        time.sleep(numpy.random.rand() * 5)
        try:
            plain_text = requests.get(
                url, headers=User_Agents[page_num % len(User_Agents)], timeout=50).text
        except requests.RequestException as error:
            # RequestException is the base of all requests errors, so
            # connection failures are skipped too instead of killing the
            # worker thread.
            print(error)
            continue
        soup = BeautifulSoup(plain_text, 'lxml')  # lxml module is required.
        list_soup = soup.find('div', attrs={'class': 'mod book-list'})
        if list_soup is None:
            # No listing on this page: move on to the next page.
            continue
        if len(list_soup) <= 1:
            # An (almost) empty listing means there are no more results.
            break
        # Walk every book entry on the current page.
        for book_info in list_soup.find_all('dd'):
            title_tag = book_info.find('a', attrs={'class': 'title'})
            if title_tag is None:
                # Without a title link there is nothing usable in this entry.
                continue
            title = title_tag.get_text().strip()
            book_url = title_tag.get('href')

            desc_tag = book_info.find('div', attrs={'class': 'desc'})
            desc = desc_tag.get_text().strip() if desc_tag is not None else ''
            author_info, pub_info, pub_date, price = _split_book_desc(desc)

            # Rating shown in the listing; '0.0' when the book has none.
            rating_tag = book_info.find('span', {'class': 'rating_nums'})
            if rating_tag is not None and rating_tag.string:
                rating = rating_tag.string.strip()
            else:
                rating = '0.0'

            # The number of raters is only available on the book's own page.
            # Any failure there (network, unexpected markup) falls back to
            # the placeholder rather than aborting the crawl.
            try:
                people_num = get_people_num(book_url).strip('人评价')
            except Exception:
                people_num = '0.0'

            book_list.append([title, rating, people_num, author_info, pub_info, pub_date, price])
        print("Downloading Information From Tag: {1} Page: {0} ".format(page_num, book_tag))
    return book_list
# Visit a single book's page to read details missing from the listing.
def get_people_num(url):
    """
    Fetch the "number of raters" text from a book's detail page.

    :param url: URL of one specific book page
    :return: stripped text of the second <span> inside div.rating_sum
    """
    # Pick a random header set for this request.
    headers = User_Agents[np.random.randint(0, len(User_Agents))]
    response = requests.get(url, headers=headers, timeout=50)
    page = BeautifulSoup(str(response.text), 'lxml')
    rating_block = page.find('div', {'class': 'rating_sum'})
    spans = rating_block.findAll('span')
    return spans[1].string.strip()
def fetch_list(book_tag, book_dicts):
    """
    Crawl one tag and store its books, sorted by rating (highest first).

    :param book_tag: tag (book category) to crawl
    :param book_dicts: shared dict; the sorted rows are stored under book_tag
    :return: None
    """
    def rating_value(book):
        # Ratings are scraped as strings; compare them numerically so that
        # '10.0' ranks above '9.5'. Unparsable ratings sort as 0.0.
        try:
            return float(book[1])
        except (TypeError, ValueError):
            return 0.0

    book_dicts[book_tag] = sorted(
        book_spider(book_tag), key=rating_value, reverse=True)
def run_spider(book_tag_lists):
    """
    Crawl every tag concurrently, using one worker thread per tag.

    :param book_tag_lists: all tags (book categories) to crawl
    :return: dict mapping each tag to its list of book rows,
             e.g. {'文化': [...], '算法': [...]}; a tag whose worker failed
             maps to an empty list
    """
    book_dicts = dict()
    if not book_tag_lists:
        # ThreadPoolExecutor rejects max_workers=0, and there is nothing to do.
        return book_dicts
    with ThreadPoolExecutor(max_workers=len(book_tag_lists)) as executor:
        futures = {
            executor.submit(fetch_list, book_tag, book_dicts): book_tag
            for book_tag in book_tag_lists
        }
        for future, book_tag in futures.items():
            # submit() stores worker exceptions on the future; report them
            # here so a failed tag does not vanish silently.
            error = future.exception()
            if error is not None:
                print("Failed to crawl tag {0}: {1!r}".format(book_tag, error))
                book_dicts.setdefault(book_tag, [])
    return book_dicts
def output_to_excel(book_dicts, book_tag_lists):
    """
    Write all crawled rows to one Excel workbook, one sheet per tag.

    The workbook is saved in the current directory as
    'Book-List-<tag1>-<tag2>...xlsx'.

    :param book_dicts: dict mapping tag -> list of book rows
    :param book_tag_lists: tags to export, in sheet order
    :return: None
    """
    # NOTE(review): the header has a trailing '评论' column for which no data
    # is written; kept unchanged so the sheet layout stays the same.
    header = ['序号', '书名', '评分', '评论人数', '作者/译者', '出版社', '出版日期', '售价', '评论']
    wb = Workbook(write_only=True)
    for book_tag in book_tag_lists:
        ws = wb.create_sheet(title=book_tag)
        ws.append(header)
        # A tag with no crawled data still gets a header-only sheet
        # instead of raising KeyError.
        for index, book in enumerate(book_dicts.get(book_tag, []), start=1):
            ws.append([index] + list(book[:7]))
    # Build the output file name from all tags.
    file_name = 'Book-List' + ''.join('-' + book_tag for book_tag in book_tag_lists) + '.xlsx'
    wb.save(file_name)
if __name__ == '__main__':
    # Add further Douban tags to this list to crawl more categories.
    book_tag_lists = ['Python', '算法']
    # Crawl every tag, then export the results to an Excel workbook.
    book_dicts = run_spider(book_tag_lists)
    output_to_excel(book_dicts, book_tag_lists)
    print("----All Done----")
结果:
数据保存到 mysql数据库
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
'''
@File : douban_spider.py
@Contact : raogx.vip@hotmail.com
@License : (C)Copyright 2019-2020, Liugroup-NLPR-CASIA
@Modify Time @Author @Version @Description
------------ ------- -------- -----------
2020/4/2 16:00 ligang 1.0 None
加 V 交流:15188607997
'''
import threading
import time
import urllib.request
from concurrent.futures import ThreadPoolExecutor

import numpy
import numpy as np
import pymysql
import requests
from bs4 import BeautifulSoup
from openpyxl import Workbook
# Pool of browser-like request headers (Chrome / Edge / IE).
# The crawler functions below pick one entry per HTTP request.
User_Agents = [
{'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'},
{'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36 Edge/15.15063'},
{'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'},
{'User-Agent':'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)'},
{'User-Agent':'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'}
]
def _split_book_desc(desc):
    """Split a description line into (author, publisher, pub_date, price).

    Fields are read from the END of the '/'-separated description, so any
    number of leading author/translator parts is supported. A field that is
    not present is reported as ' 暂无'.
    """
    missing = ' 暂无'
    if not desc.strip():
        return missing, missing, missing, missing
    parts = desc.split('/')
    price = parts[-1]
    pub_date = parts[-2] if len(parts) >= 2 else missing
    pub_info = parts[-3] if len(parts) >= 3 else missing
    author_info = '/'.join(parts[:-3]) if len(parts) >= 4 else missing
    return author_info, pub_info, pub_date, price


def book_spider(book_tag, max_pages=5):
    """
    Crawl the Douban tag listing pages for one tag and save each book to MySQL.

    Every book is printed, passed to insert_data() and also collected in the
    returned list.

    :param book_tag: tag (book category) to crawl
    :param max_pages: number of listing pages to request (15 books per page)
    :return: list of [title, rating, people_num, author_info, pub_info,
             pub_date, price] rows, all values as strings
    """
    book_list = list()
    for page_num in range(0, max_pages):
        url = 'https://www.douban.com/tag/' + \
            urllib.request.quote(book_tag) + '/book?start=' + str(page_num * 15)
        # Random 0-5 s pause before every listing request to reduce the
        # chance of being rate-limited.
        time.sleep(numpy.random.rand() * 5)
        try:
            plain_text = requests.get(
                url, headers=User_Agents[page_num % len(User_Agents)], timeout=50).text
        except requests.RequestException as error:
            # RequestException is the base of all requests errors, so
            # connection failures are skipped too instead of killing the
            # worker thread.
            print(error)
            continue
        soup = BeautifulSoup(plain_text, 'lxml')  # lxml module is required.
        list_soup = soup.find('div', attrs={'class': 'mod book-list'})
        if list_soup is None:
            # No listing on this page: move on to the next page.
            continue
        if len(list_soup) <= 1:
            # An (almost) empty listing means there are no more results.
            break
        # Walk every book entry on the current page.
        for book_info in list_soup.find_all('dd'):
            title_tag = book_info.find('a', attrs={'class': 'title'})
            if title_tag is None:
                # Without a title link there is nothing usable in this entry.
                continue
            title = title_tag.get_text().strip()
            book_url = title_tag.get('href')

            desc_tag = book_info.find('div', attrs={'class': 'desc'})
            desc = desc_tag.get_text().strip() if desc_tag is not None else ''
            author_info, pub_info, pub_date, price = _split_book_desc(desc)

            # Rating shown in the listing; '0.0' when the book has none.
            rating_tag = book_info.find('span', {'class': 'rating_nums'})
            if rating_tag is not None and rating_tag.string:
                rating = rating_tag.string.strip()
            else:
                rating = '0.0'

            # The number of raters is only available on the book's own page.
            # Any failure there (network, unexpected markup) falls back to
            # the placeholder rather than aborting the crawl.
            try:
                people_num = get_people_num(book_url).strip('人评价')
            except Exception:
                people_num = '0.0'

            row = (title, rating, people_num, author_info, pub_info, pub_date, price, book_tag)
            print(row)
            # One failed insert must not abort the remaining books of this
            # tag; report it and carry on.
            try:
                insert_data(row)
            except Exception as error:
                print("Failed to save {0!r}: {1!r}".format(title, error))
            # Also collect the row so the documented return value is populated.
            book_list.append([title, rating, people_num, author_info, pub_info, pub_date, price])
        print("Downloading Information From Tag: {1} Page: {0} ".format(page_num, book_tag))
    return book_list
# Visit a single book's page to read details missing from the listing.
def get_people_num(url):
    """
    Fetch the "number of raters" text from a book's detail page.

    :param url: URL of one specific book page
    :return: stripped text of the second <span> inside div.rating_sum
    """
    # Pick a random header set for this request.
    headers = User_Agents[np.random.randint(0, len(User_Agents))]
    response = requests.get(url, headers=headers, timeout=50)
    page = BeautifulSoup(str(response.text), 'lxml')
    rating_block = page.find('div', {'class': 'rating_sum'})
    spans = rating_block.findAll('span')
    return spans[1].string.strip()
def fetch_list(book_tag, book_dicts):
    """
    Crawl one tag and store its books, sorted by rating (highest first).

    :param book_tag: tag (book category) to crawl
    :param book_dicts: shared dict; the sorted rows are stored under book_tag
    :return: None
    """
    def rating_value(book):
        # Ratings are scraped as strings; compare them numerically so that
        # '10.0' ranks above '9.5'. Unparsable ratings sort as 0.0.
        try:
            return float(book[1])
        except (TypeError, ValueError):
            return 0.0

    book_dicts[book_tag] = sorted(
        book_spider(book_tag), key=rating_value, reverse=True)
def run_spider(book_tag_lists):
    """
    Crawl every tag concurrently, using one worker thread per tag.

    :param book_tag_lists: all tags (book categories) to crawl
    :return: dict mapping each tag to its list of book rows,
             e.g. {'文化': [...], '算法': [...]}; a tag whose worker failed
             maps to an empty list
    """
    book_dicts = dict()
    if not book_tag_lists:
        # ThreadPoolExecutor rejects max_workers=0, and there is nothing to do.
        return book_dicts
    with ThreadPoolExecutor(max_workers=len(book_tag_lists)) as executor:
        futures = {
            executor.submit(fetch_list, book_tag, book_dicts): book_tag
            for book_tag in book_tag_lists
        }
        for future, book_tag in futures.items():
            # submit() stores worker exceptions on the future; report them
            # here so a failed tag does not vanish silently.
            error = future.exception()
            if error is not None:
                print("Failed to crawl tag {0}: {1!r}".format(book_tag, error))
                book_dicts.setdefault(book_tag, [])
    return book_dicts
def connect_mysql():
    """
    Open the MySQL connection and store it in the module-level ``conn``.

    insert_data() and close_mysql() both use this shared connection, so this
    must be called before any row is inserted.
    """
    global conn
    # Keyword arguments are required: PyMySQL 1.0+ no longer accepts
    # positional connection parameters.
    # utf8mb4 is requested explicitly because the stored text is Chinese.
    # NOTE(review): credentials are hard-coded; move them to configuration
    # before using this outside a local test database.
    conn = pymysql.connect(
        host='localhost',
        user='root',
        password='123456',
        database='shuping',
        charset='utf8mb4',
    )
# Serialises access to the shared connection: insert_data() is called from
# several crawler threads, and one PyMySQL connection must not be used by
# more than one thread at a time.
_DB_LOCK = threading.Lock()


def insert_data(data=''):
    """
    Insert one book row into the ``doubanbook`` table and commit it.

    :param data: sequence of 8 values in the order (title, rating,
                 people_num, author_info, pub_info, pub_date, price, booktype)
    :raises Exception: any error from the database driver is re-raised after
                       the transaction has been rolled back
    """
    # Parameterised statement: values are escaped by the driver.
    sql = """insert into doubanbook(title, rating, people_num, author_info, pub_info, pub_date, price, booktype) value(%s, %s, %s, %s, %s, %s, %s, %s)"""
    with _DB_LOCK:
        try:
            # The cursor is closed automatically when the block exits.
            with conn.cursor() as cursor:
                cursor.execute(sql, data)
            conn.commit()
        except Exception:
            # Leave the connection in a clean state, then let the caller
            # see the original error.
            conn.rollback()
            raise
def close_mysql():
    """
    Close the shared MySQL connection if one is currently open.

    Safe to call when connect_mysql() was never called or when the
    connection has already been closed; in both cases it does nothing.
    """
    connection = globals().get('conn')
    if connection is not None and connection.open:
        connection.close()
if __name__ == '__main__':
    connect_mysql()
    try:
        # Add further Douban tags to this list to crawl more categories.
        book_tag_lists = ['Python', '算法']
        book_dicts = run_spider(book_tag_lists)
    finally:
        # Always release the database connection, even if the crawl fails.
        close_mysql()
    print("----All Done----")
结果: