前言
相信很多观众老爷们喜欢耍豆瓣啥的,看看书,喝喝茶的(ps:拉倒吧),哈哈哈,经过10800秒的研究,我分析了豆瓣图书的结构,发现在不登录的情况下,短评论只能爬取220条的数据,但是!!!注意!!!!
长评论是没有限制的啊,那还不用力抓???原文摘录和读书笔记也都是没有限制的,还不抓???安排
首先安排给观众老爷们看看效果
兴奋起来了不,来,我们上源码,然后尝试使用
# -*- coding:utf-8 -*-
import re
import threading
import time
from random import randint
import pymongo
import requests
from lxml import etree
from tqdm import tqdm
class DouBanBook():
'''豆瓣书评、摘录、读书笔记抓取'''
def __init__(self, *args):
self.book_id = args
self.base_url = 'https://book.douban.com/subject/'
self.book_content_url = [] # 获取图书主页url
self.book_original_url = [] # 获取图书原文摘录url
self.book_comments_url = [] # 获取图书评论url
self.book_notes_url = [] # 获取图书笔记url
self.book_name = [] # 获取书名
self.proxy = {
'http':'http://127.0.0.1:8080'
}
self.headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36'
}
def config_url(self):
    """Fill the per-book URL lists from the IDs in ``self.book_id``.

    For every ID, the subject URL is ``self.base_url`` followed by the
    stringified ID. That URL and its '/blockquotes', '/reviews' and
    '/annotation' variants are appended to the matching lists. The lists
    are appended to, not reset, so calling this again adds the same
    entries a second time.
    """
    print('配置起始地址...')
    # (attribute holding the target list, path suffix), in append order.
    destinations = (
        ('book_content_url', ''),
        ('book_original_url', '/blockquotes'),
        ('book_comments_url', '/reviews'),
        ('book_notes_url', '/annotation'),
    )
    # tqdm wraps the iterable to display a progress bar while looping.
    for identifier in tqdm(self.book_id):
        subject_url = self.base_url + str(identifier)
        for attr_name, suffix in destinations:
            getattr(self, attr_name).append(subject_url + suffix)
def get