Runtime environment: Python 3.6
Update 2019-05-24: the original pages were redesigned, so a new scraping demo for [纵横中文网 (book.zongheng.com)](http://book.zongheng.com) has been added.
- The site has anti-scraping measures that can make the crawler error out; the following two workarounds are tested and work (see the sketch after this list):
  - Add proxy IPs; I wrote a proxy-IP extraction API -> link;
  - Add the Cookie generated by visiting the site in a browser to the request headers.
- This crawler cannot correctly fetch content that requires VIP access.
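As a concrete reference for the two workarounds above, here is a minimal sketch of applying them to a `requests` session like the one the spider uses. The proxy address and Cookie string are placeholders, not values from this project; take a live proxy from your own extraction API and copy the Cookie from your browser's developer tools.

```python
# -*- coding: utf-8 -*-
import requests

session = requests.session()

# Workaround 1: route requests through a proxy IP.
# The address below is a placeholder; fetch a live one from your proxy pool.
session.proxies.update({
    'http': 'http://127.0.0.1:8888',
    'https': 'http://127.0.0.1:8888',
})

# Workaround 2: reuse the Cookie generated by visiting the site in a browser.
# The value below is a placeholder; copy yours from the browser's dev tools.
session.headers.update({'Cookie': '<cookie-string-from-browser>'})

resp = session.get('http://book.zongheng.com/')
print(resp.status_code)
```

Either measure alone was enough in my tests; they can also be combined if requests still fail.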
# -*- coding: utf-8 -*-
# @Author : Leo

import re
import os
import logging

import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
logging.basicConfig(level=logging.INFO,  # lowest level that gets emitted
                    format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
                    datefmt='%a, %d %b %Y %H:%M:%S')
class ZonghengSpider:
    """
    Crawler for 纵横中文网
    - http://book.zongheng.com/
    """
    # Root directory under which novels are saved
    novel_save_dir = 'novels'
    session = requests.session()
    # Retry failed requests up to 3 times
    session.mount('http://', HTTPAdapter(max_retries=3))
    session.mount('https://', HTTPAdapter(max_retries=3))
    def __init__(self):
        self.session.headers.update(
            {
                'Host': 'book.zongheng.com',
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) '
                              'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36'
            })
        # URL template for the chapter-info API (returns chapter data as JSON)
        self.chapter_url = 'http://book.zongheng.com/api/chapter/chapterinfo?bookId={book_id}&chapterId={chapter_id}'
    def crawl(self, target_url: str):
        """
        Start crawling from the given URL.
        :param target_url: URL of the book page to crawl
        :return:
        """

        def request_url(url):
            # Return the response body parsed as JSON, or None on a non-200 status
            resp = self.session.get(url=url)
            if resp.status_code == 200:
                return resp.json()
            return None
        book_name, book_id, chapter_id = self.get_page_info(target_url)
        logging.info(f'Book name: {book_name}, book ID: {book_id}, first chapter ID: {chapter_id}')
        if all([book_name, book_id, chapter_id]):
            # Directory this book's chapters are saved into
            novel_save_path = os.path.join(self.novel_save_dir, book_name)
            if not os.path.exists(novel_save_path):
                os.makedirs(novel_save_path)
            logging.info(f'Book save path: {novel_save_path}')
            index = 0
            while True:
                index += 1
                chapter_url = self._get_chapter_url(book_id, chapter_id)