最近闲来无事,拿来练练手。
注:
由于网站可能会变动,本代码不保证后面一直都能用,仅讲述抓取的思路;
个人纯属研究使用,请不要应用于商业目的;
使用语言:Python
版本:3.4.3
依赖:BeautifulSoup、requests(可以使用pip install进行安装)
代码也比较简单,直接贴上来:
HttpClient.py
# -*- coding: utf-8 -*-
import requests
def make_request(url):
    """Fetch *url* with HTTP GET and return the requests.Response.

    Uses a (connect, read) timeout of (30, 90) seconds and logs the
    response's content type and encoding diagnostics for debugging.

    :param url: absolute URL to fetch
    :return: the requests.Response object (body not decoded here)
    :raises requests.RequestException: on connection failure or timeout
    """
    print('make_request: ', url)
    r = requests.get(url, timeout=(30, 90))
    # .get() instead of [] so a response without a Content-Type header
    # prints None rather than raising KeyError.
    print('content-type: ', r.headers.get('content-type'))
    print('encoding: ', r.encoding)
    print('apparent_encoding: ', r.apparent_encoding)
    return r
Kanunu8.py
# -*- coding: utf-8 -*-
import os
import sys
import re
import encodings
#为了能 import 上一级目录中的模块(如 util、HttpClient),把父目录加入模块搜索路径
sys.path.append("..")
# 解决gb2312乱码问题:gb18030 是 gb2312 的超集,把别名映射过去即可正确解码该站页面
encodings.aliases.aliases['gb2312'] = 'gb18030'
from bs4 import BeautifulSoup
from _pyio import open
from util import *
# Module-level state: filled in by the __main__ section below.
book_url = ''
book_name = ''
# Chapter lists also contain links to the author ("writer") page; this
# pattern identifies them so they can be skipped.
writer_link_pattern = re.compile(r'.*/writer/\d+\.html')
# On Windows a file name may not contain \ / : * ? " < > | — this pattern
# matches every such character so they can be stripped from the book title.
window_illegal_file_name_pattern = re.compile(r'[\\|/|:|\*|\?|"|<|>|\|]')
def find_tbody(tag):
    """BeautifulSoup filter: match the <tbody> holding the chapter list.

    A <tbody> matches when it is a leaf (no nested <tbody>) whose first
    <strong> reads '正文' (main text), or when its text contains
    '发布时间' (publish time).

    :param tag: a bs4 Tag candidate
    :return: True if the tag looks like the chapter-list table body
    """
    if tag.name != 'tbody':
        return False
    if tag.find('tbody') is None:
        strong = tag.find('strong')
        # Guard added: the original did tag.find('strong').string and
        # raised AttributeError on a leaf <tbody> with no <strong>.
        if strong is not None and strong.string == '正文':
            return True
    if '发布时间' in tag.get_text():
        return True
    return False
def strong_with_no_href(tag):
    """BeautifulSoup filter: a <strong> wrapping a <font> but no <a> link.

    Used to locate chapter titles that are not wrapped in a heading tag.
    """
    if tag.name != 'strong':
        return False
    has_no_link = tag.a is None
    has_font = tag.font is not None
    return has_no_link and has_font
def find_title(tag):
    """Extract the chapter title from a parsed chapter page.

    Pages are inconsistent: the title may live in an <h1>, an <h2>, or a
    bare <strong><font> pair — try each location in that order.
    """
    for heading in (tag.h1, tag.h2):
        if heading is not None:
            return heading.font.string
    # Fall back to a <strong><font> title with no link inside.
    return tag.find(strong_with_no_href).font.string
def make_soup(html):
    """Parse raw HTML bytes with the stdlib html.parser backend.

    Logs BeautifulSoup's encoding-detection results, which is useful when
    diagnosing the gb2312/gb18030 mojibake this site produces.
    """
    parsed = BeautifulSoup(html, "html.parser")
    print('original_encoding: ', parsed.original_encoding, ', declared_html_encoding: ', parsed.declared_html_encoding, ', from_encoding: ', parsed.from_encoding)
    return parsed
def get_legal_window_file_name(name):
    """Return *name* with Windows-illegal filename characters stripped.

    A ``None`` title (page without a <title>) falls back to 'unknown'.
    """
    if name is not None:
        return window_illegal_file_name_pattern.sub('', name)
    return 'unknown'
if __name__ == '__main__':
    book_url = input('请输入电子书URL:')
    # Fetch the book's table-of-contents page.
    toc_response = HttpClient.make_request(book_url)
    soup = make_soup(toc_response.content)
    # The page <title> doubles as the book name; sanitize it so it is a
    # legal Windows file name.
    book_name = soup.find('title').string
    path = './' + get_legal_window_file_name(book_name) + '.txt'
    # Collect chapter links: on this site the chapter table is a <tbody>
    # with more than one row.
    links = []
    for tbody in soup.find_all('tbody'):
        if len(tbody.find_all('tr')) > 1:
            # find_all always returns a list (possibly empty), never None,
            # so no None-check is needed before extending.
            links.extend(tbody.find_all('a'))
    # Chapter hrefs are relative; derive the directory URL of the TOC page.
    if book_url.endswith('.html'):
        parent_url = book_url[0:book_url.rindex('/') + 1]
    else:
        parent_url = book_url
    with open(path, 'w', encoding="utf-8") as f:
        for link in links:
            # Skip links that point to the author's page, not a chapter.
            if writer_link_pattern.match(link['href']):
                continue
            print('\n', link.string)
            url = parent_url + link['href']
            print(url)
            chapter_response = HttpClient.make_request(url)
            chapter_soup = make_soup(chapter_response.content)
            chapter_name = find_title(chapter_soup)
            # Chapter title, surrounded by blank lines.
            f.write('\n\n')
            f.write(chapter_name)
            f.write('\n\n')
            # Chapter body: presumably the first <p> holds the whole text on
            # these pages — TODO confirm; later paragraphs would be dropped.
            # NOTE(review): replace('<br/>', '') on get_text() output looks
            # like a no-op (get_text already strips tags) — kept for parity.
            f.write(chapter_soup.find('p').get_text().replace('<br/>', ''))
            # Flush after each chapter so a crash mid-book keeps progress.
            f.flush()
    print('电子书已成功保存: ', path)
遇到的问题:
不同的书(甚至章节)标题内容、字体(h1,h2...)、标签结构都不同;
编码问题,抓下来是乱码,具体原因及解决办法见上文中将 gb2312 别名映射为 gb18030 的处理(站点声明 gb2312 但实际包含 gb18030 字符);
应该是为了增加爬取的难度吧,不过只能针对遇到的问题进行分析、解决;
转载于:https://blog.51cto.com/lbrant/1688440