爬取网站:https://www.biquge.cm
话不多说,上代码
# coding=gbk
from bs4 import BeautifulSoup
import requests
import random
import os
# NOTE: the original code declared each of these with a module-level
# `global` statement, which is a no-op outside a function body; the
# statements were removed. Names are unchanged so the functions below
# that reference them still work.

# Pool of User-Agent strings; one is picked at random per run so the
# scraper does not always present the same client fingerprint.
lots_headers = [
    "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0",
    "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)",
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
    'Opera/9.25 (Windows NT 5.1; U; en)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
    'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
    'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
    "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0",
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
]
# Request headers shared by every HTTP call in this script.
headers = {'User-Agent': random.choice(lots_headers)}
# Site root; chapter hrefs on the index page are relative to this.
baseurl = 'https://www.biquge.cm'
# Index (table-of-contents) page of the novel to download.
url = 'https://www.biquge.cm/8/8804'
# Local directory the chapter .txt files are saved into.
savePath = 'G:\\牧神记'
def createFile(path):
    """Ensure *path* exists as a directory and make it the current working directory.

    Uses os.makedirs with exist_ok=True instead of the original
    exists()-then-mkdir pair: it also creates missing intermediate
    directories (os.mkdir raised FileNotFoundError when the parent did
    not exist) and avoids the check-then-create race.
    """
    os.makedirs(path, exist_ok=True)
    # Later writes use bare relative filenames, so chdir into the target.
    os.chdir(path)
def downfile():
    """Download every chapter of the novel indexed at the module-level `url`
    into the current working directory, one UTF-free .txt file per chapter.

    Reads the module-level globals `url`, `baseurl` and `headers`.
    A failure on one chapter is printed and skipped so the remaining
    chapters still download (best-effort, as in the original).
    """
    res = requests.get(url, headers=headers)
    # The site serves gbk; decode explicitly so chapter titles are not
    # garbled. (Equivalent alternative: str(res.content, 'gbk').)
    html_doc = res.content.decode('gbk')
    soup = BeautifulSoup(html_doc, 'html.parser')
    # Every <a> inside div#list is one chapter link on the index page.
    chapter_links = soup.find('div', id='list').find_all('a')
    for a in chapter_links:
        try:
            zjurl = baseurl + a.attrs['href']  # zjurl: chapter URL
            bookName = a.text + '.txt'
            res_sub = requests.get(zjurl, headers=headers)
            # Decode chapter body as gbk too, for the same reason as above.
            html_sub_doc = res_sub.content.decode('gbk')
            soup_sub = BeautifulSoup(html_sub_doc, 'html.parser')
            text = soup_sub.find('div', id='content').text
            # Paragraphs are indented with four \xa0 characters; turn each
            # run into a newline so the saved text reads naturally and the
            # gbk encoder is not fed un-encodable \xa0.
            content = text.replace('\xa0\xa0\xa0\xa0', '\n')
            # Context manager fixes the original's handle leak when
            # write() raised; explicit encoding removes the dependency on
            # the locale default, and errors='replace' keeps a stray
            # un-encodable character from aborting the whole chapter.
            with open(bookName, 'w', encoding='gbk', errors='replace') as f:
                f.write(content)
            print('下载完成,章节名:' + bookName)
        except Exception as e:
            # Best-effort: report the failed chapter and continue.
            print(e)
def main():
    """Script entry point: prepare the save directory, then fetch the book."""
    # Create (if needed) and switch into the target folder first, since
    # downfile() writes chapter files with relative names.
    createFile(savePath)
    print("开始下载牧神记...")
    downfile()


if __name__ == '__main__':
    main()
注释已经挺详细的了,方便以后查看。
遇到的问题1:乱码
如果遇到文件名乱码,或者是文件内容乱码,一定要查看编码方式。
文件乱码解决方法:
文件头加上编码方式说明:# coding=gbk
网页内容解析乱码解决方法:
对网页内容解析的时候,要对其content采用decode方法,即解码为gbk。(关于解码/编码方式我在下一篇文章单独说明一下)举例: html_doc = res.content.decode('gbk')
或者采用str()将content编码方式转为gbk。举例:html_doc = str(res.content, 'gbk')
遇到的问题2:\xa0需替换,否则内容为空,并出现如下提示
提示:`'gbk' codec can't encode character '\xa0' in position 0: illegal multibyte sequence`
解决方法:替换或去掉\xa0
方法1:content = text.replace('\xa0\xa0\xa0\xa0', '\n')
方法2:content = "".join(text.split())