书籍主页
疫情期间无事,今天试着爬取一部科幻小说。
代码主要分为两个部分。
一是书籍主页源码爬取分析,在这里获取各章节的url。
二是单独一章的内容爬取。
最后使用列表循环即可实现所有章的爬取。
代码如下:
#coding=gbk
import re
from html import unescape
from urllib.parse import urljoin

import requests
# Browser-like request headers sent with every page fetch.
headers = {
    # Only advertise encodings the HTTP client can actually decode. "sdch"
    # was removed: a server honouring it would send a body that cannot be
    # decoded into text.
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-US,en;q=0.8',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Referer': 'http://www.wikipedia.org/',
    'Connection': 'keep-alive',
}
def require(url, timeout=10):
    """Download a page and return its HTML as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled
            connection.

    Returns:
        The response body decoded to ``str``.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    # Decode with the encoding guessed from the body rather than the one
    # inferred from the HTTP headers.
    response.encoding = response.apparent_encoding
    return response.text
# Index page of the book; chapter links found on it are resolved against it.
INDEX_URL = 'https://www.kanunu8.com/book3/6655/index.html'

# Raw HTML source of the index page.
html = require(INDEX_URL)

# Keep a local copy of the index page source.
with open('chapter_list.txt', 'wb') as f:
    f.write(html.encode('utf-8'))

# Coarse match first: the part of the chapter table that follows its first row.
chapter_list = re.findall(
    '<table cellspacing="1" cellpadding="8".*?</tr>(.*?)</table>', html, re.S)

# Fine match: search the matched HTML itself. Searching str(chapter_list)
# would scan the list's repr, where special characters are escaped.
chapters_urls = re.findall(
    '<a href="(.*?)">.*?</a>', '\n'.join(chapter_list), re.S)

# Absolute address of every chapter. urljoin handles plain relative links
# (the previous concatenation case) as well as absolute or root-relative ones.
urls_list = [urljoin(INDEX_URL, href) for href in chapters_urls]
print(urls_list)
def one_chapter(one_url):
    """Fetch a chapter page and return the file name to save it under.

    The title is the text of the first ``size="4"`` font element on the
    page. It is cleaned so it is safe to use as a file name: markup is
    removed, HTML entities are decoded, whitespace (including the
    ideographic space) is collapsed to single spaces, and characters that
    are not allowed in file names are replaced with ``_``.

    Args:
        one_url: Address of the chapter page.

    Returns:
        ``"<title>.txt"``. When no title is found on the page, the last
        path segment of ``one_url`` (without its extension) is used instead.
    """
    one_html = require(one_url)
    titles = re.findall('size="4">(.*?)</font>', one_html)

    # Work on the matched text itself. Converting the whole list with str()
    # leaked the list's brackets and quotes into the name and produced
    # "[].txt" when nothing matched.
    title = titles[0] if titles else ''
    title = re.sub(r'<[^>]+>', '', title)  # drop nested tags
    title = unescape(title)
    # NOTE(review): the purpose of the 'AAA' marker is not visible here; the
    # replacement is carried over unchanged from the previous version.
    title = title.replace('AAA', ' ')
    # split() treats every Unicode whitespace character as a separator, so
    # this also turns U+3000 into a plain space and trims both ends.
    title = ' '.join(title.split())

    if not title:
        # No usable title on the page: derive a name from the URL instead.
        title = one_url.rstrip('/').rsplit('/', 1)[-1].rsplit('.', 1)[0]

    # Replace characters that cannot appear in file names.
    title = re.sub(r'[\\/:*?"<>|]', '_', title) or 'chapter'
    return title + '.txt'
def two_chapter(one_url):
    """Fetch a chapter page and return its body text.

    The text is taken from every ``<p>...</p>`` element on the page.
    ``<br />`` tags, carriage returns, ideographic spaces and non-breaking
    spaces become plain spaces; HTML entities are decoded; newlines are
    kept as they are.

    Args:
        one_url: Address of the chapter page.

    Returns:
        The chapter text, with paragraphs separated by a newline. An empty
        string is returned when the page contains no ``<p>`` element.
    """
    one_html = require(one_url)
    paragraphs = re.findall('<p>(.*?)</p>', one_html, re.S)

    # Process the real strings. Going through str(list) escaped backslashes,
    # left "', '" between paragraphs and leaked the list brackets whenever
    # the text itself contained a quote.
    text = '\n'.join(paragraphs)
    # Replace the tag before decoding entities so an escaped "&lt;br /&gt;"
    # in the text is not mistaken for markup.
    text = text.replace('<br />', ' ')
    text = unescape(text)

    blanks = str.maketrans({'\u3000': ' ', '\xa0': ' ', '\r': ' '})
    return text.translate(blanks)
def receive(title, content):
    """Write one chapter to disk.

    Args:
        title: Path of the file to create; an existing file is overwritten.
        content: Chapter text, stored UTF-8 encoded.
    """
    payload = bytes(content, encoding='utf-8')
    with open(title, 'wb') as out_file:
        out_file.write(payload)
"""保存所有章"""
# Save every chapter: work out its file name and text, then write it to disk.
# NOTE: one_chapter and two_chapter each download the page themselves, so
# every chapter is requested twice.
for chapter_address in urls_list:
    file_name = one_chapter(chapter_address)
    chapter_text = two_chapter(chapter_address)
    receive(file_name, chapter_text)
本人对os模块还未学习,照抄网上的也总是失败,就放弃了对该模块的使用。