提示:文章写完后,目录可以自动生成,如何生成可参考右边的帮助文档
前言
在大数据时代,互联网中拥有海量的信息数据,对个人而言,这些数据有的是有价值的,有的是没有价值的。那么,如何在海量信息中找到并快速获取我们所需的大量数据?爬虫,就是很好的一种获取数据的方式。本章,我以网页小说爬取为例,为大家分享python爬虫的一些经验和遇到的一些问题。
一、爬虫所需库
requests:python的第三方库,专门用于发送HTTP请求。
安装requests库:pip install requests
二、网页分析
1.小说章节网页分析
以爬取某网站《深空**》为例,目标url:www.****.in/book/12793/
选取其中某一章,检查网页,可以找到这本小说所有章节的链接和名称。由此可以得出章节的路径xpath://div[@class="panel panel-default"]/dl/dd/a/@href(有点懵???没关系,我们可以直接在Google中查看一个章节的xpath,如下)
获取xpath演示
eg:某章的xpath为/html/body/div[2]/div[2]/dl/dd[1]/a
抓取章节代码如下(示例):
# Excerpt from the full script in section 3.1 below; `headers`, `requests`,
# `etree` and `url_List` are defined there.
url = "https://www.xbxwx.in/book/12793/"  # chapter-index URL of the novel
response = requests.get(url, headers=headers)
response.encoding = 'GB2312'  # the site serves GB2312-encoded HTML
html = etree.HTML(response.text)
# The <a href> values are bare filenames (e.g. "38415404.html"),
# so prefix the book URL to get absolute chapter links.
url_j_list = ['https://www.xbxwx.in/book/12793/' + x for x in html.xpath('//div[@class="panel panel-default"]/dl/dd/a/@href')]
url_List.append(url_j_list)
2.小说内容网页分析
同上所述,可以得出小说内容的xpath://*[@id="htmlContent"]/text()
抓取小说内容代码如下(示例):
# Excerpt from the full script in section 3.2 below; `url_x`, `headers`,
# `path1` and the loop index `i` are defined there.
rep = requests.get(url_x, headers=headers)
# Detect the page's real charset instead of assuming GB2312.
encoding = chardet.detect(rep.content)['encoding']
rep.encoding = encoding
dom = etree.HTML(rep.text)
# //*[@id="content"]/div[1]/h1/text()  -- xpath of the chapter title
name = dom.xpath('//*[@id="content"]/div[1]/h1/text()')[0]
text = dom.xpath('//*[@id="htmlContent"]/text()')
with open(path1 + '/' + f'{name}_{i + 1}.txt', 'w', encoding='utf-8') as f:
    for con in text:
        f.write(con)
print(f'{name}_{i + 1} 下载完成')
三、代码实例
1.获取小说章节路径
import requests
from lxml import etree
import xlwt
# Output directory. NOTE(review): the raw string ends with "\ " (backslash +
# space) because a raw string cannot end in a single backslash; the trailing
# space becomes part of any path built from it -- confirm this is intended.
path = r'D:\Python_project\xiaoshuo\ '
# Request headers: a Referer plus a desktop User-Agent so the site serves
# the normal (non-bot) pages.
headers = {
    "Referer": "https://www.***.in/book/12793/",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1"
}
def get_urls():
    """Collect the absolute URL of every chapter of the novel.

    The site splits the chapter index over several pages (about 60 chapters
    per page); the page paths are listed as <option value="..."> entries of
    a <select class="form-control"> element, e.g.:

        <option value="/book/12793/">page 1</option>
        <option value="/book/12793/index_2.html">page 2</option>
        ...
        <option value="/book/12793/index_25.html" selected="">page 25</option>

    Returns:
        list[str]: chapter URLs, in reading order.
    """
    base = 'https://www.***.in'
    index_url = base + '/book/12793/'
    response = requests.get(index_url, headers=headers)
    response.encoding = 'GB2312'  # the site serves GB2312-encoded HTML
    html = etree.HTML(response.text)
    # @value holds site-absolute paths (leading '/'), so join them onto the
    # bare domain; the original 'https://www.***.in/' + x concatenation
    # produced a double slash.
    page_urls = [base + value for value in
                 html.xpath('//select[@class="form-control"]/option/@value')]
    chapter_urls = []
    for page_index, page_url in enumerate(page_urls):
        # Page 1 was already fetched above -- only re-fetch later pages.
        if page_index != 0:
            response = requests.get(page_url, headers=headers)
            response.encoding = 'GB2312'
            html = etree.HTML(response.text)
        # Chapter hrefs are bare filenames such as "38415404.html":
        #     <dd class="col-md-3"><a href="38415404.html" ...>...</a></dd>
        # NOTE(review): in the pasted original the extraction appeared to sit
        # inside the `if`, which would silently drop page 1's chapters; the
        # paste lost its indentation, so extracting on every page is the
        # presumed intent -- confirm against the original file.
        chapter_urls.extend(
            base + '/book/12793/' + href
            for href in html.xpath(
                '//div[@class="panel panel-default"]/dl/dd/a/@href')
        )
    return chapter_urls
def main():
    """Scrape every chapter URL and store the list in an .xls workbook."""
    chapter_urls = get_urls()
    # One workbook, one sheet: column 0 = sequence number, column 1 = URL.
    workbook = xlwt.Workbook(encoding='GB2312')
    sheet = workbook.add_sheet('深空彼岸小说章节网站')
    for column, title in enumerate(['序号', '网址']):
        sheet.write(0, column, title)
    row = 1
    for chapter_url in chapter_urls:
        sheet.write(row, 0, row)  # sequence number matches the row index
        sheet.write(row, 1, chapter_url)
        row += 1
    workbook.save('xiaoshuo.xls')


if __name__ == '__main__':
    main()
2.爬取小说内容
import requests
from lxml import etree
import xlrd
import time
import random
import chardet # 自动检测编码格式
# Directory for single-page chapters. NOTE(review): the raw string ends with
# "\ " (backslash + space) because a raw string cannot end in a single
# backslash; the trailing space becomes part of the file names built from it.
path = r'D:\Python_project\xiaoshuo\ '
# Directory for multi-page chapters (one .txt file per page).
path1 = r'D:\Python_project\xiaoshuo\xs'
# Request headers: a Referer plus a desktop User-Agent so the site serves
# the normal (non-bot) pages.
headers = {
    "Referer": "https://www.***.in/book/12793/",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1"
}
def _fetch_dom(page_url):
    """GET *page_url* and return its parsed lxml DOM.

    The encoding is auto-detected with chardet instead of hard-coding
    'GB2312', because some pages are served with a different charset.
    """
    rep = requests.get(page_url, headers=headers)
    rep.encoding = chardet.detect(rep.content)['encoding']
    return etree.HTML(rep.text)


def _extract_chapter(dom):
    """Return (title, list of body-text fragments) of the chapter in *dom*."""
    # Title xpath: //*[@id="content"]/div[1]/h1/text()
    name = dom.xpath('//*[@id="content"]/div[1]/h1/text()')[0]
    text = dom.xpath('//*[@id="htmlContent"]/text()')
    return name, text


def get_text(url):
    """Download one chapter -- all of its pages -- and save it as .txt files.

    A long chapter is split across several pages, e.g.:

        page 1: https://www.***.in/book/12793/38415404.html
        page 2: https://www.***.in/book/12793/38415404_2.html

    The page count appears in the <small> element inside the chapter
    heading; when that element is absent the chapter is a single page.
    """
    dom = _fetch_dom(url)
    # Page-count marker (//*[@id="content"]/div[1]/h1/small); an empty
    # result means the chapter fits on one page.
    strnum = dom.xpath('//*[@id="content"]/div[1]/h1/small/text()')
    if not strnum:
        # Single page: reuse the DOM fetched above (the original re-fetched
        # the very same URL a second time for no benefit).
        name, text = _extract_chapter(dom)
        with open(path + f'{name}.txt', 'w', encoding='utf-8') as f:
            f.writelines(text)
        print(f'{name} 下载完成')
    else:
        # NOTE(review): this takes the single character at offset 3 of the
        # marker text as the total page count, which breaks for counts >= 10
        # -- confirm the marker format and parse it with a regex if needed.
        num = int(strnum[0][3:4])
        for i in range(num):
            # Page 1 keeps the original URL; page N appends "_N" before ".html".
            page_url = url if i == 0 else url[:-5] + '_' + str(i + 1) + '.html'
            name, text = _extract_chapter(_fetch_dom(page_url))
            with open(path1 + '/' + f'{name}_{i + 1}.txt', 'w',
                      encoding='utf-8') as f:
                f.writelines(text)
            print(f'{name}_{i + 1} 下载完成')
def main():
    """Read the chapter URLs stored in xiaoshuo.xls and download each one."""
    # The workbook was produced by the companion chapter-list script.
    workbook = xlrd.open_workbook('xiaoshuo.xls')
    sheet = workbook.sheet_by_name('深空彼岸小说章节网站')
    print(sheet.name, sheet.ncols, sheet.nrows)
    # Column 1 of the first sheet holds the URLs; row 0 is the header row.
    url_column = workbook.sheet_by_index(0).col_values(1)
    # The slice selects the batch of chapters still to be downloaded.
    for chapter_url in url_column[1264:1269]:
        get_text(chapter_url)
        # Random pause between requests so the site is not hammered.
        time.sleep(random.randint(1, 3))


if __name__ == '__main__':
    main()
总结
以上就是关于python爬虫相关的内容,本文仅仅简单介绍了爬虫在网页小说中的使用案例,后续我将继续更新python爬虫相关的应用案例,谢谢。