Imports
import requests
import lxml  # lxml is a Python parsing library; it supports HTML and XML parsing as well as XPath, and it is very fast
from bs4 import BeautifulSoup
import pandas as pd
import os
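As the comment on the lxml import notes, lxml also supports XPath queries directly. A minimal standalone sketch (the HTML snippet and href are made up for illustration, not taken from the target site):

from lxml import etree
doc = etree.HTML('<div><span class="s2"><a href="/novel/1/">Book</a></span></div>')
print(doc.xpath('//span[@class="s2"]/a/@href'))  # -> ['/novel/1/']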
The headers come from the browser: press F12 -> Network -> refresh the page -> click any request name -> find User-Agent under Headers.
headers={'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0'}
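Optional (my addition, not in the original): since this crawler makes many requests, you could attach the headers to a requests.Session once and reuse it everywhere instead of passing headers each time:

session = requests.Session()
session.headers.update(headers)  # every session.get() now sends the User-Agent automatically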
Fetch the page source, parse it into an HTML tree, and pull out what we want
url='http://www.biquge.com.tw/lishi/'
yuanma_caidanye=requests.get(url,headers=headers)  # source of the menu page
yuanma_caidanye.encoding='gbk'  # the site serves GBK-encoded pages
html_yuanma_caidanye=BeautifulSoup(yuanma_caidanye.text,'lxml')  # parse the source into a soup tree
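# Optional (not in the original): instead of hard-coding 'gbk', you can let
# requests guess the charset from the response body:
#   yuanma_caidanye.encoding = yuanma_caidanye.apparent_encoding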
lianjie=[]
a=html_yuanma_caidanye.find_all('span',class_='s2')  # the link to each novel lives in one of these spans
for i in a:
    lianjie.append(i.a['href'])
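An equivalent way to collect the same links with a CSS selector (a sketch of an alternative, not what the original uses):

lianjie = [a['href'] for a in html_yuanma_caidanye.select('span.s2 a')]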
From the novel links obtained above, fetch the next level down. This time use html.parser, which parses these pages completely; it works better here than lxml did above.
title=[]
lianjie_zhangjie=[]
for i in lianjie:
    yuanma_xiaoshuoye=requests.get(i,headers=headers)  # source of the novel's index page
    yuanma_xiaoshuoye.encoding='gbk'
    html_yuanma_xiaoshuoye=BeautifulSoup(yuanma_xiaoshuoye.text,'html.parser')  # html.parser handles these pages completely
    title.append(html_yuanma_xiaoshuoye.find('h1').text.strip())  # the novel's title
    x=html_yuanma_xiaoshuoye.find_all('dd')  # each <dd> holds one chapter link
    lianjie_shu=[]
    for j in x:
        lianjie_shu.append(j)
    lianjie_zhangjie.append(lianjie_shu)
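A quick sanity check before moving on (my addition): print the first title and its chapter count to confirm the parse worked:

print(title[0], len(lianjie_zhangjie[0]))  # expect a novel name and its number of <dd> entries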
The links obtained above are only the path half; prepend the site root to make them absolute.
lianjie_zhangjie_quan=[]
for shu in lianjie_zhangjie:
    lianjie_shu2=[]
    for i in shu:
        if i.find('a'):
            try:
                aa=i.find('a')['href']
                lianjie_shu2.append('http://www.biquge.com.tw'+aa)
            except KeyError:  # an <a> tag with no href attribute
                print('missing href, skipping')
    lianjie_zhangjie_quan.append(lianjie_shu2)
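A more robust way to build the absolute URLs is urllib.parse.urljoin, which also copes with hrefs that are already absolute (a sketch, not what the original uses; the path below is a made-up example):

from urllib.parse import urljoin
print(urljoin('http://www.biquge.com.tw', '/0_123/456.html'))
# -> http://www.biquge.com.tw/0_123/456.html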
Crawl the chapter text from the chapter links. This part takes a long time to run.
shu_all=[]
for shu in lianjie_zhangjie_quan[:]:
    shu_quanwen=[]
    for i in shu:
        yuanma_zhangjie=requests.get(i,headers=headers,timeout=1000)  # timeout in seconds, adjust as needed
        yuanma_zhangjie.encoding='gbk'
        html_zhangjie=BeautifulSoup(yuanma_zhangjie.text,'html.parser')
        try:
            shu_quanwen.append(html_zhangjie.find('div',id='content').text)
        except AttributeError:  # find() returned None: no content div on this page
            print('no content div, skipping')
    shu_all.append(shu_quanwen)
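Because this loop hits the server for hours, it is worth pausing between requests and catching network errors so one bad chapter does not kill the whole run. A sketch of the pattern (my addition; the delay value is arbitrary):

import time

def fetch_chapter(url):
    try:
        resp = requests.get(url, headers=headers, timeout=30)
        resp.encoding = 'gbk'
        return resp.text
    except requests.exceptions.RequestException as e:
        print('request failed:', url, e)
        return None
    finally:
        time.sleep(0.5)  # be polite to the server; tune as needed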
Save the downloaded books to disk.
for i in range(len(shu_all)):
    filePath=os.path.join(r'C:\Users\Administrater\Desktop\pachong\book', title[i]+'.txt')
    if not os.path.exists(filePath):  # only write books that have not been saved yet
        f=open(filePath,'w',encoding='utf8')
        for j in shu_all[i]:
            f.write(j)
        f.close()
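One caveat with the code above: novel titles can contain characters that are illegal in Windows file names (\ / : * ? " < > |), which would make open() fail. A small sanitizer (my addition) you could run on title[i] before building filePath:

import re

def safe_filename(name):
    return re.sub(r'[\\/:*?"<>|]', '_', name)  # replace characters Windows forbids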
All done!