一 因为要爬取的网站比较简单,也没有反爬机制,所以话不多说,直接看代码
#!/usr/bin/python
#coding:utf-8
# urllib2是python内置的HTTP请求库
import urllib2
import re
# Beautifulsoup库可把html解析成对象进行处理
from bs4 import BeautifulSoup
import sys
def OpenPage(url):
    """Fetch *url* and return the page body re-encoded as UTF-8.

    The target site serves GBK-encoded HTML; undecodable bytes are
    dropped (errors="ignore") so one bad character cannot abort the
    whole crawl.
    """
    # Empty header dict kept so a User-Agent can be added easily if the
    # site ever starts rejecting bare requests.
    my_header = {}
    request = urllib2.Request(url, headers=my_header)  # build the request
    f = urllib2.urlopen(request)  # send it
    try:
        data = f.read()  # raw GBK bytes of the page
    finally:
        # fix: the response object was previously never closed (leaked
        # socket/file descriptor on every page fetched)
        f.close()
    return data.decode("GBK", errors="ignore").encode("utf-8")
def ParseMainPage(page):
    """Parse the book's table-of-contents page; return chapter URLs.

    *page* is the UTF-8 HTML of the main book page. Every anchor whose
    href contains "read" is treated as a chapter link.
    """
    soup = BeautifulSoup(page, "html.parser")  # strip page down to a parse tree
    # On this site every chapter link has "read" somewhere in its href.
    chapter_links = soup.find_all(href=re.compile("read"))
    url_list = []
    for item in chapter_links:
        # hrefs are site-relative, so prepend the host.
        url_list.append("http://www.shengxu6.com" + item['href'])
    # fix: the original built url_list but never returned it, so the
    # caller received None and the main loop crashed with a TypeError.
    return url_list
def ParseDetailPage(page):
    """Extract (title, body text) from a single chapter page."""
    soup = BeautifulSoup(page, "html.parser")
    # The chapter title lives in the first .panel-heading element and
    # the chapter prose in the first .panel-body element.
    heading = soup.find_all(class_="panel-heading")[0]
    body = soup.find_all(class_="panel-body")[0]
    return heading.get_text(), body.get_text()
def WriteDataToFile(file_path, data):
    """Append *data* to the file at *file_path*, creating it if absent.

    fix: use a context manager so the handle is closed even if the
    write raises (the original open/write/close leaked the handle on
    error).
    """
    with open(file_path, "a+") as f:
        f.write(data)
if __name__=="__main__":
url="http://www.shengxu6.com/book/2967.html"
main_page=OpenPage(url)
url_list=ParseMainPage(main_page)
for url in url_list:
print "Clone url=" +url
detail_page=OpenPage(url) #打开每个章节的具体页面
title,content=ParseDetailPage(detail_page)
data="\n\n\n"+title+"\n\n\n"+content
data=data.encode("utf-8")
WriteDataToFile("minqinitaihuai.txt",data)#将爬到的数据写到文件中
print "complete!!!!!!"
二 爬取结果
正在爬取数据:
爬取到的小说章节: