python爬取某一小说
经过几天的学习,简单写了一个爬取小说的代码,测试了一下效果,可惜爬取速度有些慢,下面是代码:
# _*_ coding:utf-8 _*_
import urllib2,urllib
import re
import sys
from bs4 import BeautifulSoup
import random
reload(sys)
sys.setdefaultencoding('utf8')
def getHtml(url):
    """Fetch *url* and return its body transcoded from GBK to UTF-8.

    A random desktop User-Agent is picked per request so repeated
    crawling looks less like a single bot.

    url -- absolute page URL on www.biquge.com.tw
    Returns the page body as a UTF-8 encoded byte string.
    """
    user_agents = [
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
        'Opera/9.25 (Windows NT 5.1; U; en)',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
        'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
        'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
        'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
        "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Ubuntu/11.04 Chromium/16.0.912.77 Chrome/16.0.912.77 Safari/535.7",
        "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0 "
    ]
    # NOTE: the old code also sent a bogus "GET: <url>" header ('GET' is
    # the request method, not a header name); it has been removed.
    header = {
        'User-Agent': random.choice(user_agents),
        'Host': 'www.biquge.com.tw',
        'Referer': 'http://www.biquge.com.tw/',
    }
    request = urllib2.Request(url, headers=header)
    response = urllib2.urlopen(request)
    try:
        html = response.read()
    finally:
        # Always release the connection, even if read() raises.
        response.close()
    # The site serves GBK; 'ignore' skips the occasional illegal byte
    # sequence instead of raising UnicodeDecodeError mid-crawl.
    return html.decode('gbk', 'ignore').encode('utf8')
#取得章节和章节url保存到列表中
def getht(h):
    """Parse the book's table-of-contents page *h*.

    h -- HTML of the index page (UTF-8 byte string).
    Returns (book, book_mark): a list of absolute chapter URLs and a
    parallel list of chapter titles, in page order.
    """
    soup = BeautifulSoup(h, 'html.parser')
    book = []
    book_mark = []
    for dd in soup.find_all('dd'):
        # BUG FIX: the old code sliced str(dd) at hard-coded offsets
        # (s[13:35], s[37:-9]), which breaks as soon as an href has a
        # different length. Read the anchor tag properly instead.
        a = dd.find('a')
        if a is None or not a.get('href'):
            continue  # skip malformed <dd> entries instead of emitting garbage
        book.append('http://www.biquge.com.tw' + a['href'])
        book_mark.append(a.get_text())
    return book, book_mark
#取网页内容值
#在小说一个章节网页取内容保存到本地
def getContent(html_book,html_book_mark):
soup = BeautifulSoup(html_book,'html.parser')
b = soup.find_all('div',id='content')[0]
fh = open('E://python/2.txt','a')
s = b.get_text()
st =html_book_mark+ str(s)+'\n'
fh.write(st)
fh.close()
print html_book_mark+'保存成功'
#循环取出每章内容保存
def get_par(books,book_marks):
t = 0
for (bo,bo_mark) in zip(books,book_marks):
getContent(getHtml(bo),bo_mark)
t+=1
if t > len(books):
print "全部章节保存完全"
# Entry point: crawl the whole novel rooted at this index page.
url = 'http://www.biquge.com.tw/16_16273/'
# Fetch the book's table-of-contents page.
html = getHtml(url)
# Extract every chapter's URL and title from the index.
book,book_m = getht(html)
# Download each chapter in order and append it to the output file.
get_par(book,book_m)
注意点:测试时在爬取某一章节时出现 'gbk' codec can't decode bytes in position 7782-7783: illegal multibyte sequence 错误,在 decode 时添加 'ignore' 参数即可解决该问题。
代码块
部分 user_agent 列表:
user_agents = [
'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
'Opera/9.25 (Windows NT 5.1; U; en)',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
"Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Ubuntu/11.04 Chromium/16.0.912.77 Chrome/16.0.912.77 Safari/535.7",
"Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0 "
]
...