Requirements:
Scrape novels from certain sites, fetching one page at a time.
Each page has a "next" button; grab the href from each of these "next" buttons and you can crawl the pages one by one.
Parse the HTML with BeautifulSoup.
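The core of the approach is just "find the next link, build an absolute URL, repeat". A minimal sketch of that one step (the sample HTML below is made up for illustration; the real page structure is handled in html_process further down):

from bs4 import BeautifulSoup

# Made-up fragment mimicking the pagination markup the crawler expects.
sample = '<ul><li class="next"><a href="/htm/2013/11/2/t02/316552.html">next</a></li></ul>'
soup = BeautifulSoup(sample, "html.parser")
link = soup.find("li", class_="next")
if link is not None and link.a is not None:
    print 'http://www.vc.com' + link.a['href']   # absolute URL of the next page
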
from bs4 import BeautifulSoup
import urllib2
import socket   # html_get() catches socket.timeout, so this import is required
import time
import sys
# sample chapter URL: http://www.vc.com/htm/2016/12/24/t02/367246.html
host_name = 'http://www.vc.com'
def html_process(html_file, url):
    '''
    Use BeautifulSoup to get the title, the content and the "next" link
    from html_file, append them to the output file, and return the next URL.
    '''
    soup = BeautifulSoup(html_file, "html.parser")
    #####################################################
    text = '/dev/shm/novel.txt'
    out = open(text, 'a')
    out.write('######################################')
    out.write('\r\n' + url + '\r\n')
    #####################################################
    # get title: the part of <title> before the first '-'
    title_ret = soup.title.string.split('-')[0].strip()
    out.write('\r\n@# ' + title_ret + '\r\n')
    #####################################################
    # get content: the chapter text lives in <div id="view2">
    out.write(soup.find("div", id='view2').get_text() + '\r\n')
    out.close()
    #####################################################
    # get next href; find() returns None when there is no match
    # (find_all(...)[0] would raise IndexError instead, so the old
    # "if None == link" check could never fire)
    link = soup.find("li", class_="next")
    if link is None or link.a is None:
        print 'next link is None'
        return None   # ends the while loop in test()
    next_href = host_name + link.a['href']
    return next_href
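One design note: host_name + link.a['href'] assumes the next href is always site-relative (starts with '/'). If the site ever emits plain relative or fully qualified links, urlparse.urljoin from the standard library resolves all three forms uniformly. A sketch, using a hypothetical relative href rather than anything taken from the site:

import urlparse

# urljoin resolves relative names, site-relative paths and full URLs alike.
print urlparse.urljoin('http://www.vc.com/htm/2013/11/2/t02/316551.html',
                       '316552.html')
# -> http://www.vc.com/htm/2013/11/2/t02/316552.html
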
def html_get(url, retries=3):
    user_agent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:39.0) Gecko/20100101 Firefox/39.0"
    headers = {'User-Agent': user_agent}
    req = urllib2.Request(url, headers=headers)
    try:
        page = urllib2.urlopen(req, timeout=20).read()
        return page
    except socket.timeout:
        # retry a bounded number of times instead of recursing forever
        if retries > 0:
            return html_get(url, retries - 1)
        print 'timeout while loading ' + url
        return None
    except urllib2.URLError as e:
        print 'error while loading ' + url + ': ' + str(e)
        return None
def test(url):
    while url is not None:
        html_file = html_get(url)
        if html_file is None:
            print 'ERROR OF READING ', url
            exit(1)
        url = html_process(html_file, url)
        time.sleep(5)   # be polite: pause between requests
if __name__ == '__main__':
    # Python 2 hack so the utf-8 page text can be written without
    # UnicodeEncodeError; not needed (and not available) in Python 3.
    reload(sys)
    sys.setdefaultencoding("utf-8")
    # start-up url
    test("http://www.vc.com/htm/2013/11/2/t02/316551.html")