# Python web crawler that scrapes a serialized novel (爬虫抓小说) from vip.book.sina.com.cn

# -*- coding: utf-8 -*-
from BeautifulSoup import BeautifulSoup
import urllib2
import sys
import traceback 

# Python 2 hack: force the interpreter-wide default encoding to UTF-8 so
# implicit str<->unicode coercions don't raise UnicodeDecodeError.
# reload(sys) restores setdefaultencoding(), which site.py deletes at startup.
# NOTE(review): Python-2-only and fragile; explicit .encode()/.decode() at the
# I/O boundary is the safer approach — confirm before keeping.
default_encoding = 'utf-8'
if sys.getdefaultencoding() != default_encoding:
	reload(sys)
	sys.setdefaultencoding(default_encoding)

def write1(url,title):
	print "start ->" + title
	f = urllib2.urlopen(url)
	soup = BeautifulSoup(f.read().decode('utf-8','ignore'))
	str1=""
	str1=str('\n\n')+title+str('\n\t')+str(soup.find('div',id='contTxt'))
	
	str1=str1.replace('<div id="contTxt" class="contTxt1"><p>','\n')
	str1=str1.replace('</p><p>','\n')
   	str1=str1.replace('</p></div>','')
	return str1

if __name__=='__main__':
	headers = {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
	url = 'http://vip.book.sina.com.cn/books/142600'
	url2 = 'http://vip.book.sina.com.cn'
	url3 = 'http://vip.book.sina.com.cn/chapter/142600/94509.html'
	f = urllib2.urlopen(url)
	soup = BeautifulSoup(f.read().decode('utf-8','ignore'))
	
	uls = soup.findAll('ul')
	delli=BeautifulSoup(str(uls)).findAll('li')
	str3=""
	for i in delli:#-26
		try:
						
			title = i.a.string
			href = i.a['href']
			if href.startswith('/chapter'):				
				str2 = url2 + str(href)
				print  title + '\n' + str2 
				str3 += write1(str2,title)
			else:
				print 'no'
			
		except:
			print traceback.format_exc()			
			pass  
		
	ff=open('heike.txt','wr+')    
	ff.write(str3)
	ff.close()

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值