# Python web crawler that scrapes a serialized novel (爬虫抓小说) from vip.book.sina.com.cn

# -*- coding: utf-8 -*-
from BeautifulSoup import BeautifulSoup
import urllib2
import sys
import traceback 

# Python 2 hack: force the interpreter-wide default encoding to UTF-8 so
# implicit str<->unicode coercions don't raise UnicodeDecodeError.
# reload(sys) restores setdefaultencoding(), which site.py deletes at startup.
# NOTE(review): Python-2-only and fragile; explicit .encode()/.decode() at the
# I/O boundary is the safer approach — confirm before keeping.
default_encoding = 'utf-8'
if sys.getdefaultencoding() != default_encoding:
	reload(sys)
	sys.setdefaultencoding(default_encoding)

def write1(url,title):
	print "start ->" + title
	f = urllib2.urlopen(url)
	soup = BeautifulSoup(f.read().decode('utf-8','ignore'))
	str1=""
	str1=str('\n\n')+title+str('\n\t')+str(soup.find('div',id='contTxt'))
	
	str1=str1.replace('<div id="contTxt" class="contTxt1"><p>','\n')
	str1=str1.replace('</p><p>','\n')
   	str1=str1.replace('</p></div>','')
	return str1

if __name__=='__main__':
	headers = {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
	url = 'http://vip.book.sina.com.cn/books/142600'
	url2 = 'http://vip.book.sina.com.cn'
	url3 = 'http://vip.book.sina.com.cn/chapter/142600/94509.html'
	f = urllib2.urlopen(url)
	soup = BeautifulSoup(f.read().decode('utf-8','ignore'))
	
	uls = soup.findAll('ul')
	delli=BeautifulSoup(str(uls)).findAll('li')
	str3=""
	for i in delli:#-26
		try:
						
			title = i.a.string
			href = i.a['href']
			if href.startswith('/chapter'):				
				str2 = url2 + str(href)
				print  title + '\n' + str2 
				str3 += write1(str2,title)
			else:
				print 'no'
			
		except:
			print traceback.format_exc()			
			pass  
		
	ff=open('heike.txt','wr+')    
	ff.write(str3)
	ff.close()

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值