# -*- coding: utf-8 -*-
from BeautifulSoup import BeautifulSoup
import urllib2
import sys
import traceback
# Force the interpreter-wide default encoding to UTF-8 (Python 2 only).
# The rest of this script concatenates UTF-8 byte strings with unicode
# objects and relies on the implicit conversion using UTF-8.
default_encoding = 'utf-8'
if sys.getdefaultencoding() != default_encoding:
    # site.py removes sys.setdefaultencoding at startup; reloading the
    # module makes the function available again.
    reload(sys)
    sys.setdefaultencoding(default_encoding)
def write1(url,title):
    """Download one chapter page and return it as plain text.

    The page at ``url`` is fetched, decoded as UTF-8 (undecodable bytes are
    dropped) and parsed; the markup of the ``<div id="contTxt">`` element is
    then flattened by turning paragraph boundaries into newlines.

    Args:
        url: Absolute URL of the chapter page.
        title: Chapter title, used as the heading of the returned text.

    Returns:
        ``'\\n\\n' + title + '\\n\\t'`` followed by the chapter body. If the
        page has no ``contTxt`` div, only the heading is returned.

    Raises:
        urllib2.URLError: If the page cannot be fetched.
    """
    print("start ->" + title)

    response = urllib2.urlopen(url)
    try:
        page = response.read().decode('utf-8', 'ignore')
    finally:
        # Release the connection even when reading fails.
        response.close()

    heading = '\n\n' + title + '\n\t'
    content = BeautifulSoup(page).find('div', id='contTxt')
    if content is None:
        # Without this guard str(None) would splice the text "None" into
        # the book.
        return heading

    text = heading + str(content)
    # Strip the wrapper markup; these literals must match the serialized
    # form of the div exactly.
    text = text.replace('<div id="contTxt" class="contTxt1"><p>', '\n')
    text = text.replace('</p><p>', '\n')
    text = text.replace('</p></div>', '')
    return text
def main():
    """Download every chapter linked from the book index into ``heike.txt``.

    The index page is fetched and every ``<li>`` found inside a ``<ul>`` is
    inspected. Items whose first link starts with ``/chapter`` are downloaded
    with ``write1`` in page order; everything else is reported as ``no``.
    A failure on a single item prints its traceback and the crawl continues.
    The collected text is written to ``heike.txt`` in the current directory,
    replacing any existing file.
    """
    index_url = 'http://vip.book.sina.com.cn/books/142600'
    site_root = 'http://vip.book.sina.com.cn'

    response = urllib2.urlopen(index_url)
    try:
        soup = BeautifulSoup(response.read().decode('utf-8', 'ignore'))
    finally:
        response.close()

    # Re-parse only the <ul> markup so that just the <li> items contained
    # in lists are considered.
    uls = soup.findAll('ul')
    items = BeautifulSoup(str(uls)).findAll('li')

    chapters = []
    for item in items:
        try:
            anchor = item.a
            href = anchor.get('href') if anchor is not None else None
            if href is None or not href.startswith('/chapter'):
                print('no')
                continue
            title = anchor.string
            chapter_url = site_root + str(href)
            print(title + '\n' + chapter_url)
            chapters.append(write1(chapter_url, title))
        except Exception:
            # Best effort: report the failure and move on to the next item.
            # KeyboardInterrupt/SystemExit are deliberately not caught so the
            # crawl can still be aborted.
            print(traceback.format_exc())

    with open('heike.txt', 'w') as out:
        out.write(''.join(chapters))


if __name__=='__main__':
    main()
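# Title of the source article: "Python crawler that scrapes a novel".
# (Page footer: latest recommended article published 2024-09-10 08:00:00.)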