# 用Python写的爬虫程序,可以爬小说 — a web crawler written in Python for scraping novels
__author__ = 'zzxuan'
import urllib.request
import sys,os,string
from html.parser import HTMLParser
# Starting chapter URL: the first page to crawl (zongheng.com chapter page).
url = "http://book.zongheng.com/chapter/309372/5304510.html"
# Base name of the output file; chapters are appended to "<title>.txt".
title='爬小说'
class TitleParser(HTMLParser):
    """Extract chapter data from a zongheng.com chapter page.

    Collects into ``datas`` the page's <title> text followed by the text of
    every <p> inside the <div id="chapterContent"> block, and records in
    ``next`` the href of the element with id="nextChapterButton".
    """

    def __init__(self):
        # Collected text: page title first, then one entry per content <p>.
        self.datas = []
        # True while the parser is inside a content <p> tag.
        self.flag = None
        # True while the parser is inside the chapter-content <div>.
        self.start = None
        # True right after <title> opens, until its text has been read.
        self.tit = None
        # href of the next-chapter link, once its tag is seen.
        self.next = None
        HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        if tag == 'title':
            self.tit = True
        attr_map = dict(attrs)
        if attr_map.get('id') == 'nextChapterButton':
            # The original read attrs[2][1], which assumed the href was the
            # third attribute in source order; look it up by name instead.
            self.next = attr_map.get('href')
        if attr_map.get('id') == 'chapterContent':
            self.start = True
        if self.start:
            # Only text directly inside <p> tags counts as chapter content.
            self.flag = (tag == 'p')

    def handle_data(self, data):
        if self.tit:
            self.datas.append(data)
            self.tit = False
        if self.flag and self.start:
            self.datas.append(data)

    def handle_endtag(self, tag):
        # The first closing </div> after the content div opens ends
        # collection (nested inner divs are not tracked).
        if self.start and tag == 'div':
            self.start = False
def crawdata(url):
    """Download one chapter page, append its title and paragraphs to the
    output text file, and return the next-chapter URL.

    Returns None when the page has no next-chapter link (last chapter).
    """
    opener = urllib.request.build_opener()
    raw = opener.open(url).read()
    # Pages may contain stray bytes; drop undecodable sequences.
    text = raw.decode('utf-8', 'ignore')
    tp = TitleParser()
    tp.feed(text)
    # Append mode so successive chapters accumulate in one file; the
    # context manager guarantees the file is closed even if a write fails.
    with open(title + '.txt', 'a', encoding='utf-8') as target:
        for line in tp.datas:
            target.write(line + '\r\n')
    print(tp.next)
    return tp.next
# Crawl chapter by chapter, following each page's next-chapter link.
# Press Enter to continue to the next chapter; the loop stops once a page
# has no next link (crawdata returns None), instead of crashing on
# crawdata(None) as the original endless loop would.
uuu = url
while uuu:
    uuu = crawdata(uuu)
    input()