Rather embarrassingly, I'm not familiar with Python's common regular expressions at all, so I wrote a bug-ridden "show only the original poster" scraper. Posting it here first; I'll polish it when I have time.
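For my own reference: the by-hand alt="..." slicing that the code below does with find() and substring arithmetic could be a single regular expression. A minimal sketch (the input line is made up, but the pattern targets the same markup the script scans for):

import re

line = '<img src="avatar.jpg" alt="petrelli" />'   # hypothetical input line
m = re.search(r'alt="([^"]*)"', line)
if m:
    print m.group(1)   # the poster's name, without quotes or tag debris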
# -*- coding: utf8 -*-
import urllib2
import re
#from BeautifulSoup import BeautifulSoup
##def (i,title)=ExtractTitle(rawdata):
## buf=[]
#### print rawdata[0]
## i=0
## while(rawdata[i]):
## m = rawdata[i].find('<title>')
## i=i+1
## rawdata[i]
## i++
## while (m==-1)
##
## if m!=-1:
## i=i+1
## print rawdata[i]
def Extract(rawdata):
    author = ''
    title = ''
    content = []
    i = 0
    while i < len(rawdata):
        m = rawdata[i].find('<title>')
        if m != -1:
            # Douban puts the title text on the line after <title>
            title = rawdata[i+1]
            print title
        ### find the author
        tmpline = rawdata[i].find('topic-doc')
        if tmpline != -1:
            print rawdata[i-2]
            # the poster's name sits in the alt="..." attribute two lines up
            posBeg = rawdata[i-2].find('alt=')
            posEnd = rawdata[i-2].find('/>')
            author = rawdata[i-2][posBeg+5:posEnd-1]
            print "author: " + author
            i = i + 1
            ## find the content written by the author
            while i < len(rawdata):
                contentLine = rawdata[i].find('alt="' + author + '"')
                if contentLine != -1:
                    # skip forward to the opening <p> of this post
                    while i < len(rawdata):
                        pLine = rawdata[i].find('<p>')
                        if pLine != -1:
                            # then scan on to the closing </p>
                            while i < len(rawdata):
                                pEndLine = rawdata[i].find('</p>')
                                if pEndLine != -1:
                                    print rawdata[i]
                                    content.append(rawdata[i])
                                    break
                                i += 1
                            break
                        i += 1
                i += 1
        i = i + 1
def ExtractLink(rawdata, start):
    i = start
    links = []
    while i < len(rawdata):
        line = rawdata[i].find('paginator')
        if line != -1:
            # all the page links sit on the single 'paginator' line
            tmpRow = rawdata[i]
            posBeg = tmpRow.find("href=")
            if posBeg != -1:
                tmpRow = tmpRow[posBeg+6:-1]   # cut past href=" to the URL
                posBeg = 0
                while posBeg != -1:
                    posEnd = tmpRow.find(">")
                    # posEnd-1 drops the closing quote before the >
                    links.append(tmpRow[posBeg:posEnd-1])
                    posBeg = tmpRow.find("href")
                    if posBeg == -1:
                        break
                    tmpRow = tmpRow[posBeg+6:-1]
                    posBeg = 0
            break
        i += 1
    links = links[0:len(links)-1]   # drop the trailing "next page" link
    return links
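## Aside: the cursor-and-slice loop above could be one regex instead.
## A rough alternative sketch (ExtractLinkRe is my own name, not part of
## the original): re.findall with one capture group returns every quoted
## href on the paginator line in order, and [:-1] drops the trailing
## "next page" link just as ExtractLink does.
def ExtractLinkRe(rawdata):
    for row in rawdata:
        if row.find('paginator') != -1:
            return re.findall(r'href="([^"]*)"', row)[:-1]
    return []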
def ExtractAuthorContent(rawdata):
    pos = 0
    print len(rawdata)
    title, pos = findTitle(rawdata, pos)
    author, pos = findAuthor(rawdata, pos)
    ## find the content written by the author
    content, pEnd = extractAllSections(rawdata, author, pos)
    return author
##    print content
##    print pEnd
##
def ExtractContent(rawdata, author):
    pos = 0
    print len(rawdata)
    print author
    ## find the content written by the author
    content, pEnd = extractAllSections(rawdata, author, pos)
    return content
def extractSection(rawdata, start):
    i = start
    content = []
    # advance to the opening <p> tag
    while i < len(rawdata):
        pBeginLine = rawdata[i].find('<p>')
        if pBeginLine != -1:
            break
        i += 1
    # collect every line up to and including the closing </p>
    while i < len(rawdata):
        pEndLine = rawdata[i].find('</p>')
        content.append(rawdata[i])
        if pEndLine != -1:
            print rawdata[i]
            break
        i += 1
    return (content, i)
def extractAllSections(rawdata, author, start):
    i = start
    content = []
    pEnd = 0
    count = 0
    while i < len(rawdata):
        contentLine = rawdata[i].find('alt="' + author + '"')
        if contentLine != -1:
            tmpContent, i = extractSection(rawdata, i)
            content.append(tmpContent)
            if i != len(rawdata):   # a closing </p> was actually found
                count += 1
                pEnd = i
            i += 1
        i += 1
    return (content, pEnd)
def findTitle(rawdata, start):
    i = start
    title = ''
    while i < len(rawdata):
        m = rawdata[i].find('<title>')
        if m != -1:
            # the title text is on the line after <title>
            title = rawdata[i+1]
            print title
            break
        i += 1
    return (title, i)
def findAuthor(rawdata, start):
    i = start
    author = ''
    while i < len(rawdata):
        tmpline = rawdata[i].find('topic-doc')
        if tmpline != -1:
            # the poster's name is in the alt="..." attribute two lines up
            posBeg = rawdata[i-2].find('alt=')
            posEnd = rawdata[i-2].find('/>')
            author = rawdata[i-2][posBeg+5:posEnd-1]
            print "author: " + author
            break
        i += 1
    return (author, i)
#data=file('E:/petrelli/play/crawl_douban/douban_2.htm','r').readlines()
#soup = BeautifulSoup(data)
#print soup.prettify()
#Extract(data)
data = urllib2.urlopen('http://www.douban.com/group/topic/9737262/').readlines()
links = ExtractLink(data, 0)
author = ExtractAuthorContent(data)
for link in links:
    print link
##    data = urllib2.urlopen(link).readlines()
##    ExtractContent(data, author)
#ExtractAuthorContent(data)
#for line in data:
#    print line
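If I ever clean this up, the BeautifulSoup import commented out at the top would replace most of this line counting. A rough sketch of the idea, untested, assuming the old BeautifulSoup 3 API that `from BeautifulSoup import BeautifulSoup` provides (and the same alt-attribute convention the code above relies on):

#from BeautifulSoup import BeautifulSoup
#soup = BeautifulSoup(''.join(data))
#print soup.title.string                  # the topic title
#imgs = soup.findAll('img', alt=True)     # avatar images carry poster names
#if imgs:
#    author = imgs[0]['alt']
#    print "author: " + author
#    for p in soup.findAll('p'):          # every paragraph in the thread;
#        print ''.join(p.findAll(text=True))  # still needs filtering to the poster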