A Script for Extracting Douban Group Posts

Embarrassingly, I'm not at all familiar with Python's everyday regular expressions, so what I ended up with is a bug-ridden "thread-starter posts only" scraper. I'm posting it here first and will polish it when I find the time.
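
As an aside: the alt="..." attribute that the script below digs out with find() and hand-counted offsets is exactly the kind of thing a one-line regular expression handles cleanly. A minimal sketch (the input line is made up, shaped like the markup the script expects):

import re

line = '<img src="u.jpg" alt="SomeUser"/>'  # hypothetical input line
m = re.search(r'alt="([^"]*)"', line)
if m:
    print m.group(1)  # prints: SomeUser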

# -*- coding: utf8 -*-
import urllib2
import re
#from BeautifulSoup import BeautifulSoup

def Extract(rawdata):
    """Single-pass extraction of title, author and the author's posts
    (kept for reference; superseded by the smaller functions below)."""
    author = ''
    title = ''
    content = []
    i = 0
    while i < len(rawdata):
        # The topic title sits on the line after the <title> tag.
        if rawdata[i].find('<title>') != -1:
            title = rawdata[i+1]
            print title
        # The author's name is in an alt="..." attribute two lines
        # above the 'topic-doc' marker.
        if rawdata[i].find('topic-doc') != -1:
            posBeg = rawdata[i-2].find('alt=')
            posEnd = rawdata[i-2].find('/>')
            author = rawdata[i-2][posBeg+5:posEnd-1]
            print "author: " + author
            i = i + 1
            # Collect every <p>...</p> block that follows an
            # alt="author" marker (the thread starter's posts).
            while i < len(rawdata):
                if rawdata[i].find('alt="' + author + '"') != -1:
                    while i < len(rawdata):
                        if rawdata[i].find('<p>') != -1:
                            while i < len(rawdata):
                                if rawdata[i].find('</p>') != -1:
                                    print rawdata[i]
                                    content.append(rawdata[i])
                                    break
                                i += 1
                            break
                        i += 1
                i += 1
        i = i + 1
    return (title, author, content)
            
def ExtractLink(rawdata, start):
    """Collect the page URLs from the 'paginator' line of the topic page."""
    i = start
    links = []
    while i < len(rawdata):
        if rawdata[i].find('paginator') != -1:
            tmpRow = rawdata[i]
            # Jump past the first 'href="' (6 characters).
            posBeg = tmpRow.find('href=')
            if posBeg != -1:
                tmpRow = tmpRow[posBeg+6:-1]
                posBeg = 0
            while posBeg != -1:
                # The URL ends just before the closing '">' of the anchor.
                posEnd = tmpRow.find('>')
                links.append(tmpRow[posBeg:posEnd-1])
                posBeg = tmpRow.find('href')
                if posBeg == -1:
                    break
                tmpRow = tmpRow[posBeg+6:-1]
                posBeg = 0
            break
        i += 1
    # Drop the last entry (presumably the duplicate 'next page' link).
    return links[:-1]
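
# Aside: the slicing above should be replaceable by a single re.findall;
# a sketch under the same assumption about the 'paginator' row format
# (illustrative only, nothing below calls it):
def ExtractLinkRe(row):
    # Capture every href="..." value, then drop the trailing 'next page' link.
    return re.findall(r'href="([^"]*)"', row)[:-1]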


def ExtractAuthorContent(rawdata):
    """Find the title and author on the first page, print the author's
    posts, and return the author's name for use on the later pages."""
    pos = 0
    title, pos = findTitle(rawdata, pos)
    author, pos = findAuthor(rawdata, pos)
    # Collect every section written by the author.
    content, pEnd = extractAllSections(rawdata, author, pos)
    return author

def ExtractContent(rawdata, author):
    """Extract the given author's posts from one of the follow-up pages."""
    pos = 0
    content, pEnd = extractAllSections(rawdata, author, pos)
    return content

def extractSection(rawdata, start):
    """From 'start', skip to the next <p> line and collect everything up to
    and including the matching </p>. Returns (lines, index of the </p>)."""
    i = start
    content = []
    # Skip ahead to the opening <p>.
    while i < len(rawdata):
        if rawdata[i].find('<p>') != -1:
            break
        i += 1
    # Collect lines until the closing </p>.
    while i < len(rawdata):
        content.append(rawdata[i])
        if rawdata[i].find('</p>') != -1:
            print rawdata[i]
            break
        i += 1
    return (content, i)
    
def extractAllSections(rawdata, author, start):
    """Collect every <p>...</p> section that follows an alt="author"
    marker, i.e. every post written by the thread starter."""
    i = start
    content = []
    pEnd = 0
    count = 0
    while i < len(rawdata):
        if rawdata[i].find('alt="' + author + '"') != -1:
            tmpContent, i = extractSection(rawdata, i)
            content.append(tmpContent)
            # extractSection returns len(rawdata) when no closing </p>
            # was found; only count complete sections.
            if i < len(rawdata):
                count += 1
                pEnd = i
                i += 1
        i += 1
    return (content, pEnd)
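
# Aside: if the page were read in as one string instead of readlines(),
# every <p>...</p> block could be grabbed in a single pass; a sketch
# (illustrative only, nothing below calls it):
def extractParagraphs(html):
    # re.DOTALL lets '.' match the newlines inside multi-line paragraphs.
    return re.findall(r'<p>(.*?)</p>', html, re.DOTALL)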
        
def findTitle(rawdata, start):
    """Return the line after the <title> tag and the index where it was found."""
    i = start
    title = ''
    while i < len(rawdata):
        if rawdata[i].find('<title>') != -1:
            title = rawdata[i+1]
            print title
            break
        i += 1
    return (title, i)
  
def findAuthor(rawdata, start):
    """The author's name sits in an alt="..." attribute two lines above
    the 'topic-doc' marker; return it with the index where it was found."""
    i = start
    author = ''
    while i < len(rawdata):
        if rawdata[i].find('topic-doc') != -1:
            posBeg = rawdata[i-2].find('alt=')
            posEnd = rawdata[i-2].find('/>')
            author = rawdata[i-2][posBeg+5:posEnd-1]
            print "author: " + author
            break
        i += 1
    return (author, i)
    

#data=file('E:/petrelli/play/crawl_douban/douban_2.htm','r').readlines()
#soup = BeautifulSoup(data)
#print soup.prettify()
#Extract(data)

# Fetch the first page of the topic, pull out the pagination links and the
# thread starter's name and posts.
data = urllib2.urlopen('http://www.douban.com/group/topic/9737262/').readlines()
links = ExtractLink(data, 0)
author = ExtractAuthorContent(data)
# The follow-up pages; fetching them is still to be wired up.
for link in links:
    print link
##    data = urllib2.urlopen(link).readlines()
##    ExtractContent(data,author)
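
The BeautifulSoup route hinted at by the commented-out import would make most of the line counting above unnecessary. A rough sketch against BeautifulSoup 3, untested against the live page:

from BeautifulSoup import BeautifulSoup

html = urllib2.urlopen('http://www.douban.com/group/topic/9737262/').read()
soup = BeautifulSoup(html)
print soup.find('title').string  # the topic title
for p in soup.findAll('p'):  # every paragraph on the page
    print ''.join(p.findAll(text=True))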