# -*- coding: utf-8 -*-
# 糗百爬虫 -- Qiushibaike ("Qiubai") hot-page crawler

from BeautifulSoup import BeautifulSoup
import urllib2
import urllib

# Output file shared by writeIO() and qiuBaiDemo(); it is opened as soon as the
# module is loaded and closed by the script entry point at the bottom of the file.
outfile = open("qiubai1.txt", "w")
def formalize(text):
    """Normalise a block of text into blank-line-separated paragraphs.

    The input is split on newlines, each piece is stripped of surrounding
    whitespace, empty pieces are discarded, and every remaining piece is
    emitted followed by two newline characters.

    Args:
        text: The text to normalise.

    Returns:
        The concatenation of all non-empty stripped lines, each terminated
        by a blank line; an empty string when no line has content.
    """
    stripped_lines = (raw.strip() for raw in text.split(u'\n'))
    # Joining on a plain '' literal keeps the result type the same as the
    # incremental concatenation it replaces (including the empty case).
    return ''.join(piece + u'\n\n' for piece in stripped_lines if piece)
def writeIO(text):
    """Write ``text`` to the module-level ``outfile``.

    The text is written exactly as given; no newline or other terminator
    is added.
    """
    outfile.write(text)

def qiuBaiDemo(page):
    """Download one "hot" page from qiushibaike.com and save its stories.

    Every ``<div class="content">`` that carries a ``title`` attribute is
    treated as a story.  Each story's text is normalised with ``formalize``,
    encoded as GB18030, echoed to stdout and written to ``outfile`` beneath
    a numbered separator line.

    Args:
        page: Page number to fetch.  Usually the string typed by the user;
            non-string values (e.g. an int) are converted with ``str``.

    Raises:
        urllib2.URLError: If the page cannot be downloaded.
    """
    if not isinstance(page, basestring):
        page = str(page)
    url = "http://www.qiushibaike.com/hot/page/" + page

    # Always release the HTTP connection, even if reading the body fails.
    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        response.close()

    soup = BeautifulSoup(html)

    # Keep only the content divs that have a ``title`` attribute.  Only the
    # missing-attribute case is skipped; any other error is allowed to surface.
    stories = []
    for div in soup.findAll('div', "content"):
        try:
            div['title']
        except KeyError:
            continue
        stories.append(str(div))

    for count, story in enumerate(stories, 1):
        # Re-parse the serialised div and collect only its text nodes, which
        # drops all markup.
        minisoup = BeautifulSoup(story)
        text = ''.join([e for e in minisoup.recursiveChildGenerator() if isinstance(e, unicode)])
        text = formalize(text).encode("gb18030")

        # Separator line followed by a blank line, e.g. "----- 00001 -----".
        writeIO('-' * 20 + " %05d " % count + '-' * 20 + "\n\n")

        # A single parenthesised argument prints the same under the
        # Python 2 print statement.
        print(text)
        writeIO(text)
           
        
        
        
        
    
    

        


if __name__ == '__main__':
    # Script entry point: ask for a page number, scrape that page, and make
    # sure the output file is closed (and flushed) even if the prompt is
    # interrupted or the download/parsing fails.
    try:
        page = raw_input('Enter the page you want view : ')
        qiuBaiDemo(page)
    finally:
        outfile.close()

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值