python抓取糗事百科段子图片

最新推荐文章于 2024-09-09 23:28:21 发布

爱疯1410

最新推荐文章于 2024-09-09 23:28:21 发布

阅读量444

点赞数

文章标签： python

本文链接：https://blog.csdn.net/a116514/article/details/52997732

版权

糗百网址:http://www.qiushibaike.com/hot

打开页面后，使用浏览器的“查看元素”（开发者工具）功能，

找到段子对应的 HTML 代码片段：

<div class="content">
<span>小时候家里穷从来没穿过名牌，当时我们特别羡慕班上的一个同学，因为他姑姑送给他一件两面都可以穿的夹克。一面是阿di的，一面是nai克的。</span>
</div>

抓取段子的脚本（Python 2）：

# -*- coding:utf-8 -*-

import urllib
import urllib2
import re

class LC_QiuBai(object):
    """Interactive pager for the "hot" listing of qiushibaike.com.

    Attributes:
        page: Number of the listing page the next ``getData`` call fetches
            (starts at 1).
        enable: Flag polled by the driver loop; set it to False to stop.
    """

    # Captures the text of the <span> that follows a `<div ... content">`
    # opening tag. re.S lets '.' cross the newlines between the tags.
    _CONTENT_PATTERN = re.compile(
        r'<div .*?content">.*?<span>(.*?)</span>.*?</div>', re.S | re.M)

    def __init__(self):
        super(LC_QiuBai, self).__init__()
        self.page = 1
        self.enable = True

    def hello(self):
        """Print a fixed marker string."""
        print('6666')

    @staticmethod
    def _extract_jokes(html):
        """Return the list of joke texts found in *html*, in document order."""
        return LC_QiuBai._CONTENT_PATTERN.findall(html)

    def getData(self):
        """Fetch listing page ``self.page``, print each joke, then advance.

        The page body is decoded as UTF-8 and every joke is printed followed
        by a blank line. If the request fails with ``urllib2.URLError``, the
        error's ``code`` and/or ``reason`` attributes are printed when
        present. ``self.page`` is incremented whether or not the fetch
        succeeded.
        """
        url = 'http://www.qiushibaike.com/hot/page/' + str(self.page)
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        headers = {'User-Agent': user_agent}
        try:
            request = urllib2.Request(url, headers=headers)
            response = urllib2.urlopen(request)
            try:
                content = response.read().decode('utf-8')
            finally:
                # Release the connection even if reading/decoding fails.
                response.close()
            for line in self._extract_jokes(content):
                # print '\033[35m %s \033[0m' % line + '\n'  (coloured variant)
                print(line + '\n')
        except urllib2.URLError as e:
            if hasattr(e, "code"):
                print(e.code)
            if hasattr(e, "reason"):
                print(e.reason)
        self.page += 1
# Show the usage banner, print the first page, then print one more page for
# every line read from stdin until the user types "stop".
print(u'''

-----------------------------
操作:输入stop退出
功能:按下回车依次浏览今日的糗百热点
-----------------------------

''')
lcQiuBai = LC_QiuBai()
lcQiuBai.getData()
while lcQiuBai.enable:
    try:
        myInput = raw_input()
    except EOFError:
        # stdin was closed (Ctrl-D / end of piped input): exit cleanly
        # instead of dying with a traceback.
        myInput = 'stop'
    if 'stop' == myInput:
        lcQiuBai.enable = False
        break
    else:
        lcQiuBai.getData()