Python初学——简单爬虫

最新推荐文章于 2024-12-16 16:42:36 发布

挽风月

最新推荐文章于 2024-12-16 16:42:36 发布

阅读量248

点赞数

文章标签： python-爬虫

本文链接：https://blog.csdn.net/poice00/article/details/48940623

版权

# coding=utf-8
'''
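Created on Oct 6, 2015

Simple crawler for the imooc course list page. The HTML below is a sample
of one course entry, i.e. the markup the regex in getContent is written to match: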
<li class="course-one">
            <a href="/view/510" target="_self">
                <div class="course-list-img">
                    <img width="240" height="135" alt="Android-节日短信送祝福(UI篇)" src="http://img.mukewang.com/5608968d0001c0e406000338-240-135.jpg">
                </div>
            <h5>
                <span>Android-节日短信送祝福(UI篇)</span>
            </h5>
            <div class="tips">
                <p class="text-ellipsis">本课程带领大家实现一个节日短信的群发的app。</p>
                <span class="l ">更新至3-1</span>
                <span class="l ml20">4128人学习</span>
            </div>
            <span class="time-label">2小时33分钟 | 中级 </span>
            <b class="follow-label">跟我学</b>
            </a>
</li>
@author: sys
'''
import urllib2,re

def getContent(html):
    '''Extract course records from the course-list page markup.

    html -- the page source as a single string.

    Returns a list with one 4-tuple of captured strings per course entry,
    in document order: (image URL ending in .jpg, title, description,
    text of the time-label span). An input with no matching entry yields
    an empty list.
    '''
    # Each capture group is anchored on the class attribute that precedes it
    # in a course entry; the lazy ".+?" gaps skip the markup in between.
    pattern = r'<div class="course-list-img">.+?src="(.+?\.jpg)".+?</div>.+?<span>(.+?)</span>.+?class="tips".+?class="text-ellipsis">(.+?)</p>.+?class="time-label">(.+?)</span>'
    # DOTALL lets "." cross line breaks, since one entry spans many lines.
    return re.findall(pattern, html, re.DOTALL)

def display(contentlist):
    '''Print every scraped course record to stdout in a labelled layout.

    contentlist -- iterable of records; each record is indexed 0..3 and holds
                   four strings in the order image URL, title, description,
                   time (the shape getContent returns).

    Every field is stripped of surrounding whitespace and written on its own
    line as the label, a space, a tab, then the value. A dotted separator
    line follows each record. Nothing is returned.
    '''
    labels = ('img', 'title', 'description', 'time')
    for content in contentlist:
        for position, label in enumerate(labels):
            # A single pre-formatted string per print call parses both as a
            # Python 2 print statement and as a Python 3 print() call. The
            # "label, space, tab, value" layout is exactly what the former
            # comma-separated print statement produced (no space is emitted
            # after an item that ends in a tab).
            print('%s \t%s' % (label, content[position].strip()))
        print('..............................................................................')

def getHtml(url):
    '''Download url with an HTTP GET and return the response body.

    url -- address of the page to fetch.

    Browser-like User-Agent / Accept headers are sent with the request.
    Returns whatever the response object's read() yields (the undecoded page
    source). Any error raised by urllib2 while opening or reading propagates
    to the caller; the response is closed in either case.
    '''
    headers = {
       'User-Agent': 'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
       'Accept-Language':  'zh-CN,zh;q=0.8',
       'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    }
    # The second positional parameter of urllib2.Request is the request body
    # (data), not the HTTP method: passing 'GET' there turned the call into a
    # POST whose body was the text "GET". Leaving data out issues a real GET.
    req = urllib2.Request(url, headers=headers)
    page = urllib2.urlopen(req)
    try:
        return page.read()
    finally:
        # Release the connection even when read() fails part-way.
        page.close()

if __name__ == "__main__":
    # Script entry point: download the course list page, extract the course
    # records, report how many were found, then print each one.
    course_list_url = "http://www.imooc.com/course/list"
    courses = getContent(getHtml(course_list_url))
    print(len(courses))
    display(courses)
    print("finished")