# coding=utf-8
'''
Created on Oct 6, 2015
<li class="course-one">
<a href="/view/510" target="_self">
<div class="course-list-img">
<img width="240" height="135" alt="Android-节日短信送祝福(UI篇)" src="http://img.mukewang.com/5608968d0001c0e406000338-240-135.jpg">
</div>
<h5>
<span>Android-节日短信送祝福(UI篇)</span>
</h5>
<div class="tips">
<p class="text-ellipsis">本课程带领大家实现一个节日短信的群发的app。</p>
<span class="l ">更新至3-1</span>
<span class="l ml20">4128人学习</span>
</div>
<span class="time-label">2小时33分钟 | 中级 </span>
<b class="follow-label">跟我学</b>
</a>
</li>
@author: sys
'''
import urllib2,re
def getContent(html):
    '''Extract course entries from the course-list page markup.

    Each match of the pattern yields one 4-tuple of captured strings:
    (image URL ending in ".jpg", title text, description text, time label).
    The captured text is returned as-is (not stripped).

    html -- the page markup to search.
    Returns a list of 4-tuples, empty when nothing matches.
    '''
    # The pattern is assembled from adjacent raw literals, one per captured
    # field; re.DOTALL lets ".+?" run across the newlines between tags.
    pattern = (
        r'<div class="course-list-img">.+?src="(.+?\.jpg)".+?</div>'
        r'.+?<span>(.+?)</span>'
        r'.+?class="tips".+?class="text-ellipsis">(.+?)</p>'
        r'.+?class="time-label">(.+?)</span>'
    )
    return re.findall(pattern, html, re.DOTALL)
def display(contentlist):
for content in contentlist:
#values=dict(poster=content[0],title=content[1],rating=content[2],ticket_btn=content[3])
print 'img','\t',content[0].strip()
print 'title','\t',content[1].strip()
print 'description','\t',content[2].strip()
print 'time','\t',content[3].strip()
print'..............................................................................'
def getHtml(url):
    '''Fetch url with browser-like request headers and return the body.

    url -- address of the page to download.

    Returns whatever the response object's read() yields (the raw,
    undecoded page content). Errors raised by urllib2.urlopen or read()
    propagate to the caller.
    '''
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    }
    # Request's second positional parameter is `data`, not the HTTP method.
    # Passing 'GET' there turned the request into a POST whose body was the
    # text "GET"; omitting data makes urllib2 issue a real GET.
    req = urllib2.Request(url, headers=headers)
    page = urllib2.urlopen(req)
    try:
        return page.read()
    finally:
        # Release the connection even when read() fails.
        page.close()
if __name__=="__main__":
url="http://www.imooc.com/course/list"
html=getHtml(url)
#print html
contentlist=getContent(html)
print len(contentlist)
#print contentlist
display(contentlist)
print "finished"
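# Source article title: "python初学——简单爬虫" (Python for beginners: a simple web crawler)
# Blog page residue: "最新推荐文章于 2024-11-07 09:12:34 发布" (latest recommended article, published 2024-11-07 09:12:34)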