某公司发来的邮件测试题:从优酷视频列表页抓取前 5 页视频的标题和链接。用 Python 写了一个,以前没怎么用过,算是进步了……
- #Python version:2.6.6
- #filename:youkuParser
- import urllib2
- import codecs
- from sgmllib import SGMLParser
- class URLLister(SGMLParser): #继承SGMLParser
- def __init__(self):
- SGMLParser.__init__(self)
- def start_a(self,attrs): #对超链接标签进行筛选判断
- if len(attrs)==4:
- if ((attrs[0][0]=='href') & (attrs[1][0]=='title') & (attrs[2][0]=='target') & (attrs[2][1]=='video')):
- f = codecs.open('out.txt','a','utf-8') #从attrs中输出到文件
- title = unicode(attrs[1][1],'utf-8')
- link = unicode(attrs[0][1],'utf-8')
- f.write(title)
- f.write('\n')
- f.write(link)
- f.write('\n\n')
- f.close()
- class NextPage(SGMLParser): #抓取下一页的链接
- nextPage = ''
- def __init__(self):
- SGMLParser.__init__(self)
- def start_a(self,attrs):
- if len(attrs)==2:
- if((attrs[0][0]=='href') & (attrs[1][0]=='charset') & (attrs[1][1]=='742-4-1-999')):
- nextPage = "http://www.youku.com" + attrs[0][1] #构造超链接
- page.nextPage = nextPage
def _fetch(url):
    """Return the response body of ``url``, always closing the connection."""
    sock = urllib2.urlopen(url)
    try:
        return sock.read()
    finally:
        sock.close()


# Number of listing pages to scrape, starting from the first one.
PAGE_COUNT = 5

HtmlSource = _fetch("http://www.youku.com/v_showlist/t2d1c94g235.html")
print('Hello')
for i in range(1, PAGE_COUNT + 1):
    # Write this page's video titles and links to out.txt.
    lister = URLLister()
    lister.feed(HtmlSource)
    if i == PAGE_COUNT:
        # Last wanted page is done; do not download a page nobody parses.
        break
    # Find the link to the following page and download it.
    page = NextPage()
    page.feed(HtmlSource)
    if not page.nextPage:
        # No next-page link found: stop instead of calling urlopen('').
        break
    HtmlSource = _fetch(page.nextPage)
print('Done')
转载于:https://blog.51cto.com/wqzzq/661944