A few weeks ago 雷子 asked me to write a web crawler. I remember that back in the first session of monkey's training class, he taught us to write one with BeautifulSoup, which means importing the third-party bs4 package and is not all that convenient.
Then by chance I came across an article online showing that you can crawl without BeautifulSoup at all; regular expressions alone are enough.
My own code is as follows:
# -*- coding: utf-8 -*-
import urllib2, socket, httplib
from urllib2 import URLError
import re

count = 0
values = {}

try:
    headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}

    def leshi(url):
        # One regex per field: show link/title, lead actors, play counts.
        findmeiju_re = re.compile(r'<a href="(http://www.letv.com/tv/.+?)" title="(.+?)".+? target="(.+?)">', re.DOTALL)
        findmeiju_zhuyan_re = re.compile(r'<a href="(http://so.letv.com/.+?)from=list" target="_blank">(.+?)</a>')
        findmeiju_bofang_re = re.compile(r'class="ico_play_num">(.+?)</span>')
        # Send the User-Agent header along with the request.
        req = urllib2.Request(url, headers=headers)
        html = urllib2.urlopen(req).read()
        print "All US TV show titles on the current page:"
        for x in findmeiju_re.findall(html):
            values = dict(
                links=x[0],
                title=x[1],
            )
            print values['title'] + '\n'
        print '--------------------------------'
        print "All lead actors on the current page:"
        for y in findmeiju_zhuyan_re.findall(html):
            print y[1]
        print '--------------------------------'
        print "Play counts for all US TV shows on the current page:"
        for z in findmeiju_bofang_re.findall(html):
            print z

    myurl = "http://list.letv.com/listn/c2_t-1_a50071_y-1_s1_md_o9_d1_p.html"
    leshi(myurl)
except URLError, e:
    print 'time out'
    count = count + 1
except socket.timeout, e:
    print 'socket time out'
    count = count + 1
except httplib.BadStatusLine, e:
    print 'BadStatusLine error'
    count = count + 1
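
One detail in the first pattern above is worth calling out: re.DOTALL makes '.' match newlines too, which matters because an <a> tag's attributes can be split across lines in the page source. A minimal sketch with a made-up sample string:

import re

sample = '<a href="x"\n title="y">'
# Without DOTALL, '.' stops at the newline and the match fails.
print re.findall(r'href="(.+?)".+?title="(.+?)"', sample)             # []
# With DOTALL, '.' crosses the newline and both groups are captured.
print re.findall(r'href="(.+?)".+?title="(.+?)"', sample, re.DOTALL)  # [('x', 'y')]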
The code I wrote back then, modeled on the example monkey gave us, is as follows:
# -*- coding: utf-8 -*-
import urllib2, time, socket, httplib
from bs4 import BeautifulSoup
from urllib2 import URLError

count = 0
tongcheng_name = []

try:
    '''
    # Optional: route the request through an HTTP proxy.
    opener = urllib2.build_opener(urllib2.ProxyHandler({'http': 'http://180.166.50.106:8000'}))
    urllib2.install_opener(opener)
    '''
    headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    req = urllib2.Request('http://hz.58.com/xihuqu/hezu/', headers=headers)
    webpage = urllib2.urlopen(req, timeout=30)
    soup = BeautifulSoup(webpage.read())
    # Drill down: the <section id="selection"> block, then its tab rows, then their links.
    tongcheng_div = soup.find('section', {'id': 'selection'})
    tongcheng_class = tongcheng_div.findAll('div', {'class': 'seltabs'})
    tongcheng_links = [div.findAll('a') for div in tongcheng_class]
    print tongcheng_div
    print '------'
    print tongcheng_class
    print '-------'
    print tongcheng_links
    print '-----------'
    # Crude way to pull the link text: slice the tag's string form.
    print str(tongcheng_links[0][0])[-16:-11]
    tongcheng_name.append(str(tongcheng_links[0][0])[-16:-11])
    time.sleep(1)
except URLError, e:
    print 'time out'
    count = count + 1
except socket.timeout, e:
    print 'socket time out'
    count = count + 1
except httplib.BadStatusLine, e:
    print 'BadStatusLine error'
    count = count + 1
except AttributeError, e:
    print 'no selection'
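
Looking back at this one, the str(...)[-16:-11] slice is fragile: it silently breaks as soon as the tag's markup changes length. A bs4 Tag exposes its text and attributes directly, so a sketch like this (assuming tongcheng_links keeps the same shape as above) would be sturdier:

# More robust than slicing the tag's string form:
first_link = tongcheng_links[0][0]
print first_link.get_text()    # the visible link text
print first_link.get('href')   # the href attribute, if needed
tongcheng_name.append(first_link.get_text())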
The main problem is that I can't use regular expressions and had never touched them before.. the pressure was immense..
I'm sure things would be a lot easier if I knew regex; a small sketch of the basic idea follows.
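
The core of the regex approach is just re.compile plus findall: each pair of parentheses is a capture group, and findall returns one tuple per match. A minimal sketch against a hypothetical fragment shaped like the letv list page:

# -*- coding: utf-8 -*-
import re

# Hypothetical HTML fragment in the same shape as the letv list page.
html = ('<a href="http://www.letv.com/tv/123.html" title="Some Show" class="x" target="_blank">'
        '<span class="ico_play_num">1024</span>')
link_re = re.compile(r'<a href="(http://www.letv.com/tv/.+?)" title="(.+?)".+? target="(.+?)">', re.DOTALL)
play_re = re.compile(r'class="ico_play_num">(.+?)</span>')
for link, title, target in link_re.findall(html):
    print title, '->', link    # each match comes back as a (link, title, target) tuple
print play_re.findall(html)    # ['1024']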
Heh heh, thanks 雷子~~