展开全部
使用 BeautifulSoup 进行解析 html,需要安装 BeautifulSoup
#coding=utf-8
import urllib2
import socket
import httplib
from bs4 import BeautifulSoup
# Browser-like User-Agent string; downloadPage sends it as the 'User-Agent'
# request header.
UserAgent = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36'
def downloadPage(url):
    """Download *url* and return the raw response body.

    The request carries the module-level ``UserAgent`` string as its
    ``User-Agent`` header and uses a 30-second timeout.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body as returned by ``read()``. If the request fails
        with ``urllib2.URLError`` (which includes ``urllib2.HTTPError``),
        ``socket.error`` or ``httplib.BadStatusLine``, the error is printed
        and an empty string ``''`` is returned instead.
    """
    try:
        opener = urllib2.build_opener()
        headers = {'User-Agent': UserAgent}
        req = urllib2.Request(url=url, headers=headers)
        resp = opener.open(req, timeout=30)
        try:
            return resp.read()
        finally:
            # Always release the underlying connection, even if read() fails.
            resp.close()
    # HTTPError is a subclass of URLError, so a single tuple covers every
    # error type that was handled before; all of them are reported the same way.
    except (urllib2.URLError, socket.error, httplib.BadStatusLine) as ex:
        # Single-argument print(...) behaves identically as a Python 2
        # print statement and as the Python 3 print function.
        print(ex)
        return ''
if __name__ == '__main__':
    # The URL below is a placeholder literal: replace it with the address of
    # the Douban list page to scrape before running the script.
    content = downloadPage("这填douban的地址")
    soup = BeautifulSoup(content, 'lxml')

    # Every <li> under <ol class="grid_view"> is handled as one movie entry.
    # downloadPage returns '' on failure; presumably that yields no matches
    # here, so nothing is printed in that case.
    entries = soup.select('ol.grid_view li')
    for item in entries:
        # Single-argument print(...) works both as a Python 2 print statement
        # and as the Python 3 print function.

        # Link to the movie's detail page.
        print(item.select('div.item > div.pic a')[0].attrs['href'])
        # Poster image URL.
        print(item.select('div.item > div.pic a img')[0].attrs['src'])
        # Title.
        print(item.select('div.item > div.info > div.hd > a > span.title')[0].get_text())
        # Rating.
        print(item.select('div.item > div.info > div.bd > div.star > span.rating_num')[0].get_text())
        # NOTE(review): source indentation was lost; the separator is assumed
        # to be printed after each entry rather than once at the end.
        print('-------------------------------------------------------------------------')