#作者:Nonikka
#版本:0.3
#2014年3月28日
import os,urllib.request,re
# Scrape the first page of the Douban Top 250 list, read every linked movie
# page, and write (title, review count) pairs to data.txt, most-reviewed first.

# Link to each movie's detail page on the Top 250 listing.
re250 = re.compile(r'<div class="info">\s+<div class="hd">\s+<a href="(.+?)" class="">',re.DOTALL)
# Review count: 4-7 digits immediately before "</span>人评价".
r_number = re.compile(r'<a href.+?(\d{4,7})</span>人评价',re.DOTALL)
# Movie title on a detail page.
r_name = re.compile(r'<span property="v:itemreviewed">(.+?)</span>',re.DOTALL)


def fetch_page(url):
    """Download *url* and return the response body decoded as UTF-8.

    The response is closed as soon as it has been read. Network failures
    raised by ``urlopen`` propagate to the caller.
    """
    with urllib.request.urlopen(url) as response:
        return response.read().decode('utf-8')


def parse_movie_links(listing_html):
    """Return the movie detail-page URLs found in the listing HTML."""
    return re250.findall(listing_html)


def extract_records(pages):
    """Return a ``(title, review_count)`` tuple for every usable movie page.

    *pages* is an iterable of detail-page HTML strings. For each page the raw
    regex matches are printed (as the script always did). Pages where either
    the title or the review count is not found are skipped instead of raising
    ``IndexError``.
    """
    records = []
    for page in pages:
        names = r_name.findall(page)
        counts = r_number.findall(page)
        print(names + counts)
        if not names or not counts:
            # Nothing to record for this page; indexing [0] would fail.
            continue
        records.append((names[0], int(counts[0])))
    return records


def rank_by_reviews(records):
    """Return *records* sorted by review count, highest first."""
    return sorted(records, key=lambda record: record[1], reverse=True)


def main():
    """Run the scrape and write the ranking to ``data.txt``."""
    try:
        listing_html = fetch_page('http://movie.douban.com/top250')
    except OSError:
        # URLError/HTTPError and socket errors are all OSError subclasses.
        # Stop here: without the listing there is nothing left to process.
        print("link Error")
        raise SystemExit(1)

    pages = [fetch_page(link) for link in parse_movie_links(listing_html)]

    # The earlier version popped from the lists it was iterating over, which
    # ended the loops early and truncated the output. Records are now built
    # in a single pass, and kept as a list of pairs so that two movies with
    # the same title do not overwrite each other.
    ranked = rank_by_reviews(extract_records(pages))

    # Explicit UTF-8 so non-ASCII titles can always be written, whatever the
    # platform's default encoding is.
    with open('data.txt', 'w', encoding='utf-8') as out:
        for record in ranked:
            out.write(str(record) + '\n')

    # Keep the console window open (Windows "pause" command).
    os.system("pause")


if __name__ == "__main__":
    main()
# Python爬虫初学(2)豆瓣电影top250评论数
# 最新推荐文章于 2024-03-16 10:00:00 发布