【首先感谢我们小组中4位女生(HLX,DH,ZCY,WX)的努力,在征得她们同意之后才将我们的实验报告贴上】
一、设计目的和要求
团队合作设计基于豆瓣电影的推荐系统。至少利用三种算法
二、整体框图
三、数据获取
1、确定推荐系统主题
首先我们分别查看了豆瓣网的电影和读书页面的源代码,发现电影和书其实都差不多,都具有名字、ID、图片地址、导演和演员(作者、出版社)以及标签。
后来我们发现图书的分类标签非常多样化,在文学中可以分为小说、散文、诗歌、童话、名著等,在生活类中可以分为旅行、励志、健康、美食、教育等,而在经管、科技里又可以分为很多类,所以这些书籍都是有两级标签的(如下图所示),而对电影来说,分类标签就简单得多,可以只有一级标签,如惊悚、剧情、爱情、冒险、动作、科幻、动画等,同时,一部电影可以有多个标签。综上,我们组决定把推荐系统的主题定为电影。
2、电影数据采集
在进行数据采集之前,我们通过浏览网页以及查阅书籍,学会了使用Python语言抓取网页数据的方法。
然后我们进入豆瓣电影的主界面,查看其源代码,发现源代码中仅有26部可获取的电影的信息,而我们需要的是几百部电影,后来我们发现了豆瓣电影TOP250的网页,每页有25部电影,一共有10页,于是我们开始从TOP250部电影着手,抓取了这些电影的电影名、电影ID、平均得分、标签、导演、主演、图片地址以及电影地址。
<span style="font-size:18px;">#coding:utf-8
import urllib
import re
# Created by DuanHao
page = 0
j=0
link = 1
while j<10:
url = "http://movie.douban.com/top250?start="+str(page)+"&filter=&type="
content =urllib.urlopen(url).read()
#抓取电影名
prenamelocate = 0
prename = re.compile(r'img alt')
prenamelist = re.findall(prename,content)
aftname = re.compile(r'src=')
aftnamelist = re.findall(aftname,content)
#抓取电影ID
preidlocate = 0
preid = re.compile(r'em class=')
preidlist = re.findall(preid,content)
aftid = re.compile(r'img alt')
aftidlist = re.findall(aftid,content)
#抓取平均得分
prescorelocate = 0
prescore = re.compile(r' \ ')
prescorelist = re.findall(prescore,content)
#抓取图片地址
img = re.compile(r'src="(.+?\.jpg)" class=')
imglist = re.findall(img,content)
#抓取电影地址
premovielocate = 0
premovie = re.compile(r'em class=')
premovielist = re.findall(premovie,content)
aftmovie = re.compile(r'img alt')
aftmovielist = re.findall(aftmovie,content)
#抓取每一页的以上数据
i=0
while i<25:
prenamelocate = content.find(prenamelist[0],prenamelocate)
aftnamelocate = content.find(aftnamelist[0],prenamelocate)
name = content[prenamelocate+9:aftnamelocate-2]
print name
code.open('top250direct.txt','a','utf-8').write(name+' ')
prenamelocate = prenamelocate + 7
preidlocate = content.find(preidlist[0],preidlocate)
aftidlocate = content.find(aftidlist[0],preidlocate)
premovielocate = content.find(premovielist[0],premovielocate)
aftmovielocate = content.find(aftmovielist[0],premovielocate)
if link/100 != 0:
movie_id = content[preidlocate+80+2:aftidlocate-29]
movie_net = content[premovielocate+48+2:aftmovielocate-28]
elif link/10 != 0:
movie_id = content[preidlocate+80+1:aftidlocate-29]
movie_net = content[premovielocate+48+1:aftmovielocate-28]
else:
movie_id = content[preidlocate+80:aftidlocate-29]
movie_net = content[premovielocate+48:aftmovielocate-28]
print movie_id
code.open('top250direct.txt','a','utf-8').write(movie_id+' ')
preidlocate = preidlocate + 9
link+=1
prescorelocate = content.find(prescorelist[0],prescorelocate)
score = content[prescorelocate-47:prescorelocate-44]
print score
code.open('top250direct.txt','a','utf-8').write(score+' ')
prescorelocate = prescorelocate + 10
code.open('top250direct.txt','a','utf-8').write(imglist[i]+' ')
code.open('top250direct.txt','a','utf-8').write(movie_net+' ')
premovielocate = premovielocate + 9
#进入电影子网
subcontent =urllib.urlopen(movie_net).read()
predir = re.compile(r'directedBy')
predirlist = re.findall(predir,subcontent)
aftdir = re.compile(r'</a>')
aftdirlist = re.findall(aftdir,subcontent)
predirlocate = subcontent.find(predirlist[0])
aftdirlocate = subcontent.find(aftdirlist[0],predirlocate)
director = subcontent[predirlocate+12:aftdirlocate]
print director
code.open('top250direct.txt','a','utf-8').write(director+' ')
#抓取3名主演
stars = ['']*3
prestarlocate = 0
prestar = re.compile(r'starring')
prestarlist = re.findall(prestar,subcontent)
aftstar = re.compile(r'</a>')
aftstarlist = re.findall(aftstar,subcontent)
k = 0
while k<3:
prestarlocate = subcontent.find(prestarlist[0],prestarlocate)
aftstarlocate = subcontent.find(aftstarlist[0],prestarlocate)
stars[k] = subcontent[prestarlocate+10:aftstarlocate]
print stars[k]
code.open('top250direct.txt','a','utf-8').write(stars[k]+' ')
k+=1
prestarlocate = prestarlocate + 10
else:
print "stars over"
#抓取上映日期
predate = re.compile(r'initial')
predatelist = re.findall(predate,subcontent)
predatelocate = subcontent.find(predatelist[0])
date = subcontent[predatelocate+29:predatelocate+39]
print date
code.open('top250direct.txt','a','utf-8').write(date+' ')
#抓取标签
pretag = re.compile(r'genre')
afttag = re.compile(r'</span>')
pretaglist = re.findall(pretag,subcontent)
afttaglist = re.findall(afttag,subcontent)
pretaglocate = 0
tags = ['']*10
p = 0
while pretaglocate != -1:
pretaglocate = pretaglocate + 5
pretaglocate = subcontent.find(pretaglist[0],pretaglocate)
afttaglocate = subcontent.find(afttaglist[0],pretaglocate)
if pretaglocate !=-1:
tags[p] = subcontent[pretaglocate+7:afttaglocate]
print tags[p]
code.open('top250direct.txt','a','utf-8').write(tags[p]+' ')
p+=1
else:
code.open('top250direct.txt','a','utf-8').write('\n')
print "tags over"
print "suburl over"
i+=1
else:
print "found end"
page = page + 25
j+=1
else:
print "top 250 finished!"
</span>
后来为了增加电影数量,我们通过抓取到的用户数据,将其打过分的且电影库中没有的电影加进去,最终将电影库扩充到了近2000部,代码如下:
<span style="font-size:18px;">#Created by DuanHao
#coding:utf-8
import urllib
import re
#在用户数据中找到电影ID并检查电影ID库中是否已存在
idd = re.compile(r' (\d+) '
user = open("user.txt",'r')
for line in user:
iddlist = re.findall(idd,line)
print iddlist[0]
url = "http://movie.douban.com/subject/"+str(iddlist[0])+"/"
content =urllib.urlopen(url).read()
flag = 0
fo = open("movie_id.txt",'r')
for ids in fo:
if ids == iddlist[0]+"\n":
flag = 1
print "already have!"
break;
else:
continue
#如果电影不存在则添加该电影的信息至电影库
if flag == 0:
prename = re.compile(r'itemre')
prenamelist = re.findall(prename,content)
aftname = re.compile(r'</span>')
aftnamelist = re.findall(aftname,content)
prenamelocate = content.find(prenamelist[0])
aftnamelocate = content.find(aftnamelist[0],prenamelocate)
name = content[prenamelocate+14:aftnamelocate]
prescore = re.compile(r'average')
prescorelist = re.findall(prescore,content)
prescorelocate = content.find(prescorelist[0])
score = content[prescorelocate+9:prescorelocate+12]
img = re.compile(r'src="(.+?\.jpg)" title=')
imglist = re.findall(img,content)
movie = "http://movie.douban.com/subject/"+str(iddlist[0])+"/"
predir = re.compile(r'directedBy')
predirlist = re.findall(predir,content)
aftdir = re.compile(r'</a>')
aftdirlist = re.findall(aftdir,content)
predirlocate = content.find(predirlist[0])
aftdirlocate = content.find(aftdirlist[0],predirlocate)
director = content[predirlocate+12:aftdirlocate]
if prenamelocate != -1 and aftnamelocate != -1 and prescorelocate != -1 and predirlocate != -1:
open('movie_id.txt','a').write(iddlist[0]+"\n")
open('movielist.txt','a').write(name+' '+iddlist[0]+' '+score+' '+imglist[0]+' '+movie+' '+director+' ')
print name,score,director
stars = ['']*3
prestarlocate = 0
prestar = re.compile(r'starring')
prestarlist = re.findall(prestar,content)
aftstar = re.compile(r'</a>')
aftstarlist = re.findall(aftstar,content)
k = 0
while k<3:
prestarlocate = content.find(prestarlist[0],prestarlocate)
aftstarlocate = content.find(aftstarlist[0],prestarlocate)
stars[k] = content[prestarlocate+10:aftstarlocate]
if prestarlocate != -1 and aftstarlocate != -1:
print stars[k]
open('movielist.txt','a').write(stars[k]+' ')
k+=1
prestarlocate = prestarlocate + 10
else:
print "stars over"
predate = re.compile(r'initial')
predatelist = re.findall(predate,content)
predatelocate = content.find(predatelist[0])
date = content[predatelocate+29:predatelocate+39]
if predatelocate != -1:
print date
open('movielist.txt','a').write(date+' ')
pretag = re.compile(r'genre')
afttag = re.compile(r'</span>')
pretaglist = re.findall(pretag,content)
afttaglist = re.findall(afttag,content)
pretaglocate = 0
tags = ['']*10
p = 0
while pretaglocate != -1:
pretaglocate = pretaglocate + 5
pretaglocate = content.find(pretaglist[0],pretaglocate)
afttaglocate = content.find(afttaglist[0],pretaglocate)
if pretaglocate !=-1:
tags[p] = content[pretaglocate+7:afttaglocate]
print tags[p]
open('movielist.txt','a').write(tags[p]+' ')
p+=1
else:
open('movielist.txt','a').write('\n')
print "tags over"
print "url over"
else:
print flag
print "This user text has finished!"
</span>
3、用户数据采集
抓取整合豆瓣网用户影评打分等信息。从豆瓣网电影top250中的电影评论中爬取评论用户所有的影评历史,包括影评网址、用户ID、电影ID等信息进行整合入库。
代码实现:
(1) catch_user_1函数
<span style="font-size:18px;">#Create by HuSilong
import urllib2
import time
import re
import sys
#import main
#headers = {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
#req1 = urllib2.Request("",headers=headers)
#con = urllib2.urlopen(req1).read()
def function(net):
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0' }
url1 = 'http://movie.douban.com/people/'+str(net)+'/reviews?'
req1 = urllib2.Request(url1,headers=headers)
con1 = urllib2.urlopen(req1).read()
findNumber = "\\(\d{1,3}\\)"
endofline = re.compile(findNumber)
endd = re.findall(endofline,con1)
x_number_s = con1.find(endd[0])
x_number_end = con1.find('</title>')
x1 = con1[x_number_s+1:x_number_end-2]
x = int(x1)
page = 0
usernet = ['']*x
username = ['']*x
movieid = ['']*x
moviename = ['']*x
moviestar = ['']*x
message = ['']*x
link = 1
while page <=(x)/10:
url = 'http://movie.douban.com/people/'+str(net)+'/reviews?start='+str(page*10)
req = urllib2.Request(url,headers=headers)
con = urllib2.urlopen(req).read()
i = 0
user_name_start = con.find(r'starb')
user_name_usernet = con.find(r'href=',user_name_start)
user_name_usernetend = con.find(r'">',user_name_usernet)
user_name_end = con.find(r'</a',user_name_usernetend)
movie_id = con.find(r'href=',user_name_end)
movie_id_end = con.find(r'">',movie_id)
movie_name_end = con.find(r'</a',movie_id_end)
movie_star = con.find(r'allstar')
while i < x and user_name_start != -1:
usernet[i] = con[user_name_usernet+6:user_name_usernetend]
username[i] = con[user_name_usernetend+2:user_name_end]
movieid[i] = con[movie_id+38:movie_id_end-1]
moviename[i] = con[movie_id_end+3:movie_name_end]
moviestar[i] = con[movie_star+7:movie_star+8]
message[i] = usernet[i]+' '+str(net)+' '+movieid[i]+' '+moviestar[i]
print link,' ',message[i]
f = open(r'user_name_4.txt','a')
f.write(message[i])
f.write("\n")
user_name_start = con.find(r'starb',movie_star)
user_name_usernet = con.find(r'href=',user_name_start)
user_name_usernetend = con.find(r'">',user_name_usernet)
user_name_end = con.find(r'</a',user_name_usernetend)
movie_id = con.find(r'href=',user_name_end)
movie_id_end = con.find(r'">',movie_id)
movie_name_end = con.find(r'</a',movie_id_end)
movie_star = con.find(r'allstar',movie_name_end)
i = i + 1
link = link + 1
else:
print page+1,'download finish'
page = page + 1
time.sleep(1)
else:
print all,'find end'
</span>
(2) main函数
<span style="font-size:18px;">#The Main.py created by HuSilong
import urllib2
import time
import re
import sys
import catch_user_1
count = 440
while count<=800:
headers={"Accept": "*/*","Referer": "http://answers.yahoo.com/","User-Agent": "Mozilla/5.0+(compati