# Imports all of the recommended movies from the Douban home page.
import requests
import pymongo
import re
import time
# Douban category tags to crawl; each tag is stored in its own MongoDB
# collection inside the "douban" database.
tags = {'热门', '最新', '经典', '可播放', '豆瓣高分', '冷门佳片', '华语', '欧美', '韩国', '日本', '动作', '喜剧', '爱情', '科幻', '悬疑', '恐怖', '文艺'}

# JSON endpoint that serves the per-tag movie listings.
SEARCH_URL = "https://movie.douban.com/j/search_subjects"
# Number of movies requested per page; also the pagination step.
PAGE_LIMIT = 20
# Seconds to wait for a response before giving up instead of hanging forever.
REQUEST_TIMEOUT = 10
# Seconds to pause between successive page requests.
PAGE_DELAY = 1


def build_movie_document(subject):
    """Return the MongoDB document stored for one entry of "subjects".

    Args:
        subject: one decoded JSON object from the response's "subjects" list.

    Returns:
        A dict with the keys "rate", "title", "url", "cover" and "is_new".
        Missing text fields become empty strings.
    """
    return {
        "rate": subject.get("rate", ""),
        "title": subject.get("title", ""),
        "url": subject.get("url", ""),
        "cover": subject.get("cover", ""),
        # The earlier regex-based extraction stored the JSON literal as text
        # ("true"/"false"); keep that form so the document shape is unchanged.
        "is_new": "true" if subject.get("is_new") else "false",
    }


def fetch_subjects(tag, page_start, page_limit=PAGE_LIMIT):
    """Fetch one page of recommended movies for *tag*.

    Args:
        tag: the Douban tag to query.
        page_start: offset of the first movie to return.
        page_limit: maximum number of movies to return.

    Returns:
        The decoded "subjects" list (empty when there are no more results).

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if no response arrives within REQUEST_TIMEOUT.
        ValueError: if the response body is not valid JSON.
    """
    params = {
        'type': 'movie',
        'tag': tag,
        'sort': 'recommend',
        'page_limit': str(page_limit),
        'page_start': str(page_start),
    }
    response = requests.get(SEARCH_URL, params=params, timeout=REQUEST_TIMEOUT)
    # Fail loudly on an error response; comparing the raw body against
    # '{"subjects":[]}' never matched an error page, so the crawl looped forever.
    response.raise_for_status()
    # Decoding the JSON (instead of regex-scraping the text) resolves escape
    # sequences in the values and does not depend on the order of the keys.
    return response.json().get("subjects", [])


def import_tag(database, tag):
    """Page through every result for *tag* and store it in ``database[tag]``.

    Args:
        database: the pymongo database that receives the documents.
        tag: the Douban tag to crawl; also used as the collection name.
    """
    collection = database[tag]
    page_start = 0
    while True:
        subjects = fetch_subjects(tag, page_start)
        if not subjects:
            break
        # Collection.insert() was removed in PyMongo 4; insert_many also
        # writes the whole page in a single round trip.
        collection.insert_many([build_movie_document(s) for s in subjects])
        page_start += PAGE_LIMIT
        time.sleep(PAGE_DELAY)


def main():
    """Crawl every tag in ``tags`` and print how long each one took."""
    connection = pymongo.MongoClient()
    try:
        database = connection.douban
        for tag in tags:
            start = time.time()
            import_tag(database, tag)
            end = time.time()
            print(tag + ": " + str(end - start) + '\n')
    finally:
        connection.close()


if __name__ == "__main__":
    main()
# Reference on backslash handling: http://www.jb51.net/article/19740.htm