# Note: for technical study only.
# Now that we know where the data comes from, we can start crawling.
import requests
import json
import time
import pymongo
import hashlib
# Hashing helper.
def get_md5(value: str) -> str:
    """Return the hexadecimal MD5 digest of *value*.

    Args:
        value: Text to hash; it is encoded as UTF-8 before hashing.

    Returns:
        The 32-character lowercase hexadecimal digest.
    """
    return hashlib.md5(value.encode('utf-8')).hexdigest()
# Fetch the JSON data.
def get_data(url, timeout=10):
    """Request *url* with a browser-like User-Agent and return the decoded JSON.

    Args:
        url: Address of the JSON endpoint to request.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the caller indefinitely.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.RequestException: On connection errors, timeouts, or an
            HTTP error status (4xx/5xx).
        ValueError: If the response body is not valid JSON.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on an HTTP error instead of trying to decode an error page.
    response.raise_for_status()
    return json.loads(response.text)
# Persist one item.
def save(item):
    """Upsert *item* into the ``moves`` collection of the ``douban`` database.

    The MD5 of ``item['url']`` is stored on the item under ``'hash_url'`` and
    used as the upsert key, so saving the same URL again updates the existing
    document instead of inserting a duplicate (incremental crawling).

    Args:
        item: Mapping with at least a ``'url'`` key. It is modified in place:
            a ``'hash_url'`` key is added.

    Raises:
        KeyError: If *item* has no ``'url'`` key.
    """
    # Incremental key: hash of the item's URL.
    item['hash_url'] = get_md5(item['url'])
    # Using the client as a context manager closes its connections on exit,
    # so repeated calls do not accumulate open clients.
    with pymongo.MongoClient('mongodb://localhost') as client:
        # NOTE(review): the collection name 'moves' may be a typo for
        # 'movies'; it is kept unchanged so existing data stays reachable.
        collection = client['douban']['moves']
        # update_one(..., upsert=True) replaces the legacy Collection.update
        # call, which is no longer available in PyMongo 4.
        collection.update_one(
            {'hash_url': item['hash_url']},
            {'$set': dict(item)},
            upsert=True,
        )
# Extract the data.
def parse(json_data):
    """Extract selected fields from every entry, then print and save each one.

    Args:
        json_data: Decoded response whose ``'data'`` entry is an iterable of
            mappings, each providing the keys ``'directors'``, ``'rate'``,
            ``'title'``, ``'casts'`` and ``'url'``.

    Raises:
        KeyError: If ``'data'`` or one of the expected keys is missing.
    """
    fields = ('directors', 'rate', 'title', 'casts', 'url')
    for data in json_data['data']:
        # Build a fresh dict per entry: save() adds 'hash_url' to the item,
        # and a shared dict would carry the previous entry's hash into the
        # next printout.
        item = {field: data[field] for field in fields}
        print(item)
        save(item)
def main():
    """Crawl the search endpoint page by page until it returns no entries."""
    # Source of the data; ``start`` is the offset of the first result.
    base_url = 'https://movie.douban.com/j/new_search_subjects?sort=U&range=0,10&tags=%E7%94%B5%E5%BD%B1&start={}'
    # Offset step between requests (the value the original offset arithmetic
    # multiplied by; assumed to be the number of results per response).
    page_size = 20
    start = 0
    # The number of pages is unknown, so loop forever -- but with an explicit
    # exit condition.
    while True:
        json_data = get_data(base_url.format(start))
        # Exit condition: the response carries no (more) entries.
        entries = json_data.get('data') if isinstance(json_data, dict) else None
        if not entries:
            break
        parse(json_data)
        # Advance by exactly one page; multiplying an already-stepped counter
        # would skip results.
        start += page_size
        # Pause after each page to keep the request rate low.
        time.sleep(5)


if __name__ == '__main__':
    main()