# spa1 (Ajax 加载、爬虫练习 — Ajax-loaded pages, crawler practice)
# 总结 (requests 实现 — summary of the requests-based implementation)
'''
jsonpath模块简单用法
Eg1.
from jsonpath import jsonpath
data = {'name1': {'name2': {'name3': {'name4': {'name5': '陈子康'}}}}}
#常规字典索引
print(data['name1']['name2']['name3']['name4']['name5'])
#使用jsonpath模块
print(jsonpath(data, '$..name5'))
jsonpath模块可以直接访问多层字典的内部位置
注意:jsonpath会以列表形式返回字典中所有满足条件的值,取值要添加索引' [0] '
'''
# requests 实现 (implementation using requests)
# url = 'https://spa1.scrape.center/'
import requests
import json
from jsonpath import jsonpath
# 构造每一页的url
def get_url(INDEX_URL, offset):
    """Build the listing-API URL for one page by appending *offset* to the base URL."""
    return f"{INDEX_URL}{offset}"
# 访问详情页,并返回简介
def abstract_data(film_i, page):
    """Fetch one movie's detail page and return its synopsis (the "drama" field).

    film_i: zero-based index of the film within its listing page.
    page:   1-based listing page number (10 films per page).
    Returns the synopsis string, or None when the detail payload has no "drama" key.
    """
    # Detail-page ids on this site are 1-based: film 0 on page 1 is /api/movie/1/.
    # Add 1 to the zero-based offset — the original omitted it, so the first film
    # requested nonexistent id 0 and every other film got the PREVIOUS movie's synopsis.
    movie_id = (page - 1) * 10 + film_i + 1
    url = 'https://spa1.scrape.center/api/movie/{}/'.format(movie_id)
    response = requests.get(url).text
    res = json.loads(response)
    # jsonpath returns a list of all matches, or False when nothing matches
    # (see the usage note at the top of this file) — unwrap the single match.
    matches = jsonpath(res, '$..drama')
    return matches[0] if matches else None
def main():
    """Scrape all 10 listing pages (100 movies) from spa1.scrape.center and print the results."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36 Edg/96.0.1054.62'
    }
    # Loop-invariant base URL, hoisted out of the page loop.
    INDEX_URL = 'https://spa1.scrape.center/api/movie/?limit=10&offset='
    # Accumulates one dict per movie across all pages.
    film_list = []
    for page in range(1, 11):
        url = get_url(INDEX_URL, (page - 1) * 10)
        response = requests.get(url, headers=headers).text
        # Parse the JSON string into a Python object and pull out this page's movies.
        res = json.loads(response)
        res_page = res["results"]
        # One record per film: name, foreign title, genres, runtime,
        # release date, score, and the synopsis from the detail page.
        for i, film in enumerate(res_page):
            film_dict = {
                'name': film['name'],
                'w_name': film['alias'],
                'type': film['categories'],
                'long': film['minute'],
                'time': film['published_at'],
                'score': film['score'],
                # i is the zero-based index of the film on this listing page.
                'abstract': abstract_data(i, page),
            }
            film_list.append(film_dict)
    print(film_list)
    print(len(film_list))
# Run the scraper only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()