使用requests,lxml模块获取豆瓣电影所有分类下的电影(电影标题,演员,评分,链接,类型等信息)
- 获取主页面内容(url:https://movie.douban.com/chart)
- 将主页url传给get_conent函数获取页面内容,再使用xpath提取所有类型的url
- 使用正则提取每个url里的type_name,type参数(因为每个类型的详情页是ajax请求),用于拼接ajax请求地址
- 每个类型的ajax分页处理,返回json数据
- 从json数据中提取需要的字段信息
下面是源码
import requests, json, re
from lxml import etree
from urllib import parse
def get_conent(url, headers, timeout=10):
    """Fetch ``url`` with an HTTP GET and return the response body as text.

    :param url: URL to request.
    :param headers: dict of HTTP request headers sent with the request.
    :param timeout: seconds to wait for the server before ``requests``
        gives up; ``requests`` applies no timeout unless one is passed, so
        a stalled connection would otherwise block the crawl indefinitely.
    :return: the response body as ``str`` (``response.text``). The text is
        NOT decoded from JSON here; callers parse it as HTML or JSON.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    return response.text
def parse_json(json_data):
    """Extract the wanted fields from each movie record, print and collect them.

    :param json_data: iterable of dicts decoded from the ajax response. Each
        dict must provide the keys ``rating``, ``cover_url``, ``title``,
        ``actors``, ``url``, ``vote_count`` and ``types``; other keys are
        ignored.
    :return: list with one dict per record, keyed by ``rating``, ``imag``,
        ``title``, ``actors``, ``vote_count``, ``detail_url`` and ``types``.
    :raises KeyError: if a record lacks one of the required keys.
    """
    items = []
    for data in json_data:
        # Build a fresh dict per movie: reusing a single dict across
        # iterations would make every collected entry alias the last movie.
        item = {
            'rating': data['rating'],
            'imag': data['cover_url'],
            'title': data['title'],
            'actors': data['actors'],
            'vote_count': data['vote_count'],
            'detail_url': data['url'],
            'types': data['types'],
        }
        print(item)
        items.append(item)
    return items
def parse_ajax(url, type_, refer, step=100):
    """Page through one category's ajax listing, passing each page to parse_json.

    :param url: URL template with two ``{}`` placeholders, filled in order
        with ``type_`` and the current start offset.
    :param type_: category id substituted into the first placeholder.
    :param refer: value sent as the ``Referer`` request header.
    :param step: amount added to the start offset after each page. It should
        equal the ``limit`` value baked into ``url``.
    :return: None. Paging stops at the first response that is not a
        non-empty JSON list.
    """
    headers = {
        'X-Requested-With': 'XMLHttpRequest',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36',
        'Referer': refer,
    }
    start = 0
    while True:
        json_str = get_conent(url.format(type_, start), headers=headers)
        print(json_str)
        try:
            json_data = json.loads(json_str)
        except json.JSONDecodeError:
            # A non-JSON body (e.g. an HTML error page) cannot be paged
            # further; report it and stop instead of crashing the crawl.
            print('stop paging: response at start={} is not valid JSON'.format(start))
            break
        # Decide on the decoded value rather than the raw text: comparing
        # the text with '[]' misses whitespace variants (endless loop) and
        # lets non-list payloads through to parse_json.
        if not isinstance(json_data, list) or not json_data:
            break
        parse_json(json_data)
        start += step
def main():
    """Crawl every movie category linked from the Douban chart page.

    Fetches the chart page, collects the category links with XPath, extracts
    each link's ``type_name`` and ``type`` query values, and calls
    ``parse_ajax`` once per category.
    """
    base_url = 'https://movie.douban.com/chart'
    # Request the chart page first to discover each category's type value.
    headers = {
        'Referer': 'https://movie.douban.com/',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36',
    }
    html_str = get_conent(base_url, headers)
    html = etree.HTML(html_str)
    type_urls = html.xpath('//div[@class="types"]/span/a/@href')

    # Loop invariants, built once.
    # Category hrefs look like:
    #   /typerank?type_name=<name>&type=11&interval_id=100:90&action=
    pattern = re.compile(r'.*?type_name=(.*?)&type=(.*?)&interval.*?')
    # Ajax endpoint, e.g.
    #   https://movie.douban.com/j/chart/top_list?type=11&interval_id=100%3A90&action=&start=20&limit=20
    ajax_url = 'https://movie.douban.com/j/chart/top_list?type={}&interval_id=100%3A90&action=&start={}&limit=100'

    for url in type_urls:
        result = pattern.search(url)
        if result is None:
            # Not a category link in the expected format; skip it rather
            # than fail on None.group().
            continue
        type_name, type_ = result.groups()

        params = {
            'type_name': type_name,
            'type': type_,
            'interval_id': '100:90',
            'action': '',
        }
        # parse.urlencode() turns the dict into key1=value1&key2=value2 and
        # percent-encodes non-ASCII text such as the category name.
        # The Referer is the category's typerank page (the path the scraped
        # hrefs use), not the ajax endpoint itself.
        refer = 'https://movie.douban.com/typerank?' + parse.urlencode(params)
        parse_ajax(ajax_url, type_, refer)
# Start the crawl only when this file is run as a script, not when imported.
if __name__ == "__main__":
    main()