文章目录
爬取豆瓣电影Top250数据,包括电影名、导演、演员等基本信息,以及海报图片、剧情简介和评论数量。
运行截图如下:
1、构建请求头
总共有10页,每页25条电影数据,page_start为每页的起始位置,如第一页为0,第二页为25。因此要爬取全部页数的数据,只需以25为步长从0遍历到225即可,即range(0, 250, 25)。
请求头可以使用https://curlconverter.com/快速构建,使用方法可访问https://blog.csdn.net/Pangaoyang_/article/details/140873357?spm=1001.2014.3001.5502
# Session cookies captured from a browser visit (built with curlconverter.com).
# NOTE: the original literal repeated the __utma/__utmb/__utmc/__utmz keys; a
# Python dict literal keeps only the LAST occurrence of a duplicate key, so the
# shadowed first set (the 30149280.* values) was dead code.  It is removed here;
# the resulting dict is unchanged at runtime.
cookies = {
    'll': '"118282"',
    'bid': 'qpeBkdWNQ30',
    '__utmt': '1',
    '__utma': '223695111.549597820.1722931184.1722931184.1722931184.1',
    '__utmb': '223695111.0.10.1722931184',
    '__utmc': '223695111',
    '__utmz': '223695111.1722931184.1.1.utmcsr=cn.bing.com|utmccn=(referral)|utmcmd=referral|utmcct=/',
    '_pk_ref.100001.4cf6': '%5B%22%22%2C%22%22%2C1722931184%2C%22https%3A%2F%2Fcn.bing.com%2F%22%5D',
    '_pk_id.100001.4cf6': '39e7e842a6abee49.1722931184.',
    '_pk_ses.100001.4cf6': '1',
    'ap_v': '0,6.0',
    '__yadk_uid': '5tRoftzrzq0L8EylRtLcRgAgQ8c6kVkb',
}
# Browser-identifying request headers (generated with curlconverter.com) so the
# scraper's requests look like an ordinary Edge-on-Windows page load.
headers = {
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
# Raw Cookie header left commented out by the generator; cookies are passed
# separately through the `cookies` dict instead.
# 'cookie': 'll="118282"; bid=qpeBkdWNQ30; __utma=30149280.1285408772.1722931171.1722931171.1722931171.1; __utmc=30149280; __utmz=30149280.1722931171.1.1.utmcsr=cn.bing.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmt=1; __utmb=30149280.1.10.1722931171; __utma=223695111.549597820.1722931184.1722931184.1722931184.1; __utmb=223695111.0.10.1722931184; __utmc=223695111; __utmz=223695111.1722931184.1.1.utmcsr=cn.bing.com|utmccn=(referral)|utmcmd=referral|utmcct=/; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1722931184%2C%22https%3A%2F%2Fcn.bing.com%2F%22%5D; _pk_id.100001.4cf6=39e7e842a6abee49.1722931184.; _pk_ses.100001.4cf6=1; ap_v=0,6.0; __yadk_uid=5tRoftzrzq0L8EylRtLcRgAgQ8c6kVkb',
'priority': 'u=0, i',
'referer': 'https://movie.douban.com/top250',
'sec-ch-ua': '"Not)A;Brand";v="99", "Microsoft Edge";v="127", "Chromium";v="127"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'sec-fetch-dest': 'document',
'sec-fetch-mode': 'navigate',
'sec-fetch-site': 'same-origin',
'sec-fetch-user': '?1',
'upgrade-insecure-requests': '1',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0',
}
# Query parameters for the listing page.
# NOTE(review): `page_start` is a free name in this excerpt — it is supplied as
# the parameter of acquire_movie() in the full script at the end of the article.
params = {
'start': f'{page_start}', # offset of the first movie on this page (0, 25, ..., 225)
'filter': '',
}
2、提取数据
用XPath提取网页数据,在提取的时候发现演员数据放在了JS中,因此这部分需要用正则表达式提取。
# Fetch one listing page of the Top 250 (params/cookies/headers defined above),
# then follow every movie's detail link and extract its metadata with XPath.
# NOTE: indentation was lost in the original article rendering; it is restored
# here so the loop body is actually inside the for-loop.
response = requests.get('https://movie.douban.com/top250', params=params, cookies=cookies, headers=headers)
html = etree.HTML(response.text)
hrefs = html.xpath('//*[@id="content"]/div/div[1]/ol/li/div/div[2]/div[1]/a/@href')  # one detail link per movie
# Visit every movie of this page
for href in hrefs:
    response = requests.get(href, cookies=cookies, headers=headers)
    html2 = etree.HTML(response.text)
    title = html2.xpath('//*[@id="content"]/h1/span[1]/text()')[0]  # movie title
    year = html2.xpath('//*[@id="content"]/h1/span[2]/text()')[0][1:-1]  # strip surrounding parentheses
    director = html2.xpath('//*[@id="info"]/span[1]/span[2]/a/text()')[0]  # director
    try:
        writer = html2.xpath('//*[@id="info"]/span[2]/span[2]/a/text()')[0]  # screenwriter
    except IndexError:  # some entries list no screenwriter; [0] on an empty result raises IndexError
        writer = ''
    plot = html2.xpath('//*[@id="info"]/span[5]/text()')[0]  # genre
    score = html2.xpath('//*[@id="interest_sectl"]/div[1]/div[2]/strong/text()')[0]  # rating
    synopsis = html2.xpath('//*[@id="link-report-intra"]/span/text()')  # plot summary
    comment_number = html2.xpath('//*[@id="comments-section"]/div[1]/h2/span/a/text()')[0].split(' ')[1]  # comment count
    image_url = html2.xpath('//*[@id="mainpic"]/a/img/@src')[0]  # poster URL
    # The bare text nodes of #info interleave region / language / aliases with
    # '/' separators and newline-only fragments; keep only the real values.
    temps = html2.xpath('//*[@id="info"]/text()')
    new_temp = []
    for temp in temps:
        temp = temp.replace(' ', '')  # drop spaces
        if ('\n' not in temp) and (temp != '/') and (temp != ''):  # skip newlines, empty strings and '/'
            new_temp.append(temp)
    area = new_temp[0]  # region
    language = new_temp[1]  # language
    alias = new_temp[-2]  # alternative title
    # Actor names live in an embedded JSON-LD <script>.  The '+' must be
    # regex-escaped: an unescaped 'ld+json' means "l, one-or-more d, json" and
    # would match 'ldjson'/'lddjson' but never the literal 'ld+json'.
    raw_json = re.findall(r'<script type="application/ld\+json">(.*?)</script>', response.text, re.DOTALL)[0]
    raw_json = raw_json.replace('\n', '')  # strip newlines
    raw_json = raw_json.replace(' ', '')  # strips a stray whitespace char; presumably a NBSP in the live page — TODO confirm
    movie_info = json.loads(raw_json)  # parse into a dict
    actors = []
    for act in movie_info['actor']:  # collect every actor's name
        actors.append(act['name'])
3、保存数据
数据存储此处使用pandas模块直接保存为了.xlsx文件,可以很方便地使用Excel查看,也可以根据需要选择保存到数据库中。
# One spreadsheet row per movie; element order must match `columns` below.
data = [title, year, director, writer, actors, area, plot, language, alias, score, synopsis, comment_number, image_url]
# Chinese column headers for the Excel file (name, year, director, writer,
# cast, region, genre, language, alias, rating, synopsis, comments, poster URL).
columns = ['电影名', '年份', '导演', '编剧', '主演', '地区', '剧情', '语言', '又名', '评分', '剧情介绍', '评论数', '海报地址']
# NOTE(review): this excerpt never appends `data` to `all_data`; the complete
# script at the end of the article does all_data.append(data) inside the loop.
result = pd.DataFrame(all_data, columns=columns)
# index=True also writes the numeric row index as the first column.
result.to_excel('豆瓣电影数据.xlsx', index=True)
保存结果如下:
完整代码如下:
import requests
from lxml import etree
import re
import json
import pandas as pd
# Session cookies captured from a browser visit (built with curlconverter.com).
# NOTE: the original literal repeated the __utma/__utmb/__utmc/__utmz keys; a
# Python dict literal keeps only the LAST occurrence of a duplicate key, so the
# shadowed first set (the 30149280.* values) was dead code.  It is removed here;
# the resulting dict is unchanged at runtime.
cookies = {
    'll': '"118282"',
    'bid': 'qpeBkdWNQ30',
    '__utmt': '1',
    '__utma': '223695111.549597820.1722931184.1722931184.1722931184.1',
    '__utmb': '223695111.0.10.1722931184',
    '__utmc': '223695111',
    '__utmz': '223695111.1722931184.1.1.utmcsr=cn.bing.com|utmccn=(referral)|utmcmd=referral|utmcct=/',
    '_pk_ref.100001.4cf6': '%5B%22%22%2C%22%22%2C1722931184%2C%22https%3A%2F%2Fcn.bing.com%2F%22%5D',
    '_pk_id.100001.4cf6': '39e7e842a6abee49.1722931184.',
    '_pk_ses.100001.4cf6': '1',
    'ap_v': '0,6.0',
    '__yadk_uid': '5tRoftzrzq0L8EylRtLcRgAgQ8c6kVkb',
}
# Browser-identifying request headers (generated with curlconverter.com) so the
# scraper's requests look like an ordinary Edge-on-Windows page load.
headers = {
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
# Raw Cookie header left commented out by the generator; cookies are passed
# separately through the `cookies` dict instead.
# 'cookie': 'll="118282"; bid=qpeBkdWNQ30; __utma=30149280.1285408772.1722931171.1722931171.1722931171.1; __utmc=30149280; __utmz=30149280.1722931171.1.1.utmcsr=cn.bing.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmt=1; __utmb=30149280.1.10.1722931171; __utma=223695111.549597820.1722931184.1722931184.1722931184.1; __utmb=223695111.0.10.1722931184; __utmc=223695111; __utmz=223695111.1722931184.1.1.utmcsr=cn.bing.com|utmccn=(referral)|utmcmd=referral|utmcct=/; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1722931184%2C%22https%3A%2F%2Fcn.bing.com%2F%22%5D; _pk_id.100001.4cf6=39e7e842a6abee49.1722931184.; _pk_ses.100001.4cf6=1; ap_v=0,6.0; __yadk_uid=5tRoftzrzq0L8EylRtLcRgAgQ8c6kVkb',
'priority': 'u=0, i',
'referer': 'https://movie.douban.com/top250',
'sec-ch-ua': '"Not)A;Brand";v="99", "Microsoft Edge";v="127", "Chromium";v="127"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'sec-fetch-dest': 'document',
'sec-fetch-mode': 'navigate',
'sec-fetch-site': 'same-origin',
'sec-fetch-user': '?1',
'upgrade-insecure-requests': '1',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0',
}
all_data = []  # one row per scraped movie; filled by acquire_movie(), written to Excel in __main__
def acquire_movie(page_start):
    """Scrape one listing page of the Douban Top 250.

    Fetches the listing page starting at offset ``page_start`` (0, 25, ..., 225),
    follows every movie's detail link, extracts its metadata with XPath (plus a
    regex for the JSON-LD cast list) and appends one row per movie to the
    module-level ``all_data`` list.
    """
    params = {
        'start': f'{page_start}',  # offset of the first movie on this page
        'filter': '',
    }
    response = requests.get('https://movie.douban.com/top250', params=params, cookies=cookies, headers=headers)
    html = etree.HTML(response.text)
    # One detail-page link per movie on this listing page.
    hrefs = html.xpath('//*[@id="content"]/div/div[1]/ol/li/div/div[2]/div[1]/a/@href')
    # Visit every movie of this page.
    for href in hrefs:
        response = requests.get(href, cookies=cookies, headers=headers)
        html2 = etree.HTML(response.text)
        title = html2.xpath('//*[@id="content"]/h1/span[1]/text()')[0]  # movie title
        year = html2.xpath('//*[@id="content"]/h1/span[2]/text()')[0][1:-1]  # strip surrounding parentheses
        director = html2.xpath('//*[@id="info"]/span[1]/span[2]/a/text()')[0]  # director
        try:
            writer = html2.xpath('//*[@id="info"]/span[2]/span[2]/a/text()')[0]  # screenwriter
        except IndexError:  # some entries list no screenwriter; [0] on an empty result raises IndexError
            writer = ''
        plot = html2.xpath('//*[@id="info"]/span[5]/text()')[0]  # genre
        score = html2.xpath('//*[@id="interest_sectl"]/div[1]/div[2]/strong/text()')[0]  # rating
        synopsis = html2.xpath('//*[@id="link-report-intra"]/span/text()')  # plot summary
        comment_number = html2.xpath('//*[@id="comments-section"]/div[1]/h2/span/a/text()')[0].split(' ')[1]  # comment count
        image_url = html2.xpath('//*[@id="mainpic"]/a/img/@src')[0]  # poster URL
        # The bare text nodes of #info interleave region / language / aliases
        # with '/' separators and newline-only fragments; keep the real values.
        temps = html2.xpath('//*[@id="info"]/text()')
        new_temp = []
        for temp in temps:
            temp = temp.replace(' ', '')  # drop spaces
            if ('\n' not in temp) and (temp != '/') and (temp != ''):  # skip newlines, empty strings and '/'
                new_temp.append(temp)
        area = new_temp[0]  # region
        language = new_temp[1]  # language
        alias = new_temp[-2]  # alternative title
        # Actor names live in an embedded JSON-LD <script>.  The '+' must be
        # regex-escaped: an unescaped 'ld+json' means "l, one-or-more d, json"
        # and would match 'ldjson'/'lddjson' but never the literal 'ld+json'.
        ld_json = re.findall(r'<script type="application/ld\+json">(.*?)</script>', response.text, re.DOTALL)[0]
        ld_json = ld_json.replace('\n', '')  # strip newlines
        ld_json = ld_json.replace(' ', '')  # strips a stray whitespace char; presumably a NBSP in the live page — TODO confirm
        movie_info = json.loads(ld_json)  # parse into a dict
        actors = [act['name'] for act in movie_info['actor']]  # collect every actor's name
        # One spreadsheet row; order must match `columns` in __main__.
        row = [title, year, director, writer, actors, area, plot, language, alias, score, synopsis, comment_number, image_url]
        all_data.append(row)
        print(row)
if __name__ == '__main__':
    # Pages are addressed by the offset of their first movie: 0, 25, ..., 225.
    for page_number, offset in enumerate(range(0, 250, 25), start=1):
        print(f'--------第{page_number}页---------')
        acquire_movie(offset)
    # Assemble every collected row into a spreadsheet and save it.
    columns = ['电影名', '年份', '导演', '编剧', '主演', '地区', '剧情', '语言', '又名', '评分', '剧情介绍', '评论数', '海报地址']
    result = pd.DataFrame(all_data, columns=columns)
    result.to_excel('豆瓣电影数据.xlsx', index=True)