在本篇博客中,我们将使用requests+正则表达式(re)来爬取豆瓣电影TOP250电影榜单,获取每部电影的序号、片名、导演、编剧、主演、类型、制片国家/地区、语言、上映日期、片长、又名、豆瓣评分和剧情简介等内容。
打开豆瓣Top250,分析URL的变化:发现Top250榜总共包含10页,每页25部电影,并且每一页的URL都是有规律的,如第2页的URL是https://movie.douban.com/top250?start=25&filter=,第3页的URL是
https://movie.douban.com/top250?start=50&filter=
由此可得,第n页的URL是https://movie.douban.com/top250?start=((n-1)*25)&filter=;代码中循环变量i从0开始计数,因此写作start=(i*25)。
1、调用库,搭建主体框架
import requests
from requests.exceptions import RequestException
import re
def get_page(url):
    """Skeleton placeholder for the page downloader.

    The real body is written in step 2 of the tutorial; for now the
    function ignores ``url`` and returns None.
    """
    return None
def get_movie_list(html):
    """Skeleton placeholder for the list-page parser.

    The real body is written in step 3 of the tutorial; for now the
    function ignores ``html`` and returns None.
    """
    return None
def get_content(movie_url):
    """Skeleton placeholder for the detail-page scraper.

    The real body is written in step 4 of the tutorial; for now the
    function ignores ``movie_url`` and returns None.
    """
    return None
if __name__ == '__main__':
    # The list has 10 pages of 25 movies each; `start` is the offset of the
    # first movie shown on the page.
    for i in range(10):
        url = 'https://movie.douban.com/top250?start=' + str(i * 25)
        # Send the request and get the response.
        html = get_page(url)
        if not html:
            # get_page yields None when nothing was downloaded; parsing None
            # would raise a TypeError, so move on to the next page instead.
            continue
        # Parse the response into a list of (rank, url) entries.
        movie_list = get_movie_list(html)
        if not movie_list:
            continue
        # Fetch the details of every movie on this page.
        # NOTE: the loop variable must stay a module-level name `movie`,
        # because get_content looks that name up when it saves a row.
        for movie in movie_list:
            get_content(movie[1])
2、发送请求,获取网页
def get_page(url):
    """Download a page and return its HTML text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded response body as a string, or None when the request
        fails (connection error, timeout, or a 4xx/5xx status code).
    """
    # Identify as a desktop Chrome browser.
    headers = {
        'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
    }
    try:
        # A timeout keeps one stalled connection from hanging the whole crawl.
        response = requests.get(url, headers=headers, timeout=10)
        # Turn 4xx/5xx answers into an exception so they are reported below.
        response.raise_for_status()
    except RequestException as exc:
        print('Failed to fetch {}: {}'.format(url, exc))
        return None
    # Decode using the encoding detected from the body itself.
    response.encoding = response.apparent_encoding
    return response.text
和上一篇爬取简单信息不同,我们需要剧情简介等内容,需要点开每部电影,获取详细信息。所以,我们首先要把页面上所有电影的链接爬下来,再逐个链接进行详细信息的爬取。
初始页面的每部电影都包含在li标签中,电影的序号和链接都在上图所示的标签中,接下来我们需要解析初始页面的html代码
3、提取电影序号和链接:
def get_movie_list(html):
    """Extract the rank and detail-page URL of every movie on a list page.

    Args:
        html: HTML text of one Top 250 list page, or None/'' when the
            download failed.

    Returns:
        A list of ``(rank, url)`` string tuples in page order; an empty
        list when ``html`` is missing or contains no entries.
    """
    if not html:
        # Nothing was downloaded; findall() would raise TypeError on None.
        return []
    # Inside each <div class="pic">, the <em> holds the rank and the first
    # <a href> after it holds the detail-page link. re.S lets '.' cross the
    # line breaks between those tags.
    pattern = re.compile(
        r'<div class="pic".*?em class="">(.*?)</em>.*?<a href="(.*?)">.*?</a>',
        re.S,
    )
    return pattern.findall(html)
4、依次打开每个电影的url,对详细信息分别进行解析提取:
(1)、片名
(2)、导演、编剧、主演、类型、制片国家/地区、语言、上映日期、片长、又名、豆瓣评分
(3)、剧情简介
def get_content(movie_url, rank=None):
    """Scrape one movie's detail page, print its fields and append a row to result.txt.

    Args:
        movie_url: URL of the movie's detail page.
        rank: Position of the movie in the list. Optional so that callers
            passing only the URL keep working; when omitted, the rank is
            taken from a module-level variable named ``movie`` if one
            exists (the script's main loop defines it), otherwise it is
            left empty.

    Returns:
        None. The row is tab-separated: rank, URL, then the string form of
        each extracted list, then the cleaned plot summary.
    """
    html = get_page(movie_url)
    if not html:
        # The download failed; there is nothing to parse or save.
        print('Skipping {}: page could not be downloaded'.format(movie_url))
        return

    if rank is None:
        # Backward compatibility: the script originally read the rank from
        # the global loop variable `movie` rather than from an argument.
        current = globals().get('movie')
        rank = current[0] if current else ''

    def find_all(pattern):
        """Return every match of pattern in the page; '.' also matches newlines."""
        return re.findall(pattern, html, re.S)

    # Title
    name = find_all('<span property="v:itemreviewed">(.*?)</span>')
    # Directors
    director = find_all('<a.*?rel="v:directedBy">(.*?)</a>')
    # Screenwriters cannot be captured in one pass: first narrow the search
    # to the screenwriter row, then pull the text of each link inside it.
    author = find_all("<span ><span class='pl'>编剧</span>: <span class='attrs'>(.*?)</span></span><br/>")
    if author:
        author = re.findall('<a href=.*?>(.*?)</a>', author[0], re.S)
    # Leading actors
    actor = find_all('<a.*?rel="v:starring">(.*?)</a>')
    # Genres
    genre = find_all('<span property="v:genre">(.*?)</span>')
    # Country/region of production
    area = find_all('<span class="pl">制片国家/地区:</span>(.*?)<br/>')
    # Languages
    language = find_all('<span class="pl">语言:</span>(.*?)<br/>')
    # Release dates
    release_date = find_all('<span property="v:initialReleaseDate" content=.*?>(.*?)</span>')
    # Running time
    runtime = find_all('<span property="v:runtime" content=.*?>(.*?)</span>')
    # Alternative titles
    other_name = find_all('<span class="pl">又名:</span>(.*?)<br/>')
    # Rating
    score = find_all('<strong class="ll rating_num" property="v:average">(.*?)</strong>')
    # Plot summary: use the first match, or '' when the page has none.
    summary = find_all('<span property="v:summary".*?>(.*?)</span>')
    introduce = summary[0].strip().replace('\n', '').replace('\t', '') if summary else ''
    # Drop <br /> tags together with however many spaces surround them.
    introduce = re.sub(' *<br */?> *', '', introduce)

    fields = [name, director, author, actor, genre, area, language,
              release_date, runtime, other_name, score]
    for value in fields:
        print(value)
    print(introduce)

    # Save the data: one tab-separated line per movie, appended to the file.
    row = [str(rank), movie_url] + [str(value) for value in fields] + [introduce]
    with open('result.txt', 'a', encoding='utf-8') as f:
        f.write('\t'.join(row) + '\n')
5、完整代码如下
```python
import requests
from requests.exceptions import RequestException
import re
def get_page(url):
    """Download a page and return its HTML text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded response body as a string, or None when the request
        fails (connection error, timeout, or a 4xx/5xx status code).
    """
    # Identify as a desktop Chrome browser.
    headers = {
        'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
    }
    try:
        # A timeout keeps one stalled connection from hanging the whole crawl.
        response = requests.get(url, headers=headers, timeout=10)
        # Turn 4xx/5xx answers into an exception so they are reported below.
        response.raise_for_status()
    except RequestException as exc:
        print('Failed to fetch {}: {}'.format(url, exc))
        return None
    # Decode using the encoding detected from the body itself.
    response.encoding = response.apparent_encoding
    return response.text
def get_movie_list(html):
    """Extract the rank and detail-page URL of every movie on a list page.

    Args:
        html: HTML text of one Top 250 list page, or None/'' when the
            download failed.

    Returns:
        A list of ``(rank, url)`` string tuples in page order; an empty
        list when ``html`` is missing or contains no entries.
    """
    if not html:
        # Nothing was downloaded; findall() would raise TypeError on None.
        return []
    # Inside each <div class="pic">, the <em> holds the rank and the first
    # <a href> after it holds the detail-page link. re.S lets '.' cross the
    # line breaks between those tags.
    pattern = re.compile(
        r'<div class="pic".*?em class="">(.*?)</em>.*?<a href="(.*?)">.*?</a>',
        re.S,
    )
    return pattern.findall(html)
def get_content(movie_url, rank=None):
    """Scrape one movie's detail page, print its fields and append a row to result.txt.

    Args:
        movie_url: URL of the movie's detail page.
        rank: Position of the movie in the list. Optional so that callers
            passing only the URL keep working; when omitted, the rank is
            taken from a module-level variable named ``movie`` if one
            exists (the script's main loop defines it), otherwise it is
            left empty.

    Returns:
        None. The row is tab-separated: rank, URL, then the string form of
        each extracted list, then the cleaned plot summary.
    """
    html = get_page(movie_url)
    if not html:
        # The download failed; there is nothing to parse or save.
        print('Skipping {}: page could not be downloaded'.format(movie_url))
        return

    if rank is None:
        # Backward compatibility: the script originally read the rank from
        # the global loop variable `movie` rather than from an argument.
        current = globals().get('movie')
        rank = current[0] if current else ''

    def find_all(pattern):
        """Return every match of pattern in the page; '.' also matches newlines."""
        return re.findall(pattern, html, re.S)

    # Title
    name = find_all('<span property="v:itemreviewed">(.*?)</span>')
    # Directors
    director = find_all('<a.*?rel="v:directedBy">(.*?)</a>')
    # Screenwriters cannot be captured in one pass: first narrow the search
    # to the screenwriter row, then pull the text of each link inside it.
    author = find_all("<span ><span class='pl'>编剧</span>: <span class='attrs'>(.*?)</span></span><br/>")
    if author:
        author = re.findall('<a href=.*?>(.*?)</a>', author[0], re.S)
    # Leading actors
    actor = find_all('<a.*?rel="v:starring">(.*?)</a>')
    # Genres
    genre = find_all('<span property="v:genre">(.*?)</span>')
    # Country/region of production
    area = find_all('<span class="pl">制片国家/地区:</span>(.*?)<br/>')
    # Languages
    language = find_all('<span class="pl">语言:</span>(.*?)<br/>')
    # Release dates
    release_date = find_all('<span property="v:initialReleaseDate" content=.*?>(.*?)</span>')
    # Running time
    runtime = find_all('<span property="v:runtime" content=.*?>(.*?)</span>')
    # Alternative titles
    other_name = find_all('<span class="pl">又名:</span>(.*?)<br/>')
    # Rating
    score = find_all('<strong class="ll rating_num" property="v:average">(.*?)</strong>')
    # Plot summary: use the first match, or '' when the page has none.
    summary = find_all('<span property="v:summary".*?>(.*?)</span>')
    introduce = summary[0].strip().replace('\n', '').replace('\t', '') if summary else ''
    # Drop <br /> tags together with however many spaces surround them.
    introduce = re.sub(' *<br */?> *', '', introduce)

    fields = [name, director, author, actor, genre, area, language,
              release_date, runtime, other_name, score]
    for value in fields:
        print(value)
    print(introduce)

    # Save the data: one tab-separated line per movie, appended to the file.
    row = [str(rank), movie_url] + [str(value) for value in fields] + [introduce]
    with open('result.txt', 'a', encoding='utf-8') as f:
        f.write('\t'.join(row) + '\n')
if __name__ == '__main__':
    # The list has 10 pages of 25 movies each; `start` is the offset of the
    # first movie shown on the page.
    for i in range(10):
        url = 'https://movie.douban.com/top250?start=' + str(i * 25)
        print(url)
        # Send the request and get the response.
        html = get_page(url)
        if not html:
            # get_page yields None when nothing was downloaded; parsing None
            # would raise a TypeError, so move on to the next page instead.
            continue
        # Parse the response into a list of (rank, url) entries.
        movie_list = get_movie_list(html)
        print(movie_list)
        # Fetch the details of every movie on this page.
        # NOTE: the loop variable must stay a module-level name `movie`,
        # because get_content looks that name up when it saves a row.
        for movie in movie_list:
            get_content(movie[1])