# Douban Movies Top 250 scraper
# 1. Scrape the index (listing) pages
import csv
import os
import random
import re
import time

import requests
from bs4 import BeautifulSoup as bs
movie_list = []
# get_multi_pages(pagenum) is the driver that pages through the listing.
def get_multi_pages(pagenum):
    """
    Main driver: iterate over the listing pages, fetching and parsing each.

    :param pagenum: number of listing pages to process (25 movies per page)
    :return: None
    """
    # Build each page's URL from the 25-per-page rank offset.
    for movie_index in range(0, pagenum * 25, 25):
        # movie_index is the rank offset used as the url's "start" parameter.
        url = 'https://movie.douban.com/top250?start=' + str(movie_index) + '&filter='
        # print('翻页用的url:', url)  # debug
        print(f'正在抓取第{movie_index // 25 + 1}页信息,请稍待......')
        # Phase 1: download the page into a local file (requests + open).
        # BUG FIX: the line below was bare prose (a syntax error); it is a
        # comment describing get_page(url), which fetches and caches a page.
        # get_page(url)  # re-enable to (re)download; commented out once pages are cached
        # Phase 2: parse the cached file and extract movie info (bs4 + re).
        # Cached file name pattern: index/top_<page>.html
        file_name = 'index/top_' + str(movie_index // 25 + 1) + '.html'
        get_movie_info(file_name)
# get_page(url) downloads one listing page and caches it on disk.
def get_page(url):
    """
    Fetch a listing page and write its HTML to index/top_<page>.html.

    :param url: listing-page URL containing a "start=<offset>" parameter
    :return: None
    """
    headers = {
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36'
    }
    # Fetch the page and force utf-8 decoding.
    response = requests.get(url, headers=headers)
    response.encoding = 'utf-8'
    # Recover the rank offset from the URL and derive the 1-based page number.
    start_value = url.split('=')[1].split('&')[0]
    page_number = int(start_value) // 25 + 1
    target = 'index/top_' + str(page_number) + '.html'
    print(f'正在写入{target},请稍待......')
    # Cache the fetched HTML locally for later parsing.
    with open(target, 'w', encoding='utf-8') as cache_file:
        cache_file.write(response.text)
# Phase 3: parse a cached listing page and extract each movie's index info.
def get_movie_info(file_name):
    """
    Open a cached listing page and append one (rank, title, rating,
    vote count, quote, detail link) tuple per movie to the global movie_list.

    :param file_name: path of the cached HTML file (index/top_<page>.html)
    :return: None
    """
    with open(file_name, 'r', encoding='utf-8') as db:
        str_page = db.read()
    soup_page = bs(str_page, 'html.parser')  # parse into a soup object
    try:
        tagOL = soup_page.find('ol', class_='grid_view')
    except Exception as e:
        print(e)
        tagOL = ' '
    # Extract every <li> inside the <ol> -- one per movie.
    # find_all works by tag name alone; use find when matching attributes.
    try:
        topLIs = tagOL.find_all('li')
    except Exception as e:
        print(e)
        topLIs = []
    print(len(topLIs))
    # Walk each movie's <li> and pull out its fields.
    for movie in topLIs:
        # a. rank (the <em> element)
        try:
            topem = movie.em.string
        except Exception as e:
            print(e)
            topem = ''
        print('影片的id:', topem)
        # b. title  <span class="title">肖申克的救赎</span>
        try:
            topname = movie.find('span', class_='title').string
        except Exception as e:
            print(e)
            topname = ''
        try:
            topname2 = movie.img['alt']
        except Exception as e:
            print(e)
            topname2 = ''
        print('影片名称:', topname2)
        # c. rating
        try:
            toprating = movie.find('span', class_='rating_num').string
        except Exception as e:
            print(e)
            toprating = ''
        print('影片评分:', toprating)
        # d. number of reviewers (slice off the trailing "人评价" suffix)
        try:
            top_people4 = movie.find('span', property="v:best").find_next_sibling().string[:-3]
        except Exception as e:
            print(e)
            top_people4 = ''
        print('4:', top_people4)
        # e. short quote
        try:
            top_duanping = movie.find('span', class_='inq').string
        except Exception as e:
            print(e)
            top_duanping = ''
        print('短评:', top_duanping)
        # f. detail-page link
        try:
            topherf = movie.a['href']
        except Exception as e:
            print(e)
            topherf = ''
        print('详情页链接:', topherf)
        print('-' * 100)
        # BUG FIX: the original appended the undefined name `top_people`
        # (NameError); the vote-count variable is top_people4.
        movie_list.append((topem, topname, toprating, top_people4, top_duanping, topherf))
def save2csv():
    """
    Write the accumulated index rows in movie_list to top250fullindex.csv.

    :return: None
    """
    # BUG FIX: the file was named 'top2550fullindex.csv', but read_index()
    # opens 'top250fullindex.csv' -- the typo broke phase 2. Use one name.
    # newline='' stops the csv module emitting blank rows on Windows.
    with open('top250fullindex.csv', 'w', encoding='utf-8', newline='') as db:
        index_writer = csv.writer(db)
        index_writer.writerow(['排名', '名称', '评分', '评论人数', '短评', '详情页链接'])
        index_writer.writerows(movie_list)
# 2. Scrape each movie's detail page
# read_index loads the index CSV back and builds the three column lists.
def read_index():
    """
    Read top250fullindex.csv and return the rank, title and link columns.

    :return: (ranks, names, links) -- three lists of strings, header excluded
    """
    with open('top250fullindex.csv', 'r', encoding='utf-8') as csv1:
        # Materialize the reader so the rows can be sliced after the file closes.
        rows = list(csv.reader(csv1))
    body = rows[1:]  # skip the header row
    # Column 0 = rank, column 1 = name, last column = detail-page link.
    ranks = [row[0] for row in body]
    names = [row[1] for row in body]
    links = [row[-1] for row in body]
    return ranks, names, links
# get_full_page downloads each movie's detail page and caches it locally.
def get_full_page(count=50):
    """
    Fetch detail pages for the first *count* movies in the index CSV and
    save each as top250full/<rank>_<name>.html.

    :param count: number of detail pages to fetch (default 50, the original
                  hard-coded limit, so existing callers are unaffected)
    :return: None
    """
    # read_index returns the rank, name and link lists in parallel.
    movie_ranks, movie_names, movie_links = read_index()
    print('影片排名:', movie_ranks)
    print('影片名称:', movie_names)
    print('影片链接:', movie_links)
    print('-' * 100)
    # Create the cache folder under the current directory if needed.
    os.makedirs('top250full', exist_ok=True)
    for i in range(0, count):
        print(movie_ranks[i], movie_names[i], movie_links[i])
        # Build the cache file name: top250full/<rank>_<name>.html
        full_name = 'top250full/' + movie_ranks[i] + '_' + movie_names[i] + '.html'
        print(f"正在抓取第{full_name}请稍后....")
        # NOTE(review): webtools is a project-local helper not visible here,
        # and "get_wbe_page" looks like a typo of "get_web_page" -- confirm.
        full_page = webtools.get_wbe_page(movie_links[i], decode=True, encoding='utf-8')
        with open(full_name, 'w', encoding='utf-8') as fdb:
            fdb.write(full_page)
        # Random 1-3 s pause per page to avoid an IP ban from the server.
        time.sleep(random.randint(1, 3))
# get_full_data opens each cached detail page and extracts the full record.
def get_full_data():
    """
    Parse cached detail pages in top250full/ (first 100, in rank order) and
    append one 13-field tuple per movie to the global movie_list.

    :return: None
    """
    # BUG FIX: `root` and `files` were never defined (NameError on the sort
    # below); list the cache folder that get_full_page() writes into.
    root = 'top250full'
    files = os.listdir(root)
    # File names look like "<rank>_<title>.html"; sort numerically by rank.
    # sort's key runs each list element i through the lambda and orders by
    # the result.
    files.sort(key=lambda i: int(i[:4].replace('-', '_').split('_')[0]))
    print(files)  # debug: should now be in rank order
    # Open each file in turn and extract its fields.
    for file in files[:100]:
        with open(root + '/' + file, 'r', encoding='utf-8') as mf:
            print(f'正在提取{file}的信息,请稍待......')
            print('-' * 100)
            str_page = mf.read()
        soup_page = bs(str_page, 'html.parser')
        try:
            movie_content = soup_page.find('div', id='content')
        except Exception as e:
            print(e)
            movie_content = ''
        print(movie_content)
        # a. rank
        try:
            movie_rank = movie_content.find('span', class_="top250-no").string
        except Exception as e:
            print(e)
            movie_rank = ''
        print('影片排名:', movie_rank)
        # b. title -- page <title> with the trailing 5-char suffix stripped
        try:
            movie_name = soup_page.title.string[:-5].strip()
        except Exception as e:
            print(e)
            movie_name = ''
        print('影片名称:', movie_name)
        # c. release year (strip the surrounding parentheses)
        try:
            movie_year = movie_content.find('span', class_="year").string[1:-1]
        except Exception as e:
            print(e)
            movie_year = ''
        print('上映年份:', movie_year)
        # d. cast & crew -- the "attrs" spans inside div#info
        try:
            movie_info = movie_content.find('div', id='info')
        except Exception as e:
            print(e)
            movie_info = ''
        try:
            movie_attrs = movie_info.find_all('span', class_='attrs')
        except Exception as e:
            print(e)
            movie_attrs = ''
        # director
        try:
            movie_dir = movie_attrs[0].text
        except Exception as e:
            print(e)
            movie_dir = ''
        print('导演:', movie_dir)
        # screenwriter
        try:
            movie_script = movie_attrs[1].text
        except Exception as e:
            print(e)
            movie_script = ''
        print('编剧:', movie_script)
        # actors
        try:
            movie_actor = movie_attrs[2].text
        except Exception as e:
            print(e)
            movie_actor = ''
        print('演员:', movie_actor)
        # e. genres, joined with '/' via each tag's .string
        try:
            tag_movie_types = movie_info.find_all('span', property='v:genre')
            movie_types = '/'.join([tag.string for tag in tag_movie_types])
        except Exception as e:
            print(e)
            movie_types = ''
        print('影片类型:', movie_types)
        # f. country/region  <span class="pl">制片国家/地区:</span> 美国<br/>
        # The value is a bare text node after the label, so a regex over the
        # raw HTML is simpler than navigating with bs4.
        try:
            pn_movie_area = re.compile(r'<span class="pl">制片国家/地区:</span> (.*?)<br/>')
            movie_area = re.findall(pn_movie_area, str(movie_info))[0].strip()
        except Exception as e:
            print(e)
            movie_area = ''
        print('制片国家/地区:', movie_area)
        # g. language -- the text node immediately after the "语言:" label
        try:
            movie_lange = movie_info.find('span', string='语言:').next_sibling.strip()
        except Exception as e:
            print(e)
            movie_lange = ''
        print('语言:', movie_lange)
        # h. runtime
        try:
            movie_runtime = movie_info.find('span', property='v:runtime').string
        except Exception as e:
            print(e)
            movie_runtime = ''
        print('片长:', movie_runtime)
        # i. rating
        try:
            movie_rating = movie_content.find('strong', class_="ll rating_num").string
        except Exception as e:
            print(e)
            movie_rating = ''
        print('影片评分:', movie_rating)
        # j. vote count
        try:
            movie_people = movie_content.find('span', property="v:votes").string
        except Exception as e:
            print(e)
            movie_people = ''
        print('评论人数:', movie_people)
        # k. summary, with whitespace runs collapsed by the original pattern
        try:
            movie_summary = re.sub(r'\s +', ' ', movie_content.find('span', property="v:summary").text.strip())
        except Exception as e:
            print(e)
            movie_summary = ''
        print('简介:', movie_summary)
        # BUG FIX: the original appended the undefined name `movie_area2`
        # (NameError); the country/region variable is movie_area.
        movie_list.append((movie_rank, movie_name, movie_year, movie_dir, movie_script,
                           movie_actor, movie_types, movie_area, movie_lange,
                           movie_runtime, movie_rating, movie_people, movie_summary))
def save_data():
    """
    Write the accumulated detail rows in movie_list to top250full.csv.

    :return: None
    """
    # BUG FIX: added newline='' (as save2csv already does) so the csv module
    # does not emit blank rows between records on Windows.
    with open('top250full.csv', 'w', encoding='utf-8', newline='') as file:
        full_writer = csv.writer(file)
        full_writer.writerow(['排名', '名称', '上映年份', '导演', '编剧', '演员', '影片类型',
                              '制片国家/地区', '语言', '片长', '评分', '评论人数', '简介'])
        full_writer.writerows(movie_list)