Python3爬豆瓣电影详情并写入表格
直接上干货,嘿嘿
1:可以先从电影详情页入手:打开豆瓣,进入选电影页面,随便点开一部电影查看详情,会看到很多详情信息;然后右击查看网页源代码,在源代码中搜索需要抓取的字段,找到它们的具体位置。首先导入用到的库,并为每个字段定义一个用来保存结果的空列表:
from urllib import request
import re
from bs4 import BeautifulSoup
import json
import xlwt
# One empty result list per scraped field. The generator yields a fresh list
# for every name, so the ten lists are independent objects.
(
    movie_title,         # title
    movie_diector,       # director
    movie_sceeenWriter,  # screenwriter
    movie_actor,         # cast
    movie_type,          # genre
    movie_country,       # country/region of production
    movie_language,      # language
    movie_data,          # release date
    movie_time,          # runtime
    movie_name,          # alternative titles
) = ([] for _ in range(10))
2:在这里我用的是 BeautifulSoup4 和正则表达式来匹配电影详情。下面这段代码要放在遍历详情页的循环里执行,其中 bs1 是解析后的详情页,content_link 是详情页的 HTML 文本(见第 3 步):
# Extract the fields of ONE movie from its detail page and append exactly one
# value to every result list ('' when a field is missing) so that all lists
# stay the same length. Expects `bs1` (the parsed page) and `content_link`
# (the raw HTML text) to be set by the fetch loop that surrounds this code.
title = bs1.title
movie_title.append(title.text.strip().split('/')[0] if title else '')

# Director, screenwriter and cast are all read from <span class="attrs">
# elements, so query them once and pick by position.
# NOTE(review): the positions assume the page contains all three spans. With
# fewer spans the indices alias (e.g. with a single span, len - 2 == -1 and
# the same span is reused for every column) - confirm against real pages.
attrs_spans = bs1.find_all('span', class_='attrs')
movie_diector.append(attrs_spans[0].get_text() if attrs_spans else '')  # director
movie_sceeenWriter.append(
    attrs_spans[len(attrs_spans) - 2].get_text() if attrs_spans else '')  # screenwriter
movie_actor.append(
    attrs_spans[len(attrs_spans) - 1].get_text() if attrs_spans else '')  # cast

# Only the first genre span is recorded.
type_name = bs1.find('span', property="v:genre")
movie_type.append(type_name.get_text() if type_name else '')

# Country and language are matched with regular expressions on the raw HTML
# rather than looked up through BeautifulSoup.
country_name = re.findall('制片国家/地区:</span>(.*?)<br/>', content_link)
movie_country.append(country_name[0] if country_name else '')

language_name = re.findall('语言:</span> (.*?)<br/>', content_link)
movie_language.append(language_name[0] if language_name else '')

data_name = bs1.find('span', property="v:initialReleaseDate")  # release date
movie_data.append(data_name.get_text() if data_name else '')

movies_time = bs1.find('span', property="v:runtime")  # runtime
movie_time.append(movies_time.get_text() if movies_time else '')

# Alternative titles; the result is no longer bound to the name `list`, which
# would shadow the builtin for the rest of the script.
alias_matches = re.findall('又名:</span> (.*?)<br/>', content_link)
movie_name.append(alias_matches[0] if alias_matches else '')
3:详情页的解析代码写好之后,就往上走一步,去找每个详情页的 url,因为只有打开了详情页才能匹配到信息。回到选电影这个页面,随便找一部电影,右击复制电影网址,然后在选电影页面的源码中搜索,会发现根本找不到。其实这些电影是由 js 动态加载的:在选电影页面按 F12 打开开发者工具,刷新网页,在请求中搜索电影网址里的 id,就会找到一个返回 json 的请求,里面包含这一页所有电影的 id。哈哈哈,现在只要把这些 id 取出来,就能构造出电影详情页的网址啦。上代码:
# Fetch one page of search results for the current tag, then download and
# parse the detail page of every movie in it. Expects `keyword`, `z` and
# `cookie_str` to be set by the surrounding tag/page loops.
user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'

url = ('https://movie.douban.com/j/search_subjects?type=movie&tag=' + keyword
       + '&sort=recommend&page_limit=20&page_start=' + str(z * 20))
listing_request = request.Request(url)
listing_request.add_header('cookie', cookie_str)
listing_request.add_header('User-Agent', user_agent)
# Close the HTTP response as soon as the body has been read.
with request.urlopen(listing_request) as listing_response:
    content = listing_response.read()
tags = json.loads(content)['subjects']

for tag in tags:
    # Each subject entry carries the movie id used to build its detail URL.
    # NOTE(review): the trailing '//' is kept exactly as in the original; it
    # looks like a typo for a single '/' - confirm.
    link = 'https://movie.douban.com/subject/' + tag['id'] + '//'
    detail_request = request.Request(link)
    detail_request.add_header('cookie', cookie_str)
    detail_request.add_header('User-Agent', user_agent)
    with request.urlopen(detail_request) as detail_response:
        # Undecodable bytes are dropped instead of aborting the crawl.
        content_link = detail_response.read().decode('utf-8', 'ignore')
    bs1 = BeautifulSoup(content_link, 'lxml')
    title = bs1.title
    # ...the field extraction for this movie continues here, inside the loop.
4:当然,上面这一段代码牵涉到我们接下来要说的内容:想拿到 id,势必要先请求返回这些 id 的那个网址。这个网址很容易找到,就是上面代码开头的 url,也就是 https://movie.douban.com/j/search_subjects 这个请求。
5:遍历完一页的详情页网址之后,接着观察点击“加载更多”之后这个请求网址的变化:会发现关键词(tag 参数)变了,末尾的数字(page_start 参数)也变了,其他部分保持不变,所以把关键词和末尾数字这两处空出来,用变量拼接。可选的关键词同样来自一个返回 json 的请求(search_tags),寻找方法跟找 id 时一样,不再多说。
# Download the list of movie tags, then walk the first two result pages
# (20 movies each) of every tag.
tag_url = 'https://movie.douban.com/j/search_tags?type=movie&tag=%E7%83%AD%E9%97%A8&source='  # endpoint that returns the tag list as JSON

# Cookie header VALUE only. The header name is passed separately to
# add_header(), so the pasted 'Cookie: ' prefix has been removed; with the
# prefix the first cookie pair was sent as 'Cookie: ll=...'.
# NOTE(review): this is a hard-coded personal session cookie. Replace it with
# your own and keep it out of published code.
cookie_str = 'll="108303"; bid=VCwqhdGykZc; __yadk_uid=WlU2z5PfT7ayM2AUFMlUVm6XHaxwoUsx; _vwo_uuid_v2=D0CA41EDEED1A0C639F0837D53C6F56A7|731b88fd10c962f37b53e476cad040b8; __utmc=30149280; __utmc=223695111; ps=y; push_noty_num=0; push_doumail_num=0; __utmv=30149280.18705; ue="1274184982@qq.com"; __utma=30149280.1927612880.1539001446.1541743879.1541750881.23; __utmz=30149280.1541750881.23.8.utmcsr=accounts.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/safety/unlock_sms/resetpassword; dbcl2="187059995:etWky9gmE6A"; ck=S3Xi; __utmb=30149280.3.10.1541750881; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1541750915%2C%22https%3A%2F%2Fwww.douban.com%2F%22%5D; _pk_ses.100001.4cf6=*; __utma=223695111.656272146.1539001446.1541743879.1541750915.22; __utmb=223695111.0.10.1541750915; __utmz=223695111.1541750915.22.9.utmcsr=douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; _pk_id.100001.4cf6=cb1300d538aa9452.1537752921.24.1541751703.1541743879.'

tag_request = request.Request(tag_url)
tag_request.add_header('cookie', cookie_str)
tag_request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36')
# Close the HTTP response as soon as the body has been read.
with request.urlopen(tag_request) as tag_response:
    response_tags = tag_response.read()
title_tags = json.loads(response_tags)['tags']  # the list stored under the 'tags' key

for title_tag in title_tags:
    keyword = request.quote(title_tag)  # percent-encode the tag for use in a URL
    for z in range(0, 2):
        # page_start advances in steps of 20, matching page_limit=20.
        url = ('https://movie.douban.com/j/search_subjects?type=movie&tag=' + keyword
               + '&sort=recommend&page_limit=20&page_start=' + str(z * 20))
        # ...the listing request and the detail-page loop continue here.
6:接下来就是把事先存放在各个列表中的信息写入表格。在这里我用的方法比较繁琐,如果小伙伴有更好的方法,可以评论交流一下。
# Write the collected results to a spreadsheet: a header row followed by one
# row per movie, one column per result list.
workbook = xlwt.Workbook()
sheet1 = workbook.add_sheet('豆瓣电影详情', cell_overwrite_ok=True)

title_text = ['电影名', '导演', '编剧', '演员', '类型名', '制片地', '语言', '开播时间', '片长', '又名']
# Result lists in the same order as the headers above.
columns = [
    movie_title,
    movie_diector,
    movie_sceeenWriter,
    movie_actor,
    movie_type,
    movie_country,
    movie_language,
    movie_data,
    movie_time,
    movie_name,
]

for col, header in enumerate(title_text):
    sheet1.write(0, col, header)

# Row 0 holds the headers, so the data starts at row 1.
for col, values in enumerate(columns):
    for row, value in enumerate(values, start=1):
        sheet1.write(row, col, value)

# xlwt produces the legacy binary .xls format, so the file is saved with an
# .xls extension; Excel may refuse to open the same bytes named .xlsx.
workbook.save('E:/successful.xls')