python爬取ajax网站数据
我准备写一个电影推荐系统的博客,需要在豆瓣电影上爬数据,
主要爬取电影的名字,类型,评论人的姓名,评论星级
分析网站
网址:豆瓣电影
我们需要的数据在电影详情页里,因此要先获取每部电影的链接。通过页面上方的按钮可以切换查看不同的电影,但浏览器地址栏的网址不变——页面信息不经刷新就更新了,这是典型的 Ajax 加载方式。所以不能用常规的静态爬取操作直接获取数据,下面先试一试常规方法来验证这一点
import requests
from bs4 import BeautifulSoup

# The Douban movie front page. The movie links we are after are injected
# into this page by Ajax, so the static HTML alone may not contain them.
url = 'https://movie.douban.com/'

# Browser-like request headers (session cookie, referer, user-agent) so
# Douban serves the page instead of rejecting an obvious bot request.
headers = {
    'Cookie': 'bid=n_IZTjBerIQ; __yadk_uid=ycqCriKvT0a5PcKLRNhceCyhO4Sz0Fah; ll="108307"; ct=y; __gads=ID=43ca8aec7a305c12:T=1575853654:S=ALNI_MY9WIbXbbei2QmAyxFJCmXOfU8xUQ; _vwo_uuid_v2=DF59A19EB3C662F8FC97D8FB589D790EE|f0be246caa1bf00438bb6baae052953e; trc_cookie_storage=taboola%2520global%253Auser-id%3D1f892c9b-561e-43a9-bf6b-676d733c5d88-tuct4e733f3; __utmc=30149280; gr_user_id=d7959e6e-df54-4337-93d8-d68de6d1002e; viewed="34882634"; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1576207268%2C%22https%3A%2F%2Fwww.google.com%2F%22%5D; _pk_ses.100001.4cf6=*; __utma=30149280.1355054136.1575426113.1576200077.1576207270.8; __utmb=30149280.0.10.1576207270; __utmz=30149280.1576207270.8.4.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=(not%20provided); __utma=223695111.1771316839.1575426113.1576133503.1576207270.7; __utmb=223695111.0.10.1576207270; __utmc=223695111; __utmz=223695111.1576207270.7.3.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=(not%20provided); ap_v=0,6.0; _pk_id.100001.4cf6=b00d2b706551734b.1575426113.7.1576207789.1576133503.',
    'Referer': 'https://movie.douban.com/',
    'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36'
}

# Fetch the page and parse the returned HTML with the lxml parser.
r = requests.get(url, headers=headers)
r_b = BeautifulSoup(r.text, 'lxml')
f12查看电影的html标签信息
找到url标签
# Conventional-scrape attempt: look for the <a class="item"> movie links in
# the static homepage HTML. This prints [] because those elements are filled
# in later by Ajax requests, demonstrating why a plain GET is not enough.
print(r_b.find_all('a',class_='item'))
打印结果
[]
f12,点击Network,XHR
按f5刷新
点击每一条数据,查看Response
发现我们需要的数据,是一个json数据,找到数据的真实json地址,找到获取它的url
# Real data endpoint found via the browser's Network/XHR panel:
# type=movie, tag=热门 (URL-encoded), 50 results starting at offset 0.
url = 'https://movie.douban.com/j/search_subjects?type=movie&tag=%E7%83%AD%E9%97%A8&page_limit=50&page_start=0'

# Same browser-like headers as before so the JSON endpoint answers normally.
headers = {
    'Cookie': 'bid=n_IZTjBerIQ; __yadk_uid=ycqCriKvT0a5PcKLRNhceCyhO4Sz0Fah; ll="108307"; ct=y; __gads=ID=43ca8aec7a305c12:T=1575853654:S=ALNI_MY9WIbXbbei2QmAyxFJCmXOfU8xUQ; _vwo_uuid_v2=DF59A19EB3C662F8FC97D8FB589D790EE|f0be246caa1bf00438bb6baae052953e; trc_cookie_storage=taboola%2520global%253Auser-id%3D1f892c9b-561e-43a9-bf6b-676d733c5d88-tuct4e733f3; __utmc=30149280; gr_user_id=d7959e6e-df54-4337-93d8-d68de6d1002e; viewed="34882634"; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1576207268%2C%22https%3A%2F%2Fwww.google.com%2F%22%5D; _pk_ses.100001.4cf6=*; __utma=30149280.1355054136.1575426113.1576200077.1576207270.8; __utmb=30149280.0.10.1576207270; __utmz=30149280.1576207270.8.4.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=(not%20provided); __utma=223695111.1771316839.1575426113.1576133503.1576207270.7; __utmb=223695111.0.10.1576207270; __utmc=223695111; __utmz=223695111.1576207270.7.3.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=(not%20provided); ap_v=0,6.0; _pk_id.100001.4cf6=b00d2b706551734b.1575426113.7.1576207789.1576133503.',
    'Referer': 'https://movie.douban.com/',
    'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36'
}

# Request the endpoint and pull out the 'subjects' key of the JSON body:
# a list with one dict per movie, each holding its detail-page URL.
r = requests.get(url, headers=headers)
html = r.json()['subjects']
获取json数据中subjects键的值,是个列表,列表的每个元素是每个电影的信息,包含电影信息网址,对应元素中的字典的url键,我们可以循环获取网址然后逐个解析即可
电影的名字,类型比较好获取
# Iterate over the movies returned by the Ajax endpoint.
# Fix: the original called r.json() twice on every loop iteration
# (re-parsing the whole response body each time) and used the
# range(len(...)) anti-idiom; parse the JSON once and iterate directly.
subjects = r.json()['subjects']
for subject in subjects:
    movie_url_ = subject['url']  # detail-page URL for this movie
    # Fetch and parse the movie's detail page.
    movie_info = BeautifulSoup(requests.get(movie_url_, headers=headers).text, 'lxml')
    movie_name = movie_info.find('span', property="v:itemreviewed")  # movie title
    movie_type = movie_info.find_all('span', property='v:genre')  # genres (one span each)
获取每个电影信息的地址后,分别提取数据即可。我们需要三个表:第一个是用户表,第二个是电影表,第三个是评分表。上面我们已经获得了电影名字和类型,电影表基本可以完成了,接下来是获取电影评分数据
整体代码
import requests
from bs4 import BeautifulSoup
def has_class(tag):
    """Return True when *tag* carries a ``class`` attribute.

    Used as a tag filter with ``find_all`` to select the reviewer-name,
    rating and date spans inside each comment-info element.
    """
    return tag.has_attr('class')
# Ajax JSON endpoint: 50 popular (tag=热门) movies starting at offset 0.
url = 'https://movie.douban.com/j/search_subjects?type=movie&tag=%E7%83%AD%E9%97%A8&page_limit=50&page_start=0'

# Browser-like headers so neither the JSON endpoint nor the detail pages
# reject the requests as bot traffic.
headers = {
    'Cookie': 'bid=n_IZTjBerIQ; __yadk_uid=ycqCriKvT0a5PcKLRNhceCyhO4Sz0Fah; ll="108307"; ct=y; __gads=ID=43ca8aec7a305c12:T=1575853654:S=ALNI_MY9WIbXbbei2QmAyxFJCmXOfU8xUQ; _vwo_uuid_v2=DF59A19EB3C662F8FC97D8FB589D790EE|f0be246caa1bf00438bb6baae052953e; trc_cookie_storage=taboola%2520global%253Auser-id%3D1f892c9b-561e-43a9-bf6b-676d733c5d88-tuct4e733f3; __utmc=30149280; gr_user_id=d7959e6e-df54-4337-93d8-d68de6d1002e; viewed="34882634"; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1576207268%2C%22https%3A%2F%2Fwww.google.com%2F%22%5D; _pk_ses.100001.4cf6=*; __utma=30149280.1355054136.1575426113.1576200077.1576207270.8; __utmb=30149280.0.10.1576207270; __utmz=30149280.1576207270.8.4.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=(not%20provided); __utma=223695111.1771316839.1575426113.1576133503.1576207270.7; __utmb=223695111.0.10.1576207270; __utmc=223695111; __utmz=223695111.1576207270.7.3.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=(not%20provided); ap_v=0,6.0; _pk_id.100001.4cf6=b00d2b706551734b.1575426113.7.1576207789.1576133503.',
    'Referer': 'https://movie.douban.com/',
    'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36'
}

r = requests.get(url, headers=headers)
# Fix: parse the JSON once; the original re-called r.json() on every
# iteration, re-parsing the full response body each time.
subjects = r.json()['subjects']

for subject in subjects:
    movie_url_ = subject['url']  # detail-page URL for this movie
    movie_info = BeautifulSoup(requests.get(movie_url_, headers=headers).text, 'lxml')

    # Movie title: <span property="v:itemreviewed">. Guard against pages
    # that failed to load (anti-bot page, layout change) instead of
    # crashing on .string of None.
    movie_name = movie_info.find('span', property="v:itemreviewed")
    if movie_name is None:
        continue

    # Genre count varies per movie; build one space-terminated string
    # ("Drama Comedy ") with join instead of repeated concatenation.
    lx = movie_info.find_all('span', property='v:genre')
    lx_string = ''.join(str(l.string) + ' ' for l in lx)

    print(movie_name.string)
    #print(lx_string)

    # Link to the full comments page sits under the comments-section
    # header. Any element in the chain may be absent; skip the movie
    # rather than raising AttributeError mid-crawl.
    try:
        pinglun_url = (movie_info.find('div', id='comments-section')
                       .find('div', class_='mod-hd')
                       .find('span', class_='pl').a.attrs['href'])
    except AttributeError:
        continue

    # Fetch and parse the comments page.
    r_pinglun = requests.get(pinglun_url, headers=headers)
    r_pinglun_html = BeautifulSoup(r_pinglun.text, 'lxml')

    # Each <span class="comment-info"> holds the reviewer's name plus
    # (when present) rating and date spans.
    for j in r_pinglun_html.find_all('span', class_='comment-info'):
        pinglun_person_name = j.a.string
        #print(pinglun_person_name)
        aa = j.find_all(has_class)
        # Exactly 3 classed children = name + rating + date; reviewers
        # who did not rate produce fewer and are skipped.
        if len(aa) == 3:
            print(aa[0].string, aa[1].attrs['title'], aa[2].attrs['title'])