案例分析
首先我们可以看到豆瓣的top250电影排名如图,是把top250所有的分为十页,每页25部电影进行展示
而我要获取的数据是每一部电影的详情介绍,包括演员,剧情,评价,演员的扮演角色等信息
首先我们要获取top250的页面信息
# Entry point of the Top-250 list and a desktop-browser User-Agent
# (Douban rejects the default `requests` UA).
base_url = 'https://movie.douban.com/top250'
headers = {'User-Agent': 'Mozilla/5.0(Macintosh;intel Mac OS 10_11_4)Applewebkit/537.36(KHTML,like Gecko)Chrome/52.0.2743.116 Safari/537.36'}
def get_page(url):
    """Fetch `url` and return its HTML text, or None on any failure.

    Sends the module-level `headers` so the request carries a browser
    User-Agent. Catches requests.RequestException (a superset of the
    original ConnectionError) so timeouts and protocol errors are also
    reported instead of crashing the crawl loop.
    """
    try:
        res = requests.get(url, headers=headers)
        # Only trust a 200 response; anything else yields None.
        if res.status_code == 200:
            return res.text
        return None
    except requests.RequestException:
        print('Error')
        return None
到这一步我们获得了top250的页面信息,接下来我们将这一页面的HTML信息进行解析
def save_more_page(html):
    """Parse one movie detail page into a one-row list of field lists.

    Row layout: [index, title, score, vote count, rating distribution,
    summary, cast text]. A missing element yields '' instead of the
    original AttributeError on None (pages occasionally lack a block).
    """
    soup = BeautifulSoup(html, 'lxml')

    def _text(tag, attrs):
        # Text of the first matching element, or '' when absent.
        node = soup.find(tag, attrs=attrs)
        return node.get_text() if node else ''

    index = _text('span', {'class': 'top250-no'})
    title = _text('span', {'property': 'v:itemreviewed'})
    # Cast block: drop newlines so it fits a single CSV field.
    actor = _text('div', {'class': 'celebrities related-celebrities'}).replace('\n', '')
    score = _text('strong', {'class': 'll rating_num', 'property': 'v:average'})
    comment = _text('span', {'property': 'v:votes'})
    # Star-distribution block: strip spaces/newlines into one field.
    raw = _text('div', {'class': 'ratings-on-weight'})
    more_comment = raw.replace(' ', '').replace('\n', '')
    indent = _text('span', {'property': 'v:summary'}).strip().replace('\n', '')
    move = [index, title, score, comment, more_comment, indent, actor]
    return [move]
到这里我们获得了该页面25部电影具体详情页的url,接下来我们就要进入具体的详情页提取信息
def more_page(items):
    """Fetch, parse and persist each movie's detail page.

    `items` is the iterable of dicts from parse_page(); only the 'url'
    key is used. Network errors are reported and the loop continues.
    """
    for item in items:
        try:
            # Send the same browser headers as get_page(); the bare
            # requests UA is blocked by Douban.
            res = requests.get(item.get('url'), headers=headers)
            if res.status_code == 200:
                move = save_more_page(res.text)  # parse the detail page
                write_more_page(move)            # append rows to the CSV
        except requests.RequestException:
            print("Error")
具体解析详情页如下:
def save_more_page(html):
    """Parse one movie detail page into a one-row list of field lists.

    Row layout: [index, title, score, vote count, rating distribution,
    summary, cast text].

    Bug fixed: the original built movie_list but never returned it, so
    the caller always received None and wrote nothing. A missing element
    now yields '' instead of an AttributeError on None.
    """
    soup = BeautifulSoup(html, 'lxml')

    def _text(tag, attrs):
        # Text of the first matching element, or '' when absent.
        node = soup.find(tag, attrs=attrs)
        return node.get_text() if node else ''

    index = _text('span', {'class': 'top250-no'})
    title = _text('span', {'property': 'v:itemreviewed'})
    # Cast block: drop newlines so it fits a single CSV field.
    actor = _text('div', {'class': 'celebrities related-celebrities'}).replace('\n', '')
    score = _text('strong', {'class': 'll rating_num', 'property': 'v:average'})
    comment = _text('span', {'property': 'v:votes'})
    # Star-distribution block: strip spaces/newlines into one field.
    raw = _text('div', {'class': 'ratings-on-weight'})
    more_comment = raw.replace(' ', '').replace('\n', '')
    indent = _text('span', {'property': 'v:summary'}).strip().replace('\n', '')
    move = [index, title, score, comment, more_comment, indent, actor]
    return [move]
解析得到我们想要的信息以后,我们就要保存,写入文件
def write_more_page(items):
    """Append movie rows to the CSV file.

    Uses csv.writer so fields that themselves contain commas (summaries,
    cast lists) are quoted correctly, and no trailing empty column is
    produced — the hand-rolled ','.join left a comma at the end of every
    row.
    """
    with open('豆瓣电影top250.csv', 'a', encoding='utf8', newline='') as f:
        writer = csv.writer(f)
        for movie_info in items:
            print(','.join(movie_info))  # progress echo, as before
            writer.writerow(movie_info)
到这一步,我们完成了top250的第一个页面的获取,
接下来我们将获取下一个界面的地址
def more_pages(url):
    """Return the absolute URL of the next list page, or None on the last.

    The pager's "next" link is relative (e.g. '?start=25&filter='), so it
    is concatenated onto base_url. Guards added: a failed fetch returns
    None instead of passing None to BeautifulSoup, and a page without the
    'next' span no longer raises AttributeError.
    """
    html = get_page(url)
    if html is None:
        return None  # fetch failed: stop paging instead of crashing
    soup = BeautifulSoup(html, 'lxml')
    next_span = soup.find('span', attrs={'class': 'next'})
    # On the final page the span holds no <a>; on odd markup the span
    # itself may be missing — both cases end the crawl cleanly.
    next_link = next_span.find('a') if next_span else None
    if next_link:
        return base_url + next_link['href']
    return None
接着继续循环爬取详情页的信息。
完整代码如下
import codecs
import csv
import json
import os
import re
from urllib.parse import urlencode

import requests
from bs4 import BeautifulSoup
# Entry point of the Top-250 list and a desktop-browser User-Agent
# (Douban rejects the default `requests` UA).
base_url = 'https://movie.douban.com/top250'
headers = {'User-Agent': 'Mozilla/5.0(Macintosh;intel Mac OS 10_11_4)Applewebkit/537.36(KHTML,like Gecko)Chrome/52.0.2743.116 Safari/537.36'}
def get_page(url):
    """Fetch `url` and return its HTML text, or None on any failure.

    Sends the module-level `headers` so the request carries a browser
    User-Agent. Catches requests.RequestException (a superset of the
    original ConnectionError) so timeouts and protocol errors are also
    reported instead of crashing the crawl loop.
    """
    try:
        res = requests.get(url, headers=headers)
        # Only trust a 200 response; anything else yields None.
        if res.status_code == 200:
            return res.text
        return None
    except requests.RequestException:
        print('Error')
        return None
def parse_page(html):
    """Yield one dict per movie entry on a Top-250 list page.

    Each dict carries: 'index' (rank), 'url' (detail-page link),
    'title', and 'comment' (the rating text captured by the regex).
    """
    pattern = re.compile(
        '<li>.*?<em class="">(.*?)</em>.*? <a href="(.*?)">.*?<span class="title">(.*?)</span>.*?<div class="star">.*?<span class=".*?" property=".*?">(.*?)</span>.*?</li>',
        re.S)
    keys = ('index', 'url', 'title', 'comment')
    for match in pattern.finditer(html):
        yield dict(zip(keys, match.groups()))
def more_pages(url):
    """Return the absolute URL of the next list page, or None on the last.

    The pager's "next" link is relative (e.g. '?start=25&filter='), so it
    is concatenated onto base_url. Guards added: a failed fetch returns
    None instead of passing None to BeautifulSoup, and a page without the
    'next' span no longer raises AttributeError.
    """
    html = get_page(url)
    if html is None:
        return None  # fetch failed: stop paging instead of crashing
    soup = BeautifulSoup(html, 'lxml')
    next_span = soup.find('span', attrs={'class': 'next'})
    # On the final page the span holds no <a>; on odd markup the span
    # itself may be missing — both cases end the crawl cleanly.
    next_link = next_span.find('a') if next_span else None
    if next_link:
        return base_url + next_link['href']
    return None
def save_more_page(html):
    """Parse one movie detail page into a one-row list of field lists.

    Row layout: [index, title, score, vote count, rating distribution,
    summary, cast text]. A missing element yields '' instead of the
    original AttributeError on None (pages occasionally lack a block).
    """
    soup = BeautifulSoup(html, 'lxml')

    def _text(tag, attrs):
        # Text of the first matching element, or '' when absent.
        node = soup.find(tag, attrs=attrs)
        return node.get_text() if node else ''

    index = _text('span', {'class': 'top250-no'})
    title = _text('span', {'property': 'v:itemreviewed'})
    # Cast block: drop newlines so it fits a single CSV field.
    actor = _text('div', {'class': 'celebrities related-celebrities'}).replace('\n', '')
    score = _text('strong', {'class': 'll rating_num', 'property': 'v:average'})
    comment = _text('span', {'property': 'v:votes'})
    # Star-distribution block: strip spaces/newlines into one field.
    raw = _text('div', {'class': 'ratings-on-weight'})
    more_comment = raw.replace(' ', '').replace('\n', '')
    indent = _text('span', {'property': 'v:summary'}).strip().replace('\n', '')
    move = [index, title, score, comment, more_comment, indent, actor]
    return [move]
def write_more_page(items):
    """Append movie rows to the CSV file.

    Uses csv.writer so fields that themselves contain commas (summaries,
    cast lists) are quoted correctly, and no trailing empty column is
    produced — the hand-rolled ','.join left a comma at the end of every
    row.
    """
    with open('豆瓣电影top250.csv', 'a', encoding='utf8', newline='') as f:
        writer = csv.writer(f)
        for movie_info in items:
            print(','.join(movie_info))  # progress echo, as before
            writer.writerow(movie_info)
def more_page(items):
    """Fetch, parse and persist each movie's detail page.

    `items` is the iterable of dicts from parse_page(); only the 'url'
    key is used. Network errors are reported and the loop continues.
    """
    for item in items:
        try:
            # Send the same browser headers as get_page(); the bare
            # requests UA is blocked by Douban.
            res = requests.get(item.get('url'), headers=headers)
            if res.status_code == 200:
                move = save_more_page(res.text)  # parse the detail page
                write_more_page(move)            # append rows to the CSV
        except requests.RequestException:
            print("Error")
# Earlier draft of the writer, kept for reference only (never called):
# def write_file(connet):
#     with open('d.csv', 'a', encoding='utf8') as f:
#         f.write(json.dumps(connet, ensure_ascii=False) + '\n')
def main():
    """Crawl every Top-250 list page, scraping each movie's detail page.

    Loops until more_pages() returns None (last page). Guards against a
    failed fetch: the original passed a possibly-None html straight into
    parse_page(), where re.findall(pattern, None) raises TypeError.
    """
    url = base_url
    while url:
        html = get_page(url)
        if html is None:
            break  # fetch failed; abort the crawl gracefully
        more_page(parse_page(html))
        url = more_pages(url)  # None on the last page ends the loop
    print("done!")


if __name__ == '__main__':
    main()