这是今天写的一份爬取猫眼电影 Top100 榜单的完整代码：
# coding:utf-8
import json
import requests
from bs4 import BeautifulSoup
def get_one_page(url):
    """Download ``url`` and return the response body as text.

    The request is sent with a desktop Firefox ``User-Agent`` header.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response text when the server answers with HTTP 200; ``None``
        for any other status code or when the request itself fails
        (connection error, timeout, malformed URL, ...).
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0'}
    try:
        # requests applies no timeout by default, so without one a stalled
        # server would block the crawl forever.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException:
        # Best-effort fetch: callers treat None as "page unavailable".
        return None
    if response.status_code == 200:
        return response.text
    return None
def parse_one_page(html):
    """Extract the movie entries from the HTML of one board page.

    Every ``<dd>`` inside the ``<dl class="board-wrapper">`` element yields
    one dict with the keys ``rate``, ``figure``, ``movie``, ``cast``,
    ``releasetime`` and ``mark`` (all string values).

    Args:
        html: Page markup as ``str`` or ``bytes``. ``None`` or an empty
            value (e.g. a failed download) is accepted.

    Returns:
        A list of dicts, one per entry, in page order. The list is empty
        when ``html`` is empty/``None`` or the page has no
        ``board-wrapper`` element.
    """
    if not html:
        return []

    # from_encoding is only meaningful for byte input; Beautiful Soup warns
    # and ignores it when the markup is already a str.
    parser_options = {'from_encoding': 'utf-8'} if isinstance(html, bytes) else {}
    soup = BeautifulSoup(html, 'html.parser', **parser_options)

    board_wrapper = soup.find('dl', class_='board-wrapper')
    if board_wrapper is None:
        # Not a board page (error page, changed layout, ...): nothing to parse.
        return []

    movies = []
    for entry in board_wrapper.find_all('dd'):
        # The first <i> of the entry holds the value stored as 'rate'.
        rate = entry.find('i').get_text(strip=True)
        # The image URL is read from 'data-src', not 'src'.
        figure = entry.find('a', class_='image-link').find('img', class_='board-img')['data-src']
        movie = entry.find('p', class_='name').find('a', attrs={'data-act': 'boarditem-click'}).get_text(strip=True)
        # The slices drop a fixed-length leading label (3 and 5 characters);
        # presumably "主演：" and "上映时间：" -- verify against the page markup.
        cast = entry.find('p', class_='star').get_text(strip=True)[3:]
        releasetime = entry.find('p', class_='releasetime').get_text(strip=True)[5:]
        # The score is split across two <i> tags (integer and fraction parts).
        mark = entry.find('i', class_='integer').string + entry.find('i', class_='fraction').get_text(strip=True)
        movies.append({
            'rate': rate,
            'figure': figure,
            'movie': movie,
            'cast': cast,
            'releasetime': releasetime,
            'mark': mark,
        })
    return movies
def write_content(content, path='D:/猫眼电影5.txt'):
    """Append every item of ``content`` to a text file, one JSON object per line.

    Args:
        content: Iterable of JSON-serializable items (e.g. the dicts
            produced by the page parser).
        path: Destination file. It is opened in append mode with UTF-8
            encoding, so repeated calls accumulate lines. Defaults to the
            location the script has always written to.
    """
    # The with-statement closes the file; no explicit close() is needed.
    with open(path, 'a', encoding='utf-8') as f:
        # ensure_ascii=False keeps non-ASCII text readable instead of \uXXXX escapes.
        f.writelines(json.dumps(item, ensure_ascii=False) + '\n' for item in content)
if __name__ == '__main__':
    # Walk the board pages through the ``offset`` query parameter:
    # offsets 0, 10, ..., 90.
    for page in range(10):
        html = get_one_page('http://maoyan.com/board/4?offset=%s' % (page * 10))
        if html is None:
            # get_one_page returns None when the download fails; skip this
            # page instead of aborting the remaining ones.
            continue
        content = parse_one_page(html)
        write_content(content)