爬取热门电影实战
参考书籍:python程序设计
需要安装的库(代码同时用到 requests 和 bs4):
pip install requests bs4
如果下载较慢,可以用 -i 参数指定国内镜像源(例如清华源):
pip install -i https://pypi.tuna.tsinghua.edu.cn/simple bs4
# 爬取2345网站电影排行前50的电影信息
import requests
from bs4 import BeautifulSoup
def getHtml(url):
    """Download ``url`` and return its body decoded as GBK text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The page text, or an empty string when the request fails
        (connection error, timeout, invalid URL, or a non-2xx status).
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        # Force GBK decoding instead of relying on the charset requests guesses.
        r.encoding = 'gbk'
        return r.text
    except requests.RequestException:
        # Only network/HTTP failures are treated as "no page"; programming
        # errors and KeyboardInterrupt are no longer silently swallowed.
        return ''
def _tag_text(parent, name, css_class, default):
    """Return the text of the first matching descendant tag, or ``default``."""
    tag = parent.find(name, class_=css_class)
    return tag.get_text() if tag is not None else default


def saveInfo(html, save_dir='E:/下载/sp/'):
    """Print each movie's details from the ranking page and save its poster.

    For every ``<li>`` inside the ``ul.picList.clearfix`` list this prints the
    title, release info, actors and introduction, then downloads the entry's
    image into ``save_dir`` as ``<title>.jpg``.

    Args:
        html: HTML text of the ranking page. An empty string or a page
            without the expected list produces no output.
        save_dir: Directory that receives the poster images; it is created
            if it does not exist. Defaults to the original hard-coded path.

    Returns:
        None.

    Raises:
        OSError: If ``save_dir`` cannot be created or an image file cannot
            be written.
    """
    import os  # function-scope import keeps this block self-contained

    soup = BeautifulSoup(html, 'html.parser')
    movie_list = soup.find('ul', class_='picList clearfix')
    if movie_list is None:
        # Nothing to parse, e.g. getHtml() returned '' after a failed request.
        return

    os.makedirs(save_dir, exist_ok=True)

    for item in movie_list.find_all('li'):
        title_tag = item.find('span', class_='sTit')
        if title_tag is None:
            # Without a title there is nothing to print or to name the file.
            continue
        name = title_tag.get_text()

        release = _tag_text(item, 'span', 'sIntro', '暂时无上映时间信息')
        # get_text() also copes with nested tags, where concatenating each
        # child's .string would fail because .string is None.
        actor = _tag_text(item, 'p', 'pActor', '暂时无演员姓名')

        # Prefer the hidden (pIntroHide) paragraph, else the shown one.
        intro_tag = item.find('p', class_='pTxt pIntroHide')
        if intro_tag is None:
            intro_tag = item.find('p', class_='pTxt pIntroShow')
        intro = intro_tag.get_text() if intro_tag is not None else '暂时无简介信息'

        print('影片名: {}\t{}\n{}\n{}\n\n'.format(name, release, actor, intro))

        # Download the poster image.
        img_tag = item.find('img')
        src = img_tag.get('src') if img_tag is not None else None
        if not src:
            continue
        # Protocol-relative links ("//host/path") need a scheme; leave
        # already-absolute URLs untouched.
        img_url = 'http:' + src if src.startswith('//') else src

        # Fetch before opening the file so a failed download does not leave
        # an empty .jpg behind.
        try:
            resp = requests.get(img_url, timeout=30)
            resp.raise_for_status()
        except requests.RequestException as err:
            print('图片下载失败: {} ({})'.format(name, err))
            continue

        # Replace characters that are not allowed in Windows file names.
        safe_name = ''.join('_' if ch in '\\/:*?"<>|' else ch for ch in name)
        with open(os.path.join(save_dir, safe_name + '.jpg'), 'wb') as f:
            f.write(resp.content)
def main():
    """Fetch the 2345 movie ranking page and hand its HTML to saveInfo."""
    page = getHtml("https://dianying.2345.com/top/")
    saveInfo(page)


# Run the scraper as soon as this file is executed.
main()
运行结果:
依次输出排行榜前50部电影的名称、上映时间、演员和简介等信息,并将每部电影的图片保存到本地。