导库
import re
import requests
from bs4 import BeautifulSoup
之前预抓取时无法返回 HTML,排查后发现请求需要带上 headers(User-Agent)。
# Request headers carrying a desktop-Chrome User-Agent string; per the
# author's note, requests sent without headers did not return the page HTML.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/63.0.3239.132 Safari/537.36'
    ),
}
抓取单页
# Fetch a single board page (offset=0) using the browser-like headers.
# requests has no default timeout, so one is set explicitly to keep the
# script from hanging forever on a stalled connection.
res = requests.get(
    'http://maoyan.com/board/4?offset=0',
    headers=headers,
    timeout=10,
)
# Fail here on an HTTP error status instead of handing an error page to the
# parser, where it would only show up later as a confusing lookup failure.
res.raise_for_status()
print(res.text)
成功返回
解析html
# Parse the fetched HTML using the lxml parser.
soup = BeautifulSoup(markup=res.text, features='lxml')
# Grab every <dd> tag first; the content under each tag is parsed afterwards.
items = soup.find_all(name='dd')
目标是取出每部电影的排名(index)、图片链接、名字、主演、上映时间和评分。
# Print the board index of every <dd> entry.
# The loop body must be indented: without it Python raises IndentationError
# and print(index) would not run once per item.
for item in items:
    # Look the element up with a CSS selector and take the first match.
    index = item.select('.board-index')[0].get_text()
    print(index)
1 2 3 4 5 6 7 8 9 10
没什么问题
其他内容也用同样的方法(CSS 选择器)解析。
# Extract and print the remaining fields of every <dd> entry, using the same
# CSS-selector lookups. The loop body is indented so that each statement runs
# once per item (unindented, the file fails with IndentationError).
for item in items:
    # Image URL is read from the 'data-src' attribute of the .board-img element.
    img = item.select('.board-img')[0]['data-src']
    name = item.select('.name a')[0].get_text()
    # Strip surrounding whitespace from the star text.
    star = item.select('.star')[0].get_text().strip()
    releasetime = item.select('.releasetime')[0].get_text()
    # The integer and fractional parts of the score are separate elements,
    # so their texts are concatenated into one string.
    score = (
        item.select('.integer')[0].get_text()
        + item.select('.fraction')[0].get_text()
    )
    print(img, name, star, releasetime, score)