最近刚看完爬虫课程,由于对正则表达式掌握得不太好,恰好课程内容又是Python100例,看了一下猫眼电影的网页结构,也非常简单,唯一不好处理的地方在于评分,
还好,折腾了一个晚上之后,总算搞定。废话不多说,代码如下:
小白刚刚接触Python大法,也是对自己学习做一个简单的笔记,不完善之处,还望勿喷,谢谢
from lxml import etree
from bs4 import BeautifulSoup
import requests

# HTTP headers sent with every page request.
# NOTE(review): the User-Agent value is empty here; presumably a real browser
# User-Agent string has to be filled in before running -- confirm.
head = {'User-Agent': ''}
# Tested: requests sent without headers do not succeed, so `head` is passed
# along with every request.
def get_cont(url, timeout=10):
    """Fetch one board page and yield one dict per movie found on it.

    Args:
        url: Address of the board page to download.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script forever.

    Yields:
        dict with the string keys 'rank', 'name', 'actor', 'time' and
        'score', each holding stripped text taken from the page.

    Note:
        The per-field tag lists are combined with zip(), so if one selector
        matches fewer tags than the others the output is silently truncated
        to the shortest list.
    """
    response = requests.get(url, headers=head, timeout=timeout)
    response.encoding = 'utf8'
    soup = BeautifulSoup(response.text, 'lxml')

    rank_tags = soup.select('dl.board-wrapper > dd > i')
    name_tags = soup.select('div.movie-item-info p.name a')
    actor_tags = soup.select('div.movie-item-info p.star')
    time_tags = soup.select('div.movie-item-info p.releasetime')
    # The score is wrapped in two separate <i> tags (integer part and
    # fractional part), so each part is selected on its own and the two
    # texts are concatenated below.
    integer_tags = soup.select('#app > div > div > div > dl > dd > div > div > div.movie-item-number.score-num > p i.integer')
    fraction_tags = soup.select('#app > div > div > div > dl > dd > div > div > div.movie-item-number.score-num > p i.fraction')

    rows = zip(rank_tags, name_tags, actor_tags, time_tags,
               integer_tags, fraction_tags)
    for rank_tag, name_tag, actor_tag, time_tag, integer_tag, fraction_tag in rows:
        yield {
            'rank': rank_tag.text.strip(),
            'name': name_tag.text.strip(),
            # Drops the first 3 characters -- presumably a leading label such
            # as "主演:"; verify against the live page.
            'actor': actor_tag.text.strip()[3:],
            # Drops the first 5 characters -- presumably a leading label such
            # as "上映时间:"; verify against the live page.
            'time': time_tag.text.strip()[5:],
            'score': integer_tag.text.strip() + fraction_tag.text.strip(),
        }


def majorfunc():
    """Walk the board pages at offsets 0, 10, ..., 90 and print every movie."""
    base_url = 'http://maoyan.com/board/4?offset='
    for offset in range(0, 91, 10):
        # Append the offset to a prefix that ends in "offset=", so the query
        # value is exactly 0, 10, 20, ... with no stray leading zero.
        page_url = base_url + str(offset)
        for movie in get_cont(page_url):
            print(movie)


if __name__ == '__main__':
    majorfunc()