"""Scrape the first ten pages of the Maoyan movie board and log each entry.

Each page of ``https://maoyan.com/board/4`` is downloaded, every ``<dd>``
entry is turned into a dict, and each dict is appended to a text file as a
JSON line followed by a dashed separator.
"""
import json

import requests
from pyquery import PyQuery as pq


def getPage(url, timeout=10):
    """Download ``url`` and return its body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The response text when the server answers with HTTP 200; ``None``
        for any other status code or when the request itself fails
        (connection error, timeout, ...).
    """
    # A browser-like user agent is sent with every request.
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        # Treat transport failures like a non-200 answer so that callers
        # only have to handle the single ``None`` failure value.
        return None
    if response.status_code == 200:
        return response.text
    return None


def parsePage(html):
    """Extract one dict per ``<dd>`` element found in ``html``.

    Args:
        html: Markup of a board page, as returned by ``getPage``.

    Returns:
        A list of dicts with the keys '排名' (ranking), '图片' (image URL),
        '标题' (title), '演员' (actors), '评分' (score) and '上映时间'
        (release time), in document order.
    """
    movies = []
    doc = pq(html)
    for item in doc('dd').items():
        # Ranking
        ranking = item('.board-index').text()
        # Image URL, read from the ``data-src`` attribute
        img = item('.board-img').attr('data-src')
        # Title
        title = item('.name a').text()
        # Actors, with the leading label removed.
        # NOTE(review): if the page uses a full-width colon after the label,
        # this replace (and the release-time one below) will not match - confirm.
        actor = item('.star').text().replace('主演:', '')
        # Score
        score = item('.score').text()
        # Release time, with the leading label removed
        showTime = item('.releasetime').text().replace('上映时间:', '')
        movies.append({
            '排名': ranking,
            '图片': img,
            '标题': title,
            '演员': actor,
            '评分': score,
            '上映时间': showTime,
        })
    return movies


def writeToFile(content, path='F:\\topTen.txt'):
    """Append ``content`` to ``path`` as JSON followed by a dashed separator.

    Args:
        content: JSON-serialisable object to write.
        path: Destination file; defaults to the original hard-coded location.
    """
    # The JSON is written with ensure_ascii=False, so non-ASCII text reaches
    # the file verbatim; an explicit UTF-8 encoding keeps the write from
    # failing on platforms whose default encoding cannot represent it.
    with open(path, 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n--------------------------------------------------------------------\n')


def main(offset):
    """Fetch the board page at ``offset`` and append every entry to the file.

    Args:
        offset: Value for the ``offset`` query parameter of the board URL.
    """
    url = 'https://maoyan.com/board/4?offset=' + str(offset)
    html = getPage(url)
    if html is None:
        # Page could not be fetched; nothing to record for this offset.
        return
    for item in parsePage(html):
        writeToFile(item)


if __name__ == '__main__':
    # Offsets 0, 10, ..., 90: ten pages of the board.
    for page in range(0, 10):
        main(page * 10)
抓取猫眼电影排行信息
最新推荐文章于 2019-04-18 11:57:31 发布