import requests
import re
import time
import json
def get_one_page(url, timeout=10):
    """Download *url* and return the response body as text.

    The request carries a desktop-browser User-Agent header.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the crawl forever.

    Returns:
        The response text when the server answers with HTTP 200.  On any
        other status code, or when the request itself fails, ``None`` is
        printed and ``None`` is returned.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1"
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        # Treat connection errors and timeouts like a non-200 answer so the
        # caller has a single failure signal (None) to check.
        print(None)
        return None
    if response.status_code == 200:
        return response.text
    print(None)
    return None
# One match per <dd> entry of the board page.  Capture groups, in order:
#   1. rank shown inside the <i class="board-index ..."> element
#   2. poster URL taken from the data-src attribute (still wrapped in quotes)
#   3. alt text of the poster image (quoted, followed by whitespace)
#   4. body of the first <p> after the name paragraph (the cast line)
#   5. body of the following <p> (the release line)
#   6. integer part of the score, e.g. "9."
#   7. fractional part of the score, e.g. "6"
# re.S lets "." span the newlines between the tags.
_MOVIE_PATTERN = re.compile(r"<dd>.*?board-index.*?>(.*?)</i>.*?<img data-src=(.*?) alt=(.*?)class=.*?/>.*?<p class=.*?><a href=.*?</a></p>.*?<p class=.*?>(.*?)</p>.*?<p class=.*?>(.*?)</p>.*?<p class=.*?><i class=.*?>(.*?)</i><i class=.*?>(.*?)</i></p>", re.S)


def parse_one_page(html):
    """Yield one dict per movie entry found in a board page.

    Args:
        html: Page source as a string.  ``None`` or an empty string (for
            example after a failed download) yields nothing instead of
            raising.

    Yields:
        Dicts with the string keys ``index``, ``image``, ``title``,
        ``actor``, ``time`` and ``score``.  The surrounding double quotes
        captured with the image URL and the title are removed.
    """
    if not html:
        return
    for rank, image, title, cast, release, integer, fraction in _MOVIE_PATTERN.findall(html):
        yield {
            "index": rank,
            # The attribute values are captured together with their quotes.
            "image": image.strip().strip('"'),
            "title": title.strip().strip('"'),
            # Drop the 3-character label in front of the cast list.
            "actor": cast.strip()[3:],
            # Drop the 5-character label in front of the release date.
            "time": release[5:],
            "score": integer + fraction,
        }
def write(item, path="D:/猫眼电影TOP100(1).txt"):
    """Append *item* to a text file as a single JSON line.

    Args:
        item: JSON-serializable object to store.
        path: File to append to; defaults to the original hard-coded
            location.
    """
    # json.dumps escapes non-ASCII characters by default; ensure_ascii=False
    # keeps the Chinese text readable.  Because raw non-ASCII text is then
    # written, the file encoding is pinned to UTF-8 rather than left to the
    # platform default.
    with open(path, "a", encoding="utf-8") as f:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")
def main():
    """Fetch the ten board pages in turn and store every parsed entry.

    Page ``n`` is requested with ``offset=10*n``.  Each item produced by
    ``parse_one_page`` is handed to ``write``.
    """
    for page in range(10):
        url = "http://maoyan.com/board/4?offset=" + str(10 * page)
        html = get_one_page(url)
        # get_one_page returns None when the download fails; skip parsing
        # that page instead of handing None to the regex.
        if html is not None:
            for item in parse_one_page(html):
                write(item)
        # Pause between page requests.
        time.sleep(1)
# Start the crawl only when the file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()
# python3 网络爬虫学习 3.4 抓取猫眼电影排行
# 最新推荐文章于 2023-01-29 09:45:21 发布