#python3.7
#author:huangtao
#pycharm
import requests
from requests.exceptions import RequestException
import re
import json
# The imports above bring in the libraries this script needs.
# get_url: request the url handed over by the main program.
def get_url(url, headers, timeout=10):
    """Fetch ``url`` and return the response body as text.

    Args:
        url: Address to request.
        headers: HTTP headers sent along with the request.
        timeout: Seconds to wait on the server before giving up. Added so a
            stalled connection cannot block the script indefinitely.

    Returns:
        The response text when the status code is 200; ``None`` for any other
        status code or when ``requests`` raises a ``RequestException``
        (which includes timeouts).
    """
    try:
        # Keep the try body to the one call that can raise.
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        return None
    if response.status_code == 200:
        # Success: hand back the page source.
        return response.text
    return None
def parse_html(html):
    """Extract movie records from page source and append them to ``movie``.

    Each record is printed and then written as one line of JSON (a
    single-element list holding the record dict) to the file ``movie`` in the
    current directory, opened in append mode with UTF-8 encoding.

    Args:
        html: Page source as a string. ``None`` or an empty string (for
            example after a failed download) is ignored.

    Returns:
        None.
    """
    if not html:
        # A failed fetch yields None; re.findall would raise TypeError on it.
        return

    # Pull every field out of the page with one regex each.
    ranks = re.findall(r'<i class="board-index board-index-\d+">(.*?)</i>', html, re.DOTALL)
    titles = re.findall(r'<p class="name"><a href.*?>(.*?)</a></p>', html, re.DOTALL)
    lianjies = re.findall(r'<img data-src="(.*?)" alt=".*?".*?/>', html, re.DOTALL)
    authors_list = re.findall(r'<p class="star">(.*?)</p>', html, re.DOTALL)
    releasetimes = re.findall(r'<p class="releasetime">(.*?)</p>', html, re.DOTALL)
    scores = re.findall(r'<i class="integer">(.*?)</i><i class="fraction">(.*?)</i>', html, re.DOTALL)

    lines = []
    # NOTE(review): zip stops at the shortest list, so a field missing for one
    # entry would silently misalign the remaining records.
    for rank, title, lianjie, authors, releasetime, score in zip(
            ranks, titles, lianjies, authors_list, releasetimes, scores):
        movies = {
            '排名': rank,
            '电影名': title,
            '海报链接': lianjie,
            # Drops the first 3 characters after stripping whitespace --
            # presumably a label prefix on the page; confirm against the site.
            '主演': authors.strip()[3:],
            # Drops the first 5 characters -- presumably a label prefix too.
            '上映时间/国家': releasetime[5:],
            # Integer and fraction parts are captured separately; join them.
            '分数': score[0] + score[1],
        }
        # Each output line is a one-element list, matching the existing format.
        movie = [movies]
        print(movie)
        lines.append(json.dumps(movie, ensure_ascii=False) + '\n')

    if not lines:
        # Nothing matched: do not create or touch the output file.
        return

    # Open the file once per page; the with-block closes it on exit.
    with open('movie', 'a', encoding='utf-8') as f:
        f.writelines(lines)
def main(offset):
    """Fetch one board page for ``offset`` and pass it to ``parse_html``.

    Args:
        offset: Value placed in the ``offset`` query parameter of the URL.

    Returns:
        None. When the download fails, a notice is printed and the page is
        skipped.
    """
    # Build the page URL from the offset.
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) '
                             'AppleWebKit/537.36 (KHTML, like Gecko) '
                             'Chrome/70.0.3538.77 Safari/537.36'}
    html = get_url(url, headers)
    if html is None:
        # get_url returns None on failure; parse_html's regexes need a string.
        print('request failed, skipping offset', offset)
        return
    parse_html(html)
if __name__ == '__main__':
    # Script entry point: call main once per offset 0, 10, ..., 90.
    for page_offset in range(0, 100, 10):
        main(page_offset)