import urllib
from urllib import request
import re
def main(html=None):
    """Parse the Maoyan top-movies board and print the extracted entries.

    Args:
        html: Optional page markup to parse. When omitted (the default), the
            page is downloaded from https://maoyan.com/board/4.

    Returns:
        The list that was printed: one dict per ``<dd>`` entry matched by the
        pattern, in page order. Empty when nothing matches.

    Raises:
        urllib.error.URLError: If the download fails (only when ``html`` is
            not supplied).
    """
    if html is None:
        headers = {'User-Agent': 'Mozilla/5.0 3578.98 Safari/537.36'}
        req = urllib.request.Request("https://maoyan.com/board/4", headers=headers)
        # Use the response as a context manager so the connection is always
        # closed, even if reading or decoding fails.
        with urllib.request.urlopen(req, timeout=15) as response:
            html = response.read().decode('utf-8')

    # Raw strings keep "\d" a regex escape instead of an invalid string escape.
    # ".*?" is a non-greedy "anything"; with re.S the dot also spans newlines,
    # so one match can cover a whole multi-line <dd> element.
    # Groups: 0 rank, 1 image URL, 2 title, 3 cast, 4 release time,
    # 5 integer part of the score, 6 fractional part of the score.
    pattern = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>.*?src="(.*?)".*?name"><a'
        r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
        r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>',
        re.S,
    )

    movies = [
        {
            "排名": item[0],
            "名称": item[2],
            # Drops the first 3 characters of the stripped text -- presumably
            # the label that precedes the cast list; verify against the page.
            "主演:": item[3].strip()[3:],
            # Drops the first 5 characters -- presumably the release-time label.
            "上映时间:": item[4].strip()[5:],
            # NOTE(review): the key says Douban, but the value is scraped from
            # the Maoyan page; key kept unchanged so the output stays the same.
            "豆瓣评分:": item[5] + item[6],
        }
        for item in pattern.findall(html)
    ]
    print(movies)
    return movies
if __name__ == "__main__":
    # Script entry point: print the banner first, then run the scraper.
    print("猫眼电影排名榜单")
    main()
# Result: