# Without further ado, here is the code.
import requests
import re
import json
# Parse one Douban Top 250 listing page into movie records.
#
# NOTE(review): the pattern published with this snippet lost its HTML tags and
# the "*" of every ".*?" during extraction, leaving three capture groups while
# the loop reads four fields. The pattern below is a reconstruction based on
# the list-item markup (title span, <p class="">, <br>, rating_num span) --
# confirm it against the live page before relying on it.
_MOVIE_ITEM_RE = re.compile(
    r'<li>.*?<span class="title">(.*?)</span>'   # 1: first title span
    r'.*?<p class="">(.*?)<br\s*/?>'             # 2: director/cast line
    r'.*?(\d{4})'                                # 3: first 4-digit run (year)
    r'.*?<span class="rating_num" property="v:average">(.*?)</span>'  # 4: score
    r'.*?</li>',
    re.S,  # let "." cross newlines: one item spans many lines of HTML
)

# NOTE(review): some sites reject the default python-requests User-Agent;
# a browser-like value is sent here -- confirm it is needed for this endpoint.
_REQUEST_HEADERS = {"User-Agent": "Mozilla/5.0"}

# Seconds to wait for the server; without a timeout requests.get can block
# indefinitely.
_REQUEST_TIMEOUT = 10


def _fetch_html(url):
    """Download *url* and return the response body decoded as UTF-8."""
    response = requests.get(
        url, headers=_REQUEST_HEADERS, timeout=_REQUEST_TIMEOUT
    )
    return response.content.decode()


def pase_detail(url, fetch=None):
    """Yield one dict per movie found on the listing page at *url*.

    Args:
        url: Address of the listing page to scrape.
        fetch: Optional callable ``fetch(url) -> str`` returning the page
            HTML. Defaults to an HTTP GET via ``requests``; supplying it
            lets callers reuse cached HTML or run without network access.

    Yields:
        Dicts with the string keys ``movie_name``, ``movie_direct``,
        ``movie_year`` and ``movie_score``; every value is a string taken
        from the page.
    """
    content = (fetch or _fetch_html)(url)
    for name, director, year, score in _MOVIE_ITEM_RE.findall(content):
        # Collapse the multi-line director text to one line, then replace
        # spaces with "--".
        # NOTE(review): the search string appears as a single space in the
        # published snippet; it may originally have been the run of "&nbsp;"
        # entities separating director from cast -- confirm.
        director = director.replace("\n", "").strip().replace(" ", "--")
        yield {
            "movie_name": name,
            "movie_direct": director,
            "movie_year": year,
            "movie_score": score,
        }
# Persist scraped records, one JSON object per line.
def write_content(movie_info, path='douban_top250.txt'):
    """Append *movie_info* to *path* as a single JSON line.

    Args:
        movie_info: JSON-serializable record (the script passes the dicts
            produced by ``pase_detail``).
        path: Output file; created if missing, always appended to.
            Defaults to ``douban_top250.txt`` in the working directory.
    """
    # ensure_ascii=False keeps non-ASCII text readable in the file instead
    # of \uXXXX escapes; the file is therefore opened explicitly as UTF-8.
    with open(path, 'a', encoding='utf-8') as fp:
        fp.write(json.dumps(movie_info, ensure_ascii=False) + '\n')


def main():
    """Scrape every listing page, printing and saving each movie record."""
    url = 'https://movie.douban.com/top250?start={}&filter='
    # 250 entries at 25 per page -> start offsets 0, 25, ..., 225.
    # The original range(11) also requested start=250, one page past the end.
    for page in range(10):
        for movie_msg in pase_detail(url.format(page * 25)):
            print(movie_msg)
            write_content(movie_msg)


if __name__ == "__main__":
    main()