案例
import csv
import re

import requests
# Fetch the first page of the Douban Top 250 list, extract title, director,
# year, score and rating count with a regex, and save them to top250.csv.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
}

# One match per `<div class="item">` entry. re.S makes `.` match newlines so
# a single match can cross line breaks in the HTML.
# NOTE(review): the single spaces after the `dao` and `year` groups may stand
# for `&nbsp;` entities in the page markup -- confirm against the live HTML.
obj = re.compile(
    r'<div class="item">.*?<span class="title">(?P<name>.*?)</span>'
    r'.*?<p class="">.*?导演:(?P<dao>.*?) .*?<br>'
    r'(?P<year>.*?) .*?<span class="rating_num" property="v:average">'
    r'(?P<score>.*?)</span>.*?<span>(?P<num>.*?)人评价</span>',
    re.S,
)

# Download before touching the output file so a failed request does not
# leave behind a truncated top250.csv. The timeout keeps the script from
# hanging indefinitely; raise_for_status() surfaces HTTP errors instead of
# silently parsing an error page into an empty CSV.
with requests.get(
    "https://movie.douban.com/top250", headers=headers, timeout=10
) as resp:
    resp.raise_for_status()
    resp.encoding = "utf-8"
    pageSource = resp.text

# newline="" is what the csv module expects for files it writes to;
# lineterminator="\n" keeps the "\n" row endings of the hand-built rows.
with open("top250.csv", mode="w", encoding="utf-8", newline="") as f:
    # csv.writer quotes fields that contain commas or quotes, which plain
    # string concatenation would turn into corrupt rows.
    writer = csv.writer(f, lineterminator="\n")
    for item in obj.finditer(pageSource):
        writer.writerow(
            [
                item.group("name"),
                item.group("dao"),
                item.group("year").strip(),
                item.group("score"),
                item.group("num"),
            ]
        )

print("doubanTOP250提取完毕.")
运行结果及图片展示
如图:
# Fetch every page of the Douban Top 250 list (start offsets 0, 25, ..., 225)
# and write one labelled line per film to the already-open file `f`.
#
# Relies on names defined earlier in the script: `headers` (request headers)
# and `f` (a text file open for writing). `f` is closed here when done.
# NOTE(review): the single-page snippet above closes its file handle, so `f`
# must be (re)opened before this loop runs -- confirm how the two snippets
# are meant to be combined.

# Compiled once, outside the loop: the pattern is the same for every page.
# re.S makes `.` match newlines so a single match can cross line breaks.
# NOTE(review): the single spaces after the `dao` and `year` groups may stand
# for `&nbsp;` entities in the page markup -- confirm against the live HTML.
obj = re.compile(
    r'<div class="item">.*?<span class="title">(?P<name>.*?)</span>'
    r'.*?<p class="">.*?导演:(?P<dao>.*?) .*?<br>'
    r'(?P<year>.*?) .*?<span class="rating_num" property="v:average">'
    r'(?P<score>.*?)</span>.*?<span>(?P<num>.*?)人评价</span>',
    re.S,
)

try:
    for page in range(0, 250, 25):
        url = f"https://movie.douban.com/top250?start={page}&filter="
        # Close each response as soon as its text has been read, rather than
        # only the last one after the loop. The timeout prevents an
        # indefinite hang; raise_for_status() stops on an HTTP error instead
        # of silently skipping a page.
        with requests.get(url, headers=headers, timeout=10) as resp:
            resp.raise_for_status()
            resp.encoding = "utf-8"
            pageSource = resp.text

        for item in obj.finditer(pageSource):
            name = item.group("name")
            dao = item.group("dao")
            year = item.group("year").strip()
            score = item.group("score")
            num = item.group("num")
            # The last label was a second "评分" although `num` is the number
            # of people who rated the film; it is now labelled "评价人数".
            f.write(
                f"电影:{name},导演:{dao},年份:{year},评分:{score},评价人数:{num}\n"
            )
finally:
    # Always release the output file, even if a request or write failed.
    f.close()

print("doubanTOP250提取完毕.")
运行结果: