豆瓣电影排行抓取(电影名,年份,评价人数,内容概括),并保存到 Excel 表格中:
笔记分享
import requests
import re
import pandas as pd
# Fetch the Douban Top 250 page at `url` and collect, for every movie the
# pattern matches, four strings: title, year, rating-count text and quote.
url = "https://movie.douban.com/top250"
# Browser-like User-Agent header sent with the request.
head = {
    'User-Agent':
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36'
}
# A timeout keeps the script from hanging forever on a stalled connection.
resp = requests.get(url=url, headers=head, timeout=10)
# Fail loudly on an HTTP error status instead of silently exporting an
# empty spreadsheet built from an error page.
resp.raise_for_status()
content = resp.text

# Compile the pattern up front. re.S lets "." match newlines, so one match
# can span the several lines of markup that make up a single entry.
# The year is cut at the first whitespace character or "&nbsp;" entity.
# NOTE(review): assumes the raw HTML may separate the year from the following
# text with "&nbsp;" rather than a plain space - confirm against a live
# response.
# NOTE(review): because "." also matches newlines, an entry lacking a
# <span class="inq"> would pick up the quote of a later entry - verify
# whether such entries occur on this page.
obj = re.compile(r'<span class="title">(?P<movie_name>.*?)'
                 r'</span>.*?<br>\n\s+(?P<movie_year>.*?)(?:&nbsp;|\s).*?<span>'
                 r'(?P<movie_people>.*?)</span>.*?<span class="inq">'
                 r'(?P<movie_comments>.*?)</span>', re.S)

# Named groups in the column order expected by the spreadsheet export.
FIELD_NAMES = ('movie_name', 'movie_year', 'movie_people', 'movie_comments')

# pandas needs a two-dimensional list (one inner list of four values per
# movie); build it directly from the matches instead of flattening every
# value into one list and re-chunking it four at a time.
result = [list(match.group(*FIELD_NAMES)) for match in obj.finditer(content)]

# The export itself is wrapped in a function that only needs to be handed
# the two-dimensional list.
def func(result, path='豆瓣.xlsx'):
    """Write the scraped movie rows to an Excel workbook.

    Args:
        result: Two-dimensional list; each inner list holds the four values
            of one movie, in the order of the column headers below.
        path: Destination workbook file. Defaults to ``豆瓣.xlsx``, resolved
            against the current working directory.
    """
    columns = ['电影名', '播出年份', '人数', '电影概括']
    df = pd.DataFrame(result, columns=columns)
    # index=False keeps the DataFrame's row index out of the sheet.
    df.to_excel(path, index=False)
# Export the collected rows via func(), then print a completion message.
func(result)
print("保存完毕")