import pandas as pd
import requests
from bs4 import BeautifulSoup
import csv
import time
# 爬取前250部电影的名称、评分、导演、链接
def getHTMLText(url):
    """Download *url* and return the response body as text.

    Sends a GET request with a desktop-browser User-Agent header and a
    30-second timeout. The body is decoded using the encoding detected from
    the content (``apparent_encoding``) instead of the one declared in the
    response headers.

    Args:
        url: Address of the page to download.

    Returns:
        The decoded page text, or the sentinel string "产生异常" when the
        request fails (connection error, timeout, or HTTP error status).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0'
    }
    try:
        r = requests.get(url, headers=headers, timeout=30)
        r.raise_for_status()  # raise HTTPError for 4xx/5xx responses
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException as exc:
        # Only request-level failures are treated as recoverable. Anything
        # else (KeyboardInterrupt, programming errors) propagates instead of
        # being silently turned into the sentinel value.
        print('error', exc)
        return "产生异常"
def getMovieList(html):
    """Parse one listing page and return a list of movie records.

    Every ``<div class="item">`` in *html* yields one dict with the keys
    'movieName', 'rating_num', 'link' and 'director', inserted in that order.

    Args:
        html: Markup of a listing page as a string.

    Returns:
        A list of dicts, one per item found; empty when no item is present.

    Raises:
        AttributeError, TypeError, IndexError: when an item lacks one of the
            expected elements or the '导演: ' marker (the lookups are not
            guarded).
    """
    document = BeautifulSoup(html, 'html.parser')
    records = []
    for item in document.find_all('div', class_='item'):
        # First title span, rating span and first anchor of the item.
        title = item.find('span', class_='title').string
        score = item.find('span', class_='rating_num').string
        href = item.find('a')['href']
        # The <p class=""> inside the "bd" div carries the credits text.
        # NOTE(review): presumably "导演: <names> 主演: <names>..." -- confirm
        # against the live page markup.
        credits_text = item.find('div', class_='bd').find('p', class_='').get_text()
        # Keep what precedes the first '...', then take the part between
        # '导演: ' and the first '主' as the director.
        leading_part = credits_text.split('...')[0]
        after_label = leading_part.split('导演: ')[1]
        director = after_label.split('主')[0].strip()
        records.append({
            'movieName': title,
            'rating_num': score,
            'link': href,
            'director': director,
        })
    return records
def save_to_csv(movielist):
    """Append the given movie records to ``DouBanMovieT250.csv``.

    Each record becomes one row in the column order name, rating, director,
    link. The file is opened in append mode, so repeated calls add rows to
    the end of the file; no header row is written.

    Args:
        movielist: Iterable of dicts with the keys 'movieName', 'rating_num',
            'director' and 'link'.
    """
    rows = (
        [record['movieName'], record['rating_num'], record['director'], record['link']]
        for record in movielist
    )
    with open('DouBanMovieT250.csv', 'a+', encoding='UTF-8', newline='') as out_file:
        csv.writer(out_file).writerows(rows)
def save_to_excel(movielist):
    """Write the movie records to ``DouBanMovieT250.xlsx``.

    Prints the shape of the resulting table, then saves it without the index
    column using the 'xlsxwriter' engine.

    Args:
        movielist: List of dicts with the keys 'movieName', 'rating_num',
            'link' and 'director'. May be empty, in which case only the
            header row is written.
    """
    columns = ['movieName', 'rating_num', 'link', 'director']
    # Select the columns by name at construction time. Assigning df.columns
    # afterwards only relabels by position and raises ValueError (length
    # mismatch) when the list is empty, e.g. after every download failed.
    df = pd.DataFrame(movielist, columns=columns)
    print(df.shape)
    df.to_excel('DouBanMovieT250.xlsx', index=False, engine='xlsxwriter')
def main():
    """Scrape the ten listing pages and store the results.

    Iterates over the start offsets 0, 25, ..., 225, pausing two seconds
    before each request. Each page's records are appended to the CSV file as
    soon as they are parsed; the records of all pages are written to the
    Excel file once at the end.
    """
    collected = []
    for offset in range(0, 226, 25):
        time.sleep(2)  # pause between requests; adjust as needed
        page_url = f'https://movie.douban.com/top250?start={offset}&filter='
        page_movies = getMovieList(getHTMLText(page_url))
        save_to_csv(page_movies)
        collected.extend(page_movies)
    save_to_excel(collected)
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
# 【python爬虫】爬取豆瓣前250电影信息并保存在csv和excel中(附源码)
# 最新推荐文章于 2024-06-06 18:25:37 发布