from bs4 import BeautifulSoup
import requests
import csv
def get_url_content(page):
    """Fetch one page of the Douban Top 250 listing.

    Args:
        page: offset into the ranking (0, 25, 50, ...) passed as the
            ``start`` query parameter.

    Returns:
        The ``requests.Response`` for the listing page.
    """
    # Browser-like User-Agent: Douban blocks the default requests UA.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/83.0.4103.61 Safari/537.36 '
    }
    # A timeout keeps the scraper from hanging forever on a stalled
    # connection; requests has no default timeout.
    response = requests.get(
        "https://movie.douban.com/top250?start=" + str(page),
        headers=headers,
        timeout=10,
    )
    return response
def get_movie_table(response):
    """Parse one Top 250 listing page into rows of movie data.

    Args:
        response: HTTP response whose body is a Douban Top 250 list page.

    Returns:
        A list of rows; the first row is the Chinese column header
        ['名称', '评分', '链接', '推荐语'] (name, rating, link, tagline),
        followed by one [name, star, link, comment] row per movie.
        A field that cannot be found becomes an empty string.
    """
    soup = BeautifulSoup(response.content, "html5lib")
    movies = soup.find_all('div', class_='item')
    movies_table = [['名称', '评分', '链接', '推荐语']]
    for item in movies:
        # Hoist the shared container lookup instead of repeating it
        # for every field below.
        info = item.find('div', class_='info')
        try:
            name = info.find('span', class_='title').text
        except AttributeError:  # find() returned None: tag missing
            name = ''
        try:
            star = info.find('span', class_='rating_num').get_text()
        except AttributeError:
            star = ''
        try:
            link = info.find('a')['href']
        except (AttributeError, TypeError, KeyError):  # missing <a> or href
            link = ''
        try:
            comment = info.find('span', class_='inq').get_text()
        except AttributeError:
            comment = ''
        movies_table.append([name, star, link, comment])
    return movies_table
if __name__ == '__main__':
    movie_table = []
    # The list is paginated by ?start= in steps of 25: 0, 25, ..., 225,
    # i.e. ten pages covering all 250 movies.
    for page in range(0, 226, 25):
        temp_movies_table = get_movie_table(get_url_content(page))
        # get_movie_table() prefixes EVERY page with the same header row;
        # keep the header only once (from the first page) so the CSV does
        # not contain ten duplicate header rows.
        if movie_table:
            movie_table.extend(temp_movies_table[1:])
        else:
            movie_table.extend(temp_movies_table)
    print('获取信息完成')
    # Persist the collected rows as movies.csv in the project folder.
    try:
        # Pin UTF-8 so the Chinese text is written portably regardless of
        # the OS default locale encoding.
        with open('movies.csv', 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerows(movie_table)
        print("存储完成")
    except OSError:  # only filesystem errors — not KeyboardInterrupt etc.
        print('存储失败')
# 爬虫练习:获取豆瓣top250的电影 (Scraping exercise: fetch the Douban Top 250 movies)
# 最新推荐文章于 2024-08-13 18:54:58 发布 (blog footer from the source article,
# kept as a comment so the file remains valid Python)
# ![](https://img-home.csdnimg.cn/images/20240711042549.png)