# 豆瓣电影10页:电影名称,评分,评论人数,描述信息(上映时间,制片国家,电影类型),最终将数据写到csv文件中
from bs4 import BeautifulSoup
import requests
from re import fullmatch, findall
import csv
# Browser-like User-Agent sent with every request, so the request does not go
# out with the default python-requests identifier.
# NOTE(review): presumably the site rejects the default client UA -- confirm.
headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
}

# Fetch the first page of the Top 250 list.
# - timeout: without it a stalled connection would block the script forever.
# - raise_for_status(): stop on an HTTP error instead of silently parsing an
#   error page, which would later produce a CSV with a header and no rows.
response = requests.get(
    'https://movie.douban.com/top250',
    headers=headers,
    timeout=10,
)
response.raise_for_status()

# Keep both the raw HTML (used by the regex-based extraction) and the parsed
# tree (used by the CSS-selector-based extraction).
html = response.text
soup = BeautifulSoup(html, 'lxml')
# ---- First pass: print each field of the fetched page for inspection ----


def _print_texts(tags):
    """Print the text content of every tag in *tags*, one per line."""
    for tag in tags:
        print(tag.text)


# 1. Titles: every element with class "title" under #content.
title = soup.select('#content .title')
_print_texts(title)
# Also tried to pull the titles out with a regex,
# fullmatch(r'[\u4e00-\u9fa5]+|\w+', str(title)), but that returned None.

# 2. Ratings, e.g. <span class="rating_num" property="v:average">9.7</span>
grades = soup.select('#content .rating_num')
_print_texts(grades)

# 3. Number of raters: the element carries no class/attribute to select on,
#    so a regex over the serialized document is the quickest way to get it.
nums = findall(r'(\d+)人评价', str(soup))
print(nums)

# 4. One-line quote for each movie.
comment = soup.select('#content .inq')
_print_texts(comment)

# 5. Release year, country and genre are not extracted here: they have no
#    attribute of their own to select on.
# 6. No CSV is written in this pass: the regex attempt above did not yield
#    usable titles, so the table could not be assembled.
# ---- Second pass: build one row per movie and export everything to CSV ----
#
# Every movie is parsed from its own container element, so the four fields of
# a row always belong to the same film. Collecting each field into a separate
# page-wide list and zipping the lists afterwards is fragile: as soon as one
# movie lacks a field (for example it has no one-line quote), or its title
# contains anything besides CJK characters (digits, '·', Latin letters) and is
# therefore skipped by a CJK-only regex, every following row is shifted and
# the output is truncated to the shortest list.
#
# Not extracted yet: release year, country, genre.

TOP250_URL = 'https://movie.douban.com/top250'
PAGE_SIZE = 25   # movies listed per page
PAGE_COUNT = 10  # 10 pages x 25 movies = the full Top 250
CSV_FIELDS = ['电影名', '评分', '评价人数', '简介']


def _text_of(tag):
    """Return the stripped text of *tag*, or '' when the tag is missing."""
    return tag.text.strip() if tag is not None else ''


def _parse_movies(page_soup):
    """Return a list with one CSV row (dict keyed by CSV_FIELDS) per movie."""
    rows = []
    # NOTE(review): assumes each movie is wrapped in an element with class
    # "item" under #content -- confirm against the live markup.
    for item in page_soup.select('#content .item'):
        # The first "title" element of a movie is its Chinese title; the
        # following ones hold alternative titles.
        votes = findall(r'(\d+)人评价', str(item))
        rows.append({
            '电影名': _text_of(item.select_one('.title')),
            '评分': _text_of(item.select_one('.rating_num')),
            '评价人数': votes[0] if votes else '',
            # An absent quote becomes an empty cell instead of shifting rows.
            '简介': _text_of(item.select_one('.inq')),
        })
    return rows


# The first page was already fetched and parsed into `soup` above.
data = _parse_movies(soup)

# Remaining pages are addressed through the `start` offset (25, 50, ... 225).
# NOTE(review): assumes the list is paged via ?start=<offset> -- confirm.
for start in range(PAGE_SIZE, PAGE_SIZE * PAGE_COUNT, PAGE_SIZE):
    page = requests.get(
        TOP250_URL,
        params={'start': start},
        headers=headers,
        timeout=10,
    )
    page.raise_for_status()
    data.extend(_parse_movies(BeautifulSoup(page.text, 'lxml')))

# `with` guarantees the file is flushed and closed even if writing fails.
# The 'files' directory must already exist.
with open('files/豆瓣电影.csv', 'w', encoding='utf-8', newline='') as f:
    w = csv.DictWriter(f, CSV_FIELDS)
    w.writeheader()
    w.writerows(data)