day17 作业

百事不可乐BOOM

已于 2023-01-05 09:15:53 修改

阅读量166

点赞数 1

文章标签： python 爬虫

于 2023-01-04 20:05:51 首次发布

本文链接：https://blog.csdn.net/qq_63449560/article/details/128553619

版权

该代码片段使用Python的BeautifulSoup库从豆瓣电影Top250页面抓取电影名称、评分和评论人数，并尝试使用正则表达式提取信息。目标是将数据写入CSV文件，但部分信息如上映时间、出版国家和电影类型未成功提取。

摘要由CSDN通过智能技术生成

作业要求：爬取豆瓣电影 Top250 共 10 页的数据，包括电影名称、评分、评论人数、描述信息（上映时间、制片国家、电影类型），最终将数据写入 CSV 文件。

import csv
import os
from re import fullmatch, findall

import requests
from bs4 import BeautifulSoup

# Request headers carrying a desktop-browser User-Agent string.
# NOTE(review): presumably the site rejects the default requests User-Agent —
# confirm before removing this header.
headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
}

# Download the first page of the Top 250 list.
# The timeout keeps the script from hanging forever on a stalled connection,
# and raise_for_status() stops the run on an HTTP error instead of silently
# parsing an error page as if it were the movie list.
response = requests.get('https://movie.douban.com/top250', headers=headers, timeout=10)
response.raise_for_status()
html = response.text

# Parse the downloaded markup with the lxml parser.
soup = BeautifulSoup(html, 'lxml')

def print_texts(tags):
    """Print the text content of every tag in *tags*, one per line."""
    for tag in tags:
        print(tag.text)


# 1. Titles: every element with class "title" below #content.
title = soup.select('#content .title')
print_texts(title)

# An earlier idea was to pull the titles out with a regular expression:
#     fullmatch(r'[\u4e00-\u9fa5]+|\w+', str(title))
# It produced None: fullmatch only succeeds when the pattern covers the
# entire string, and str(title) is the string form of the whole result set.

# 2. Scores, read from elements such as
#    <span class="rating_num" property="v:average">9.7</span>
grades = soup.select('#content .rating_num')
print_texts(grades)

# 3. Number of raters. The element has no attribute to select on, so the
#    digits are captured with a regex over the serialized document instead.
#    (Selecting '#content .star' was also tried while exploring.)
nums = findall(r'(\d+)人评价', str(soup))
print(nums)

# 4. One-line quote shown for each movie.
comment = soup.select('#content .inq')
print_texts(comment)

# 5. Release date, country and genre are not extracted in this pass: their
#    element has no attribute to select on.

# 6. No CSV is written in this pass. The draft passed the three strings
#    x.text, y.text and z.text (the last title, score and quote printed) to
#    map(), which walks those strings character by character rather than
#    movie by movie, so the table could not be built.

# Second pass, reworked: crawl all ten result pages and write one CSV row
# per movie.
#
# The previous version built four independent lists (titles, scores, rater
# counts, quotes) from the first page only and combined them with map().
# Its title regex accepted only titles made purely of Chinese characters, so
# a single skipped title (or a movie without a quote element) shifted every
# later row, and map() silently truncated to the shortest list. Reading all
# fields from the same per-movie element keeps the columns aligned.

TOP250_URL = 'https://movie.douban.com/top250'
PAGE_COUNT = 10   # the assignment asks for ten pages
PAGE_SIZE = 25    # 250 movies spread over 10 pages
CSV_PATH = 'files/豆瓣电影.csv'
CSV_FIELDS = ['电影名', '评分', '评价人数', '简介', '上映时间', '国家', '电影类型']


def tag_text(tag):
    """Return the stripped text of *tag*, or '' when *tag* is None."""
    return tag.get_text().strip() if tag is not None else ''


def parse_description(text):
    """Split a movie's description paragraph into (release, country, genre).

    Only the last non-empty line of *text* is used; it is expected to look
    like ``1994 / 美国 / 犯罪 剧情``. The final '/'-separated field is taken
    as the genre and the one before it as the country; everything earlier is
    kept together as the release information, so a line listing several
    release years still parses. Fields that are absent come back as ''.
    """
    lines = [line.strip() for line in text.splitlines() if line.strip()]
    if not lines:
        return '', '', ''
    # str.strip() also removes the non-breaking spaces around each '/'.
    parts = [part.strip() for part in lines[-1].split('/')]
    parts += [''] * (3 - len(parts))  # pad so the unpacking below never fails
    return ' / '.join(parts[:-2]), parts[-2], parts[-1]


def parse_page(page_html):
    """Return one dict per movie found in *page_html*, keyed by CSV_FIELDS."""
    page = BeautifulSoup(page_html, 'lxml')
    rows = []
    # NOTE(review): assumes each movie is wrapped in an element with class
    # "item" and that its first '.bd p' is the description paragraph —
    # confirm against the live page markup.
    for item in page.select('#content .item'):
        # The rater count has no attribute to select on, so it is captured
        # with the same regex as before, limited to this movie's own text.
        votes = findall(r'(\d+)人评价', item.get_text())
        release, country, genre = parse_description(tag_text(item.select_one('.bd p')))
        rows.append({
            '电影名': tag_text(item.select_one('.title')),
            '评分': tag_text(item.select_one('.rating_num')),
            '评价人数': votes[0] if votes else '',
            '简介': tag_text(item.select_one('.inq')),  # '' when the movie has no quote
            '上映时间': release,
            '国家': country,
            '电影类型': genre,
        })
    return rows


def fetch_page(start):
    """Download the list page that begins at movie offset *start*.

    Raises requests.HTTPError when the server answers with an error status.
    """
    # NOTE(review): assumes the site paginates through a ``start`` query
    # parameter (0, 25, ..., 225) — confirm against the page's "next" links.
    response = requests.get(TOP250_URL, headers=headers,
                            params={'start': start}, timeout=10)
    response.raise_for_status()
    return response.text


def crawl_top250(page_count=PAGE_COUNT):
    """Fetch *page_count* consecutive pages and return all movie rows."""
    movies = []
    for page_index in range(page_count):
        movies.extend(parse_page(fetch_page(page_index * PAGE_SIZE)))
    return movies


def save_movies(movies, path=CSV_PATH):
    """Write *movies* (dicts keyed by CSV_FIELDS) to *path* with a header row."""
    folder = os.path.dirname(path)
    if folder:
        # Create the output folder so a fresh checkout does not crash on open().
        os.makedirs(folder, exist_ok=True)
    # The with-block guarantees the file is flushed and closed.
    with open(path, 'w', encoding='utf-8', newline='') as f:
        w = csv.DictWriter(f, CSV_FIELDS)
        w.writeheader()
        w.writerows(movies)


if __name__ == '__main__':
    save_movies(crawl_top250())