1.导包
from bs4 import BeautifulSoup
import requests
import xlwt
使用 BeautifulSoup 解析网页,requests 获取服务器响应,并用 xlwt 把结果写入 Excel 表格。
2.解析网页
# Fetch the parsed page source
def GetHTML(url):
    """Download the HTML of *url* and return it as text.

    Args:
        url: Full URL of the page to fetch.

    Returns:
        The page's HTML source as a string, or ``None`` when the request
        fails (non-2xx status, connection error, or timeout).
    """
    # Browser-like User-Agent header: Douban blocks the default
    # python-requests UA, so we must impersonate a real browser.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/126.0.0.0 Safari/537.36 Edg/126.0.0.0"
    }
    try:
        # BUGFIX: added a timeout — without one, requests.get can block
        # forever on a stalled connection.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException:
        # BUGFIX: connection errors / timeouts used to propagate and crash
        # the caller; follow the function's None-on-failure contract instead.
        print("获取失败")
        return None
    if response.ok:
        # HTML source of the page
        return response.text
    else:
        print("获取失败")
        return None
3.获得网页中的数据
def GetDate(baseurl):
    """Scrape all 10 pages of Douban Top250 and collect per-movie fields.

    Args:
        baseurl: URL prefix; the paging offset (0, 25, ..., 225) is
            appended to form each page's URL.

    Returns:
        A list of 9 parallel column lists, in the order expected by
        ``Save_Data``: [numbers, graph_links, movie_links, cnames, onames,
        details, marks, remark_num, quotes].
    """
    numbers = []      # rank numbers
    graph_links = []  # poster/image page links
    movie_links = []  # movie detail-page links
    cnames = []       # Chinese movie titles
    onames = []       # foreign movie titles
    details = []      # related info (director, year, genre, ...)
    marks = []        # movie ratings
    remark_num = []   # number of ratings
    quotes = []       # one-line summaries
    for i in range(10):
        # Each results page shows 25 movies; ?start= is the offset.
        url = baseurl + str(i * 25)
        html = GetHTML(url)
        # BUGFIX: GetHTML returns None on failure, and BeautifulSoup(None)
        # raises TypeError — skip the failed page instead of crashing.
        if html is None:
            print(f"第{i+1}页获取失败, 已跳过")
            continue
        # Parse the data out of the page
        soup = BeautifulSoup(html, "html.parser")
        all_items = soup.findAll("div", attrs={"class": "item"})
        for item in all_items:
            # Rank number
            em = item.find("em", attrs={"class": ""})
            numbers.append(em.string)
            # Image link
            pic = item.find("div", attrs={"class": "pic"})
            graph_links.append(pic.a.get("href"))
            # Movie detail-page link
            hd = item.find("div", attrs={"class": "hd"})
            movie_links.append(hd.a.get("href"))
            # Titles: the first span is the Chinese title; a second span,
            # if present, holds the foreign title prefixed by a 3-char
            # "&nbsp;/&nbsp;" separator which we strip with [3:].
            names = item.findAll("span", attrs={"class": "title"})
            cnames.append(names[0].string)
            if len(names) == 2:
                onames.append(names[1].string[3:])
            else:
                onames.append("无")
            # Related info
            bd = item.find("div", attrs={"class": "bd"})
            details.append(bd.p.get_text().replace(" ", "").replace("\n", "").replace("\xa0", " "))
            # Rating
            rate_num = item.find("span", attrs={"class": "rating_num"})
            marks.append(rate_num.text)
            # Number of ratings: the 4th <span> in the star block reads
            # like "123456人评价"; [:-3] drops the trailing "人评价".
            star = item.find("div", attrs={"class": "star"})
            all_span = star.find_all("span")
            remark_num.append(all_span[3].string[:-3])
            # One-line summary (some movies have none)
            quote = item.find("p", attrs={"class": "quote"})
            if quote is None:
                quotes.append("无")
            else:
                quotes.append(quote.span.get_text())
        print(f"第{i+1}页获取成功")
    data_list = [numbers, graph_links, movie_links, cnames, onames, details, marks, remark_num, quotes]
    print("所有网页数据获取成功")
    return data_list
4.保存数据到指定路径
# Save the data to a spreadsheet
def Save_Data(datalist, save_path):
    """Write the scraped columns into an .xls workbook.

    Args:
        datalist: List of parallel column lists, in the same order as the
            header row below (as produced by ``GetDate``).
        save_path: Path of the .xls file to create.
    """
    print("save.......")
    book = xlwt.Workbook(encoding="utf-8", style_compression=0)  # create the workbook object
    sheet = book.add_sheet('豆瓣电影Top250', cell_overwrite_ok=True)  # create the worksheet
    col = ("序号", "图片链接", "电影详情链接", "影片中文名", "电影外文名", "相关信息", "评分", "评价数", "概述")
    for i in range(0, len(col)):
        sheet.write(0, i, col[i])  # header row
    for i in range(0, len(col)):
        data = datalist[i]
        # BUGFIX: iterate over the rows actually scraped instead of a
        # hard-coded 250 — a partial run (e.g. a failed page) no longer
        # raises IndexError.
        for j in range(len(data)):
            sheet.write(j + 1, i, data[j])  # data cell (row j+1, column i)
    book.save(save_path)  # write the file to disk
    print("保存完成")
5.整体运行
# Main entry point
def main():
    """Crawl the Douban Top250 list and save the results to an .xls file."""
    # Base URL of the pages to crawl; GetDate appends the paging offset
    baseurl = "https://movie.douban.com/top250?start="
    datalist = GetDate(baseurl)
    # Save the scraped data
    save_path = "豆瓣电影Top250-2.xls"
    Save_Data(datalist, save_path)


# BUGFIX: main() was defined but never invoked, so running the script did
# nothing; the standard entry-point guard fixes that without side effects
# on import.
if __name__ == "__main__":
    main()