import pprint
import pandas as pd
import requests
from bs4 import BeautifulSoup
import openpyxl
def download_all_html():
    """Download the HTML of every Douban Top 250 listing page.

    Requests ``https://movie.douban.com/top250`` once for each ``start``
    offset in ``range(0, 250, 25)`` (0, 25, ..., 225) and collects the
    response bodies.

    :return: list of HTML strings, one per requested page, in request order
    :raises Exception: if any response has a status code other than 200
    :raises requests.exceptions.RequestException: if a request fails at the
        network level or exceeds the timeout
    """
    # The headers never change between requests, so build them once.
    # NOTE(review): presumably a browser-like User-Agent is needed for the
    # site to serve the page -- confirm.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36"
    }
    # requests has no default timeout; without one a stalled connection
    # would block this function forever.
    timeout_seconds = 10

    htmls = []
    for num in range(0, 250, 25):
        url = f"https://movie.douban.com/top250?start={num}"
        response = requests.get(url, headers=headers, timeout=timeout_seconds)
        print(response.status_code)
        if response.status_code != 200:
            # Include the URL and status so the failing page can be identified.
            raise Exception(
                f"error: GET {url} returned status {response.status_code}"
            )
        htmls.append(response.text)
    return htmls
# Fetch all listing pages as soon as the module is executed.
htmls = download_all_html()
# Print the list of raw page HTML strings.
print(htmls)
def parse_single_html(html):
    """Parse one listing page and return its movie records.

    Locates the first ``div.article``, then the first ``ol.grid_view``
    inside it, and builds one record for every ``div.item`` found there.

    :param html: HTML text of a single page
    :return: list of dicts with the keys ``rank``, ``title``,
        ``rating_star``, ``rating_num`` and ``comments``
    """
    document = BeautifulSoup(html, 'html.parser')
    article = document.find('div', class_='article')
    movie_list = article.find('ol', class_='grid_view')

    def build_record(item):
        """Turn a single ``div.item`` element into a record dict."""
        rank = item.find('div', class_='pic').find('em').get_text()
        info = item.find('div', class_='info')
        header = info.find('div', class_='hd')
        title = header.find('span', class_='title').get_text()

        body = info.find('div', class_='bd')
        star_spans = body.find('div', class_='star').find_all('span')
        # Span 0 carries the rating in its first CSS class name, span 1
        # holds the numeric score text and span 3 the vote-count text.
        star_class = star_spans[0]['class'][0]
        score = star_spans[1].get_text()
        votes = star_spans[3].get_text()

        return {
            "rank": rank,
            "title": title,
            # Strip the "rating" and "-t" parts of the class name; what is
            # left is presumably the star level -- confirm against the page.
            "rating_star": star_class.replace("rating", "").replace("-t", ""),
            "rating_num": score,
            "comments": votes.replace("人评价", ""),
        }

    return [
        build_record(item)
        for item in movie_list.find_all('div', class_='item')
    ]
# Merge the records of every page into one flat list, keeping page order.
all_datas = []
for html in htmls:
    all_datas += parse_single_html(html)
pprint.pprint(all_datas)

# Save the results to an Excel workbook.
df = pd.DataFrame(all_datas)
df.to_excel("豆瓣电影Top250.xlsx")
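# NOTE: the lines below are feedback-widget text captured from the web page
# this script was copied from; they are not part of the program.
# Was "Related recommendations" helpful to you?
# - Very unhelpful
# - Unhelpful
# - Neutral
# - Helpful
# - Very helpful
# Submit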