用 Python 3.9 + BeautifulSoup 爬取豆瓣电影 Top 250
工作之余学习了python,今天就用python写了一个爬取豆瓣电影Top 250的脚本,不得不感慨python在爬虫这一块有天然的优势!好了,进入正题。
数据全是包装在li下,所以就简单了,我们就可以利用BeautifulSoup去获取所有的li标签,然后循环取出我们想要的数据!
BeautifulSoup可以在 https://beautifulsoup.readthedocs.io/zh_CN/v4.4.0/ 上学习,有中文翻译,对英语不太好的小伙伴很友好,比如说我,哈哈哈。。。虽然翻译的不是最新版,但也不妨碍我们学习!
直接贴代码了,里面有一些必要的注释。
有点冗余,等有空看看还能优化不
import requests
from bs4 import BeautifulSoup
""""
获取指定url路径的页面
"""
def douban_rank250(url):
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"}
r = requests.get(url, headers=headers)
return r.text
"""
保存页面,主要用做本地测试
"""
def save_html(html):
filename = "D:\\learning\\python\\douban_rank_250.html"
with open(filename, "w", encoding="UTF-8") as file_object:
file_object.write(html)
"""
本地测试,读取页面
"""
def open_html():
filename = "D:\\learning\\python\\豆瓣电影 Top 250.html"
data = ""
with open(filename, "r", encoding="UTF-8") as file_object:
lines = file_object.readlines()
for line in lines:
data += line
return data
"""
解析页面元素
"""
def parse(html):
soup = BeautifulSoup(html, "html.parser")
items = soup.find_all("div", class_="item")
for item in items:
result = ""
print("---*******************---")
rank = item.find("em")
print("rank = " + rank.string)
result += "rank " + rank.string + "\n"
# 获取图片的url
img_tag = item.find("img")
img_src = img_tag["src"]
print("image url = " + img_src)
result += "image url = " + img_src + "\n"
# 获取内容
hd_tag = item.find("div", class_="hd")
movie_url = hd_tag.a['href']
print("movie url = " + movie_url)
result += "movie url = " + movie_url + "\n"
spans = hd_tag.a.find_all("span")
movie_title = ""
for span_content in spans:
movie_title += span_content.string.strip()
print("movie title = " + movie_title)
result += "movie title = " + movie_title + "\n"
bd_tag = item.find("div", class_="bd")
p_tags = bd_tag.find_all("p")
if len(p_tags) == 2:
desc = p_tags[0].text.strip()
quote = p_tags[1].text.strip()
print("desc = ")
result += "desc =" + "\n"
result += desc + "\n"
print(desc)
print("quote = " + quote)
result += "quote = " + quote + "\n"
star_tag = item.find("div", class_="star")
spans = star_tag.find_all("span")
print("star and comments = ")
result += "star and comments = " + "\n"
for span_content in spans:
print(span_content.string)
result += str(span_content.string) + "\n"
result += "\n"
result += "\n"
save_to_file(result)
"""
存储结果到本地文件
"""
def save_to_file(content):
with open("D:\\learning\\python\\result.txt", "a", encoding="UTF-8") as file_object:
file_object.write(content)
if __name__ == '__main__':
    # Crawl all 10 listing pages (25 movies each = Top 250).
    # For offline testing, save one page with save_html() and feed
    # open_html() output into parse() instead of fetching.
    max_page = 10
    size = 25
    for page in range(max_page):
        # the site pages via the ?start= offset
        url = f"https://movie.douban.com/top250?start={page * size}&filter="
        res = douban_rank250(url)
        parse(res)
生成的结果文件如下