# _*_ coding:utf-8 _*_
"""
file_name:py_movie
author:Sam
"""
from urllib import request
from bs4 import BeautifulSoup
# Output file used when the caller does not pass ``output_path``.
_DEFAULT_OUTPUT_PATH = 'd:/project/Python/demo/test_py_movie.txt'

# Browser-like User-Agent sent with every request.
_USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/76.0.3809.132 Safari/537.36'
)

# Horizontal rule written after every entry in the output file.
_SEPARATOR = "**********************************************************************************************************************"

# (label, entry key) pairs in the order they are written to the file.
_FIELD_LABELS = (
    ("[排名]:", 'rank'),
    ("[标题]:", 'title'),
    ("[其他]:", 'other'),
    ("[评语]:", 'inq'),
    ("[详情]:", 'info'),
    ("[评分]:", 'grade'),
    ("[评分人数]:", 'people'),
    ("[主图]:", 'img'),
    ("[播放路径]:", 'play_url'),
)


def _parse_item(item):
    """Extract one entry's fields from a ``<div class="item">`` tag into a dict."""
    entry = {}
    # Rank
    entry['rank'] = item.find('em').get_text()
    # Link target of the first anchor
    entry['play_url'] = item.find('a').get('href')
    # Poster image
    entry['img'] = item.find('img').get("src")
    # Title (first span with class "title")
    entry['title'] = item.find('span', class_='title').get_text()
    # Alternative titles
    entry['other'] = item.find('span', class_='other').get_text()
    # Rating
    entry['grade'] = item.find('span', class_='rating_num').get_text()
    # Number of raters: taken positionally as the eighth child node of the
    # "star" div. NOTE(review): this index depends on the exact markup
    # (whitespace text nodes included) - confirm against the live page.
    star = item.find('div', class_='star')
    entry['people'] = star.contents[7].get_text()
    # One-line quote; some entries have none, so fall back to a placeholder
    # instead of raising AttributeError on None.
    quote = item.find('span', class_='inq')
    entry['inq'] = quote.get_text() if quote is not None else '暂无'
    # Details: child nodes 0 and 2 of the first <p> inside the "bd" div
    # (node 1 is presumably the <br> between them - verify against markup).
    paragraph = item.find('div', class_='bd').find('p')
    entry['info'] = (
        paragraph.contents[0].strip()
        + " 年份产地:"
        + paragraph.contents[2].strip()
    )
    return entry


def _format_entry(entry):
    """Render one parsed entry as the text block appended to the output file."""
    lines = [label + entry[key] for label, key in _FIELD_LABELS]
    return "\n".join(lines) + "\n\n" + _SEPARATOR + "\n\n"


def main(request_url="", output_path=_DEFAULT_OUTPUT_PATH):
    """Fetch one listing page, parse its entries and append them to a text file.

    Args:
        request_url: URL of the page to download. The default empty string is
            not a usable URL; callers are expected to pass one.
        output_path: File the formatted entries are appended to (UTF-8).
            Defaults to the path the script originally hard-coded.

    Raises:
        ValueError / urllib.error.URLError: if the URL is invalid or the
            request fails.
        AttributeError / IndexError: if an item lacks one of the mandatory
            tags the parser looks up.
    """
    # Build the request with a browser-like User-Agent header.
    req = request.Request(request_url)
    req.add_header('User-Agent', _USER_AGENT)

    # Download and decode as UTF-8; the context manager closes the response.
    with request.urlopen(req) as res:
        html = res.read().decode('utf-8')

    # Parse every <div class="item"> on the page.
    soup = BeautifulSoup(html, "html.parser")
    entries = [_parse_item(item) for item in soup.find_all('div', class_="item")]

    # Append all entries of this page in a single write; ``with`` closes the file.
    write_data = "".join(_format_entry(entry) for entry in entries)
    with open(output_path, 'a+', encoding='utf-8') as f:
        f.write(write_data)
if __name__ == '__main__':
    # Request ten pages, stepping the ``start`` query value through
    # 0, 25, ..., 225.
    base_url = "https://movie.douban.com/top250?start="
    for offset in range(0, 250, 25):
        main(base_url + str(offset))
# Python: scraping the Douban Top 250 movies with BeautifulSoup
# (Web-page residue: "latest recommended article, published 2024-05-01 03:47:40".)