# Environment: Python 3.6 + BeautifulSoup
# Scrapes one page of movie information from https://movie.douban.com/top250
import requests # 导入网页请求库
from bs4 import BeautifulSoup # 导入网页解析库
import json
# 用于发送请求,获得网页源代码以供解析
def start_requests(url, timeout=10):
    """Download *url* and return the raw response body.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The undecoded response body as ``bytes``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
    """
    # NOTE(review): some sites (reportedly including Douban) refuse the
    # default python-requests User-Agent; send a browser-like one instead.
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/120.0 Safari/537.36'
        ),
    }
    # Without a timeout, requests may block forever on an unresponsive host.
    r = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly rather than handing an error page to the parser.
    r.raise_for_status()
    return r.content
# 接收网页源代码解析出需要的信息
def parse(text):
    """Extract movie records from the HTML of a Douban Top 250 page.

    Args:
        text: HTML source of the page (``str`` or ``bytes``).

    Returns:
        A list of dicts, one per ``<div class="item">`` element, each with
        the string-valued keys ``title``, ``score``, ``quote`` and
        ``comment_num``. A field whose element is missing from the markup
        is returned as an empty string instead of raising.
    """
    def text_of(tag):
        # ``find`` returns None when nothing matches; map that to ''.
        return tag.text if tag is not None else ''

    # Trailing label of the rating-count span, e.g. '123456人评价'.
    suffix = '人评价'

    soup = BeautifulSoup(text, 'html.parser')
    result_list = []
    for movie in soup.find_all('div', class_='item'):
        # The rating count is the last <span> inside the 'star' div.
        star = movie.find('div', class_='star')
        spans = star.find_all('span') if star is not None else []
        comment_num = spans[-1].text if spans else ''
        # Strip the label only when it is really there, so a markup change
        # does not silently chop three digits off the number.
        if comment_num.endswith(suffix):
            comment_num = comment_num[:-len(suffix)]

        result_list.append({
            'title': text_of(movie.find('span', class_='title')),
            'score': text_of(movie.find('span', class_='rating_num')),
            # Not every entry has a one-line quote.
            'quote': text_of(movie.find('span', class_='inq')),
            'comment_num': comment_num,
        })
    return result_list
# 将数据写入json文件
def write_json(result, path='movies.json'):
    """Write *result* to a file as pretty-printed UTF-8 JSON.

    Args:
        result: Any JSON-serializable object.
        path: Destination file. Defaults to ``movies.json`` in the current
            working directory; an existing file is overwritten.

    Raises:
        TypeError: If *result* contains values ``json`` cannot serialize.
        OSError: If the file cannot be opened or written.
    """
    # Serialize before opening the file so a serialization error cannot
    # leave a truncated file behind. ensure_ascii=False keeps non-ASCII
    # text (e.g. Chinese titles) readable instead of \uXXXX escapes.
    s = json.dumps(result, indent=4, ensure_ascii=False)
    with open(path, 'w', encoding='utf-8') as f:
        f.write(s)
# 主运行函数,调用其他函数
def main():
    """Run the scraper: fetch the Top 250 page, parse it, save the result."""
    page_source = start_requests('https://movie.douban.com/top250')
    write_json(parse(page_source))
# 一般做法
if __name__ == '__main__':
    # Run the scraper only when executed as a script, not when imported.
    main()