运用到的库有:
time库
requests库
bs4库
如有不懂的地方,可以在下方留言交流。
#!D:\Anaconda\Python 3.7.0
# -*- coding: utf-8 -*-
# @author:LiuQing
import requests
from bs4 import BeautifulSoup
import time
# Read the performance counter once at import time; the return value is discarded.
# NOTE(review): presumably meant to mark the start of the run - confirm, since
# nothing stores this reading.
time.perf_counter()
def get_url(url):
    """Download a page and return its body as decoded text.

    Sends a GET request with a minimal ``user-agent`` header and a 30-second
    timeout. The HTTP status code is not inspected, so the body of an error
    response is returned just like a successful one.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body as a string.
    """
    page = requests.get(
        url,
        headers={'user-agent': 'Mozilla/5.0'},
        timeout=30,
    )
    # Decode with the encoding detected from the content itself rather than
    # the one taken from the response headers.
    page.encoding = page.apparent_encoding
    return page.text
def get_info(txt):
    """Parse one listing page and print a comma-separated line per entry.

    Every ``<div class="item">`` in the markup is treated as one entry. For
    each entry the printed fields are, in order: rank, number of ratings,
    score, title, cast text and link.

    Args:
        txt: HTML source of the page.

    Returns:
        None. The extracted fields are written to standard output.
    """
    document = BeautifulSoup(txt, 'html.parser')
    for entry in document.find_all('div', class_='item'):
        # 1. Rank: text of the first <div class="pic">, surrounding newlines removed.
        pic_blocks = entry.find_all('div', class_='pic')
        rank = pic_blocks[0].text.strip('\n')
        # 2. Number of ratings: text of the second-to-last <span> in the entry.
        spans = entry.find_all('span')
        votes = spans[-2].text
        # 3. Title: first <span class="title">.
        title_node = entry.find('span', class_='title')
        title = title_node.text
        # 4. Score: <span class="rating_num">.
        score_node = entry.find('span', class_='rating_num')
        score = score_node.text
        # 5. Cast: text of the first <p>, surrounding whitespace removed.
        paragraphs = entry.find_all('p')
        cast = paragraphs[0].text.strip()
        # 6. Link: href of the first <a> inside the entry.
        anchor = entry.a
        link = anchor.get('href')
        print(rank, votes, score, title, cast, link, sep=',')
def main():
    """Scrape every page of the Douban movie Top 250 list and print the entries.

    Prints a comma-separated header, then fetches the ten listing pages
    (``start`` = 0, 25, ..., 225) and lets ``get_info`` print one line per
    entry. Finally prints how long the whole run took, in the units of
    ``time.perf_counter`` (fractional seconds).

    Returns:
        None. All output goes to standard output.
    """
    # perf_counter has an undefined reference point, so only the difference
    # between two readings is a meaningful duration.
    started = time.perf_counter()
    # Same separator and column order as the rows printed by get_info:
    # rank, number of ratings, score, title, cast, url.
    print(','.join(('排名', '评分人数', '分数', '标题', '演员', 'url')))
    # Walk the paginated list, 25 entries per page.
    for i in range(0, 250, 25):
        html = f'https://movie.douban.com/top250?start={i}&filter='
        txt = get_url(html)
        get_info(txt)
    print(f'程序共运行{time.perf_counter() - started}')
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()