参考上篇文章。
附上代码:
import requests
from bs4 import BeautifulSoup
import mysql.connector
def get_pages_link():
# 插入到数据库
conn = mysql.connector.connect(user='root', password='root', database='test')
cursor = conn.cursor()
header = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36',
'Connection': 'keep-alive'
}
for item in range(0, 250, 25):
url = "https://book.douban.com/top250?start={}".format(item)
web_data = requests.get(url, headers=header)
soup = BeautifulSoup(web_data.content, 'lxml')
for movie in soup.select('.item'):
href = movie.find('a')["href"]
name = movie.get_text().strip()[:20].strip() # 片名