# 1. Import the required packages
from lxml import etree
import requests
import csv
# 2. Define the target URL
# URL template for the Douban Top 250 listing; the `{}` placeholder is
# filled with the `start` offset via str.format().
doubanUrl = 'https://movie.douban.com/top250?start={}&filter='
# 3. Fetch the page source
def getSource(url, timeout=10):
    """Fetch a page and return its body decoded as UTF-8 text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        The response body as a ``str``.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    # Send a desktop-browser User-Agent instead of the requests default.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
    }
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    response = requests.get(url, headers=headers, timeout=timeout)
    # Force UTF-8 decoding rather than relying on the detected charset.
    response.encoding = 'utf-8'
    return response.text
# 4. Parse the data
def getEveryItem(source):
html_element = etree.HTML(source)
class = ‘info’ 电影的名字,评分,引言,详情页的
movieItemList = html_element.xpath(“//div[@class=‘info’]”)
定义一个空列表,添加字典数据
movieList = []
for eachMovie in movieItemList: