- Basic approach:
- The requests module fetches the HTML of each page; note that Maoyan has anti-scraping measures in place, so a User-Agent header must be added.
- XPath extracts the field values; to keep the output tidy, slicing and string replacement strip the label prefixes and stray whitespace from some fields (a short demo of this clean-up follows the main code).
- The csv module writes the data to a CSV file (a quick read-back check is also sketched after the code).
- The code is as follows:
import requests
from lxml import etree
import csv

def getSource(url):
    # Maoyan blocks bare requests, so send a browser User-Agent.
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'}
    response = requests.get(url, headers=headers)
    response.encoding = 'utf-8'
    return response.text

def MovieItem(source):
    # Parse one page and return a list of dicts, one per movie.
    elements = etree.HTML(source)
    movie_items = elements.xpath('//div[@class="movie-item-info"]')
    items = []
    for movie_item in movie_items:
        info = {}  # avoid shadowing the built-in name `dict`
        clip_url = movie_item.xpath('p[@class="name"]/a/@href')[0]
        clip_title = movie_item.xpath('p[@class="name"]/a/@title')[0]
        # [5:] drops the 5-character "上映时间:" label in front of the date.
        date_time = movie_item.xpath('p[@class="releasetime"]/text()')[0][5:]
        # Remove newlines/spaces, then [3:] drops the "主演:" label.
        star = movie_item.xpath('p[@class="star"]/text()')[0].replace('\n', '').replace(' ', '')[3:]
        info['url'] = f'https://maoyan.com{clip_url}'
        info['title'] = clip_title
        info['上映时间'] = date_time
        info['主演'] = star
        items.append(info)
    return items

def writeDate(item):
    # newline='' keeps the csv module from inserting blank lines on Windows.
    with open('猫眼电影爬取.csv', 'w', encoding='utf-8', newline='') as f:
        f_csv = csv.DictWriter(f, fieldnames=['url', 'title', '上映时间', '主演'])
        f_csv.writeheader()
        f_csv.writerows(item)

if __name__ == '__main__':
    base_url = 'https://maoyan.com/board/4?offset={}'
    item = []
    # The Top 100 board shows 10 movies per page, so offset runs 0, 10, ..., 90.
    for i in range(0, 100, 10):
        page_link = base_url.format(i)
        source = getSource(page_link)
        item += MovieItem(source)
    writeDate(item)
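To make the slicing and replacement in MovieItem easier to follow, here is a minimal, self-contained sketch that runs the same clean-up on a hand-written HTML fragment; the fragment only imitates the structure of the real page, and its values are for illustration:

from lxml import etree

# A made-up fragment imitating one entry of the Maoyan board page.
snippet = '''
<div class="movie-item-info">
    <p class="name"><a href="/films/1203" title="霸王别姬">霸王别姬</a></p>
    <p class="star">
        主演:张国荣,张丰毅,巩俐
    </p>
    <p class="releasetime">上映时间:1993-01-01</p>
</div>
'''

node = etree.HTML(snippet).xpath('//div[@class="movie-item-info"]')[0]
raw_time = node.xpath('p[@class="releasetime"]/text()')[0]
raw_star = node.xpath('p[@class="star"]/text()')[0]

# [5:] drops the "上映时间:" label; replace() strips whitespace before [3:] drops "主演:".
print(raw_time[5:])                                     # 1993-01-01
print(raw_star.replace('\n', '').replace(' ', '')[3:])  # 张国荣,张丰毅,巩俐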
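After the script finishes, a quick way to verify the result is to read the CSV back with csv.DictReader, as in this small sketch (it assumes the file name used above and simply prints the first few rows):

import csv

# Print the first five rows of the file written above as a sanity check.
with open('猫眼电影爬取.csv', encoding='utf-8', newline='') as f:
    for i, row in enumerate(csv.DictReader(f)):
        print(row['title'], row['上映时间'], row['主演'])
        if i >= 4:
            break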