豆瓣电影top250的url
https://movie.douban.com/top250
每页的url有规律,可以构造url列表
本文采用的方式是:在页面中找到“后页”按钮的链接,拼接成完整的下一页 url
今天要爬取的数据有
电影的标题
电影的年份
电影的国家
电影的类型
电影的评分
电影的主题
查看源码可以发现页面标签和源码一样,可以直接提取数据
# -*- encoding:utf-8 -*-
"""
@python: 3.7
@Author: xiaobai_IT_learn
@Time: 2019-11-07 10:00
"""
import csv
import re
import requests
from lxml import etree
import time
# NOTE(review): intended Excel output path, but it is never referenced in this
# file — run() writes './doupanbooktest02.csv' instead. Unused as written.
MOVIE_EXCEL_PATH = 'douban_movie_top250.xlsx'
class DoubanMovieSpider(object):
    """Crawl the Douban Movie Top250 listing and dump one CSV row per movie.

    Fields extracted per movie: title, year, country, category, score,
    and the short quote line ("des"). Pagination follows the "next page"
    link until it disappears.
    """

    def __init__(self):
        # Send a desktop-browser User-Agent; Douban rejects the default one.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/77.0.3865.120 Safari/537.36'
        }

    @staticmethod
    def _parse_info_line(text):
        """Parse a cleaned "year/country/category" info line.

        :param text: the second text node of the movie's description
            paragraph, already stripped and with '\xa0' removed,
            e.g. '1994/美国/犯罪 剧情'.
        :return: (year, country, category) tuple of strings; any field
            whose pattern does not match comes back as '' instead of
            raising IndexError (the original code crashed here on
            malformed lines).
        """
        years = re.findall(r'\d+', text)
        countries = re.findall(r'/(.*?)/', text)
        categories = re.findall(r'/.*?/(.*)', text)
        return (
            years[0] if years else '',
            countries[0] if countries else '',
            categories[0] if categories else '',
        )

    def spider(self, url):
        """Fetch one listing page and extract its movies.

        :param url: absolute URL of a Top250 listing page.
        :return: (next_url, item_list) where next_url is the absolute URL
            of the following page, or False when on the last page;
            item_list is a list of dicts with keys
            title/year/country/category/score/des.
        """
        # timeout keeps a stalled connection from hanging the crawl forever
        resp = requests.get(url, headers=self.headers, timeout=10).content.decode()
        html = etree.HTML(resp)
        li_list = html.xpath('//ol/li')
        item_list = []
        for li in li_list:
            # Default every field to '' so a missing tag cannot crash the
            # run (e.g. some movies — like 绿皮书 — have no quote line).
            item = {
                'title': '',
                'year': '',
                'country': '',
                'category': '',
                'score': '',
                'des': '',
            }
            title = li.xpath(".//span[@class='title']/text()")
            if title:
                item['title'] = title[0]
            info = li.xpath(".//div[@class='bd']/p/text()")
            # The info line lives in the SECOND text node, so guard the
            # length explicitly (the original only checked non-emptiness).
            if len(info) > 1:
                str_ = info[1].strip().replace('\xa0', '')
                item['year'], item['country'], item['category'] = \
                    self._parse_info_line(str_)
            score = li.xpath(".//span[@class='rating_num']/text()")
            if score:
                item['score'] = score[0]
            des = li.xpath(".//p[@class='quote']/span/text()")
            if des:
                item['des'] = des[0]
            item_list.append(item)
        # Build the absolute next-page URL; False stops the crawl loop.
        next_url = False
        next_href = html.xpath(".//span[@class='next']/a/@href")
        if next_href:
            next_url = 'https://movie.douban.com/top250' + next_href[0]
        return next_url, item_list

    def run(self):
        """Crawl every page starting from the front page, writing the CSV.

        Uses a 'with' block so the file is closed (and flushed) even if a
        request raises mid-crawl — the original leaked the handle on error.
        """
        temp_url = 'https://movie.douban.com/top250'
        with open('./doupanbooktest02.csv', 'wt', newline='', encoding='utf-8') as fp:
            writer = csv.writer(fp)
            writer.writerow((
                'title',
                'year',
                'country',
                'category',
                'score',
                'des',
            ))
            while True:
                url, item_list = self.spider(temp_url)
                for item in item_list:
                    print(item)
                    writer.writerow((
                        item['title'],
                        item['year'],
                        item['country'],
                        item['category'],
                        item['score'],
                        item['des'],
                    ))
                temp_url = url
                if not url:
                    break
if __name__ == '__main__':
    # Time the full crawl and report elapsed seconds when it finishes.
    started = time.time()
    DoubanMovieSpider().run()
    print(time.time() - started)
数据截图如下