安装插件：XPath Helper（Chrome 扩展）
chrome://flags/#extensions-on-chrome-urls
测试抓取（打印抓取到的页面内容）：
# -*- coding: utf-8 -*-
import scrapy
class DoubanSpiderSpider(scrapy.Spider):
    """Minimal spider that fetches the Douban Top 250 page and prints it.

    This is a smoke test: it only confirms that the page can be downloaded.
    """

    # Spider name.
    name = 'douban_spider'
    allowed_domains = ['movie.douban.com']
    # Entry URL(s); these are handed to the scheduler.
    start_urls = ['https://movie.douban.com/top250']

    def parse(self, response):
        """Print the text of the downloaded response.

        Args:
            response: The downloaded response passed to this callback.
        """
        # Print the fetched content.
        print(response.text)
使用谷歌浏览器（Chrome）开发者工具的 Copy XPath 得到的 XPath：
# -*- coding: utf-8 -*-
import scrapy
class DoubanSpiderSpider(scrapy.Spider):
    """Spider that prints the movie list markup found via a copied XPath."""

    # Spider name.
    name = 'douban_spider'
    allowed_domains = ['movie.douban.com']
    # Entry URL(s); these are handed to the scheduler.
    start_urls = ['https://movie.douban.com/top250']

    def parse(self, response):
        """Print every node matched by the copied XPath expression.

        Args:
            response: The downloaded response passed to this callback.
        """
        # .extract() serialises each matched node to a string, so
        # movie_list is a list of markup strings rather than selectors.
        movie_list = response.xpath('//*[@id="content"]/div/div[1]/ol').extract()
        for i_item in movie_list:
            print(i_item)
不要使用上面那个（Copy XPath 得到的）写法，改用下面的写法：
# -*- coding: utf-8 -*-
import scrapy
from douban.items import DoubanItem
class DoubanSpiderSpider(scrapy.Spider):
    """Spider that reads the serial number of each entry in the movie list."""

    # Spider name.
    name = 'douban_spider'
    allowed_domains = ['movie.douban.com']
    # Entry URL(s); these are handed to the scheduler.
    start_urls = ['https://movie.douban.com/top250']

    def parse(self, response):
        """Print the serial number of every ``//ol/li`` node in the response.

        Args:
            response: The downloaded response passed to this callback.
        """
        movie_list = response.xpath('//ol/li')
        for i_item in movie_list:
            douban_item = DoubanItem()
            # Without .extract_first() the expression evaluates to a list of
            # selectors, e.g. [<Selector xpath='.//em/text()' data='1'>];
            # .extract_first() returns the first match as a plain string.
            douban_item['serial_number'] = i_item.xpath('.//em/text()').extract_first()
            # NOTE(review): the item is only printed, never yielded, so it
            # does not reach any item pipeline -- confirm this is intended.
            print(douban_item['serial_number'])