scrapy创建项目
# 创建一个项目
scrapy startproject 项目名称
cd 进入项目目录
# 生成一个爬虫文件
scrapy genspider 爬虫名称 www.baidu.com
# 运行一个爬虫
scrapy crawl 爬虫名称
scrapy.cfg :项目的配置文件
items.py :项目的目标文件
pipelines.py :项目的管道文件
settings.py :项目的设置文件
spiders/ :存储爬虫代码目录
# -*- coding: utf-8 -*-
import scrapy
class BaiduSpider(scrapy.Spider):
    """Minimal example spider targeting baidu.com.

    Shows the three required pieces of a Scrapy spider: a unique
    ``name``, the ``allowed_domains`` filter, and the ``start_urls``
    that seed the first requests.
    """

    # Unique spider identifier, used by `scrapy crawl baidu`.
    name = 'baidu'
    # Off-site requests (outside these domains) are dropped.
    allowed_domains = ['baidu.com']
    # Seed URL(s) for the initial requests.
    start_urls = ['http://baidu.com/']

    def parse(self, response):
        """Default callback for responses to ``start_urls``; not implemented yet."""
        pass
在 settings.py 中把 ROBOTSTXT_OBEY 设置为 False(默认为 True 时会遵守 robots.txt,可能导致请求被过滤无法抓取)
scrapy 提供 2 种选择器规则:一种是 CSS,一种是 XPath
# -*- coding: utf-8 -*-
import scrapy
class JobboleSpider(scrapy.Spider):
name = 'jobbole'
allowed_domains = ['blog.jobbole.com']
start_urls = ['http://python.jobbole.com/89316/']
def parse(self, response):
# 通过 xpath
title = response.xpath("//div[@class='entry-header']/h1/text()").extract()[0]
time = response.xpath("//p[@class='entry-meta-hide-on-mobile']/text()").extract()[0].replace('·','').strip()
zan = response.xpath("//span[contains(@class,'vote-post-up')]/h10/text()").extract()[0]
sc = response.xpath("//div[contains(@class,'post-adds')]//span[2]/text()").extract()[0].replace('收藏','').strip()
ywcc = response.xpath("//div[contains(@class,'copyright-area')]/a/text()").extract()[0]
yw_url = response.xpath("//div[contains(@class,'copyright-area')]/a/@href").extract()[0]
content = response.xpath("//div[contains(@class,'entry')]").extract()[0]
# 通过 css
title_css = response.css(".entry-header h1::text").extract()[0]
time_css = response.css(".entry-meta-hide-on-mobile::text").extract()[0].replace('·','').strip()
zan_css = response.css(".post-adds span h10::text").extract()[0]
sc_css = response.css(".btn-bluet-bigger.href-style.bookm