CrawlSpider

Extracting URLs with CrawlSpider

Create a CrawlSpider spider:

scrapy genspider -t crawl baidu baidu.com
The generated spider contains a rules attribute that controls which links are extracted and followed:

rules = (
        Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
    )
LinkExtractor parameters:

allow            regex matched against the URL
restrict_css     CSS selector; URLs are extracted automatically from the matched regions
restrict_xpaths  XPath expression; URLs are extracted automatically from the matched regions
# restrict_xpaths=("//div[@class='a']/li",) extracts the URLs of all <a> tags under those <li> elements and requests them
Rule(LinkExtractor(restrict_xpaths=("//div[@class='a']/li",)), callback='parse_item', follow=True),
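LinkExtractor can also be used on its own to check what a rule would pick up. A minimal sketch, assuming response is the scrapy Response inside a spider callback:

from scrapy.linkextractors import LinkExtractor

# same extractor the Rule above would build
le = LinkExtractor(restrict_xpaths=("//div[@class='a']/li",))
for link in le.extract_links(response):
    # extract_links returns Link objects exposing .url and .text
    print(link.url, link.text)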

 

The generated spider:

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class CfSpider(CrawlSpider):
    name = 'cf'
    allowed_domains = ['circ.gov.cn']
    start_urls = ['http://circ.gov.cn/']
    # Extraction rules; follow=True keeps following links from extracted pages (required to reach next pages)
    rules = (
        Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        item = {}
        #item['domain_id'] = response.xpath('//input[@id="sid"]/@value').get()
        #item['name'] = response.xpath('//div[@id="name"]').get()
        #item['description'] = response.xpath('//div[@id="description"]').get()
        return item
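Note that CrawlSpider implements its own parse method to drive the rules, so callbacks must use a different name (parse_item here); overriding parse would break the crawling logic. Run the generated spider with:

scrapy crawl cf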

Crawling Tencent job postings

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class HrSpider(CrawlSpider):
    name = 'hr'
    allowed_domains = ['tencent.com']
    start_urls = ['https://hr.tencent.com/index.php']
    rules = (
        # position list pages, e.g. https://hr.tencent.com/social.php
        Rule(LinkExtractor(allow=r'https://hr.tencent.com/position\.php'),
             callback='parse_item', follow=True),
        # next-page links ("." and "?" escaped so the regex matches them literally)
        Rule(LinkExtractor(allow=r'https://hr.tencent.com/position\.php\?keywords=&tid=0&start=\d{1}0#a'),
             follow=True),
    )

    def parse_item(self, response):
        # skip the header row and the trailing pagination row
        tr_list = response.xpath("//table[@class='tablelist']/tr")[1:-1]
        for tr in tr_list:
            item = {}  # fresh dict per row so the yielded items stay independent
            item['title'] = tr.xpath("./td[1]/a/text()").extract_first()
            item['position'] = tr.xpath("./td[2]/text()").extract_first()
            item['pub_date'] = tr.xpath("./td[5]/text()").extract_first()
            yield item
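Run the spider and write the collected items to a file using Scrapy's standard output option:

scrapy crawl hr -o positions.json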

 


Reposted from: https://www.cnblogs.com/tangpg/p/10778715.html
