10. Scrapy project: crawl the homepage http://cuiqingcai.com/ and collect every URL and title

1. Analysis: use CrawlSpider with a Rule/LinkExtractor to pick out the article URLs, and set follow=True so the crawler keeps following links across the whole site:

rules = (
    Rule(LinkExtractor(allow=(r'\d+\.html$',)), callback='parse_all', follow=True),
    # Rule(LinkExtractor(allow=(r'\d+\.html$',)), callback='parse_pachong', follow=True),
)
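
The allow pattern is searched anywhere inside each extracted URL, so r'\d+\.html$' keeps the numeric article pages and ignores listing pages (which are still followed for more links because of follow=True). A quick sanity check of the pattern; the URLs below are only illustrative examples:

import re

pattern = re.compile(r'\d+\.html$')

# An article page ends in "<number>.html", so it matches and gets parsed:
print(bool(pattern.search('http://cuiqingcai.com/1319.html')))           # True
# A listing page does not match; it is only followed for further links:
print(bool(pattern.search('http://cuiqingcai.com/category/technique')))  # False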

2. The spider:

#coding:utf-8
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from ..items import CuiqingcaiItem


class myspider(CrawlSpider):
    name = 'cqc'
    allowed_domains = ['cuiqingcai.com']
    count_all = 0
    url_all = []
    start_urls = ['http://cuiqingcai.com']
    label_tags = [u'爬虫', 'scrapy', 'selenium']

    rules = (
        Rule(LinkExtractor(allow=(r'\d+\.html$',)), callback='parse_all', follow=True),
        # Rule(LinkExtractor(allow=(r'\d+\.html$',)), callback='parse_pachong', follow=True),
    )
    '''
    # Variant: only store articles whose title contains one of label_tags
    def parse_pachong(self, response):
        title_name = response.xpath(
            '//header/h1[1][@class="article-title"]/a/text()').extract_first(default=u'')
        print_tag = any(tag.lower() in title_name.lower() for tag in self.label_tags)
        if print_tag:
            self.count_all += 1
            self.url_all.append(response.url)
            item = CuiqingcaiItem()
            item['url'] = response.url
            item['title'] = title_name
            return item
    '''
    # Store the url/title of every article on the whole site
    def parse_all(self, response):
        title_name = response.xpath(
            '//header/h1[1][@class="article-title"]/a/text()').extract_first()
        item = CuiqingcaiItem()
        item['url'] = response.url
        item['title'] = title_name
        return item
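
Because the spider just returns items, dumping every url/title pair into a JSON file needs no extra code: Scrapy's built-in feed export handles it, e.g. scrapy crawl cqc -o all.json. Writing the same items into MongoDB is what the pipeline below does.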

3. The pipeline:

from pymongo import MongoClient

from . import settings
from .items import CuiqingcaiItem


class CuiqingcaiPipeline(object):
    def __init__(self):
        cn = MongoClient('127.0.0.1', 27017)
        db = cn[settings.Mongodb_DBNAME]
        self.table = db[settings.Mongodb_DBTable]

    def process_item(self, item, spider):
        if isinstance(item, CuiqingcaiItem):
            try:
                self.table.insert_one(dict(item))  # insert_one replaces the deprecated insert
            except Exception as e:
                spider.logger.error('Failed to insert item: %s', e)
        return item
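
The pipeline reads two custom keys from settings.py and has to be enabled in ITEM_PIPELINES. A minimal sketch of the relevant entries; the database name, collection name, and project package name below are assumptions, only the setting keys come from the pipeline code:

# settings.py (excerpt)
Mongodb_DBNAME = 'cuiqingcai'   # MongoDB database to write into (assumed name)
Mongodb_DBTable = 'articles'    # collection holding the url/title documents (assumed name)

ITEM_PIPELINES = {
    'cuiqingcai.pipelines.CuiqingcaiPipeline': 300,  # assumes the project package is called cuiqingcai
}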

4. The item:

import scrapy


class CuiqingcaiItem(scrapy.Item):
    title = scrapy.Field()  # article title
    url = scrapy.Field()    # page URL
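
After running scrapy crawl cqc, a short script can confirm what was stored. This is only a verification sketch: the database and collection names follow the assumed values in the settings sketch above, and count_documents requires pymongo 3.7 or newer:

from pymongo import MongoClient

client = MongoClient('127.0.0.1', 27017)
collection = client['cuiqingcai']['articles']  # assumed names, adjust to your settings.py

print(collection.count_documents({}))          # number of pages saved
for doc in collection.find({}, {'_id': 0}).limit(5):
    print(doc['url'], doc['title'])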
