Scraping Sohu.com Data with CrawlSpider

1. Create the Scrapy project

scrapy startproject sohu
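
This creates the standard Scrapy project skeleton (the exact file list may vary slightly by Scrapy version); the code in the following steps goes into items.py, spiders/sohu_spider.py, pipelines.py, and settings.py:

sohu/
    scrapy.cfg
    sohu/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py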

2. Generate the spider file

scrapy genspider -t crawl sohu_spider sohu.com

3. Define the item fields
import scrapy


class SohuItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()

    # article detail URL
    url = scrapy.Field()

    # title
    title = scrapy.Field()

    # publication date
    date = scrapy.Field()

    # body text
    text = scrapy.Field()

4. The spider
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from ..items import SohuItem
import re


class SohuSpiderSpider(CrawlSpider):
    name = 'sohu_spider'
    # allowed_domains = ['sohu.com']
    start_urls = ['http://sohu.com/']

    rules = (
        # extract article links and keep following links from matched pages (follow=True)
        Rule(LinkExtractor(allow=r'http://www.sohu.com/a/'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        item = SohuItem()
        item['url'] = response.url
        item['title'] = response.xpath('//title/text()').extract_first()
        # the publication date sits in a <meta itemprop="datePublished"> tag
        dates = re.findall(r'<meta itemprop="datePublished" content="(.*?)" />', response.text)
        item['date'] = dates[0] if dates else ''
        item['text'] = response.xpath('//article[@id="mp-editor"]/p/text()').extract()

        yield item
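
To see what the date extraction does, here is a minimal standalone sketch of the re.findall call above, run against a hypothetical fragment of an article page (the tag shape is assumed from the regex itself):

import re

# hypothetical fragment; the real Sohu page is assumed to contain a tag of this shape
html = '<head><meta itemprop="datePublished" content="2019-12-01 10:30" /></head>'

dates = re.findall(r'<meta itemprop="datePublished" content="(.*?)" />', html)
print(dates[0] if dates else 'not found')  # -> 2019-12-01 10:30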

5. Save the data with a pipeline
import json

class SohuPipeline(object):

    def open_spider(self, spider):
        # open the output file once when the spider starts
        self.file = open('sohu.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # write each item as one JSON object per line (JSON Lines)
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item

    def close_spider(self, spider):
        self.file.close()
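
The pipeline writes one JSON object per line (JSON Lines). A minimal sketch for reading the file back, assuming the crawl has already produced sohu.json in the working directory:

import json

with open('sohu.json', encoding='utf-8') as f:
    for line in f:
        item = json.loads(line)
        # each record carries the url / title / date / text fields from SohuItem
        print(item['date'], item['title'])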

6. Configure settings
# do not obey robots.txt
ROBOTSTXT_OBEY = False
# enable the JSON pipeline defined above
ITEM_PIPELINES = {
   'sohu.pipelines.SohuPipeline': 300,
}
# wait 1 second between requests
DOWNLOAD_DELAY = 1
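
Optionally (not part of the original settings), a browser-like User-Agent can be set as well, in case the default Scrapy agent string gets blocked:

USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'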

7. Run the script
from scrapy import cmdline

cmdline.execute('scrapy crawl sohu_spider'.split())
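
The script above is equivalent to running scrapy crawl sohu_spider from the project directory. As an alternative to the custom pipeline, Scrapy's built-in feed export can also dump items straight from the command line (not used in this tutorial; the .jl extension selects JSON Lines output):

scrapy crawl sohu_spider -o items.jl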

8. Results

(Screenshot of the crawled output omitted; each article ends up as one JSON line in sohu.json.)
