Scraping China.com Tech News with Python


Target: http://tech.china.com/articles/

The goal is to scrape every news detail page reachable from the paginated news list, capturing the title, body text, publication time, source, and related fields.

Create the project

scrapy startproject China
scrapy genspider -t crawl chinatech tech.china.com
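
(genspider requires both a spider name and a domain.) For orientation, the layout that startproject generates, plus the loaders.py module we add by hand below, looks roughly like this:

China/
├── scrapy.cfg
└── China/
    ├── __init__.py
    ├── items.py
    ├── loaders.py        # added manually below
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        ├── __init__.py
        └── chinatech.py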

items.py

from scrapy import Field, Item


class ChinaItem(Item):
    title = Field()     # news headline
    text = Field()      # article body
    datetime = Field()  # publication time
    source = Field()    # original source of the piece
    url = Field()       # URL of the detail page
    website = Field()   # fixed value: 中华网

chinatech.py

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from China.items import ChinaItem
from China.loaders import ChinaLoader
class ChinatechSpider(CrawlSpider):
    name = 'chinatech'
    allowed_domains = ['tech.china.com']
    start_urls = ['http://tech.china.com/articles/']

    rules = (
        # Follow article links inside the list area and parse each detail page
        Rule(LinkExtractor(allow=r'article\/.*\.html',
                           restrict_xpaths='//div[@id="left_side"]//div[@class="con_item"]'),
             callback='parse_item'),
        # Follow the "下一页" (next page) link to walk every page of the list
        Rule(LinkExtractor(restrict_xpaths='//div[@id="pageStyle"]//a[contains(., "下一页")]'))
    )

    def parse_item(self, response):
        loader = ChinaLoader(item=ChinaItem(), response=response)
        loader.add_xpath('title', '//h1[@id="chan_newsTitle"]/text()')
        loader.add_value('url', response.url)
        loader.add_xpath('text', '//div[@id="chan_newsDetail"]//text()')
        # Pull the timestamp and source out of the info line with regexes
        loader.add_xpath('datetime', '//div[@id="chan_newsInfo"]/text()', re=r'(\d+-\d+-\d+\s\d+:\d+:\d+)')
        loader.add_xpath('source', '//div[@id="chan_newsInfo"]/text()', re=r'来源:(.*)')
        loader.add_value('website', '中华网')
        yield loader.load_item()
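
Before running the full crawl, the selectors can be sanity-checked interactively in scrapy shell (the .get() shortcut needs Scrapy 1.5 or newer):

scrapy shell http://tech.china.com/articles/
>>> response.xpath('//div[@id="left_side"]//div[@class="con_item"]')               # article blocks
>>> response.xpath('//div[@id="pageStyle"]//a[contains(., "下一页")]/@href').get()  # next-page link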

loaders.py (the filename must match the China.loaders import in the spider)

from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst, Join, Compose


class NewsLoader(ItemLoader):
    # For every field, keep only the first extracted value by default
    default_output_processor = TakeFirst()


class ChinaLoader(NewsLoader):
    # Body and source arrive as many text fragments:
    # join them with spaces, then strip surrounding whitespace
    text_out = Compose(Join(), lambda s: s.strip())
    source_out = Compose(Join(), lambda s: s.strip())
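
To see what these processors actually do, here is a quick standalone check (the sample fragments are invented; note that in Scrapy 2.x the same classes live in itemloaders.processors):

from scrapy.loader.processors import TakeFirst, Join, Compose

proc = Compose(Join(), lambda s: s.strip())
# Join() glues the fragments together with spaces, strip() trims the ends
print(proc(['\n  ', '正文第一段', '正文第二段', '  \n']))  # -> '正文第一段 正文第二段'
print(TakeFirst()(['', None, '2019-06-06 10:00:00']))      # -> '2019-06-06 10:00:00'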

pipelines.py

import json


class ChinaPipeline(object):

    def __init__(self):
        # utf-8 so the Chinese text written with ensure_ascii=False survives
        self.file = open("china.json", "w", encoding="utf-8")

    def process_item(self, item, spider):
        # One JSON object per line (JSON Lines); the original trailing ",\n"
        # would have made the file unparseable as JSON
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line)
        return item

    def close_spider(self, spider):
        self.file.close()
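
If you would rather skip the custom pipeline entirely, Scrapy's built-in feed exports can write the same JSON Lines output (remove ChinaPipeline from ITEM_PIPELINES in that case):

scrapy crawl chinatech -o china.jl -s FEED_EXPORT_ENCODING=utf-8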

settings.py

BOT_NAME = 'China'

SPIDER_MODULES = ['China.spiders']
NEWSPIDER_MODULE = 'China.spiders'

ROBOTSTXT_OBEY = False

ITEM_PIPELINES = {
    'China.pipelines.ChinaPipeline': 300,
}
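
With everything in place, run the crawl from the project root (the directory containing scrapy.cfg):

scrapy crawl chinatech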