Python Scrapy in Practice: Crawling the Jianshu Website

1: Create the project
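This step was shown as a screenshot in the original post. Assuming the project name jianshu_spider, which the import in js.py below relies on, the standard command is:

scrapy startproject jianshu_spider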

2: Create the spider
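Likewise shown as a screenshot originally. The names used throughout this post (spider name js, domain jianshu.com, CrawlSpider base class) suggest it was generated with the crawl template:

cd jianshu_spider
scrapy genspider -t crawl js "jianshu.com"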

3: Write a start.py file to run the crawler:

# -*- coding:utf-8 -*-
# Author: baikai
# Created: 2018/12/14 14:09
# File: start.py
# IDE: PyCharm
from scrapy import cmdline

cmdline.execute("scrapy crawl js".split())

4: Configure the relevant settings in settings.py
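The settings themselves were shown as a screenshot in the original post. A minimal sketch of what this spider needs, assuming typical values: disable robots.txt compliance, send a browser User-Agent, and register the pipeline defined later in this post:

# settings.py (excerpt)
ROBOTSTXT_OBEY = False

DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    # the exact User-Agent string is an assumption; any real browser UA works
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
}

ITEM_PIPELINES = {
    'jianshu_spider.pipelines.JianshuSpiderPipeline': 300,
}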

Crawling the detail-page data

Write the items.py file:

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class ArticleItem(scrapy.Item):
    # define the fields we want to store
    title = scrapy.Field()
    content = scrapy.Field()
    article_id = scrapy.Field()
    origin_url = scrapy.Field()
    author = scrapy.Field()
    avatar = scrapy.Field()
    pub_time = scrapy.Field()

Write js.py:

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from jianshu_spider.items import ArticleItem


class JsSpider(CrawlSpider):
    name = 'js'
    allowed_domains = ['jianshu.com']
    start_urls = ['https://www.jianshu.com/']

    rules = (
        # match article URLs such as https://www.jianshu.com/p/d8804d18d638
        Rule(LinkExtractor(allow=r'.*/p/[0-9a-z]{12}.*'), callback='parse_detail', follow=True),
    )

    def parse_detail(self, response):
        # extract the detail-page data
        title = response.xpath("//h1[@class='title']/text()").get()
        # author avatar
        avatar = response.xpath("//a[@class='avatar']/img/@src").get()
        author = response.xpath("//span[@class='name']/a/text()").get()
        # publication time
        pub_time = response.xpath("//span[@class='publish-time']/text()").get()
        # article id, taken from the detail-page URL,
        # e.g. https://www.jianshu.com/p/d8804d18d638
        url = response.url
        url1 = url.split("?")[0]
        article_id = url1.split("/")[-1]
        # article body
        content = response.xpath("//div[@class='show-content']").get()
        item = ArticleItem(
            title=title,
            avatar=avatar,
            author=author,
            pub_time=pub_time,
            origin_url=response.url,
            article_id=article_id,
            content=content,
        )
        yield item

Design the database and table

Database: jianshu
Table: article
The id column is set to auto-increment.
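The table design was shown as a screenshot in the original post. A minimal sketch of an equivalent schema, inferred from the INSERT statement in the pipeline below (the column types are assumptions):

-- database: jianshu, table: article
CREATE TABLE article (
    id INT PRIMARY KEY AUTO_INCREMENT,
    title VARCHAR(255),
    content LONGTEXT,
    author VARCHAR(255),
    avatar VARCHAR(255),
    pub_time VARCHAR(255),  -- stored as scraped text; DATETIME would require parsing
    origin_url VARCHAR(255),
    article_id VARCHAR(32)
) CHARSET=utf8;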

Store the scraped data in a MySQL database by writing pipelines.py:

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql
from twisted.enterprise import adbapi  # used by the asynchronous variant sketched below
from pymysql import cursors  # used by the asynchronous variant sketched below


class JianshuSpiderPipeline(object):
    def __init__(self):
        dbparams = {
            'host': '127.0.0.1',
            'port': 3306,
            'user': 'root',
            'password': '8Wxx.ypa',
            'database': 'jianshu',
            'charset': 'utf8'
        }
        self.conn = pymysql.connect(**dbparams)
        self.cursor = self.conn.cursor()
        self._sql = None

    def process_item(self, item, spider):
        self.cursor.execute(self.sql, (item['title'], item['content'], item['author'], item['avatar'], item['pub_time'], item['origin_url'], item['article_id']))
        self.conn.commit()
        return item

    @property
    def sql(self):
        if not self._sql:
            self._sql = """
                insert into article(id,title,content,author,avatar,pub_time,origin_url,article_id)
                values(null,%s,%s,%s,%s,%s,%s,%s)
            """
        return self._sql
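The adbapi and cursors imports above are not used by the synchronous pipeline; they belong to an asynchronous variant that hands the inserts to Twisted's database connection pool so that slow MySQL writes do not block the crawl. A minimal sketch of such a pipeline, under the assumption that this is what those imports were for (the class name JianshuTwistedPipeline is hypothetical; register whichever pipeline you actually use in ITEM_PIPELINES):

class JianshuTwistedPipeline(object):
    def __init__(self):
        dbparams = {
            'host': '127.0.0.1',
            'port': 3306,
            'user': 'root',
            'password': '8Wxx.ypa',
            'database': 'jianshu',
            'charset': 'utf8',
            'cursorclass': cursors.DictCursor
        }
        # ConnectionPool runs the blocking pymysql calls in a thread pool
        self.dbpool = adbapi.ConnectionPool('pymysql', **dbparams)
        self._sql = None

    @property
    def sql(self):
        if not self._sql:
            self._sql = """
                insert into article(id,title,content,author,avatar,pub_time,origin_url,article_id)
                values(null,%s,%s,%s,%s,%s,%s,%s)
            """
        return self._sql

    def process_item(self, item, spider):
        # schedule the insert on the pool instead of blocking the reactor
        defer = self.dbpool.runInteraction(self.insert_item, item)
        defer.addErrback(self.handle_error, item, spider)
        return item

    def insert_item(self, cursor, item):
        cursor.execute(self.sql, (item['title'], item['content'], item['author'], item['avatar'], item['pub_time'], item['origin_url'], item['article_id']))

    def handle_error(self, error, item, spider):
        print(error)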

Run start.py to launch the crawler; the scraped articles are written into the article table of the jianshu database (the original post showed the resulting rows as a screenshot).
