Good stories live on Jianshu: let Python (Scrapy + Selenium) crawl them for you

"Make-it-up-hu: share a story you just made up" ε=ε=ε=(~ ̄▽ ̄)~

It's only a joke, but some answers on good old Zhihu really do feel a bit "made up". Since it's all storytelling anyway, we might as well go harvest some stories from Jianshu instead. (Personally I think the stories over there are better than ours (→_→))

Let's take a look at how Jianshu's home page is laid out.

There is a button that loads more content dynamically, and Scrapy alone cannot render dynamically loaded pages. So what now?

We'll just have to ask Selenium to lend a hand ( $ _ $ ).

Selenium can keep simulating clicks on that button until no "load more" button is left.

Once everything has been expanded to the bottom, a generic CrawlSpider can follow the article links and scrape them one by one.

Each li tag on the page is one article.

Inside each article we only grab a few representative fields, as the spider below shows.

OK, on to the code. (This was my first time using a Selenium download middleware, so I searched around and found an article to build on. The code below overlaps heavily with that of the CSDN author s_kangkang_A; apologies to the author for borrowing it. If you'd like to learn from this expert, head over to https://blog.csdn.net/s_kangkang_A)
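The snippets below all live in a standard Scrapy project. The original post doesn't show the file tree, so the layout here is my assumption, inferred from imports such as `jianshu.items` and the `jianshu.middlewares.*` paths registered in the settings:

jianshu/
├── scrapy.cfg
└── jianshu/
    ├── items.py        # JianshuItem
    ├── pipelines.py    # JianshuPipeline
    ├── middlewares.py  # SeleniumDownloadMiddleware, RandomUserAgentMiddleware
    ├── settings.py     # middleware/pipeline registration and MySQL constants
    └── spiders/
        └── js.py       # JsSpider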

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from jianshu.items import JianshuItem


class JsSpider(CrawlSpider):
    name = 'js'
    allowed_domains = ['www.jianshu.com']
    start_urls = ['https://www.jianshu.com/']

    # Article links (/p/ followed by a 12-character id) are extracted from the home
    # page and from the recommendations on each article page, handed to parse_detail,
    # and followed further.
    rules = (
        Rule(LinkExtractor(allow=r'.*/p/[0-9a-z]{12}.*'),
             callback='parse_detail', follow=True),
    )

    def parse_detail(self, response):
        item = JianshuItem()
        item["title"] = response.xpath("//h1[@class='title']/text()").extract_first()
        item["author"] = response.xpath("//span[@class='name']/a/text()").extract_first()
        item["url"] = response.url
        # item["content"] = "".join(response.xpath("//div[@class='show-content-free']/p/text()").extract()).strip()
        # tags comes back as a list; join the elements into a single space-separated string
        item["tags"] = " ".join(response.css("div .name::text").extract()).strip()
        yield item

from jianshu.settings import *
import pymysql
import logging


class JianshuPipeline(object):
    def __init__(self):
        self.connect = pymysql.connect(host=MYSQL_HOST, user=MYSQL_ROOT,
                                       password=MYSQL_PASSWORD, database=MYSQL_DATABASE)
        self.cursor = self.connect.cursor()
        self.cursor.execute(USE)     # select the database
        self.cursor.execute(DROP)    # drop the old table if it exists
        self.cursor.execute(CREATE)  # recreate it

    def process_item(self, item, spider):
        try:
            self.cursor.execute(SAVEIN, (item["title"], item["author"], item["url"], item["tags"]))
            self.connect.commit()
        except Exception as error:
            logging.error(error)
        return item

    def close_spider(self, spider):
        self.connect.close()

from scrapy import signals
from selenium import webdriver
import time
from scrapy.http.response.html import HtmlResponse
import random
import logging


class SeleniumDownloadMiddleware(object):
    def __init__(self):
        self.driver = webdriver.Chrome()

    def process_request(self, request, spider):
        self.driver.get(request.url)
        time.sleep(2)
        # An article can belong to several collections; keep clicking "load more"
        # until the button disappears. When it is gone, find_element raises
        # NoSuchElementException and the except clause ends the loop.
        try:
            while True:
                tags_btn = self.driver.find_element_by_class_name('load-more')
                tags_btn.click()
                time.sleep(2)
        except Exception:
            pass
        source = self.driver.page_source
        response = HtmlResponse(url=self.driver.current_url, body=source,
                                request=request, encoding='utf-8')
        return response
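One caveat: `find_element_by_class_name` was deprecated and later removed in Selenium 4, so the loop above only runs as-is on older Selenium versions. With a current Selenium the same click-until-gone loop would use the `By` locator; a minimal sketch (the `load-more` class name is taken from the code above):

from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException

# Inside process_request, the expansion loop becomes:
try:
    while True:
        tags_btn = self.driver.find_element(By.CLASS_NAME, 'load-more')
        tags_btn.click()
        time.sleep(2)
except NoSuchElementException:
    # No "load more" button left: the page is fully expanded.
    pass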

class RandomUserAgentMiddleware():
    def __init__(self):
        self.user_agents = [
            "Mozilla/5.0(Windows;U;MSIE 9.0;Windows NT 9.0; en-US)",
            "Mozilla/5.0(Windows NT 6.1)AppleWebKit/537.2(KHTML,like Gecko)Chrome/22.0.1216.0 Safari/537.2",
            "Mozilla/5.0(X11;Ubuntu;Linux i686;rv:15.0)Gecko/20100101 Firefox/15.0.1",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
            "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
            "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
            "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
            "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
            "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
        ]

    def process_request(self, request, spider):
        request.headers["User-Agent"] = random.choice(self.user_agents)

DOWNLOADER_MIDDLEWARES = {
    'jianshu.middlewares.RandomUserAgentMiddleware': 544,
    'jianshu.middlewares.SeleniumDownloadMiddleware': 545,
}

ITEM_PIPELINES = {
    'jianshu.pipelines.JianshuPipeline': 300,
}

MYSQL_HOST = 'localhost'
MYSQL_DATABASE = 'spider'
MYSQL_ROOT = 'root'
MYSQL_PASSWORD = '123'

USE = 'use spider'
TABLE = 'jianshu'
DROP = "drop table if exists %s" % TABLE
CREATE = 'create table %s(title varchar(255) NOT NULL, author varchar(255), url varchar(255), tags varchar(255))' % TABLE
SAVEIN = 'insert into ' + TABLE + ' (title,author,url,tags) values(%s,%s,%s,%s)'
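Note that these settings assume the `spider` database already exists; the pipeline only drops and recreates the table inside it. On a fresh MySQL install you'd create the database once beforehand, for example with a small one-off script (my assumption: credentials match MYSQL_ROOT / MYSQL_PASSWORD above):

import pymysql

# One-off setup: create the database that JianshuPipeline expects.
conn = pymysql.connect(host='localhost', user='root', password='123')
with conn.cursor() as cur:
    cur.execute("CREATE DATABASE IF NOT EXISTS spider DEFAULT CHARACTER SET utf8mb4")
conn.close()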

import scrapy


class JianshuItem(scrapy.Item):
    title = scrapy.Field()
    author = scrapy.Field()
    url = scrapy.Field()
    content = scrapy.Field()
    tags = scrapy.Field()
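With the item, spider, pipeline, middlewares and settings in place, the crawl starts the usual Scrapy way from the project root (the spider name `js` comes from the spider class above):

scrapy crawl js

Scraped items go into MySQL through the pipeline; if you also want a local copy, Scrapy's built-in feed export (`scrapy crawl js -o stories.jl`) works alongside it.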

Here's a look at the resulting database table (screenshot in the original post).
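Since the screenshot doesn't reproduce here, the quickest way to check the result is to query the `jianshu` table directly; a minimal sketch, assuming the same connection settings as above:

import pymysql

conn = pymysql.connect(host='localhost', user='root', password='123', database='spider')
with conn.cursor() as cur:
    cur.execute("SELECT title, author, tags FROM jianshu LIMIT 10")
    for title, author, tags in cur.fetchall():
        print(title, '|', author, '|', tags)
conn.close()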

That's it for now. Finals are coming up, so to avoid failing anything I'll be camping out in the library for a while; I hope to have another project to share over the summer break \( ̄︶ ̄*\))
