Scraping Jianshu
- artical_spider.py
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from jianshu_spider.items import JianshuSpiderItem
class ArticalSpiderSpider(CrawlSpider):
name = 'artical_spider'
allowed_domains = ['www.jianshu.com']
start_urls = ['http://www.jianshu.com/']
rules = (
# The "recommended reading" links on every article page have hrefs of the form /p/<12-char hex id>, so the allow pattern is .*/p/[0-9a-f]{12}
Rule(LinkExtractor(allow=r'.*/p/[0-9a-f]{12}.*'), callback='parse_item', follow=True),
)
def parse_item(self, response):
title = response.xpath('//h1[@class="_1RuRku"]/text()').get()
article_id = response.url.split('?')[0].split('/')[-1]
origin_url = response.url
content = response.xpath('//article[@class="_2rhmJa"]').get()
avatar = response.xpath('//img[@class="_13D2Eh"]/@src').get()
print(avatar)
author = response.xpath('//span[@class="FxYr8x"]/a[@class="_1OhGeD"]/text()').get()
print(author)
some = response.xpath('//div[@class="s-dsoj"]')[0]
print(some)
pub_time = some.xpath('.//time/text()').get()
word_count = some.xpath('.//span[2]/text()').get()
read_count = some.xpath('.//span[3]/text()').get()
like_count_origin = some.xpath('//span[@class="_1LOh_5"]/text()').get()
like_count = like_count_origin.split('人')[0]
subjects_origin = response.xpath('//div[@class="_2Nttfz"]/a[@class="_3s5t0Q _1OhGeD"]/span[@class="_2-Djqu"]/text()').getall()
subjects = ','.join(subjects_origin)
item = JianshuSpiderItem(title=title, article_id=article_id, origin_url=origin_url, content=content,
avatar=avatar, author=author, pub_time=pub_time, word_count=word_count,
read_count=read_count, like_count=like_count, subjects=subjects)
#item['domain_id'] = response.xpath('//input[@id="sid"]/@value').get()
#item['name'] = response.xpath('//div[@id="name"]').get()
#item['description'] = response.xpath('//div[@id="description"]').get()
return item
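The spider is started with `scrapy crawl artical_spider` from the project root. As a quick sanity check of the Rule's allow pattern, the same regex can be tried outside Scrapy; a minimal sketch (the sample URLs below are made up for illustration):
import re

# the same pattern passed to LinkExtractor(allow=...)
article_pattern = re.compile(r'.*/p/[0-9a-f]{12}.*')

samples = [
    'https://www.jianshu.com/p/0123456789ab',                 # article page: matches
    'https://www.jianshu.com/p/0123456789ab?utm_source=rec',  # article with query string: matches
    'https://www.jianshu.com/u/0123456789ab',                 # user profile page: no match
]
for url in samples:
    print(url, '->', bool(article_pattern.match(url)))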
- items.py
import scrapy
class JianshuSpiderItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
title = scrapy.Field()
avatar = scrapy.Field()
author = scrapy.Field()
article_id = scrapy.Field()
pub_time = scrapy.Field()
word_count = scrapy.Field()
read_count = scrapy.Field()
content = scrapy.Field()
subjects = scrapy.Field()
origin_url = scrapy.Field()
like_count = scrapy.Field()
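JianshuSpiderItem behaves like a dict, which is how the pipelines below read its fields; a minimal illustration (the field values are placeholders):
from jianshu_spider.items import JianshuSpiderItem

item = JianshuSpiderItem(title='demo title', article_id='0123456789ab')
print(item['title'])  # 'demo title'
print(dict(item))     # {'title': 'demo title', 'article_id': '0123456789ab'}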
- pipelines.py
import pymysql
from twisted.enterprise import adbapi  # Twisted's asynchronous database API
from pymysql import cursors
class JianshuSpiderPipeline(object):
def __init__(self):
dbparams = {
'host': '150.109.61.206',
'port': 3306,
'user': 'root',
'password': '111111',
'database': 'jianshu',
'charset': 'utf8'
}
# **dbparams unpacks the dict so each key/value pair is passed as a keyword argument
self.conn = pymysql.connect(**dbparams)
self.cursor = self.conn.cursor()
self._sql = None
def process_item(self, item, spider):
# self.cursor.execute(self.sql, (item['title'], item['content'], item['author'], item['avatar'], item['pub_time'],
# item['origin_url'], item['article_id']))
self.cursor.execute(self.sql, (item['title'], item['content'], item['origin_url'], item['article_id']))
self.conn.commit()
return item
@property
def sql(self):
if not self._sql:
# self._sql = """
# insert into article(id, title, content, author, avatar, pub_time, origin_url, article_id) values(null, %s, %s, %s, %s, %s, %s, %s)
# """
self._sql = """
insert into article(id, title, content, origin_url, article_id) values(null, %s, %s, %s, %s)
"""
return self._sql
return self._sql
class JianshuTwistedPipeline(object):
"""
Asynchronous version: writes items to MySQL through Twisted's adbapi connection pool.
"""
def __init__(self):
dbparams = {
'host': '150.109.61.206',
'port': 3306,
'user': 'root',
'password': '111111',
'database': 'jianshu',
'charset': 'utf8',
'cursorclass': cursors.DictCursor  # cursor class used by the pooled connections
}
self.dbpool = adbapi.ConnectionPool('pymysql', **dbparams)
self._sql = None
@property
def sql(self):
if not self._sql:
# self._sql = """
# insert into article(id, title, content, author, avatar, pub_time, origin_url, article_id) values(null, %s, %s, %s, %s, %s, %s, %s)
# """
self._sql = """
insert into article(id, title, content, author, avatar, pub_time, origin_url, article_id, word_count, read_count, like_count, subjects)
values(null, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
"""
return self._sql
return self._sql
def process_item(self, item, spider):
# self.cursor.execute(self.sql, (item['title'], item['content'], item['author'], item['avatar'], item['pub_time'], item['origin_url'], item['article_id']))
# self.conn.commit()
# return item
# runInteraction() runs insert_item() on the connection pool's thread pool, so the insert is asynchronous;
# calling insert_item() directly here would block the crawl (synchronous).
defer = self.dbpool.runInteraction(self.insert_item, item)
defer.addErrback(self.handle_error, item, spider)
return item
def insert_item(self, cursor, item):
# sql is defined with the @property decorator, so it is accessed without parentheses
cursor.execute(self.sql, (item['title'], item['content'], item['author'], item['avatar'], item['pub_time'],
item['origin_url'], item['article_id'], item['word_count'], item['read_count'],
item['like_count'], item['subjects']))
# cursor.execute(self.sql, (item['title'], item['content'], item['origin_url'], item['article_id']))
def handle_error(self, error, item, spider):
print('='*10+'error'+'='*10)
print(error)
print('='*10+'error'+'='*10)
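As the comment in __init__ notes, pymysql.connect(**dbparams) unpacks the dict into keyword arguments; the sketch below shows the two equivalent forms with the same connection parameters as above. Note also that the column list in sql has to match the number of values handed to cursor.execute(); a mismatch surfaces as a MySQL error in handle_error().
import pymysql

dbparams = {
    'host': '150.109.61.206',
    'port': 3306,
    'user': 'root',
    'password': '111111',
    'database': 'jianshu',
    'charset': 'utf8'
}

# the dict-unpacking form used in the pipelines ...
conn_a = pymysql.connect(**dbparams)
# ... is equivalent to spelling out every keyword argument
conn_b = pymysql.connect(host='150.109.61.206', port=3306, user='root',
                         password='111111', database='jianshu', charset='utf8')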
- settings.py
import random
import os
BOT_NAME = 'jianshu_spider'
SPIDER_MODULES = ['jianshu_spider.spiders']
NEWSPIDER_MODULE = 'jianshu_spider.spiders'
ROBOTSTXT_OBEY = True
DOWNLOAD_DELAY = random.randint(1, 3)  # evaluated once at import time, so the delay is fixed for the whole crawl (see the note below)
DEFAULT_REQUEST_HEADERS = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en',
'User-Agent': 'ELinks/0.13.GIT (textmode; Linux 2.6.24-1-686 i686; 175x65-2)'
}
SPIDER_MIDDLEWARES = {
'jianshu_spider.middlewares.JianshuSpiderSpiderMiddleware': 542,
# 'jianshu_spider.middlewares.SeleniumDownloadMiddleWare': 544
}
DOWNLOADER_MIDDLEWARES = {
'jianshu_spider.middlewares.JianshuSpiderDownloaderMiddleware': 543,
}
ITEM_PIPELINES = {
'jianshu_spider.pipelines.JianshuTwistedPipeline': 300,
# 'jianshu_spider.pipelines.JianshuSpiderPipeline': 300,
}
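Because random.randint(1, 3) runs only once when settings.py is imported, the delay is a single fixed value for the whole crawl. Scrapy can randomize the wait itself: with RANDOMIZE_DOWNLOAD_DELAY enabled (it is on by default), every delay is multiplied by a random factor between 0.5 and 1.5, so a plain constant is usually enough. A sketch of that alternative:
DOWNLOAD_DELAY = 2                # base delay in seconds
RANDOMIZE_DOWNLOAD_DELAY = True   # Scrapy default: actual wait is between 0.5 * 2 and 1.5 * 2 seconds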
- middlewares.py
from scrapy import signals
class JianshuSpiderSpiderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, dict or Item objects.
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Request, dict
# or Item objects.
pass
def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn’t have a response associated.
# Must return only requests (not items).
for r in start_requests:
yield r
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
import random
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from scrapy import signals
from scrapy.http.response.html import HtmlResponse
class JianshuSpiderDownloaderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.
USER_AGENTS = [
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18362',
'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; Tablet PC 2.0; .NET4.0E)',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; InfoPath.3)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E) QQBrowser/6.9.11079.201',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E)',
'Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; Acoo Browser 1.98.744; .NET CLR 3.5.30729)',
'Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; Acoo Browser 1.98.744; .NET CLR 3.5.30729)',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; Acoo Browser; GTB5; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; InfoPath.1; .NET CLR 3.5.30729; .NET CLR 3.0.30618)',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; SV1; Acoo Browser; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; Avant Browser)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; GTB5; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; Maxthon; InfoPath.1; .NET CLR 3.5.30729; .NET CLR 3.0.30618)',
'Mozilla/4.0 (compatible; Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; Acoo Browser 1.98.744; .NET CLR 3.5.30729); Windows NT 5.1; Trident/4.0)',
'Mozilla/4.0 (compatible; Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; GTB6; Acoo Browser; .NET CLR 1.1.4322; .NET CLR 2.0.50727); Windows NT 5.1; Trident/4.0; Maxthon; .NET CLR 2.0.50727; .NET CLR 1.1.4322; InfoPath.2)',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; Acoo Browser; GTB6; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; InfoPath.1; .NET CLR 3.5.30729; .NET CLR 3.0.30618)',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; Acoo Browser; GTB5; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; InfoPath.1; .NET CLR 3.5.30729; .NET CLR 3.0.30618)',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; GTB6; Acoo Browser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Trident/4.0; Acoo Browser; GTB5; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; InfoPath.1; .NET CLR 3.5.30729; .NET CLR 3.0.30618)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; GTB5; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; GTB5; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; InfoPath.1; .NET CLR 3.5.30729; .NET CLR 3.0.30618)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Acoo Browser; InfoPath.2; .NET CLR 2.0.50727; Alexa Toolbar)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Acoo Browser; .NET CLR 2.0.50727; .NET CLR 1.1.4322)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Acoo Browser; .NET CLR 1.0.3705; .NET CLR 1.1.4322; .NET CLR 2.0.50727; FDM; .NET CLR 3.0.04506.30; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022; InfoPath.2)',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; Acoo Browser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
'Mozilla/4.0 (compatible; MSIE 6.0; America Online Browser 1.1; Windows NT 5.1)',
'Mozilla/4.0 (compatible; MSIE 6.0; America Online Browser 1.1; Windows NT 5.0)',
'Mozilla/4.0 (compatible; MSIE 6.0; America Online Browser 1.1; Windows 98)',
'Mozilla/4.0 (compatible; MSIE 7.0; AOL 8.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
'Mozilla/4.0 (compatible; MSIE 7.0; AOL 8.0; Windows NT 5.1; .NET CLR 1.0.3705; .NET CLR 1.1.4322)',
'Mozilla/4.0 (compatible; MSIE 7.0; AOL 8.0; Windows NT 5.1; .NET CLR 1.0.3705)',
'Mozilla/4.0 (compatible; MSIE 7.0; AOL 8.0; Windows NT 5.1)',
'Mozilla/4.0 (compatible; MSIE 6.0; AOL 7.0; Windows NT 5.1; SV1; .NET CLR 1.0.3705)',
'Mozilla/4.0 (compatible; MSIE 6.0; AOL 7.0; Windows NT 5.1; SV1)',
'Mozilla/4.0 (compatible; MSIE 6.0; AOL 7.0; Windows NT 5.1; Q312461; YComp 5.0.0.0)',
'Mozilla/4.0 (compatible; MSIE 6.0; AOL 7.0; Windows NT 5.1; Q312461)',
'Mozilla/4.0 (compatible; MSIE 6.0; AOL 7.0; Windows NT 5.1; Hotbar 4.2.8.0)',
'Mozilla/4.0 (compatible; MSIE 6.0; AOL 7.0; Windows NT 5.1; Hotbar 4.1.7.0)',
'Mozilla/4.0 (compatible; MSIE 6.0; AOL 7.0; Windows NT 5.1; .NET CLR 1.0.3705)',
'Mozilla/5.0 (X11; U; OpenBSD ppc; en-US; rv:1.8.1.9) Gecko/20070223 BonEcho/2.0.0.9',
'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.8.1.9) Gecko/20071103 BonEcho/2.0.0.9',
'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.9) Gecko/20071113 BonEcho/2.0.0.9',
'Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; en; rv:1.8.1.12) Gecko/20080206 Camino/1.5.5',
'Mozilla/5.0 (Macintosh; U; Intel Mac OS X Mach-O; en; rv:1.8.1.12) Gecko/20080206 Camino/1.5.5',
'Mozilla/5.0 (Macintosh; U; PPC Mac OS X; pl-PL; rv:1.0.1) Gecko/20021111 Chimera/0.6',
'Mozilla/5.0 (Macintosh; U; PPC Mac OS X; pl-PL; rv:1.0.1) Gecko/20021111 Chimera/0.6',
'Mozilla/5.0 (Macintosh; U; PPC Mac OS X; en-US; rv:1.0.1) Gecko/20021111 Chimera/0.6',
'Mozilla/5.0 (Macintosh; U; PPC Mac OS X; en-US; rv:1.0.1) Gecko/20021104 Chimera/0.6',
'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; Crazy Browser 3.0.5) ; .NET CLR 3.0.04506.30; InfoPath.2; InfoPath.3; .NET CLR 1.1.4322; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; InfoPath.2; Crazy Browser 3.0.5)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; Crazy Browser 2.0.1)',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; InfoPath.1; Crazy Browser 2.0.1)',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; Crazy Browser 2.0.1)',
'ELinks/0.13.GIT (textmode; Linux 2.6.29 i686; 119x51-2)',
'ELinks/0.13.GIT (textmode; Linux 2.6.27-rc6.git i686; 175x65-3)',
'ELinks/0.13.GIT (textmode; Linux 2.6.26-rc7.1 i686; 119x68-3)',
'ELinks/0.13.GIT (textmode; Linux 2.6.24-1-686 i686; 175x65-2)'
]
def __init__(self):
self.driver = webdriver.Chrome(executable_path=r'D:\downloads\chromedriver_win32\chromedriver.exe')
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_request(self, request, spider):
user_agent = random.choice(self.USER_AGENTS)
request.headers['User-Agent'] = user_agent
# driver = webdriver.Chrome(executable_path=r'D:\downloads\chromedriver_win32\chromedriver.exe')
# while True:
print(request.url)
self.driver.get(request.url)
# time.sleep(2)
# print(driver.current_url)
# print(driver.page_source)
# avatar_tag = WebDriverWait(driver, 10).until(
# EC.presence_of_all_elements_located((By.XPATH, '//img[@class="_13D2Eh"'))
# )
# if avatar_tag == True:
source = self.driver.page_source
print(self.driver.current_url)
# print(source)
response = HtmlResponse(url=self.driver.current_url, body=source, request=request, encoding='utf8')
print(response)
# try:
# while True:
# show_more = self.driver.find_element(By.CLASS_NAME, 'H7E3vT')
# show_more.click()
# time.sleep(1)
# if not show_more:
# break
#
# except:
# pass
return response
def process_response(self, request, response, spider):
# Called with the response returned from the downloader.
# Must either;
# - return a Response object
# - return a Request object
# - or raise IgnoreRequest
return response
def process_exception(self, request, exception, spider):
# Called when a download handler or a process_request()
# (from other downloader middleware) raises an exception.
# Must either:
# - return None: continue processing this exception
# - return a Response object: stops process_exception() chain
# - return a Request object: stops process_exception() chain
pass
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
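Returning an HtmlResponse from process_request() makes Scrapy skip its own downloader for that request, so the spider parses the Selenium-rendered page instead. The commented-out WebDriverWait block above is also missing the closing ] in its XPath; a working version of that guard might look like the sketch below (the _13D2Eh class name is the one used in the spider and will break whenever Jianshu rotates its obfuscated class names):
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

def wait_for_article(driver, timeout=10):
    # block until the author avatar is present, i.e. the article page has finished rendering
    WebDriverWait(driver, timeout).until(
        EC.presence_of_all_elements_located((By.XPATH, '//img[@class="_13D2Eh"]'))
    )

# inside process_request(), after self.driver.get(request.url):
#     wait_for_article(self.driver)
#     source = self.driver.page_source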
MySQL table setup
- Database: jianshu
- Table: article
Column | Type | Length | Decimals | Not null | Auto increment |
---|---|---|---|---|---|
id | int | | | 1 | 1 |
title | varchar | 255 | | | |
content | longtext | | | | |
author | varchar | 255 | | | |
avatar | varchar | 255 | | | |
pub_time | datetime | | | | |
article_id | varchar | 20 | | | |
origin_url | varchar | 255 | | | |
read_count | int | 11 | | | |
like_count | int | 11 | | | |
word_count | int | 11 | | | |
subjects | text | | | | |

Note: a 1 in the "Not null" or "Auto increment" column means that option is checked; an empty cell means it is left unchecked.
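The same table can also be created from code instead of a GUI client. The sketch below mirrors the columns above with pymysql, assuming id is the primary key (MySQL requires a key on an AUTO_INCREMENT column); the connection parameters are the ones from pipelines.py:
import pymysql

CREATE_ARTICLE_TABLE = """
CREATE TABLE IF NOT EXISTS article (
    id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
    title VARCHAR(255),
    content LONGTEXT,
    author VARCHAR(255),
    avatar VARCHAR(255),
    pub_time DATETIME,
    article_id VARCHAR(20),
    origin_url VARCHAR(255),
    read_count INT,
    like_count INT,
    word_count INT,
    subjects TEXT
) DEFAULT CHARSET=utf8
"""

conn = pymysql.connect(host='150.109.61.206', port=3306, user='root',
                       password='111111', database='jianshu', charset='utf8')
with conn.cursor() as cursor:
    cursor.execute(CREATE_ARTICLE_TABLE)
conn.commit()
conn.close()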