Table of Contents
Jianshu full-site crawler
Practice 1: Crawl Jianshu article pages and save them to the database synchronously
Practice 2: Crawl Jianshu article pages and save them to the database asynchronously with Twisted
Practice 3: Integrate selenium + chromedriver into Scrapy, then crawl and save asynchronously to the database
Practice 1: Crawl Jianshu article pages and save them to the database synchronously
Using the Scrapy framework, crawl the articles linked from the Jianshu home page and extract each article's title, content, author, publish time, avatar URL, article id, page URL, and so on.
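The spider below recognises article pages by the 12-character slug after /p/ and derives article_id by splitting the page URL. A minimal standalone sketch of both pieces of logic, using the sample URL quoted in the Rule comment:

import re

# Sample article URL taken from the comment above the Rule in the spider.
url = "https://www.jianshu.com/p/907c9f3d8f5b"

# Same pattern the LinkExtractor rule uses to recognise article pages.
print(bool(re.search(r".*/p/[0-9a-z]{12}.*", url)))  # True

# Same logic parse_detail uses: drop any query string, keep the last path segment.
article_id = url.split("?")[0].split("/")[-1]
print(article_id)  # 907c9f3d8f5b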
Project files:
spiders/js.py
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from jianshu.items import ArticleItem


class JsSpider(CrawlSpider):
    name = 'js'
    allowed_domains = ['jianshu.com']
    start_urls = ['https://www.jianshu.com/']

    rules = (
        # article pages look like https://www.jianshu.com/p/907c9f3d8f5b
        Rule(LinkExtractor(allow=r'.*/p/[0-9a-z]{12}.*'), callback='parse_detail', follow=True),
    )

    def parse_detail(self, response):
        # title, avatar and author are filled in later by JavaScript, so without a
        # JS-capable downloader these XPaths may return None (see Practice 3)
        title = response.xpath("//h1[@class='_1RuRku']/text()").get()
        avatar = response.xpath("//a[@class='_1qp91i _1OhGeD']/img/@src").get()
        author = response.xpath("//section[1]/div/div/div/div/span/a/text()").get()
        pub_time = response.xpath("//div[@class='s-dsoj']/time/text()").get()
        url = response.url
        url1 = url.split("?")[0]
        article_id = url1.split("/")[-1]
        content = response.xpath("//article[@class='_2rhmJa']").get()
        item = ArticleItem(
            title=title,
            avatar=avatar,
            author=author,
            pub_time=pub_time,
            origin_url=url,
            article_id=article_id,
            content=content
        )
        print(item)
        yield item
items.py
import scrapy


class ArticleItem(scrapy.Item):
    title = scrapy.Field()
    content = scrapy.Field()
    article_id = scrapy.Field()
    origin_url = scrapy.Field()
    author = scrapy.Field()
    avatar = scrapy.Field()
    pub_time = scrapy.Field()
pipelines.py
from itemadapter import ItemAdapter
import pymysql


class JianshuPipeline:
    def __init__(self):
        dbparams = {
            'host': "127.0.0.1",
            'port': 3306,
            'user': 'root',
            'password': 'root',
            'database': 'runoob',
            'charset': 'utf8'
        }
        self.conn = pymysql.connect(**dbparams)
        self.curser = self.conn.cursor()
        self._sql = None

    def process_item(self, item, spider):
        self.curser.execute(self.sql, (item['title'], item['content'], item['author'], item['avatar'],
                                       item['pub_time'], item['article_id'], item['origin_url']))
        self.conn.commit()
        return item

    @property
    def sql(self):
        if not self._sql:
            self._sql = """
                insert into article(id, title, content, author, avatar, pub_time, article_id, origin_url)
                values (null, %s, %s, %s, %s, %s, %s, %s)
            """
        return self._sql
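The pipeline opens its connection in __init__ but never closes it. A small optional addition using Scrapy's standard close_spider hook (my own suggestion, not part of the original code):

    # Extra method that could be added to JianshuPipeline: Scrapy calls
    # close_spider once when the spider finishes, so the cursor and the
    # connection can be released there.
    def close_spider(self, spider):
        self.curser.close()
        self.conn.close()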
settings.py
BOT_NAME = 'jianshu'
SPIDER_MODULES = ['jianshu.spiders']
NEWSPIDER_MODULE = 'jianshu.spiders'
# Do not obey robots.txt rules
ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 3
# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36 LBBROWSER",
}
ITEM_PIPELINES = {
    'jianshu.pipelines.JianshuPipeline': 300,
}
start.py
from scrapy import cmdline
cmdline.execute("scrapy crawl js".split())
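An equivalent way to launch the crawl from an IDE is Scrapy's in-process API; this variant is only a sketch and not part of the original start.py:

# Alternative start script using CrawlerProcess instead of cmdline.execute.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl("js")   # spider name, as defined in JsSpider.name
process.start()       # blocks until the crawl finishes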
MySQL table schema
DROP TABLE IF EXISTS `article`;
CREATE TABLE `article` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `title` varchar(255) DEFAULT NULL,
  `content` longtext,
  `author` varchar(255) DEFAULT NULL,
  `avatar` varchar(255) DEFAULT NULL,
  `pub_time` datetime DEFAULT NULL,
  `article_id` varchar(20) DEFAULT NULL,
  `origin_url` varchar(255) DEFAULT NULL,
  `read_count` varchar(255) DEFAULT '0',
  `word_count` varchar(255) DEFAULT '0',
  `like_count` int(11) DEFAULT '0',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
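If the pipeline's INSERT ever fails, a quick way to compare its column list against the live table is to describe the table with the same credentials the pipeline uses. A minimal standalone check, assuming the same local MySQL instance:

import pymysql

# Same connection parameters as the pipeline above (local test credentials).
conn = pymysql.connect(host="127.0.0.1", port=3306, user="root",
                       password="root", database="runoob", charset="utf8")
try:
    with conn.cursor() as cursor:
        cursor.execute("describe article")  # list columns and their types
        for name, col_type, *_ in cursor.fetchall():
            print(name, col_type)
finally:
    conn.close()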
Practice 2: Crawl Jianshu article pages and save them to the database asynchronously with Twisted
Building on Practice 1, replace pipelines.py with the following:
pipelines.py
from pymysql import cursors
from twisted.enterprise import adbapi


# adbapi.ConnectionPool runs the blocking pymysql calls on a thread pool
# instead of on the Twisted reactor thread.
class JianShuTwistedPipeline(object):
    def __init__(self):
        dbparams = {
            'host': "127.0.0.1",
            'port': 3306,
            'user': 'root',
            'password': 'root',
            'database': 'runoob',
            'charset': 'utf8',
            "cursorclass": cursors.DictCursor
        }
        self.dbpool = adbapi.ConnectionPool("pymysql", **dbparams)
        self._sql = None

    @property
    def sql(self):
        if not self._sql:
            self._sql = """
                insert into article(id, title, content, author, avatar, pub_time, article_id, origin_url)
                values (null, %s, %s, %s, %s, %s, %s, %s)
            """
        return self._sql

    def process_item(self, item, spider):
        defer = self.dbpool.runInteraction(self.insert_item, item)
        defer.addErrback(self.handle_error, item, spider)
        return item

    def insert_item(self, cursor, item):
        cursor.execute(self.sql, (item['title'], item['content'], item['author'], item['avatar'],
                                  item['pub_time'], item['article_id'], item['origin_url']))

    def handle_error(self, error, item, spider):
        print("=" * 10 + "error begin" + "=" * 10)
        print(error)
        print("=" * 10 + "error end" + "=" * 10)
settings.py
Change the active pipeline configuration:
ITEM_PIPELINES = {
    'jianshu.pipelines.JianShuTwistedPipeline': 300,
}
Practice 3: Integrate selenium + chromedriver into Scrapy, then crawl and save asynchronously to the database
Define a SeleniumDownloadMiddleware downloader middleware: Selenium's webdriver.Chrome fetches each URL, and the rendered page is wrapped in an HtmlResponse that is handed back to the Scrapy engine. Because process_request returns a Response object, Scrapy skips its built-in downloader and passes that response straight to the spider, so the JavaScript-rendered fields (title, avatar, author) are available.
Project files:
spiders/js.py
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from jianshu.items import ArticleItem


class JsSpider(CrawlSpider):
    name = 'js'
    allowed_domains = ['jianshu.com']
    start_urls = ['https://www.jianshu.com/']

    rules = (
        # article pages look like https://www.jianshu.com/p/907c9f3d8f5b
        Rule(LinkExtractor(allow=r'.*/p/[0-9a-z]{12}.*'), callback='parse_detail', follow=True),
    )

    def parse_detail(self, response):
        # title, avatar and author are rendered by JavaScript; the Selenium
        # middleware below returns the fully rendered page, so these XPaths work
        title = response.xpath("//h1[@class='_1RuRku']/text()").get()
        avatar = response.xpath("//a[@class='_1qp91i _1OhGeD']/img/@src").get()
        author = response.xpath("//section[1]/div/div/div/div/span/a/text()").get()
        pub_time = response.xpath("//div[@class='s-dsoj']/time/text()").get()
        url = response.url
        url1 = url.split("?")[0]
        article_id = url1.split("/")[-1]
        content = response.xpath("//article[@class='_2rhmJa']").get()
        read_count = response.xpath("//div/div[@class='s-dsoj']/span[3]/text()").get()
        word_count = response.xpath("//div/div[@class='s-dsoj']/span[2]/text()").get()
        like_count = response.xpath("//span[@class='_3tCVn5']/span/text()").get()
        item = ArticleItem(
            title=title,
            avatar=avatar,
            author=author,
            pub_time=pub_time,
            origin_url=url,
            article_id=article_id,
            content=content,
            read_count=read_count,
            word_count=word_count,
            like_count=like_count,
        )
        print(item)
        yield item
items.py
import scrapy


class ArticleItem(scrapy.Item):
    title = scrapy.Field()
    content = scrapy.Field()
    article_id = scrapy.Field()
    origin_url = scrapy.Field()
    author = scrapy.Field()
    avatar = scrapy.Field()
    pub_time = scrapy.Field()
    read_count = scrapy.Field()
    word_count = scrapy.Field()
    like_count = scrapy.Field()
middlewares.py
import time

from selenium import webdriver
from scrapy.http.response.html import HtmlResponse


class SeleniumDownloadMiddleware(object):
    def __init__(self):
        # path to the local chromedriver binary
        self.driver = webdriver.Chrome(executable_path=r"D:\chromedriver_win32\chromedriver.exe")

    def process_request(self, request, spider):
        print("SeleniumDownloadMiddleware.process_request called")
        self.driver.get(request.url)  # let Chrome load and render the page
        time.sleep(1)
        try:
            # keep clicking "show more" so that all "included in collections"
            # entries are expanded
            while True:
                show_more = self.driver.find_element_by_class_name("show-more")
                show_more.click()
        except Exception:
            # find_element_by_class_name raises once the button no longer
            # exists, which is how this loop terminates
            pass
        source = self.driver.page_source  # fully rendered HTML
        # wrap the rendered page so the engine treats it as the downloaded response
        response = HtmlResponse(
            url=self.driver.current_url,
            body=source,
            request=request,
            encoding="utf-8"
        )
        return response
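One thing the middleware never does is quit the shared Chrome instance when the crawl ends. A sketch of the usual fix, wiring Scrapy's spider_closed signal through from_crawler (this wiring is my own addition, not part of the original middleware):

from scrapy import signals


class SeleniumDownloadMiddleware(object):
    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy builds the middleware via from_crawler; connect the
        # spider_closed signal so the browser is shut down with the spider.
        middleware = cls()
        crawler.signals.connect(middleware.spider_closed, signal=signals.spider_closed)
        return middleware

    def spider_closed(self, spider):
        self.driver.quit()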
pipelines.py
from pymysql import cursors
from twisted.enterprise import adbapi


# Same asynchronous pipeline as in Practice 2, extended with the
# read_count / word_count / like_count columns.
class JianShuTwistedPipeline(object):
    def __init__(self):
        dbparams = {
            'host': "127.0.0.1",
            'port': 3306,
            'user': 'root',
            'password': 'root',
            'database': 'runoob',
            'charset': 'utf8',
            "cursorclass": cursors.DictCursor
        }
        self.dbpool = adbapi.ConnectionPool("pymysql", **dbparams)
        self._sql = None

    @property
    def sql(self):
        if not self._sql:
            self._sql = """
                insert into article(id, title, content, author, avatar, pub_time, article_id,
                                    origin_url, read_count, word_count, like_count)
                values (null, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            """
        return self._sql

    def process_item(self, item, spider):
        defer = self.dbpool.runInteraction(self.insert_item, item)
        defer.addErrback(self.handle_error, item, spider)
        return item

    def insert_item(self, cursor, item):
        cursor.execute(self.sql, (item['title'], item['content'], item['author'], item['avatar'],
                                  item['pub_time'], item['article_id'], item['origin_url'],
                                  item['read_count'], item['word_count'], item['like_count']))

    def handle_error(self, error, item, spider):
        print("=" * 10 + "error begin" + "=" * 10)
        print(error)
        print("=" * 10 + "error end" + "=" * 10)
settings.py
BOT_NAME = 'jianshu'
SPIDER_MODULES = ['jianshu.spiders']
NEWSPIDER_MODULE = 'jianshu.spiders'
ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 3
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36 LBBROWSER",
}
DOWNLOADER_MIDDLEWARES = {
    'jianshu.middlewares.SeleniumDownloadMiddleware': 543,
}
ITEM_PIPELINES = {
    'jianshu.pipelines.JianShuTwistedPipeline': 300,
}
start.py
from scrapy import cmdline
cmdline.execute("scrapy crawl js".split())