Problem: the item pipeline does not write data to the file

 
This is the main spider:

# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request
from urllib import parse
from first_app.items import jobbleItiem


class JobboleSpider(scrapy.Spider):
    name = 'jobbole'
    allowed_domains = ['blog.jobbole.com']
    start_urls = ['http://blog.jobbole.com/all-posts/']

    def parse(self, response):
        # one <a> node per post on the list page
        post_nodes = response.css("#archive .floated-thumb .post-thumb a")
        for post_node in post_nodes:
            img_url = post_node.css("img::attr(src)").extract_first("")
            post_url = post_node.css("::attr(href)").extract_first("")
            # follow each post's detail page, passing the cover image along in meta
            yield Request(url=parse.urljoin(response.url, post_url),
                          meta={"front_image_url": img_url},
                          callback=self.parse_datail)
        # follow the next list page, if there is one
        next_url = response.css(".next.page-numbers::attr(href)").extract_first("")
        if next_url:
            yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)

    def parse_datail(self, response):
        aitcle_item = jobbleItiem()
        # CSS selectors for the article fields
        css_title = response.css(".entry-header h1::text").extract()[0]
        css_date = response.css("p.entry-meta-hide-on-mobile::text").extract()[0].strip()
        css_pinglun = response.css("a[href='#article-comment'] span::text").extract()[0]
        css_img = response.meta.get("front_image_url", "")
        css_url = response.url
        aitcle_item["css_title"] = css_title
        aitcle_item["css_date"] = css_date
        aitcle_item["css_pinglun"] = css_pinglun

        yield aitcle_item
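
The debug output further down shows the spider being launched through a main.py script that is not listed in the post. For reference, a typical entry script of this kind looks roughly like the sketch below (the paths are assumptions based on the log, not the author's actual file):

# main.py -- run the jobbole spider from inside an IDE such as PyCharm
import os
import sys

from scrapy.cmdline import execute

# make the Scrapy project importable when the script is started from an IDE
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

# equivalent to running "scrapy crawl jobbole" on the command line
execute(["scrapy", "crawl", "jobbole"])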

This is settings.py:

# -*- coding: utf-8 -*-

# Scrapy settings for first_app project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'first_app'

SPIDER_MODULES = ['first_app.spiders']
NEWSPIDER_MODULE = 'first_app.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'first_app (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'first_app.middlewares.FirstAppSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'first_app.middlewares.FirstAppDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html


ITEM_PIPELINES = {
   # 'ArticleSpider.pipelines.JsonExporterPipleline': 2,
   #  'scrapy.pipelines.images.ImagesPipeline': 1,
   #  'ArticleSpider.pipelines.ArticleImagePipeline': 2,
   #  'ArticleSpider.pipelines.MysqlTwistedPipline': 5,
    'first_app.pipelines.JsonWithEncodeingPipline': 2,
    'first_app.pipelines.FirstAppPipeline':1

}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
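
A note on the ITEM_PIPELINES block above: the values are priorities, and lower numbers run earlier, so FirstAppPipeline (1) handles every item before JsonWithEncodeingPipline (2). A later pipeline only receives an item if each earlier process_item returned it, which FirstAppPipeline does, so the JSON pipeline is getting every item and the problem must be in how it writes the file. Annotated for clarity:

ITEM_PIPELINES = {
    # lower number = runs earlier in the pipeline chain
    'first_app.pipelines.FirstAppPipeline': 1,           # runs first, just returns the item
    'first_app.pipelines.JsonWithEncodeingPipline': 2,   # runs second, should write article.json
}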

This is items.py:

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class FirstAppItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass

class jobbleItiem(scrapy.Item):
    css_title=scrapy.Field()
    css_date=scrapy.Field()
    css_pinglun=scrapy.Field()
    css_url=scrapy.Field()

This is pipelines.py:

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import codecs
import json

class FirstAppPipeline(object):
    def process_item(self, item, spider):
        return item

class JsonWithEncodeingPipline(object):
    # custom export of items to a JSON file
    def __init__(self):
        self.file = codecs.open('article.json', 'w', encoding="utf-8")

    def process_item(self, item, spider):
        lines = json.dumps(dict(item), ensure_ascii=False) + "\n"
        line = "111111111111"
        self.file.write(line)
        # neither line nor lines ever ends up in the file
        print(line)
        return item

    def spider_closed(self, spider):
        self.file.close()
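
The likely cause of the empty file: nothing ever flushes or closes article.json. Scrapy automatically calls a pipeline method named close_spider(self, spider) when the spider finishes, but a method named spider_closed is only invoked if it is explicitly connected to the spider_closed signal, which this code never does. On top of that, both runs below are terminated with SIGKILL (exit code 137), so whatever is sitting in the write buffer is discarded before it reaches disk. A minimal corrected sketch of the pipeline, writing the serialized item and flushing after every write:

# -*- coding: utf-8 -*-
import codecs
import json


class JsonWithEncodeingPipline(object):
    # write each scraped item to article.json as one JSON line
    def __init__(self):
        self.file = codecs.open('article.json', 'w', encoding="utf-8")

    def process_item(self, item, spider):
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line)
        self.file.flush()  # push the data to disk even if the process is killed later
        return item

    def close_spider(self, spider):
        # called automatically by Scrapy when the spider closes
        self.file.close()

With this change article.json should contain one JSON object per scraped item, even if the crawl is stopped part-way through.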

Debug output (run under the PyCharm debugger):

/usr/bin/python3.5 /usr/lib/pycharm-community/helpers/pydev/pydevd.py --multiproc --qt-support --client 127.0.0.1 --port 34159 --file /home/gongjian/PycharmProjects/demo/first_app/main.py
warning: Debugger speedups using cython not found. Run '"/usr/bin/python3.5" "/usr/lib/pycharm-community/helpers/pydev/setup_cython.py" build_ext --inplace' to build.
pydev debugger: process 7439 is connecting

Connected to pydev debugger (build 171.4163.6)
2018-04-20 16:43:05 [scrapy.utils.log] INFO: Scrapy 1.5.0 started (bot: first_app)
2018-04-20 16:43:05 [scrapy.utils.log] INFO: Versions: lxml 3.8.0.0, libxml2 2.9.4, cssselect 1.0.3, parsel 1.4.0, w3lib 1.19.0, Twisted 17.9.0, Python 3.5.4rc1 (default, Jul 25 2017, 08:53:34) - [GCC 6.4.0 20170704], pyOpenSSL 17.5.0 (OpenSSL 1.1.0h  27 Mar 2018), cryptography 2.2.2, Platform Linux-4.9.0-deepin13-amd64-x86_64-with-Deepin-15.5-unstable
2018-04-20 16:43:05 [scrapy.crawler] INFO: Overridden settings: {'NEWSPIDER_MODULE': 'first_app.spiders', 'SPIDER_MODULES': ['first_app.spiders'], 'BOT_NAME': 'first_app'}
2018-04-20 16:43:06 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.logstats.LogStats']
2018-04-20 16:43:07 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
 'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
 'scrapy.downloadermiddlewares.stats.DownloaderStats']
2018-04-20 16:43:07 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
 'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
 'scrapy.spidermiddlewares.referer.RefererMiddleware',
 'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
 'scrapy.spidermiddlewares.depth.DepthMiddleware']
2018-04-20 16:43:07 [scrapy.middleware] INFO: Enabled item pipelines:
['first_app.pipelines.FirstAppPipeline',
 'first_app.pipelines.JsonWithEncodeingPipline']
2018-04-20 16:43:07 [scrapy.core.engine] INFO: Spider opened
2018-04-20 16:43:07 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2018-04-20 16:43:07 [scrapy.extensions.telnet] DEBUG: Telnet console listening on 127.0.0.1:6023
2018-04-20 16:44:07 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2018-04-20 16:45:07 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2018-04-20 16:46:07 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2018-04-20 16:46:07 [scrapy.downloadermiddlewares.retry] DEBUG: Retrying <GET http://blog.jobbole.com/all-posts/> (failed 1 times): User timeout caused connection failure: Getting http://blog.jobbole.com/all-posts/ took longer than 180.0 seconds..
2018-04-20 16:47:07 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2018-04-20 16:48:07 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2018-04-20 16:49:07 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2018-04-20 16:49:07 [scrapy.downloadermiddlewares.retry] DEBUG: Retrying <GET http://blog.jobbole.com/all-posts/> (failed 2 times): User timeout caused connection failure: Getting http://blog.jobbole.com/all-posts/ took longer than 180.0 seconds..
2018-04-20 16:49:13 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://blog.jobbole.com/all-posts/> (referer: None)
2018-04-20 16:49:18 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://blog.jobbole.com/113896/> (referer: http://blog.jobbole.com/all-posts/)
111111111111
2018-04-20 16:53:47 [scrapy.crawler] INFO: Received SIG_SETMASK, shutting down gracefully. Send again to force 

Process finished with exit code 137 (interrupted by signal 9: SIGKILL)

Output of a normal run:
/usr/bin/python3.5 /home/gongjian/PycharmProjects/demo/first_app/main.py
2018-04-20 17:01:47 [scrapy.utils.log] INFO: Scrapy 1.5.0 started (bot: first_app)
2018-04-20 17:01:47 [scrapy.utils.log] INFO: Versions: lxml 3.8.0.0, libxml2 2.9.4, cssselect 1.0.3, parsel 1.4.0, w3lib 1.19.0, Twisted 17.9.0, Python 3.5.4rc1 (default, Jul 25 2017, 08:53:34) - [GCC 6.4.0 20170704], pyOpenSSL 17.5.0 (OpenSSL 1.1.0h  27 Mar 2018), cryptography 2.2.2, Platform Linux-4.9.0-deepin13-amd64-x86_64-with-Deepin-15.5-unstable
2018-04-20 17:01:47 [scrapy.crawler] INFO: Overridden settings: {'NEWSPIDER_MODULE': 'first_app.spiders', 'BOT_NAME': 'first_app', 'SPIDER_MODULES': ['first_app.spiders']}
2018-04-20 17:01:48 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.logstats.LogStats',
 'scrapy.extensions.telnet.TelnetConsole']
2018-04-20 17:01:48 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
 'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
 'scrapy.downloadermiddlewares.stats.DownloaderStats']
2018-04-20 17:01:48 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
 'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
 'scrapy.spidermiddlewares.referer.RefererMiddleware',
 'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
 'scrapy.spidermiddlewares.depth.DepthMiddleware']
2018-04-20 17:01:48 [scrapy.middleware] INFO: Enabled item pipelines:
['first_app.pipelines.FirstAppPipeline',
 'first_app.pipelines.JsonWithEncodeingPipline']
2018-04-20 17:01:48 [scrapy.core.engine] INFO: Spider opened
2018-04-20 17:01:48 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2018-04-20 17:01:48 [scrapy.extensions.telnet] DEBUG: Telnet console listening on 127.0.0.1:6023
2018-04-20 17:01:53 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://blog.jobbole.com/all-posts/> (referer: None)
2018-04-20 17:01:57 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://blog.jobbole.com/113901/> (referer: http://blog.jobbole.com/all-posts/)
2018-04-20 17:01:57 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://blog.jobbole.com/113820/> (referer: http://blog.jobbole.com/all-posts/)
111111111111
2018-04-20 17:01:57 [scrapy.core.scraper] DEBUG: Scraped from <200 http://blog.jobbole.com/113901/>
{'css_date': '2018/04/17 ·', 'css_pinglun': '  评论', 'css_title': '彩票的数学知识'}
2018-04-20 17:01:57 [scrapy.core.scraper] DEBUG: Scraped from <200 http://blog.jobbole.com/113820/>
{'css_date': '2018/04/14 ·', 'css_pinglun': ' 1 评论', 'css_title': '软件复杂性正在杀死我们'}
111111111111
2018-04-20 17:02:02 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://blog.jobbole.com/113879/> (referer: http://blog.jobbole.com/all-posts/)
2018-04-20 17:02:02 [scrapy.core.scraper] DEBUG: Scraped from <200 http://blog.jobbole.com/113879/>
{'css_date': '2018/04/14 ·', 'css_pinglun': '  评论', 'css_title': 'SQL 入门'}
111111111111
2018-04-20 17:02:02 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://blog.jobbole.com/113568/> (referer: http://blog.jobbole.com/all-posts/)
111111111111
2018-04-20 17:02:02 [scrapy.core.scraper] DEBUG: Scraped from <200 http://blog.jobbole.com/113568/>
{'css_date': '2018/04/16 ·',
 'css_pinglun': '  评论',
 'css_title': '最终一轮面试被 Google 刷掉,这是一种什么样的体验?'}
2018-04-20 17:02:07 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://blog.jobbole.com/113875/> (referer: http://blog.jobbole.com/all-posts/)
2018-04-20 17:02:07 [scrapy.core.scraper] DEBUG: Scraped from <200 http://blog.jobbole.com/113875/>
{'css_date': '2018/04/13 ·',
 'css_pinglun': ' 2 评论',
 'css_title': '使用 Graylog 和 Prometheus 监视 Kubernetes 集群'}
111111111111
2018-04-20 17:02:12 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://blog.jobbole.com/113896/> (referer: http://blog.jobbole.com/all-posts/)
111111111111
2018-04-20 17:02:12 [scrapy.core.scraper] DEBUG: Scraped from <200 http://blog.jobbole.com/113896/>
{'css_date': '2018/04/17 ·',
 'css_pinglun': '  评论',
 'css_title': '工程师如何在工作中提升自己?'}
2018-04-20 17:02:18 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://blog.jobbole.com/113858/> (referer: http://blog.jobbole.com/all-posts/)
2018-04-20 17:02:18 [scrapy.core.scraper] DEBUG: Scraped from <200 http://blog.jobbole.com/113858/>
{'css_date': '2018/04/12 ·',
 'css_pinglun': '  评论',
 'css_title': '从 Linux 源码看 socket 的阻塞和非阻塞'}
111111111111
2018-04-20 17:02:23 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://blog.jobbole.com/113854/> (referer: http://blog.jobbole.com/all-posts/)
111111111111
2018-04-20 17:02:23 [scrapy.core.scraper] DEBUG: Scraped from <200 http://blog.jobbole.com/113854/>
{'css_date': '2018/04/13 ·',
 'css_pinglun': ' 1 评论',
 'css_title': '命令行乐趣:嘲讽输错 Bash 命令的用户'}
2018-04-20 17:02:27 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://blog.jobbole.com/113819/> (referer: http://blog.jobbole.com/all-posts/)
111111111111
2018-04-20 17:02:27 [scrapy.core.scraper] DEBUG: Scraped from <200 http://blog.jobbole.com/113819/>
{'css_date': '2018/04/08 ·',
 'css_pinglun': ' 3 评论',
 'css_title': '直白介绍卷积神经网络(CNN)'}
2018-04-20 17:02:39 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://blog.jobbole.com/113821/> (referer: http://blog.jobbole.com/all-posts/)
111111111111
2018-04-20 17:02:39 [scrapy.core.scraper] DEBUG: Scraped from <200 http://blog.jobbole.com/113821/>
{'css_date': '2018/04/13 ·',
 'css_pinglun': ' 1 评论',
 'css_title': '机器学习如何发现你喜欢的音乐'}
2018-04-20 17:02:42 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://blog.jobbole.com/113817/> (referer: http://blog.jobbole.com/all-posts/)
111111111111
2018-04-20 17:02:42 [scrapy.core.scraper] DEBUG: Scraped from <200 http://blog.jobbole.com/113817/>
{'css_date': '2018/04/06 ·',
 'css_pinglun': ' 1 评论',
 'css_title': '给初学者的 type 命令教程'}
2018-04-20 17:02:43 [scrapy.core.downloader.handlers.http11] WARNING: Got data loss in http://blog.jobbole.com/all-posts/page/2/. If you want to process broken responses set the setting DOWNLOAD_FAIL_ON_DATALOSS = False -- This message won't be shown in further requests
2018-04-20 17:02:43 [scrapy.downloadermiddlewares.retry] DEBUG: Retrying <GET http://blog.jobbole.com/all-posts/page/2/> (failed 1 times): [<twisted.python.failure.Failure twisted.internet.error.ConnectionLost: Connection to the other side was lost in a non-clean fashion.>, <twisted.python.failure.Failure twisted.web.http._DataLoss: Chunked decoder in 'BODY' state, still expecting more data to get to 'FINISHED' state.>]
2018-04-20 17:02:47 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://blog.jobbole.com/113828/> (referer: http://blog.jobbole.com/all-posts/)
2018-04-20 17:02:47 [scrapy.core.scraper] DEBUG: Scraped from <200 http://blog.jobbole.com/113828/>
{'css_date': '2018/04/08 ·', 'css_pinglun': '  评论', 'css_title': '计算机语言的巨变'}
111111111111
2018-04-20 17:02:48 [scrapy.extensions.logstats] INFO: Crawled 13 pages (at 13 pages/min), scraped 12 items (at 12 items/min)
2018-04-20 17:02:58 [scrapy.crawler] INFO: Received SIG_SETMASK, shutting down gracefully. Send again to force 
2018-04-20 17:02:58 [scrapy.core.engine] INFO: Closing spider (shutdown)

Process finished with exit code 137 (interrupted by signal 9: SIGKILL)

 

Execution result:

article.json ends up empty.
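
Both runs also end with exit code 137 (SIGKILL, i.e. the run was force-stopped), which throws away any unflushed buffer, so an empty file is consistent with the missing flush/close described above. As an alternative to a hand-written pipeline, Scrapy's built-in feed export can write the items directly, either from the command line with "scrapy crawl jobbole -o article.json" or through settings.py (a sketch using the Scrapy 1.x feed settings):

# settings.py -- let Scrapy's built-in feed export write the output file
FEED_URI = 'article.json'
FEED_FORMAT = 'jsonlines'         # one JSON object per line
FEED_EXPORT_ENCODING = 'utf-8'    # keep Chinese text readable instead of \uXXXX escapes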

 

Reposted from: https://www.cnblogs.com/niansuo/p/8891454.html
