这是主程序:
1 # -*- coding: utf-8 -*- 2 import scrapy 3 from scrapy.http import Request 4 from urllib import parse 5 from first_app.items import jobbleItiem 6 7 8 9 class JobboleSpider(scrapy.Spider): 10 name = 'jobbole' 11 allowed_domains = ['blog.jobbole.com'] 12 start_urls = ['http://blog.jobbole.com/all-posts/'] 13 14 def parse(self, response): 15 post_nodes=response.css("#archive .floated-thumb .post-thumb a") 16 next_urls=response.css(".next.page-numbers::attr(href)").extract_first("") 17 for post_node in post_nodes: 18 img_url=post_node.css("img::attr(src)").extract_first("") 19 20 #img_url = post_node.css("img::attr(src)").extract_first("") 21 post_url=post_node.css("::attr(href)").extract_first("") 22 #post_url = post_node.css("::attr(href)").extract_first("") 23 yield Request(url=parse.urljoin(response.url,post_url),meta={"front_image_url":img_url},callback=self.parse_datail) 24 next_url1 = response.css(".next.page-numbers::attr(href)").extract_first("") 25 next_url = response.css(".next.page-numbers::attr(href)").extract_first("") 26 if next_url1: 27 yield Request(url=parse.urljoin(response.url, next_url1), callback=self.parse) 28 29 30 def parse_datail(self,response): 31 aitcle_item=jobbleItiem() 32 # css选择器 33 css_title = response.css(".entry-header h1::text").extract()[0] 34 css_date = response.css("p.entry-meta-hide-on-mobile::text").extract()[0].strip() 35 css_pinglun = response.css("a[href='#article-comment'] span::text").extract()[0] 36 css_img=response.meta.get("front_image_url","") 37 css_url=response.url 38 aitcle_item["css_title"]=css_title 39 aitcle_item["css_date"]=css_date 40 aitcle_item["css_pinglun"]=css_pinglun 41 42 yield aitcle_item
这是setting.py
# -*- coding: utf-8 -*- # Scrapy settings for first_app project # # For simplicity, this file contains only settings considered important or # commonly used. You can find more settings consulting the documentation: # # https://doc.scrapy.org/en/latest/topics/settings.html # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html # https://doc.scrapy.org/en/latest/topics/spider-middleware.html BOT_NAME = 'first_app' SPIDER_MODULES = ['first_app.spiders'] NEWSPIDER_MODULE = 'first_app.spiders' # Crawl responsibly by identifying yourself (and your website) on the user-agent #USER_AGENT = 'first_app (+http://www.yourdomain.com)' # Obey robots.txt rules ROBOTSTXT_OBEY = False # Configure maximum concurrent requests performed by Scrapy (default: 16) #CONCURRENT_REQUESTS = 32 # Configure a delay for requests for the same website (default: 0) # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay # See also autothrottle settings and docs #DOWNLOAD_DELAY = 3 # The download delay setting will honor only one of: #CONCURRENT_REQUESTS_PER_DOMAIN = 16 #CONCURRENT_REQUESTS_PER_IP = 16 # Disable cookies (enabled by default) #COOKIES_ENABLED = False # Disable Telnet Console (enabled by default) #TELNETCONSOLE_ENABLED = False # Override the default request headers: #DEFAULT_REQUEST_HEADERS = { # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', # 'Accept-Language': 'en', #} # Enable or disable spider middlewares # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html #SPIDER_MIDDLEWARES = { # 'first_app.middlewares.FirstAppSpiderMiddleware': 543, #} # Enable or disable downloader middlewares # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html #DOWNLOADER_MIDDLEWARES = { # 'first_app.middlewares.FirstAppDownloaderMiddleware': 543, #} # Enable or disable extensions # See https://doc.scrapy.org/en/latest/topics/extensions.html #EXTENSIONS = { # 'scrapy.extensions.telnet.TelnetConsole': None, #} # Configure item pipelines # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html ITEM_PIPELINES = { # 'ArticleSpider.pipelines.JsonExporterPipleline': 2, # 'scrapy.pipelines.images.ImagesPipeline': 1, # 'ArticleSpider.pipelines.ArticleImagePipeline': 2, # 'ArticleSpider.pipelines.MysqlTwistedPipline': 5, 'first_app.pipelines.JsonWithEncodeingPipline': 2, 'first_app.pipelines.FirstAppPipeline':1 } # Enable and configure the AutoThrottle extension (disabled by default) # See https://doc.scrapy.org/en/latest/topics/autothrottle.html #AUTOTHROTTLE_ENABLED = True # The initial download delay #AUTOTHROTTLE_START_DELAY = 5 # The maximum download delay to be set in case of high latencies #AUTOTHROTTLE_MAX_DELAY = 60 # The average number of requests Scrapy should be sending in parallel to # each remote server #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 # Enable showing throttling stats for every response received: #AUTOTHROTTLE_DEBUG = False # Enable and configure HTTP caching (disabled by default) # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings #HTTPCACHE_ENABLED = True #HTTPCACHE_EXPIRATION_SECS = 0 #HTTPCACHE_DIR = 'httpcache' #HTTPCACHE_IGNORE_HTTP_CODES = [] #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
这是item
# -*- coding: utf-8 -*- # Define here the models for your scraped items # # See documentation in: # https://doc.scrapy.org/en/latest/topics/items.html import scrapy class FirstAppItem(scrapy.Item): # define the fields for your item here like: # name = scrapy.Field() pass class jobbleItiem(scrapy.Item): css_title=scrapy.Field() css_date=scrapy.Field() css_pinglun=scrapy.Field() css_url=scrapy.Field()
这是piplines.py
# -*- coding: utf-8 -*- # Define your item pipelines here # # Don't forget to add your pipeline to the ITEM_PIPELINES setting # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html import codecs import json class FirstAppPipeline(object): def process_item(self, item, spider): return item class JsonWithEncodeingPipline(object): #自定义json文件的导出 def __init__(self): self.file = codecs.open('article.json', 'w', encoding="utf-8") def process_item(self, item, spider): lines = json.dumps(dict(item), ensure_ascii=False) + "\n" line="111111111111" self.file.write(line)
#不管是line还是lines都不能写入 print(line) return item def spider_closed(self, spider): self.file.close()
debug信息:
/usr/bin/python3.5 /usr/lib/pycharm-community/helpers/pydev/pydevd.py --multiproc --qt-support --client 127.0.0.1 --port 34159 --file /home/gongjian/PycharmProjects/demo/first_app/main.py warning: Debugger speedups using cython not found. Run '"/usr/bin/python3.5" "/usr/lib/pycharm-community/helpers/pydev/setup_cython.py" build_ext --inplace' to build. pydev debugger: process 7439 is connecting Connected to pydev debugger (build 171.4163.6) 2018-04-20 16:43:05 [scrapy.utils.log] INFO: Scrapy 1.5.0 started (bot: first_app) 2018-04-20 16:43:05 [scrapy.utils.log] INFO: Versions: lxml 3.8.0.0, libxml2 2.9.4, cssselect 1.0.3, parsel 1.4.0, w3lib 1.19.0, Twisted 17.9.0, Python 3.5.4rc1 (default, Jul 25 2017, 08:53:34) - [GCC 6.4.0 20170704], pyOpenSSL 17.5.0 (OpenSSL 1.1.0h 27 Mar 2018), cryptography 2.2.2, Platform Linux-4.9.0-deepin13-amd64-x86_64-with-Deepin-15.5-unstable 2018-04-20 16:43:05 [scrapy.crawler] INFO: Overridden settings: {'NEWSPIDER_MODULE': 'first_app.spiders', 'SPIDER_MODULES': ['first_app.spiders'], 'BOT_NAME': 'first_app'} 2018-04-20 16:43:06 [scrapy.middleware] INFO: Enabled extensions: ['scrapy.extensions.corestats.CoreStats', 'scrapy.extensions.memusage.MemoryUsage', 'scrapy.extensions.telnet.TelnetConsole', 'scrapy.extensions.logstats.LogStats'] 2018-04-20 16:43:07 [scrapy.middleware] INFO: Enabled downloader middlewares: ['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware', 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware', 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware', 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware', 'scrapy.downloadermiddlewares.retry.RetryMiddleware', 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware', 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware', 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware', 'scrapy.downloadermiddlewares.cookies.CookiesMiddleware', 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware', 'scrapy.downloadermiddlewares.stats.DownloaderStats'] 2018-04-20 16:43:07 [scrapy.middleware] INFO: Enabled spider middlewares: ['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware', 'scrapy.spidermiddlewares.offsite.OffsiteMiddleware', 'scrapy.spidermiddlewares.referer.RefererMiddleware', 'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware', 'scrapy.spidermiddlewares.depth.DepthMiddleware'] 2018-04-20 16:43:07 [scrapy.middleware] INFO: Enabled item pipelines: ['first_app.pipelines.FirstAppPipeline', 'first_app.pipelines.JsonWithEncodeingPipline'] 2018-04-20 16:43:07 [scrapy.core.engine] INFO: Spider opened 2018-04-20 16:43:07 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min) 2018-04-20 16:43:07 [scrapy.extensions.telnet] DEBUG: Telnet console listening on 127.0.0.1:6023 2018-04-20 16:44:07 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min) 2018-04-20 16:45:07 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min) 2018-04-20 16:46:07 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min) 2018-04-20 16:46:07 [scrapy.downloadermiddlewares.retry] DEBUG: Retrying <GET http://blog.jobbole.com/all-posts/> (failed 1 times): User timeout caused connection failure: Getting http://blog.jobbole.com/all-posts/ took longer than 180.0 seconds.. 2018-04-20 16:47:07 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min) 2018-04-20 16:48:07 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min) 2018-04-20 16:49:07 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min) 2018-04-20 16:49:07 [scrapy.downloadermiddlewares.retry] DEBUG: Retrying <GET http://blog.jobbole.com/all-posts/> (failed 2 times): User timeout caused connection failure: Getting http://blog.jobbole.com/all-posts/ took longer than 180.0 seconds.. 2018-04-20 16:49:13 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://blog.jobbole.com/all-posts/> (referer: None) 2018-04-20 16:49:18 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://blog.jobbole.com/113896/> (referer: http://blog.jobbole.com/all-posts/) 111111111111 2018-04-20 16:53:47 [scrapy.crawler] INFO: Received SIG_SETMASK, shutting down gracefully. Send again to force Process finished with exit code 137 (interrupted by signal 9: SIGKILL)
正常运行结果 /usr/bin/python3.5 /home/gongjian/PycharmProjects/demo/first_app/main.py 2018-04-20 17:01:47 [scrapy.utils.log] INFO: Scrapy 1.5.0 started (bot: first_app) 2018-04-20 17:01:47 [scrapy.utils.log] INFO: Versions: lxml 3.8.0.0, libxml2 2.9.4, cssselect 1.0.3, parsel 1.4.0, w3lib 1.19.0, Twisted 17.9.0, Python 3.5.4rc1 (default, Jul 25 2017, 08:53:34) - [GCC 6.4.0 20170704], pyOpenSSL 17.5.0 (OpenSSL 1.1.0h 27 Mar 2018), cryptography 2.2.2, Platform Linux-4.9.0-deepin13-amd64-x86_64-with-Deepin-15.5-unstable 2018-04-20 17:01:47 [scrapy.crawler] INFO: Overridden settings: {'NEWSPIDER_MODULE': 'first_app.spiders', 'BOT_NAME': 'first_app', 'SPIDER_MODULES': ['first_app.spiders']} 2018-04-20 17:01:48 [scrapy.middleware] INFO: Enabled extensions: ['scrapy.extensions.memusage.MemoryUsage', 'scrapy.extensions.corestats.CoreStats', 'scrapy.extensions.logstats.LogStats', 'scrapy.extensions.telnet.TelnetConsole'] 2018-04-20 17:01:48 [scrapy.middleware] INFO: Enabled downloader middlewares: ['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware', 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware', 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware', 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware', 'scrapy.downloadermiddlewares.retry.RetryMiddleware', 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware', 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware', 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware', 'scrapy.downloadermiddlewares.cookies.CookiesMiddleware', 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware', 'scrapy.downloadermiddlewares.stats.DownloaderStats'] 2018-04-20 17:01:48 [scrapy.middleware] INFO: Enabled spider middlewares: ['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware', 'scrapy.spidermiddlewares.offsite.OffsiteMiddleware', 'scrapy.spidermiddlewares.referer.RefererMiddleware', 'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware', 'scrapy.spidermiddlewares.depth.DepthMiddleware'] 2018-04-20 17:01:48 [scrapy.middleware] INFO: Enabled item pipelines: ['first_app.pipelines.FirstAppPipeline', 'first_app.pipelines.JsonWithEncodeingPipline'] 2018-04-20 17:01:48 [scrapy.core.engine] INFO: Spider opened 2018-04-20 17:01:48 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min) 2018-04-20 17:01:48 [scrapy.extensions.telnet] DEBUG: Telnet console listening on 127.0.0.1:6023 2018-04-20 17:01:53 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://blog.jobbole.com/all-posts/> (referer: None) 2018-04-20 17:01:57 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://blog.jobbole.com/113901/> (referer: http://blog.jobbole.com/all-posts/) 2018-04-20 17:01:57 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://blog.jobbole.com/113820/> (referer: http://blog.jobbole.com/all-posts/) 111111111111 2018-04-20 17:01:57 [scrapy.core.scraper] DEBUG: Scraped from <200 http://blog.jobbole.com/113901/> {'css_date': '2018/04/17 ·', 'css_pinglun': ' 评论', 'css_title': '彩票的数学知识'} 2018-04-20 17:01:57 [scrapy.core.scraper] DEBUG: Scraped from <200 http://blog.jobbole.com/113820/> {'css_date': '2018/04/14 ·', 'css_pinglun': ' 1 评论', 'css_title': '软件复杂性正在杀死我们'} 111111111111 2018-04-20 17:02:02 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://blog.jobbole.com/113879/> (referer: http://blog.jobbole.com/all-posts/) 2018-04-20 17:02:02 [scrapy.core.scraper] DEBUG: Scraped from <200 http://blog.jobbole.com/113879/> {'css_date': '2018/04/14 ·', 'css_pinglun': ' 评论', 'css_title': 'SQL 入门'} 111111111111 2018-04-20 17:02:02 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://blog.jobbole.com/113568/> (referer: http://blog.jobbole.com/all-posts/) 111111111111 2018-04-20 17:02:02 [scrapy.core.scraper] DEBUG: Scraped from <200 http://blog.jobbole.com/113568/> {'css_date': '2018/04/16 ·', 'css_pinglun': ' 评论', 'css_title': '最终一轮面试被 Google 刷掉,这是一种什么样的体验?'} 2018-04-20 17:02:07 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://blog.jobbole.com/113875/> (referer: http://blog.jobbole.com/all-posts/) 2018-04-20 17:02:07 [scrapy.core.scraper] DEBUG: Scraped from <200 http://blog.jobbole.com/113875/> {'css_date': '2018/04/13 ·', 'css_pinglun': ' 2 评论', 'css_title': '使用 Graylog 和 Prometheus 监视 Kubernetes 集群'} 111111111111 2018-04-20 17:02:12 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://blog.jobbole.com/113896/> (referer: http://blog.jobbole.com/all-posts/) 111111111111 2018-04-20 17:02:12 [scrapy.core.scraper] DEBUG: Scraped from <200 http://blog.jobbole.com/113896/> {'css_date': '2018/04/17 ·', 'css_pinglun': ' 评论', 'css_title': '工程师如何在工作中提升自己?'} 2018-04-20 17:02:18 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://blog.jobbole.com/113858/> (referer: http://blog.jobbole.com/all-posts/) 2018-04-20 17:02:18 [scrapy.core.scraper] DEBUG: Scraped from <200 http://blog.jobbole.com/113858/> {'css_date': '2018/04/12 ·', 'css_pinglun': ' 评论', 'css_title': '从 Linux 源码看 socket 的阻塞和非阻塞'} 111111111111 2018-04-20 17:02:23 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://blog.jobbole.com/113854/> (referer: http://blog.jobbole.com/all-posts/) 111111111111 2018-04-20 17:02:23 [scrapy.core.scraper] DEBUG: Scraped from <200 http://blog.jobbole.com/113854/> {'css_date': '2018/04/13 ·', 'css_pinglun': ' 1 评论', 'css_title': '命令行乐趣:嘲讽输错 Bash 命令的用户'} 2018-04-20 17:02:27 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://blog.jobbole.com/113819/> (referer: http://blog.jobbole.com/all-posts/) 111111111111 2018-04-20 17:02:27 [scrapy.core.scraper] DEBUG: Scraped from <200 http://blog.jobbole.com/113819/> {'css_date': '2018/04/08 ·', 'css_pinglun': ' 3 评论', 'css_title': '直白介绍卷积神经网络(CNN)'} 2018-04-20 17:02:39 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://blog.jobbole.com/113821/> (referer: http://blog.jobbole.com/all-posts/) 111111111111 2018-04-20 17:02:39 [scrapy.core.scraper] DEBUG: Scraped from <200 http://blog.jobbole.com/113821/> {'css_date': '2018/04/13 ·', 'css_pinglun': ' 1 评论', 'css_title': '机器学习如何发现你喜欢的音乐'} 2018-04-20 17:02:42 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://blog.jobbole.com/113817/> (referer: http://blog.jobbole.com/all-posts/) 111111111111 2018-04-20 17:02:42 [scrapy.core.scraper] DEBUG: Scraped from <200 http://blog.jobbole.com/113817/> {'css_date': '2018/04/06 ·', 'css_pinglun': ' 1 评论', 'css_title': '给初学者的 type 命令教程'} 2018-04-20 17:02:43 [scrapy.core.downloader.handlers.http11] WARNING: Got data loss in http://blog.jobbole.com/all-posts/page/2/. If you want to process broken responses set the setting DOWNLOAD_FAIL_ON_DATALOSS = False -- This message won't be shown in further requests 2018-04-20 17:02:43 [scrapy.downloadermiddlewares.retry] DEBUG: Retrying <GET http://blog.jobbole.com/all-posts/page/2/> (failed 1 times): [<twisted.python.failure.Failure twisted.internet.error.ConnectionLost: Connection to the other side was lost in a non-clean fashion.>, <twisted.python.failure.Failure twisted.web.http._DataLoss: Chunked decoder in 'BODY' state, still expecting more data to get to 'FINISHED' state.>] 2018-04-20 17:02:47 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://blog.jobbole.com/113828/> (referer: http://blog.jobbole.com/all-posts/) 2018-04-20 17:02:47 [scrapy.core.scraper] DEBUG: Scraped from <200 http://blog.jobbole.com/113828/> {'css_date': '2018/04/08 ·', 'css_pinglun': ' 评论', 'css_title': '计算机语言的巨变'} 111111111111 2018-04-20 17:02:48 [scrapy.extensions.logstats] INFO: Crawled 13 pages (at 13 pages/min), scraped 12 items (at 12 items/min) 2018-04-20 17:02:58 [scrapy.crawler] INFO: Received SIG_SETMASK, shutting down gracefully. Send again to force 2018-04-20 17:02:58 [scrapy.core.engine] INFO: Closing spider (shutdown) Process finished with exit code 137 (interrupted by signal 9: SIGKILL)
执行结果:
json文件内容为空