stockscrapy项目中的stock.py
import scrapy
import re
class StockSpider(scrapy.Spider):
name = 'stock'
#allowed_domains = ['http://quote.eastmoney.com']
start_urls = ['http://quote.eastmoney.com/stock_list.html']
def parse(self, response):
try:
for href in response.xpath("//*[@id='quotesearch']/ul[1]/li/a/@href").extract():
url='https://xueqiu.com/S/'+re.search(r'[s][zh]\d{6}',href).group(0)
yield scrapy.Request(url=url,callback=self.parse_stock)
except:
pass
def parse_stock(self,response):
infoDict={}
if response=="":
exit()
try:
name=response.css('.stock-name::text').extract_first()
infoDict.update({'股票名称':name})
for i in response.css('.quote-info td::text').extract():
for j in response.css('.quote-info td span').extract():
info.update({i.__str__():j.__str__()})
yield infoDict
except:
print("error")
pipline.py
import codecs
import csv
class CsvPipeline(object):
def __init__(self, item, spider):
self.file=codecs.open('stock.csv','w',encoding='utf-8')
self.writer=csv.DictWriter(self.f)
def process_item(self,item,spider):
self.writer.writerow(infoDict)
return infoDict
def close(self,spider):
self.f.close()
setting.py
BOT_NAME = 'stockscrapy'
SPIDER_MODULES = ['stockscrapy.spiders']
NEWSPIDER_MODULE = 'stockscrapy.spiders'
HTTPERROR_ALLOWED_CODES = [403]#上面报的是403,就把403加入。
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = ' Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
# Obey robots.txt rules
ROBOTSTXT_OBEY =False
# 管道文件配置,BossPipeline是pipelines.py中的类名
ITEM_PIPELINES = {
'boss.pipelines.BossPipeline': 300,
}
scrapy crawl stock出错
2020-05-22 09:13:49 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
Unhandled error in Deferred:
2020-05-22 09:13:49 [twisted] CRITICAL: Unhandled error in Deferred:
Traceback (most recent call last):
File "c:\users\admin\anaconda3\lib\site-packages\scrapy\crawler.py", line 192, in crawl
return self._crawl(crawler, *args, **kwargs)
File "c:\users\admin\anaconda3\lib\site-packages\scrapy\crawler.py", line 196, in _crawl
d = crawler.crawl(*args, **kwargs)
File "c:\users\admin\anaconda3\lib\site-packages\twisted\internet\defer.py", line 1613, in unwindGenerator
return _cancellableInlineCallbacks(gen)
File "c:\users\admin\anaconda3\lib\site-packages\twisted\internet\defer.py", line 1529, in _cancellableInlineCallbacks
_inlineCallbacks(None, g, status)
--- <exception caught here> ---
File "c:\users\admin\anaconda3\lib\site-packages\twisted\internet\defer.py", line 1418, in _inlineCallbacks
result = g.send(result)
File "c:\users\admin\anaconda3\lib\site-packages\scrapy\crawler.py", line 87, in crawl
self.engine = self._create_engine()
File "c:\users\admin\anaconda3\lib\site-packages\scrapy\crawler.py", line 101, in _create_engine
return ExecutionEngine(self, lambda _: self.stop())
File "c:\users\admin\anaconda3\lib\site-packages\scrapy\core\engine.py", line 70, in __init__
self.scraper = Scraper(crawler)
File "c:\users\admin\anaconda3\lib\site-packages\scrapy\core\scraper.py", line 71, in __init__
self.itemproc = itemproc_cls.from_crawler(crawler)
File "c:\users\admin\anaconda3\lib\site-packages\scrapy\middleware.py", line 53, in from_crawler
return cls.from_settings(crawler.settings, crawler)
File "c:\users\admin\anaconda3\lib\site-packages\scrapy\middleware.py", line 34, in from_settings
mwcls = load_object(clspath)
File "c:\users\admin\anaconda3\lib\site-packages\scrapy\utils\misc.py", line 50, in load_object
mod = import_module(module)
File "c:\users\admin\anaconda3\lib\importlib\__init__.py", line 127, in import_module
return _bootstrap._gcd_import(name[level:], package, level)
File "<frozen importlib._bootstrap>", line 1006, in _gcd_import
File "<frozen importlib._bootstrap>", line 983, in _find_and_load
File "<frozen importlib._bootstrap>", line 953, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 219, in _call_with_frames_removed
File "<frozen importlib._bootstrap>", line 1006, in _gcd_import
File "<frozen importlib._bootstrap>", line 983, in _find_and_load
File "<frozen importlib._bootstrap>", line 965, in _find_and_load_unlocked
builtins.ModuleNotFoundError: No module named 'boss'
尚未解决