I ran into a problem recently: I needed to invoke the Scrapy framework in a loop.
The official docs show a few ways to run crawls from a script, but they only cover running multiple spiders once in the same process, not looping:
https://doc.scrapy.org/en/latest/topics/practices.html#running-multiple-spiders-in-the-same-process
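For reference, the pattern on that page looks roughly like the sketch below (MySpider1 and MySpider2 stand in for your own spider classes): the crawls are chained with inlineCallbacks, and the reactor is stopped once the last one finishes.

from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging

configure_logging()
runner = CrawlerRunner()

@defer.inlineCallbacks
def crawl():
    # run the spiders one after another, then shut the reactor down
    yield runner.crawl(MySpider1)
    yield runner.crawl(MySpider2)
    reactor.stop()

crawl()
reactor.run()  # blocks here until the last crawl finishes

My first attempt, built on that pattern, was the following: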
from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings
import logging
import time
import psycopg2

# log to the console
configure_logging()
# CrawlerRunner reads the project settings from settings.py
runner = CrawlerRunner(get_project_settings())

@defer.inlineCallbacks
def crawl():
    logging.info("new cycle starting")
    yield runner.crawl('name')
    reactor.stop()

def pgsql():
    conn = psycopg2.connect(database="1111", user="1111",
                            password="1111", host="1111", port="1111")
    cursor = conn.cursor()
    cursor.execute("SELECT id, subtidf_id, sp_detail, status, tasksum_id "
                   "FROM public.stask WHERE position({} in subtidf_id) != 0;".format("'netyicc'"))
    rows = cursor.fetchall()
    print(rows)
    for row in rows:
        if row[-2] == 101:
            cursor.execute('1111;'.format(row[0]))  # UPDATE statement redacted
            conn.commit()
            crawl()
            reactor.run()  # blocks until crawl() stops the reactor
            cursor.execute('111111;'.format(row[0]))  # second UPDATE, also redacted
            conn.commit()
    cursor.close()
    conn.close()

if __name__ == '__main__':
    while 1:
        pgsql()
        time.sleep(60)
With this version, the second cycle raised an exception: twisted.internet.error.ReactorNotRestartable.
After three days of searching I couldn't get past it, and eventually learned that reactor.run() can only be called once per process: a Twisted reactor cannot be restarted once it has stopped.
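The restriction is easy to reproduce without Scrapy at all; a minimal sketch:

from twisted.internet import reactor

reactor.callLater(0, reactor.stop)  # stop the reactor as soon as it starts
reactor.run()                       # first run: fine
reactor.run()                       # raises twisted.internet.error.ReactorNotRestartable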
So I worked around it with multiprocessing: each cycle runs in a fresh child process, which gets its own reactor, and that worked.
Here is the working code:
from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings
import logging
import multiprocessing
import time
import psycopg2

# log to the console
configure_logging()
# CrawlerRunner reads the project settings from settings.py
runner = CrawlerRunner(get_project_settings())

@defer.inlineCallbacks
def crawl():
    logging.info("new cycle starting")
    yield runner.crawl('name')
    reactor.stop()

def pgsql():
    conn = psycopg2.connect(database="1111", user="1111",
                            password="1111", host="1111", port="1111")
    cursor = conn.cursor()
    cursor.execute("SELECT id, subtidf_id, sp_detail, status, tasksum_id "
                   "FROM public.stask WHERE position({} in subtidf_id) != 0;".format("'netyicc'"))
    rows = cursor.fetchall()
    print(rows)
    for row in rows:
        if row[-2] == 101:
            cursor.execute('1111;'.format(row[0]))  # UPDATE statement redacted
            conn.commit()
            crawl()
            reactor.run()  # safe: this child process has a fresh reactor
            cursor.execute('111111;'.format(row[0]))  # second UPDATE, also redacted
            conn.commit()
    cursor.close()
    conn.close()

if __name__ == '__main__':
    while 1:
        # run each cycle in its own child process, so reactor.run()
        # is only ever called once per process
        process = multiprocessing.Process(target=pgsql)
        process.start()
        process.join()
        time.sleep(60)
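As a footnote: if you would rather stay in a single process, a common alternative (not what I used above) is to never stop the reactor and instead reschedule the crawl with reactor.callLater. A minimal sketch, assuming the same spider name 'name':

from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings

configure_logging()
runner = CrawlerRunner(get_project_settings())

def crawl():
    # runner.crawl() returns a Deferred that fires when the spider closes
    d = runner.crawl('name')
    d.addCallback(lambda _: reactor.callLater(60, crawl))  # re-run 60 s later

crawl()
reactor.run()  # called exactly once and never stopped, so never restarted

Since the reactor is started once and kept alive, ReactorNotRestartable never comes up, and no extra processes are needed.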