在 Pipeline 中编写通用方法,模板如下:
def get_kafka_client():
    """Create a Kafka client for the brokers listed in ``KAFKA_HOSTS``.

    Returns:
        A new ``KafkaClient`` built from the comma-joined ``KAFKA_HOSTS``
        with ``broker_version="1.1.0"``.
    """
    broker_hosts = ','.join(KAFKA_HOSTS)
    return KafkaClient(hosts=broker_hosts, broker_version="1.1.0")
def get_connect():
    """Open a new MySQL connection from the ``MYSQLDB`` settings mapping.

    Returns:
        The connection object returned by ``pymysql.Connect``.
    """
    # Note the settings key is 'password' while the driver argument is 'passwd'.
    connection_args = {
        'host': MYSQLDB['host'],
        'port': MYSQLDB['port'],
        'user': MYSQLDB['user'],
        'passwd': MYSQLDB['password'],
        'db': MYSQLDB['db'],
        'charset': MYSQLDB['charset'],
    }
    return pymysql.Connect(**connection_args)
class BasePipeline(object):
    """Base pipeline that writes batches of rows to MySQL.

    The instance owns one connection (``self.connect``) and one cursor
    (``self.cursor``) obtained from ``get_connect()``. Writes are
    best-effort: a failed batch is retried once on a fresh connection and,
    if it fails again, the error is logged and the batch is dropped.
    """

    def __init__(self):
        """Open the MySQL connection and prepare the SQL templates."""
        self.connect = get_connect()
        self.cursor = self.connect.cursor()
        # Timestamp captured once, when the pipeline is constructed.
        self.fetch_date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        # Placeholders, in order: table name, column list, "%s" list
        # (and, for the upsert template, the "col=values(col)" list).
        self.replace_sql_tmpl = 'REPLACE into {} ({}) values ({})'
        self.update_sql_tmp = 'INSERT INTO {} ({}) VALUES ({}) ON DUPLICATE KEY UPDATE {}'

    def replace_into(self, table_name, keys, datalist):
        """Write ``datalist`` into ``table_name`` with ``REPLACE into``.

        Args:
            table_name: Target table name, interpolated into the SQL text.
            keys: Column names; one ``%s`` placeholder is emitted per key.
            datalist: Sequence of row tuples ordered like ``keys``.
        """
        placeholders = ','.join(['%s'] * len(keys))
        replace_sql = self.replace_sql_tmpl.format(
            table_name, ','.join(keys), placeholders)
        self.save_mysql(replace_sql, datalist)

    def update_into(self, table_name, keys, datalist):
        """Upsert ``datalist`` with ``INSERT ... ON DUPLICATE KEY UPDATE``.

        Every column in ``keys`` is overwritten with the incoming value
        when the row already exists.

        Args:
            table_name: Target table name, interpolated into the SQL text.
            keys: Column names; one ``%s`` placeholder is emitted per key.
            datalist: Sequence of row tuples ordered like ``keys``.
        """
        placeholders = ','.join(['%s'] * len(keys))
        assignments = ','.join(['{}=values({})'.format(x, x) for x in keys])
        update_sql = self.update_sql_tmp.format(
            table_name, ','.join(keys), placeholders, assignments)
        self.save_mysql(update_sql, datalist)

    def save_mysql(self, sql, datalist):
        """Execute ``sql`` for every row in ``datalist`` and commit.

        An empty ``datalist`` is a no-op. If the first attempt fails, the
        connection is re-opened and the batch is retried once; a second
        failure is logged and swallowed so the caller keeps running.

        Args:
            sql: Parameterised SQL statement using ``%s`` placeholders.
            datalist: Sequence of parameter tuples.
        """
        if not datalist:
            return
        try:
            self._execute_batch(sql, datalist)
        except Exception:
            # ``Exception`` (not a bare except) so KeyboardInterrupt and
            # SystemExit still propagate instead of triggering a retry.
            try:
                self._reconnect()
                self._execute_batch(sql, datalist)
            except Exception as e:
                logging.error('插入mysql失败, error={}'.format(e), exc_info=True)

    def _execute_batch(self, sql, datalist):
        """Run ``executemany`` on the current cursor and commit."""
        self.cursor.executemany(sql, datalist)
        self.connect.commit()

    def _reconnect(self):
        """Discard the current cursor/connection and open fresh ones."""
        for handle in (self.cursor, self.connect):
            try:
                handle.close()
            except Exception as close_error:
                # A handle that is already broken may fail to close; that
                # must not prevent opening the replacement connection.
                logging.debug('ignored error while closing mysql handle: %s',
                              close_error)
        self.connect = get_connect()
        self.cursor = self.connect.cursor()
class BaseKafkaPipeline(object):
    """Base pipeline that buffers records and publishes them to Kafka.

    Records accumulate in ``self.data_list`` and are sent as JSON-encoded
    UTF-8 messages through a synchronous producer. Publishing is
    best-effort: a failed flush is retried once with a new client and, if
    it fails again, the error is logged and swallowed.
    """

    # Number of buffered records that makes save_kafka() flush.
    kafka_batch_size = 3

    def __init__(self):
        """Create the Kafka client and an empty per-instance buffer."""
        self.client = get_kafka_client()
        self.data_list = []

    def save_kafka(self, topicname, curdata):
        """Buffer ``curdata`` and flush once the buffer is large enough.

        The buffer is cleared after the flush attempt whether or not the
        flush succeeded.

        Args:
            topicname: Name of the Kafka topic to publish to.
            curdata: JSON-serialisable record to enqueue.
        """
        self.data_list.append(curdata)
        if len(self.data_list) >= self.kafka_batch_size:
            self.insert_kafka(topicname)
            self.data_list = []

    def insert_kafka(self, topicname):
        """Publish every buffered record to ``topicname``.

        An empty buffer is a no-op. On failure a new client is created and
        the whole buffer is sent again, so records delivered before the
        failure may be published twice. The buffer itself is not cleared
        here; callers reset ``self.data_list``.

        Args:
            topicname: Name of the Kafka topic to publish to.
        """
        if not self.data_list:
            return
        try:
            self._produce_buffer(topicname)
        except Exception:
            # ``Exception`` (not a bare except) so KeyboardInterrupt and
            # SystemExit still propagate instead of triggering a retry.
            try:
                self.client = get_kafka_client()
                self._produce_buffer(topicname)
            except Exception as e:
                logging.error('插入kafka失败, error={}'.format(e), exc_info=True)

    def _produce_buffer(self, topicname):
        """Send each buffered record through one synchronous producer."""
        topic = self.client.topics[topicname]
        with topic.get_sync_producer() as producer:
            for curdata in self.data_list:
                producer.produce(json.dumps(curdata).encode('utf-8'))
## Save items to Kafka
class KafkaPipeline(BaseKafkaPipeline):
    """Template pipeline that publishes selected item fields to Kafka.

    Exposes ``process_item`` / ``close_spider`` hooks (these look like the
    Scrapy item-pipeline interface -- confirm against the project setup).
    Records are buffered in ``self.data_list``, which is created per
    instance by ``BaseKafkaPipeline.__init__``.
    """

    # Item fields copied into each Kafka message; missing ones become ''.
    data_keys = [
        'aaa',
        'bbb',
        'ccc',
    ]
    # Destination topic and the buffer size that triggers a flush.
    topic_name = 'kafka_topic'
    batch_size = 1000

    def process_item(self, item, spider):
        """Buffer one item and flush when ``batch_size`` records are queued.

        Args:
            item: Mapping-like item; read via ``item.get(key, '')``.
            spider: Unused; part of the pipeline hook signature.

        Returns:
            The same ``item``, unchanged.
        """
        record = {key: item.get(key, '') for key in self.data_keys}
        self.data_list.append(record)
        if len(self.data_list) >= self.batch_size:
            self._flush()
        return item

    def close_spider(self, spider):
        """Publish whatever is still buffered when the spider closes."""
        self._flush()

    def _flush(self):
        """Send the buffered records to ``topic_name`` and reset the buffer."""
        if not self.data_list:
            return
        # insert_kafka() publishes every record currently in self.data_list.
        self.insert_kafka(self.topic_name)
        self.data_list = []
## Save items to MySQL
class MysqlPipeline(BasePipeline):
    """Template pipeline that upserts selected item fields into MySQL.

    Exposes ``process_item`` / ``close_spider`` hooks (these look like the
    Scrapy item-pipeline interface -- confirm against the project setup).
    Rows are buffered per instance and written with ``update_into``.
    """

    # Item fields, in column order, written for each row.
    data_keys = [
        'aaa',
        'bbb',
        'ccc',
    ]
    # Destination table and the buffer size that triggers a flush.
    table_name = 'mysql_table'
    batch_size = 1000

    def __init__(self):
        """Open the MySQL connection and create this instance's row buffer."""
        super(MysqlPipeline, self).__init__()
        # Per-instance list: a class-level list would be shared by every
        # pipeline instance until its first flush.
        self.data_list = []

    def process_item(self, item, spider):
        """Buffer one ``ItemName`` item and flush at ``batch_size`` rows.

        Items of any other type are passed through untouched.

        Args:
            item: Scraped item; read via ``item.get(key)``.
            spider: Unused; part of the pipeline hook signature.

        Returns:
            The same ``item``, unchanged.
        """
        if isinstance(item, ItemName):
            row = tuple(item.get(key) for key in self.data_keys)
            self.data_list.append(row)
            if len(self.data_list) >= self.batch_size:
                self._flush()
        return item

    def close_spider(self, spider):
        """Write the remaining rows, then close the cursor and connection."""
        try:
            self._flush()
        finally:
            # Release the database handles even if the final flush raises.
            self.cursor.close()
            self.connect.close()

    def _flush(self):
        """Upsert the buffered rows into ``table_name`` and reset the buffer."""
        if not self.data_list:
            return
        self.update_into(self.table_name, self.data_keys, self.data_list)
        self.data_list = []