单线程脚本
导入文件的行数
# wc -l /data/logs/testlog/20120219/testlog1/*
1510503 total
# -*- coding: utf-8 -*-
#!/usr/bin/env python
#create database pythondata
#create table log (logline varchar(500));
#grant all on pythondata.* to 'pyuser'@'localhost' identified by "pypasswd";
import MySQLdb
import os
import time
def writeLinestoDb(sql,content):
conn=MySQLdb.connect(host="localhost",user="pyuser",passwd="pypasswd",db="pythondata")
cur =conn.cursor()
cur.executemany(sql,content)
cur.close()
conn.commit()
conn.close()
def readLinestoList(path):
alllines=[]
for file in os.listdir(path):
files=os.path.join(path,file)
for line in open(files):
alllines.append(line)
return alllines
def main():
insertsql ="INSERT INTO log(logline) VALUES(%s)"
alllines=readLinestoList('/data/logs/testlog/20120219/testlog1')
for line in alllines:
content=line.strip()
print content
writeLinestoDb(insertsql,content)
time.sleep(10)
if __name__=="__main__":
print('starting at:',time.ctime())
main()
print('ending at:',time.ctime())
('starting at:', 'Tue Mar 27 11:09:20 2012')
('ending at:', 'Tue Mar 27 11:13:20 2012')
耗时4分钟
mysql> select count(*) from log ;
+----------+
| count(*) |
+----------+
| 1510551 |
+----------+
多线程脚本
# -*- coding: utf-8 -*-
#!/usr/bin/env python
import MySQLdb
import os
from time import ctime
from threading import Thread
from Queue import Queue
in_num_thread=10
out_num_thread=10
in_queue=Queue()
out_queue=Queue()
def listDir(path):
for filename in os.listdir(path):
in_queue.put(os.path.join(path,filename))
def readFile(iq,in_queue):
filelines=[]
while True:
file=in_queue.get()
for line in open(file):
filelines.append(line)
out_queue.put(filelines)
in_queue.task_done()
def writeLinestoDb(oq,out_queue):
sql=insertsql ="INSERT INTO log(logline) VALUES(%s)"
while True:
content=out_queue.get()
conn=MySQLdb.connect(host="localhost",user="pyuser",passwd="pypasswd",db="pythondata")
cur =conn.cursor()
cur.executemany(sql,content)
cur.close()
conn.commit()
conn.close()
out_queue.task_done()
def main():
listDir('/data/logs/testlog/20120219/testlog1')
for iq in range(in_num_thread):
worker=Thread(target=readFile,args(iq,in_queue))
worker.setDaemon(True)
worker.start()
print "Readfile Main Thread Waiting at",ctime()
in_queue.join()
print "Readfile Done at,",ctime()
for oq in range(out_num_thread):
worker=Thread(target=writeLinestoDb,args(oq,out_queue))
worker.setDaemon(True)
worker.start()
print "Insert into mysql Main Thread at",ctime()
out_queue.join()
print "Insert into mysql at,",ctime()
if __name__=="__main__":
print('starting at:',time.ctime())
main()
print('ending at:',time.ctime())
数据库位于本机
('starting at:', 'Tue Mar 27 10:57:01 2012')Readfile Main Thread Waiting at Tue Mar 27 10:57:01 2012Readfile Done at, Tue Mar 27 10:57:04 2012Insert into mysql Main Thread at Tue Mar 27 10:57:04 2012Insert into mysql at, Tue Mar 27 11:03:34 2012('ending at:', 'Tue Mar 27 11:03:34 2012')mysql> select count(*) from log ;+----------+| count(*) |+----------+| 3676015 |+----------+
两次个数据不一致,多线的导入有问题。
服务器配置4G8核,mysql本地 两个脚本在同一台机器上运行
多线程脚本改进
#!/usr/bin/env python
#create table log ( logline varchar(300));
#grant all on pythondata.* to 'pyuser'@'localhost' identified by "pypasswd"
import MySQLdb
import os
import sys
from time import ctime
from threading import Thread
from Queue import Queue
num_thread=10
queue=Queue()
def listDir(path):
file_list=[]
for filename in os.listdir(path):
file_list.append(os.path.join(path,filename))
return file_list
def readFile(file):
alllines=[]
for line in open(file):
alllines.append(line)
return alllines
def writeLinestoDb(q,queue):
sql=insertsql ="INSERT INTO log(logline) VALUES(%s)"
while True:
content=queue.get()
conn=MySQLdb.connect(host="localhost",user="pyuser",passwd="pypasswd",db="pythondata")
cur =conn.cursor()
cur.executemany(sql,content)
cur.close()
conn.commit()
conn.close()
queue.task_done()
def main():
print "Readfile Start at,",ctime()
for file in listDir('/data/logs/testlog/20120219/testlog1'):
queue.put(readFile(file))
print "Readfile Done at,",ctime()
for q in range(num_thread):
worker=Thread(target=writeLinestoDb,args=(q,queue))
worker.setDaemon(True)
worker.start()
print "Insert into mysql Main Thread at",ctime()
queue.join()
print "Insert into mysql at,",ctime()
if __name__=="__main__":
print('starting at:',ctime())
main()
print('ending at:',ctime())
结果('starting at:', 'Tue Mar 27 14:32:05 2012')Readfile Start at, Tue Mar 27 14:32:05 2012Readfile Done at, Tue Mar 27 14:32:07 2012Insert into mysql Main Thread at Tue Mar 27 14:32:08 2012Insert into mysql at, Tue Mar 27 14:34:31 2012('ending at:', 'Tue Mar 27 14:34:31 2012')mysql> select count(*) from log; +----------+| count(*) |+----------+| 1510551 |+----------+读用了2秒中,插入使用2分23秒
第一个多线程脚本错误的原因是传入队列的数据问题。
还有一个问题,读的文件超过物理内存和虚拟内存的总量,会造成内存溢出程序挂掉,解决办法每次读取指定行