一、需求
有个需求要从oracle表里导出数据,存成csv文本文件。数据量有4亿多行、25g。最普通的解决方案是在sql*plus使用spool。尽管该方案在某些情况下可行,但它的速度太慢,输出速度大约每秒1MB,全部导出需要7个多小时,这是不可接受的,需要快速导出数据。
二、解决方案
下面的核心代码出自adrian billington。自定义函数使用utl_file包输出数据,并且使用pipeline函数并行输出。使用这种方案的好处是:
a.txt_0000000677 mydir40583785
67714-aug-15 08.53.20.648485 am +08:00
14-aug-15 09.01.00.522882 am +08:00
a.txt_0000001407 mydir40156213
140714-aug-15 08.53.20.648505 am +08:00
14-aug-15 09.01.02.291799 am +08:00
a.txt_0000000703 mydir40919430
70314-aug-15 08.53.20.648190 am +08:00
14-aug-15 09.01.02.304130 am +08:00
a.txt_0000001535 mydir40475246
153514-aug-15 08.53.20.648586 am +08:00
14-aug-15 09.01.02.322707 am +08:00
a.txt_0000000555 mydir40651496
55514-aug-15 08.53.20.648141 am +08:00
14-aug-15 09.01.03.079116 am +08:00
a.txt_0000001314 mydir40591175
131414-aug-15 08.53.20.648177 am +08:00
14-aug-15 09.01.03.333193 am +08:00
a.txt_0000000506 mydir41898539
50614-aug-15 08.53.20.648098 am +08:00
14-aug-15 09.01.05.103469 am +08:00
a.txt_0000001436 mydir41175847
143614-aug-15 08.53.20.648141 am +08:00
14-aug-15 09.01.08.538071 am +08:00
a.txt_0000000445 mydir41817006
44514-aug-15 08.53.20.648177 am +08:00
14-aug-15 09.01.11.453815 am +08:00
a.txt_0000000518 mydir42811066
51814-aug-15 08.53.20.648097 am +08:00
14-aug-15 09.01.16.936903 am +08:00
elapsed: 00:07:56.41
执行结果:
411079803行、25g数据导出成10个csv文本文件,用时7分56秒。
参考:
oracle fast parallel data unload into ascii file(s)
有个需求要从oracle表里导出数据,存成csv文本文件。数据量有4亿多行、25g。最普通的解决方案是在sql*plus使用spool。尽管该方案在某些情况下可行,但它的速度太慢,输出速度大约每秒1MB,全部导出需要7个多小时,这是不可接受的,需要快速导出数据。
二、解决方案
下面的核心代码出自adrian billington。自定义函数使用utl_file包输出数据,并且使用pipeline函数并行输出。使用这种方案的好处是:
- 它是很简单的sql,无需大量的sql*plus命令,不用指定行尺寸或on/off切换
- 因为它是sql,所以可以从几乎任何地方执行它,甚至可以插入到pl/sql里
- 它既有sql执行结果的内部日志,也有可选的外部unix日志文件
- 它很快,如果使用并行,可以达到很高的速度(我的并行度设置为10,可以达到每秒50MB)
-- Create the directory object the unload writes into.
-- NOTE: the quoted identifier stores the lowercase name "mydir"; the same
-- lowercase string must be passed to data_unload / utl_file.fopen below.
create or replace directory "mydir" as '/home/oracle/';
-- Object type describing one unload run: the file written, the Oracle
-- directory, how many rows were spooled, which session wrote it, and
-- the start/end timestamps. One instance is piped per output file.
create or replace type dump_ot as object
(
file_name varchar2 (128),
directory_name varchar2 (128),
no_records number,
session_id number,
start_dttm timestamp with time zone,
end_dttm timestamp with time zone
);
-- Nested-table collection of dump_ot: the return type of the pipelined
-- data_unload function (one element per file produced).
create or replace type dump_ntt as table of dump_ot;
-- Pipelined table function that spools the rows of p_source to a flat
-- file via utl_file. Declared parallel_enable so that, when the source
-- query runs in parallel, each PX slave writes its own file; it pipes
-- one dump_ot row per file describing what was written.
create or replace function data_unload (
p_source in sys_refcursor,
p_filename in varchar2,
p_directory in varchar2,
p_unique_filename in varchar2 default 'n',
p_create_log_file in varchar2 default 'n')
return dump_ntt
pipelined
parallel_enable(partition p_source by any)
as
/*
p_source          sql query to spool (each row must be one pre-formatted varchar2 line)
p_filename        target file name
p_directory       target oracle directory object name
p_unique_filename create unique file name? y/n (appends the session sid to p_filename)
                  - required for parallel unload, otherwise every slave opens the same file
p_create_log_file create log file? y/n (separate log file, one line per 1,000,000 rows)
                  - has a very small performance hit on the spool
*/
type row_ntt is table of varchar2 (32767);
v_rows row_ntt;
v_file utl_file.file_type;
v_log_file utl_file.file_type;
v_buffer varchar2 (32767);
-- fix: was declared varchar; varchar2 is the supported Oracle type
v_sid varchar2 (255);
v_name varchar2 (255);
v_lines pls_integer := 0;
v_start_dttm timestamp with time zone := systimestamp;
v_end_dttm timestamp with time zone;
v_create_log boolean := false;
c_eol constant varchar2 (1) := chr (10);
c_eollen constant pls_integer := lengthb (c_eol);
c_maxline constant pls_integer := 32767;
c_log_limit constant pls_integer := 1000000;
begin
v_sid := lpad (sys_context ('userenv', 'sid'), 10, '0');
v_name := p_filename;
-- bug fix: upper() yields 'Y', so the comparison literal must be
-- uppercase — the original compared against 'y' and never matched
if trim (upper (p_create_log_file)) = 'Y'
then
v_create_log := true;
end if;
-- append the sid (must be used for parallel spooling; single spooling or
-- spooling across a db-link always creates one file)
if trim (upper (p_unique_filename)) = 'Y'
then
v_name := v_name || '_' || v_sid;
end if;
v_file :=
utl_file.fopen (p_directory,
v_name,
'w',
c_maxline);
if v_create_log
then
v_log_file :=
utl_file.fopen (p_directory,
v_name || '.log',
'w',
c_maxline);
utl_file.put_line (
v_log_file,
to_char (v_start_dttm, 'yyyy-mm-dd hh24:mi:ss:ff3') || ' --> start');
utl_file.fflush (v_log_file);
end if;
loop
-- array fetch: 10000 rows per round trip keeps PGA bounded
fetch p_source
bulk collect into v_rows
limit 10000;
for i in 1 .. v_rows.count
loop
-- pack as many rows as fit into one 32k buffer before each
-- utl_file call; far fewer put_line calls than one per row
if v_buffer is not null
and lengthb (v_buffer) + c_eollen + lengthb (v_rows (i)) <= c_maxline
then
v_buffer := v_buffer || c_eol || v_rows (i);
else
if v_buffer is not null
then
utl_file.put_line (v_file, v_buffer);
end if;
v_buffer := v_rows (i);
end if;
end loop;
v_lines := v_lines + v_rows.count;
-- progress log: fires only when the running total is an exact
-- multiple of c_log_limit (true for full 10000-row batches)
if v_create_log and mod (v_lines, c_log_limit) = 0
then
utl_file.put_line (
v_log_file,
to_char (systimestamp, 'yyyy-mm-dd hh24:mi:ss:ff3')
|| ' --> '
|| v_lines);
utl_file.fflush (v_log_file);
end if;
exit when p_source%notfound;
end loop;
close p_source;
-- flush the remaining partial buffer; the guard avoids writing a
-- spurious blank line when the cursor returned zero rows
if v_buffer is not null
then
utl_file.put_line (v_file, v_buffer);
end if;
utl_file.fclose (v_file);
v_end_dttm := systimestamp;
if v_create_log
then
utl_file.put_line (
v_log_file,
to_char (v_end_dttm, 'yyyy-mm-dd hh24:mi:ss:ff3') || ' --> ' || v_lines);
utl_file.put_line (
v_log_file,
to_char (v_end_dttm, 'yyyy-mm-dd hh24:mi:ss:ff3') || ' --> end');
utl_file.fclose (v_log_file);
end if;
pipe row (dump_ot (v_name,
p_directory,
v_lines,
v_sid,
v_start_dttm,
v_end_dttm));
return;
exception
when others then
-- don't leak os file handles when the fetch or a write fails
if utl_file.is_open (v_file)
then
utl_file.fclose (v_file);
end if;
if utl_file.is_open (v_log_file)
then
utl_file.fclose (v_log_file);
end if;
raise;
end;
/
-- show elapsed time in sql*plus
set timing on;
-- invoke the function; the parallel(u,10) hint runs 10 PX slaves, and
-- p_unique_filename = 'y' makes each slave write its own csv file
select *
from table (
data_unload (
cursor (
select /*+ parallel(u,10) */
log_id
|| ','
|| typeid
|| ','
|| to_id
|| ','
|| to_msg
|| ','
|| userid
|| ','
|| nickname
|| ','
|| showing
|| ','
|| fromip
|| ','
|| to_char (createtime, 'yyyy-mm-dd hh24:mi:ss')
from u1.user_visit u),
'a.txt',
'mydir',
'y',
'n'));
查询输出:
a.txt_0000000677 mydir40583785
67714-aug-15 08.53.20.648485 am +08:00
14-aug-15 09.01.00.522882 am +08:00
a.txt_0000001407 mydir40156213
140714-aug-15 08.53.20.648505 am +08:00
14-aug-15 09.01.02.291799 am +08:00
a.txt_0000000703 mydir40919430
70314-aug-15 08.53.20.648190 am +08:00
14-aug-15 09.01.02.304130 am +08:00
a.txt_0000001535 mydir40475246
153514-aug-15 08.53.20.648586 am +08:00
14-aug-15 09.01.02.322707 am +08:00
a.txt_0000000555 mydir40651496
55514-aug-15 08.53.20.648141 am +08:00
14-aug-15 09.01.03.079116 am +08:00
a.txt_0000001314 mydir40591175
131414-aug-15 08.53.20.648177 am +08:00
14-aug-15 09.01.03.333193 am +08:00
a.txt_0000000506 mydir41898539
50614-aug-15 08.53.20.648098 am +08:00
14-aug-15 09.01.05.103469 am +08:00
a.txt_0000001436 mydir41175847
143614-aug-15 08.53.20.648141 am +08:00
14-aug-15 09.01.08.538071 am +08:00
a.txt_0000000445 mydir41817006
44514-aug-15 08.53.20.648177 am +08:00
14-aug-15 09.01.11.453815 am +08:00
a.txt_0000000518 mydir42811066
51814-aug-15 08.53.20.648097 am +08:00
14-aug-15 09.01.16.936903 am +08:00
elapsed: 00:07:56.41
执行结果:
411079803行、25g数据导出成10个csv文本文件,用时7分56秒。
参考:
oracle fast parallel data unload into ascii file(s)