本文解决的问题:指定SQL指定分隔符远程从MySQL数据库导出数据文件到本地。
ETL开发过程中数据库与文本文件是交互最多的一种形式,其中涉及到数据库的表落地文件,指定分隔符,以前都是Oracle有现成的数据导出工具sqluldr、ociuldr。最近转型MySQL数据库,同样的业务场景,尝试了Mysql的select tablename into outfile只能在数据库服务端运行,不满足要求。
借鉴Oracle的ociuldr/sqluldr使用经验,周末时间开发了mysqluldr,其中部分函数代码几乎照搬ociuldr/sqluldr的源码,改动了部分数据类型,(ociuldr/sqluldr源码已在github开源),特此声明,不用做商用,仅作为学习交流。
用了大概半天时间开发调试,导出百万数据,效率客观,虚拟机资源有限没有更多的数据可以测试。测试结果如下,源码及Makefile也在本文档中一并发布。
[root@hadoop qcp]# date
Sat Sep 3 20:56:48 CST 2022
[root@hadoop qcp]# ./mysqluldr user=root pass=root host=127.0.0.1 port=3307 db=test2 query='select * from student' field='^|'
Welcome to use this Unload Data Tools From Mysql DB
Unload data from MySQL DB Release 1.0 .2022.09.03 Created by qichaopu
connected mysql user:root
100000 rows exported at 2022-09-03 20:56:59
200000 rows exported at 2022-09-03 20:56:59
300000 rows exported at 2022-09-03 20:56:59
400000 rows exported at 2022-09-03 20:56:59
500000 rows exported at 2022-09-03 20:56:59
600000 rows exported at 2022-09-03 20:56:59
700000 rows exported at 2022-09-03 20:56:59
800000 rows exported at 2022-09-03 20:56:59
900000 rows exported at 2022-09-03 20:56:59
1000000 rows exported at 2022-09-03 20:56:59
1100000 rows exported at 2022-09-03 20:56:59
1200000 rows exported at 2022-09-03 20:56:59
1300000 rows exported at 2022-09-03 20:56:59
1400000 rows exported at 2022-09-03 20:57:00
1500000 rows exported at 2022-09-03 20:57:00
1600000 rows exported at 2022-09-03 20:57:00
1700000 rows exported at 2022-09-03 20:57:00
1800000 rows exported at 2022-09-03 20:57:00
1900000 rows exported at 2022-09-03 20:57:00
2000000 rows exported at 2022-09-03 20:57:00
2100000 rows exported at 2022-09-03 20:57:00
2200000 rows exported at 2022-09-03 20:57:00
2300000 rows exported at 2022-09-03 20:57:00
2400000 rows exported at 2022-09-03 20:57:00
2500000 rows exported at 2022-09-03 20:57:00
2600000 rows exported at 2022-09-03 20:57:00
2700000 rows exported at 2022-09-03 20:57:00
2800000 rows exported at 2022-09-03 20:57:00
2900000 rows exported at 2022-09-03 20:57:00
3000000 rows exported at 2022-09-03 20:57:00
3100000 rows exported at 2022-09-03 20:57:00
3200000 rows exported at 2022-09-03 20:57:00
3300000 rows exported at 2022-09-03 20:57:00
3400000 rows exported at 2022-09-03 20:57:01
3500000 rows exported at 2022-09-03 20:57:01
3600000 rows exported at 2022-09-03 20:57:01
3700000 rows exported at 2022-09-03 20:57:01
3800000 rows exported at 2022-09-03 20:57:01
3900000 rows exported at 2022-09-03 20:57:01
4000000 rows exported at 2022-09-03 20:57:01
4100000 rows exported at 2022-09-03 20:57:01
4194304 rows exported at 2022-09-03 20:57:01
[root@hadoop qcp]# ls -rtl mysqluldrdata.txt
-rw-r--r-- 1 root root 109051904 Sep 3 20:57 mysqluldrdata.txt
源码如下,与上图的测试结果有所调整,代码进行了优化,
/*
NAME
mysqluldr.c - unload text data from mysql database
MODIFIED (MM/DD/YY)
qichaopu 2022.09.03 -
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <time.h>
#include <mysql.h>
#define MAX_ITEM_BUFFER_SIZE 33
#if defined(_WIN32)
#define STRNCASECMP memicmp
#else
#define STRNCASECMP strncasecmp
#endif
#define MIN(a,b) ((a) > (b) ? (b) : (a))
int return_code = 0;
void printRow(char *fname,MYSQL_RES *res,int cols,char *field,int flen,int rlen,char *record);
int convertOption(const char *src, char* dst, int mlen);
FILE *openFile(const char *fname, char tempbuf[], int batch);
char getHexIndex(char c);
void printRowInfo(int row);
int main(int argc, char *argv[])
{
char user[32]="";
char pass[32]="";
char host[32]="";
char port[32]="";
char db[32]="";
char query[8192]="";
char fname[255]="mysqluldrdata.txt";
char field[32]=",";
char record[32]="\n";
int n,i,argcount=0;
int v_help=0;
int rlen,flen;
rlen=flen=1;
for(i=0;i<argc;i++)
{
if (STRNCASECMP("user=",argv[i],5)==0)
{
memset(user,0,32);
memcpy(user,argv[i]+5,MIN(strlen(argv[i]) - 5,31));
}
else if (STRNCASECMP("pass=",argv[i],5)==0)
{
memset(pass,0,32);
memcpy(pass,argv[i]+5,MIN(strlen(argv[i]) - 5,31));
}
else if (STRNCASECMP("host=",argv[i],5)==0)
{
memset(host,0,32);
memcpy(host,argv[i]+5,MIN(strlen(argv[i]) - 5,31));
}
else if (STRNCASECMP("port=",argv[i],5)==0)
{
memset(port,0,32);
memcpy(port,argv[i]+5,MIN(strlen(argv[i]) - 5,31));
}
else if (STRNCASECMP("db=",argv[i],3)==0)
{
memset(db,0,32);
memcpy(db,argv[i]+3,MIN(strlen(argv[i]) - 3,31));
}
else if (STRNCASECMP("query=",argv[i],6)==0)
{
memset(query,0,8192);
memcpy(query,argv[i]+6,MIN(strlen(argv[i]) - 6,8191));
}
else if (STRNCASECMP("field=",argv[i],6)==0)
{
memset(field,0,32);
flen=convertOption(argv[i]+6,field,MIN(strlen(argv[i]) - 6,31));
}
else if (STRNCASECMP("record=",argv[i],7)==0)
{
memset(record,0,32);
rlen=convertOption(argv[i]+7,record,MIN(strlen(argv[i]) - 7,31));
}
else if (STRNCASECMP("file=",argv[i],5)==0)
{
memset(fname,0,132);
memcpy(fname,argv[i]+5,MIN(strlen(argv[i]) - 5,254));
}
else if (STRNCASECMP("-help",argv[i],4)==0)
{
v_help=1;
}
}
if (strlen(user)==0 || strlen(query)==0|| v_help)
{
if (v_help||1==1)
{
printf("mysqluldr: Unload data from MySQL DB Release 1.0 2022.09.03 Created by qichaopu\n");
printf("\n");
printf("Usage: %s user=... host=... port=... db=... file=...\n",argv[0]);
printf("Notes:\n");
printf(" user = username\n");
printf(" pass = password\n");
printf(" host = mysql host\n");
printf(" port = port\n");
printf(" db = database name\n");
printf(" query = select statement\n");
printf(" field = seperator string between fields\n");
printf(" record= seperator string between records\n");
printf(" file = output file name(default: mysqluldrdata.txt)\n");
printf("\n");
printf(" for field and record, you can use '0x' to specify hex character code,\n");
printf(" \\r=0x%02x \\n=0x%02x |=0x%0x ,=0x%02x \\t=0x%02x\n",'\r','\n','|',',','\t');
printf(" for more hex character code,you can use unix command:man ascii\n");
exit(0);
}
}
printf("Welcome to use this Unload Data Tools From Mysql DB \n");
printf("Unload data from MySQL DB Release 1.0 .2022.09.03 Created by qichaopu \n");
MYSQL dbconn;
MYSQL_RES *res;
mysql_init(&dbconn);
if(mysql_real_connect(&dbconn,host,user,pass,db,atoi(port),NULL,0))
{
printf("connected mysql user:%s\n",user);
}
else
{
printf("connected failed mysql user:%s\n",user);
}
mysql_query(&dbconn,"set names utf8");
mysql_query(&dbconn,query);
res=mysql_use_result(&dbconn);
if(res==NULL)
{
printf("data result is empty\n");
return 0;
}
int cols=mysql_field_count(&dbconn);
printRow(fname,res,cols,field,flen,rlen,record);
mysql_free_result(res);
mysql_close(&dbconn);
}
void printRow(char *fname,MYSQL_RES *res,int cols,char *field,int flen,int rlen,char *record)
{
MYSQL_ROW row;
char tempbuf[512];
FILE *fp;
int bcount=1;
memset(tempbuf,0,512);
int rc=0;
if((fp = openFile(fname,tempbuf,bcount)) == NULL)
{
fprintf(stdout,"ERROR -- Cannot write to file : %s\n", tempbuf);
return_code = 6;
}
int c=0;
printRowInfo(rc);
while(row=mysql_fetch_row(res))
{
c=0;
rc++;
for(c=0;c<cols;c++)
{
if(row[c]!=NULL)
fprintf(fp, "%s",row[c]);
if (c < cols - 1)
fwrite(field,flen,1,fp);
}
fwrite(record,rlen,1,fp);
if(rc%500000==0)
printRowInfo(rc);
}
fclose(fp);
if(rc%500000!=0)
printRowInfo(rc);
}
void printRowInfo(int row)
{
time_t now = time(0);
struct tm *ptm = localtime(&now);
fprintf(stdout,"%8u rows exported at %04d-%02d-%02d %02d:%02d:%02d\n",
row,
ptm->tm_year + 1900,
ptm->tm_mon + 1,
ptm->tm_mday,
ptm->tm_hour,
ptm->tm_min,
ptm->tm_sec);
fflush(stdout);
}
int convertOption(const char *src, char* dst, int mlen)
{
int i,len,pos;
char c,c1,c2;
i=pos=0;
len = strlen(src);
while(i<MIN(mlen,len))
{
if ( *(src+i) == '0')
{
if (i < len - 1)
{
c = *(src+i + 1);
switch(c)
{
case 'x':
case 'X':
if (i < len - 3)
{
c1 = getHexIndex(*(src+i + 2));
c2 = getHexIndex(*(src+i + 3));
*(dst + pos) = (char)((c1 << 4) + c2);
i=i+2;
}
else if (i < len - 2)
{
c1 = *(src+i + 2);
*(dst + pos) = c1;
i=i+1;
}
break;
default:
*(dst + pos) = c;
break;
}
i = i + 2;
}
else
{
i ++;
}
}
else
{
*(dst + pos) = *(src+i);
i ++;
}
pos ++;
}
*(dst+pos) = '\0';
return pos;
}
FILE *openFile(const char *fname, char tempbuf[], int batch)
{
FILE *fp=NULL;
int i, j, len;
time_t now = time(0);
struct tm *ptm = localtime(&now);
len = strlen(fname);
j = 0;
for(i=0;i<len;i++)
{
tempbuf[j++] = *(fname+i);
}
tempbuf[j]=0;
if (tempbuf[0] == '+')
fp = fopen(tempbuf+1, "ab+");
else
fp = fopen(tempbuf, "wb+");
return fp;
}
char getHexIndex(char c)
{
if ( c >='0' && c <='9') return c - '0';
if ( c >='a' && c <='f') return 10 + c - 'a';
if ( c >='A' && c <='F') return 10 + c - 'A';
return 0;
}
Makefile如下:
MYSQL_PUBLIC_H=/data/mysql/include
CC = gcc
CPP = g++
OBJS = mysqluldr.o
PROG = mysqluldr
all:$(PROG)
$(PROG):$(OBJS)
@echo "[link] ... "
@echo "---------------------"
$(CC) -g -o $(PROG) $(OBJS) -lmysqlclient
@echo
clean:
rm -f $(PROG) $(OBJS)
.SUFFIXES: .cpp .c
.c.o:
@echo "[$*.o]"
@echo "---------------------"
$(CC) -g -o $*.o -I$(MYSQL_PUBLIC_H) -c $*.c
@echo