python3脚本使用sql loader批量导入字节文件并二次处理

最新推荐文章于 2023-02-13 17:51:29 发布

啊啊啊西吧

最新推荐文章于 2023-02-13 17:51:29 发布

阅读量4.5k

点赞数 1

分类专栏： Python3 C/C++ 文章标签： sqlldr cx-oracle sql-loader

本文链接：https://blog.csdn.net/drdairen/article/details/74194584

版权

Python3 同时被 2 个专栏收录

45 篇文章 0 订阅

订阅专栏

C/C++

43 篇文章 1 订阅

订阅专栏

由于使用cx_oracle连接oracle然后导入数据文件速度太慢（要导的数据有几百G），出于速度的考虑，选择了oracle自带的sql loader工具，使用python脚本写界面，与用户交互，然后使用system（）去批量调用sql loader，对于导入失败的bad数据以及日志数据，我用了c++写了一个程序，把bad文件转换成了ok的数据，然后又写了一个脚本处理这些bad数据。
1、python3调用sqlldr工具导入数据的脚本实现。

#!C:\Python34\python.exe
'''
by dairen 20170703
'''
from tkinter import *
import tkinter.messagebox as messagebox
import cx_Oracle
import time
import datetime
import os
class Application(Frame):
    def __init__(self, master=None):
        Frame.__init__(self, master,height=2000,width=4000)
        self.pack()
        self.createWidgets()

    def createWidgets(self):
        self.label_1 = Label(self,text = "文件起始日期:")
        self.label_2 = Label(self,text = "文件结束日期:")
        self.label_1.grid(row = 0,column = 0)
        self.label_2.grid(row = 1,column = 0)
        self.Input1 = Entry(self)
        self.Input2 = Entry(self)
        self.Input1.grid(row = 0,column = 1)
        self.Input2.grid(row = 1,column = 1)
        self.Button1 = Button(self, text='开始导入',width=20, command=self.runmain)
        self.Button1.grid(row =2,columnspan=2)

    def runmain(self):
        begin_date = self.Input1.get() or '20170607'
        end_date = self.Input2.get() or '20170607'
        messagebox.showinfo('Tips!', '您的输入是：%s 和 %s \n 导入即将开始！' %(begin_date,end_date))
        #offset:358O b_Info 40-60

        aaa=[' CHAR(42)',' CHAR(42)',' CHAR(42)',' CHAR(8)',' CHAR(3)',
         ' CHAR(1)',' CHAR(1)',' CHAR(1)',' CHAR(1)',' CHAR(1)',
         ' CHAR(6)',' CHAR(10)',' CHAR(11)',' CHAR(11)',' CHAR(11)',
         ' CHAR(11)',' CHAR(11)',' CHAR(21)',' CHAR(21)',' CHAR(21)',
         ' CHAR(4)',' CHAR(6)',' CHAR(12)',' CHAR(4)',' CHAR(6)',
         ' CHAR(4)',' CHAR(3)',' CHAR(2)',' CHAR(12)',' CHAR(6)',
         ' CHAR(8)',' CHAR(15)',' CHAR(65)',' CHAR(3)',' CHAR(4)',
         ' CHAR(6)',' CHAR(10)',' CHAR(1)',' CHAR(1)',' CHAR(5)'
        ,' CHAR(2)',' CHAR(2)',' CHAR(2)',' CHAR(2)',' CHAR(4)'
        ,' CHAR(4)',' CHAR(11)',' CHAR(11)',' NUMBER(12,0)',' NUMBER(12,0)'
        ,' NUMBER(12,0)',' NUMBER(12,0)',' CHAR(3)',' CHAR(3)',' NUMBER(8,0)'
        ,' CHAR(1)',' NUMBER(8,0)',' NUMBER(8,0)',' NUMBER(8,0)',' NUMBER(8,0)'
        ,' NUMBER(8,0)',' NUMBER(8,0)',' NUMBER(8,0)',' NUMBER(8,0)',' NUMBER(8,0)'
        ,' NUMBER(8,0)',' CHAR(8)',' CHAR(8)',' CHAR(8)',' NUMBER(8,0)'
        ,' NUMBER(8,0)',' NUMBER(8,0)',' NUMBER(8,0)',' NUMBER(8,0)',' NUMBER(8,0)'
        ,' NUMBER(8,0)',' NUMBER(8,0)',' NUMBER(8,0)',' CHAR(16)',' CHAR(2)'
        ,' CHAR(1)',' CHAR(2)',' CHAR(14)',' CHAR(4)',' CHAR(3)'
        ,' CHAR(9)',' NUMBER(12,0)',' CHAR(3)',' CHAR(1)',' CHAR(8)'
        ,' CHAR(1)',' CHAR(12)',' CHAR(15)',' CHAR(6)',' CHAR(4)'
        ,' CHAR(100)']
        lenlist=[42,42,42,8,3,1,1,1,1,1,
        6,10,11,11,11,11,11,21,21,21,
        4,6,12,4,6,4,3,2,12,6,
        8,15,40,3,4,6,10,1,1,5,
        2,2,2,2,4,4,11,11,12,12,
        12,12,3,3,8,1,8,8,8,8,
        8,8,8,8,8,8,8,8,8,8,
        8,8,8,8,8,8,8,8,16,2,
        1,2,14,4,3,9,12,3,1,8,
        1,12,15,6,4,100]
        title=[
        'Main_Key','Origin_Key','Er_Key','PostDate','Curr',
        'If_Curr','If_Cen','Cups_Sett','If_Cvm_In','If_Single',
        'Ssn_Trans','Time_Trans','Ob_Code','Trans_Code','Rec_Code',
        'Cb_Code','Cvm_Code','Account11','Account_In','Account_Out',
        'MsgType','ProcCode','TranAmt','ActDate','ActTime',
        'MerChant_Type','Sevr_Point_In','Sevr_Point_Con','Search','AuthCode',
        'Term','Ob_Id','Ob_Info','Curr_Tran','Reason_Code',
        'Ssn_Trans_Ori','Time_Trans_Ori','Status_Snd','Status_Rec','Status_Trans',
        'RejCode1','RejCode2','RejCode3','RejCode4','Sett_Snd_AreaCode',
        'Sett_Rec_AreaCode','Sett_Snd_Code','Sett_Rec_Code' ,'Amt_Snd_Sett_dr','Amt_Snd_Sett_cr',
        'Amt_Rec_Sett_dr','Amt_Rec_Sett_cr','Curr_Sett_Snd','Curr_Sett_Rec','Comm',
        'Comm_Way','Comm_Snd_dr','Comm_Snd_cr','Comm_Rec_dr','Comm_Rec_cr',
        'Fee_Snd_dr','Fee_Snd_cr','Fee_Rec_dr','Fee_Rec_cr','Comm_Cen_dr',
        'Comm_Cen_cr','Fee_Snd_Branch_dr','Fee_Snd_Branch_cr','Fee_Rec_Branch_dr','Fee_Rec_Branch_cr',
        'Fee_CanCel_Snd_dr','Fee_CanCel_Snd_cr','Fee_CanCel_Rec_dr','Fee_CanCel_Rec_cr','Fee_CanCel_Cen_dr',
        'Fee_CanCel_Cen_cr','Fee_Cl_Snd_Branch_dr','Fee_Cl_Snd_Branch_cr','Fee_Cl_Rec_Branch_dr','Fee_Cl_Rec_Branch_cr',
        'TranType','CardKind','CardType','Bin','Card',
        'Er_Max','Ssn_Cen','TranAmt_Ori','TranCode_Ori','Reserved',
        'variab1','variab2','variab3','variab4','variab5','variab6'
        ]
        x=len(title)
        fields=[]
        fields = [title[i]+aaa[i] for i in range(0,x)]
        fields_str = ', '.join(fields)

    #begin_date=input("input start date(格式：20170506):")
    #end_date=input("input end date(格式：20170606):")
        date_list = []
        begin_date = datetime.datetime.strptime(begin_date, "%Y%m%d")
        end_date = datetime.datetime.strptime(end_date, "%Y%m%d")
        if (begin_date > end_date):
            messagebox.showinfo('error!', 'begin_date or end_date error!)')
        while( begin_date <= end_date):
            date_str = begin_date.strftime("%Y%m%d")
            date_list.append(date_str)
            begin_date += datetime.timedelta(days=1)
        #print(date_list)

        file_name_list=[]
        numlist=['01','02','03','04']
        for datename in date_list:
            for num in numlist:
                bsi_str='BSI%s01_%s'%(datename,num)
                file_name_list.append(bsi_str)
            for num in numlist:
                bsi_str='BSI%s51_%s'%(datename,num)
                file_name_list.append(bsi_str)
            file_name_list.append('BSI%s88_01'%datename)
            file_name_list.append('BSI%s99_01'%datename)

            for num in numlist:
                bsa_str='BSA%s01_%s'%(datename,num)
                file_name_list.append(bsa_str)
            for num in numlist:
                bsa_str='BSA%s51_%s'%(datename,num)
                file_name_list.append(bsa_str)
            file_name_list.append('BSA%s88_01'%datename)
            file_name_list.append('BSA%s99_01'%datename)
        #print(file_name_list)

        table_list=[]
        for datename in date_list:
            bsi_str='BSI_%s'%datename
            table_list.append(bsi_str)
            bsa_str='BSA_%s'%datename
            table_list.append(bsa_str)
        #print(table_list)

        #开始计时
        starttime=time.clock()
        try:
            conn = cx_Oracle.connect('xxx/xxx@xxx/orcl')   
            #print("connect oracle success!!!")
        except:
            #print("connect oracle failed!!!")
            messagebox.showinfo('warning!', 'failed connect oracle!')
            return
        cursor = conn.cursor()

        data_num=len(date_list)
        file_num=len(file_name_list)
        table_num=len(table_list)

        #create table
        for k in range(0,table_num):
            table_name= table_list[k]
            #print("当前表名：%s "%table_name)

            sql_createtb = "create table %s (%s)" % (table_name, fields_str)
            sql_droptb='drop table %s purge'%table_name
            #print(sql_createtb)
            #print(sql_droptb)
            try:
                #print("开始建表！")
                cursor.execute(sql_createtb)
                #print("建表成功！")
            except:
                #print("表已存在！开始删表!")
                cursor.execute(sql_droptb)
                #print("删表成功！开始重新建表！")
                cursor.execute(sql_createtb)
                #print("重新建表成功！")

        conn.commit()
        cursor.close()   
        conn.close()

        #execute cmd according to fiel_name
        succ=0
        errornum=0
        for i in range(0,file_num):
            try:
                file_name=file_name_list[i]   
                if os.path.isfile(file_name):
                    table_name=table_list[i//10]

                    #CHARACTERSET ZHS16GBK
                    #edit ctl files according to table_name
                    lines=open('bsa.ctl','r').readlines()
                    for k in range(len(lines)):
                        if 'INTO TABLE' in lines[k]:
                            lines[k]='INTO TABLE %s\n'%table_name
                    f=open('bsa.ctl','w')
                    f.writelines(lines)
                    f.close()

                    basedir=os.getcwd()
                    newname=file_name+".dat"
                    os.rename(file_name,newname)
                    bad_dir='%s\\bad_files'%basedir
                    log_dir='%s\\log_files'%basedir
                    if not os.path.exists(bad_dir):
                        os.mkdir(bad_dir)
                    if not os.path.exists(log_dir):
                        os.mkdir(log_dir)

                    #import
                    cmd_str="sqlldr xxx/xxx@xxx/orcl control=%s\\bsa.ctl data=%s "%(basedir,newname)
                    cmd_str+="bad=%s\\bad_files\\%s.bad log=%s\\log_files\\%s.log direct=true"%(basedir,file_name,basedir,file_name)
                    #print(cmd_str)
                    os.system(cmd_str)
                    succ+=1
                    #process bad file
            except:
                errornum+=1
        endtime=time.clock()

        messagebox.showinfo('完成！!','耗时约：%f秒,success in %d files'%((endtime-starttime),succ))
app = Application()
# 设置窗口标题:
app.master.title('CUPS IMPORTER')
# 主消息循环:
app.mainloop()

2、sqlldr规则文件的编写.
sqlldr读取文件有好几种方式，我要处理的源文件是字节流的，所以使用了按字节分隔的控制文件，如下：

options(ERRORS=20000)
load DATA
APPEND
INTO TABLE BSA_20170607
(
Main_Key  POSITION(01:42)  CHAR,
Origin_Key  POSITION(43:84)  CHAR,
Er_Key  POSITION(85:126)   CHAR,
PostDate  POSITION(127:134)   CHAR,
Curr  POSITION(135:137)   CHAR,
If_Curr  POSITION(138:138)  CHAR,
If_Cen  POSITION(139:139)  CHAR,
Cups_Sett  POSITION(140:140)  CHAR,
If_Cvm_In  POSITION(141:141)  CHAR,
If_Single  POSITION(142:142)  CHAR,
Ssn_Trans  POSITION(143:148)  CHAR,
Time_Trans  POSITION(149:158)  CHAR,
Ob_Code  POSITION(159:169)  CHAR,
Trans_Code  POSITION(170:180)  CHAR,
Rec_Code  POSITION(181:191)  CHAR,
Cb_Code  POSITION(192:202)  CHAR,
Cvm_Code  POSITION(203:213)  CHAR,
Account11  POSITION(214:234)  CHAR,
Account_In  POSITION(235:255)  CHAR,
Account_Out  POSITION(256:276)  CHAR,
MsgType  POSITION(277:280)  CHAR,
ProcCode  POSITION(281:286)  CHAR,
TranAmt  POSITION(287:298)  CHAR,
ActDate  POSITION(299:302)  CHAR,
ActTime  POSITION(303:308)  CHAR,
MerChant_Type  POSITION(309:312)  CHAR,
Sevr_Point_In  POSITION(313:315)  CHAR,
Sevr_Point_Con  POSITION(316:317)  CHAR,
Search  POSITION(318:329)  CHAR,
AuthCode  POSITION(330:335)  CHAR,
Term  POSITION(336:343)  CHAR,
Ob_Id  POSITION(344:358)  CHAR,
Ob_Info  POSITION(359:398)  CHAR,
Curr_Tran  POSITION(399:401)  CHAR,
Reason_Code  POSITION(402:405)  CHAR,
Ssn_Trans_Ori  POSITION(406:411)  CHAR,
Time_Trans_Ori  POSITION(412:421)  CHAR,
Status_Snd  POSITION(422:422)  CHAR,
Status_Rec  POSITION(423:423)  CHAR,
Status_Trans  POSITION(424:428)  CHAR,
RejCode1  POSITION(429:430)  CHAR,
RejCode2  POSITION(431:432)  CHAR,
RejCode3  POSITION(433:434)  CHAR,
RejCode4  POSITION(435:436)  CHAR,
Sett_Snd_AreaCode  POSITION(437:440)  CHAR,
Sett_Rec_AreaCode  POSITION(441:444)  CHAR,
Sett_Snd_Code  POSITION(445:455)  CHAR,
Sett_Rec_Code  POSITION(456:466)  CHAR,
Amt_Snd_Sett_dr  POSITION(467:478),
Amt_Snd_Sett_cr  POSITION(479:490),
Amt_Rec_Sett_dr  POSITION(491:502),
Amt_Rec_Sett_cr  POSITION(503:514),
Curr_Sett_Snd  POSITION(515:517)  CHAR,
Curr_Sett_Rec  POSITION(518:520)  CHAR,
Comm  POSITION(521:528)   ,
Comm_Way  POSITION(529:529)  CHAR,
Comm_Snd_dr  POSITION(530:537)    ,
Comm_Snd_cr  POSITION(538:545)   ,
Comm_Rec_dr  POSITION(546:553)    ,
Comm_Rec_cr  POSITION(554:561)   ,
Fee_Snd_dr  POSITION(562:569)    ,
Fee_Snd_cr  POSITION(570:577)   ,
Fee_Rec_dr  POSITION(578:585)    ,
Fee_Rec_cr  POSITION(586:593)   ,
Comm_Cen_dr  POSITION(594:601)    ,
Comm_Cen_cr  POSITION(602:609)    ,
Fee_Snd_Branch_dr  POSITION(610:617)  CHAR,
Fee_Snd_Branch_cr  POSITION(618:625)  CHAR,
Fee_Rec_Branch_dr  POSITION(626:633)  CHAR,
Fee_Rec_Branch_cr  POSITION(634:641)    ,
Fee_CanCel_Snd_dr  POSITION(642:649)    ,
Fee_CanCel_Snd_cr  POSITION(650:657)   ,
Fee_CanCel_Rec_dr  POSITION(658:665)    ,
Fee_CanCel_Rec_cr  POSITION(666:673)   ,
Fee_CanCel_Cen_dr  POSITION(674:681)    ,
Fee_CanCel_Cen_cr  POSITION(682:689)    ,
Fee_Cl_Snd_Branch_dr  POSITION(690:697)   ,
Fee_Cl_Snd_Branch_cr  POSITION(698:705)    ,
Fee_Cl_Rec_Branch_dr  POSITION(706:721)  CHAR,
Fee_Cl_Rec_Branch_cr  POSITION(722:723)  CHAR,
TranType  POSITION(724:724)  CHAR,
CardKind  POSITION(725:726)  CHAR,
CardType  POSITION(727:740)  CHAR,
Bin  POSITION(741:744)  CHAR,
Card  POSITION(745:747)  CHAR,
Er_Max  POSITION(748:756)  CHAR,
Ssn_Cen  POSITION(757:768)    ,
TranAmt_Ori  POSITION(769:771)  CHAR,
TranCode_Ori  POSITION(772:772)  CHAR,
Reserved  POSITION(773:780)  CHAR,
variab1  POSITION(781:781)  CHAR,
variab2  POSITION(782:793)  CHAR,
variab3  POSITION(794:808)  CHAR,
variab4  POSITION(809:814)  CHAR,
variab5  POSITION(815:818)  CHAR,
variab6  POSITION(819:918)  CHAR
)

其中，我每次要针对不同的文件，把文件数据插入到不同的表，我都会读取ctl文件，修改其中的表名，然后再调用sqlldr命令。
3、c++处理sqlldr生成的bad文件，使之变成可以再次导入的文件.
头文件和cpp文件如下：
先是从网上找到的一个把gbk编码的字符数组转换成unicode编码的简单类：

// UtfFile.h: interface for the UtfFile class.
///////////////////////////////////////////////////////////

#if !defined(AFX_UTFFILE_H__8ED10D8A_D1A3_412F_A600_124F521CE4F1__INCLUDED_)
#define AFX_UTFFILE_H__8ED10D8A_D1A3_412F_A600_124F521CE4F1__INCLUDED_

#if _MSC_VER > 1000
#pragma once
#endif // _MSC_VER > 1000

#include <windows.h> 
#include <stdio.h>
#include <locale.h>
#include <IOSTREAM>
#include <FSTREAM>
using namespace std;

class UtfFile 
{
public:
 static char * UnicodeToGB2312(WCHAR uData, char buffer[2]);  // Unicode 转换成 GB2312
 static WCHAR * UTF_8ToUnicode(char *pText, WCHAR &unicode);  // 把UTF-8 转化成 Unicode
 static char * TranslateUTF8ToGB(char *str, size_t len);   // 把UTF-8字符串转化成ANSI（GB2312）编码形式
 char * GetString(char *str, int maxLen = 1024);  // 读取一个字符串，以换行符为结束标示
 char * GetLine(char *str, int maxLen);    // 读取一行字符
 void close();          // 关闭文件流
 int open(const char *sFileName);     // 用于打开一个文件
 UtfFile(const char *sour);
 virtual ~UtfFile();

public:
 ifstream inf;
};

#endif // !defined(AFX_UTFFILE_H__8ED10D8A_D1A3_412F_A600_124F521CE4F1__INCLUDED_)

该类的源文件：


/*-----------------------------------------------以下为.cpp文件内容------------------------------------------*/

// UtfFile.cpp: implementation of the UtfFile class.
//
//////////////////////////////////////////////////////////////////////

#include "UtfFile.h"

//////////////////////////////////////////////////////////////////////
// Construction/Destruction
//////////////////////////////////////////////////////////////////////

UtfFile::UtfFile(const char *sour)
:inf(sour)
{
 if( !inf.is_open() )
 {
  char str[1024];
  sprintf(str, "错误的文件路径，文件无法打开：%s", sour);
  throw runtime_error(str);
 }
}

UtfFile::~UtfFile()
{

}

void UtfFile::close()
{
 inf.close();
}


char * UtfFile::GetLine(char *str, int maxLen)
{
 if ( inf.eof() )
 {
  str[0] = '\0';
 }
 else
 {
  inf.getline(str, maxLen);
  TranslateUTF8ToGB(str, maxLen);
 }
 return str;
}


char * UtfFile::GetString(char *str, int maxLen)
{
 if ( inf.eof() )
 {
  str[0] = '\0';
 }
 else
 {
  inf >> str;
  TranslateUTF8ToGB(str, maxLen);
 }
 return str;
}



char * UtfFile::UnicodeToGB2312(WCHAR uData, char buffer[2])
{
 WideCharToMultiByte(CP_ACP,NULL, &uData, 1,buffer,sizeof(WCHAR),NULL,NULL);
 return buffer;
}

WCHAR * UtfFile::UTF_8ToUnicode(char *pText, WCHAR &unicode)
{ 
/*    http://blog.csdn.net/liuzhiyuan1982/article/details/3911150
UTF-8是一种多字节编码的字符集，表示一个Unicode字符时，它可以是1个至多个字节，在表示上有规律：
1字节：0xxxxxxx
2字节：110xxxxx 10xxxxxx
3字节：1110xxxx 10xxxxxx 10xxxxxx
4字节：11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
*/
 char *uchar = (char *)&unicode; 
 uchar[1] = ((pText[0] & 0x0F) << 4) + ((pText[1] >> 2) & 0x0F);
 uchar[0] = ((pText[1] & 0x03) << 6) + (pText[2] & 0x3F);
 return &unicode;
}

char * UtfFile::TranslateUTF8ToGB(char *str, size_t len)
{
 char * newCharBuffer = new char[len];
 int index =0;
 int nCBIndex = 0;
 WCHAR wTemp = 0;
 char cTemp[2] = " ";
 while(index < len)
 {
  if ( str[index] == 0 )
   break;
  else if(str[index] > 0)  // 如果是GB2312的字符
  {
   newCharBuffer[nCBIndex] = str[index];    //直接复制
   index += 1;    //源字符串偏移量1
   nCBIndex += 1;   //目标字符串偏移量1
  }
  else      //如果是UTF-8的字符
  {
   UTF_8ToUnicode(str + index, wTemp);   //先把UTF-8转成Unicode
   UnicodeToGB2312(wTemp, &newCharBuffer[nCBIndex]); //再把Unicode 转成 GB2312
   index += 3;    //源字符串偏移量3
   nCBIndex += 2;   //目标字符串偏移量2  因为一个中文UTF-8占3个字节，GB2312占两个字节
  }
 }
 newCharBuffer[nCBIndex] = '\0'; //结束符
 strcpy( str, newCharBuffer );
 delete newCharBuffer;  //避免内存泄漏，这是对源代码的稍许修改
 newCharBuffer = NULL;
 return str;  
}

int UtfFile::open(const char *sFileName)
{
 inf.open(sFileName);
 return inf.is_open();
}

下面是我完成的main文件，大致就是对要处理的文件，读出一行，转码，替换里面的半中文字符‘？’，然后写入要输出的文件：

#include "UtfFile.h"
#include<string>
#include<vector>
#include<io.h>
#include<cstdio>
using namespace std;
void getFiles(string path, vector<string>& files)
{
    //文件句柄
    intptr_t   hFile = 0;
    //文件信息
    struct _finddata_t fileinfo;
    string p;
    if ((hFile = _findfirst(p.assign(path).append("\\*").c_str(), &fileinfo)) != -1)
    {
        do
        {
            //如果是目录,迭代之
            //如果不是,加入列表
            if ((fileinfo.attrib &  _A_SUBDIR))
            {
                if (strcmp(fileinfo.name, ".") != 0 && strcmp(fileinfo.name, "..") != 0)
                    getFiles(p.assign(path).append("\\").append(fileinfo.name), files);
            }
            else
            {
                files.push_back(p.assign(path).append("\\").append(fileinfo.name));
            }
        } while (_findnext(hFile, &fileinfo) == 0);
        _findclose(hFile);
    }
}
char* G2U(const char* gb2312)
{
    int len = MultiByteToWideChar(CP_ACP, 0, gb2312, -1, NULL, 0);
    wchar_t* wstr = new wchar_t[len + 1];
    memset(wstr, 0, len + 1);
    MultiByteToWideChar(CP_ACP, 0, gb2312, -1, wstr, len);
    len = WideCharToMultiByte(CP_UTF8, 0, wstr, -1, NULL, 0, NULL, NULL);
    char* str = new char[len + 1];
    memset(str, 0, len + 1);
    WideCharToMultiByte(CP_UTF8, 0, wstr, -1, str, len, NULL, NULL);
    if (wstr) delete[] wstr;
    return str;
}
void do_bad(string in_file, string out_file)
{
    UtfFile  myfile(in_file.c_str());  //读bad文件

    ofstream outfile;
    outfile.open(out_file, ios::trunc); //写文件
    string tempstr;
    char* mystr;

    while (std::getline(myfile.inf, tempstr))
    {

        mystr = const_cast<char*>(tempstr.c_str());
        mystr = G2U(mystr);
        UtfFile::TranslateUTF8ToGB(mystr, 918);
        char* str_bk = mystr;
        for (int j = 0; j<918; j++)//可用find函数优化
        {
            if (*mystr == '?') //replace 
            {
                *mystr = '1';
                break;

            }
            mystr++;
        }
        outfile << str_bk << '\n';//
    }
    outfile.close();
    myfile.close();


}

int main()
{

  //char strbuffer[1024]={0};
  //mystr=strbuffer;
  //
  char * filePath = "C:\\pythonfile\\sqlldr_bsibsa\\bad_process";
  //\\bad_files
  vector<string> files;
  vector<string> in_files;
  vector<string> out_files;

  获取该路径下的所有文件
  getFiles(filePath, files);
  //BSA2017060701_01.bad

  int total_size = files.size();
  for (int i = 0; i < total_size; i++)
  { 
      //cout << files[i] << endl;
      int pos = files[i].find_last_of('\\');
      string name_with_exp(files[i].substr(pos + 1));
      string base_dir(files[i].substr(0,pos+1));
      cout << name_with_exp.size() << endl;
      if (name_with_exp.size() == 20)
      {

          in_files.push_back(files[i]);

          //cout << s << endl;
          string name_only = name_with_exp.substr(0, 16);
         // char str[50];
          cout << name_only << endl;
         // sprintf(str,"%s.bad",name_only);
         // cout << str << endl;
          string full_path;
          full_path = base_dir + name_only + ".dat";
          //cout << full_path << endl;
          out_files.push_back(full_path);

      }
  }
  int input_size = in_files.size();
  for (int i = 0; i < input_size; i++)
  {
      do_bad(in_files[i], out_files[i]);
  }
  //C:\\pythonfile\\sql_loader\\bad_test\\BSA2016010101.dat
  //C:\\pythonfile\\sql_loader\\bad_test\\BSA2016010101.bad

  //

  //UtfFile  myfile("C:\\pythonfile\\sql_loader\\bad_test\\1.dat");  //读bad文件

  return 0;
}

4、c++程序处理好bad数据使之可以用之后，我又写了一个脚本，和专门的ctl文件，再次把这些bad文件批量导进去：

import os
basedir=os.getcwd()
#print(basedir)
files_list=os.listdir(basedir)
bad_dir='%s\\badd_files'%basedir
log_dir='%s\\logg_files'%basedir
if not os.path.exists(bad_dir):
    os.mkdir(bad_dir)
if not os.path.exists(log_dir):
    os.mkdir(log_dir)
succ=0
for file_name in files_list:
    if len(file_name) != 20:
        continue
    if ('BS'  in file_name) and ('dat' in file_name):
        #print(file_name)
        table_name=file_name[0:3]+'_'+file_name[3:11]
        #print(table_name)
        #newname = file_name[:-4] + '.dat'
        #os.rename(file_name,newname)
        lines=open('bsa_bad.ctl','r').readlines()
        for k in range(len(lines)):
            if 'INTO TABLE' in lines[k]:
                lines[k]='INTO TABLE %s\n'%table_name
        f=open('bsa_bad.ctl','w')
        f.writelines(lines)
        f.close()


        file_base=file_name[0:16]
        #print(file_base)
            #import
        cmd_str="sqlldr xxx/xxx@xxx/orcl control=%s\\bsa_bad.ctl data=%s "%(basedir,file_name)
        cmd_str+="bad=%s\\badd_files\\%s.bad log=%s\\logg_files\\%s.log direct=true"%(basedir,file_base,basedir,file_base)
        #print(cmd_str)
        os.system(cmd_str)
        succ+=1
    else:
        continue

print("succ:%d"%succ)

这个文件又生成了极小的bad文件，我分析了下，都是个别在源文件就是乱码的数据，可以分门别类留存下来，不必理会。

注意：其中oracle登陆远程服务器的用户名，密码，ip地址都被我用xxx代替了
欢迎提问！

啊啊啊西吧

关注

1
点赞
踩
9

收藏

觉得还不错? 一键收藏
0
评论
python3脚本使用sql loader批量导入字节文件并二次处理

由于使用cx_oracle连接oracle然后导入数据文件速度太慢（要导的数据有几百G），出于速度的考虑，选择了oracle自带的sql loader工具，使用python脚本写界面，与用户交互，然后使用system（）去批量调用sql loader，对于导入失败的bad数据以及日志数据，我用了c++写了一个程序，把bad文件转换成了ok的数据，然后又写了一个脚本处理这些bad数据。 1、pyth
复制链接

扫一扫