Importing DBF files into an Oracle database

At work I needed to import DBF files into an Oracle database. Two files are involved. The first, simpledbf.py, reads a DBF file and produces a pandas DataFrame; it is adapted from the simpledbf module of the simpledbf package, which did not distinguish between int and float for number (N) columns, whereas the modified version does (see the N-type handling in _get_recs below). The second file, dbf_import.py, performs the actual import into the database when run. Its basic flow is as follows (a condensed sketch follows the list):

  1. Set the required configuration items;
  2. Read the DBF file and generate a CSV file;
  3. Create the target table from the DBF field definitions;
  4. Load the CSV file into the newly created table;
  5. Compare the number of rows in the DBF file with the number of rows loaded into the database.
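
A condensed sketch of this flow, using the names defined in dbf_import.py further below (csv_path, ctl_path and user_pass_at_db stand in for values built from dict_value and oracle_source; this is an outline, not a standalone script):

# Outline of dbf_import.py (see the full listing below)
dbf_info = dbf5(dict_value['dbf_name'], codec=oracle_source[3])       # open the DBF file
with open(csv_path, 'a', encoding=oracle_source[3]) as f:             # step 2: dump the records to CSV in chunks
    for chunk in dbf_info.to_dataframe(chunksize=1000000):
        chunk.to_csv(f, index=False, columns=list_c, header=False, quoting=1)
source_engin.execute(create_table_sql.format(a=dict_value))           # step 3: create the target table
subprocess.run(['sqlldr', user_pass_at_db, ctl_path])                 # step 4: load the CSV with SQL*Loader
print(dbf_info.numrec)                                                # step 5: compare with select count(1) on the table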

Shortcomings of the program:

  1. Only character and numeric field types are handled specifically; dates and other field types receive no further treatment and are uniformly mapped to varchar2(200). A possible extension for date fields is sketched after this list.
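
One way to address this (a sketch only, not part of the original script): in the field loop of dbf_import.py, DBF date fields (type 'D') could be mapped to an Oracle DATE column, with a date mask added to the SQL*Loader column list. The 'YYYY-MM-DD' mask assumes the dates land in the CSV in ISO format, which is how pandas renders the datetime.date objects produced by simpledbf.

# Hedged sketch of an additional branch for DBF date ('D') fields
elif x[1]=='D':
    # create a real DATE column instead of varchar2(200)
    list_a.append(column_prefix+x[0].lower()+" date")
    # tell SQL*Loader how the value is formatted in the CSV
    list_b.append(column_prefix+x[0].lower()+' DATE "YYYY-MM-DD"')
    list_c.append(x[0].lower())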

The relevant configuration is as follows (located in dbf_import.py):

#########################################################################
#Only the parameters below need to be modified
#Parameter definitions; the counter side must first have created the specified user under which the table will be created
#output_path must not end with a trailing backslash
dict_value={'schema_name':'db_user','table_name':'table_name','dbf_name':r'D:\dbf_name.DBF','output_path':r'd:\path_of_dir'} 
#To import all columns, write import_cols='*'
#To import a subset of columns, define a list; case does not matter, names are converted to lower case
import_cols= ['ymth','khrq','zjlb','zjdm']
#import_cols='*'
#Username, password, database and encoding used for the connection; the username defaults to schema_name in dict_value
oracle_source=(dict_value['schema_name'],'password','db_name','GB18030')
#If a column name in the DBF file is a database keyword, add a prefix to avoid problems
column_prefix='st_'
#########################################################################

The contents of simpledbf.py are as follows:

import struct
import datetime
import os
import codecs

# Check for optional dependencies.
try:
    import pandas as pd
except ImportError:
    print("Pandas is not installed. No support for DataFrames, HDF, or SQL.")
else:
    try:
        import sqlalchemy as sql
    except ImportError:
        print("SQLalchemy is not installed. No support for SQL output.")

class DbfBase(object):
    '''
    Base class for DBF file processing objects.

    Do not instantiate this class. This provides some of the common functions
    for other subclasses.
    '''
    def _chunker(self, chunksize):
        '''Return a list of chunk ints from given chunksize.

        Parameters
        ----------
        chunksize : int
            The maximum chunk size 

        Returns
        -------
        list of ints
            A list of chunks necessary to break up a given file. These will
            all be equal to `chunksize`, except for the last value, which is
            the remainder (<= `chunksize`).
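            For example, numrec=2500 with chunksize=1000 gives [1000, 1000, 500].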
        '''
        num = self.numrec//chunksize
        # Chunksize bigger than numrec
        if num == 0:
            return [self.numrec,]
        else:
            chunks = [chunksize,]*num
            remain = self.numrec%chunksize
            if remain != 0:
                chunks.append(remain) 
            return chunks

    def _na_set(self, na):
        '''Set the value used for missing/bad data.

        Parameters
        ----------
        na : various types accepted
            The value that will be used to replace missing or malformed
            entries. Right now this accepts pretty much anything, and that
            value will be used as a replacement. (May not do what you expect.)
            However, the strings 'na' or 'nan' (case insensitive) will insert
            float('nan'), and the string 'none' (case insensitive) will insert
            the Python object `None`.  Float/int columns are always
            float('nan') regardless of this setting.
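            For example, na='none' stores None, while na='NA' stores float('nan').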
        '''
        if na.lower() == 'none':
            self._na = None
        elif na.lower() in ('na', 'nan'):
            self._na = float('nan')
        else:
            self._na = na
          


    def to_dataframe(self, chunksize=None, na='nan'):
        '''Return the DBF contents as a DataFrame.

        Parameters
        ----------
        chunksize : int, optional
            Maximum number of records to process at any given time. If 'None'
            (default), process all records.

        na : various types accepted, optional
            The value that will be used to replace missing or malformed
            entries. Right now this accepts pretty much anything, and that
            value will be used as a replacement. (May not do what you expect.)
            However, the strings 'na' or 'nan' (case insensitive) will insert
            float('nan'), and the string 'none' (case insensitive) will insert
            the Python object `None`. Default for DataFrame is NaN ('nan');
            however, float/int columns are always float('nan').

        Returns
        -------
        DataFrame (chunksize == None)
            The DBF file contents as a Pandas DataFrame

        Generator (chunksize != None)
            This generator returns DataFrames with the maximum number of
            records equal to chunksize. (May be less)

        Notes
        -----
        This method requires Pandas >= 0.15.2.
        '''
        self._na_set(na)
        if not chunksize:
            # _get_recs is a generator, convert to list for DataFrame
            results = list(self._get_recs())
            df = pd.DataFrame(results, columns=self.columns)
            del(results) # Free up the memory? If GC works properly
            return df
        else:
            # Return a generator function instead
            return self._df_chunks(chunksize)

    def _df_chunks(self, chunksize):
        '''A DataFrame chunk generator.

        See `to_dataframe`.
        '''
        chunks = self._chunker(chunksize)
        # Keep track of the index, otherwise every DataFrame will be indexed
        # starting at 0
        idx = 0
        for chunk in chunks:
            results = list(self._get_recs(chunk=chunk))
            num = len(results) # Avoids skipped records problem
            df = pd.DataFrame(results, columns=self.columns, 
                              index=range(idx, idx+num))
            idx += num
            del(results) 
            yield df
    
        

class dbf5(DbfBase):
    '''
    DBF version 5 file processing object.

    This class defines the methods necessary for reading the header and
    records from a version 5 DBF file.  Much of this code is based on an
    `ActiveState DBF example`_, which only worked for Python2.

    .. ActiveState DBF example: http://code.activestate.com/recipes/
            362715-dbf-reader-and-writer/

    Parameters
    ----------

    dbf : string
        The name (with optional path) of the DBF file.

    codec : string, optional
        The codec to use when decoding text-based records. The default is
        'utf-8'. See Python's `codec` standard lib module for other options.

    Attributes
    ----------

    dbf : string
        The input file name.

    f : file object
        The opened DBF file object

    numrec : int
        The number of records contained in this file.
    
    lenheader : int
        The length of the file header in bytes.

    numfields : int
        The number of data columns.

    fields : list of tuples
        Column descriptions as a tuple: (Name, Type, # of bytes, # of decimals).

    columns : list
        The names of the data columns.

    fmt : string
        The format string that is used to unpack each record from the file.

    fmtsiz : int
        The size of each record in bytes.
    '''
    def __init__(self, dbf, codec='utf-8'):
        self._enc = codec
        path, name = os.path.split(dbf)
        self.dbf = name
        # Escape quotes, set by individual runners
        self._esc = None
        # Reading as binary so bytes will always be returned
        self.f = open(dbf, 'rb')

        self.numrec, self.lenheader = struct.unpack('<xxxxLH22x', 
                self.f.read(32))    
        self.numfields = (self.lenheader - 33) // 32

        # The first field is always a one byte deletion flag
        fields = [('DeletionFlag', 'C', 1,0),]
        for fieldno in range(self.numfields):
            name, typ, size,deci = struct.unpack('<11sc4xBB14x', self.f.read(32))
            # eliminate NUL bytes from name string  
            name = name.strip(b'\x00')        
            fields.append((name.decode(self._enc), typ.decode(self._enc), size,deci))
        self.fields = fields
        # Get the names only for DataFrame generation, skip delete flag
        self.columns = [f[0].lower() for f in self.fields[1:]]
        
        terminator = self.f.read(1)
        assert terminator == b'\r'
     
        # Make a format string for extracting the data. In version 5 DBF, all
        # fields are some sort of structured string
        self.fmt = ''.join(['{:d}s'.format(fieldinfo[2]) for 
                            fieldinfo in self.fields])
        self.fmtsiz = struct.calcsize(self.fmt)

    def _get_recs(self, chunk=None):
        '''Generator that returns individual records.

        Parameters
        ----------
        chunk : int, optional
            Number of records to return as a single chunk. Default 'None',
            which uses all records.
        '''
        if chunk is None:
            chunk = self.numrec

        for i in range(chunk):
            # Extract a single record
            record = struct.unpack(self.fmt, self.f.read(self.fmtsiz))
            # If delete byte is not a space, record was deleted so skip
            if record[0] != b' ': 
                continue  
            
            # Save the column types for later
            result = []
            for idx, value in enumerate(record):
                name, typ, size,deci = self.fields[idx]
                if name == 'DeletionFlag':
                    continue

                # String (character) types, remove excess white space
                if typ == "C":
                    value = value.strip()
                    # Convert empty strings to NaN
                    if value == b'':
                        value = self._na
                    else:
                        value = value.decode(self._enc,errors='ignore')
                        # Escape quoted characters
                        if self._esc:
                            value = value.replace('"', self._esc + '"')

                # Numeric type. Stored as string
                elif typ == "N":
                    # A decimal should indicate a float
                    if b'.' in value:
                        value = float(value)
                    # No decimal, probably an integer, but if that fails,
                    # probably NaN
                    else:
                        try:
                            value = int(value)
                        except ValueError:
                            # I changed this for SQL->Pandas conversion
                            # Otherwise floats were not showing up correctly
                            value = float('nan')

                # Date stores as string "YYYYMMDD", convert to datetime
                elif typ == 'D':
                    try:
                        y, m, d = int(value[:4]), int(value[4:6]), \
                                  int(value[6:8])
                    except ValueError:
                        value = self._na
                    else:
                        value = datetime.date(y, m, d)

                # Booleans can have multiple entry values
                elif typ == 'L':
                    if value in b'YyTt':
                        value = True
                    elif value in b'NnFf':
                        value = False
                    # '?' indicates an empty value, convert this to NaN
                    else:
                        value = self._na

                # Floating points are also stored as strings.
                elif typ == 'F':
                    try:
                        value = float(value)
                    except ValueError:
                        value = float('nan')

                else:
                    err = 'Column type "{}" not yet supported.'
                    raise ValueError(err.format(typ))

                result.append(value)
            yield result
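
For reference, a minimal usage sketch of the dbf5 class above (the file name and codec are the placeholders from the configuration section; adjust as needed):

# Minimal usage sketch of simpledbf.dbf5
from simpledbf import dbf5

dbf = dbf5(r'D:\dbf_name.DBF', codec='GB18030')
print(dbf.numrec, dbf.columns)       # record count and lower-cased column names
df = dbf.to_dataframe()              # whole file as a single DataFrame
# For large files, re-open and read in chunks instead
for chunk in dbf5(r'D:\dbf_name.DBF', codec='GB18030').to_dataframe(chunksize=100000):
    print(len(chunk))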
    

The contents of dbf_import.py are as follows:

from sqlalchemy import MetaData,Table,create_engine
import numpy as np
import pandas as pd
import time,subprocess,pathlib,os
from simpledbf import dbf5
#########################################################################
#Only the parameters below need to be modified
#Parameter definitions; the counter side must first have created the specified user under which the table will be created
#output_path must not end with a trailing backslash
dict_value={'schema_name':'db_user','table_name':'table_name','dbf_name':r'D:\dbf_name.DBF','output_path':r'd:\path_of_dir'} 
#To import all columns, write import_cols='*'
#To import a subset of columns, define a list; case does not matter, names are converted to lower case
import_cols= ['ymth','khrq','zjlb','zjdm']
#import_cols='*'
#Username, password, database and encoding used for the connection; the username defaults to schema_name in dict_value
oracle_source=(dict_value['schema_name'],'password','db_name','GB18030')
#If a column name in the DBF file is a database keyword, add a prefix to avoid problems
column_prefix='st_'
#########################################################################

#SQL*Loader control-file template
sqlloader_tpl='''
    LOAD DATA
    INFILE '{a[output_path]}\{a[table_name]}.csv'
    BADFILE '{a[output_path]}\{a[table_name]}.bad'
    DISCARDFILE '{a[output_path]}\{a[table_name]}.dsc'
    truncate
    INTO TABLE {a[table_name]}
    fields terminated by X'2C'
    enclosed by '"'
    TRAILING NULLCOLS
  ({a[column_list]})
'''
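# With the example dict_value and import_cols above, the rendered control file would look
# roughly like this (assuming those four fields appear in this order in the DBF):
#     LOAD DATA
#     INFILE 'd:\path_of_dir\table_name.csv'
#     BADFILE 'd:\path_of_dir\table_name.bad'
#     DISCARDFILE 'd:\path_of_dir\table_name.dsc'
#     truncate
#     INTO TABLE table_name
#     fields terminated by X'2C'
#     enclosed by '"'
#     TRAILING NULLCOLS
#   (st_ymth,st_khrq,st_zjlb,st_zjdm)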
#CREATE TABLE template
create_table_sql='''
create table {a[schema_name]}.{a[table_name]}
(
  {a[create_table_colunms]}
)
'''
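# With the same configuration, the rendered statement looks roughly like the following
# (the varchar2/number sizes depend on the DBF field definitions and are hypothetical here):
# create table db_user.table_name
# (
#   st_ymth varchar2(20) default ' ',st_khrq number(8,0) default 0,
#   st_zjlb varchar2(2) default ' ',st_zjdm varchar2(20) default ' '
# )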
#Check that the database connection works
source_engin=create_engine('oracle+cx_oracle://{0[0]}:{0[1]}@{0[2]}'.format(oracle_source),connect_args={'encoding':oracle_source[3]},echo=True) 

try:
    source_engin.execute('select 1 from dual')
    print('Database connection OK')
except Exception as  e:
    print(repr(e))
    print('Database connection failed; please check the configuration items')
    os._exit(0)
    
#Check whether the output path exists; create it if it does not:
if not pathlib.Path(dict_value['output_path']).is_dir():
    try:
        pathlib.Path(dict_value['output_path']).mkdir()
        print('Created directory '+dict_value['output_path']+' successfully')
    except Exception as  e:
        print(repr(e))
        print('Failed to create directory '+dict_value['output_path'])
        os._exit(0)
if not pathlib.Path(dict_value['dbf_name']).is_file():
    print('Check that dict_value[dbf_name] is correct; the file ['+dict_value['dbf_name']+'] does not exist')
    os._exit(0)

    
#Validate the import_cols parameter
dbf_info=dbf5(dict_value['dbf_name'],codec=oracle_source[3])
if import_cols =='*':
    import_columns=[str(x).lower() for x in dbf_info.columns]
elif isinstance(import_cols,list):
    import_columns=[str(x).lower() for x in import_cols]
else:
    print("import_cols must be '*' or a list")
    os._exit(0)
#used when creating the table
list_a=[]
#used when generating the ctl file
list_b=[]
#used when generating the csv file
list_c=[]
for x in dbf_info.fields:
    if (x[0].lower() in import_columns):
        if x[1]=='C':
            list_a.append(column_prefix+x[0].lower()+ " varchar2("+str(x[2])+") default ' '" )
            list_b.append(column_prefix+x[0].lower())
            list_c.append(x[0].lower())
        elif x[1]=='N':
            list_a.append(column_prefix+x[0].lower()+ " number("+str(x[2])+","+str(x[3])+") default 0")
            list_b.append(column_prefix+x[0].lower())
            list_c.append(x[0].lower())
        else:
            list_a.append(column_prefix+x[0].lower()+ " varchar2(200) default ' ' " )
            list_b.append(column_prefix+x[0].lower())
            list_c.append(x[0].lower())
dict_value['create_table_colunms']=','.join((list_a))       
dict_value['column_list']=','.join((list_b))

if not list_b:
    print('No columns to import; please check the import_cols parameter')
    os._exit(0)
#Create the table
table_count=source_engin.execute("select count(1) from user_tables a where a.table_name=upper('{a[table_name]}')".format(a=dict_value)).fetchone()[0]
if table_count ==1:
    try:
        source_engin.execute("drop table {a[schema_name]}.{a[table_name]}".format(a=dict_value))
        print('Dropped table '+dict_value['schema_name']+'.'+dict_value['table_name']+' successfully')
        source_engin.execute(create_table_sql.format(a=dict_value))
        print('Created table '+dict_value['schema_name']+'.'+dict_value['table_name']+' successfully')
    except Exception as  e:
        print(repr(e))
        print('Error executing SQL; please check the relevant privileges')
        os._exit(0)
else:
    try:
        source_engin.execute(create_table_sql.format(a=dict_value))
        print('Created table '+dict_value['schema_name']+'.'+dict_value['table_name']+' successfully')
    except Exception as  e:
        print(repr(e))
        print('Error executing SQL; please check the relevant privileges')
        os._exit(0)
#Generate the ctl file
print('#'*20,'Generating ctl and csv files: start','#'*20)
try:
    start_time=time.time()
    with open(r'{a[output_path]}\{a[table_name]}.ctl'.format(a=dict_value),'w') as f:
        f.write(sqlloader_tpl.format(a=dict_value))
    #Read the DBF file and generate the csv file
    file_name=pathlib.Path('{a[output_path]}\{a[table_name]}.csv'.format(a=dict_value))
    if file_name.exists():
        file_name.unlink()
    with open(str(file_name),'a',encoding=oracle_source[3]) as file_handle:
        for x in dbf_info.to_dataframe(chunksize=1000000):
            x.to_csv(file_handle,index=False,columns=list_c,header=False,quoting=1,encoding=oracle_source[3])
            
    print('CSV file generated successfully, elapsed time:',time.time()-start_time)
except Exception as  e:
    print(repr(e))
    print('Failed to generate the ctl and CSV files; check that the directory [{a[output_path]}] is writable and that the files [{a[output_path]}\{a[table_name]}.ctl|{a[output_path]}\{a[table_name]}.csv] are not in use'.format(a=dict_value))
    os._exit(0)
print('#'*20,'Generating ctl and csv files: end','#'*20)
load_start=time.time()
print('#'*20,'Data load: start','#'*20)
k=subprocess.run(['sqlldr', oracle_source[0]+'/'+oracle_source[1]+'@'+oracle_source[2],r'{a[output_path]}\{a[table_name]}.ctl'.format(a=dict_value)],stdout=open(r'{a[output_path]}\{a[table_name]}.log'.format(a=dict_value),'a'),stderr=open(r'{a[output_path]}\{a[table_name]}.err'.format(a=dict_value),'a'),encoding=oracle_source[3])
if k.returncode == 0:
    print('Data load succeeded!')
    print('Load into database, elapsed time:',time.time()-load_start)
else:
    print('Data load failed!')
print('#'*20,'Data load: end','#'*20)
print('The DBF file contains',dbf_info.numrec,'records in total')
try:
    count_value=source_engin.execute('select count(1) as rownums from {a[schema_name]}.{a[table_name]}'.format(a=dict_value)).fetchone()
    print('The database table contains '+str(count_value[0])+' records in total!')
except Exception as  e:
    print('Failed to query the row count of the table in the database; reason: '+repr(e))
    os._exit(0)
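
The script only prints the two counts side by side; a small extension (a sketch, not part of the original file, reusing the variables defined above) could flag a mismatch explicitly. Note that numrec comes from the DBF header and also counts records flagged as deleted, which _get_recs skips, so small differences can be legitimate:

# Possible extension: compare the two counts explicitly
if count_value[0] != dbf_info.numrec:
    print('Row count mismatch: DBF header reports', dbf_info.numrec,
          'records but the table contains', count_value[0])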

Execution steps:

  1. Put the two files in the same directory; otherwise importing simpledbf will fail
  2. Edit the configuration section in dbf_import.py
  3. Run python dbf_import.py

Reposted from: https://my.oschina.net/u/1403503/blog/2966818
