一、背景:公司要从FPT上下载一些数据文件,主要有XLS,XLSX,CSV三种文件,XLS的解析用了第三方库libxls 请看链接 C语言接口libxls使用_skv00d00的博客-CSDN博客
至于xlsx就比较麻烦了,虽然有不少诸如openxlsx这种优秀的开源库,但是无奈服务器上的编译器版本太低,gcc 4.8.5的,cmake2.8.x,这就给编译一些开源库带来太大的麻烦。好在xlsx文件的本质是zip+xml这种开发式的文件,于是动手自己解析
动手自己解析的前提:
0. centos7 os
1.借助unzip解压缩
由于minizip需要cmake3.1.5较服务器的版本比较高,但是服务器上面的诸如编译器版本不能随意升级。所以无法直接使用Minizip,于是先要借助unzip命令去解压缩 xlsx的文件。
unzip security.xlsx -d xlsxzip
1. 不在使用以前的unzip 命令解压缩,而是将miniunz.c和unzip.c从zlib源码中拆出来,然后和readxlsx.c一起编译已在github上开源。
https://github.com/overzeus/readxlsx
2. yum -y install libxml2-devel
3.此解析程序只解析基础数据,例如公式不在解析范畴
4.该程序只是演示。
二、代码
1.先解压缩security.xlsx 文件
unzip security.xlsx -d xlsxzip
得到如下文件夹,红框的文件都是将要解析的文件
二、头文件
readxlsx.h
#ifndef _READXLSX_H_
#define _READXLSX_H_
#ifndef _IN_
#define _IN_
#endif
#ifndef _OUT_
#define _OUT_
#endif
#ifndef _IN_OUT_
#define _IN_OUT_
#endif
#define SHEET_1_IDX 1
#define SHEET_2_IDX 2
#define SHEET_3_IDX 3
#define SHEET_4_IDX 4
#define SHEET_5_IDX 5
#define SHEET_6_IDX 6
#define SHEET_7_IDX 7
#define SHEET_8_IDX 8
#define COL_A_IDX 0
#define COL_B_IDX 1
#define COL_C_IDX 2
#define COL_D_IDX 3
#define COL_E_IDX 4
#define COL_F_IDX 5
#define COL_G_IDX 6
#define COL_H_IDX 7
#define COL_I_IDX 8
#define COL_J_IDX 9
#define COL_K_IDX 10
#define COL_L_IDX 11
#define COL_M_IDX 12
#define COL_N_IDX 13
#define COL_O_IDX 14
#define COL_P_IDX 15
#define COL_Q_IDX 16
#define COL_R_IDX 17
#define COL_S_IDX 18
#define COL_T_IDX 19
#define COL_U_IDX 20
#define COL_V_IDX 21
#define COL_W_IDX 22
#define COL_X_IDX 23
#define COL_Y_IDX 24
#define COL_Z_IDX 25
#define _DEBUG_XLSX_
#ifdef _DEBUG_XLSX_
#define _DEBUG_STR_LN_(STR) printf(#STR":[%s]\n",(STR))
#define _DEBUG_INT_LN_(INTNUM) printf(#INTNUM":[%d]\n",(INTNUM))
#define _DEBUG_CHAR_LN_(CH) printf(#CH":[%c]\n",(CH))
#endif
#endif
三、代码文件
readxlsx.c
#include<stdio.h>
#include<string.h>
#include<libxml/parser.h>
#include<libxml/tree.h>
#include <errno.h>
#include <ctype.h>
#include "readxlsx.h"
#define XL_DIR "xl/"
#define WORKSHEETS_XML_DIR XL_DIR"worksheets/"
#define WORKBOOK_XML_RELS_PATH XL_DIR"_rels/workbook.xml.rels"
#define WORKBOOK_XML_PATH XL_DIR"workbook.xml"
#define SHAREDSTRINGS_XML_PATH XL_DIR"sharedStrings.xml"
typedef unsigned char bytes_t; /* 与libxml2中的xmlChar相同类型 */
typedef struct _CELL_
{
char value[2048];
}Cell;
typedef struct _SHEET_
{
char sheetName[100+1];
char id[10+1];
Cell **cells;
int cellrows;
int cellcols;
char target[256+1];/* 在xl/_rels/workbook.xml.rels可以找到sheetname与sheet.xml对应关系,例如第一个sheet的i名字叫 aaa , 则对应xl/worksheets/sheet1.xml,即target */
}WorkSheet;
typedef struct _SHARED_STRING_
{
char value[2048];
}SharedString;
typedef struct _BOOK_
{
WorkSheet *workSheets;
int sheetcnt;
char unziprootdir[256+1];/* xlsx格式分两层,第一层是zip格式,第二层是xml。所以先要进行zip格式的解压缩到一个目录下 */
SharedString *sharedstrings;/* 对应shareStrings.xml中的值 */
}WorkBook;
/*
函数:在workbook.xml文件中获取各个sheet的名字
返回值:0-成功,<0失败
*/
int ParseWorkBookXml(const char * _IN_ xmlcontent,const int _IN_ size, WorkBook * _OUT_ wBook)
{
if( xmlcontent == NULL || size <= 0 || wBook == NULL )
return -1;
xmlDocPtr doc = NULL;
xmlNodePtr root = NULL,node = NULL, sheetsnode = NULL, detail = NULL;
xmlChar * sheetname = NULL, *value = NULL, *id = NULL;
int sheetcnt = 0;
int sheetidx = 0;
int wsheetsize = 0;
doc = xmlParseMemory(xmlcontent,size); //parse xml in memory
if( doc == NULL )
return -1;
root = xmlDocGetRootElement(doc);
/*
取每一个sheet的名字
<sheets>
<sheet name="11111" sheetId="1" r:id="rId1" />
<sheet name="213213123" sheetId="2" r:id="rId2" />
<sheet name="Sheet3" sheetId="3" r:id="rId3" />
<sheet name="Sheet1" sheetId="4" r:id="rId4" />
</sheets>
*/
for( node = root->children; node; node = node->next ){
if(xmlStrcasecmp(node->name,BAD_CAST"sheets")==0)
break;
}
if(node==NULL){
return -1;
}
sheetsnode = node;
/* 计算有多少个sheet */
sheetcnt = 0;
for(node=node->children;node;node=node->next){
sheetcnt++;
}
/* 由于sheetId从1开始数,为了取数方便,所以浪费数组0位置 */
wsheetsize = (sizeof(WorkSheet))*(sheetcnt+1);
wBook->workSheets = (WorkSheet *)malloc(wsheetsize);
if( wBook->workSheets == NULL )
return -errno;
memset(wBook->workSheets, 0x00, wsheetsize);
wBook->sheetcnt = sheetcnt+1;
node = sheetsnode;
for(node=node->children;node;node=node->next){
if(xmlStrcasecmp(node->name,BAD_CAST"sheet")==0){
sheetname = xmlGetProp(node,BAD_CAST"name");
sheetidx = atoi(xmlGetProp(node,BAD_CAST"sheetId"));
id = xmlGetProp(node,BAD_CAST"id");
strcpy(wBook->workSheets[sheetidx].sheetName, sheetname);
strcpy(wBook->workSheets[sheetidx].id, id);
}
}
return 0;
}
/*
函数:查找Id是否在worksheet数组中
返回值:0-否, 非0-是,即WorkSheet数组中对应的下标
*/
int ContainsId(char * _IN_ id, WorkSheet * _IN_ wsheets, int _IN_ sheetscnt)
{
if( wsheets == NULL || sheetscnt <= 0 )
return 0;
int ii = 0;
for(ii = SHEET_1_IDX; ii < sheetscnt; ii++ ){
if( strcmp(wsheets[ii].id, id) == 0 ){
return ii;
}
}
return 0;
}
/*
函数:通过sheetname的id,在 xl/_rels/workbook.xml.rels找出对应的sheet[1...n].xml文件
返回值:0-成功,<0-失败
*/
int ParseWorkBookXmlRels(const char * _IN_ xmlcontent,const int _IN_ size, WorkBook * _OUT_ wBook)
{
if( xmlcontent == NULL || size <= 0 || wBook == NULL )
return -1;
xmlDocPtr doc = NULL;
xmlNodePtr root = NULL,node = NULL;
xmlChar * id = NULL, *target = NULL, *pend = NULL;
int sheetcnt = 0;
int idx = 0;
doc = xmlParseMemory(xmlcontent,size); //parse xml in memory
if( doc == NULL )
return -1;
root = xmlDocGetRootElement(doc);
for( node = root->children; node; node = node->next ){
if(xmlStrcasecmp(node->name,BAD_CAST"Relationship")==0){
id = xmlGetProp(node,BAD_CAST"Id");
if((idx=ContainsId(id,wBook->workSheets,wBook->sheetcnt))){
target = xmlGetProp(node,BAD_CAST"Target");
pend = strstr(target,"/");
pend++;
snprintf(wBook->workSheets[idx].target,sizeof(wBook->workSheets[idx].target),"%s/%s/%s",wBook->unziprootdir,WORKSHEETS_XML_DIR,pend);
sheetcnt++;
}
}
}
return (sheetcnt > 0 ? 0:-1 );
}
/*
函数:解析sharedStrings.xml,这个xml存放着各个sheet中类型为字符串的值(或者说t=s)
返回值:0-成功,<0失败
*/
int ParseSharedStringsXml( const char * _IN_ xmlcontent, const int _IN_ size, WorkBook *wBook)
{
if( xmlcontent == NULL || size <= 0 || wBook == NULL)
return -1;
xmlDocPtr doc = NULL;
xmlNodePtr root = NULL,node = NULL, tnode = NULL;
xmlChar * t = NULL, *scnt = NULL;
int count = 0;
int ii = 0;
doc = xmlParseMemory(xmlcontent,size); //parse xml in memory
if( doc == NULL )
return -1;
root = xmlDocGetRootElement(doc);
scnt = xmlGetProp(root,BAD_CAST"count");
if( scnt == NULL )
return 0;
count = atoi(scnt);
if(count == 0)
return 0;
wBook->sharedstrings = (SharedString *)malloc(sizeof(SharedString)*count);
if(wBook->sharedstrings == NULL)
return -errno;
ii = 0;
for( node = root->children; node; node = node->next ){
if(xmlStrcasecmp(node->name,BAD_CAST"si")==0){
for(tnode=node->children; tnode ; tnode=tnode->next ){
if(xmlStrcasecmp(tnode->name,BAD_CAST"t")==0){
t=xmlNodeGetContent(tnode);
strcpy( wBook->sharedstrings[ii].value, t);
ii++;
}
}
}
}
return 0;
}
/*
函数:解析dimension的值,计算出总共有多少行,多少列
返回值:0-成功,<0-失败
*/
int CalcRowsAndCols(char * _IN_ dimension, int * _OUT_ rows, int * _OUT_ cols)
{
if( dimension == NULL || rows == NULL || cols == NULL )
return -1;
char *p = NULL;
char start[10+1];
char end[10+1];
char startrow[10+1];
char endrow[10+1];
char startcol[10+1];
char endcol[10+1];
int ii = 0;
int jj = 0;
int iendcol = 0;
memset(start,0x00,sizeof(start));
memset(end,0x00,sizeof(end));
memset(startrow,0x00,sizeof(startrow));
memset(endrow,0x00,sizeof(endrow));
memset(startcol,0x00,sizeof(startcol));
memset(endcol,0x00,sizeof(endcol));
/*
多行多列情况下,例如dimension为 A1:E5
多行单列情况下,例如dimension为 A1:A9
单行单列请款下,例如dimension为 A1
*/
p = strstr(dimension,":");
if( p != NULL ){
strncpy( start, dimension, p-dimension );
start[p-dimension] = '\0';
strcpy( end, p+1 );
}else{
strcpy( start, dimension);
strcpy( end, dimension);
}
ii = 0;
jj = 0;
for( p = start; *p != '\0'; p++ ){
if( isdigit(*p) )
startrow[ii++] = *p;
else if(isalpha(*p))
startcol[jj++] = *p;
}
ii = 0;
jj = 0;
for( p = end; *p != '\0'; p++ ){
if( isdigit(*p) )
endrow[ii++] = *p;
else if(isalpha(*p))
endcol[jj++] = *p;
}
/* 先以最大的行号和列号分别减去第一行,A列来计算出总共有多少行多少列,如果发生1~3行是空行,就用NULL值来占位置 */
*rows = atoi(endrow);
AlphaColToNum(endcol,&iendcol);
*cols = iendcol + 1;
return 0;
}
/*
函数:或者A,BC...Z列的基本下标
返回值:0-成功,<0失败
*/
int AlphaColToIndex(char _IN_ ch,int * _OUT_ index)
{
char tch = toupper(ch);
if( (tch < 'A' || tch > 'Z') || index == NULL )
return -1;
switch (tch)
{
case 'A':
*index = COL_A_IDX;
break;
case 'B':
*index = COL_B_IDX;
break;
case 'C':
*index = COL_C_IDX;
break;
case 'D':
*index = COL_D_IDX;
break;
case 'E':
*index = COL_E_IDX;
break;
case 'F':
*index = COL_F_IDX;
break;
case 'G':
*index = COL_G_IDX;
break;
case 'H':
*index = COL_H_IDX;
break;
case 'I':
*index = COL_I_IDX;
break;
case 'J':
*index = COL_J_IDX;
break;
case 'K':
*index = COL_K_IDX;
break;
case 'L':
*index = COL_L_IDX;
break;
case 'M':
*index = COL_M_IDX;
break;
case 'N':
*index = COL_N_IDX;
break;
case 'O':
*index = COL_O_IDX;
break;
case 'P':
*index = COL_P_IDX;
break;
case 'Q':
*index = COL_Q_IDX;
break;
case 'R':
*index = COL_R_IDX;
break;
case 'S':
*index = COL_S_IDX;
break;
case 'T':
*index = COL_T_IDX;
break;
case 'U':
*index = COL_U_IDX;
break;
case 'V':
*index = COL_V_IDX;
break;
case 'W':
*index = COL_W_IDX;
break;
case 'X':
*index = COL_X_IDX;
break;
case 'Y':
*index = COL_Y_IDX;
break;
case 'Z':
*index = COL_Z_IDX;
break;
default:
*index = -1;
break;
}
return (*index == -1?-1:0);
}
/*
函数:列号转换成数字,例如A列转成第0列,AA列为26列 AB列变成27 。。。以此类推
返回值:0-成功 <0失败
备注:如果为英语字母则作为列号处理,如果字符串中跟数字则返回报错
*/
int AlphaColToNum(char * _IN_ pos, int * _OUT_ pcol)
{
if( pos == NULL )
return -1;
int ii = 0, len = 0, ival = 0,index = 0;
char scol[5+1];
char *curp = pos;
memset(scol,0x00,sizeof(scol));
ii = 0;
len = strlen(pos);
for( ;*curp != '\0'; curp++ ){
if(!isalpha(*curp))
return -1;
AlphaColToIndex(*curp,&index);
if( len > 1 )
ival += (26*len-26)*(index+1);
else if( len ==1 )
ival += index;
len--;
}
*pcol = ival;
return 0;
}
/*
函数:分解出纯字母串
返回值:0-成功 <0-失败
备注:输出参数as必须是非空的。
*/
int SplitAlphaStr(char * _IN_ str, char * _OUT_ as )
{
if( str == NULL || as == NULL )
return -1;
int ii = 0;
for(; *str != '\0'; str++ )
if(isalpha(*str))
as[ii++] = *str;
return 0;
}
/*
函数:解析sheet[1...n].xml其中一个
返回值:0-成功,<0返回
*/
int ParseSheetXml(const char * _IN_ xmlcontent, const int _IN_ size, const int _IN_ wsheetidx, WorkBook * _OUT_ wBook)
{
if( xmlcontent == NULL || size <= 0 || wBook == NULL || wsheetidx < SHEET_1_IDX)
return -1;
xmlDocPtr doc = NULL;
xmlNodePtr root = NULL, curnode = NULL, dinode = NULL, sdnode = NULL, rnode = NULL, cnode = NULL, vnode = NULL;
xmlChar * row = NULL, *col = NULL, *diref = NULL,*rnum = NULL,*cnum = NULL,*t = NULL, *v = NULL;
int rows = 0, cols = 0,irowpos = 0, icolpos = 0, ivpos = 0;
int ii = 0,jj = 0;
char strcol[5+1];
memset(strcol,0x00,sizeof(strcol));
doc = xmlParseMemory(xmlcontent,size); //parse xml in memory
if( doc == NULL )
return -1;
root = xmlDocGetRootElement(doc);
for( curnode = root->children; curnode; curnode = curnode->next ){
if(xmlStrcasecmp(curnode->name,BAD_CAST"dimension")==0){
diref = xmlGetProp(curnode,BAD_CAST"ref");
CalcRowsAndCols(diref,&rows,&cols);
wBook->workSheets[wsheetidx].cellrows = rows;
wBook->workSheets[wsheetidx].cellcols = cols;
wBook->workSheets[wsheetidx].cells = (Cell **)malloc(rows*sizeof(Cell*));
memset(wBook->workSheets[wsheetidx].cells, 0x00, rows*sizeof(Cell*));
for( ii = 0; ii < rows; ii++ ){
wBook->workSheets[wsheetidx].cells[ii] = (Cell*)malloc(cols*sizeof(Cell));
memset(wBook->workSheets[wsheetidx].cells[ii], 0x00, cols*sizeof(Cell));
for(jj = 0; jj < cols; jj++){
memset( wBook->workSheets[wsheetidx].cells[ii][jj].value, 0x00, sizeof(wBook->workSheets[wsheetidx].cells[ii][jj].value) );
}
}
break;
}
}
for( curnode = root->children; curnode; curnode = curnode->next ){
if(xmlStrcasecmp(curnode->name,BAD_CAST"sheetData")==0){
/*
<sheetData>
<row r="1" spans="1:3">
<c r="A1" t="s">
<v>0</v>
</c>
<c r="B1">
<v>111</v>
</c>
<c r="C1" t="s">
<v>1</v>
</c>
</row>
</sheetData>
*/
for( rnode=curnode->children; rnode; rnode = rnode->next ){/* <row r="1" spans="1:3"> </row> */
rnum = xmlGetProp(rnode,BAD_CAST"r");
for(cnode = rnode->children; cnode; cnode = cnode->next){ /* <c r="A1" t="s"> */
cnum = xmlGetProp(cnode,BAD_CAST"r");
if( cnum != NULL ){
irowpos = atoi(rnum) - 1; /* 当前的行号,因为要导入二维数组中,所以从0开始算 */
SplitAlphaStr(cnum,strcol);
AlphaColToNum(strcol,&icolpos); /* 当前的列号,已做处理从0开始算 */
t = xmlGetProp(cnode,BAD_CAST"t");
vnode = cnode->children; /* <v>0</v> */
v = xmlNodeGetContent(vnode);
if( v != NULL ){
if( t != NULL ){
if( t[0] == 's' ){
/* 单元格的值 <v>0</v> 在sharedstrings数组中 */
ivpos = atoi(v);
strcpy( wBook->workSheets[wsheetidx].cells[irowpos][icolpos].value, wBook->sharedstrings[ivpos].value);
}
}else{
/* 单元格的值在标签 <v>111</v> 中 */
strcpy( wBook->workSheets[wsheetidx].cells[irowpos][icolpos].value, v);
}
}
}//end if cnum != NULL
}//end if cnode
} // end if rnode
}//end if(xmlStrcasecmp(curnode->name,BAD_CAST"sheetData")==0)
}//end for( curnode = root->children; curnode ;.....
return 0;
}
/*
函数:解析worksheets下面的sheets[1...n].xml至结构体wBook
返回值:0-成功,<0失败
备注:对于一些特别大的excel会出现内存占用特别高的情况。
*/
int GetWorkSheets(WorkBook * _OUT_ wBook)
{
if( wBook == NULL || wBook->sheetcnt < 1)
return -1;
int ii = 0,bufsize = 0, ret = 0,sheetidx = 0;
char *path = NULL,*xmlbuf = NULL;
for( ii = 0; ii < wBook->sheetcnt; ii++ ){
if( wBook->workSheets[ii].sheetName[0] != '\0' ){
path = wBook->workSheets[ii].target;
ret = GetXmlFileContent(path, &xmlbuf, &bufsize);
if( ret < 0 )
return -errno;
ParseSheetXml(xmlbuf,bufsize,ii,wBook);
}
}
return 0;
}
int PrintWorkBook(WorkBook *wBook)
{
if( wBook == NULL )
return -1;
int ii = 0;
int row = 0, col = 0;
WorkSheet *wsheet = NULL;
for( ii = 0; ii < wBook->sheetcnt; ii++){
wsheet = wBook->workSheets+ii;
if( wsheet->sheetName[0] != '\0' ){
printf("sheetname:[%s]\n",wsheet->sheetName);
for( row = 0; row < wsheet->cellrows; row++ ){
for (col = 0; col < wsheet->cellcols; col++){
printf("%s\t",wsheet->cells[row][col].value);
}
printf("\n");
}
}
}
return 0;
}
int FreeWorkBook(WorkBook *wBook)
{
if( wBook == NULL )
return -1;
int ii = 0, row = 0;
WorkSheet *wsheet = NULL;
for( ii = 0; ii < wBook->sheetcnt; ii++ ){
wsheet = wBook->workSheets+ii;
for( row = 0; row <wsheet->cellrows; row++ ){
free(wsheet->cells[row]);
wsheet->cells[row] = NULL;
}
free(wsheet->cells);
wsheet->cells = NULL;
}
free(wBook->workSheets);
wBook->workSheets = NULL;
free(wBook->sharedstrings);
wBook->sharedstrings = NULL;
return 0;
}
/*
函数:从指定的XML文件中读取数据
返回值:0-成功,<0失败
*/
int GetXmlFileContent(char * _IN_ path, char ** _OUT_ xmlcontent, int * _OUT_ outsize )
{
char *content = NULL;
FILE *fp = NULL;
int fileSize = 0;
if( path == NULL )
return -1;
fp = fopen(path,"r");
if( fp == NULL )
return -1;
fseek(fp,0,SEEK_END);
fileSize=ftell(fp);
rewind(fp);
content = (char *)malloc(fileSize);
if( content == NULL )
return -1;
fread(content,1,fileSize,fp);
fclose(fp);
*xmlcontent = content;
*outsize = fileSize;
return 0;
}
int main()
{
int ret = -1;
char *xmlbuf = NULL;
int bufsize = -1;
char path[1024+1];
WorkBook wBook;
SharedString *sharestrings;
memset(&wBook, 0x00, sizeof(WorkBook));
memset(path, 0x00, sizeof(path));
strcpy(wBook.unziprootdir, "xlsxzip");
snprintf(path,sizeof(path),"%s/%s",wBook.unziprootdir,WORKBOOK_XML_PATH);
ret = GetXmlFileContent(path,&xmlbuf,&bufsize);
if( ret < 0 ){
printf("Load %s failed,exit\n", WORKBOOK_XML_PATH);
return -1;
}
ParseWorkBookXml(xmlbuf, bufsize, &wBook);
free(xmlbuf);
xmlbuf = NULL;
bufsize = 0;
memset(path,0x00,sizeof(path));
snprintf(path,sizeof(path),"%s/%s",wBook.unziprootdir,WORKBOOK_XML_RELS_PATH);
ret = GetXmlFileContent(path,&xmlbuf,&bufsize);
if( ret < 0 ){
printf("Load %s failed,exit\n", WORKBOOK_XML_RELS_PATH);
return -1;
}
ParseWorkBookXmlRels(xmlbuf,bufsize,&wBook);
free(xmlbuf);
xmlbuf = NULL;
bufsize = 0;
memset(path,0x00,sizeof(path));
snprintf(path,sizeof(path),"%s/%s",wBook.unziprootdir,SHAREDSTRINGS_XML_PATH);
ret = GetXmlFileContent(path,&xmlbuf,&bufsize);
if( ret < 0 ){
printf("Load %s failed,exit\n", SHAREDSTRINGS_XML_PATH);
return -1;
}
ParseSharedStringsXml(xmlbuf,bufsize,&wBook);
free(xmlbuf);
GetWorkSheets(&wBook);
PrintWorkBook(&wBook);
FreeWorkBook(&wBook);
memset(&wBook,0x00,sizeof(wBook));
return 0;
}
四、输出
cc readxlsx.c -lxml2
a.out