利用美团龙猫编写基于libxml2的XML转CSV的C程序
提示词
编写c程序,实现读取xml的row和c标签信息,并输出到csv格式,要求是:数值型c,输出<v>标签的内容,字符串型c(t = “inlineStr”),输出<t>标签的内容,row的r属性表示行号,c的r属性是字母列号加行号,如果列号不连续,就在中间补空列,用逗号分隔各列,一行完全输出后换行,如果1行全为空,不输出该行,为便于查对,每行开头输出row的r属性行号,利用内存映射文件实现。只输出代码,不干别的。
它输出了如下程序
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

#include <libxml/parser.h>
#include <libxml/tree.h>

// Column letters to number: A1 -> (0,0), B1 -> (1,0)
int col_letter_to_num(const char* col, int len) {int num = 0;for (int i = 0; i < len; i++) {if (col[i] >= 'A' && col[i] <= 'Z')num = num * 26 + (col[i] - 'A' + 1);else if (col[i] >= 'a' && col[i] <= 'z')num = num * 26 + (col[i] - 'a' + 1);elsebreak;}return num - 1;
}// 数字转列字母
void num_to_col_letter(int num, char* out) {int len = 0;int tmp = num + 1;char stack[10];int sp = 0;while (tmp) {int rem = (tmp - 1) % 26;stack[sp++] = rem + 'A';tmp = (tmp - 1) / 26;}while (sp--) out[len++] = stack[sp];out[len] = 0;
}// 获取属性值
const char* get_prop(xmlNodePtr node, const char* name) {xmlChar* val = xmlGetProp(node, (const xmlChar*)name);if (!val) return NULL;return (const char*)val;
}// 解析c节点的值
const char* parse_c_value(xmlNodePtr cnode) {xmlNodePtr child = cnode->children;const char* t_prop = get_prop(cnode, "t");if (t_prop && strcmp(t_prop, "inlineStr") == 0) {while (child) {if (child->type == XML_ELEMENT_NODE && strcmp((const char*)child->name, "t") == 0) {xmlChar* tval = xmlNodeGetContent(child);return (const char*)tval;}child = child->next;}return "";} else {while (child) {if (child->type == XML_ELEMENT_NODE && strcmp((const char*)child->name, "v") == 0) {xmlChar* vval = xmlNodeGetContent(child);return (const char*)vval;}child = child->next;}return "";}
}int main(int argc, char* argv[]) {if (argc != 2) {fprintf(stderr, "Usage: %s <input.xml>\n", argv[0]);exit(1);}const char* fname = argv[1];int fd = open(fname, O_RDONLY);if (fd < 0) { perror("open"); exit(1); }struct stat sb;if (fstat(fd, &sb) < 0) { perror("fstat"); exit(1); }size_t flen = sb.st_size;char* mapped = (char*)mmap(NULL, flen, PROT_READ, MAP_PRIVATE, fd, 0);if (mapped == MAP_FAILED) { perror("mmap"); exit(1); }//printf("mmap");// 初始化libxml2xmlInitParser();// 解析内存中的xmlxmlDocPtr doc = xmlParseMemory(mapped, flen);if (!doc) {fprintf(stderr, "xmlParseMemory error\n");exit(1);}xmlNodePtr root = xmlDocGetRootElement(doc);// 存储所有行char*** rows = NULL;int* row_nums = NULL;int row_count = 0;int max_row = 0, max_col = 0;//printf("begin ");for (xmlNodePtr node = root; node; node = node->next) {if (node->type == XML_ELEMENT_NODE && strcmp((const char*)node->name, "sheetData") == 0) {//printf("sheetData ");for (xmlNodePtr row_node = node->children; row_node; row_node = row_node->next) {if (row_node->type == XML_ELEMENT_NODE && strcmp((const char*)row_node->name, "row") == 0) {//printf("row ");const char* r_prop = get_prop(row_node, "r");if (!r_prop) continue;int row_num = atoi(r_prop);if (row_num < 1) continue;// 扩展rowsif (row_num > max_row) {rows = (char***)realloc(rows, sizeof(char**) * (row_num + 1));row_nums = (int*)realloc(row_nums, sizeof(int) * (row_num + 1));for (int i = max_row + 1; i <= row_num; i++) {rows[i] = NULL;row_nums[i] = 0;}max_row = row_num;}// 统计本行最大列int this_max_col = -1;for (xmlNodePtr cnode = row_node->children; cnode; cnode = cnode->next) {if (cnode->type == XML_ELEMENT_NODE && strcmp((const char*)cnode->name, "c") == 0) {const char* r_attr = get_prop(cnode, "r");if (!r_attr) continue;int col_len = 0;while (r_attr[col_len] && r_attr[col_len] >= 'A' && (r_attr[col_len] <= 'Z' || r_attr[col_len] <= 'z')) col_len++;int col_num = col_letter_to_num(r_attr, col_len);if (col_num > this_max_col) this_max_col = col_num;}}if (this_max_col < 0) 
continue;if (this_max_col > max_col) max_col = this_max_col;// 初始化本行rows[row_num] = (char**)calloc((this_max_col + 1), sizeof(char*));row_nums[row_num] = row_num;// 遍历c节点for (xmlNodePtr cnode = row_node->children; cnode; cnode = cnode->next) {if (cnode->type == XML_ELEMENT_NODE && strcmp((const char*)cnode->name, "c") == 0) {const char* r_attr = get_prop(cnode, "r");if (!r_attr) continue;int col_len = 0;while (r_attr[col_len] && r_attr[col_len] >= 'A' && (r_attr[col_len] <= 'Z' || r_attr[col_len] <= 'z')) col_len++;int col_num = col_letter_to_num(r_attr, col_len);if (col_num < 0) continue;const char* val = parse_c_value(cnode);rows[row_num][col_num] = strdup(val ? val : "");//if (val && (val != (const char*)xmlNodeGetContent(cnode))) xmlFree((void*)val);}}row_count++;}}}}// 输出到stdoutfor (int r = 1; r <= max_row; r++) {if (!rows[r]) continue;int has_data = 0;for (int c = 0; c <= max_col; c++) {if (rows[r][c] && strlen(rows[r][c]) > 0) {has_data = 1; break;}}if (!has_data) continue;// 输出行号printf("%d", r);for (int c = 0; c <= max_col; c++) {if (rows[r][c] && strlen(rows[r][c]) > 0)printf(",%s", rows[r][c]);elseprintf(",");}printf("\n");}
//return 0;// 释放内存for (int r = 1; r <= max_row; r++) {if (rows[r]) {for (int c = 0; c <= max_col; c++) {if (rows[r][c]) free(rows[r][c]);}free(rows[r]);}}if (rows) free(rows);if (row_nums) free(row_nums);xmlFreeDoc(doc);xmlCleanupParser();munmap(mapped, flen);close(fd);return 0;
}
查看#include头文件可知,它调用了libxml2库,这里使用的是该库的DOM(树)接口,即先把整个文档解析成内存中的节点树再遍历
用whereis命令查看,
whereis libxml2.so
libxml2.so: /usr/lib/aarch64-linux-gnu/libxml2.so
whereis tree.h
tree.h:
我的系统里有so文件,但是没有头文件,可能是安装不全所致,所以下载源代码,将里面的include目录下内容解压到当前目录。
然后用如下命令行编译,执行。一开始报错,原因应是错误地释放了内存:parse_c_value在找不到值时返回字符串常量"",对它调用xmlFree就会触发munmap_chunk(): invalid pointer。后来将相关的释放内存代码xmlFree注释掉,就不报错了。
gcc catxml.c -I . -lxml2 -o catxml
./catxml sheet13.xml
munmap_chunk(): invalid pointer
Aborted (core dumped)
./catxml sheet13.xml
1,1,15519,785,1,17.00,24386.67,0.04,0.02,,,35137.0,35107.0,35146.0,,,
2,1,6731,732,2,36.00,58958.28,0.09,0.06,,,35167.0,35123.0,35175.0,,,
time ./catxml /par/lineitem/xl/worksheets/sheet1.xml >catsheet1.csv
^C
real 1m7.394s
user 0m7.728s
sys 0m5.088s
输出小文件正常,输出700MB的大文件就不行了。