当前位置: 首页 > news >正文

美团龙猫利用expat库实现的保存xml指定范围数据到csv的C程序

用自己的代码逐个字符解析速度较慢;尝试了libxml2也比较慢,因为它需要把文件一次性读入内存,而expat库支持流式读取。于是让龙猫用expat写了一个程序,毕竟是久经考验的库,程序很快就调试通过了。要不是我一开始没信心,让它先输出10行试试,还能少走很多弯路。

#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <expat.h>

/* Size in bytes of the buffer that holds one cell's text. */
#define MAX_CELL_CONTENT 256
/*
 * Rectangle of worksheet cells selected for extraction.
 * Rows are numbers, columns are single characters; is_cell_in_range()
 * treats all four bounds as inclusive.
 */
typedef struct {
    int start_row;  /* first row of the range */
    int end_row;    /* last row of the range */
    char start_col; /* first column of the range */
    char end_col;   /* last column of the range */
} ParseRange;
/* One extracted cell: where it sits in the sheet and the text it holds. */
typedef struct {
    int row;                      /* row number of the cell */
    char col;                     /* column character of the cell */
    char value[MAX_CELL_CONTENT]; /* text collected for the cell */
} CellResult;
/*
 * Tell whether the cell at (row, col) falls inside `range`.
 * Every bound of the range is inclusive.
 * Returns 1 when the cell is inside, 0 otherwise.
 */
int is_cell_in_range(int row, char col, ParseRange range) {
    if (row < range.start_row || row > range.end_row) {
        return 0;
    }
    if (col < range.start_col || col > range.end_col) {
        return 0;
    }
    return 1;
}
/*
 * Growable array of extracted cells.
 * Each field sits on its own line so that the trailing comments cannot
 * swallow the declarations that follow them.
 */
typedef struct {
    CellResult *data; /* heap-allocated element storage */
    int count;        /* number of elements currently stored */
    int capacity;     /* number of elements `data` has room for */
} DynamicResults;

/* Program-wide result store; zero-initialised, so it owns no storage yet. */
DynamicResults all_results = {0};
/*
 * Prepare the global result store with room for 1024 cells.
 * Any storage left over from an earlier initialisation is released first,
 * so calling this twice does not leak. Allocation failure is fatal: the
 * function has no way to report it, and every later write would otherwise
 * go through a NULL pointer.
 */
void init_results() {
    free(all_results.data); /* free(NULL) is a no-op on the first call */
    all_results.count = 0;
    all_results.capacity = 1024;
    all_results.data = malloc((size_t)all_results.capacity * sizeof *all_results.data);
    if (all_results.data == NULL) {
        all_results.capacity = 0;
        fprintf(stderr, "错误: 内存分配失败\n");
        exit(EXIT_FAILURE);
    }
}
void ensure_capacity(int needed) {if (all_results.count + needed >= all_results.capacity) {all_results.capacity *= 2;  // 翻倍扩容all_results.data = realloc(all_results.data, all_results.capacity * sizeof(CellResult));}
}// 添加结果(无限制版本)
void add_cell_result(int row, char col, const char *value, int is_empty) {ensure_capacity(1);  // 确保有空间all_results.data[all_results.count].row = row;all_results.data[all_results.count].col = col;strncpy(all_results.data[all_results.count].value, value, MAX_CELL_CONTENT - 1);all_results.count++;
}// 释放内存
void free_results() {free(all_results.data);all_results.data = NULL;all_results.count = all_results.capacity = 0;
}// 解析Excel范围 (如"A1:Z100")
/*
 * Parse one cell reference of the form <letter><row>, e.g. "B12".
 *
 * The column must be a single ASCII letter (lower case is folded to upper
 * case) and the row a positive decimal number that fits in an int.
 * On success stores the column and row and returns a pointer to the first
 * character after the reference; on failure returns NULL.
 */
static const char *parse_cell_ref(const char *s, char *col, int *row) {
    char letter = *s;
    if (letter >= 'a' && letter <= 'z') {
        letter = (char)(letter - 'a' + 'A');
    }
    if (letter < 'A' || letter > 'Z') {
        return NULL;
    }
    s++;

    /* Require at least one digit; this also rejects signs and whitespace. */
    if (*s < '0' || *s > '9') {
        return NULL;
    }
    int value = 0;
    while (*s >= '0' && *s <= '9') {
        int digit = *s - '0';
        if (value > (INT_MAX - digit) / 10) {
            return NULL; /* row number does not fit in an int */
        }
        value = value * 10 + digit;
        s++;
    }
    if (value < 1) {
        return NULL; /* worksheet rows are numbered from 1 */
    }

    *col = letter;
    *row = value;
    return s;
}

/*
 * Parse a range such as "A1:Z100" into `range`.
 *
 * The whole string must match <cell>:<cell> with nothing left over, and
 * the start must not lie after the end in either direction.
 * Returns 0 on success and -1 on any error; `range` is only written on
 * success.
 */
int parse_excel_range(const char *range_str, ParseRange *range) {
    if (range_str == NULL || range == NULL) {
        return -1;
    }

    ParseRange parsed;
    const char *p = parse_cell_ref(range_str, &parsed.start_col, &parsed.start_row);
    if (p == NULL || *p != ':') {
        return -1;
    }
    p = parse_cell_ref(p + 1, &parsed.end_col, &parsed.end_row);
    if (p == NULL || *p != '\0') {
        return -1;
    }

    if (parsed.start_col > parsed.end_col) {
        return -1;
    }
    if (parsed.start_row > parsed.end_row) {
        return -1;
    }

    *range = parsed;
    return 0;
}
/* State shared by the Expat callbacks while one worksheet is parsed. */
typedef struct {
    ParseRange range;                  /* cells the caller asked for */
    int in_row;                        /* 1 between a <row> start tag and its end tag */
    int current_row;                   /* row number from the <row> "r" attribute, -1 if absent */
    char current_col;                  /* column character taken from the <c> "r" attribute */
    int value_started;                 /* 1 while the text of a <v> or <t> element is collected */
    char temp_value[MAX_CELL_CONTENT]; /* text collected so far, NUL-terminated */
    int value_len;                     /* number of bytes held in temp_value */
    int rows_parsed;                   /* count of <row> elements that carried an "r" attribute */
} ParserState;
/*
 * Convert the text of a <row> "r" attribute to a row number.
 * Returns -1 (the same marker used for a missing attribute) when the text
 * is not a plain non-negative decimal number that fits in an int.
 */
static int row_number_from_attr(const char *text) {
    char *end = NULL;
    long value = strtol(text, &end, 10);
    if (end == text || *end != '\0' || value < 0 || value > INT_MAX) {
        return -1;
    }
    return (int)value;
}

/*
 * Extract the column from a cell reference such as "C7".
 * Only single-letter columns can be represented in a char; references
 * with a multi-letter column ("AA7") or no leading upper-case letter
 * yield '\0' so they are never mistaken for columns A..Z.
 */
static char column_from_cell_ref(const char *ref) {
    char first = ref[0];
    if (first < 'A' || first > 'Z') {
        return '\0';
    }
    char second = ref[1];
    if ((second >= 'A' && second <= 'Z') || (second >= 'a' && second <= 'z')) {
        return '\0';
    }
    return first;
}

/*
 * Expat start-tag callback.
 *
 * <row>     records the row number from its "r" attribute (-1 if absent or
 *           malformed) and counts rows that carry the attribute.
 * <c>       inside a row, records the column from its "r" attribute. The
 *           column is reset first, so a cell without a usable reference is
 *           skipped instead of inheriting the previous cell's column.
 * <v>, <t>  when the current row is inside the requested row range, starts
 *           collecting the element's text.
 *
 * user_data must point to the ParserState registered with the parser.
 */
void XMLCALL start_element(void *user_data, const XML_Char *name, const XML_Char **attrs) {
    ParserState *state = (ParserState *)user_data;

    if (strcmp(name, "row") == 0) {
        state->in_row = 1;
        state->current_row = -1;
        /* attrs is a NULL-terminated list of name/value pairs. */
        for (int i = 0; attrs[i]; i += 2) {
            if (strcmp(attrs[i], "r") == 0) {
                state->current_row = row_number_from_attr(attrs[i + 1]);
                state->rows_parsed++;
                break;
            }
        }
    } else if (strcmp(name, "c") == 0 && state->in_row) {
        state->current_col = '\0';
        for (int i = 0; attrs[i]; i += 2) {
            if (strcmp(attrs[i], "r") == 0) {
                state->current_col = column_from_cell_ref(attrs[i + 1]);
                break;
            }
        }
    } else if (strcmp(name, "v") == 0 || strcmp(name, "t") == 0) {
        if (state->current_row >= state->range.start_row &&
            state->current_row <= state->range.end_row) {
            state->value_started = 1;
            state->value_len = 0;
            state->temp_value[0] = '\0';
        }
    }
}
/*
 * Expat character-data callback: append text to the value being collected.
 *
 * Expat may deliver the text of one element in several chunks. When a
 * chunk does not fit, as much of it as fits is kept and the buffer is then
 * closed, so later chunks cannot be appended after a gap. The cut is moved
 * back to the start of a character so a multi-byte sequence is not split
 * (assumes Expat's default UTF-8 output -- confirm if XML_UNICODE is used).
 *
 * user_data must point to the ParserState registered with the parser;
 * `s` is not NUL-terminated and holds `len` bytes.
 */
void XMLCALL character_data(void *user_data, const XML_Char *s, int len) {
    ParserState *state = (ParserState *)user_data;
    if (!state->value_started || len <= 0) {
        return;
    }

    int room = MAX_CELL_CONTENT - 1 - state->value_len;
    if (room <= 0) {
        return; /* buffer already full or closed by an earlier truncation */
    }

    int take = len;
    int truncated = 0;
    if (take > room) {
        take = room;
        truncated = 1;
        /* Do not cut in front of a UTF-8 continuation byte (10xxxxxx). */
        while (take > 0 && ((unsigned char)s[take] & 0xC0) == 0x80) {
            take--;
        }
    }

    int end = state->value_len + take;
    memcpy(state->temp_value + state->value_len, s, (size_t)take);
    state->temp_value[end] = '\0';

    /* After a truncation, saturate the length so nothing more is appended;
     * the NUL terminator above still marks the real end of the text. */
    state->value_len = truncated ? MAX_CELL_CONTENT - 1 : end;
}
/*
 * Expat end-tag callback.
 *
 * </row> clears the in-row flag. </v> or </t>, when a value was being
 * collected, stores the collected text if the current cell lies inside the
 * requested range and then stops collecting.
 *
 * user_data must point to the ParserState registered with the parser.
 */
void XMLCALL end_element(void *user_data, const XML_Char *name) {
    ParserState *state = (ParserState *)user_data;

    if (strcmp(name, "row") == 0) {
        state->in_row = 0;
        return;
    }

    int closes_value = (strcmp(name, "v") == 0) || (strcmp(name, "t") == 0);
    if (!closes_value || !state->value_started) {
        return;
    }

    if (is_cell_in_range(state->current_row, state->current_col, state->range)) {
        /* Debug trace, permanently disabled. */
        if (1 == 0) {
            printf("  单元格 %c%d: '%s'\n", state->current_col, state->current_row, state->temp_value);
        }
        add_cell_result(state->current_row, state->current_col, state->temp_value, 0);
    }
    state->value_started = 0;
}
/*
 * Stream a worksheet XML file through Expat and collect the cells that
 * fall inside `range` into the global result store.
 *
 * The file is read in 8 KiB chunks, so it is never held in memory as a
 * whole.
 *
 * Returns 0 on success. Returns -1 when the parser cannot be created, the
 * file cannot be opened or read, or the XML is not well-formed; results
 * collected before the error stay in the store.
 */
int parse_sheet_xml(const char *filename, ParseRange range) {
    int rc = -1;
    FILE *file = NULL;
    char buffer[8192];

    XML_Parser parser = XML_ParserCreate(NULL);
    if (parser == NULL) {
        printf("错误: 无法创建XML解析器\n");
        return -1;
    }

    ParserState state = {0};
    state.range = range;
    XML_SetUserData(parser, &state);
    XML_SetElementHandler(parser, start_element, end_element);
    XML_SetCharacterDataHandler(parser, character_data);

    file = fopen(filename, "rb");
    if (file == NULL) {
        printf("错误: 无法打开文件 %s\n", filename);
        goto out;
    }

    for (;;) {
        size_t len = fread(buffer, 1, sizeof buffer, file);
        int done = (len < sizeof buffer);
        if (done && ferror(file)) {
            printf("错误: 读取文件 %s 失败\n", filename);
            goto out;
        }

        /* len is at most sizeof buffer, so the conversion to int is safe. */
        if (XML_Parse(parser, buffer, (int)len, done) == XML_STATUS_ERROR) {
            /* The line number is an unsigned XML_Size, not an int. */
            printf("解析错误: %s (行 %lu)\n",
                   XML_ErrorString(XML_GetErrorCode(parser)),
                   (unsigned long)XML_GetCurrentLineNumber(parser));
            goto out;
        }
        if (done) {
            break;
        }
    }
    rc = 0;

out:
    if (file != NULL) {
        fclose(file);
    }
    XML_ParserFree(parser);
    return rc;
}
/*** 输出CSV文件* @param filename 输出文件名*/
int save_results_to_csv(const char *filename) {FILE *csv = fopen(filename, "w");if (!csv) {printf("错误: 无法创建CSV文件 %s\n", filename);return -1;}// 计算列范围char min_col = all_results.data[0].col;char max_col = all_results.data[0].col;for (int i = 1; i < all_results.count; i++) {if (all_results.data[i].col < min_col) min_col = all_results.data[i].col;if (all_results.data[i].col > max_col) max_col = all_results.data[i].col;}// 输出标题fprintf(csv, "Row");for (char col = min_col; col <= max_col; col++) {fprintf(csv, ",%c", col);}    fprintf(csv, "\n");// 数据行int current_row = all_results.data[0].row;int row_start_idx = 0;for (int i = 0; i < all_results.count; i++) {	if (all_results.data[i].row != current_row) {// 输出当前行fprintf(csv, "%d", current_row);for (char col = min_col; col <= max_col; col++) {int found = 0;for (int j = row_start_idx; j < i; j++) {if (all_results.data[j].col == col) {fprintf(csv, ",%s", all_results.data[j].value);found = 1;break;}}if (!found) fprintf(csv, ",");}fprintf(csv, "\n");// 下一行current_row = all_results.data[i].row;row_start_idx = i;}}fclose(csv);printf("CSV已保存到 %s\n", filename);return 0;
}// 主函数
/*
 * Entry point: <program> <xml file> <range>, e.g. sheet1.xml A1:Z100.
 *
 * Parses the worksheet, keeps the cells inside the range and writes them
 * to a CSV file named after the input, with the extension replaced by
 * ".csv" (or ".csv" appended when the file name has no extension).
 * Returns 0 on success and 1 on any failure.
 */
int main(int argc, char *argv[]) {
    static const char csv_ext[] = ".csv";

    if (argc != 3) {
        printf("用法: %s <xml文件> <范围(A1:Z100)>\n", argv[0]);
        return 1;
    }

    ParseRange range;
    if (parse_excel_range(argv[2], &range) != 0) {
        printf("错误: 无效范围格式,应为 A1:Z100\n");
        return 1;
    }
    printf("解析范围: %c%d:%c%d\n", range.start_col, range.start_row, range.end_col, range.end_row);

    /* Build the output name before the expensive parse so that an
     * over-long path fails early. Only a dot inside the final path
     * component (and not at its very start) counts as the extension. */
    const char *input = argv[1];
    const char *slash = strrchr(input, '/');
    const char *base = slash ? slash + 1 : input;
    const char *dot = strrchr(base, '.');
    size_t stem_len = (dot != NULL && dot > base) ? (size_t)(dot - input) : strlen(input);

    char csv_filename[256];
    if (stem_len + sizeof csv_ext > sizeof csv_filename) {
        printf("错误: 文件路径过长 %s\n", input);
        return 1;
    }
    memcpy(csv_filename, input, stem_len);
    memcpy(csv_filename + stem_len, csv_ext, sizeof csv_ext); /* copies the NUL too */

    int status = 1;
    init_results();
    if (parse_sheet_xml(input, range) == 0 && save_results_to_csv(csv_filename) == 0) {
        status = 0;
    }
    free_results();
    return status;
}

编译执行

gcc -o expatxml3 expatxml3.c -lexpat -O3
time ./expatxml3 lineitem/xl/worksheets/sheet1.xml A1:Z10000
解析范围: A1:Z10000
CSV已保存到 lineitem/xl/worksheets/sheet1.csv

real    0m6.508s
user    0m2.132s
sys     0m0.392s
time ./expatxml3 lineitem/xl/worksheets/sheet1.xml A100001:Z110000
解析范围: A100001:Z110000
CSV已保存到 lineitem/xl/worksheets/sheet1.csv

real    0m6.534s
user    0m2.111s
sys     0m0.431s
time ./expatxml3 lineitem/xl/worksheets/sheet1.xml A1:Z1000000
解析范围: A1:Z1000000
CSV已保存到 lineitem/xl/worksheets/sheet1.csv

real    0m10.207s
user    0m3.046s
sys     0m1.795s
time ./expatxml3 lineitem/xl/worksheets/sheet1.xml A300000:Z660000
解析范围: A300000:Z660000
CSV已保存到 lineitem/xl/worksheets/sheet1.csv

real    0m9.378s
user    0m2.574s
sys     0m1.030s

针对60万行16列、约300MB的xml,这个时间还不错。不过程序目前没有处理sharedStrings.xml(共享字符串表),如果加上这一步,速度会慢一些。


文章转载自:

http://ByyQtJLl.qxnLc.cn
http://lx2iLgRT.qxnLc.cn
http://9ylUE08R.qxnLc.cn
http://SbX10Laf.qxnLc.cn
http://m6XKsWmv.qxnLc.cn
http://8R8NPZix.qxnLc.cn
http://uIjYGJX5.qxnLc.cn
http://h0cd5soN.qxnLc.cn
http://E1tVfj2y.qxnLc.cn
http://FVBLGD8B.qxnLc.cn
http://0wLjLp67.qxnLc.cn
http://NUgOCjKA.qxnLc.cn
http://roxiIwQa.qxnLc.cn
http://f4TCOGrR.qxnLc.cn
http://LaJoMS7a.qxnLc.cn
http://AYnpCZ3l.qxnLc.cn
http://pkRD6DRM.qxnLc.cn
http://o4dYJskv.qxnLc.cn
http://dOTvZcz7.qxnLc.cn
http://kBTLTPc7.qxnLc.cn
http://hqoCYKfL.qxnLc.cn
http://OcFUqK7O.qxnLc.cn
http://jvmvGR0a.qxnLc.cn
http://MtJ3fgQW.qxnLc.cn
http://4tIOxip7.qxnLc.cn
http://YJqNSsBD.qxnLc.cn
http://3e3Pmtwo.qxnLc.cn
http://KRnQ5AUL.qxnLc.cn
http://p5pjPAXo.qxnLc.cn
http://6egrLt83.qxnLc.cn
http://www.dtcms.com/a/364699.html

相关文章:

  • 【leetcode】130. 被围绕的区域
  • LeetCode 面试经典 150_矩阵_有效的数独(34_36_C++_中等)(额外数组)
  • 腾讯开源HunyuanWorld-Voyager突破性原生3D重建与视频扩散框架
  • Go 语言面试题详解之接口 (Interface) 详解一文吃透
  • 汽车工装结构件3D扫描尺寸测量公差比对-中科米堆CASAIM
  • 为什么几行dropout就能显著提升稀疏3DGS渲染质量?
  • 网格图--Day04--网格图DFS--2684. 矩阵中移动的最大次数,1254. 统计封闭岛屿的数目,130. 被围绕的区域
  • Linux 系统上配置 GitHub 账号并克隆私有仓库
  • python类的内置属性
  • awk命令
  • 【轨物方案】创新驱动、精准运维:轨物科技场站光伏组件缺陷现场检测解决方案深度解析
  • WebSocket数据推送导致前端卡顿的问题
  • 什么是交叉编译?
  • Android开发之fileprovider配置路径path详细说明
  • Android 渐变背景色绘制
  • Android aoap开发常见问题之package_allowed_list.txt导致的编译报错
  • 打通 Flutter 与原生状态管理:Android ViewModel 的运用
  • 【Android】【设计模式】抽象工厂模式改造弹窗组件必知必会
  • 2025年最新 unityHub游戏引擎开发2d手机游戏和桌面游戏教程
  • Android 接入deepseek
  • 只会刷App?大学生学透Android开发,直接开挂!
  • HTTP协议——理解相关概念、模拟实现浏览器访问自定义服务器
  • PixPin截图工具完全体下载|解决Snipaste无法长截图问题+比QQ截图更专业+无广告绿色版支持Win7-Win11全系统兼容
  • Java:跨越时代的编程语言,持续赋能数字化转型
  • Ansible高效管理大项目实战技巧
  • 【GitOps】初始Argo CD
  • 《用 Flask 构建 RESTful API:用户数据管理的实战指南》
  • Spring Boot 2.7 中资源销毁的先后顺序
  • 从源码入手,详解Linux进程
  • Ollama大模型 本地部署+使用教程