当前位置: 首页 > news >正文

我改写的二分法XML转CSV文件程序速度追上了张泽鹏先生的

以下是美团龙猫初稿,我改正,DeepSeek重新格式化的代码。
重要改正点:
1.二分查找用goto控制迭代,返回<row的正确位置
2.在缓冲区头填上父标签使expat能连续解析不报错

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <expat.h>

/* Size of the per-cell text buffer, including the terminating NUL. */
#define MAX_CELL_CONTENT 256

/*
 * Rectangular cell range requested on the command line (e.g. "A1:Z100").
 * Columns are single upper-case letters 'A'..'Z'; rows are 1-based.
 */
typedef struct {
    int start_row;
    int end_row;
    char start_col;
    char end_col;
} ParseRange;

/* State shared by the expat callbacks while rows are streamed to the CSV. */
typedef struct {
    ParseRange range;                   /* requested range                        */
    FILE *csv;                          /* output stream                          */
    XML_Parser parser;                  /* needed to stop parsing early           */
    int in_row;                         /* inside a <row> element                 */
    int skip_row;                       /* current row is outside the range       */
    int finished;                       /* parser was stopped on purpose          */
    int current_row;                    /* value of the row's r attribute, or -1  */
    int cell_index;                     /* 0-based column of the current cell     */
    char current_col;                   /* 'A'..'Z', or 0 if beyond column Z      */
    char next_col;                      /* next column to be written for this row */
    int value_started;                  /* collecting text of a <v>/<t> element   */
    int value_truncated;                /* text did not fit; ignore the remainder */
    int value_len;                      /* bytes stored in temp_value             */
    char temp_value[MAX_CELL_CONTENT];  /* NUL-terminated cell text               */
} ParserState;

/* Map an ASCII lower-case letter to upper case; other bytes are unchanged. */
static char ascii_upper(char c)
{
    return (c >= 'a' && c <= 'z') ? (char)(c - 'a' + 'A') : c;
}

/*
 * Parse a range of the form "<col><row>:<col><row>" (e.g. "A1:Z100").
 *
 * Columns must be single letters (lower case is accepted and normalised),
 * rows must be >= 1, and the start must not exceed the end.
 * Returns 0 and fills *range on success, -1 on any error (*range untouched).
 */
int parse_excel_range(const char *range_str, ParseRange *range)
{
    char first_col;
    char last_col;
    int first_row;
    int last_row;

    if (range_str == NULL || range == NULL) {
        return -1;
    }
    if (sscanf(range_str, "%c%d:%c%d", &first_col, &first_row,
               &last_col, &last_row) != 4) {
        return -1;
    }

    first_col = ascii_upper(first_col);
    last_col = ascii_upper(last_col);

    /* Reject anything that is not a letter: the output loops iterate over
     * the column characters and rely on them being within 'A'..'Z'. */
    if (first_col < 'A' || first_col > 'Z' || last_col < 'A' || last_col > 'Z') {
        return -1;
    }
    if (first_row < 1 || first_col > last_col || first_row > last_row) {
        return -1;
    }

    range->start_col = first_col;
    range->end_col = last_col;
    range->start_row = first_row;
    range->end_row = last_row;
    return 0;
}

/* True for the whitespace characters that may follow an XML element name. */
static int is_xml_space(char c)
{
    return c == ' ' || c == '\t' || c == '\n' || c == '\r';
}

/*
 * Scan forward from byte offset `from` for the next "<row ... r="N" ...>" tag.
 * On success stores the row number and the offset of the tag's '<' and
 * returns 1; returns 0 if no such tag exists before end of file.
 * The file must be opened in binary mode so offsets equal byte counts.
 */
static int find_next_row_tag(FILE *file, long from, int *row_num, long *tag_pos)
{
    long offset = from;
    int c;

    if (fseek(file, from, SEEK_SET) != 0) {
        return 0;
    }

    while ((c = fgetc(file)) != EOF) {
        offset++;
        if (c != '<') {
            continue;
        }

        long lt_pos = offset - 1;
        char tag[128];
        size_t len = 0;

        /* Collect the start of the tag; the row number is expected within
         * the first bytes, so a bounded buffer is sufficient. */
        tag[len++] = '<';
        while (len < sizeof tag - 1 && (c = fgetc(file)) != EOF && c != '>') {
            offset++;
            tag[len++] = (char)c;
        }
        tag[len] = '\0';
        if (c == EOF) {
            return 0;
        }
        if (c == '>') {
            offset++;
        }

        /* Require whitespace after the name so "<rowBreaks" is not matched. */
        if (strncmp(tag, "<row", 4) != 0 || !is_xml_space(tag[4])) {
            continue;
        }

        const char *attr = strstr(tag, " r=\"");
        if (attr == NULL) {
            continue;
        }

        char *end = NULL;
        long value = strtol(attr + 4, &end, 10);
        if (end == attr + 4) {
            continue;
        }

        *row_num = (int)value;
        *tag_pos = lt_pos;
        return 1;
    }
    return 0;
}

/*
 * Binary-search the worksheet file for the <row> whose r attribute equals
 * target_row. Rows are assumed to appear in ascending order.
 *
 * Returns the byte offset of the matching "<row" tag. If the exact row does
 * not exist, returns the offset of a row numbered below target_row (a safe
 * place to start parsing from), or 0 if no such row was seen.
 * The file position is unspecified afterwards.
 */
long binary_search_row(FILE *file, int target_row)
{
    if (fseek(file, 0, SEEK_END) != 0) {
        return 0;
    }

    long high = ftell(file);
    long low = 0;
    long best = 0;

    /* Each iteration either raises low or lowers high, so the loop always
     * terminates, including when the target row is missing. */
    while (low <= high) {
        long mid = low + (high - low) / 2;
        int row = 0;
        long pos = 0;

        if (!find_next_row_tag(file, mid, &row, &pos)) {
            high = mid - 1;     /* no row at or after mid: look earlier */
            continue;
        }
        if (row == target_row) {
            return pos;
        }
        if (row < target_row) {
            best = pos;
            low = pos + 1;      /* every offset in [mid, pos] yields this row */
        } else {
            high = mid - 1;
        }
    }
    return best;
}

/*
 * Convert the letter prefix of a cell reference ("B7", "AA12") to a 0-based
 * column index. Returns -1 if the reference does not start with a letter.
 */
static int column_index_from_ref(const char *ref)
{
    const char *p = ref;
    int index = 0;

    /* 16384 (column XFD) is the largest valid column; the bound also keeps
     * the arithmetic far away from integer overflow. */
    while (*p >= 'A' && *p <= 'Z' && index <= 16384) {
        index = index * 26 + (*p - 'A' + 1);
        p++;
    }
    return (p == ref) ? -1 : index - 1;
}

/* Write one CSV field, quoting it when it contains a comma, quote or line break. */
static void write_csv_field(FILE *out, const char *value)
{
    if (strpbrk(value, ",\"\r\n") == NULL) {
        fputs(value, out);
        return;
    }

    fputc('"', out);
    for (const char *p = value; *p != '\0'; p++) {
        if (*p == '"') {
            fputc('"', out);    /* a quote inside a quoted field is doubled */
        }
        fputc(*p, out);
    }
    fputc('"', out);
}

/*
 * expat start-tag handler.
 *  <row>    : starts a CSV line, skips rows before the range, and stops the
 *             parser at the first row after the range.
 *  <c>      : records the column of the cell.
 *  <v>/<t>  : starts collecting text if the cell is inside the column range.
 */
void XMLCALL start_element(void *user_data, const XML_Char *name, const XML_Char **attrs)
{
    ParserState *state = (ParserState *)user_data;

    if (strcmp(name, "row") == 0) {
        state->in_row = 1;
        state->skip_row = 0;
        state->current_row = -1;
        state->cell_index = -1;
        state->current_col = 0;
        state->next_col = state->range.start_col;
        state->value_started = 0;

        for (int i = 0; attrs[i]; i += 2) {
            if (strcmp(attrs[i], "r") == 0) {
                state->current_row = atoi(attrs[i + 1]);
                break;
            }
        }

        if (state->current_row > state->range.end_row) {
            /* Past the range: nothing more to output. Mark the row skipped
             * because the end handler may still fire for an empty <row/>. */
            state->skip_row = 1;
            state->finished = 1;
            XML_StopParser(state->parser, XML_FALSE);
            return;
        }
        if (state->current_row < state->range.start_row) {
            state->skip_row = 1;
            return;
        }
        fprintf(state->csv, "%d", state->current_row);
    } else if (strcmp(name, "c") == 0 && state->in_row && !state->skip_row) {
        int index = -1;

        for (int i = 0; attrs[i]; i += 2) {
            if (strcmp(attrs[i], "r") == 0) {
                index = column_index_from_ref(attrs[i + 1]);
                break;
            }
        }
        if (index < 0) {
            /* No usable reference: the cell follows the previous one. */
            index = state->cell_index + 1;
        }
        state->cell_index = index;
        /* Columns beyond Z cannot be expressed in the range; mark them 0 so
         * they never compare as inside it. */
        state->current_col = (index < 26) ? (char)('A' + index) : 0;
    } else if ((strcmp(name, "v") == 0 || strcmp(name, "t") == 0) &&
               state->in_row && !state->skip_row) {
        /* next_col never drops below range.start_col, so this also enforces
         * the lower bound and writes each column at most once per row. */
        if (state->current_col >= state->next_col &&
            state->current_col <= state->range.end_col) {
            state->value_started = 1;
            state->value_truncated = 0;
            state->value_len = 0;
            state->temp_value[0] = '\0';
        }
    }
}

/*
 * expat character-data handler: appends text to the current cell value.
 * If a chunk does not fit, the value is frozen at what has been stored so
 * far so that later chunks are not appended after a gap.
 */
void XMLCALL character_data(void *user_data, const XML_Char *s, int len)
{
    ParserState *state = (ParserState *)user_data;

    if (!state->value_started || state->value_truncated || len <= 0) {
        return;
    }
    if (len > MAX_CELL_CONTENT - 1 - state->value_len) {
        state->value_truncated = 1;
        return;
    }

    memcpy(state->temp_value + state->value_len, s, (size_t)len);
    state->value_len += len;
    state->temp_value[state->value_len] = '\0';
}

/*
 * expat end-tag handler.
 *  </v>, </t> : writes the collected value, preceded by empty fields for any
 *               columns skipped since the last written one.
 *  </row>     : pads the remaining columns of the range and ends the line.
 */
void XMLCALL end_element(void *user_data, const XML_Char *name)
{
    ParserState *state = (ParserState *)user_data;

    if (strcmp(name, "row") == 0) {
        if (state->in_row && !state->skip_row) {
            for (int col = state->next_col; col <= state->range.end_col; col++) {
                fputc(',', state->csv);
            }
            fputc('\n', state->csv);
        }
        state->in_row = 0;
    } else if ((strcmp(name, "v") == 0 || strcmp(name, "t") == 0) &&
               state->value_started) {
        for (int col = state->next_col; col < state->current_col; col++) {
            fputc(',', state->csv);
        }
        fputc(',', state->csv);
        write_csv_field(state->csv, state->temp_value);
        state->next_col = (char)(state->current_col + 1);
        state->value_started = 0;
    }
}

/*
 * Build the output name by replacing the extension of xml_path with ".csv"
 * (or appending ".csv" if the file name has no extension). Only a dot in the
 * last path component counts as an extension separator.
 * Returns 0 on success, -1 if the result does not fit in out.
 */
static int derive_csv_name(const char *xml_path, char *out, size_t out_size)
{
    const char *slash = strrchr(xml_path, '/');
    const char *base = slash ? slash + 1 : xml_path;
    const char *dot = strrchr(base, '.');
    size_t stem_len = dot ? (size_t)(dot - xml_path) : strlen(xml_path);

    if (stem_len >= out_size) {
        return -1;
    }

    int written = snprintf(out, out_size, "%.*s.csv", (int)stem_len, xml_path);
    return (written < 0 || (size_t)written >= out_size) ? -1 : 0;
}

/*
 * Stream the worksheet through expat starting at start_pos and write the
 * selected rows to csv. When start_pos is inside the file, a synthetic
 * "<sheetData>" root is fed first so the parser sees a well-formed document.
 * Returns 0 on success, -1 on failure (a message has been printed).
 */
static int parse_rows(FILE *file, FILE *csv, const ParseRange *range, long start_pos)
{
    static const char synthetic_root[] = "<sheetData>";
    char buffer[8192];
    size_t prefix = 0;
    int rc = 0;

    XML_Parser parser = XML_ParserCreate(NULL);
    if (parser == NULL) {
        printf("错误: 无法创建XML解析器\n");
        return -1;
    }

    ParserState state = {0};
    state.range = *range;
    state.csv = csv;
    state.parser = parser;
    XML_SetUserData(parser, &state);
    XML_SetElementHandler(parser, start_element, end_element);
    XML_SetCharacterDataHandler(parser, character_data);

    if (fseek(file, start_pos > 0 ? start_pos : 0, SEEK_SET) != 0) {
        printf("错误: 读取文件失败\n");
        XML_ParserFree(parser);
        return -1;
    }

    /* Only add the fake root when starting mid-file; at offset 0 the file
     * supplies its own XML declaration and root element. */
    if (start_pos > 0) {
        prefix = sizeof synthetic_root - 1;
        memcpy(buffer, synthetic_root, prefix);
    }

    for (;;) {
        size_t room = sizeof buffer - prefix;
        size_t got = fread(buffer + prefix, 1, room, file);
        int is_final = got < room;

        /* The last (short) chunk must be parsed too, with is_final set. */
        enum XML_Status status =
            XML_Parse(parser, buffer, (int)(prefix + got), is_final);
        prefix = 0;

        if (status == XML_STATUS_ERROR) {
            enum XML_Error code = XML_GetErrorCode(parser);
            /* Expected stops: we aborted after the last wanted row, or the
             * synthetic root was closed by the file's own </sheetData>. */
            int expected = state.finished ||
                           (start_pos > 0 && code == XML_ERROR_JUNK_AFTER_DOC_ELEMENT);
            if (!expected) {
                printf("错误: XML解析失败: %s\n", XML_ErrorString(code));
                rc = -1;
            }
            break;
        }
        if (is_final) {
            break;
        }
    }

    if (ferror(file)) {
        printf("错误: 读取文件失败\n");
        rc = -1;
    }

    XML_ParserFree(parser);
    return rc;
}

/*
 * Usage: prog <worksheet.xml> <range>
 * Writes the cells of the range to <worksheet>.csv next to the input, one
 * line per row: the row number followed by one field per column.
 * Cell values are written as stored in the XML (shared-string cells yield
 * their index, not the string). Exit status is 0 on success, 1 on error.
 */
int main(int argc, char *argv[])
{
    if (argc != 3) {
        printf("用法: %s <xml文件> <范围(A1:Z100)>\n", argv[0]);
        return 1;
    }

    ParseRange range;
    if (parse_excel_range(argv[2], &range) != 0) {
        printf("错误: 无效范围格式\n");
        return 1;
    }

    char csv_filename[256];
    /* Refuse to write onto the input file itself. */
    if (derive_csv_name(argv[1], csv_filename, sizeof csv_filename) != 0 ||
        strcmp(csv_filename, argv[1]) == 0) {
        printf("错误: 无法创建CSV\n");
        return 1;
    }

    /* Open the input first so a bad path does not leave an empty CSV behind. */
    FILE *file = fopen(argv[1], "rb");
    if (!file) {
        printf("错误: 无法打开文件 %s\n", argv[1]);
        return 1;
    }

    FILE *csv = fopen(csv_filename, "w");
    if (!csv) {
        printf("错误: 无法创建CSV\n");
        fclose(file);
        return 1;
    }

    fprintf(csv, "Row");
    for (int col = range.start_col; col <= range.end_col; col++) {
        fprintf(csv, ",%c", col);
    }
    fprintf(csv, "\n");

    long start_pos = binary_search_row(file, range.start_row);
    int status = parse_rows(file, csv, &range, start_pos);

    fclose(file);

    int write_failed = ferror(csv);
    if (fclose(csv) != 0) {
        write_failed = 1;
    }

    if (status != 0) {
        return 1;
    }
    if (write_failed) {
        printf("错误: 写入CSV失败\n");
        return 1;
    }

    printf("CSV已保存到 %s\n", csv_filename);
    return 0;
}

编译运行和比较

gcc expatfmt.c -o expatfmt -lexpat -O3
root@66d4e20ec1d7:/par# time ./expatfmt lineitem/xl/worksheets/sheet1.xml A500000:Z600000
CSV已保存到 lineitem/xl/worksheets/sheet1.csv

real	0m1.865s
user	0m1.836s
sys	0m0.028s

root@66d4e20ec1d7:/par# time ./aich2 lineitem/xl/worksheets/sheet1.xml A500000:Z600000 out.csv

real	0m2.870s
user	0m1.064s
sys	0m0.076s

文章转载自:

http://m8FNi4Qm.ckhpg.cn
http://s9HFrJ3m.ckhpg.cn
http://rNmGtXv7.ckhpg.cn
http://YdDzgy7c.ckhpg.cn
http://nFzkjagU.ckhpg.cn
http://9FqKusx2.ckhpg.cn
http://5t22Mmw7.ckhpg.cn
http://yyVXeyU5.ckhpg.cn
http://pQ2lew8X.ckhpg.cn
http://BUciDHzH.ckhpg.cn
http://wwq7RgNB.ckhpg.cn
http://SOYDxXvh.ckhpg.cn
http://JfPVuqye.ckhpg.cn
http://Kt7D8f3Y.ckhpg.cn
http://2IlFCcGl.ckhpg.cn
http://PFWdvC7o.ckhpg.cn
http://lkM4cZDs.ckhpg.cn
http://oIQOUVW0.ckhpg.cn
http://ufuEdk4z.ckhpg.cn
http://I87CB4GQ.ckhpg.cn
http://4hTEvo2D.ckhpg.cn
http://2WCJhYEz.ckhpg.cn
http://Etg0jAUe.ckhpg.cn
http://3ankzakf.ckhpg.cn
http://iClvamt1.ckhpg.cn
http://siNbvRUn.ckhpg.cn
http://rACq0A2R.ckhpg.cn
http://WsiPhvCQ.ckhpg.cn
http://1q6jaaHs.ckhpg.cn
http://FXtmfax5.ckhpg.cn
http://www.dtcms.com/a/368310.html

相关文章:

  • GPU测速方法
  • OpenCV C++ 色彩空间详解:转换、应用与 LUT 技术
  • 前端笔记2025
  • 跨境电商:如何提高电商平台数据抓取效率?
  • python + Flask模块学习 2 接收用户请求并返回json数据
  • K8S-Pod(上)
  • 【代码随想录day 23】 力扣 93.复原IP地址
  • 数据结构:栈和队列(下)
  • SAP官方授权供应商名单2025
  • 结构体简介
  • UE4 Mac构建编译报错 no template named “is_void_v” in namespace “std”
  • 嵌入式系统学习Day30(udp)
  • 【Linux】Linux进程状态和僵尸进程:一篇看懂“进程在忙啥”
  • 理解UE4中C++17的...符号及enable_if_t的用法及SFINAE思想
  • 某头部能源集团“数据治理”到“数智应用”跃迁案例剖析
  • 阿里云服务器配置ssl-docker nginx
  • 2025年COR SCI2区,基于近似细胞分解的能源高效无人机路径规划问题用于地质灾害监测,深度解析+性能实测
  • 实战案例:数字孪生+可视化大屏,如何高效管理智慧能源园区?
  • 容器的定义及工作原理
  • 【Python - 类库 - BeautifulSoup】(01)“BeautifulSoup“使用示例
  • 神经网络之深入理解偏置
  • 三、神经网络
  • 仓颉编程语言青少年基础教程:布尔类型、元组类型
  • UC Berkeley 开源大世界模型(LWM):多模态大模型领域世界模型技术新进展
  • 一次由CellStyle.hashCode值不一致引发的HashMap.get返回null问题排查
  • 【Java鱼皮】智能协同云图库项目梳理
  • 固定资产报废在BPM或OA中审批,再通过接口传到SAP
  • Redis-持久化
  • 寻找AI——初识3D建模AI
  • Playwright MCP Server - FAQ