我改写的二分法XML转CSV程序,速度追上了张泽鹏先生的程序
以下是美团龙猫初稿,我改正,DeepSeek重新格式化的代码。
重要改正点:
1.二分查找用goto控制迭代,返回<row的正确位置
2.在缓冲区头填上父标签使expat能连续解析不报错
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <expat.h>

/*
 * Extract a rectangular cell range (single-letter columns A..Z) from a
 * SpreadsheetML worksheet XML file into a CSV file.
 *
 * To avoid parsing the whole sheet, the byte offset of a <row> element at or
 * just before the first requested row is located by bisecting the file, and
 * expat is started from there.  Because that offset is in the middle of the
 * document, a synthetic <sheetData> root element is fed to the parser first
 * so that the <row> elements have a parent.
 */

#define MAX_CELL_CONTENT 256

/* Synthetic root fed to expat when parsing starts in the middle of the file. */
#define FRAGMENT_ROOT "<sheetData>"

/* Inclusive cell range; columns are single upper-case letters. */
typedef struct {
    int start_row;
    int end_row;
    char start_col;
    char end_col;
} ParseRange;

/* State shared between the expat callbacks. */
typedef struct {
    ParseRange range;
    FILE *csv;                 /* output stream */
    FILE *xml_file;            /* not used by the code in this file */
    XML_Parser parser;
    int in_row;                /* non-zero between <row> and </row> */
    int current_row;           /* value of the row's r attribute, -1 if absent */
    char current_col;          /* column of the open <c>, 0 if none/unsupported */
    int value_started;         /* non-zero while inside <v> or <t> of a wanted cell */
    char temp_value[MAX_CELL_CONTENT]; /* text collected for the open cell */
    int value_len;
    int skip_row;              /* non-zero when the open row is outside the range */
    long row_start_pos;        /* parser byte index of the last <row>; written only */
    int first_row_processed;   /* not used by the code in this file */
    char first_row_max_col;    /* not used by the code in this file */
    char next_col;             /* next column to be written in the open row */
} ParserState;

/* Upper-cases *col in place; returns 0 if it is a letter A..Z, -1 otherwise. */
static int normalize_column(char *col)
{
    if (*col >= 'a' && *col <= 'z') {
        *col = (char)(*col - 'a' + 'A');
    }
    return (*col >= 'A' && *col <= 'Z') ? 0 : -1;
}

/*
 * Parse a range such as "A1:Z100" into *range.
 *
 * Columns must be single letters (lower case is accepted and upper-cased),
 * rows must be >= 1, start must not exceed end, and nothing may follow the
 * range.  Returns 0 on success and -1 on any violation; *range may be
 * partially written on failure.
 */
int parse_excel_range(const char *range_str, ParseRange *range)
{
    int consumed = 0;

    if (range_str == NULL || range == NULL) {
        return -1;
    }
    /* %9d bounds the digit count so the conversion cannot overflow an int. */
    if (sscanf(range_str, "%c%9d:%c%9d%n", &range->start_col, &range->start_row,
               &range->end_col, &range->end_row, &consumed) != 4) {
        return -1;
    }
    if (range_str[consumed] != '\0') {
        return -1;
    }
    if (normalize_column(&range->start_col) != 0 ||
        normalize_column(&range->end_col) != 0) {
        return -1;
    }
    if (range->start_row < 1) {
        return -1;
    }
    if (range->start_col > range->end_col) {
        return -1;
    }
    if (range->start_row > range->end_row) {
        return -1;
    }
    return 0;
}

/*
 * Scan forward from byte offset `from` for the next "<row ... r="N" ...>" tag.
 * On success stores the offset of its '<' in *tag_pos and N in *row_num and
 * returns 1; returns 0 when the end of the file is reached first.
 * Only the first 127 bytes of a tag are inspected, and the r attribute must
 * use double quotes.
 */
static int find_next_row_tag(FILE *file, long from, long *tag_pos, int *row_num)
{
    int c;

    if (fseek(file, from, SEEK_SET) != 0) {
        return 0;
    }
    while ((c = fgetc(file)) != EOF) {
        char tag[128];
        size_t len = 0;
        long pos;
        const char *attr;

        if (c != '<') {
            continue;
        }
        pos = ftell(file) - 1;
        tag[len++] = '<';
        while (len < sizeof tag - 1 && (c = fgetc(file)) != EOF && c != '>') {
            tag[len++] = (char)c;
        }
        tag[len] = '\0';

        if (strncmp(tag, "<row", 4) != 0) {
            continue;
        }
        /* Reject longer names that merely start with "row" (e.g. rowBreaks). */
        if (tag[4] != '\0' && tag[4] != ' ' && tag[4] != '\t' &&
            tag[4] != '\r' && tag[4] != '\n' && tag[4] != '/') {
            continue;
        }
        attr = strstr(tag, " r=\"");
        if (attr == NULL) {
            continue;
        }
        *tag_pos = pos;
        *row_num = (int)strtol(attr + 4, NULL, 10);
        return 1;
    }
    return 0;
}

/*
 * Bisect the file for the <row> element whose r attribute equals target_row.
 *
 * Returns the byte offset of that row's '<'.  If the row is absent, returns
 * the offset of the closest preceding row that was found, or 0 when no row
 * with a smaller number exists (the caller then parses from the start).
 * Relies on rows appearing in ascending r order.  The file position is left
 * unspecified.
 */
long binary_search_row(FILE *file, int target_row)
{
    long low = 0;
    long high;
    long best = 0;

    if (file == NULL || fseek(file, 0, SEEK_END) != 0) {
        return 0;
    }
    high = ftell(file);

    /* Every iteration strictly shrinks [low, high), so the loop terminates
     * even when target_row does not exist in the sheet. */
    while (low < high) {
        long mid = low + (high - low) / 2;
        long pos = 0;
        int row = 0;

        if (!find_next_row_tag(file, mid, &pos, &row)) {
            high = mid;               /* no row at or after mid */
        } else if (row == target_row) {
            return pos;
        } else if (row < target_row) {
            best = pos;
            low = pos + 1;            /* any start <= pos finds a row <= this one */
        } else {
            high = mid;
        }
    }
    return best;
}

/* Returns the value of attribute `key` from an expat attribute list, or NULL. */
static const XML_Char *find_attr(const XML_Char **attrs, const char *key)
{
    for (int i = 0; attrs[i] != NULL; i += 2) {
        if (strcmp(attrs[i], key) == 0) {
            return attrs[i + 1];
        }
    }
    return NULL;
}

/*
 * Column letter of a cell reference such as "C12".  Returns 0 for references
 * that do not start with A..Z or that have a multi-letter column ("AA1"),
 * which can never fall inside a single-letter range.
 */
static char cell_column(const XML_Char *ref)
{
    char first = ref[0];
    char second = (first != '\0') ? ref[1] : '\0';

    if (first < 'A' || first > 'Z') {
        return 0;
    }
    if (second >= 'A' && second <= 'Z') {
        return 0;
    }
    return first;
}

/* Writes one CSV field, quoting it only if it contains , " CR or LF. */
static void write_csv_field(FILE *out, const char *value)
{
    if (strpbrk(value, ",\"\r\n") == NULL) {
        fputs(value, out);
        return;
    }
    fputc('"', out);
    for (const char *p = value; *p != '\0'; p++) {
        if (*p == '"') {
            fputc('"', out);
        }
        fputc(*p, out);
    }
    fputc('"', out);
}

/*
 * Writes the collected value of the cell that is being closed, preceded by
 * empty fields for any columns skipped since the previous written cell.
 * Cells outside the range, cells without text, and cells that would go
 * backwards in the row are ignored.
 */
static void emit_cell(ParserState *state)
{
    char col = state->current_col;

    if (col < state->range.start_col || col > state->range.end_col) {
        return;
    }
    if (state->value_len == 0 || col < state->next_col) {
        return;
    }
    for (char pad = state->next_col; pad < col; pad++) {
        fputc(',', state->csv);
    }
    fputc(',', state->csv);
    write_csv_field(state->csv, state->temp_value);
    state->next_col = (char)(col + 1);
}

/*
 * expat start-element handler.
 *  - <row>: reads r; stops the parser once past the range, marks rows before
 *    the range as skipped, otherwise writes the row number as first field.
 *  - <c>:   records the cell's column and clears the value buffer.
 *  - <v>/<t>: starts text capture when the open cell is inside the range.
 */
void XMLCALL start_element(void *user_data, const XML_Char *name, const XML_Char **attrs)
{
    ParserState *state = (ParserState *)user_data;

    if (strcmp(name, "row") == 0) {
        const XML_Char *r = find_attr(attrs, "r");

        state->row_start_pos = (long)XML_GetCurrentByteIndex(state->parser);
        state->in_row = 1;
        state->skip_row = 0;
        state->current_col = 0;
        state->value_started = 0;
        state->next_col = state->range.start_col;
        state->current_row = (r != NULL) ? (int)strtol(r, NULL, 10) : -1;

        if (state->current_row > state->range.end_row) {
            /* expat may still deliver the end tag of an empty <row/> after a
             * stop, so flag the row as skipped before aborting. */
            state->skip_row = 1;
            XML_StopParser(state->parser, XML_FALSE);
            return;
        }
        if (state->current_row < state->range.start_row) {
            state->skip_row = 1;
            return;
        }
        fprintf(state->csv, "%d", state->current_row);
    } else if (strcmp(name, "c") == 0 && state->in_row && !state->skip_row) {
        const XML_Char *ref = find_attr(attrs, "r");

        /* A cell without an r attribute is ignored. */
        state->current_col = (ref != NULL) ? cell_column(ref) : 0;
        state->value_started = 0;
        state->value_len = 0;
        state->temp_value[0] = '\0';
    } else if ((strcmp(name, "v") == 0 || strcmp(name, "t") == 0) &&
               state->in_row && !state->skip_row) {
        if (state->current_col >= state->range.start_col &&
            state->current_col <= state->range.end_col) {
            state->value_started = 1;
        }
    }
}

/*
 * expat character-data handler: appends text to the open cell's buffer.
 * Text beyond MAX_CELL_CONTENT - 1 bytes is truncated (byte-wise, so a
 * multi-byte UTF-8 sequence may be cut).
 */
void XMLCALL character_data(void *user_data, const XML_Char *s, int len)
{
    ParserState *state = (ParserState *)user_data;
    int room;

    if (!state->value_started || len <= 0) {
        return;
    }
    room = MAX_CELL_CONTENT - 1 - state->value_len;
    if (len > room) {
        len = room;
    }
    if (len > 0) {
        memcpy(state->temp_value + state->value_len, s, (size_t)len);
        state->value_len += len;
        state->temp_value[state->value_len] = '\0';
    }
}

/*
 * expat end-element handler.
 *  - </v>, </t>: stops text capture.
 *  - </c>: writes the cell (text from several <t> runs is concatenated).
 *  - </row>: pads the remaining columns of the range and ends the CSV line.
 */
void XMLCALL end_element(void *user_data, const XML_Char *name)
{
    ParserState *state = (ParserState *)user_data;

    if (strcmp(name, "row") == 0) {
        if (state->in_row && !state->skip_row) {
            for (char col = state->next_col; col <= state->range.end_col; col++) {
                fputc(',', state->csv);
            }
            fputc('\n', state->csv);
        }
        state->in_row = 0;
    } else if (strcmp(name, "c") == 0) {
        if (state->in_row && !state->skip_row) {
            emit_cell(state);
        }
        state->current_col = 0;
        state->value_started = 0;
    } else if (strcmp(name, "v") == 0 || strcmp(name, "t") == 0) {
        state->value_started = 0;
    }
}

/*
 * Builds "<xml_path without extension>.csv" in freshly allocated memory.
 * Only a dot inside the last path component counts as an extension.
 * Returns NULL if allocation fails; the caller frees the result.
 */
static char *make_csv_filename(const char *xml_path)
{
    static const char suffix[] = ".csv";
    const char *slash = strrchr(xml_path, '/');
    const char *base = (slash != NULL) ? slash + 1 : xml_path;
    const char *dot = strrchr(base, '.');
    size_t stem_len = (dot != NULL) ? (size_t)(dot - xml_path) : strlen(xml_path);
    char *out = malloc(stem_len + sizeof suffix);

    if (out == NULL) {
        return NULL;
    }
    memcpy(out, xml_path, stem_len);
    memcpy(out + stem_len, suffix, sizeof suffix);   /* includes the NUL */
    return out;
}

/*
 * Usage: prog <worksheet.xml> <range, e.g. A1:Z100>
 * Writes the range to <worksheet>.csv next to the input.
 * Returns 0 on success and 1 on any error.
 */
int main(int argc, char *argv[])
{
    int rc = 1;
    int failed = 0;
    int fragment;
    long start_pos;
    char *csv_filename = NULL;
    FILE *file = NULL;
    FILE *csv = NULL;
    XML_Parser parser = NULL;
    ParseRange range;
    ParserState state = {0};
    enum XML_Status status = XML_STATUS_OK;
    char buffer[8192];

    if (argc != 3) {
        printf("用法: %s <xml文件> <范围(A1:Z100)>\n", argv[0]);
        return 1;
    }
    if (parse_excel_range(argv[2], &range) != 0) {
        printf("错误: 无效范围格式\n");
        return 1;
    }

    csv_filename = make_csv_filename(argv[1]);
    if (csv_filename == NULL) {
        printf("错误: 内存不足\n");
        return 1;
    }
    /* An input that already ends in .csv would be truncated by the output. */
    if (strcmp(csv_filename, argv[1]) == 0) {
        printf("错误: 输入文件与输出文件相同\n");
        goto cleanup;
    }

    /* Open the input first so a missing input leaves no stray CSV behind. */
    file = fopen(argv[1], "rb");
    if (file == NULL) {
        printf("错误: 无法打开文件 %s\n", argv[1]);
        goto cleanup;
    }
    csv = fopen(csv_filename, "w");
    if (csv == NULL) {
        printf("错误: 无法创建CSV\n");
        goto cleanup;
    }
    parser = XML_ParserCreate(NULL);
    if (parser == NULL) {
        printf("错误: 无法创建XML解析器\n");
        goto cleanup;
    }

    fprintf(csv, "Row");
    for (char col = range.start_col; col <= range.end_col; col++) {
        fprintf(csv, ",%c", col);
    }
    fprintf(csv, "\n");

    start_pos = binary_search_row(file, range.start_row);
    /* Offset 0 means "no earlier row found": parse the whole document as is.
     * Otherwise parsing starts at a <row> and needs a synthetic parent. */
    fragment = (start_pos > 0);
    if (fseek(file, fragment ? start_pos : 0L, SEEK_SET) != 0) {
        printf("错误: 无法定位文件 %s\n", argv[1]);
        goto cleanup;
    }

    state.range = range;
    state.csv = csv;
    state.parser = parser;
    XML_SetUserData(parser, &state);
    XML_SetElementHandler(parser, start_element, end_element);
    XML_SetCharacterDataHandler(parser, character_data);

    if (fragment) {
        status = XML_Parse(parser, FRAGMENT_ROOT, (int)(sizeof(FRAGMENT_ROOT) - 1), 0);
    }
    while (status == XML_STATUS_OK) {
        size_t len = fread(buffer, 1, sizeof buffer, file);
        int done = (len < sizeof buffer);

        /* The last (short) chunk must be parsed too, flagged as final so
         * that expat flushes anything it has buffered. */
        status = XML_Parse(parser, buffer, (int)len, done);
        if (done) {
            break;
        }
    }

    if (status != XML_STATUS_OK) {
        enum XML_Error err = XML_GetErrorCode(parser);
        /* ABORTED: start_element stopped the parser past the last row.
         * JUNK_AFTER_DOC_ELEMENT in fragment mode: the real document goes on
         * after </sheetData> closed the synthetic root. */
        int expected = (err == XML_ERROR_ABORTED) ||
                       (fragment && err == XML_ERROR_JUNK_AFTER_DOC_ELEMENT);

        if (!expected) {
            printf("错误: XML解析失败: %s\n", XML_ErrorString(err));
            failed = 1;
        }
    }
    if (ferror(file)) {
        printf("错误: 读取文件 %s 失败\n", argv[1]);
        failed = 1;
    }

    {
        int write_failed = ferror(csv);

        if (fclose(csv) != 0) {
            write_failed = 1;
        }
        csv = NULL;
        if (write_failed) {
            printf("错误: 写入CSV失败\n");
            failed = 1;
        }
    }

    if (!failed) {
        printf("CSV已保存到 %s\n", csv_filename);
        rc = 0;
    }

cleanup:
    if (parser != NULL) {
        XML_ParserFree(parser);
    }
    if (csv != NULL) {
        fclose(csv);
    }
    if (file != NULL) {
        fclose(file);
    }
    free(csv_filename);
    return rc;
}
编译运行和比较
gcc expatfmt.c -o expatfmt -lexpat -O3
root@66d4e20ec1d7:/par# time ./expatfmt lineitem/xl/worksheets/sheet1.xml A500000:Z600000
CSV已保存到 lineitem/xl/worksheets/sheet1.csv

real 0m1.865s
user 0m1.836s
sys 0m0.028s
root@66d4e20ec1d7:/par# time ./aich2 lineitem/xl/worksheets/sheet1.xml A500000:Z600000 out.csv

real 0m2.870s
user 0m1.064s
sys 0m0.076s