当前位置：首页 > news >正文

利用美团龙猫添加xlsx的sheet.xml读取sharedStrings.xml中共享字符串输出到csv功能

news 2025/9/12 13:03:39

提示词

请添加对sharedStrings.xml的支持。
结构如下

<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<sst xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main" count="1421" uniqueCount="686"><si><t>Unique Key</t></si><si><t>Created Date</t></si><si><t>Closed Date</t></si><si><t>Agency</t></si></sst>

uniqueCount="686"代表它保存的条目数，每个字符串索引从0开始递增
然后sheet.xml

<sheetData><row r="1" spans="1:41"><c r="A1" t="s"><v>0</v></c><c r="B1" t="s"><v>1</v></c><c r="C1" t="s"><v>2</v></c><c r="D1" t="s"><v>3</v></c><c r="E1" t="s"><v>4</v></c><c r="F1" t="s"><v>5</v></c>

会引用sharedStrings.xml的字符串，当t="s"时，<v>和</v>中间的值就是<si><t>和</t></si>中间的字符串索引，比如0代表Unique Key，在csv中输出Unique Key。同一个索引可能多次出现，
思路是把sharedStrings.xml中每个条目的开始字节和长度存入数组a,b，然后memcpy a[index],b[index]
请用2个函数实现读取共享字符串条目和从索引恢复条目，并给出原始代码中要增加的部分，不做别的。

他按要求给出了，但是我改到原代码里出现一个编译错误，

error: assignment of read-only location '*(val + (sizetype)(shared_strings + (sizetype)((long unsigned int)idx * 16))->len)'
  376 |                     val[shared_strings[idx].len] = 0;
      |                                                  ^

原因是val的定义是const char* val = NULL;，但是后面对它引用的地址中的内容做了修改，通过引入另一个变量char* val2解决了。具体见如下代码

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>// 读取sharedStrings.xml，返回每个字符串的起始位置和长度
// 返回数组，每个元素为{start, len}
/*
 * One shared-string entry: a view into the memory-mapped sharedStrings.xml.
 * The text is NOT NUL-terminated; exactly `len` bytes starting at `start`
 * belong to the entry.
 */
struct str_pos {
    const char *start; /* first byte of the entry's text inside the mapping */
    int len;           /* number of text bytes */
};

/*
 * Bounded substring search: first occurrence of the NUL-terminated `needle`
 * inside [from, end), or NULL. Needed because the mapping is not guaranteed
 * to be NUL-terminated, so strstr() may run past its end.
 */
static const char *sst_find(const char *from, const char *end, const char *needle)
{
    size_t nlen = strlen(needle);

    if (nlen == 0)
        return from;
    while (from < end && (size_t)(end - from) >= nlen) {
        const char *hit = memchr(from, needle[0], (size_t)(end - from) - nlen + 1);
        if (!hit)
            return NULL;
        if (memcmp(hit, needle, nlen) == 0)
            return hit;
        from = hit + 1;
    }
    return NULL;
}

/*
 * Record the text of the first <t> element found in [from, end) into `out`.
 * Accepts <t>, <t attr="..."> and leaves `out` untouched (empty) for <t/>
 * or when no complete <t>...</t> pair exists in the range.
 */
static void sst_first_text(const char *from, const char *end, struct str_pos *out)
{
    const char *t;

    for (t = sst_find(from, end, "<t"); t; t = sst_find(t + 2, end, "<t")) {
        const char *after = t + 2;
        const char *gt;
        const char *text;
        const char *close;

        if (after >= end || *after == '/')
            return; /* truncated input or empty element <t/> */
        if (*after != '>' && *after != ' ' && *after != '\t' &&
            *after != '\n' && *after != '\r')
            continue; /* some other element whose name merely starts with 't' */

        gt = memchr(after, '>', (size_t)(end - after));
        if (!gt || gt[-1] == '/')
            return; /* unterminated tag, or <t attr="..."/> */
        text = gt + 1;
        close = sst_find(text, end, "</t>");
        if (!close)
            return;
        out->start = text;
        out->len = (int)(close - text);
        return;
    }
}

/**
 * Memory-map sharedStrings.xml and index the text of every <si> item.
 *
 * Entry i describes the i-th <si> element in file order, which is the index
 * that worksheet cells with t="s" store in <v>. Only the first <t> element of
 * an item is indexed, so rich-text items made of several runs yield their
 * first run only. An item without text gets len == 0.
 *
 * Layout of the returned array (a contract with free_shared_strings):
 *   [0 .. *count-1]  the entries
 *   [*count]         { start = base of the mapping, len = mapping size }
 *   [*count + 1]     { NULL, 0 } terminator
 *
 * @param fname  path of sharedStrings.xml
 * @param count  receives the number of entries (set to 0 on failure)
 * @return array to be released with free_shared_strings(), or NULL if the
 *         file cannot be opened/mapped, is empty, is too large for the int
 *         length fields, or memory runs out
 */
struct str_pos *read_shared_strings(const char *fname, int *count)
{
    struct stat sb;
    struct str_pos *arr;
    const char *end;
    const char *p;
    char *mapped;
    size_t total = 0;
    size_t n = 0;
    int map_len;
    int fd;

    if (count)
        *count = 0;
    if (!fname || !count)
        return NULL;

    fd = open(fname, O_RDONLY);
    if (fd < 0)
        return NULL;
    if (fstat(fd, &sb) < 0) {
        close(fd);
        return NULL;
    }
    /* Lengths are stored in int fields; refuse files that do not fit. */
    map_len = (int)sb.st_size;
    if (sb.st_size <= 0 || (off_t)map_len != sb.st_size) {
        close(fd);
        return NULL;
    }

    mapped = mmap(NULL, (size_t)map_len, PROT_READ, MAP_PRIVATE, fd, 0);
    close(fd); /* the mapping stays valid after the descriptor is closed */
    if (mapped == MAP_FAILED)
        return NULL;
    end = mapped + map_len;

    /* Size the array from the data itself rather than trusting uniqueCount. */
    for (p = sst_find(mapped, end, "<si>"); p; p = sst_find(p + 4, end, "<si>"))
        total++;

    /* +2: one slot for the mapping record, one for the NULL terminator. */
    arr = calloc(total + 2, sizeof *arr);
    if (!arr) {
        munmap(mapped, (size_t)map_len);
        return NULL;
    }

    p = mapped;
    while (n < total) {
        const char *si = sst_find(p, end, "<si>");
        const char *si_end;

        if (!si)
            break;
        si += 4;
        si_end = sst_find(si, end, "</si>");
        if (!si_end)
            si_end = end;

        /* Default to an empty entry so the index stays aligned with <si>. */
        arr[n].start = si;
        arr[n].len = 0;
        sst_first_text(si, si_end, &arr[n]);
        n++;
        p = si_end;
    }

    arr[n].start = mapped;
    arr[n].len = map_len;
    /* arr[n + 1] is already {NULL, 0} thanks to calloc. */

    *count = (int)n;
    return arr;
}

/**
 * Return a pointer to the text of entry `index`.
 *
 * The returned text is not NUL-terminated unless the entry is empty (then a
 * literal "" is returned); use arr[index].len for its length. The caller must
 * keep `index` below the count reported by read_shared_strings().
 */
const char *get_shared_string(struct str_pos *arr, int index)
{
    if (!arr || index < 0 || arr[index].len == 0)
        return "";
    return arr[index].start;
}

/**
 * Unmap the file and free an array returned by read_shared_strings().
 * Passing NULL is a no-op.
 */
void free_shared_strings(struct str_pos *arr)
{
    size_t n = 0;

    if (!arr)
        return;
    /* Real entries never have a NULL start, so the first NULL is the terminator. */
    while (arr[n].start)
        n++;
    /* The slot just before the terminator records the mapping itself. */
    if (n > 0)
        munmap((void *)arr[n - 1].start, (size_t)arr[n - 1].len);
    free(arr);
}
// Helper: search for a substring inside a bounded range
/**
 * Find the LAST occurrence of `needle` that lies entirely inside
 * [haystack, end).
 *
 * The range does not have to be NUL-terminated. An empty needle matches at
 * `end`.
 *
 * @return pointer to the match, or NULL if there is none (or the needle is
 *         longer than the range)
 */
char *strrstr(const char *haystack, const char *end, const char *needle)
{
    size_t span;
    size_t nlen;
    size_t off;

    if (!haystack || !end || !needle || end < haystack)
        return NULL;

    span = (size_t)(end - haystack);
    nlen = strlen(needle); /* computed once, not per iteration */
    if (nlen > span)
        return NULL;

    /* Walk offsets downwards; counting offsets avoids forming a pointer
     * before `haystack`, which would be undefined behaviour. */
    for (off = span - nlen + 1; off-- > 0; ) {
        const char *cand = haystack + off;
        if (memcmp(cand, needle, nlen) == 0)
            return (char *)cand;
    }
    return NULL;
}
/**
 * Convert the letter part of a cell reference to a zero-based column index:
 * "A" -> 0, "B" -> 1, "Z" -> 25, "AA" -> 26. Upper and lower case are both
 * accepted.
 *
 * Scanning stops at the first non-letter or after `len` characters, so the
 * whole reference ("B12") may be passed.
 *
 * @param col  cell reference or column letters
 * @param len  maximum number of characters to examine
 * @return zero-based column index, or -1 if there is no leading letter
 */
int col_letter_to_num(const char *col, int len)
{
    int num = 0;

    if (!col)
        return -1;

    for (int i = 0; i < len; i++) {
        char ch = col[i];

        if (ch >= 'A' && ch <= 'Z')
            num = num * 26 + (ch - 'A' + 1);
        else if (ch >= 'a' && ch <= 'z')
            num = num * 26 + (ch - 'a' + 1);
        else
            break;
    }
    /* Letters are base-26 digits 1..26; shift to a zero-based index. */
    return num - 1;
}
// Look up the value of an attribute
/**
 * Extract the value of attribute `attr` from the text in [start, end).
 *
 * A match requires the exact name immediately followed by '=' and a single-
 * or double-quoted value. The name must sit at `start` or follow whitespace,
 * so looking for "t" no longer matches the tail of customFormat="1".
 * If the closing quote is missing, the value runs to `end`.
 *
 * @return newly malloc'ed NUL-terminated copy of the value (the caller frees
 *         it), or NULL if the attribute is absent or memory runs out
 */
const char *get_attr(const char *start, const char *end, const char *attr)
{
    size_t attr_len;
    const char *p;

    if (!start || !end || !attr || end <= start)
        return NULL;
    attr_len = strlen(attr);

    /* p[attr_len] (the '=') must be inside the range. */
    for (p = start; (size_t)(end - p) > attr_len; p++) {
        const char *q;
        const char *val;
        size_t n;
        char quote;
        char *ret;

        if (p[attr_len] != '=' || memcmp(p, attr, attr_len) != 0)
            continue;
        /* Reject a name that is only the suffix of a longer attribute name. */
        if (p != start && p[-1] != ' ' && p[-1] != '\t' &&
            p[-1] != '\n' && p[-1] != '\r')
            continue;

        q = p + attr_len + 1;
        if (q >= end || (*q != '"' && *q != '\''))
            continue;
        quote = *q++;
        val = q;
        while (q < end && *q != quote)
            q++;

        n = (size_t)(q - val);
        ret = malloc(n + 1);
        if (!ret)
            return NULL;
        memcpy(ret, val, n);
        ret[n] = '\0';
        return ret;
    }
    return NULL;
}
// Locate an opening tag by name
/**
 * Locate the first opening tag named `tag` inside [start, end).
 *
 * A hit is a '<' followed by the exact name and then either a space or '>'.
 * The character after the name must itself lie inside the range.
 *
 * @return pointer to the '<' of the tag, or NULL when no such tag is found
 */
const char *get_tagname(const char *start, const char *end, const char *tag)
{
    int name_len = strlen(tag);

    for (const char *cur = start; end - cur > name_len + 1; cur++) {
        char after;

        if (cur[0] != '<')
            continue;
        if (strncmp(cur + 1, tag, name_len) != 0)
            continue;

        after = cur[name_len + 1];
        if (after == ' ' || after == '>')
            return cur;
    }
    return NULL;
}
// Extract the text content of a tag
/*
 * First "</" whose two bytes both lie inside [from, end), or NULL.
 */
static const char *tagcontent_find_close(const char *from, const char *end)
{
    while (from < end) {
        const char *lt = memchr(from, '<', (size_t)(end - from));
        if (!lt || end - lt < 2)
            return NULL;
        if (lt[1] == '/')
            return lt;
        from = lt + 1;
    }
    return NULL;
}

/**
 * Return the text between <tag ...> and the matching </tag> in [start, end).
 *
 * The element must contain no child elements: the first closing tag after
 * the opening tag has to be </tag>, otherwise scanning continues with the
 * next candidate. Nothing outside [start, end) is read, so the range need
 * not be NUL-terminated, and a closing tag that does not fit completely in
 * the range is not accepted.
 *
 * @return newly malloc'ed NUL-terminated copy of the content (the caller
 *         frees it), or NULL if no such element exists or memory runs out
 */
const char *get_tagcontent(const char *start, const char *end, const char *tag)
{
    size_t tag_len;
    const char *p;

    if (!start || !end || !tag || end <= start)
        return NULL;
    tag_len = strlen(tag);

    p = start;
    while (p < end) {
        const char *lt = memchr(p, '<', (size_t)(end - p));
        const char *gt;
        const char *content;
        const char *close;
        char delim;

        if (!lt)
            return NULL;
        p = lt + 1;

        /* The name plus one delimiter byte must fit in the range. */
        if ((size_t)(end - p) <= tag_len)
            return NULL;
        if (memcmp(p, tag, tag_len) != 0)
            continue;
        /* Exact name only: "<v" must not match "<vt ...>". */
        delim = p[tag_len];
        if (delim != '>' && delim != '/' && delim != ' ' &&
            delim != '\t' && delim != '\n' && delim != '\r')
            continue;

        gt = memchr(p + tag_len, '>', (size_t)(end - (p + tag_len)));
        if (!gt)
            return NULL;
        content = gt + 1;

        close = tagcontent_find_close(content, end);
        if (!close)
            return NULL;

        /* "</" + name + ">" must fit entirely inside the range. */
        if ((size_t)(end - close) >= tag_len + 3 &&
            memcmp(close + 2, tag, tag_len) == 0 &&
            close[2 + tag_len] == '>') {
            size_t n = (size_t)(close - content);
            char *ret = malloc(n + 1);

            if (!ret)
                return NULL;
            memcpy(ret, content, n);
            ret[n] = '\0';
            return ret;
        }
    }
    return NULL;
}
// Find the next tag
/**
 * Return the position of the first '<' in [start, end), or NULL when the
 * range is empty or contains none.
 */
const char *next_tag(const char *start, const char *end)
{
    if (start >= end)
        return NULL;
    return memchr(start, '<', (size_t)(end - start));
}
// Find where a tag name ends
/**
 * Advance from `tag` to the first '>' or space, or to the terminating NUL if
 * neither occurs. `tag` must be a NUL-terminated string.
 *
 * @return pointer to that delimiter (or to the NUL terminator)
 */
const char *tag_end(const char *tag)
{
    return tag + strcspn(tag, "> ");
}
// Look up an attribute value, returning a pointer into the tag itself
/**
 * Locate the value of attribute `attr` inside the NUL-terminated string
 * `tag` without copying it.
 *
 * Scanning stops at the first '>' or at the end of the string. The name must
 * be at the very start or follow whitespace, be immediately followed by '='
 * and a single- or double-quoted value. If the closing quote is missing, the
 * value extends to the end of the string.
 *
 * @param val_end  receives a pointer one past the last byte of the value, or
 *                 NULL when the attribute is not found; may itself be NULL
 * @return pointer to the first byte of the value inside `tag`, or NULL
 */
const char *tag_attrval(const char *tag, const char *attr, const char **val_end)
{
    size_t attr_len;
    const char *p;

    if (val_end)
        *val_end = NULL;
    if (!tag || !attr)
        return NULL;
    attr_len = strlen(attr);

    for (p = tag; *p && *p != '>'; p++) {
        const char *q;
        const char *val;
        char quote;

        /* A successful strncmp guarantees p[attr_len] is readable. */
        if (strncmp(p, attr, attr_len) != 0 || p[attr_len] != '=')
            continue;
        /* Reject a name that is only the suffix of a longer attribute name. */
        if (p != tag && p[-1] != ' ' && p[-1] != '\t' &&
            p[-1] != '\n' && p[-1] != '\r')
            continue;

        q = p + attr_len + 1;
        /* Not a quoted value: resume from the next byte instead of jumping
         * ahead, which could step over the terminating NUL or '>'. */
        if (*q != '"' && *q != '\'')
            continue;

        quote = *q++;
        val = q;
        while (*q && *q != quote)
            q++;
        if (val_end)
            *val_end = q;
        return val;
    }
    return NULL;
}
// Program entry point
int main(int argc, char* argv[]) {// 命令行参数改为：argv[1]为sheet.xml, argv[2]为sharedStrings.xml（可选）if (argc != 2 && argc != 3) {fprintf(stderr, "Usage: %s <input.xml> [sharedStrings.xml]\n", argv[0]);exit(1);}// 在main函数开头添加： int sst_count = 0; struct str_pos* shared_strings = NULL; if (argc > 2) { shared_strings = read_shared_strings(argv[2], &sst_count); } const char* fname = argv[1];int fd = open(fname, O_RDONLY);if (fd < 0) { perror("open"); exit(1); }struct stat sb;if (fstat(fd, &sb) < 0) { perror("fstat"); exit(1); }size_t flen = sb.st_size;char* mapped = (char*)mmap(NULL, flen, PROT_READ, MAP_PRIVATE, fd, 0);if (mapped == MAP_FAILED) { perror("mmap"); exit(1); }// 查找第一个<rowconst char* first_row = strstr(mapped, "<row");if (!first_row) {munmap(mapped, flen);close(fd);return 0;}// 查找最后一个</row>char* last_row_end = (char*)strrstr(mapped, mapped + flen, "</row>");if (!last_row_end) {munmap(mapped, flen);close(fd);return 0;}last_row_end = strstr(last_row_end, ">");if (last_row_end) last_row_end += 1;else last_row_end = mapped + flen;const char* file_tail_start = last_row_end;// 当前行最大列int max_col = 0;// 当前解析位置const char* p = first_row;// 64K块const size_t BUF_SIZE = 65536;//char* chunk = (char*)malloc(BUF_SIZE + 100);// 行号缓存int last_row = 0;// 解析一行void output_row(const char* row_start, const char* row_end, int* max_col, int last_row) {// 提取r属性const char* r_attr_val = get_attr(row_start, row_end, "r");int row_num = r_attr_val ? 
atoi(r_attr_val) : -1;if (r_attr_val) free((void*)r_attr_val);if (row_num < 1) return;// 统计最大列int this_max_col = -1;const char* c = row_start;while (1) {const char* c_tag = get_tagname(c, row_end, "c");if (!c_tag) break;const char* c_end = strstr(c_tag, "</c>");if (!c_end || c_end > row_end) break;c_end = strstr(c_end, ">");if (c_end) c_end += 1;else c_end = row_end;const char* c_r_attr = get_attr(c_tag, c_end, "r");if (c_r_attr) {int col_len = 0;while (c_r_attr[col_len] && c_r_attr[col_len] >= 'A' && (c_r_attr[col_len] <= 'Z' || c_r_attr[col_len] <= 'z')) col_len++;int col_num = col_letter_to_num(c_r_attr, col_len);if (col_num > this_max_col) this_max_col = col_num;free((void*)c_r_attr);}c = c_end;}if (this_max_col < 0) return;if (this_max_col > *max_col) *max_col = this_max_col;// 补空行while (last_row < row_num - 1) {last_row++;int has_data = 0;for (int c = 0; c <= *max_col; c++) {// 全空}if (!has_data) continue;printf("%d", last_row);for (int c = 0; c <= *max_col; c++) printf(",");printf("\n");}// 解析c节点char** cells = (char**)calloc(this_max_col + 1, sizeof(char*));c = row_start;while (1) {const char* c_tag = get_tagname(c, row_end, "c");if (!c_tag) break;const char* c_end = strstr(c_tag, "</c>");if (!c_end || c_end > row_end) break;c_end = strstr(c_end, ">");if (c_end) c_end += 1;else c_end = row_end;const char* c_r_attr = get_attr(c_tag, c_end, "r");if (c_r_attr) {int col_len = 0;while (c_r_attr[col_len] && c_r_attr[col_len] >= 'A' && (c_r_attr[col_len] <= 'Z' || c_r_attr[col_len] <= 'z')) col_len++;int col_num = col_letter_to_num(c_r_attr, col_len);free((void*)c_r_attr);// 判断t属性const char* t_attr = get_attr(c_tag, c_end, "t");int is_str = (t_attr && strcmp(t_attr, "inlineStr") == 0);int is_sst = (t_attr && strcmp(t_attr, "s") == 0);if (t_attr) free((void*)t_attr);const char* val = NULL;if (is_str) {const char* t_tag = get_tagcontent(c_tag, c_end, "t");val = t_tag ? 
t_tag : strdup("");} //新增共享字符串处理else if (is_sst) {const char* v_tag = get_tagcontent(c_tag, c_end, "v");int idx = v_tag ? atoi(v_tag) : 0;if (shared_strings && idx < sst_count && idx >= 0) {char* val2 = (char*)malloc(shared_strings[idx].len + 1);memcpy(val2, shared_strings[idx].start, shared_strings[idx].len);val2[shared_strings[idx].len] = 0;val=val2;} else {val = strdup("");}if (v_tag) free((void*)v_tag);}else {const char* v_tag = get_tagcontent(c_tag, c_end, "v");val = v_tag ? v_tag : strdup("");}cells[col_num] = (char*)val;}c = c_end;}// 输出行int has_data = 0;for (int c = 0; c <= this_max_col; c++) {if (cells[c] && strlen(cells[c]) > 0) {has_data = 1; break;}}if (has_data) {printf("%d", row_num);for (int c = 0; c <= this_max_col; c++) {if (cells[c] && strlen(cells[c]) > 0)printf(",%s", cells[c]);elseprintf(",");}printf("\n");}// 释放for (int c = 0; c <= this_max_col; c++) {if (cells[c]) free(cells[c]);}free(cells);}// 顺序解析while (p < file_tail_start) {// 找下一个<rowconst char* row_start = p;//get_tagname(p, file_tail_start, "row");if (!row_start) break;const char* row_end = strstr(row_start, "</row>");if (!row_end || row_end > file_tail_start) break;row_end = strstr(row_end, ">");if (row_end) row_end += 1;else row_end = file_tail_start;// 解析并输出一行output_row(row_start, row_end, &max_col, last_row);last_row = last_row > 0 ? last_row : atoi(get_attr(row_start, row_end, "r"));// 继续p = row_end;}// 释放//free(chunk);munmap(mapped, flen);close(fd);// 在main函数结尾添加： if (shared_strings) free_shared_strings(shared_strings); return 0;
}

编译和执行

gcc catxmls3.c -o catxmls3 -O3
./catxmls3 wp//xl/worksheets/sheet1.xml
2,42254749,,,,,,,,11235,,,,,,,,,,,,,,,,3088060140,,1002973,152924,,,,,,,,,,,40.5863974,-73.9325913,
./catxmls3 wp/xl/worksheets/sheet1.xml wp/xl/sharedStrings.xml
1,Unique Key,Created Date,Closed Date,Agency,Agency Name,Complaint Type,Descriptor,Location Type,Incident Zip,Incident Address,Street Name,Cross Street 1,Cross Street 2,Intersection Street 1,Intersection Street 2,Address Type,City,Landmark,Facility Type,Status,Due Date,Resolution Description,Resolution Action Updated Date,Community Board,BBL,Borough,X Coordinate (State Plane),Y Coordinate (State Plane),Open Data Channel Type,Park Facility Name,Park Borough,Vehicle Type,Taxi Company Borough,Taxi Pick Up Location,Bridge Highway Name,Bridge Highway Direction,Road Ramp,Bridge Highway Segment,Latitude,Longitude,Location
2,42254749,04/18/2019 09:55:45 PM,04/19/2019 03:45:24 AM,NYPD,New York City Police Department,Noise - Residential,Banging/Pounding,Residential Building/House,11235,3855 SHORE PARKWAY,SHORE PARKWAY,BRAGG STREET,BELT PARKWAY WB KNAPP STREET EN,,,ADDRESS,BROOKLYN,,Precinct,Closed,04/19/2019 05:55:45 AM,The Police Department responded to the complaint and with the information available observed no evidence of the violation at that time.,04/19/2019 03:45:24 AM,15 BROOKLYN,3088060140,BROOKLYN,1002973,152924,PHONE,Unspecified,BROOKLYN,,,,,,,,40.5863974,-73.9325913,(40.5863974, -73.9325913)