当前位置: 首页 > news >正文

利用美团龙猫添加xlsx的sheet.xml读取sharedStrings.xml中共享字符串输出到csv功能

提示词

请添加对sharedStrings.xml的支持。
结构如下

<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<sst xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main" count="1421" uniqueCount="686"><si><t>Unique Key</t></si><si><t>Created Date</t></si><si><t>Closed Date</t></si><si><t>Agency</t></si></sst>

uniqueCount="686"代表它保存的条目数,每个字符串索引从0开始递增
然后sheet.xml

<sheetData><row r="1" spans="1:41"><c r="A1" t="s"><v>0</v></c><c r="B1" t="s"><v>1</v></c><c r="C1" t="s"><v>2</v></c><c r="D1" t="s"><v>3</v></c><c r="E1" t="s"><v>4</v></c><c r="F1" t="s"><v>5</v></c>

会引用sharedStrings.xml的字符串:当t="s"时,<v></v>中间的值就是<si><t></t></si>中间字符串的索引,比如0代表Unique Key,在csv中就输出Unique Key。同一个索引可能多次出现。
思路是把sharedStrings.xml中每个条目的起始字节位置和长度分别存入数组a和b,输出时按索引用memcpy从a[index]处复制b[index]个字节。
请用2个函数实现读取共享字符串条目和从索引恢复条目,并给出原始代码中要增加的部分,不做别的。

它按要求给出了代码,但是我把改动合并到原代码后出现了一个编译错误:

error: assignment of read-only location '*(val + (sizetype)(shared_strings + (sizetype)((long unsigned int)idx * 16))->len)'
  376 |                     val[shared_strings[idx].len] = 0;
      |                                                  ^

原因是val的定义是const char* val = NULL;,但是后面对它引用的地址中的内容做了修改,通过引入另一个变量char* val2解决了。具体见如下代码

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>// 读取sharedStrings.xml,返回每个字符串的起始位置和长度
// 返回数组,每个元素为{start, len}
// One shared-string entry: a view into the memory-mapped sharedStrings.xml.
struct str_pos {
    const char *start; // first byte of the text; NOT NUL-terminated
    int len;           // number of bytes of text
};

// Largest accepted sharedStrings.xml size, chosen so that every offset and
// length inside the file fits in the `int len` field (assumes int >= 32 bits).
enum { SST_MAX_FILE_SIZE = 0x7fffffff };

// Find the first occurrence of `needle` that lies entirely inside [p, end).
// Unlike strstr() this never reads past `end`, so it is safe on a mapping
// that is not NUL-terminated.
static const char *sst_find(const char *p, const char *end, const char *needle)
{
    size_t n = strlen(needle);

    if (n == 0) {
        return p;
    }
    while (p < end && (size_t)(end - p) >= n) {
        const char *hit = memchr(p, needle[0], (size_t)(end - p) - n + 1);
        if (!hit) {
            return NULL;
        }
        if (memcmp(hit, needle, n) == 0) {
            return hit;
        }
        p = hit + 1;
    }
    return NULL;
}

// Find an opening tag such as "<si" or "<t" inside [p, end). The name must be
// followed by '>', '/', or whitespace so that "<t" matches <t>, <t/> and
// <t xml:space="preserve"> but not some longer element name.
static const char *sst_find_open(const char *p, const char *end, const char *open)
{
    size_t n = strlen(open);

    for (;;) {
        const char *hit = sst_find(p, end, open);
        if (!hit) {
            return NULL;
        }
        const char *after = hit + n;
        if (after < end && (*after == '>' || *after == '/' || *after == ' ' ||
                            *after == '\t' || *after == '\r' || *after == '\n')) {
            return hit;
        }
        p = hit + 1;
    }
}

// Read sharedStrings.xml and index its entries.
//
// fname: path of sharedStrings.xml.
// count: receives the number of entries (0 on failure); may be NULL.
//
// Returns an array with one element per <si> entry, in file order, so that
// element i is the string with shared-string index i. Each element points at
// the text of the entry's first <t> element (len 0 when the entry has no text).
// The text is neither copied nor NUL-terminated, and XML entities are not
// decoded. Returns NULL when the file cannot be opened or mapped, is empty, is
// larger than SST_MAX_FILE_SIZE, or memory runs out.
//
// The result must be released with free_shared_strings(). The file mapping is
// remembered in a hidden slot just before the returned pointer, so the array
// must not be passed to free() directly.
struct str_pos *read_shared_strings(const char *fname, int *count)
{
    if (count) {
        *count = 0;
    }
    if (!fname) {
        return NULL;
    }

    int fd = open(fname, O_RDONLY);
    if (fd < 0) {
        return NULL;
    }
    struct stat sb;
    if (fstat(fd, &sb) < 0 || sb.st_size <= 0 || sb.st_size > SST_MAX_FILE_SIZE) {
        close(fd);
        return NULL;
    }
    size_t map_len = (size_t)sb.st_size;
    char *mapped = mmap(NULL, map_len, PROT_READ, MAP_PRIVATE, fd, 0);
    close(fd); // the mapping stays valid after the descriptor is closed
    if (mapped == MAP_FAILED) {
        return NULL;
    }

    size_t cap = 1024;
    size_t used = 0;
    struct str_pos *raw = malloc((cap + 1) * sizeof *raw);
    if (!raw) {
        munmap(mapped, map_len);
        return NULL;
    }
    // Hidden slot 0 records the mapping for free_shared_strings().
    raw[0].start = mapped;
    raw[0].len = (int)map_len;

    const char *p = mapped;
    const char *end = mapped + map_len;
    const char *si;
    while ((si = sst_find_open(p, end, "<si")) != NULL) {
        const char *si_gt = memchr(si, '>', (size_t)(end - si));
        if (!si_gt) {
            break;
        }

        // Default: an empty entry that still points at valid memory.
        const char *text = si_gt;
        int text_len = 0;
        const char *next;

        if (si_gt[-1] == '/') {
            next = si_gt + 1; // <si/>
        } else {
            const char *si_close = sst_find(si_gt + 1, end, "</si>");
            if (!si_close) {
                break;
            }
            next = si_close + 5;

            // Look for <t> only inside this entry, so an entry without text
            // cannot steal the text of a later one and shift every index.
            const char *t = sst_find_open(si_gt + 1, si_close, "<t");
            if (t) {
                const char *t_gt = memchr(t, '>', (size_t)(si_close - t));
                if (t_gt && t_gt[-1] != '/') {
                    const char *t_close = sst_find(t_gt + 1, si_close, "</t>");
                    if (t_close) {
                        text = t_gt + 1;
                        text_len = (int)(t_close - text);
                    }
                }
            }
        }

        if (used == cap) {
            size_t new_cap = cap * 2;
            struct str_pos *tmp = NULL;
            if (new_cap + 1 <= (size_t)-1 / sizeof *raw) {
                tmp = realloc(raw, (new_cap + 1) * sizeof *raw);
            }
            if (!tmp) {
                free(raw);
                munmap(mapped, map_len);
                return NULL;
            }
            raw = tmp;
            cap = new_cap;
        }
        raw[1 + used].start = text;
        raw[1 + used].len = text_len;
        used++;
        p = next;
    }

    if (count) {
        *count = (int)used;
    }
    return raw + 1;
}

// Return the text of shared string `index`.
//
// The returned pointer refers to the mapped file and is NOT NUL-terminated;
// use arr[index].len for its length. An empty entry, a NULL array or a negative
// index yields "". The caller must make sure `index` is below the count
// reported by read_shared_strings(); this function cannot check the upper bound.
const char *get_shared_string(struct str_pos *arr, int index)
{
    if (!arr || index < 0 || arr[index].len == 0) {
        return "";
    }
    return arr[index].start;
}

// Release an array returned by read_shared_strings(): unmap the file and free
// the array. NULL is ignored. Passing any other pointer is invalid, because the
// mapping is looked up in the hidden slot in front of the array.
void free_shared_strings(struct str_pos *arr)
{
    if (!arr) {
        return;
    }
    struct str_pos *raw = arr - 1;
    munmap((void *)raw->start, (size_t)raw->len);
    free(raw);
}
// Helper: search for a substring within a given range
// Find the LAST occurrence of `needle` that lies entirely inside
// [haystack, end).
//
// haystack: first byte of the range to search.
// end:      one past the last byte of the range.
// needle:   NUL-terminated string to look for.
//
// Returns a pointer to the start of the match, or NULL when there is none or
// when the range is shorter than the needle. An empty needle matches at `end`.
// No pointer outside [haystack, end] is ever formed.
char *strrstr(const char *haystack, const char *end, const char *needle)
{
    size_t n = strlen(needle); // computed once, not on every iteration

    if (end < haystack || (size_t)(end - haystack) < n) {
        return NULL;
    }
    for (const char *p = end - n;; p--) {
        if (memcmp(p, needle, n) == 0) {
            return (char *)p;
        }
        if (p == haystack) {
            break; // stop before stepping below the start of the range
        }
    }
    return NULL;
}
// Convert spreadsheet column letters to a zero-based column number:
// "A" -> 0, "B" -> 1, "Z" -> 25, "AA" -> 26.
//
// col: start of the cell reference (for example "B12"); need not be
//      NUL-terminated.
// len: maximum number of characters to examine.
//
// Conversion stops at the first character that is not an ASCII letter, so the
// row digits of a cell reference are ignored. Upper and lower case are treated
// alike. Returns -1 when no letter was read, when `col` is NULL, or when the
// value would not fit in an int.
int col_letter_to_num(const char *col, int len)
{
    enum { COL_VALUE_MAX = 0x7fffffff }; // assumes int is at least 32 bits
    int num = 0;

    if (!col) {
        return -1;
    }
    for (int i = 0; i < len; i++) {
        char ch = col[i];
        int digit;

        if (ch >= 'A' && ch <= 'Z') {
            digit = ch - 'A' + 1;
        } else if (ch >= 'a' && ch <= 'z') {
            digit = ch - 'a' + 1;
        } else {
            break;
        }
        if (num > (COL_VALUE_MAX - digit) / 26) {
            return -1; // would overflow: not a usable column reference
        }
        num = num * 26 + digit;
    }
    return num - 1;
}
// Look up the value of a tag attribute
// Return a copy of the value of attribute `attr` found in [start, end).
//
// start, end: range to scan, normally an element's opening tag.
// attr:       attribute name, for example "r" or "t".
//
// The name must sit at the very start of the range or directly after
// whitespace, so looking up "t" does not match the tail of another attribute
// such as ht="15". The value may be quoted with " or '; a value whose closing
// quote is missing runs to `end`.
//
// Returns a newly allocated NUL-terminated string that the caller must free()
// (cast away the const to do so), or NULL when the attribute is absent or
// memory runs out.
const char *get_attr(const char *start, const char *end, const char *attr)
{
    if (!start || !end || !attr) {
        return NULL;
    }
    size_t attr_len = strlen(attr);
    if (attr_len == 0 || end <= start || (size_t)(end - start) <= attr_len) {
        return NULL;
    }

    // p stays below `last`, so p[attr_len] is always inside the range.
    const char *last = end - attr_len;
    for (const char *p = start; p < last; p++) {
        if (*p != attr[0] || memcmp(p, attr, attr_len) != 0 || p[attr_len] != '=') {
            continue;
        }
        if (p > start && p[-1] != ' ' && p[-1] != '\t' &&
            p[-1] != '\r' && p[-1] != '\n') {
            continue; // only the suffix of a longer attribute name
        }
        const char *q = p + attr_len + 1;
        if (q >= end || (*q != '"' && *q != '\'')) {
            continue;
        }
        char quote = *q++;
        const char *val = q;
        while (q < end && *q != quote) {
            q++;
        }
        size_t n = (size_t)(q - val);
        char *ret = malloc(n + 1);
        if (!ret) {
            return NULL;
        }
        memcpy(ret, val, n);
        ret[n] = '\0';
        return ret;
    }
    return NULL;
}
// Locate a tag by name
// Locate the opening tag <tag ...> or <tag> inside [start, end).
//
// Returns a pointer to the '<' of the first opening tag whose name is exactly
// `tag` and is followed by a space or by '>', or NULL when none is found.
const char *get_tagname(const char *start, const char *end, const char *tag)
{
    const int name_len = strlen(tag);
    // Positions at or beyond this limit cannot hold '<', the name and one
    // more character inside the range.
    const char *const limit = end - name_len - 1;

    for (const char *cur = start; cur < limit; ++cur) {
        if (*cur != '<') {
            continue;
        }
        if (strncmp(cur + 1, tag, name_len) != 0) {
            continue;
        }
        const char after = cur[1 + name_len];
        if (after == ' ' || after == '>') {
            return cur;
        }
    }
    return NULL;
}
// Extract the content of a tag
// Return a copy of the text between <tag ...> and the next closing tag, when
// that closing tag is </tag>.
//
// start, end: range to scan; nothing at or beyond `end` is read.
// tag:        element name, for example "v" or "t".
//
// Each opening tag named `tag` in the range is tried in turn. For a candidate,
// the first "</" after its '>' must begin </tag>; otherwise the scan moves on
// to the next candidate. Self-closing tags (<tag/>) carry no content and are
// skipped.
//
// Returns a newly allocated NUL-terminated string that the caller must free()
// (cast away the const to do so). Returns NULL when no content is found, when
// an opening tag is not terminated inside the range, when no "</" follows it
// inside the range, or when memory runs out.
const char *get_tagcontent(const char *start, const char *end, const char *tag)
{
    if (!start || !end || !tag || end <= start) {
        return NULL;
    }
    size_t tag_len = strlen(tag);
    if (tag_len == 0) {
        return NULL;
    }

    for (const char *p = start; (size_t)(end - p) > tag_len + 1; p++) {
        if (*p != '<' || memcmp(p + 1, tag, tag_len) != 0) {
            continue;
        }
        char after = p[1 + tag_len];
        if (after != '>' && after != '/' && after != ' ' &&
            after != '\t' && after != '\r' && after != '\n') {
            continue; // a longer element name that merely starts with `tag`
        }

        const char *open_end = memchr(p, '>', (size_t)(end - p));
        if (!open_end) {
            return NULL;
        }
        if (open_end[-1] == '/') {
            continue; // <tag/>: nothing between the tags
        }
        const char *content = open_end + 1;

        // Bounded search for the next "</".
        const char *close = NULL;
        for (const char *q = content; q < end; q++) {
            q = memchr(q, '<', (size_t)(end - q));
            if (!q) {
                break;
            }
            if (q + 1 < end && q[1] == '/') {
                close = q;
                break;
            }
        }
        if (!close) {
            return NULL;
        }

        // "</" + name + ">" must fit entirely inside the range.
        if ((size_t)(end - close) >= tag_len + 3 &&
            memcmp(close + 2, tag, tag_len) == 0 && close[2 + tag_len] == '>') {
            size_t n = (size_t)(close - content);
            char *ret = malloc(n + 1);
            if (!ret) {
                return NULL;
            }
            memcpy(ret, content, n);
            ret[n] = '\0';
            return ret;
        }
    }
    return NULL;
}
// Find the next tag
// Return a pointer to the first '<' in [start, end), or NULL when the range is
// empty or contains no '<'.
const char *next_tag(const char *start, const char *end)
{
    if (start >= end) {
        return NULL;
    }
    return memchr(start, '<', (size_t)(end - start));
}
// Find where a tag name ends
// Return a pointer to the first '>' or space in the NUL-terminated string
// `tag`, or to its terminating NUL when neither character occurs.
const char *tag_end(const char *tag)
{
    return tag + strcspn(tag, "> ");
}
// Look up an attribute value and return a pointer to it
// Locate the value of attribute `attr` inside an opening tag without copying.
//
// tag:     NUL-terminated text starting at the tag; scanning stops at the
//          first '>' or at the terminating NUL.
// attr:    attribute name.
// val_end: receives a pointer one past the last character of the value (the
//          closing quote, or the NUL when the quote is missing), or NULL when
//          the attribute is not found. May itself be NULL.
//
// The name must sit at the start of `tag` or directly after whitespace, so
// looking up "t" does not match the tail of ht="15".
//
// Returns a pointer to the first character of the value inside `tag` (not
// NUL-terminated; its length is *val_end minus the returned pointer), or NULL
// when the attribute is absent.
const char *tag_attrval(const char *tag, const char *attr, const char **val_end)
{
    if (val_end) {
        *val_end = NULL;
    }
    if (!tag || !attr) {
        return NULL;
    }
    size_t attr_len = strlen(attr);
    if (attr_len == 0) {
        return NULL;
    }

    for (const char *p = tag; *p && *p != '>'; p++) {
        // A successful strncmp proves p[0..attr_len-1] are not NUL, so
        // p[attr_len] may be read.
        if (strncmp(p, attr, attr_len) != 0 || p[attr_len] != '=') {
            continue;
        }
        if (p != tag && p[-1] != ' ' && p[-1] != '\t' &&
            p[-1] != '\r' && p[-1] != '\n') {
            continue; // only the suffix of a longer attribute name
        }
        const char *q = p + attr_len + 1;
        if (*q != '"' && *q != '\'') {
            continue; // unquoted: keep scanning from p + 1, never past the NUL
        }
        char quote = *q++;
        const char *val = q;
        while (*q && *q != quote) {
            q++;
        }
        if (val_end) {
            *val_end = q;
        }
        return val;
    }
    return NULL;
}
// Main function
int main(int argc, char* argv[]) {// 命令行参数改为:argv[1]为sheet.xml, argv[2]为sharedStrings.xml(可选)if (argc != 2 && argc != 3) {fprintf(stderr, "Usage: %s <input.xml> [sharedStrings.xml]\n", argv[0]);exit(1);}// 在main函数开头添加: int sst_count = 0; struct str_pos* shared_strings = NULL; if (argc > 2) { shared_strings = read_shared_strings(argv[2], &sst_count); } const char* fname = argv[1];int fd = open(fname, O_RDONLY);if (fd < 0) { perror("open"); exit(1); }struct stat sb;if (fstat(fd, &sb) < 0) { perror("fstat"); exit(1); }size_t flen = sb.st_size;char* mapped = (char*)mmap(NULL, flen, PROT_READ, MAP_PRIVATE, fd, 0);if (mapped == MAP_FAILED) { perror("mmap"); exit(1); }// 查找第一个<rowconst char* first_row = strstr(mapped, "<row");if (!first_row) {munmap(mapped, flen);close(fd);return 0;}// 查找最后一个</row>char* last_row_end = (char*)strrstr(mapped, mapped + flen, "</row>");if (!last_row_end) {munmap(mapped, flen);close(fd);return 0;}last_row_end = strstr(last_row_end, ">");if (last_row_end) last_row_end += 1;else last_row_end = mapped + flen;const char* file_tail_start = last_row_end;// 当前行最大列int max_col = 0;// 当前解析位置const char* p = first_row;// 64K块const size_t BUF_SIZE = 65536;//char* chunk = (char*)malloc(BUF_SIZE + 100);// 行号缓存int last_row = 0;// 解析一行void output_row(const char* row_start, const char* row_end, int* max_col, int last_row) {// 提取r属性const char* r_attr_val = get_attr(row_start, row_end, "r");int row_num = r_attr_val ? 
atoi(r_attr_val) : -1;if (r_attr_val) free((void*)r_attr_val);if (row_num < 1) return;// 统计最大列int this_max_col = -1;const char* c = row_start;while (1) {const char* c_tag = get_tagname(c, row_end, "c");if (!c_tag) break;const char* c_end = strstr(c_tag, "</c>");if (!c_end || c_end > row_end) break;c_end = strstr(c_end, ">");if (c_end) c_end += 1;else c_end = row_end;const char* c_r_attr = get_attr(c_tag, c_end, "r");if (c_r_attr) {int col_len = 0;while (c_r_attr[col_len] && c_r_attr[col_len] >= 'A' && (c_r_attr[col_len] <= 'Z' || c_r_attr[col_len] <= 'z')) col_len++;int col_num = col_letter_to_num(c_r_attr, col_len);if (col_num > this_max_col) this_max_col = col_num;free((void*)c_r_attr);}c = c_end;}if (this_max_col < 0) return;if (this_max_col > *max_col) *max_col = this_max_col;// 补空行while (last_row < row_num - 1) {last_row++;int has_data = 0;for (int c = 0; c <= *max_col; c++) {// 全空}if (!has_data) continue;printf("%d", last_row);for (int c = 0; c <= *max_col; c++) printf(",");printf("\n");}// 解析c节点char** cells = (char**)calloc(this_max_col + 1, sizeof(char*));c = row_start;while (1) {const char* c_tag = get_tagname(c, row_end, "c");if (!c_tag) break;const char* c_end = strstr(c_tag, "</c>");if (!c_end || c_end > row_end) break;c_end = strstr(c_end, ">");if (c_end) c_end += 1;else c_end = row_end;const char* c_r_attr = get_attr(c_tag, c_end, "r");if (c_r_attr) {int col_len = 0;while (c_r_attr[col_len] && c_r_attr[col_len] >= 'A' && (c_r_attr[col_len] <= 'Z' || c_r_attr[col_len] <= 'z')) col_len++;int col_num = col_letter_to_num(c_r_attr, col_len);free((void*)c_r_attr);// 判断t属性const char* t_attr = get_attr(c_tag, c_end, "t");int is_str = (t_attr && strcmp(t_attr, "inlineStr") == 0);int is_sst = (t_attr && strcmp(t_attr, "s") == 0);if (t_attr) free((void*)t_attr);const char* val = NULL;if (is_str) {const char* t_tag = get_tagcontent(c_tag, c_end, "t");val = t_tag ? 
t_tag : strdup("");} //新增共享字符串处理else if (is_sst) {const char* v_tag = get_tagcontent(c_tag, c_end, "v");int idx = v_tag ? atoi(v_tag) : 0;if (shared_strings && idx < sst_count && idx >= 0) {char* val2 = (char*)malloc(shared_strings[idx].len + 1);memcpy(val2, shared_strings[idx].start, shared_strings[idx].len);val2[shared_strings[idx].len] = 0;val=val2;} else {val = strdup("");}if (v_tag) free((void*)v_tag);}else {const char* v_tag = get_tagcontent(c_tag, c_end, "v");val = v_tag ? v_tag : strdup("");}cells[col_num] = (char*)val;}c = c_end;}// 输出行int has_data = 0;for (int c = 0; c <= this_max_col; c++) {if (cells[c] && strlen(cells[c]) > 0) {has_data = 1; break;}}if (has_data) {printf("%d", row_num);for (int c = 0; c <= this_max_col; c++) {if (cells[c] && strlen(cells[c]) > 0)printf(",%s", cells[c]);elseprintf(",");}printf("\n");}// 释放for (int c = 0; c <= this_max_col; c++) {if (cells[c]) free(cells[c]);}free(cells);}// 顺序解析while (p < file_tail_start) {// 找下一个<rowconst char* row_start = p;//get_tagname(p, file_tail_start, "row");if (!row_start) break;const char* row_end = strstr(row_start, "</row>");if (!row_end || row_end > file_tail_start) break;row_end = strstr(row_end, ">");if (row_end) row_end += 1;else row_end = file_tail_start;// 解析并输出一行output_row(row_start, row_end, &max_col, last_row);last_row = last_row > 0 ? last_row : atoi(get_attr(row_start, row_end, "r"));// 继续p = row_end;}// 释放//free(chunk);munmap(mapped, flen);close(fd);// 在main函数结尾添加: if (shared_strings) free_shared_strings(shared_strings); return 0;
}

编译和执行

gcc catxmls3.c -o catxmls3 -O3
./catxmls3 wp//xl/worksheets/sheet1.xml
2,42254749,,,,,,,,11235,,,,,,,,,,,,,,,,3088060140,,1002973,152924,,,,,,,,,,,40.5863974,-73.9325913,
./catxmls3 wp/xl/worksheets/sheet1.xml wp/xl/sharedStrings.xml
1,Unique Key,Created Date,Closed Date,Agency,Agency Name,Complaint Type,Descriptor,Location Type,Incident Zip,Incident Address,Street Name,Cross Street 1,Cross Street 2,Intersection Street 1,Intersection Street 2,Address Type,City,Landmark,Facility Type,Status,Due Date,Resolution Description,Resolution Action Updated Date,Community Board,BBL,Borough,X Coordinate (State Plane),Y Coordinate (State Plane),Open Data Channel Type,Park Facility Name,Park Borough,Vehicle Type,Taxi Company Borough,Taxi Pick Up Location,Bridge Highway Name,Bridge Highway Direction,Road Ramp,Bridge Highway Segment,Latitude,Longitude,Location
2,42254749,04/18/2019 09:55:45 PM,04/19/2019 03:45:24 AM,NYPD,New York City Police Department,Noise - Residential,Banging/Pounding,Residential Building/House,11235,3855 SHORE PARKWAY,SHORE PARKWAY,BRAGG STREET,BELT PARKWAY WB KNAPP STREET EN,,,ADDRESS,BROOKLYN,,Precinct,Closed,04/19/2019 05:55:45 AM,The Police Department responded to the complaint and with the information available observed no evidence of the violation at that time.,04/19/2019 03:45:24 AM,15 BROOKLYN,3088060140,BROOKLYN,1002973,152924,PHONE,Unspecified,BROOKLYN,,,,,,,,40.5863974,-73.9325913,(40.5863974, -73.9325913)

因为sheet1.xml的第一行全是共享字符串,没有提供sharedStrings.xml时这些单元格不会被解析,所以第一行整行没有输出,第二行也只输出了数值单元格。
提供以后,确实正确输出了共享字符串。


文章转载自:

http://UuFkXs62.cLjmx.cn
http://NlPn78Yz.cLjmx.cn
http://d3VVyFbS.cLjmx.cn
http://Sk9Vb1PV.cLjmx.cn
http://Ro6rOSjI.cLjmx.cn
http://hbDjiY3m.cLjmx.cn
http://HHkPkSd9.cLjmx.cn
http://VDSGhHy2.cLjmx.cn
http://MXBV7pZP.cLjmx.cn
http://HKL8TnR9.cLjmx.cn
http://tmSPFg2q.cLjmx.cn
http://UgHJ0JPi.cLjmx.cn
http://Xk8kFtck.cLjmx.cn
http://CI5sGgIk.cLjmx.cn
http://CaG8IEQl.cLjmx.cn
http://k0NCORl0.cLjmx.cn
http://A0NErcD7.cLjmx.cn
http://q2dnYdPu.cLjmx.cn
http://bLU8vAkI.cLjmx.cn
http://kypr5dQg.cLjmx.cn
http://S9Idcxmq.cLjmx.cn
http://D2KCPUu7.cLjmx.cn
http://JMmn3rLx.cLjmx.cn
http://VdYDGEr7.cLjmx.cn
http://IS9Gtd2W.cLjmx.cn
http://kJ7RtIaF.cLjmx.cn
http://DPbVIGf2.cLjmx.cn
http://1uERWoaA.cLjmx.cn
http://A7Npt7pd.cLjmx.cn
http://mU1YcFfZ.cLjmx.cn
http://www.dtcms.com/a/379418.html

相关文章:

  • 时序数据库:定义与基本特点
  • 【WorkManager】Android 后台任务调度的核心组件指南
  • python项目批量安装包和生成requirements.txt文件
  • 零部件力学测试系统参数
  • 3D Web轻量引擎HOOPS赋能BIM/工程施工:实现超大模型的轻量化加载与高效浏览!
  • Java Web应用的安全性与防护措施!
  • 填写简历信息
  • 优先算法——专题十一:字符串
  • [Spring Cloud][3]从零开始简单工程搭建实践详解,远程调用
  • 为什么要显示调用析构函数
  • MySQL 数据完整性与约束:从基础到实战,守护数据准确性
  • Python中的“占位符”艺术:深入理解pass关键字的妙用
  • 构建企业级Python离线包仓库:从下载到服务部署全流程指南
  • C++面向对象之多态
  • 个人自留笔记——git操作
  • 命令模式,餐厅订单管理系统C++
  • Android EDLA测试命令总结
  • opencv基础实践;银行卡号识别
  • 【录屏软件】 实用工具推荐——电脑录屏软件班迪(Bandicam)录屏图文安装指南
  • 微服务事务管理实践与 Seata 框架解析
  • 今日行情明日机会——20250911
  • P4105 [HEOI2014] 南园满地堆轻絮
  • Docker 命令核心语法、常用命令
  • Windows安装Chroma DB
  • 60_基于深度学习的羊群计数统计系统(yolo11、yolov8、yolov5+UI界面+Python项目源码+模型+标注好的数据集)
  • Linux 命令 top、vmstat、iostat、free、iftop 正常用法和退出.
  • 深入解析HashMap:从原理到实践的全方位指南
  • LNMP 与 LNMT 架构实战指南:从部署到运维全流程
  • 教资科三【信息技术】— 学科知识[算法](简答题)识记版
  • 游戏中的展销系统使用的数据结构