另外几种语言挑战100万行字符串文本排序
1.javascript
张泽鹏先生创作
// Sort the lines of a text file and write the result to stdout.
//
// Usage: node main.js <filename>
//
// The whole file is read into memory as UTF-8, split on '\n', sorted with the
// default Array.prototype.sort ordering (ascending by UTF-16 code unit), and
// written back with one '\n' after every line.
const fs = require('fs');
const process = require('process');

const args = process.argv.slice(2);
if (args.length !== 1) {
  console.error(`Usage: ${process.argv[1]} <filename>`);
  process.exit(1);
}

const content = fs.readFileSync(args[0], 'utf8');
const lines = content.split('\n');

// A file that ends with '\n' (and an empty file) produces one trailing empty
// element from split(); it is not a real line, so drop it.
if (lines[lines.length - 1] === '') {
  lines.pop();
}

lines.sort();

// Only write when there is at least one line: an empty input must produce
// empty output, not a single stray newline.
if (lines.length > 0) {
  process.stdout.write(`${lines.join('\n')}\n`);
}
执行
time node-v24.4.1-linux-x64/bin/node main.js varchar.txt >qsort.txt
real 0m4.455s
user 0m1.416s
sys 0m0.495s
2.rust
张泽鹏先生创作
main.rs
use std::env;
use std::fs::File;
use std::io::{self, BufRead, BufReader, BufWriter, Write};
use std::process;

/// Program entry point.
///
/// Expects exactly one command-line argument (the file to sort). Prints a
/// usage message or the I/O error to stderr and exits with status 1 on
/// failure.
fn main() {
    let argv: Vec<String> = env::args().collect();
    if argv.len() != 2 {
        eprintln!("Usage: {} <filename>", argv[0]);
        process::exit(1);
    }

    match sort_file_lines(&argv[1]) {
        Ok(()) => {}
        Err(e) => {
            eprintln!("Error: {}", e);
            process::exit(1);
        }
    }
}

/// Reads every line of `filename`, sorts the lines and writes them to stdout,
/// each followed by a newline.
///
/// Returns the first I/O error encountered while opening, reading or writing.
fn sort_file_lines(filename: &str) -> io::Result<()> {
    // Read through a 10 KiB buffer and gather all lines; the first read
    // error aborts the collection and is propagated to the caller.
    let input = BufReader::with_capacity(10 * 1024, File::open(filename)?);
    let mut sorted: Vec<String> = input.lines().collect::<io::Result<Vec<String>>>()?;

    // Stability is irrelevant here: equal lines are indistinguishable.
    sorted.sort_unstable();

    // Lock stdout once and batch the output through a 64 KiB buffer.
    let stdout = io::stdout();
    let mut out = BufWriter::with_capacity(64 * 1024, stdout.lock());
    sorted
        .iter()
        .try_for_each(|line| writeln!(out, "{}", line))?;
    out.flush()?;
    Ok(())
}
Cargo.toml
[package]
name = "rust-sort"
version = "0.1.0"
edition = "2024"

[profile.release]
lto = true
strip = true

[dependencies]
编译执行
cargo build --release
time rust-sort/target/release/rust-sort varchar.txt >qsort.txt
real 0m2.333s
user 0m0.596s
sys 0m0.193s
3.张泽鹏先生重写的zig语言
const std = @import("std");
const ArrayList = std.ArrayList;

/// Reads the file named by the first command-line argument, sorts its lines
/// in ascending byte order and writes them to stdout, one line per '\n'.
///
/// A missing argument or a file that cannot be opened is reported on stderr
/// and the program returns without producing output.
pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = gpa.deinit();
    const allocator = gpa.allocator();

    // All output goes through a buffered writer and is flushed once at the end.
    const output = std.io.getStdOut().writer();
    var stream = std.io.bufferedWriter(output);
    const stdout = stream.writer();

    // 1. Get filename from command line arguments
    var args = std.process.args();
    _ = args.next(); // Skip program name
    const filename = args.next() orelse {
        std.debug.print("Usage: program <filename>\n", .{});
        return;
    };

    // 2. Open and read the file
    const file = std.fs.cwd().openFile(filename, .{}) catch |err| {
        std.debug.print("Open file '{s}' failed: {}\n", .{ filename, err });
        return;
    };
    defer file.close();

    // 3. Read file lines. The slices stored in `lines` point into `content`,
    //    so `content` must outlive them (both are released by the defers).
    const content = try file.readToEndAlloc(allocator, std.math.maxInt(usize));
    defer allocator.free(content);

    var lines = ArrayList([]const u8).init(allocator);
    defer lines.deinit();

    var iterator = std.mem.splitScalar(u8, content, '\n');
    while (iterator.next()) |line| {
        try lines.append(line);
    }

    // splitScalar yields one more segment after the final delimiter. For a
    // file that ends with '\n' (or an empty file) that segment is empty and is
    // not a real line; keeping it would print a spurious blank line at the top
    // of the sorted output.
    if (lines.items.len > 0 and lines.items[lines.items.len - 1].len == 0) {
        _ = lines.pop();
    }

    // 4. Sort and print
    std.mem.sort([]const u8, lines.items, {}, compare);
    for (lines.items) |line| {
        try stdout.print("{s}\n", .{line});
    }
    try stream.flush();
}

/// "Less than" predicate for std.mem.sort: true when `a` orders strictly
/// before `b` in byte-wise lexicographic order.
fn compare(context: void, a: []const u8, b: []const u8) bool {
    _ = context;
    return std.mem.order(u8, a, b) == .lt;
}
关键修改是用std.mem.splitScalar代替了mem.tokenizeSequence,提升很大
编译执行
zig build-exe main.zig -O ReleaseFast
time ./main varchar.txt > zsort.txt
real 0m3.748s
user 0m0.522s
sys 0m0.252s
4.deepseek重写的c语言
提示词:
编写c语言程序,将文件内容读入buf,然后遍历buf,遇到\n就记录偏移地址到一个100万元素的数组a,然后将\n位置填写\0,遍历完成后,对a执行排序,只修改a的元素存放的buf地址,然后浏览a,按地址输出每个字符串+\n到stdout
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>

/*
 * qsort comparator. Each array element is a `char *` pointing at a
 * NUL-terminated line inside the file buffer; lines are ordered with strcmp.
 */
int compare_offsets(const void *a, const void *b) {
    return strcmp(*(const char **)a, *(const char **)b);
}

/*
 * Usage: <program> <filename>
 *
 * Reads the whole file into one buffer, replaces every '\n' with '\0', sorts
 * an array of pointers to the line starts (the text itself is never moved)
 * and prints each line followed by '\n' to stdout.
 *
 * Returns 0 on success and 1 on a usage, I/O or allocation error.
 * A line containing an embedded NUL byte is compared and printed only up to
 * that byte, because lines are handled as C strings.
 */
int main(int argc, char *argv[]) {
    if (argc != 2) {
        fprintf(stderr, "Usage: %s <filename>\n", argv[0]);
        return 1;
    }

    /* Open the file and get its size. */
    int fd = open(argv[1], O_RDONLY);
    if (fd == -1) {
        perror("open");
        return 1;
    }

    struct stat st;
    if (fstat(fd, &st) == -1) {
        perror("fstat");
        close(fd);
        return 1;
    }
    size_t file_size = (size_t)st.st_size;

    /* Allocate the buffer (+1 for a terminating NUL) and read the file. */
    char *buf = malloc(file_size + 1);
    if (!buf) {
        perror("malloc");
        close(fd);
        return 1;
    }

    /* read() may return fewer bytes than requested, so loop until done. */
    size_t total = 0;
    while (total < file_size) {
        ssize_t got = read(fd, buf + total, file_size - total);
        if (got < 0) {
            perror("read");
            free(buf);
            close(fd);
            return 1;
        }
        if (got == 0) {
            /* End of file before the size reported by fstat was reached. */
            fprintf(stderr, "read: unexpected end of file\n");
            free(buf);
            close(fd);
            return 1;
        }
        total += (size_t)got;
    }
    close(fd);
    buf[file_size] = '\0'; /* terminates a final line that has no '\n' */

    /*
     * Count the lines first so the pointer table can be sized exactly: one
     * line per '\n', plus one if the file does not end with '\n'. The table
     * lives on the heap, so there is no fixed line limit and no need for an
     * enlarged stack.
     */
    size_t capacity = 0;
    for (size_t i = 0; i < file_size; i++) {
        if (buf[i] == '\n') {
            capacity++;
        }
    }
    if (file_size > 0 && buf[file_size - 1] != '\n') {
        capacity++;
    }

    /* Never request a zero-byte allocation (empty input). */
    char **lines = malloc((capacity ? capacity : 1) * sizeof *lines);
    if (!lines) {
        perror("malloc");
        free(buf);
        return 1;
    }

    /* Record the start of every line and turn each '\n' into '\0'. */
    size_t line_count = 0;
    char *start = buf;
    char *end = buf + file_size;
    while (start < end) {
        lines[line_count++] = start;
        char *nl = memchr(start, '\n', (size_t)(end - start));
        if (!nl) {
            break; /* last line, already terminated by buf[file_size] */
        }
        *nl = '\0';
        start = nl + 1;
    }

    /* Sort the pointer table; the line text stays where it is. */
    qsort(lines, line_count, sizeof(char *), compare_offsets);

    /* Print the sorted lines. An empty input produces no output. */
    for (size_t i = 0; i < line_count; i++) {
        fputs(lines[i], stdout);
        putchar('\n');
    }

    int status = 0;
    if (fflush(stdout) == EOF || ferror(stdout)) {
        perror("write");
        status = 1;
    }

    free(lines);
    free(buf);
    return status;
}
编译执行
gcc sort_lines.c -o ds_sort -O3
time ./ds_sort varchar.txt > csort.txt
real 0m3.656s
user 0m0.421s
sys 0m0.261s
因为换了amd的机器,我把varchar.txt每行长度扩大了一倍,与上文的计时不可比,所以同时给出amd的机器Linux的sort命令的结果如下:
time sort varchar.txt > xsort.txt
real 0m3.434s
user 0m0.528s
sys 0m0.369s
总结,这几种语言的效率都差不多,修改后的zig跑到了同一起跑线。rust出奇地快,比sort命令还快。我都怀疑它对系统time作修改了,但张泽鹏先生说rust应该没这么无聊。
另外,在windows下编译c程序,需要扩大栈大小,否则执行出错。如下所示:
gcc sort_lines.c -o mingw_sort -O3 -Wl,--stack=68435456
timer64 mingw_sort varchar.txt > msort.txt
Kernel Time = 0.171 = 18%
User Time = 0.609 = 64%
Process Time = 0.781 = 83% Virtual Memory = 110 MB
Global Time = 0.939 = 100% Physical Memory = 113 MB
其中timer64来自7-benchmark, 它的结果被输送到重定向的文件。
测试时间说明,c语言本身并不慢,但是在wsl+docker环境中比较慢。