当前位置: 首页 > news >正文

Linux Zero-Copy 技术深度分析

Linux Zero-Copy 技术深度分析

概述

零拷贝(Zero-copy)是一种高效的数据传输技术,旨在减少由CPU完成的数据复制操作,从而提高系统吞吐量并降低CPU负载。在Linux内核中,零拷贝通过多种机制实现,包括 sendfile()、splice()、copy_file_range()、mmap() 以及 DMA(Direct Memory Access)等。

工作原理

传统数据拷贝流程

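参与者:应用程序、内核空间、文件系统、网络栈、网卡

1. 应用程序 → 内核空间:read() 系统调用
2. 内核空间 → 文件系统:读取文件数据
3. 文件系统 → 内核空间:数据复制到内核缓冲区(DMA拷贝)
4. 内核空间 → 应用程序:数据复制到用户空间(CPU拷贝)
5. 应用程序 → 内核空间:write() 系统调用
6. 内核空间 → 网络栈:数据复制到socket缓冲区(CPU拷贝)
7. 网络栈 → 网卡:DMA传输到网卡(DMA拷贝)

总计4次拷贝:2次CPU拷贝 + 2次DMA拷贝。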

零拷贝优化流程

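参与者:应用程序、内核空间、文件系统、网络栈、网卡

1. 应用程序 → 内核空间:sendfile() 系统调用
2. 内核空间 → 文件系统:读取文件数据
3. 文件系统 → 内核空间:DMA传输到内核缓冲区(DMA拷贝)
4. 内核空间 → 网络栈:零拷贝传输到socket缓冲区(只传递页引用,不复制数据;需要网卡支持scatter-gather DMA)
5. 网络栈 → 网卡:DMA传输到网卡(DMA拷贝)

总计2次拷贝:仅2次DMA拷贝,无CPU拷贝。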

核心实现机制

1. sendfile() 机制

sendfile()系统调用允许在两个文件描述符之间直接传输数据,避免数据在用户空间和内核空间之间的拷贝。

核心数据结构
/* include/linux/fs.h */
/*
 * Abridged excerpt of the VFS operations table: only the hooks relevant to
 * zero-copy transfers are shown here; all other members are omitted.
 */
struct file_operations {
	/* Hook taking a single page plus an offset/length pair. */
	ssize_t (*sendpage)(struct file *, struct page *, int, size_t, loff_t *, int);
	/* Hook whose data source is a pipe (pipe_inode_info comes first). */
	ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int);
	/* Hook whose data destination is a pipe (file comes first). */
	ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int);
	/* ... remaining members omitted in this excerpt ... */
};

/* include/linux/splice.h */
/*
 * Descriptor carried through a splice operation.
 * NOTE(review): the per-field meanings below follow the upstream header's
 * naming; confirm against the kernel version you are targeting.
 */
struct splice_desc {
	unsigned int len;		/* length of the request */
	union {
		void __user *userptr;	/* target is a user-space pointer */
		struct file *file;	/* target is a file */
		void *data;		/* opaque, actor-specific target */
	} u;
	loff_t pos;			/* position value */
	loff_t *opos;			/* pointer to an output position */
	size_t num_spliced;		/* running count, presumably bytes spliced so far */
	unsigned int flags;		/* flags word */
};
系统调用接口
/* fs/read_write.c - sendfile system call implementation (abridged) */

/*
 * do_sendfile - common backend for the sendfile family of system calls.
 * @out_fd: descriptor that receives the data
 * @in_fd:  descriptor the data is read from
 * @ppos:   optional explicit read position; NULL means "use and update the
 *          input file's own f_pos"
 * @count:  number of bytes requested (clamped to MAX_RW_COUNT)
 * @max:    largest file offset allowed; 0 means "derive it from the
 *          s_maxbytes of both superblocks"
 *
 * Returns the number of bytes transferred or a negative errno.
 *
 * Defined before the syscall wrapper below because it is static and has no
 * separate prototype.
 */
static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
			   size_t count, loff_t max)
{
	struct fd in, out;
	struct inode *in_inode, *out_inode;
	loff_t pos;
	loff_t out_pos;
	ssize_t retval;
	int fl;

	/* Get the input file and verify that it may be read. */
	retval = -EBADF;
	in = fdget(in_fd);
	if (!in.file)
		goto out;
	if (!(in.file->f_mode & FMODE_READ))
		goto fput_in;

	retval = -ESPIPE;
	if (!ppos) {
		pos = in.file->f_pos;
	} else {
		pos = *ppos;
		/* An explicit offset only makes sense on pread-capable files. */
		if (!(in.file->f_mode & FMODE_PREAD))
			goto fput_in;
	}

	/* Range and permission check for the read side. */
	retval = rw_verify_area(READ, in.file, &pos, count);
	if (retval < 0)
		goto fput_in;
	if (count > MAX_RW_COUNT)
		count = MAX_RW_COUNT;

	/* Get the output file and verify that it may be written. */
	retval = -EBADF;
	out = fdget(out_fd);
	if (!out.file)
		goto fput_in;
	if (!(out.file->f_mode & FMODE_WRITE))
		goto fput_out;

	in_inode = file_inode(in.file);
	out_inode = file_inode(out.file);
	out_pos = out.file->f_pos;

	retval = rw_verify_area(WRITE, out.file, &out_pos, count);
	if (retval < 0)
		goto fput_out;

	if (!max)
		max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);

	if (unlikely(pos + count > max)) {
		retval = -EOVERFLOW;
		if (pos >= max)
			goto fput_out;
		/* Partial transfer up to the limit is still allowed. */
		count = max - pos;
	}

	fl = 0;
	/*
	 * NOTE(review): mainline keeps this O_NONBLOCK propagation compiled
	 * out (#if 0); it is retained here as in the article - verify against
	 * the kernel version being described.
	 */
	if (in.file->f_flags & O_NONBLOCK)
		fl = SPLICE_F_NONBLOCK;

	file_start_write(out.file);
	retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl);
	file_end_write(out.file);

	if (retval > 0) {
		add_rchar(current, retval);
		add_wchar(current, retval);
		fsnotify_access(in.file);
		fsnotify_modify(out.file);
		out.file->f_pos = out_pos;
		/*
		 * With an explicit offset only the caller's copy advances;
		 * the input file's f_pos must stay untouched.
		 */
		if (ppos)
			*ppos = pos;
		else
			in.file->f_pos = pos;
	}

	inc_syscr(current);
	inc_syscw(current);
	if (pos > max)
		retval = -EOVERFLOW;

fput_out:
	fdput(out);
fput_in:
	fdput(in);
out:
	return retval;
}

/*
 * sendfile(2) entry point using the legacy (non-LFS) off_t.
 *
 * When @offset is non-NULL it is read from user space, used as the start
 * position, and written back after the transfer; the file position of
 * in_fd is then left unchanged. When it is NULL the file position of in_fd
 * is used and advanced.
 */
SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count)
{
	loff_t pos;
	off_t off;
	ssize_t ret;

	if (offset) {
		if (unlikely(get_user(off, offset)))
			return -EFAULT;
		pos = off;
		ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
		if (unlikely(put_user(pos, offset)))
			return -EFAULT;
		return ret;
	}

	/* No user-visible off_t here, so let do_sendfile() pick the limit. */
	return do_sendfile(out_fd, in_fd, NULL, count, 0);
}

2. splice() 机制

splice()系统调用使用管道作为中介,在两个文件描述符之间移动数据。

核心数据结构
/* include/linux/pipe_fs_i.h */
/* Flag bits stored in pipe_buffer.flags. */
#define PIPE_BUF_FLAG_LRU       0x01    /* page is on the LRU */
#define PIPE_BUF_FLAG_ATOMIC    0x02    /* was atomically mapped */
#define PIPE_BUF_FLAG_GIFT      0x04    /* page is a gift */
#define PIPE_BUF_FLAG_PACKET    0x08    /* read() as a packet */
#define PIPE_BUF_FLAG_CAN_MERGE 0x10    /* can merge buffers */
#define PIPE_BUF_FLAG_WHOLE     0x20    /* read() must return entire buffer or error */

/*
 * One slot of a pipe: a reference to a page plus the (offset, len) window
 * of that page which holds data. This is what lets splice() pass page
 * references around instead of copying bytes.
 */
struct pipe_buffer {
	struct page *page;			/* page holding the data */
	unsigned int offset, len;		/* data window inside the page */
	const struct pipe_buf_operations *ops;	/* callbacks for this buffer */
	unsigned int flags;			/* PIPE_BUF_FLAG_* bits */
	unsigned long private;			/* owner-defined value */
};

/*
 * Per-pipe state.
 * NOTE(review): the field descriptions are inferred from the field names
 * and types; confirm against the upstream header.
 */
struct pipe_inode_info {
	struct mutex mutex;			/* lock for the pipe state */
	wait_queue_head_t rd_wait, wr_wait;	/* reader / writer wait queues */
	unsigned int head;			/* ring index, presumably producer side */
	unsigned int tail;			/* ring index, presumably consumer side */
	unsigned int max_usage;			/* limit on slots in use */
	unsigned int ring_size;			/* number of slots in bufs */
	unsigned int readers;			/* reader count */
	unsigned int writers;			/* writer count */
	unsigned int files;			/* count of file references */
	unsigned int r_counter;			/* reader counter */
	unsigned int w_counter;			/* writer counter */
	struct page *tmp_page;			/* spare page */
	struct fasync_struct *fasync_readers;	/* fasync list for readers */
	struct fasync_struct *fasync_writers;	/* fasync list for writers */
	struct pipe_buffer *bufs;		/* array of pipe buffers */
	struct user_struct *user;		/* associated user accounting */
};

/* Callback table attached to every pipe_buffer. */
struct pipe_buf_operations {
	int (*confirm)(struct pipe_inode_info *, struct pipe_buffer *);
	void (*release)(struct pipe_inode_info *, struct pipe_buffer *);
	bool (*try_steal)(struct pipe_inode_info *, struct pipe_buffer *);
	bool (*get)(struct pipe_inode_info *, struct pipe_buffer *);
};
splice系统调用实现
/* fs/splice.c */
/*
 * splice(2) entry point: rejects zero length and unknown flags, resolves
 * both descriptors, and hands the work to do_splice(). Returns whatever
 * do_splice() returns, or -EBADF when either descriptor is invalid.
 */
SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
		int, fd_out, loff_t __user *, off_out,
		size_t, len, unsigned int, flags)
{
	struct fd in, out;
	long error;

	if (unlikely(!len))
		return 0;

	if (unlikely(flags & ~SPLICE_F_ALL))
		return -EINVAL;

	error = -EBADF;
	in = fdget(fd_in);
	if (in.file) {
		out = fdget(fd_out);
		if (out.file) {
			error = do_splice(in.file, off_in, out.file, off_out,
					  len, flags);
			fdput(out);
		}
		fdput(in);
	}
	return error;
}

/*
 * Dispatch a splice request according to which side(s) are pipes:
 *   - both pipes     -> splice_pipe_to_pipe()
 *   - input is pipe  -> do_splice_from() (pipe to file)
 *   - output is pipe -> splice_file_to_pipe() (file to pipe)
 *   - neither        -> -EINVAL
 * An offset pointer is only accepted for the non-pipe side; supplying one
 * for a pipe yields -ESPIPE.
 */
static long do_splice(struct file *in, loff_t __user *off_in,
		      struct file *out, loff_t __user *off_out,
		      size_t len, unsigned int flags)
{
	struct pipe_inode_info *ipipe;
	struct pipe_inode_info *opipe;
	loff_t offset;
	long ret;

	ipipe = get_pipe_info(in, true);
	opipe = get_pipe_info(out, true);

	if (ipipe && opipe) {
		if (off_in || off_out)
			return -ESPIPE;

		/* Splicing to self would be fun, but... */
		if (ipipe == opipe)
			return -EINVAL;

		/* A non-blocking descriptor on either side makes the splice non-blocking. */
		if ((in->f_flags | out->f_flags) & O_NONBLOCK)
			flags |= SPLICE_F_NONBLOCK;

		return splice_pipe_to_pipe(ipipe, opipe, len, flags);
	}

	if (ipipe) {
		if (off_in)
			return -ESPIPE;
		if (off_out) {
			if (!(out->f_mode & FMODE_PWRITE))
				return -EINVAL;
			if (copy_from_user(&offset, off_out, sizeof(loff_t)))
				return -EFAULT;
		} else {
			offset = out->f_pos;
		}

		if (unlikely(out->f_flags & O_APPEND))
			return -EINVAL;

		ret = rw_verify_area(WRITE, out, &offset, len);
		if (unlikely(ret < 0))
			return ret;

		if (in->f_flags & O_NONBLOCK)
			flags |= SPLICE_F_NONBLOCK;

		file_start_write(out);
		ret = do_splice_from(ipipe, out, &offset, len, flags);
		file_end_write(out);

		/* Publish the new position: file's f_pos, or the caller's offset. */
		if (!off_out)
			out->f_pos = offset;
		else if (copy_to_user(off_out, &offset, sizeof(loff_t)))
			ret = -EFAULT;

		return ret;
	}

	if (opipe) {
		if (off_out)
			return -ESPIPE;
		if (off_in) {
			if (!(in->f_mode & FMODE_PREAD))
				return -EINVAL;
			if (copy_from_user(&offset, off_in, sizeof(loff_t)))
				return -EFAULT;
		} else {
			offset = in->f_pos;
		}

		if (in->f_flags & O_NONBLOCK)
			flags |= SPLICE_F_NONBLOCK;

		ret = splice_file_to_pipe(in, opipe, &offset, len, flags);

		/* Publish the new position: file's f_pos, or the caller's offset. */
		if (!off_in)
			in->f_pos = offset;
		else if (copy_to_user(off_in, &offset, sizeof(loff_t)))
			ret = -EFAULT;

		return ret;
	}

	/* At least one side must be a pipe. */
	return -EINVAL;
}

3. mmap() 机制

内存映射允许进程将文件内容直接映射到虚拟内存空间,避免传统的读写操作。

核心数据结构
/* include/linux/mm_types.h */
/*
 * Describes one virtual memory area (abridged excerpt). A file-backed
 * mmap() is represented by a VMA whose vm_file is non-NULL.
 */
struct vm_area_struct {
	unsigned long vm_start;         /* Our start address within vm_mm. */
	unsigned long vm_end;           /* The first byte after our end address */

	/* Links to neighbouring VMAs and the tree node for this VMA. */
	struct vm_area_struct *vm_next, *vm_prev;
	struct rb_node vm_rb;
	unsigned long rb_subtree_gap;

	struct mm_struct *vm_mm;        /* The address space we belong to. */
	pgprot_t vm_page_prot;          /* Access permissions of this VMA. */
	unsigned long vm_flags;         /* Flags, see mm.h. */

	union {
		struct {
			struct rb_node rb;
			unsigned long rb_subtree_last;
		} shared;
		const char __user *anon_name;
	};

	struct list_head anon_vma_chain;
	struct anon_vma *anon_vma;

	/* Callbacks for this VMA (fault handling etc.). */
	const struct vm_operations_struct *vm_ops;

	unsigned long vm_pgoff;         /* Offset (within vm_file) in PAGE_SIZE units */
	struct file *vm_file;           /* File we map to (can be NULL). */
	void *vm_private_data;          /* was vm_pte (shared mem) */
};

/* Callback table referenced by vm_area_struct.vm_ops. */
struct vm_operations_struct {
	void (*open)(struct vm_area_struct *area);
	void (*close)(struct vm_area_struct *area);
	int (*split)(struct vm_area_struct *area, unsigned long addr);
	int (*mremap)(struct vm_area_struct *area);
	vm_fault_t (*fault)(struct vm_fault *vmf);
	vm_fault_t (*huge_fault)(struct vm_fault *vmf, enum page_entry_size pe_size);
	vm_fault_t (*map_pages)(struct vm_fault *vmf,
			pgoff_t start_pgoff, pgoff_t end_pgoff);
	unsigned long (*pagesize)(struct vm_area_struct *area);
	int (*page_mkwrite)(struct vm_fault *vmf);
	int (*pfn_mkwrite)(struct vm_fault *vmf);
	int (*access)(struct vm_area_struct *vma, unsigned long addr,
		      void *buf, int len, int write);
	const char *(*name)(struct vm_area_struct *vma);
	int (*set_policy)(struct vm_area_struct *vma, struct mempolicy *new);
	struct mempolicy *(*get_policy)(struct vm_area_struct *vma,
					unsigned long addr);
	struct page *(*find_special_page)(struct vm_area_struct *vma,
					  unsigned long addr);
};
mmap系统调用实现
/* mm/mmap.c */
/*
 * mmap(2) entry point: rejects byte offsets that are not page aligned and
 * forwards to ksys_mmap_pgoff() with the offset converted to pages.
 */
SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len,
		unsigned long, prot, unsigned long, flags,
		unsigned long, fd, unsigned long, off)
{
	if (offset_in_page(off) != 0)
		return -EINVAL;

	return ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT);
}

/*
 * Resolve the backing file for a mapping request and pass it on to
 * vm_mmap_pgoff().
 *
 * - File-backed request (no MAP_ANONYMOUS): look up @fd; for hugetlbfs
 *   files round @len up to the huge page size; MAP_HUGETLB on any other
 *   file is -EINVAL.
 * - Anonymous MAP_HUGETLB request: create a hugetlb file to back it.
 *
 * Returns the value of vm_mmap_pgoff(), or a negative errno from the
 * checks above. The file reference, if any, is dropped before returning.
 */
unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
			      unsigned long prot, unsigned long flags,
			      unsigned long fd, unsigned long pgoff)
{
	struct file *file = NULL;
	unsigned long retval;

	if (!(flags & MAP_ANONYMOUS)) {
		audit_mmap_fd(fd, flags);
		file = fget(fd);
		if (!file)
			return -EBADF;
		if (is_file_hugepages(file)) {
			len = ALIGN(len, huge_page_size(hstate_file(file)));
		} else if (unlikely(flags & MAP_HUGETLB)) {
			retval = -EINVAL;
			goto out_fput;
		}
	} else if (flags & MAP_HUGETLB) {
		struct user_struct *user = NULL;
		struct hstate *hs;

		/* The requested huge page size is encoded in the flag bits. */
		hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
		if (!hs)
			return -EINVAL;

		len = ALIGN(len, huge_page_size(hs));
		/*
		 * VM_NORESERVE is used because the reservations will be
		 * taken when vm_ops->mmap() is called
		 * A dummy user value is used because we are not locking
		 * memory so no accounting is necessary
		 */
		file = hugetlb_file_setup(HUGETLB_ANON_FILE, len,
				VM_NORESERVE,
				&user, HUGETLB_ANONHUGE_INODE,
				(flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
		if (IS_ERR(file))
			return PTR_ERR(file);
	}

	/* These two flags are cleared before the request is passed on. */
	flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);

	retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
out_fput:
	if (file)
		fput(file);
	return retval;
}

4. DMA 机制

DMA允许外设直接访问内存,无需CPU干预进行数据传输。

核心数据结构
/* include/linux/dma-mapping.h */
/* Direction of a DMA transfer, as passed to the mapping calls. */
enum dma_data_direction {
	DMA_BIDIRECTIONAL = 0,
	DMA_TO_DEVICE = 1,
	DMA_FROM_DEVICE = 2,
	DMA_NONE = 3,
};

/*
 * Table of DMA mapping callbacks. The generic API functions call through
 * this table unless the direct-mapping path is selected.
 */
struct dma_map_ops {
	/* Buffer allocation / release. */
	void* (*alloc)(struct device *dev, size_t size,
			dma_addr_t *dma_handle, gfp_t gfp,
			unsigned long attrs);
	void (*free)(struct device *dev, size_t size,
			void *vaddr, dma_addr_t dma_handle,
			unsigned long attrs);
	struct page *(*alloc_pages)(struct device *dev, size_t size,
			dma_addr_t *dma_handle, enum dma_data_direction dir,
			gfp_t gfp);
	void (*free_pages)(struct device *dev, size_t size,
			struct page *page, dma_addr_t dma_handle,
			enum dma_data_direction dir);

	/* Single-page mappings. */
	dma_addr_t (*map_page)(struct device *dev, struct page *page,
			unsigned long offset, size_t size,
			enum dma_data_direction dir,
			unsigned long attrs);
	void (*unmap_page)(struct device *dev, dma_addr_t dma_handle,
			size_t size, enum dma_data_direction dir,
			unsigned long attrs);

	/* Scatter-gather list mappings. */
	int (*map_sg)(struct device *dev, struct scatterlist *sg,
			int nents, enum dma_data_direction dir,
			unsigned long attrs);
	void (*unmap_sg)(struct device *dev, struct scatterlist *sg,
			int nents, enum dma_data_direction dir,
			unsigned long attrs);

	/* Mappings of a physical address range. */
	dma_addr_t (*map_resource)(struct device *dev, phys_addr_t phys_addr,
			size_t size, enum dma_data_direction dir,
			unsigned long attrs);
	void (*unmap_resource)(struct device *dev, dma_addr_t dma_handle,
			size_t size, enum dma_data_direction dir,
			unsigned long attrs);

	/* Synchronisation hooks for the CPU side and the device side. */
	void (*sync_single_for_cpu)(struct device *dev, dma_addr_t dma_handle,
			size_t size, enum dma_data_direction dir);
	void (*sync_single_for_device)(struct device *dev, dma_addr_t dma_handle,
			size_t size, enum dma_data_direction dir);
	void (*sync_sg_for_cpu)(struct device *dev, struct scatterlist *sg,
			int nents, enum dma_data_direction dir);
	void (*sync_sg_for_device)(struct device *dev, struct scatterlist *sg,
			int nents, enum dma_data_direction dir);
	void (*cache_sync)(struct device *dev, void *vaddr, size_t size,
			enum dma_data_direction direction);

	/* Capability queries. */
	int (*dma_supported)(struct device *dev, u64 mask);
	u64 (*get_required_mask)(struct device *dev);
	size_t (*max_mapping_size)(struct device *dev);
	unsigned long (*get_merge_boundary)(struct device *dev);
};
DMA API实现
/* kernel/dma/mapping.c */
dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page,size_t offset, size_t size,enum dma_data_direction dir,unsigned long attrs)
{const struct dma_map_ops *ops = get_dma_ops(dev);dma_addr_t addr;BUG_ON(!valid_dma_direction(dir));if (dma_is_direct(ops))addr = dma_direct_map_page(dev, page, offset, size, dir, attrs);elseaddr = ops->map_page(dev, page, offset, size, dir, attrs);debug_dma_map_page(dev, page, offset, size, dir, addr, false);return addr;
}
EXPORT_SYMBOL(dma_map_page_attrs);void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size,enum dma_data_direction dir, unsigned long attrs)
{const struct dma_map_ops *ops = get_dma_ops(dev);BUG_ON(!valid_dma_direction(dir));if (dma_is_direct(ops))dma_direct_unmap_page(dev, addr, size, dir, attrs);else if (ops->unmap_page)ops->unmap_page(dev, addr, size, dir, attrs);debug_dma_unmap_page(dev, addr, size, dir, false);
}
EXPORT_SYMBOL(dma_unmap_page_attrs);int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents,enum dma_data_direction dir, unsigned long attrs)
{const struct dma_map_ops *ops = get_dma_ops(dev);int ents;BUG_ON(!valid_dma_direction(dir));if (dma_is_direct(ops))ents = dma_direct_map_sg(dev, sg, nents, dir, attrs);elseents = ops->map_sg(dev, sg, nents, dir, attrs);if (ents > 0)debug_dma_map_sg(dev, sg, nents, ents, dir, false);else if (WARN_ON_ONCE(ents != -EINVAL && ents != -ENOMEM &&ents != -EIO && ents != -EREMOTEIO))return -EIO;return ents;
}
EXPORT_SYMBOL(dma_map_sg_attrs);

零拷贝技术对比分析

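| 技术 | 适用场景 | 优势 | 局限性 | 性能提升 |
| --- | --- | --- | --- | --- |
| sendfile() | 文件到socket传输 | 简单易用,内核自动优化 | 输入端必须是支持类mmap操作的文件(不能是socket);Linux 2.6.33之前输出端只能是socket | 50-80% |
| splice() | 任意文件描述符间传输 | 灵活性高,支持管道 | 需要管道作为中介(至少一端必须是管道) | 40-70% |
| mmap() | 大文件随机访问 | 减少系统调用,支持随机访问 | 虚拟内存开销 | 30-60% |
| copy_file_range() | 文件间拷贝 | 文件系统级优化 | 需要文件系统支持 | 60-90% |
| DMA | 设备到内存传输 | 完全零CPU参与 | 需要硬件支持 | 80-95% |

注:"性能提升"一列为粗略的经验估计,实际效果取决于硬件、文件系统和负载,应以实测为准。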

零拷贝架构图

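分层结构:

- 用户空间:应用程序、用户缓冲区
- 内核空间:系统调用层、VFS虚拟文件系统、页缓存Page Cache、Socket缓冲区、网络协议栈
- 硬件层:DMA引擎、存储设备、网络接口卡

数据路径:

- 应用程序 —(sendfile/splice/mmap)→ 系统调用层 → VFS虚拟文件系统 → 页缓存Page Cache
- 页缓存Page Cache —(零拷贝)→ Socket缓冲区 → 网络协议栈
- 存储设备 —(DMA)→ 页缓存Page Cache;网络协议栈 —(DMA)→ 网络接口卡(均由DMA引擎完成)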

简单实现示例

下面提供几个零拷贝技术的应用示例:

1. sendfile示例 - 高性能文件服务器

/*
 * sendfile example - minimal single-threaded HTTP file server.
 *
 * Serves regular files below the current working directory and transfers
 * the file body with sendfile(2), so the payload never passes through a
 * user-space buffer.
 */
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif

#include <sys/sendfile.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>
#include <poll.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <netinet/in.h>
#include <arpa/inet.h>

#define PORT 8080
#define BUFFER_SIZE 4096

typedef struct {
    int sockfd;               /* listening socket */
    struct sockaddr_in addr;  /* address the socket is bound to */
} server_t;

/*
 * Create a TCP socket listening on PORT on all interfaces.
 * Returns 0 on success, -1 on failure (the socket is closed again).
 */
int create_server(server_t *server) {
    server->sockfd = socket(AF_INET, SOCK_STREAM, 0);
    if (server->sockfd < 0) {
        perror("socket creation failed");
        return -1;
    }

    int opt = 1;
    if (setsockopt(server->sockfd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)) < 0) {
        perror("setsockopt failed");
        close(server->sockfd);
        return -1;
    }

    /* Zero the whole structure so the padding bytes are defined. */
    memset(&server->addr, 0, sizeof(server->addr));
    server->addr.sin_family = AF_INET;
    server->addr.sin_addr.s_addr = INADDR_ANY;
    server->addr.sin_port = htons(PORT);

    if (bind(server->sockfd, (struct sockaddr *)&server->addr, sizeof(server->addr)) < 0) {
        perror("bind failed");
        close(server->sockfd);
        return -1;
    }

    if (listen(server->sockfd, 10) < 0) {
        perror("listen failed");
        close(server->sockfd);
        return -1;
    }

    printf("Server listening on port %d\n", PORT);
    return 0;
}

/* Block until fd is writable. Returns 0 on success, -1 on poll() error. */
static int wait_writable(int fd) {
    struct pollfd pfd = { .fd = fd, .events = POLLOUT };
    int rc;

    do {
        rc = poll(&pfd, 1, -1);
    } while (rc < 0 && errno == EINTR);

    return rc < 0 ? -1 : 0;
}

/* Send exactly len bytes, retrying short writes. Returns 0 or -1. */
static int send_all(int fd, const char *buf, size_t len) {
    while (len > 0) {
        ssize_t n = send(fd, buf, len, 0);
        if (n < 0) {
            if (errno == EINTR) {
                continue;
            }
            if (errno == EAGAIN || errno == EWOULDBLOCK) {
                if (wait_writable(fd) < 0) {
                    return -1;
                }
                continue;
            }
            return -1;
        }
        buf += n;
        len -= (size_t)n;
    }
    return 0;
}

/*
 * Transfer up to count bytes from in_fd to out_fd with sendfile(2),
 * looping over short transfers.
 *
 * offset is passed straight to sendfile() and is advanced by the kernel.
 * Returns the number of bytes sent (which is less than count if EOF is hit
 * first) or -1 on error.
 */
ssize_t sendfile_transfer(int out_fd, int in_fd, off_t *offset, size_t count) {
    size_t total_sent = 0;

    while (total_sent < count) {
        ssize_t sent = sendfile(out_fd, in_fd, offset, count - total_sent);
        if (sent < 0) {
            if (errno == EINTR) {
                continue;
            }
            if (errno == EAGAIN || errno == EWOULDBLOCK) {
                /* Non-blocking socket: sleep until writable instead of spinning. */
                if (wait_writable(out_fd) < 0) {
                    perror("poll failed");
                    return -1;
                }
                continue;
            }
            perror("sendfile failed");
            return -1;
        }
        if (sent == 0) {
            break;  /* EOF reached */
        }
        total_sent += (size_t)sent;
    }

    return (ssize_t)total_sent;
}

/* Send the canned 404 response; errors are ignored (client is going away). */
static void send_not_found(int client_fd) {
    const char *not_found = "HTTP/1.1 404 Not Found\r\n"
                            "Content-Length: 13\r\n"
                            "Connection: close\r\n\r\n"
                            "404 Not Found";
    (void)send_all(client_fd, not_found, strlen(not_found));
}

/*
 * Map a request path to a file below the current directory.
 * Returns 0 and fills filepath, or -1 if the path is not acceptable
 * (not absolute, contains "..", or does not fit the buffer).
 */
static int resolve_path(const char *request_path, char *filepath, size_t size) {
    if (request_path[0] != '/' || strstr(request_path, "..") != NULL) {
        return -1;  /* refuse anything that could escape the document root */
    }

    int n;
    if (strcmp(request_path, "/") == 0) {
        n = snprintf(filepath, size, "%s", "./index.html");
    } else {
        n = snprintf(filepath, size, ".%s", request_path);
    }
    return (n < 0 || (size_t)n >= size) ? -1 : 0;
}

/*
 * Handle one connection: read a single request, answer a GET with the file
 * (or 404), then close client_fd. Requests that are not "GET <path> ..."
 * are dropped without a response.
 */
void handle_client(int client_fd) {
    char request[BUFFER_SIZE];
    char response[BUFFER_SIZE];
    char filepath[256];
    struct stat file_stat;

    /* Read the HTTP request (a single recv is enough for this demo). */
    ssize_t bytes_read = recv(client_fd, request, sizeof(request) - 1, 0);
    if (bytes_read <= 0) {
        close(client_fd);
        return;
    }
    request[bytes_read] = '\0';

    /* Minimal parsing of "GET <path> HTTP/x.y". */
    if (strncmp(request, "GET ", 4) != 0) {
        close(client_fd);
        return;
    }
    char *path_start = request + 4;
    char *path_end = strchr(path_start, ' ');
    if (path_end == NULL) {
        close(client_fd);
        return;
    }
    *path_end = '\0';

    int file_fd = -1;
    if (resolve_path(path_start, filepath, sizeof(filepath)) == 0) {
        file_fd = open(filepath, O_RDONLY);
    }

    /* Only regular files are served; everything else is a 404. */
    if (file_fd < 0 || fstat(file_fd, &file_stat) < 0 || !S_ISREG(file_stat.st_mode)) {
        if (file_fd >= 0) {
            close(file_fd);
        }
        send_not_found(client_fd);
        close(client_fd);
        return;
    }

    /* Send the HTTP response header. */
    int header_len = snprintf(response, sizeof(response),
                              "HTTP/1.1 200 OK\r\n"
                              "Content-Type: text/html\r\n"
                              "Content-Length: %lld\r\n"
                              "Connection: close\r\n\r\n",
                              (long long)file_stat.st_size);
    if (header_len < 0 || (size_t)header_len >= sizeof(response) ||
        send_all(client_fd, response, (size_t)header_len) < 0) {
        printf("Failed to send header: %s\n", filepath);
        close(file_fd);
        close(client_fd);
        return;
    }

    /* Zero-copy transfer of the file body. */
    off_t offset = 0;
    ssize_t sent = sendfile_transfer(client_fd, file_fd, &offset, (size_t)file_stat.st_size);
    if (sent < 0) {
        printf("Failed to send file: %s\n", filepath);
    } else {
        printf("Sent %zd bytes using sendfile: %s\n", sent, filepath);
    }

    close(file_fd);
    close(client_fd);
}

int main(void) {
    server_t server;
    struct sockaddr_in client_addr;

    /* A client that disconnects mid-transfer must not kill the server. */
    signal(SIGPIPE, SIG_IGN);

    if (create_server(&server) < 0) {
        return EXIT_FAILURE;
    }

    printf("Zero-copy file server started on port %d\n", PORT);

    while (1) {
        /* accept() overwrites the length, so reset it for every call. */
        socklen_t client_len = sizeof(client_addr);
        int client_fd = accept(server.sockfd, (struct sockaddr *)&client_addr, &client_len);
        if (client_fd < 0) {
            perror("accept failed");
            continue;
        }

        printf("Client connected: %s:%d\n",
               inet_ntoa(client_addr.sin_addr), ntohs(client_addr.sin_port));

        /* A real server would hand this off to a thread or an event loop. */
        handle_client(client_fd);
    }

    close(server.sockfd);
    return EXIT_SUCCESS;
}

2. splice示例 - 管道数据转发器

/*
 * splice example - copy one file to another through a pipe.
 *
 * splice(2) requires one side of every call to be a pipe, so the data goes
 * input file -> pipe -> output file without entering user space.
 */

/* Feature-test macros only take effect when defined before any #include. */
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif

#include <fcntl.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/syscall.h>

#define SPLICE_SIZE (64 * 1024)  /* 64KB per splice call */
#define PIPE_SIZE (16 * 4096)    /* 64KB pipe buffer */

/* Thin wrapper kept for the example's callers; forwards to splice(2). */
ssize_t splice_syscall(int fd_in, loff_t *off_in, int fd_out, loff_t *off_out,
                       size_t len, unsigned int flags) {
    return splice(fd_in, off_in, fd_out, off_out, len, flags);
}

typedef struct {
    int pipe_fd[2];      /* [0] = read end, [1] = write end */
    size_t buffer_size;  /* maximum bytes moved per splice call */
} splice_pipe_t;

/*
 * Create the intermediate pipe and try to size it to PIPE_SIZE.
 * Returns 0 on success, -1 if the pipe cannot be created.
 */
int create_splice_pipe(splice_pipe_t *sp) {
    if (pipe(sp->pipe_fd) < 0) {
        perror("pipe creation failed");
        return -1;
    }

    /* Resizing is an optimisation only; carry on with the default size. */
    if (fcntl(sp->pipe_fd[1], F_SETPIPE_SZ, PIPE_SIZE) < 0) {
        perror("fcntl F_SETPIPE_SZ failed");
    }

    sp->buffer_size = SPLICE_SIZE;
    return 0;
}

/* Close both ends of the pipe. */
void destroy_splice_pipe(splice_pipe_t *sp) {
    close(sp->pipe_fd[0]);
    close(sp->pipe_fd[1]);
}

/*
 * Move up to count bytes from fd_in to fd_out via a pipe.
 *
 * Returns the number of bytes transferred (less than count if the input
 * hits EOF first), or -1 if any splice call fails - a partial transfer
 * caused by an error is never reported as success.
 */
ssize_t splice_transfer(int fd_in, int fd_out, size_t count) {
    splice_pipe_t sp;
    size_t total_transferred = 0;
    ssize_t result = -1;

    if (create_splice_pipe(&sp) < 0) {
        return -1;
    }

    printf("Starting splice transfer of %zu bytes\n", count);

    while (total_transferred < count) {
        size_t remaining = count - total_transferred;
        size_t to_splice = (remaining < sp.buffer_size) ? remaining : sp.buffer_size;

        /* Step 1: input descriptor -> pipe. */
        ssize_t bytes_to_pipe = splice_syscall(fd_in, NULL, sp.pipe_fd[1], NULL,
                                               to_splice, SPLICE_F_MOVE);
        if (bytes_to_pipe < 0) {
            if (errno == EINTR || errno == EAGAIN) {
                continue;
            }
            perror("splice to pipe failed");
            goto cleanup;
        }
        if (bytes_to_pipe == 0) {
            printf("EOF reached on input\n");
            break;
        }

        /* Step 2: drain the pipe completely into the output descriptor. */
        size_t bytes_in_pipe = (size_t)bytes_to_pipe;
        while (bytes_in_pipe > 0) {
            ssize_t bytes_from_pipe = splice_syscall(sp.pipe_fd[0], NULL, fd_out, NULL,
                                                     bytes_in_pipe, SPLICE_F_MOVE);
            if (bytes_from_pipe < 0) {
                if (errno == EINTR || errno == EAGAIN) {
                    continue;
                }
                perror("splice from pipe failed");
                goto cleanup;
            }
            if (bytes_from_pipe == 0) {
                printf("Unexpected EOF on output\n");
                goto cleanup;
            }
            bytes_in_pipe -= (size_t)bytes_from_pipe;
            total_transferred += (size_t)bytes_from_pipe;
        }

        printf("Transferred %zd bytes, total: %zu\n", bytes_to_pipe, total_transferred);
    }

    result = (ssize_t)total_transferred;

cleanup:
    destroy_splice_pipe(&sp);
    return result;
}

int main(int argc, char *argv[]) {
    if (argc != 3) {
        fprintf(stderr, "Usage: %s <input_file> <output_file>\n", argv[0]);
        return EXIT_FAILURE;
    }

    const char *input_file = argv[1];
    const char *output_file = argv[2];

    int fd_in = open(input_file, O_RDONLY);
    if (fd_in < 0) {
        perror("Failed to open input file");
        return EXIT_FAILURE;
    }

    int fd_out = open(output_file, O_WRONLY | O_CREAT | O_TRUNC, 0644);
    if (fd_out < 0) {
        perror("Failed to open output file");
        close(fd_in);
        return EXIT_FAILURE;
    }

    /* The input size bounds the transfer. */
    struct stat st;
    if (fstat(fd_in, &st) < 0) {
        perror("fstat failed");
        close(fd_in);
        close(fd_out);
        return EXIT_FAILURE;
    }

    printf("Input file size: %lld bytes\n", (long long)st.st_size);

    ssize_t transferred = splice_transfer(fd_in, fd_out, (size_t)st.st_size);
    if (transferred >= 0) {
        printf("Successfully transferred %zd bytes using splice\n", transferred);
    } else {
        printf("Transfer failed\n");
    }

    close(fd_in);
    /* A failing close() on the output can signal a deferred write error. */
    if (close(fd_out) < 0) {
        perror("Failed to close output file");
        transferred = -1;
    }

    return (transferred >= 0) ? EXIT_SUCCESS : EXIT_FAILURE;
}

3. mmap示例 - 内存映射文件处理器

/*
 * mmap example - search, copy and prefetch files through memory mappings.
 */

/* madvise() and the MADV_* constants need this; must precede all includes. */
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif

#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <sys/time.h>

typedef struct {
    void *addr;     /* mapping address; MAP_FAILED when the file is empty */
    size_t length;  /* mapping length in bytes (the file size) */
    int fd;         /* descriptor of the mapped file */
    int prot;       /* protection flags passed to mmap */
    int flags;      /* mapping flags passed to mmap */
} mmap_region_t;

/* Seconds elapsed between two gettimeofday() samples. */
static double elapsed_seconds(const struct timeval *start, const struct timeval *end) {
    return (end->tv_sec - start->tv_sec) + (end->tv_usec - start->tv_usec) / 1000000.0;
}

/*
 * Open filename and map the whole file.
 *
 * The file is opened read-write when prot contains PROT_WRITE, read-only
 * otherwise. mmap() rejects a zero length, so an empty file yields a valid
 * region with length 0 and addr == MAP_FAILED instead of an error.
 * Returns 0 on success, -1 on failure (nothing is left open).
 */
int create_mmap_region(mmap_region_t *region, const char *filename, int prot, int flags) {
    struct stat st;

    int open_flags = (prot & PROT_WRITE) ? O_RDWR : O_RDONLY;
    region->fd = open(filename, open_flags);
    if (region->fd < 0) {
        perror("Failed to open file");
        return -1;
    }

    if (fstat(region->fd, &st) < 0) {
        perror("fstat failed");
        close(region->fd);
        return -1;
    }

    region->length = (size_t)st.st_size;
    region->prot = prot;
    region->flags = flags;
    region->addr = MAP_FAILED;

    if (region->length == 0) {
        printf("File is empty, nothing to map\n");
        return 0;
    }

    region->addr = mmap(NULL, region->length, prot, flags, region->fd, 0);
    if (region->addr == MAP_FAILED) {
        perror("mmap failed");
        close(region->fd);
        return -1;
    }

    printf("Successfully mapped %zu bytes at address %p\n", region->length, region->addr);
    return 0;
}

/* Unmap the region (if mapped) and close its descriptor. */
void destroy_mmap_region(mmap_region_t *region) {
    if (region->addr != MAP_FAILED) {
        if (munmap(region->addr, region->length) < 0) {
            perror("munmap failed");
        }
    }
    if (region->fd >= 0) {
        close(region->fd);
    }
}

/*
 * Count (and print) every occurrence of pattern in the mapped file,
 * including overlapping ones. An empty pattern, or one longer than the
 * file, matches nothing. Returns the number of occurrences.
 */
size_t mmap_search_pattern(mmap_region_t *region, const char *pattern) {
    size_t pattern_len = strlen(pattern);
    size_t count = 0;
    const char *data = (const char *)region->addr;

    printf("Searching for pattern '%s' in %zu bytes\n", pattern, region->length);

    struct timeval start, end;
    gettimeofday(&start, NULL);

    /*
     * Guard first: length - pattern_len is unsigned and would wrap around
     * for a pattern longer than the file, walking far past the mapping.
     */
    if (pattern_len > 0 && pattern_len <= region->length) {
        size_t last_start = region->length - pattern_len;
        for (size_t i = 0; i <= last_start; i++) {
            if (memcmp(data + i, pattern, pattern_len) == 0) {
                count++;
                printf("Found pattern at offset %zu\n", i);
            }
        }
    }

    gettimeofday(&end, NULL);

    printf("Search completed in %.6f seconds\n", elapsed_seconds(&start, &end));
    printf("Found %zu occurrences of pattern '%s'\n", count, pattern);

    return count;
}

/*
 * Copy src_file to dst_file by mapping both and copying between the
 * mappings. Returns 0 on success, -1 on failure.
 *
 * Note: the memcpy below is a CPU copy between two page-cache mappings;
 * what mmap saves here is the pair of read()/write() user-buffer copies.
 */
int mmap_copy_file(const char *src_file, const char *dst_file) {
    mmap_region_t src_region;
    int rc = 0;

    if (create_mmap_region(&src_region, src_file, PROT_READ, MAP_PRIVATE) < 0) {
        return -1;
    }

    /* A MAP_SHARED + PROT_WRITE mapping requires the file to be open O_RDWR. */
    int dst_fd = open(dst_file, O_RDWR | O_CREAT | O_TRUNC, 0644);
    if (dst_fd < 0) {
        perror("Failed to create destination file");
        destroy_mmap_region(&src_region);
        return -1;
    }

    /* Grow the destination to its final size before mapping it. */
    if (ftruncate(dst_fd, (off_t)src_region.length) < 0) {
        perror("ftruncate failed");
        close(dst_fd);
        destroy_mmap_region(&src_region);
        return -1;
    }

    if (src_region.length > 0) {
        void *dst_addr = mmap(NULL, src_region.length, PROT_READ | PROT_WRITE,
                              MAP_SHARED, dst_fd, 0);
        if (dst_addr == MAP_FAILED) {
            perror("mmap destination failed");
            close(dst_fd);
            destroy_mmap_region(&src_region);
            return -1;
        }

        printf("Copying %zu bytes using mmap\n", src_region.length);

        struct timeval start, end;
        gettimeofday(&start, NULL);

        memcpy(dst_addr, src_region.addr, src_region.length);

        /* Make sure the data reaches the disk; a failure here is a failed copy. */
        if (msync(dst_addr, src_region.length, MS_SYNC) < 0) {
            perror("msync failed");
            rc = -1;
        }

        gettimeofday(&end, NULL);
        printf("Copy completed in %.6f seconds\n", elapsed_seconds(&start, &end));

        if (munmap(dst_addr, src_region.length) < 0) {
            perror("munmap failed");
            rc = -1;
        }
    } else {
        printf("Source file is empty, created empty destination\n");
    }

    if (close(dst_fd) < 0) {
        perror("close destination failed");
        rc = -1;
    }
    destroy_mmap_region(&src_region);

    return rc;
}

/* Demonstrate madvise() hints: prefetch the file and declare sequential access. */
void mmap_prefetch_demo(mmap_region_t *region) {
    printf("Demonstrating madvise prefetch optimization\n");

    if (region->length == 0) {
        printf("Nothing to prefetch for an empty file\n");
        return;
    }

    /* Ask the kernel to read the whole file ahead. */
    if (madvise(region->addr, region->length, MADV_WILLNEED) < 0) {
        perror("madvise MADV_WILLNEED failed");
    } else {
        printf("Advised kernel to prefetch %zu bytes\n", region->length);
    }

    /* Announce a sequential access pattern. */
    if (madvise(region->addr, region->length, MADV_SEQUENTIAL) < 0) {
        perror("madvise MADV_SEQUENTIAL failed");
    } else {
        printf("Advised kernel for sequential access pattern\n");
    }
}

int main(int argc, char *argv[]) {
    if (argc < 3) {
        fprintf(stderr, "Usage: %s <command> <file> [pattern/destination]\n", argv[0]);
        fprintf(stderr, "Commands:\n");
        fprintf(stderr, "  search <file> <pattern>     - Search for pattern in file\n");
        fprintf(stderr, "  copy <src_file> <dst_file>  - Copy file using mmap\n");
        fprintf(stderr, "  prefetch <file>             - Demonstrate prefetch\n");
        return EXIT_FAILURE;
    }

    const char *command = argv[1];
    const char *filename = argv[2];

    if (strcmp(command, "search") == 0) {
        if (argc < 4) {
            fprintf(stderr, "Search command requires a pattern\n");
            return EXIT_FAILURE;
        }

        mmap_region_t region;
        if (create_mmap_region(&region, filename, PROT_READ, MAP_PRIVATE) < 0) {
            return EXIT_FAILURE;
        }

        mmap_search_pattern(&region, argv[3]);
        destroy_mmap_region(&region);
    } else if (strcmp(command, "copy") == 0) {
        if (argc < 4) {
            fprintf(stderr, "Copy command requires destination file\n");
            return EXIT_FAILURE;
        }

        if (mmap_copy_file(filename, argv[3]) < 0) {
            return EXIT_FAILURE;
        }
    } else if (strcmp(command, "prefetch") == 0) {
        mmap_region_t region;
        if (create_mmap_region(&region, filename, PROT_READ, MAP_PRIVATE) < 0) {
            return EXIT_FAILURE;
        }

        mmap_prefetch_demo(&region);
        destroy_mmap_region(&region);
    } else {
        fprintf(stderr, "Unknown command: %s\n", command);
        return EXIT_FAILURE;
    }

    return EXIT_SUCCESS;
}

编译和测试

编译脚本

#!/bin/bash
# Build all three zero-copy examples into ./build.
# Stop at the first failing command so the success message below is only
# printed when every compile actually succeeded.
set -euo pipefail

# Create the build directory
mkdir -p build

# Build the sendfile example
echo "Compiling sendfile server..."
gcc -Wall -Wextra -o build/sendfile_server sendfile_server.c -D_GNU_SOURCE

# Build the splice example
echo "Compiling splice transfer..."
gcc -Wall -Wextra -o build/splice_transfer splice_transfer.c -D_GNU_SOURCE

# Build the mmap example
echo "Compiling mmap processor..."
gcc -Wall -Wextra -o build/mmap_processor mmap_processor.c -D_GNU_SOURCE

echo "All examples compiled successfully!"
echo "Executables are in the build/ directory:"
echo "  - sendfile_server: Zero-copy file server"
echo "  - splice_transfer: Splice-based file transfer"
echo "  - mmap_processor: Memory-mapped file operations"

测试脚本

#!/bin/bash
# Smoke-test the splice and mmap examples and verify the copies byte for byte.
# Exits non-zero if any verification fails.
set -u

BUILD_DIR="build"
TEST_DIR="test_data"
failures=0

# Create the test directory
mkdir -p "$TEST_DIR"

# Create the test files
echo "Creating test files..."
dd if=/dev/urandom of="$TEST_DIR/test_file_1MB" bs=1M count=1 2>/dev/null
dd if=/dev/urandom of="$TEST_DIR/test_file_10MB" bs=1M count=10 2>/dev/null
echo "Hello World! This is a test pattern for zero-copy demonstration." > "$TEST_DIR/test_text.txt"

# Test the splice transfer
echo "Testing splice transfer..."
time "$BUILD_DIR/splice_transfer" "$TEST_DIR/test_file_1MB" "$TEST_DIR/splice_output_1MB"
echo "Splice test completed."

# Test the mmap search
echo "Testing mmap pattern search..."
"$BUILD_DIR/mmap_processor" search "$TEST_DIR/test_text.txt" "test"

# Test the mmap copy
echo "Testing mmap file copy..."
time "$BUILD_DIR/mmap_processor" copy "$TEST_DIR/test_file_1MB" "$TEST_DIR/mmap_copy_1MB"

# Test the mmap prefetch
echo "Testing mmap prefetch..."
"$BUILD_DIR/mmap_processor" prefetch "$TEST_DIR/test_file_1MB"

# Verify the copies
echo "Verifying copy results..."
if cmp -s "$TEST_DIR/test_file_1MB" "$TEST_DIR/splice_output_1MB"; then
    echo "✓ Splice copy verification passed"
else
    echo "✗ Splice copy verification failed"
    failures=$((failures + 1))
fi

if cmp -s "$TEST_DIR/test_file_1MB" "$TEST_DIR/mmap_copy_1MB"; then
    echo "✓ Mmap copy verification passed"
else
    echo "✗ Mmap copy verification failed"
    failures=$((failures + 1))
fi

echo "All tests completed!"
exit "$((failures > 0 ? 1 : 0))"

性能分析和调试工具

1. 系统调用跟踪

# Trace the zero-copy related system calls
strace -e trace=sendfile,splice,mmap,munmap ./sendfile_server

# Trace file-related system calls
strace -e trace=file ./splice_transfer input.txt output.txt

# Trace memory-related system calls
strace -e trace=memory ./mmap_processor search large_file.txt pattern

2. 性能分析

# Profile a zero-copy program with perf
perf record -g ./splice_transfer large_file.txt output.txt
perf report

# Analyse system call behaviour
perf trace ./sendfile_server

# Analyse the memory access pattern
perf record -e cache-misses,page-faults ./mmap_processor copy src.txt dst.txt

3. 内存映射调试

# Set PID to the id of the process being inspected, e.g. PID=$(pidof mmap_processor)

# Show the memory mappings of the process
cat "/proc/$PID/maps"

# Count page faults (mmap_processor needs a command and a file to do any work)
perf stat -e page-faults,minor-faults,major-faults ./mmap_processor search large_file.txt pattern

# Show the virtual memory statistics of the process
grep -E "Vm|Rss" "/proc/$PID/status"

4. 网络性能测试

# Benchmark the sendfile server
ab -n 1000 -c 10 http://localhost:8080/test_file.html

# Measure raw network throughput with iperf3
iperf3 -s &  # server side
iperf3 -c localhost -t 30  # client side, 30 second run

5. I/O性能监控

# Monitor disk I/O
iostat -x 1

# Inspect the file system cache
grep -E "Cached|Buffers|Dirty" /proc/meminfo

# Monitor per-process I/O activity with iotop
iotop -o -d 1

调试技巧和最佳实践

1. 错误处理模式

// Standard error-checking pattern for sendfile()
ssize_t result = sendfile(out_fd, in_fd, &offset, count);
if (result < 0) {
    switch (errno) {
    case EINTR:
        // Interrupted by a signal before any data was sent: just retry
        break;
    case EAGAIN:
#if EWOULDBLOCK != EAGAIN
    // The two values are identical on Linux; a second label would be a
    // duplicate-case compile error, so only emit it where they differ.
    case EWOULDBLOCK:
#endif
        // Non-blocking descriptor: wait for writability, then retry
        break;
    case EINVAL:
        // Invalid argument: check the descriptor types
        fprintf(stderr, "Invalid parameters for sendfile\n");
        break;
    case ENOSYS:
        // Not supported by the system: fall back to the classic loop
        fprintf(stderr, "sendfile not supported, falling back to read/write\n");
        break;
    default:
        perror("sendfile failed");
        break;
    }
}

2. 性能优化建议

| 优化方向 | 具体措施 | 性能提升 |
| --- | --- | --- |
| 缓冲区大小 | 使用64KB-1MB缓冲区 | 10-30% |
| 文件预读 | madvise(MADV_WILLNEED) | 20-40% |
| CPU亲和性 | 绑定特定CPU核心 | 5-15% |
| 内存对齐 | 使用页对齐的缓冲区 | 5-10% |
| 批量操作 | 合并多个小请求 | 30-50% |

3. 内核参数调优

# Run as root. Note that each run appends the lines again; check
# /etc/sysctl.conf for existing entries first.

# Raise the maximum socket buffer sizes
echo 'net.core.rmem_max = 16777216' >> /etc/sysctl.conf
echo 'net.core.wmem_max = 16777216' >> /etc/sysctl.conf

# Tune the TCP buffer sizes (min, default, max)
echo 'net.ipv4.tcp_rmem = 4096 65536 16777216' >> /etc/sysctl.conf
echo 'net.ipv4.tcp_wmem = 4096 65536 16777216' >> /etc/sysctl.conf

# Apply the settings
sysctl -p

4. 编译优化选项

# High-performance build options.
# -ffast-math relaxes IEEE floating-point semantics; it brings nothing to
# I/O-bound code and should only stay if the program's maths tolerates it.
gcc -O3 -march=native -mtune=native -flto \
    -ffast-math -funroll-loops \
    -D_GNU_SOURCE -o program source.c

总结

Linux零拷贝技术通过减少数据在用户空间和内核空间之间的拷贝,显著提升了系统性能。主要技术包括:

  1. sendfile(): 适用于文件到socket的传输,简单高效
  2. splice(): 提供更大的灵活性,可在文件描述符之间移动数据,但每次调用至少有一端必须是管道
  3. mmap(): 允许直接内存映射,适合大文件和随机访问
  4. copy_file_range(): 文件系统级别的优化拷贝
  5. DMA: 硬件级别的零拷贝,完全避免CPU参与

合理选择和组合这些技术,可以在不同场景下获得最佳性能表现。在实际应用中,还需要结合具体的硬件特性、文件系统类型和应用需求进行优化调整。


文章转载自:

http://43njLkex.jtxwq.cn
http://9FHI2s86.jtxwq.cn
http://QHoqWPqp.jtxwq.cn
http://GcTKx4OL.jtxwq.cn
http://W9Eqd5Jt.jtxwq.cn
http://H7osGcIV.jtxwq.cn
http://jR2ZHtcK.jtxwq.cn
http://KLesG4b4.jtxwq.cn
http://d3Hl1HdX.jtxwq.cn
http://JjdRiPJp.jtxwq.cn
http://TZ12arUD.jtxwq.cn
http://Jc1kA9xO.jtxwq.cn
http://elMQ5q4c.jtxwq.cn
http://teOckYxT.jtxwq.cn
http://oJI2s7Ic.jtxwq.cn
http://R0JD09eF.jtxwq.cn
http://55x3YRMw.jtxwq.cn
http://b5qGQawp.jtxwq.cn
http://2gofhSy9.jtxwq.cn
http://AW0mPahe.jtxwq.cn
http://1TTgiXtI.jtxwq.cn
http://Acy69RVc.jtxwq.cn
http://4KMv9fDn.jtxwq.cn
http://h1Uuiggk.jtxwq.cn
http://3M9igzWf.jtxwq.cn
http://Y82NICrX.jtxwq.cn
http://n3pupM2r.jtxwq.cn
http://WM9Bb0f1.jtxwq.cn
http://iBp1GtpM.jtxwq.cn
http://wmwuqm1w.jtxwq.cn
http://www.dtcms.com/a/368011.html

相关文章:

  • 【完整源码+数据集+部署教程】雪崩检测与分类图像分割系统源码和数据集:改进yolo11-HSFPN
  • 源雀SCRM开源:企微文件防泄密
  • 大模型赋能电子制造全生命周期质量管理的应用及实践
  • 三坐标测量机在汽车制造行业中的应用
  • 中小企业数字化转型卡在哪?选对AI工具+用好企业微信,人力成本直降70%
  • 通用虚拟示教器:让机器人教学像玩游戏一样简单
  • 记录下chatgpt的openai 开发过程
  • 从0开始学习Java+AI知识点总结-30.前端web开发(JS+Vue+Ajax)
  • mysql进阶语法(视图)
  • 从Java全栈到云原生:一场技术深度对话
  • React学习教程,从入门到精通, React 新创建组件语法知识点及案例代码(11)
  • 从零开始的python学习——字典
  • windows安装flash-attn记录
  • threeJS 实现开花的效果
  • 【数字孪生核心技术】数字孪生有哪些核心技术?
  • Leetcode—2749. 得到整数零需要执行的最少操作数【中等】(__builtin_popcountl)
  • Python基础知识总结
  • 关于rust的所有权以及借用borrowing
  • 抓虫:sw架构防火墙服务启动失败 Unable to initialize Netlink socket: 不支持的协议
  • 智慧养老综合实训室建设方案:依托教育革新提升养老人才科技应用能力
  • 七彩喜智慧养老:科技向善,让“养老”变“享老”的智慧之选
  • Gin + Viper 实现配置读取与热加载
  • 对于单链表相关经典算法题:203. 移除链表元素的解析
  • OpenLayers常用控件 -- 章节五:鹰眼地图控件教程
  • Swift 协议扩展与泛型:构建灵活、可维护的代码的艺术
  • python代码Bug排查
  • Xilinx系列FPGA实现DP1.4视频收发,支持4K60帧分辨率,提供2套工程源码和技术支持
  • HTML文本格式化标签
  • OpenCV C++ 进阶:图像直方图与几何变换全解析
  • Java全栈学习笔记30