当前位置：首页 > news >正文

DPVS-4： dpvs.conf配置文件解读

news 2025/11/10 23:57:39

WHY

在进行单臂负载均衡测试和双臂负载均衡测试时，发现 dpvs.conf 中很多配置项的含义尚不理解。

DPVS 的配置文件条目比较多，这里参考了一些资料和代码片段来理解配置文件，希望通过对配置文件的理解能更好地了解 DPVS。

下面的内容可能会有错误，会在以后的研究中进行修正。

配置文件说明

接口配置

配置中有两个接口 dpdk0 和 dpdk1。这里并没有接口与具体网卡之间映射关系的配置，接口名是按照网卡 PCI 地址从小到大的顺序对应的。

比如 98:00.0 就对应dpdk0 , 98:00.1 对应dpdk1

! netif config
netif_defs {
    <init> pktpool_size     1048575   # MBUF池大小
    <init> pktpool_cache    256       # MBUF池对每个worker的缓存大小
    <init> fdir_mode        perfect   # flowdirect 精准匹配


    <init> device dpdk0 {
        rx {
            queue_number        8     # 队列数量为8 
            descriptor_number   1024  # 队列深度 1024个数据包
            rss                 all   # 全流量RSS
        }
        tx {
            queue_number        8
            descriptor_number   1024
        }
        ! mtu                   1500
        ! promisc_mode
        ! allmulticast
        kni_name                dpdk0.kni 
    }
    
    <init> device dpdk1 {
        rx {
            queue_number        8
            descriptor_number   1024
            rss                 all
        }
        tx {
            queue_number        8
            descriptor_number   1024
        }
        ! mtu                   1500
        ! promisc_mode
        ! allmulticast
        kni_name                dpdk1.kni
    }
}

worker配置

worker_defs {
    <init> worker cpu0 {   # 主worker， 控制面
        type    master
        cpu_id  0
    }

    <init> worker cpu1 {   # 从worker, 转发面
        type    slave
        cpu_id  1          # 绑定核心
        port    dpdk0 {     # 绑定网卡 tx/rx队列
            rx_queue_ids     0
            tx_queue_ids     0
            ! isol_rx_cpu_ids  9
            ! isol_rxq_ring_sz 1048576
        }
        port    dpdk1 {      # 绑定网卡 tx/rx队列
            rx_queue_ids     0
            tx_queue_ids     0
            ! isol_rx_cpu_ids  9
            ! isol_rxq_ring_sz 1048576
        }
    }
    
    <init> worker cpu2 {
        type    slave
        cpu_id  2
        port    dpdk0 {
            rx_queue_ids     1
            tx_queue_ids     1
            ! isol_rx_cpu_ids  10
            ! isol_rxq_ring_sz 1048576
        }
        port    dpdk1 {
            rx_queue_ids     1
            tx_queue_ids     1
            ! isol_rx_cpu_ids  10
            ! isol_rxq_ring_sz 1048576
        }
    }
    ...
    
    <init> worker   cpu8 {
        type        slave
        cpu_id      8
        ! icmp_redirect_core
        port        dpdk0 {
            rx_queue_ids     7
            tx_queue_ids     7
            ! isol_rx_cpu_ids  16
            ! isol_rxq_ring_sz 1048576
        }
        port        dpdk1 {
            rx_queue_ids     7
            tx_queue_ids     7
            ! isol_rx_cpu_ids  16
            ! isol_rxq_ring_sz 1048576
        }
    }

}

一共设置了 9 个 CPU 核心：1~8 是转发 worker（数据面），0 是 master worker（控制面）。

每一个转发 worker 都包含了 dpdk0、dpdk1 两个网卡的 tx/rx 队列，说明一个转发 worker 要收发两个网卡双向的数据包。

定时器间隔

timer_defs {
    schedule_interval    500  #定时器间隔 单位 us 
}

schedule_interval 最终赋值给 timer_sched_interval_us，单位确实是微秒（us），参考代码如下：

/*
 * Author's note (translated): schedule_interval is ultimately assigned to
 * timer_sched_interval_us, so its unit really is microseconds (us).
 *
 * Calls rte_timer_manage() for the calling lcore once the time elapsed since
 * that lcore's previous call exceeds timer_sched_interval_us.
 *
 * args: not used by this function.
 */
static void lcore_job_timer_manage(void *args)
{
    /* Timer-cycle timestamp of the last rte_timer_manage() call, indexed by lcore id. */
    static uint64_t tm_manager_time[DPVS_MAX_LCORE] = { 0 };
    uint64_t now = rte_get_timer_cycles();
    portid_t cid = rte_lcore_id();

    /*
     * (elapsed cycles) * 1000000 / g_cycles_per_sec is compared against the
     * interval in microseconds; presumably g_cycles_per_sec is the timer
     * frequency in cycles per second -- confirm against its definition.
     */
    if (unlikely((now - tm_manager_time[cid]) * 1000000 / g_cycles_per_sec
            > timer_sched_interval_us)) {
        rte_timer_manage();
        /* Restart the interval measurement for this lcore. */
        tm_manager_time[cid] = now;
    }
}

邻居

neigh_defs {
    <init> unres_queue_length  128  # 未解析队列的长度
    timeout                    60   # 邻居表超时时间
}

这里的邻居表超时时间（60s）似乎比 Linux 系统的长，Linux 是 30s。

ipset哈希表

ip集合的哈希池大小

ipset_defs {
    <init> ipset_hash_pool_size 131072
}

但是观察代码发现这个值并没有被使用，创建内存池时直接使用了宏 IPSET_HASH_POOL_SIZE_DEF 的固定值。

/* Element count passed to rte_mempool_create() in ipset_hash_init() below. */
#define IPSET_HASH_POOL_SIZE_DEF    262143
/*
 * Creates one mempool per NUMA node, named "ipset_hash_pool_<node>", and
 * stores it in ipset_hash_cache[node].
 *
 * Returns EDPVS_OK when every pool was created, or EDPVS_NOMEM as soon as
 * one rte_mempool_create() call fails.
 *
 * NOTE(review): on the failure path the pools created by earlier iterations
 * are not released in the code shown here.
 */
int 
ipset_hash_init(void)
{
    int i;
    char poolname[32];
    /*
     * Author's note (translated): when the pool is created, its size comes
     * directly from the IPSET_HASH_POOL_SIZE_DEF macro.
     */
    for (i = 0; i < get_numa_nodes(); i++) {
        snprintf(poolname, sizeof(poolname), "ipset_hash_pool_%d", i);
        /* The loop index i is also passed as the socket id argument. */
        ipset_hash_cache[i] = rte_mempool_create(poolname,
                            IPSET_HASH_POOL_SIZE_DEF,
                            sizeof(struct hash_entry) + HASH_ELEM_SIZE_MAX,
                            IPSET_HASH_CACHE_SIZE_DEF,
                            0, NULL, NULL, NULL, NULL, i, 0);
        if (!ipset_hash_cache[i]) {
            return EDPVS_NOMEM;
        }
    }
    return EDPVS_OK;
}

/*
 * Author's note (translated): ipset_hash_pool_size is not assigned to any
 * other variable.
 *
 * Config handler for the "ipset_hash_pool_size" item: parses the value from
 * tokens and stores the result in ipset_hash_pool_size.
 *
 * tokens: the parsed config line; its value string is obtained through
 *         set_value() and released with FREE_PTR() before returning.
 */
static void 
ipset_hash_pool_size_handler(vector_t tokens)
{
    char *str = set_value(tokens);
    int pool_size;

    assert(str);

    /* NOTE(review): atoi() gives no way to detect malformed input. */
    pool_size = atoi(str);

    if (pool_size < IPSET_HASH_POOL_SIZE_MIN) {
        /* Value below the allowed minimum: warn and use the default. */
        RTE_LOG(WARNING, IPSET, "invalid ipset_hash_pool_size %s, using default %d\n",
                str, IPSET_HASH_POOL_SIZE_DEF);
        ipset_hash_pool_size = IPSET_HASH_POOL_SIZE_DEF;
    } else {
        /*
         * Presumably is_power2() rounds pool_size to a power of two through
         * its out-parameter -- verify against its definition.
         * NOTE(review): the log prints pool_size, while the value stored is
         * pool_size - 1.
         */
        is_power2(pool_size, 1, &pool_size);
        RTE_LOG(INFO, IPSET, "ipset_hash_pool_size = %d (round to 2^n-1)\n", pool_size);
        ipset_hash_pool_size = pool_size - 1;
    }

    FREE_PTR(str);
}

IPv4配置

ipv4_defs {
    forwarding                 off   # 关闭转发
    <init> default_ttl         64    
    fragment {						# 处理分片的资源 
        <init> bucket_number   4096  # 桶数量
        <init> bucket_entries  16    # 桶内的条目数量
        <init> max_entries     4096  # 最大分片数量
        <init> ttl             1     
    }
}

这里尚不确定在forwarding为off时，这些分片设置是否还会起作用

IPv6 配置

ipv6_defs {
    disable                     off    # 开启ipv6
    forwarding                  off    # 关闭转发
    route6 {
        <init> method           hlist  # 哈希链表
        recycle_time            10     # 路由条目回收时间
    }
}

控制配置

ctrl_defs {
    lcore_msg {
        <init> ring_size                4096   # 消息队列长度
        sync_msg_timeout_us             20000  # 同步消息超时时间 us
        priority_level                  low    # 低优先级
    }
}

控制消息队列，是每个核心(worker)都有的，控制消息通过这些队列发送到目的worker

参考代码

    /*
     * per-lcore msg queue
     *
     * One ring named "msg_ring_<index>" is created for every index below
     * DPVS_MAX_LCORE, each with msg_ring_size entries, on the socket of the
     * calling lcore. RING_F_SC_DEQ requests single-consumer dequeue per the
     * DPDK ring API.
     */
    for (ii = 0; ii < DPVS_MAX_LCORE; ii++) {
        snprintf(ring_name, sizeof(ring_name), "msg_ring_%d", ii);
        msg_ring[ii] = rte_ring_create(ring_name, msg_ring_size,
                rte_socket_id(), RING_F_SC_DEQ);
        if (unlikely(NULL == msg_ring[ii])) {
            RTE_LOG(ERR, MSGMGR, "%s: fail to create msg ring\n", __func__);
            /* Roll back: destroy msg_pool and free the rings created so far. */
            dpvs_mempool_destroy(msg_pool);
            /* NOTE(review): this countdown relies on ii being a signed type;
             * its declaration is not visible in this excerpt. */
            for (--ii; ii >= 0; ii--)
                rte_ring_free(msg_ring[ii]);
            return EDPVS_DPDKAPIFAIL;
        }
    }

IPVS配置

连接池

    conn {
        <init> conn_pool_size       2097152   # 连接池大小
        <init> conn_pool_cache      256       # 每个核心缓存的连接池大小
        conn_init_timeout           3         # 连接初始化超时时间 s
        ! expire_quiescent_template
        ! fast_xmit_close
        ! <init> redirect           off
    }

UDP

    udp {
        ! defence_udp_drop
        uoa_mode        opp         # UOA 模式：opp 使用私有的 option protocol，ipo 使用 IP 头的 option 字段
        uoa_max_trail   3           # uoa尝试次数
        timeout {
            oneway      60          # 单向超时
            normal      300			# 正常连接超时
            last        3			# 最后超时
        }
    }

UOA（UDP Option of Address）用于把客户端的源 IP 地址传递给后端：ipo 模式是将源 IP 地址写入 IP 头里的 option 字段中，opp 模式则通过私有的 option protocol 来携带。

TCP

tcp {
    timeout {            # tcp连接各种状态下的超时时间
        none        2    
        established 90   # 已建立连接的超时
        syn_sent    3    # 发出syn以后的超时
        syn_recv    30   # 收到syn以后的超时
        fin_wait    7    # FIN发送之后的超时
        time_wait   7	 # 发送FIN，接收到FIN之后的超时
        close       3    
        close_wait  7
        last_ack    7
        listen      120  # 监听连接超时
        synack      30
        last        2
    }
    synproxy {          # TCP SYN proxy
        synack_options {      # syn ACK修改TCP头中的字段
            mss             1452  # max segment size ,  最大段大小
            ttl             63    # time-to-live 最大转发跳数
            sack				 # 选择性确认
        }
        close_client_window
        rs_syn_max_retry    3
        ack_storm_thresh    10
        max_ack_saved       3
        conn_reuse_state {      # 连接重用状态
            close
            time_wait
        }
    }
}