MySQL Large-Table Batch Lookups: IN vs JOIN vs CTE Performance Experiment

Experiment Background

The table under test, tb_sku, contains 10 million rows.

This experiment compares the execution performance of three batch-query approaches at different IDsPerQuery (number of ids) scales, in order to determine the best query strategy.

The three methods are:
- IN query
- Temporary table + JOIN query
- CTE (Common Table Expression) query

In more detail:
1️⃣ IN query

SELECT * FROM tb_sku WHERE id IN (%s);

Characteristics:
- simple and direct;
- the SQL text grows linearly with the number of ids (a parameterized variant is sketched below).
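The benchmark (see the appendix) builds the IN list by concatenating literal ids into the SQL text. A common alternative is to send the ids as `?` placeholders; the helper below is a minimal sketch of that variant and is not part of the benchmark code.

```go
package bigtable

import (
    "database/sql"
    "fmt"
    "strings"
)

// queryWithINPlaceholders is an illustrative variant (not part of the benchmark):
// the ids travel as bind parameters instead of being concatenated into the SQL text.
func queryWithINPlaceholders(db *sql.DB, ids []int) (*sql.Rows, error) {
    if len(ids) == 0 {
        return nil, nil
    }
    placeholders := make([]string, len(ids))
    args := make([]interface{}, len(ids))
    for i, id := range ids {
        placeholders[i] = "?" // one placeholder per id
        args[i] = id
    }
    query := fmt.Sprintf("SELECT id, name FROM tb_sku WHERE id IN (%s)",
        strings.Join(placeholders, ","))
    return db.Query(query, args...)
}
```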
2️⃣ Temporary table + JOIN query

CREATE TEMPORARY TABLE temp_ids (id BIGINT PRIMARY KEY);
INSERT INTO temp_ids VALUES (...);
SELECT s.* FROM tb_sku s JOIN temp_ids t ON s.id = t.id;
DROP TEMPORARY TABLE temp_ids;

Characteristics:
- well suited to large batches;
- reduces the SQL parsing burden;
- the JOIN against the temporary table runs efficiently on the server (one database/sql caveat is sketched below).
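One caveat when reproducing this with Go's database/sql: a TEMPORARY table is visible only to the session that created it, while db.Exec / db.Query may run each statement on a different pooled connection. A minimal sketch of pinning a single connection for the whole sequence (the function name and example ids are illustrative, not part of the benchmark):

```go
package bigtable

import (
    "context"
    "database/sql"
)

// pinnedTempTableQuery keeps CREATE TEMPORARY TABLE, the INSERTs and the JOIN
// on one session by reserving a single connection from the pool.
func pinnedTempTableQuery(ctx context.Context, db *sql.DB) error {
    conn, err := db.Conn(ctx) // reserve one connection from the pool
    if err != nil {
        return err
    }
    defer conn.Close() // return the connection to the pool

    if _, err := conn.ExecContext(ctx,
        `CREATE TEMPORARY TABLE IF NOT EXISTS tmp_ids (id BIGINT PRIMARY KEY)`); err != nil {
        return err
    }
    // Illustrative insert; the benchmark batches many rows per statement.
    if _, err := conn.ExecContext(ctx,
        `INSERT INTO tmp_ids (id) VALUES (?), (?)`, 1, 2); err != nil {
        return err
    }
    rows, err := conn.QueryContext(ctx,
        `SELECT s.id, s.name FROM tb_sku s JOIN tmp_ids t ON s.id = t.id`)
    if err != nil {
        return err
    }
    defer rows.Close()
    // ... scan rows as usual ...
    return rows.Err()
}
```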
3️⃣ CTE query

WITH args AS (SELECT 1 AS id UNION ALL SELECT 2 UNION ALL SELECT 3 ...
)
SELECT * FROM tb_sku JOIN args USING (id);

Characteristics:
- no temporary table to create;
- the server can execute it directly in memory;
- best performance at small and medium batch sizes;
- but at large batch sizes it fails due to the thread-stack limit (a chunking workaround is sketched below):
❌ Error 1436 (HY000): Thread stack overrun
Use 'mysqld --thread_stack=#' to specify a bigger stack.
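If the stack limit is hit, one workaround besides raising thread_stack is to split the id list into chunks and issue one CTE query per chunk. A minimal sketch in the same package as the appendix code, reusing its queryWithCTE; the chunk size is an assumed tuning knob, not a measured value.

```go
package bigtable

import (
    "database/sql"
    "time"
)

// queryWithCTEChunked splits the id list into fixed-size chunks and runs one CTE
// query per chunk, keeping the UNION ALL depth below the thread-stack limit.
// It reuses queryWithCTE from the appendix; chunkSize (e.g. ~2000) is an assumption.
func queryWithCTEChunked(db *sql.DB, ids []int, chunkSize int) (int, time.Duration, error) {
    var total int
    var elapsed time.Duration
    for i := 0; i < len(ids); i += chunkSize {
        end := i + chunkSize
        if end > len(ids) {
            end = len(ids)
        }
        count, d, err := queryWithCTE(db, ids[i:end]) // defined in the appendix
        if err != nil {
            return total, elapsed, err
        }
        total += count
        elapsed += d
    }
    return total, elapsed, nil
}
```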
Experiment Results

1️⃣ Small scale (≤ 1000)
IDsPerQuery | IN (ms) | JOIN (ms) | CTE (ms) |
---|---|---|---|
860 | 99 | 4 | 6 |
900 | 100 | 4 | 6 |
950 | 109 | 3 | 7 |
1000 | 116 | 5 | 7 |
CTE ≈ JOIN ≪ IN
→ CTE and JOIN both finish in single-digit milliseconds; IN is an order of magnitude slower.
2️⃣ Medium scale (1k – 10k)
IDsPerQuery | IN (ms) | JOIN (ms) | CTE (ms) |
---|---|---|---|
1000 | 163 | 4 | 8 |
1100 | 149 | 3 | 8 |
2300 | 254 | 7 | 25 |
2400 | 273 | 9 | 25 |
2500 | 283 | 14 | 28 |
3300 | 374 | 20 | 46 |
3400 | 385 | 24 | 42 |
3500 | 394 | 19 | 43 |
4600 | 621 | 223 | 72 |
4700 | 540 | 244 | 70 |
4900 | 571 | 483 | 228 |
5000 | 578 | 569 | 234 |
6800 | 764 | 671 | 463 |
7000 | 775 | 687 | 601 |
10000 | 1148 | 1070 | ✗ (stack overrun) |
Conclusions:
- In the 1k – 10k range: as long as the stack is not overrun, CTE and JOIN are both clearly ahead of IN, with CTE pulling ahead of JOIN in the upper part of the range; as the batch grows further, all methods gradually converge toward IN-level times.
- The temporary-table JOIN shows unstable timings (large swings around 4k–5k ids).
- IN query performance degrades steadily as the number of ids grows.
- CTE errors out above roughly 5k–7k ids (thread stack overrun) and becomes unusable.
3️⃣ Very large scale (≥ 10k)
IDsPerQuery | IN (ms) | JOIN (ms) |
---|---|---|
1000 | 163 | 3 |
5000 | 611 | 579 |
10000 | 1148 | 1070 |
140000 | 7352 | 7931 |
200000 | 7027 | 8626 |
300000 | 7108 | 6812 |
360000 | 6901 | 5595 |
390000 | 6929 | 5605 |
Conclusions:
- Below 100k ids: JOIN performs better;
- Between roughly 100k and 300k: IN is slightly better;
- Above 300k: JOIN overtakes IN again;
- i.e. the JOIN advantage reappears at extremely large batch sizes.
Overall Conclusions

Data size | Recommended method | Notes |
---|---|---|
≤ 5000 | ✅ CTE | Runs in memory on the server, no temp-table overhead, excellent performance |
5000 – 100000 | ✅ Temporary table + JOIN | Avoids overly long SQL text, easier to maintain |
> 100000 | ✅ IN / JOIN (optimizer-dependent) | Depends on the database's optimization strategy |
Concurrent workloads | Concurrent CTE | The best option under concurrency; the gain is most visible with multi-threaded parallel queries |
In short:
🔹 Small batches: CTE = JOIN > IN
🔹 Medium batches: CTE = JOIN > IN
🔹 Very large batches: JOIN ≈ IN
🔹 Under concurrency: CTE's advantage is the most pronounced (especially while the query still fits within the thread stack); a concurrent-CTE sketch follows below.
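For the concurrent scenario in the table above, a minimal sketch of the idea: split the id list into chunks and run each chunk as its own CTE query in a separate goroutine. It reuses queryWithCTE from the appendix; the function name, the chunk size, and the unbounded goroutine count are simplifying assumptions, not part of the benchmark.

```go
package bigtable

import (
    "database/sql"
    "sync"
)

// concurrentCTE runs one CTE query per chunk of ids, each in its own goroutine.
// It reuses queryWithCTE from the appendix; chunkSize must be small enough to
// stay below the thread-stack limit (e.g. a few thousand ids).
func concurrentCTE(db *sql.DB, ids []int, chunkSize int) error {
    var wg sync.WaitGroup
    errCh := make(chan error, (len(ids)+chunkSize-1)/chunkSize) // one slot per chunk
    for i := 0; i < len(ids); i += chunkSize {
        end := i + chunkSize
        if end > len(ids) {
            end = len(ids)
        }
        chunk := ids[i:end]
        wg.Add(1)
        go func(chunk []int) {
            defer wg.Done()
            if _, _, err := queryWithCTE(db, chunk); err != nil {
                errCh <- err
            }
        }(chunk)
    }
    wg.Wait()
    close(errCh)
    for err := range errCh {
        return err // report the first error, if any
    }
    return nil
}
```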
Appendix
Experiment code:
```go
package bigtable

import (
    "database/sql"
    "fmt"
    "strings"
    "testing"
    "time"
)

// --- Helpers ---

// countDup returns how many values appear more than once, and which ones.
func countDup(nums []int) (int, []int) {
    freq := make(map[int]int)
    for _, v := range nums {
        freq[v]++
    }
    var duplicates []int
    for num, cnt := range freq {
        if cnt > 1 {
            duplicates = append(duplicates, num)
        }
    }
    return len(duplicates), duplicates
}

// distinct removes duplicate ids.
func distinct(nums []int) []int {
    m := make(map[int]struct{})
    for _, num := range nums {
        m[num] = struct{}{}
    }
    res := make([]int, 0, len(m))
    for k := range m {
        res = append(res, k)
    }
    return res
}

// --- SQL batch-query methods ---

// 1️⃣ IN query
func queryWithIN(db *sql.DB, ids []int) (int, time.Duration, error) {
    if len(ids) == 0 {
        return 0, 0, nil
    }
    inParams := make([]string, len(ids))
    for i, id := range ids {
        inParams[i] = fmt.Sprintf("%d", id)
    }
    query := fmt.Sprintf(`SELECT id, name FROM tb_sku WHERE id IN (%s)`, strings.Join(inParams, ","))
    start := time.Now()
    rows, err := db.Query(query)
    if err != nil {
        return 0, 0, err
    }
    defer rows.Close()
    var count int
    for rows.Next() {
        var id int
        var name string
        if err := rows.Scan(&id, &name); err != nil {
            return count, time.Since(start), err
        }
        count++
    }
    return count, time.Since(start), nil
}

// 2️⃣ Temporary table + JOIN query
func queryWithTempTable(db *sql.DB, ids []int, batchSize int) (int, time.Duration, error) {
    if len(ids) == 0 {
        return 0, 0, nil
    }
    // Drop any stale temporary table.
    _, _ = db.Exec(`DROP TEMPORARY TABLE IF EXISTS tmp_ids`)
    // Create the temporary table.
    _, err := db.Exec(`CREATE TEMPORARY TABLE IF NOT EXISTS tmp_ids (id BIGINT PRIMARY KEY)`)
    if err != nil {
        return 0, 0, err
    }
    // Insert the ids in batches.
    for i := 0; i < len(ids); i += batchSize {
        end := i + batchSize
        if end > len(ids) {
            end = len(ids)
        }
        batch := ids[i:end]
        placeholders := make([]string, len(batch))
        args := make([]interface{}, len(batch))
        for j, id := range batch {
            placeholders[j] = "(?)"
            args[j] = id
        }
        query := fmt.Sprintf("INSERT INTO tmp_ids (id) VALUES %s", strings.Join(placeholders, ","))
        if _, err := db.Exec(query, args...); err != nil {
            return 0, 0, err
        }
    }
    // JOIN query.
    start := time.Now()
    rows, err := db.Query(`SELECT s.id, s.name FROM tb_sku s JOIN tmp_ids t ON s.id = t.id`)
    if err != nil {
        return 0, 0, err
    }
    defer rows.Close()
    var count int
    for rows.Next() {
        var id int
        var name string
        if err := rows.Scan(&id, &name); err != nil {
            return count, time.Since(start), err
        }
        count++
    }
    return count, time.Since(start), nil
}

// 3️⃣ CTE (WITH … AS … UNION ALL) query
func queryWithCTE(db *sql.DB, ids []int) (int, time.Duration, error) {
    if len(ids) == 0 {
        return 0, 0, nil
    }
    // Build the CTE body: one SELECT per id, chained with UNION ALL.
    query := "WITH args AS ("
    for i, id := range ids {
        if i > 0 {
            query += " UNION ALL "
        }
        query += fmt.Sprintf("SELECT %d AS id", id)
    }
    query += ") SELECT s.id, s.name FROM tb_sku s JOIN args USING (id)"
    start := time.Now()
    rows, err := db.Query(query)
    if err != nil {
        return 0, 0, err
    }
    defer rows.Close()
    var count int
    for rows.Next() {
        var id int
        var name string
        if err := rows.Scan(&id, &name); err != nil {
            return count, time.Since(start), err
        }
        count++
    }
    return count, time.Since(start), nil
}

// --- Test ---
func TestBatchQueriesPerfMultiSize(t *testing.T) {
    db := openDB() // openDB and randomIDs are provided elsewhere in the package
    defer db.Close()

    maxID := 10000000

    // IDsPerQuery dimension; other ranges (e.g. 0–1000 step 10, or spot values
    // up to 390000) were swapped in here to produce the small/medium/large tables above.
    idSizes := []int{}
    for i := 1000; i <= 5000; i += 100 {
        idSizes = append(idSizes, i)
    }

    batchSize := 1000

    // Elapsed time per method, indexed by idSizes.
    inTimes := make([]float64, len(idSizes))
    tmpTableTimes := make([]float64, len(idSizes))
    cteTimes := make([]float64, len(idSizes))

    type result struct {
        Method      string
        IDsPerQuery int
        Count       int
        Elapsed     time.Duration
        Err         error
    }
    var results []result // per-run details; can be dumped as a table if needed

    fmt.Println("IDsPerQuery\tIN(ms)\tJOIN temp table(ms)\tCTE(ms)")
    for i, idsPerQuery := range idSizes {
        rawIDs := randomIDs(idsPerQuery, maxID)
        ids := distinct(rawIDs)

        // --- IN query ---
        countIn, durationIn, err := queryWithIN(db, ids)
        results = append(results, result{
            Method:      "IN",
            IDsPerQuery: idsPerQuery,
            Count:       countIn,
            Elapsed:     durationIn,
            Err:         err,
        })
        inTimes[i] = float64(durationIn.Milliseconds())

        // --- Temporary table + JOIN query ---
        countTmp, durationTmp, err := queryWithTempTable(db, ids, batchSize)
        results = append(results, result{
            Method:      "JOIN temp table",
            IDsPerQuery: idsPerQuery,
            Count:       countTmp,
            Elapsed:     durationTmp,
            Err:         err,
        })
        tmpTableTimes[i] = float64(durationTmp.Milliseconds())

        // --- CTE query ---
        _, durationCTE, err := queryWithCTE(db, ids)
        if err != nil {
            t.Errorf("CTE query failed: %v", err)
            continue
        }
        cteTimes[i] = float64(durationCTE.Milliseconds())

        fmt.Printf("%d\t\t%.2f\t\t%.2f\t\t%.2f\n", idsPerQuery, inTimes[i], tmpTableTimes[i], cteTimes[i])
    }

    t.Logf("idSizes: %v", idSizes)
    t.Logf("IN query times: %v", inTimes)
    t.Logf("Temp table + JOIN times: %v", tmpTableTimes)
}
```
Plotting tool: Excel