板凳-------Mysql cookbook学习 (十一--------9)
13.2 分组描述统计
mysql> select age, count(score) as n,
    -> sum(score) as sum,
    -> min(score) as minimum,
    -> max(score) as maximum,
    -> avg(score) as mean,
    -> stddev_samp(score) as 'std. dev.',
    -> var_samp(score) as 'variance'
    -> from testscore
    -> group by age;
+-----+---+------+---------+---------+--------+--------------------+--------------------+
| age | n | sum | minimum | maximum | mean | std. dev. | variance |
+-----+---+------+---------+---------+--------+--------------------+--------------------+
| 5 | 4 | 22 | 4 | 7 | 5.5000 | 1.2909944487358056 | 1.6666666666666667 |
| 6 | 4 | 27 | 4 | 9 | 6.7500 | 2.217355782608345 | 4.916666666666667 |
| 7 | 4 | 30 | 6 | 9 | 7.5000 | 1.2909944487358056 | 1.6666666666666667 |
| 8 | 4 | 32 | 6 | 10 | 8.0000 | 1.8257418583505538 | 3.3333333333333335 |
| 9 | 4 | 35 | 7 | 10 | 8.7500 | 1.2583057392117918 | 1.583333333333334 |
+-----+---+------+---------+---------+--------+--------------------+--------------------+
5 rows in set (0.03 sec)

mysql> select sex, count(score) as n,
    -> sum(score) as sum,
    -> min(score) as minimum,
    -> max(score) as maximum,
    -> avg(score) as mean,
    -> stddev_samp(score) as 'std. dev.',
    -> var_samp(score) as 'variance'
    -> from testscore
    -> group by sex;
+-----+----+------+---------+---------+--------+--------------------+--------------------+
| sex | n | sum | minimum | maximum | mean | std. dev. | variance |
+-----+----+------+---------+---------+--------+--------------------+--------------------+
| M | 10 | 71 | 4 | 9 | 7.1000 | 1.7919573407620817 | 3.2111111111111112 |
| F | 10 | 75 | 4 | 10 | 7.5000 | 1.9578900207451218 | 3.8333333333333335 |
+-----+----+------+---------+---------+--------+--------------------+--------------------+
2 rows in set (0.00 sec)

mysql> select age, sex, count(score) as n,
    -> sum(score) as sum,
    -> min(score) as minimum,
    -> max(score) as maximum,
    -> avg(score) as mean,
    -> stddev_samp(score) as 'std. dev.',
    -> var_samp(score) as 'variance'
    -> from testscore
    -> group by age, sex;
+-----+-----+---+------+---------+---------+--------+--------------------+----------+
| age | sex | n | sum | minimum | maximum | mean | std. dev. | variance |
+-----+-----+---+------+---------+---------+--------+--------------------+----------+
| 5 | M | 2 | 9 | 4 | 5 | 4.5000 | 0.7071067811865476 | 0.5 |
| 5 | F | 2 | 13 | 6 | 7 | 6.5000 | 0.7071067811865476 | 0.5 |
| 6 | M | 2 | 17 | 8 | 9 | 8.5000 | 0.7071067811865476 | 0.5 |
| 6 | F | 2 | 10 | 4 | 6 | 5.0000 | 1.4142135623730951 | 2 |
| 7 | M | 2 | 14 | 6 | 8 | 7.0000 | 1.4142135623730951 | 2 |
| 7 | F | 2 | 16 | 7 | 9 | 8.0000 | 1.4142135623730951 | 2 |
| 8 | M | 2 | 15 | 6 | 9 | 7.5000 | 2.1213203435596424 | 4.5 |
| 8 | F | 2 | 17 | 7 | 10 | 8.5000 | 2.1213203435596424 | 4.5 |
| 9 | M | 2 | 16 | 7 | 9 | 8.0000 | 1.4142135623730951 | 2 |
| 9 | F | 2 | 19 | 9 | 10 | 9.5000 | 0.7071067811865476 | 0.5 |
+-----+-----+---+------+---------+---------+--------+--------------------+----------+
10 rows in set (0.00 sec)
13.3 产生频率分布
mysql> select score, count(score) as occurence
    -> from testscore group by score;
+-------+-----------+
| score | occurence |
+-------+-----------+
| 5 | 1 |
| 4 | 2 |
| 6 | 4 |
| 7 | 4 |
| 8 | 2 |
| 9 | 5 |
| 10 | 2 |
+-------+-----------+
7 rows in set (0.00 sec)

mysql> select @n := count(score) from testscore;
+--------------------+
| @n := count(score) |
+--------------------+
| 20 |
+--------------------+
1 row in set, 1 warning (0.01 sec)

mysql> select score, (count(score) * 100)/@n as percent
    -> from testscore group by score;
+-------+---------+
| score | percent |
+-------+---------+
| 5 | 5.0000 |
| 4 | 10.0000 |
| 6 | 20.0000 |
| 7 | 20.0000 |
| 8 | 10.0000 |
| 9 | 25.0000 |
| 10 | 10.0000 |
+-------+---------+
7 rows in set (0.00 sec)

mysql> select score, repeat('*', count(score)) as occurrences
    -> from testscore group by score;
+-------+-------------+
| score | occurrences |
+-------+-------------+
| 5 | * |
| 4 | ** |
| 6 | **** |
| 7 | **** |
| 8 | ** |
| 9 | ***** |
| 10 | ** |
+-------+-------------+
7 rows in set (0.00 sec)

mysql> select @n := count(score) from testscore;
+--------------------+
| @n := count(score) |
+--------------------+
| 20 |
+--------------------+
1 row in set, 1 warning (0.00 sec)

mysql> select score, repeat('*', (count(score)*100)/@n) as percent
    -> from testscore group by score;
+-------+---------------------------+
| score | percent |
+-------+---------------------------+
| 5 | ***** |
| 4 | ********** |
| 6 | ******************** |
| 7 | ******************** |
| 8 | ********** |
| 9 | ************************* |
| 10 | ********** |
+-------+---------------------------+
7 rows in set (0.00 sec)

mysql> drop table if exists ref;
Query OK, 0 rows affected (0.03 sec)

mysql> create table ref(score int);
Query OK, 0 rows affected (0.04 sec)

mysql> insert into ref(score)
    -> values(0), (1), (2), (3), (4), (5), (6), (7), (8), (9), (10);
Query OK, 11 rows affected (0.02 sec)
Records: 11 Duplicates: 0 Warnings: 0

mysql> select ref.score, count(testscore.score) as occurences
    -> from ref left join testscore on ref.score = testscore.score
    -> group by ref.score;
+-------+------------+
| score | occurences |
+-------+------------+
| 0 | 0 |
| 1 | 0 |
| 2 | 0 |
| 3 | 0 |
| 4 | 2 |
| 5 | 1 |
| 6 | 4 |
| 7 | 4 |
| 8 | 2 |
| 9 | 5 |
| 10 | 2 |
+-------+------------+
11 rows in set (0.00 sec)

mysql> select ref.score, (count(testscore.score)*100)/@n as percent
    -> from ref left join testscore on ref.score = testscore.score
    -> group by ref.score;
+-------+---------+
| score | percent |
+-------+---------+
| 0 | 0.0000 |
| 1 | 0.0000 |
| 2 | 0.0000 |
| 3 | 0.0000 |
| 4 | 10.0000 |
| 5 | 5.0000 |
| 6 | 20.0000 |
| 7 | 20.0000 |
| 8 | 10.0000 |
| 9 | 25.0000 |
| 10 | 10.0000 |
+-------+---------+
11 rows in set (0.00 sec)
13.4 计数缺失值
mysql> -- 创建表
mysql> CREATE TABLE subject_scores (
    -> subject INT,
    -> score INT NULL
    -> );
Query OK, 0 rows affected (0.06 sec)

mysql>
mysql> -- 插入示例数据
mysql> INSERT INTO subject_scores VALUES
    -> (1, 38), (2, NULL), (3, 47),
    -> (4, 82), (5, NULL), (6, 65),
    -> (7, 90), (8, 73), (9, NULL),
    -> (10, 55), (11, 68), (12, 79);
Query OK, 12 rows affected (0.01 sec)
Records: 12 Duplicates: 0 Warnings: 0

mysql>
mysql> -- 查询
mysql> SELECT * FROM subject_scores ORDER BY subject;
+---------+-------+
| subject | score |
+---------+-------+
| 1 | 38 |
| 2 | NULL |
| 3 | 47 |
| 4 | 82 |
| 5 | NULL |
| 6 | 65 |
| 7 | 90 |
| 8 | 73 |
| 9 | NULL |
| 10 | 55 |
| 11 | 68 |
| 12 | 79 |
+---------+-------+
12 rows in set (0.00 sec)

mysql> SELECT
    -> COUNT(*) AS 'n (total)',
    -> COUNT(score) AS 'n (nonmissing)',
    -> COUNT(*) - COUNT(score) AS 'n (missing)',
    -> ((COUNT(*) - COUNT(score)) * 100) / COUNT(*) AS '% missing'
    -> FROM subject_scores;
+-----------+----------------+-------------+-----------+
| n (total) | n (nonmissing) | n (missing) | % missing |
+-----------+----------------+-------------+-----------+
| 12 | 9 | 3 | 25.0000 |
+-----------+----------------+-------------+-----------+
1 row in set (0.00 sec)

mysql> select count(*) as 'n (total)',
    -> count(score) as 'n (nonmissing)',
    -> sum(isnull(score)) as 'n (missing)',
    -> (sum(isnull(score))* 100) / count(*) as '% missing'
    -> from subject_scores;
+-----------+----------------+-------------+-----------+
| n (total) | n (nonmissing) | n (missing) | % missing |
+-----------+----------------+-------------+-----------+
| 12 | 9 | 3 | 25.0000 |
+-----------+----------------+-------------+-----------+
1 row in set (0.00 sec)

如果您确实需要按某些条件分组:
需要先确定分组依据。例如,如果:
• 科目1-6是A组
• 科目7-12是B组
可以这样写:
sql
mysql> SELECT
    -> CASE WHEN subject BETWEEN 1 AND 6 THEN 'A' ELSE 'B' END AS group_name,
    -> COUNT(*) AS 'n (total)',
    -> COUNT(score) AS 'n (nonmissing)',
    -> COUNT(*) - COUNT(score) AS 'n (missing)',
    -> ((COUNT(*) - COUNT(score)) * 100) / COUNT(*) AS '% missing'
    -> FROM subject_scores
    -> GROUP BY group_name;
+------------+-----------+----------------+-------------+-----------+
| group_name | n (total) | n (nonmissing) | n (missing) | % missing |
+------------+-----------+----------------+-------------+-----------+
| A | 6 | 4 | 2 | 33.3333 |
| B | 6 | 5 | 1 | 16.6667 |
+------------+-----------+----------------+-------------+-----------+
2 rows in set (0.00 sec)
13.5 计算线性回归和相关系数
mysql> select age, score from testscore;
+-----+-------+
| age | score |
+-----+-------+
| 5 | 5 |
| 5 | 4 |
| 5 | 6 |
| 5 | 7 |
| 6 | 8 |
| 6 | 9 |
| 6 | 4 |
| 6 | 6 |
| 7 | 8 |
| 7 | 6 |
| 7 | 9 |
| 7 | 7 |
| 8 | 9 |
| 8 | 6 |
| 8 | 7 |
| 8 | 10 |
| 9 | 9 |
| 9 | 7 |
| 9 | 10 |
| 9 | 9 |
+-----+-------+
20 rows in set (0.00 sec)

mysql> SELECT
    -> @n := COUNT(score) AS n,
    -> @meanx := AVG(age) AS 'x mean',
    -> @sumx := SUM(age) AS 'x sum',
    -> @sumxx := SUM(age * age) AS 'x sum of squares',
    -> @meany := AVG(score) AS 'y mean',
    -> @sumy := SUM(score) AS 'y sum',
    -> @sumyy := SUM(score * score) AS 'y sum of squares',
    -> @sumxy := SUM(age * score) AS 'x*y sum'
    -> FROM testscore\G
*************************** 1. row ***************************
               n: 20
          x mean: 7.0000
           x sum: 140
x sum of squares: 1020
          y mean: 7.3000
           y sum: 146
y sum of squares: 1130
         x*y sum: 1053
1 row in set, 8 warnings (0.00 sec)

mysql> select
    -> @b := (@n * @sumxy - @sumx * @sumy)/ (@n * @sumxx - @sumx * @sumx)
    -> as slope;
+-------------+
| slope |
+-------------+
| 0.775000000 |
+-------------+
1 row in set, 1 warning (0.00 sec)

mysql> select @a := (@meany - @b * @meanx) as intercept;
+----------------------+
| intercept |
+----------------------+
| 1.875000000000000000 |
+----------------------+
1 row in set, 1 warning (0.00 sec)

mysql> select concat('y =', @b, 'x + ', @a) as 'least-squares regression';
+----------------------------------------+
| least-squares regression |
+----------------------------------------+
| y =0.775000000x + 1.875000000000000000 |
+----------------------------------------+
1 row in set (0.00 sec)

mysql> select
    -> (@n * @sumxy - @sumx * @sumy)
    -> /sqrt((@n * @sumxx - @sumx * @sumx) * (@n * @sumyy - @sumy * @sumy ))
    -> as correlation;
+--------------------+
| correlation |
+--------------------+
| 0.6117362044219903 |
+--------------------+
1 row in set (0.00 sec)