Skip to main content

How would we write a MySQL query that returns the average of the top 50% of scores per team?

-- ---------------------------------------------------------------------------
-- Average the top 50% values per group

-- from the Artful Common Queries page
-- http://www.artfulsoftware.com/infotree/qrytip.php?id=532
-- ---------------------------------------------------------------------------

-- Each row of a games table records one game score for a team:

DROP TABLE
IF EXISTS games;

CREATE TABLE games (id INT, teamID INT, score INT);

INSERT INTO games
VALUES
  (1, 1, 3),
  (2, 1, 4),
  (3, 1, 5),
  (4, 1, 6),
  (5, 2, 6),
  (6, 2, 7),
  (7, 2, 8),
  (8, 2, 7),
  (9, 2, 6),
  (10, 2, 7);

-- How would we write a query that returns the average of the top 50% of
-- scores per team?

-- The per-team median value is its middle value, lower than the highest 50%
-- and higher than the lowest 50% of values for that team -- so a shortcut
-- is to query the team medians, then aggregate on a join that selects
-- per-team scores above the medians.

-- How to find per-team medians? If a resultset has an odd number of rows,
-- at least one row has the true median score. If it has an even number of
-- rows, the median score is an average of two central values.
-- The following query adapts Joe Celko's formula in "SQL for Smarties"
-- averaging "low" and "high" medians:

DROP TABLE IF EXISTS medians;
CREATE TABLE medians
SELECT p1.teamid, AVG(P1.score) AS median
FROM games AS P1, games AS P2
WHERE p1.teamid=p2.teamid
GROUP BY p1.teamid
HAVING (
  SUM(CASE WHEN P2.score <= P1.score THEN 1 ELSE 0 END) >= ((COUNT(*) + 1) / 2)
  AND
  SUM(CASE WHEN P2.score >= P1.score THEN 1 ELSE 0 END) >= (COUNT(*)/2 + 1)
)
OR (
  SUM(CASE WHEN P2.score >= P1.score THEN 1 ELSE 0 END) >= ((COUNT(*) + 1) / 2)
  AND
  SUM(CASE WHEN P2.score <= P1.score THEN 1 ELSE 0 END) >= (COUNT(*)/2 + 1)
);
-- +--------+--------+
-- | teamid | median |
-- +--------+--------+
-- |      1 | 4.5000 |
-- |      2 | 6.8333 |
-- +--------+--------+

-- Now join games to medians accepting only top-half values:

SELECT g.teamid, AVG(g.score) AS Top50Avg
FROM games g
JOIN medians m ON g.teamid = m.teamid AND g.score >= m.median
GROUP BY g.teamid
ORDER BY Top50Avg DESC;
-- +--------+----------+
-- | teamid | Top50Avg |
-- +--------+----------+
-- |      2 |   7.2500 |
-- |      1 |   5.5000 |
-- +--------+----------+
DROP TABLE medians;

-- Yes, all the logic can be moved into one query:

SELECT g.teamid, AVG(g.score) AS Top50Avg
FROM games g
JOIN (
  SELECT p1.teamid, AVG(P1.score) AS median
  FROM games AS P1, games AS P2
  WHERE p1.teamid=p2.teamid
  GROUP BY p1.teamid
  HAVING (
    SUM(CASE WHEN P2.score <= P1.score THEN 1 ELSE 0 END) >= ((COUNT(*) + 1) / 2)
    AND
    SUM(CASE WHEN P2.score >= P1.score THEN 1 ELSE 0 END) >= (COUNT(*)/2 + 1)
  )
  OR (
    SUM(CASE WHEN P2.score >= P1.score THEN 1 ELSE 0 END) >= ((COUNT(*) + 1) / 2)
    AND
    SUM(CASE WHEN P2.score <= P1.score THEN 1 ELSE 0 END) >= (COUNT(*)/2 + 1)
  )
) AS m ON g.teamid = m.teamid AND g.score >= m.median
GROUP BY g.teamid
ORDER BY Top50Avg DESC;