Skip to main content

Aggregating by bands of values can clarify data patterns. The banding trick is a simple transformation on the banding column.

-- ---------------------------------------------------------------------------
-- Aggregates from bands of values

-- from the Artful Common Queries page
-- http://www.artfulsoftware.com/infotree/qrytip.php?id=61
-- ---------------------------------------------------------------------------

-- Aggregating by bands of values can clarify data patterns. The banding
-- trick is a simple transformation on the banding column:

  -- <band width> * Floor( <banding column> / <band width> )

-- so to count and average scores in bands of 10, ie 0-9,10-19 and so on ...

  create table scores(score int);
  insert into scores values(5),(15),(25),(35);
  SELECT 10 * FLOOR( score / 10  ) AS  Bottom,
         10 * FLOOR( score / 10  ) + 9 AS Top,
         Count( score ),
         Avg( score )
  FROM scores
  GROUP BY 10  * FLOOR( score / 10  );

-- +--------+------+----------------+--------------+
-- | Bottom | Top  | Count( score ) | Avg( score ) |
-- +--------+------+----------------+--------------+
-- |      0 |    9 |              1 |       5.0000 |
-- |     10 |   19 |              1 |      15.0000 |
-- |     20 |   29 |              1 |      25.0000 |
-- |     30 |   39 |              1 |      35.0000 |
-- +--------+------+----------------+--------------+

-- Here's another example, from the world database, grouping UK cities in
-- bands of 100,000:

  select
    floor(population/100000)*100000 as 'Population Band',
    group_concat(name) as Cities
  from world.city
  where countrycode='GBR'
  group by  1
  order by  1 desc limit 5

-- +-----------------+--------------------------------------------------------+
-- | Population Band | Cities                                                 |
-- +-----------------+--------------------------------------------------------+
-- |         7200000 | London                                                 |
-- |         1000000 | Birmingham                                             |
-- |          600000 | Glasgow                                                |
-- |          400000 | Bristol,Leeds,Manchester,Sheffield,Edinburgh,Liverpool |
-- |          300000 | Cardiff,Coventry                                       |
-- +-----------------+--------------------------------------------------------+

-- Here is an example from a MySQL forum. You have a table of IDs and point
-- scores, and you wish to show the distribution of counts across ranges.
-- A traditional way of doing this is to build a table of ranges, then join
-- from ranges to scores:

  drop table if exists points,ranges;
  create table points (
    id int not null auto_increment primary key,
    points int not null
  );
  create table ranges (
    low int not null,
    high int not null
  );
  insert into points (points) values
    (456),(401),(543),(234),(303),(521),(478),(643),(575),(456),(432),
    (312),(564),(423),(411),(395),(543);
  insert into ranges (low, high) values
    (0,50),(51,100),(101,150),(151,200),(201,250),(251,300),(301,350),
    (351,400),(401,450),(451,500),(501,550),(551,600),(601,650),(651,700);

  select r.low, r.high, count(p.id)
  from ranges r
  left join points p on p.points between r.low and r.high
  group by r.low, r.high;

-- That's fine, but the ranges should be consistent, eg 0-49,50-99,100-149...,
-- and if you keep a simple utility table of ints 0..9, you never need to
-- build another range table:

  drop table if exists ints,ranges;
  create table ints(i tinyint);
  insert into ints values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);

  select ranges.low, ranges.high, count(p.id)
  from (
    select ( t.i*100 + u.i*10 + v.i ) as low,( t.i*100 + u.i*10 + v.i ) + 49 as high
    from ints t
    join ints u
    join ints v
    where (t.i*100 + u.i*10 + v.i) mod 50 = 0 and (t.i*100 + u.i*10 + v.i) < 701
  ) ranges
  left join points p on p.points between ranges.low and ranges.high
  group by ranges.low;

-- +-----+------+-------------+
-- | low | high | count(p.id) |
-- +-----+------+-------------+
-- |   0 |   50 |           0 |
-- |  51 |  100 |           0 |
-- | 101 |  150 |           0 |
-- | 151 |  200 |           0 |
-- | 201 |  250 |           1 |
-- | 251 |  300 |           0 |
-- | 301 |  350 |           2 |
-- | 351 |  400 |           1 |
-- | 401 |  450 |           4 |
-- | 451 |  500 |           3 |
-- | 501 |  550 |           3 |
-- | 551 |  600 |           2 |
-- | 601 |  650 |           1 |
-- | 651 |  700 |           0 |
-- +-----+------+-------------+

-- Google Visualisation tools don't include a histogram widget, but they do
-- have a line chart widget, so if you pass banding query results like the
-- above to the Google Visualisation Line Chart widget, eg with TheUsual, you
-- get a line chart representation of the histogram.