Add GROUP BY documentation

arnaud-lacurie · arnaud-lacurie · commit 7166cdbfff4c · 2025-11-17T17:31:56.000Z
diff --git a/docs/sphinx/source/reference/sql_commands/DQL/GROUP_BY.diagram b/docs/sphinx/source/reference/sql_commands/DQL/GROUP_BY.diagram
@@ -0,0 +1,23 @@
+Diagram(
+    Terminal('GROUP'),
+    Terminal('BY'),
+    NonTerminal('expression'),
+    Optional(
+        Sequence(
+            Terminal('AS'),
+            NonTerminal('alias')
+        )
+    ),
+    OneOrMore(
+        Sequence(
+            Terminal(','),
+            NonTerminal('expression'),
+            Optional(
+                Sequence(
+                    Terminal('AS'),
+                    NonTerminal('alias')
+                )
+            )
+        )
+    ),
+)
diff --git a/docs/sphinx/source/reference/sql_commands/DQL/GROUP_BY.rst b/docs/sphinx/source/reference/sql_commands/DQL/GROUP_BY.rst
@@ -0,0 +1,279 @@
+========
+GROUP BY
+========
+
+.. _group_by:
+
+Groups rows that have the same values for the specified expressions, producing one result row per group. It is typically used together with aggregate functions.
+
+Syntax
+======
+
+.. raw:: html
+    :file: GROUP_BY.diagram.svg
+
+The GROUP BY clause is used in SELECT statements:
+
+.. code-block:: sql
+
+    SELECT column1, aggregate_function(column2)
+    FROM table_name
+    GROUP BY column1
+
+Parameters
+==========
+
+``GROUP BY expression [AS alias], ...``
+    Groups rows based on the values of one or more expressions. Each unique combination of expression values creates a separate group.
+
+``expression``
+    Can be:
+
+    - Column names
+    - Nested field references (e.g., ``struct_column.field``)
+    - Expressions or calculations
+
+``alias`` (optional)
+    An optional alias for the grouping expression.
+
+Returns
+=======
+
+Returns one row per unique combination of grouped values. When used with aggregate functions, computes aggregate values for each group.
+
+Examples
+========
+
+Setup
+-----
+
+For these examples, assume we have an ``employees`` table:
+
+.. code-block:: sql
+
+    CREATE TABLE employees(
+        id BIGINT,
+        department STRING,
+        role STRING,
+        salary BIGINT,
+        PRIMARY KEY(id))
+
+    CREATE INDEX dept_idx AS SELECT department FROM employees ORDER BY department
+    CREATE INDEX role_idx AS SELECT role FROM employees ORDER BY role
+
+    INSERT INTO employees VALUES
+        (1, 'Engineering', 'Developer', 100000),
+        (2, 'Engineering', 'Developer', 110000),
+        (3, 'Engineering', 'Manager', 150000),
+        (4, 'Sales', 'Representative', 80000),
+        (5, 'Sales', 'Manager', 120000)
+
+GROUP BY Single Column
+-----------------------
+
+Count employees by department:
+
+.. code-block:: sql
+
+    SELECT department, COUNT(*) AS employee_count
+    FROM employees
+    GROUP BY department
+
+.. list-table::
+    :header-rows: 1
+
+    * - :sql:`department`
+      - :sql:`employee_count`
+    * - :json:`"Engineering"`
+      - :json:`3`
+    * - :json:`"Sales"`
+      - :json:`2`
+
+GROUP BY Multiple Columns
+--------------------------
+
+Count employees by department and role:
+
+.. code-block:: sql
+
+    SELECT department, role, COUNT(*) AS employee_count
+    FROM employees
+    GROUP BY department, role
+
+.. list-table::
+    :header-rows: 1
+
+    * - :sql:`department`
+      - :sql:`role`
+      - :sql:`employee_count`
+    * - :json:`"Engineering"`
+      - :json:`"Developer"`
+      - :json:`2`
+    * - :json:`"Engineering"`
+      - :json:`"Manager"`
+      - :json:`1`
+    * - :json:`"Sales"`
+      - :json:`"Representative"`
+      - :json:`1`
+    * - :json:`"Sales"`
+      - :json:`"Manager"`
+      - :json:`1`
+
+GROUP BY with Aggregate Functions
+-----------------------------------
+
+Calculate average salary by department:
+
+.. code-block:: sql
+
+    SELECT department, AVG(salary) AS avg_salary
+    FROM employees
+    GROUP BY department
+
+.. list-table::
+    :header-rows: 1
+
+    * - :sql:`department`
+      - :sql:`avg_salary`
+    * - :json:`"Engineering"`
+      - :json:`120000.0`
+    * - :json:`"Sales"`
+      - :json:`100000.0`
+
+Calculate multiple aggregates:
+
+.. code-block:: sql
+
+    SELECT department,
+           COUNT(*) AS employee_count,
+           MIN(salary) AS min_salary,
+           MAX(salary) AS max_salary,
+           AVG(salary) AS avg_salary
+    FROM employees
+    GROUP BY department
+
+.. list-table::
+    :header-rows: 1
+
+    * - :sql:`department`
+      - :sql:`employee_count`
+      - :sql:`min_salary`
+      - :sql:`max_salary`
+      - :sql:`avg_salary`
+    * - :json:`"Engineering"`
+      - :json:`3`
+      - :json:`100000`
+      - :json:`150000`
+      - :json:`120000.0`
+    * - :json:`"Sales"`
+      - :json:`2`
+      - :json:`80000`
+      - :json:`120000`
+      - :json:`100000.0`
+
+GROUP BY with HAVING Clause
+-----------------------------
+
+Filter groups using HAVING:
+
+.. code-block:: sql
+
+    SELECT department, AVG(salary) AS avg_salary
+    FROM employees
+    GROUP BY department
+    HAVING AVG(salary) > 110000
+
+.. list-table::
+    :header-rows: 1
+
+    * - :sql:`department`
+      - :sql:`avg_salary`
+    * - :json:`"Engineering"`
+      - :json:`120000.0`
+
+The HAVING clause filters groups after aggregation, unlike WHERE, which filters rows before grouping.
+
+GROUP BY with Column Aliases
+------------------------------
+
+Use aliases for grouped columns:
+
+.. code-block:: sql
+
+    SELECT department AS dept, COUNT(*) AS total
+    FROM employees
+    GROUP BY department AS dept
+
+.. list-table::
+    :header-rows: 1
+
+    * - :sql:`dept`
+      - :sql:`total`
+    * - :json:`"Engineering"`
+      - :json:`3`
+    * - :json:`"Sales"`
+      - :json:`2`
+
+Important Notes
+===============
+
+Index Requirement
+-----------------
+
+**GROUP BY operations require an appropriate index.** The query planner needs an index on the grouped column(s) to execute the query; this is a requirement, not just a performance optimization. Without a suitable index, the query will fail with an "unable to plan" error.
+
+Example index creation:
+
+.. code-block:: sql
+
+    CREATE INDEX dept_idx AS SELECT department FROM employees ORDER BY department
+
+See :ref:`Indexes <index_definition>` for details on creating indexes that support GROUP BY operations.
+
+Column Selection Rules
+----------------------
+
+* The SELECT list can contain only columns that appear in the GROUP BY clause and aggregate function calls
+* Selecting non-grouped, non-aggregated columns will result in error 42803
+
+**Invalid example**:
+
+.. code-block:: sql
+
+    -- ERROR: id is neither grouped nor aggregated
+    SELECT id, department, COUNT(*)
+    FROM employees
+    GROUP BY department
+
+**Valid example**:
+
+.. code-block:: sql
+
+    -- OK: all non-aggregated columns are grouped
+    SELECT department, role, COUNT(*)
+    FROM employees
+    GROUP BY department, role
+
+Nested Fields
+-------------
+
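+GROUP BY supports grouping on nested struct fields. For example, given a ``people`` table (not part of the setup above) with a struct column ``address`` that has a ``city`` field: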
+
+.. code-block:: sql
+
+    SELECT address.city, COUNT(*) AS resident_count
+    FROM people
+    GROUP BY address.city
+
+Execution Model
+---------------
+
+The Relational Layer does not perform in-memory grouping. All GROUP BY operations must be backed by an appropriate index. This is a fundamental architectural constraint that ensures queries can execute efficiently over large datasets.
+
+See Also
+========
+
+* :ref:`Aggregate Functions <aggregate_functions>` - Functions used with GROUP BY
+* :ref:`Indexes <index_definition>` - Creating indexes for GROUP BY
+* :ref:`HAVING Clause <having>` - Filtering grouped results
+* :ref:`SELECT Statement <select>` - Full SELECT syntax
diff --git a/yaml-tests/src/test/java/DocumentationQueriesTests.java b/yaml-tests/src/test/java/DocumentationQueriesTests.java
@@ -53,4 +53,9 @@ void aggregateFunctionsDocumentationQueriesTests(YamlTest.Runner runner) throws
     void scalarFunctionsDocumentationQueriesTests(YamlTest.Runner runner) throws Exception {
         runner.runYamsql(PREFIX + "/scalar-functions-documentation-queries.yamsql");
     }
+
+    @TestTemplate
+    void groupByDocumentationQueriesTests(YamlTest.Runner runner) throws Exception {
+        runner.runYamsql(PREFIX + "/group-by-documentation-queries.yamsql"); // yamsql mirrors the example queries in GROUP_BY.rst
+    }
 }
diff --git a/yaml-tests/src/test/resources/documentation-queries/group-by-documentation-queries.yamsql b/yaml-tests/src/test/resources/documentation-queries/group-by-documentation-queries.yamsql
@@ -0,0 +1,81 @@
+---
+options:
+  supported_version: 4.3.2.0
+---
+schema_template:
+    create table employees(id bigint, department string, role string, salary bigint, primary key(id))
+    create index dept_idx as select department from employees order by department
+    create index role_idx as select role from employees order by role
+---
+setup:
+  steps:
+    - query: insert into employees
+            values (1, 'Engineering', 'Developer', 100000),
+                   (2, 'Engineering', 'Developer', 110000),
+                   (3, 'Engineering', 'Manager', 150000),
+                   (4, 'Sales', 'Representative', 80000),
+                   (5, 'Sales', 'Manager', 120000)
+---
+test_block:
+  name: group-by-documentation-tests
+  preset: single_repetition_ordered
+  tests:
+    # GROUP BY Single Column
+    -
+      - query: SELECT department, COUNT(*) AS employee_count
+              FROM employees
+              GROUP BY department
+      - supported_version: 4.3.2.0
+      - unorderedResult: [{department: "Engineering", employee_count: 3},
+                          {department: "Sales", employee_count: 2}]
+
+    # GROUP BY Multiple Columns
+    -
+      - query: SELECT department, role, COUNT(*) AS employee_count
+              FROM employees
+              GROUP BY department, role
+      - supported_version: 4.3.2.0
+      - unorderedResult: [{department: "Engineering", role: "Developer", employee_count: 2},
+                          {department: "Engineering", role: "Manager", employee_count: 1},
+                          {department: "Sales", role: "Representative", employee_count: 1},
+                          {department: "Sales", role: "Manager", employee_count: 1}]
+
+    # GROUP BY with Aggregate Functions - Average salary
+    -
+      - query: SELECT department, AVG(salary) AS avg_salary
+              FROM employees
+              GROUP BY department
+      - supported_version: 4.3.2.0
+      - unorderedResult: [{department: "Engineering", avg_salary: 120000.0},
+                          {department: "Sales", avg_salary: 100000.0}]
+
+    # Calculate multiple aggregates
+    -
+      - query: SELECT department,
+                     COUNT(*) AS employee_count,
+                     MIN(salary) AS min_salary,
+                     MAX(salary) AS max_salary,
+                     AVG(salary) AS avg_salary
+              FROM employees
+              GROUP BY department
+      - supported_version: 4.3.2.0
+      - unorderedResult: [{department: "Engineering", employee_count: 3, min_salary: 100000, max_salary: 150000, avg_salary: 120000.0},
+                          {department: "Sales", employee_count: 2, min_salary: 80000, max_salary: 120000, avg_salary: 100000.0}]
+
+    # GROUP BY with HAVING Clause
+    -
+      - query: SELECT department, AVG(salary) AS avg_salary
+              FROM employees
+              GROUP BY department
+              HAVING AVG(salary) > 110000
+      - supported_version: 4.3.2.0
+      - unorderedResult: [{department: "Engineering", avg_salary: 120000.0}]
+
+    # GROUP BY with Column Aliases
+    -
+      - query: SELECT department AS dept, COUNT(*) AS total
+              FROM employees
+              GROUP BY department AS dept
+      - supported_version: 4.3.2.0
+      - unorderedResult: [{dept: "Engineering", total: 3},
+                          {dept: "Sales", total: 2}]

Original file line number	Diff line number	Diff line change
`@@ -53,4 +53,9 @@ void aggregateFunctionsDocumentationQueriesTests(YamlTest.Runner runner) throws`
`53`	`53`	`void scalarFunctionsDocumentationQueriesTests(YamlTest.Runner runner) throws Exception {`
`54`	`54`	`runner.runYamsql(PREFIX + "/scalar-functions-documentation-queries.yamsql");`
`55`	`55`	`}`
	`56`	`+`
	`57`	`+ @TestTemplate`
	`58`	`+ void groupByDocumentationQueriesTests(YamlTest.Runner runner) throws Exception {`
	`59`	`+ runner.runYamsql(PREFIX + "/group-by-documentation-queries.yamsql");`
	`60`	`+ }`
`56`	`61`	`}`