Home / Notebooks / Database
Database
advanced

Advanced SQL

Advanced SQL techniques including window functions, CTEs, query optimization, and performance tuning

March 10, 2024
Updated regularly

Advanced SQL

Quick reference guide for advanced SQL techniques and optimization.

Window Functions

Perform calculations across rows related to the current row:

ROW_NUMBER, RANK, DENSE_RANK

-- ========== ROW_NUMBER (Unique sequential number) ==========
-- Assigns 1..n within each department, highest salary first; ties are
-- broken arbitrarily unless the ORDER BY is made deterministic.
SELECT 
    employee_id,
    department,
    salary,
    ROW_NUMBER() OVER (PARTITION BY department ORDER BY salary DESC) AS row_num
FROM employees;

-- ========== RANK (Same rank for ties, gaps in sequence) ==========
-- NOTE: RANK and DENSE_RANK are reserved words in MySQL 8+ and restricted
-- in other engines, so the aliases below avoid the bare keywords.
SELECT 
    employee_id,
    department,
    salary,
    RANK() OVER (PARTITION BY department ORDER BY salary DESC) AS salary_rank
FROM employees;
-- Result: 1, 2, 2, 4, 5 (gap after tie)

-- ========== DENSE_RANK (Same rank for ties, no gaps) ==========
SELECT 
    employee_id,
    department,
    salary,
    DENSE_RANK() OVER (PARTITION BY department ORDER BY salary DESC) AS salary_dense_rank
FROM employees;
-- Result: 1, 2, 2, 3, 4 (no gap after tie)

-- ========== Get Top N Per Group ==========
-- ROW_NUMBER (not RANK) guarantees exactly 3 rows per department even on ties.
WITH ranked_employees AS (
    SELECT 
        *,
        ROW_NUMBER() OVER (PARTITION BY department ORDER BY salary DESC) AS rn
    FROM employees
)
SELECT * FROM ranked_employees
WHERE rn <= 3;  -- Top 3 earners per department

Aggregate Window Functions

-- ========== Running Totals ==========
-- An explicit ROWS frame is required for a true running total: the default
-- frame is RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, which lumps
-- together all rows that tie on order_date and gives them the same total.
SELECT 
    order_date,
    amount,
    SUM(amount) OVER (
        ORDER BY order_date
        ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
    ) AS running_total
FROM orders;

-- ========== Moving Average ==========
-- 7-row window: the current row plus the 6 preceding rows.
SELECT 
    order_date,
    amount,
    AVG(amount) OVER (
        ORDER BY order_date 
        ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
    ) AS moving_avg_7_days
FROM orders;

-- ========== Cumulative Distribution ==========
SELECT 
    product_name,
    sales,
    CUME_DIST() OVER (ORDER BY sales) AS cumulative_dist,
    PERCENT_RANK() OVER (ORDER BY sales) AS percent_rank
FROM products;

-- ========== Compare with Previous/Next Row ==========
-- LAG/LEAD return NULL on the first/last row, so `difference` is NULL there.
SELECT 
    order_date,
    amount,
    LAG(amount) OVER (ORDER BY order_date) AS previous_amount,
    LEAD(amount) OVER (ORDER BY order_date) AS next_amount,
    amount - LAG(amount) OVER (ORDER BY order_date) AS difference
FROM orders;

-- ========== First and Last Values ==========
-- LAST_VALUE needs the full-partition frame below; with the default frame
-- it would simply return the current row's own value.
SELECT 
    employee_id,
    department,
    salary,
    FIRST_VALUE(salary) OVER (
        PARTITION BY department 
        ORDER BY salary DESC
    ) AS highest_salary_in_dept,
    LAST_VALUE(salary) OVER (
        PARTITION BY department 
        ORDER BY salary DESC
        ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
    ) AS lowest_salary_in_dept
FROM employees;

Common Table Expressions (CTEs)

Temporary named result sets for complex queries:

Basic CTE

-- ========== Single CTE ==========
WITH high_earners AS (
    SELECT * FROM employees
    WHERE salary > 100000
)
SELECT 
    department,
    COUNT(*) AS high_earner_count,
    AVG(salary) AS avg_salary
FROM high_earners
GROUP BY department;

-- ========== Multiple CTEs ==========
WITH 
    sales_2023 AS (
        -- Half-open date range instead of YEAR(sale_date) = 2023:
        -- wrapping the column in a function defeats any index on sale_date.
        SELECT * FROM sales
        WHERE sale_date >= '2023-01-01'
        AND sale_date < '2024-01-01'
    ),
    top_products AS (
        -- Products that moved more than 100 units in 2023
        SELECT 
            product_id,
            SUM(quantity) AS total_quantity
        FROM sales_2023
        GROUP BY product_id
        HAVING SUM(quantity) > 100
    )
SELECT 
    p.product_name,
    tp.total_quantity,
    p.price * tp.total_quantity AS total_revenue
FROM top_products tp
JOIN products p ON tp.product_id = p.id;

Recursive CTE

-- ========== Hierarchical Data (Employee-Manager) ==========
-- Walks the employee tree top-down, tracking depth and a readable path.
-- (SQL Server omits the RECURSIVE keyword — WITH employee_hierarchy AS ...;
-- it also uses '+' instead of '||' for string concatenation.)
WITH RECURSIVE employee_hierarchy AS (
    -- Base case: Top-level employees (no manager)
    SELECT 
        employee_id,
        name,
        manager_id,
        1 AS level,
        name AS path
    FROM employees
    WHERE manager_id IS NULL
    
    UNION ALL
    
    -- Recursive case: Employees with managers
    -- NOTE: a cycle in manager_id would recurse forever; some engines offer
    -- CYCLE detection or a max-recursion setting as a guard.
    SELECT 
        e.employee_id,
        e.name,
        e.manager_id,
        eh.level + 1,
        eh.path || ' > ' || e.name
    FROM employees e
    INNER JOIN employee_hierarchy eh ON e.manager_id = eh.employee_id
)
SELECT * FROM employee_hierarchy
ORDER BY level, name;

-- ========== Number Series ==========
-- Generates the integers 1..100 (PostgreSQL has generate_series() for this).
WITH RECURSIVE numbers AS (
    SELECT 1 AS n
    UNION ALL
    SELECT n + 1 FROM numbers WHERE n < 100
)
SELECT * FROM numbers;

-- ========== Date Series ==========
-- One row per day of 2024. DATE(date, '+1 day') is SQLite syntax;
-- PostgreSQL would use date + INTERVAL '1 day'.
WITH RECURSIVE date_range AS (
    SELECT DATE('2024-01-01') AS date
    UNION ALL
    SELECT DATE(date, '+1 day')
    FROM date_range
    WHERE date < '2024-12-31'
)
SELECT * FROM date_range;

Advanced Joins

Self Join

-- ========== Find Employees Earning More Than Their Manager ==========
-- Self join: the same table aliased twice, once per role.
SELECT 
    e.name AS employee,
    e.salary AS employee_salary,
    m.name AS manager,
    m.salary AS manager_salary
FROM employees e
INNER JOIN employees m ON e.manager_id = m.employee_id
WHERE e.salary > m.salary;

-- ========== Compare Sequential Records ==========
-- Aliases renamed: CURRENT_DATE is a reserved keyword in most engines, so
-- `AS current_date` fails or needs quoting.
-- NOTE: joining on order_id + 1 assumes ids are gapless and sequential;
-- LAG(order_date) OVER (ORDER BY order_id) is the robust alternative.
-- DATEDIFF is MySQL / SQL Server syntax (argument order differs between them).
SELECT 
    curr.order_id AS current_order,
    curr.order_date AS current_order_date,
    prev.order_id AS previous_order,
    prev.order_date AS previous_order_date,
    DATEDIFF(curr.order_date, prev.order_date) AS days_between
FROM orders curr
LEFT JOIN orders prev ON curr.order_id = prev.order_id + 1;

Cross Join with Filtering

-- ========== All Possible Combinations ==========
-- Cartesian product of products x categories, keeping only pairs that are
-- NOT already linked in the junction table (i.e. missing assignments).
SELECT 
    p.product_name,
    c.category_name
FROM products p
CROSS JOIN categories c
WHERE NOT EXISTS (
    SELECT 1 FROM product_categories pc
    WHERE pc.product_id = p.id 
    AND pc.category_id = c.id
);

Lateral Join (PostgreSQL)

-- ========== Get Top 3 Orders Per Customer ==========
-- LATERAL lets the subquery reference c.id, so ORDER BY ... LIMIT 3 runs
-- once per customer. Customers with no orders are dropped (CROSS JOIN
-- LATERAL behaves like an inner join; use LEFT JOIN LATERAL ... ON TRUE
-- to keep them).
SELECT 
    c.customer_name,
    o.order_id,
    o.order_date,
    o.total
FROM customers c
CROSS JOIN LATERAL (
    SELECT *
    FROM orders
    WHERE customer_id = c.id
    ORDER BY order_date DESC
    LIMIT 3
) o;

Subqueries

Correlated Subquery

-- ========== Find Above-Average Salaries Per Department ==========
-- Correlated subquery: the inner AVG is tied to each outer row via
-- e2.department = e1.department.
SELECT 
    employee_id,
    name,
    department,
    salary
FROM employees e1
WHERE salary > (
    SELECT AVG(salary)
    FROM employees e2
    WHERE e2.department = e1.department
);

-- ========== Find Customers with Above-Average Orders ==========
-- NOTE(review): the per-customer COUNT subquery appears twice (SELECT list
-- and WHERE) and may be evaluated twice per row; joining onto a grouped
-- order-count derived table avoids that if this is a hot query.
SELECT 
    customer_id,
    customer_name,
    (SELECT COUNT(*) 
     FROM orders o 
     WHERE o.customer_id = c.id) AS order_count
FROM customers c
WHERE (
    SELECT COUNT(*) 
    FROM orders o 
    WHERE o.customer_id = c.id
) > (
    -- Average order count across customers that have at least one order
    SELECT AVG(order_count)
    FROM (
        SELECT COUNT(*) AS order_count
        FROM orders
        GROUP BY customer_id
    ) AS counts
);

EXISTS vs IN

-- ========== EXISTS (Better for large datasets) ==========
-- EXISTS can stop at the first matching order per customer.
SELECT *
FROM customers c
WHERE EXISTS (
    SELECT 1
    FROM orders o
    WHERE o.customer_id = c.id
    AND o.order_date >= '2024-01-01'
);

-- ========== IN (Good for small lists) ==========
SELECT *
FROM products
WHERE category_id IN (
    SELECT id FROM categories
    WHERE category_name IN ('Electronics', 'Computers')
);

-- ========== NOT EXISTS (Find customers without orders) ==========
-- Prefer NOT EXISTS over NOT IN here: NOT IN returns zero rows if the
-- subquery produces any NULL.
SELECT *
FROM customers c
WHERE NOT EXISTS (
    SELECT 1
    FROM orders o
    WHERE o.customer_id = c.id
);

Pivot and Unpivot

Pivot (Rows to Columns)

-- ========== Manual Pivot ==========
-- Portable rows-to-columns: one conditional SUM per target column.
SELECT 
    product_id,
    SUM(CASE WHEN MONTH(order_date) = 1 THEN quantity ELSE 0 END) AS Jan,
    SUM(CASE WHEN MONTH(order_date) = 2 THEN quantity ELSE 0 END) AS Feb,
    SUM(CASE WHEN MONTH(order_date) = 3 THEN quantity ELSE 0 END) AS Mar
FROM orders
GROUP BY product_id;

-- ========== Dynamic Pivot (SQL Server) ==========
-- PIVOT is T-SQL only; the [n] brackets are SQL Server identifier quoting.
SELECT *
FROM (
    SELECT product_id, MONTH(order_date) AS month, quantity
    FROM orders
) AS source_data
PIVOT (
    SUM(quantity)
    FOR month IN ([1], [2], [3], [4], [5], [6], [7], [8], [9], [10], [11], [12])
) AS pivoted_data;

Unpivot (Columns to Rows)

-- ========== Manual Unpivot ==========
-- Output column names come from the first SELECT; later branches only need
-- matching positions and compatible types.
SELECT product_id, 'Jan' AS month, jan_sales AS sales FROM sales_summary
UNION ALL
SELECT product_id, 'Feb' AS month, feb_sales FROM sales_summary
UNION ALL
SELECT product_id, 'Mar' AS month, mar_sales FROM sales_summary;

-- ========== UNPIVOT (SQL Server) ==========
-- NOTE: UNPIVOT silently drops rows where the source column is NULL.
SELECT product_id, month, sales
FROM sales_summary
UNPIVOT (
    sales FOR month IN (jan_sales, feb_sales, mar_sales)
) AS unpivoted_data;

Query Optimization

Using Indexes Effectively

-- ========== Create Indexes ==========
-- Composite index column order matters: (customer_id, order_date) serves
-- customer_id-only filters too, but not order_date-only filters.
CREATE INDEX idx_orders_customer_date ON orders(customer_id, order_date);
CREATE INDEX idx_products_category ON products(category_id);
CREATE INDEX idx_employees_dept_salary ON employees(department, salary DESC);

-- ========== Covering Index ==========
-- INCLUDE adds non-key columns so the query can be answered from the index
-- alone (SQL Server syntax; also PostgreSQL 11+).
CREATE INDEX idx_orders_covering ON orders(customer_id, order_date)
INCLUDE (total, status);

-- ========== Partial Index (PostgreSQL) ==========
-- Indexes only the 'active' rows; smaller and cheaper to maintain.
CREATE INDEX idx_active_orders ON orders(customer_id)
WHERE status = 'active';

-- ========== Check Index Usage ==========
-- EXPLAIN shows the plan; EXPLAIN ANALYZE actually executes the query.
EXPLAIN SELECT * FROM orders WHERE customer_id = 123;
EXPLAIN ANALYZE SELECT * FROM orders WHERE customer_id = 123;

Query Optimization Techniques

-- ❌ BAD: Avoid functions on indexed columns
-- Wrapping the column makes the predicate non-SARGable (index unusable).
SELECT * FROM orders
WHERE YEAR(order_date) = 2024;

-- ✅ GOOD: Use range instead
-- Half-open range: covers all of 2024, excludes 2025-01-01 itself.
SELECT * FROM orders
WHERE order_date >= '2024-01-01' 
AND order_date < '2025-01-01';

-- ❌ BAD: Avoid OR with different columns
-- An OR across two columns often prevents using either single-column index.
SELECT * FROM products
WHERE category_id = 1 OR price > 100;

-- ✅ GOOD: Use UNION if needed
-- Each branch can use its own index; UNION's dedup preserves OR semantics
-- for rows matching both branches.
SELECT * FROM products WHERE category_id = 1
UNION
SELECT * FROM products WHERE price > 100;

-- ❌ BAD: SELECT *
SELECT * FROM orders;

-- ✅ GOOD: Select only needed columns
SELECT order_id, order_date, total FROM orders;

-- ❌ BAD: NOT IN with NULL values
-- If any customer_id in orders is NULL, this returns zero rows.
SELECT * FROM customers
WHERE id NOT IN (SELECT customer_id FROM orders);

-- ✅ GOOD: NOT EXISTS or IS NOT NULL
SELECT * FROM customers c
WHERE NOT EXISTS (
    SELECT 1 FROM orders o WHERE o.customer_id = c.id
);

Transactions and Locking

Transaction Control

-- ========== Basic Transaction ==========
-- Both halves of the transfer succeed or neither does.
BEGIN TRANSACTION;

UPDATE accounts SET balance = balance - 100 WHERE account_id = 1;
UPDATE accounts SET balance = balance + 100 WHERE account_id = 2;

COMMIT;  -- Save changes
-- ROLLBACK;  -- Undo changes

-- ========== Transaction with Savepoint ==========
-- A savepoint allows partial rollback without abandoning the whole transaction.
BEGIN TRANSACTION;

UPDATE inventory SET quantity = quantity - 10 WHERE product_id = 1;
SAVEPOINT after_inventory_update;

UPDATE orders SET status = 'shipped' WHERE order_id = 123;

-- If something goes wrong: undoes the order update, keeps the inventory update
ROLLBACK TO SAVEPOINT after_inventory_update;

COMMIT;

Isolation Levels

-- ========== Set Isolation Level ==========
-- Listed weakest to strongest; stronger levels trade concurrency for consistency.
SET TRANSACTION ISOLATION LEVEL READ COMMITTED;
SET TRANSACTION ISOLATION LEVEL REPEATABLE READ;
SET TRANSACTION ISOLATION LEVEL SERIALIZABLE;

-- ========== Example with Read Committed ==========
-- Demonstrates a non-repeatable read: each SELECT sees the latest committed data.
SET TRANSACTION ISOLATION LEVEL READ COMMITTED;
BEGIN TRANSACTION;

SELECT balance FROM accounts WHERE account_id = 1;
-- Other transactions can modify this row
SELECT balance FROM accounts WHERE account_id = 1;  -- May be different

COMMIT;

Locking

-- ========== Explicit Locks ==========
-- HOLDLOCK (SQL Server): keeps shared locks until the transaction ends
-- (serializable-style behavior), not just for the statement
SELECT * FROM orders WITH (HOLDLOCK) WHERE order_id = 123;

-- UPDLOCK (SQL Server): update lock — read now with intent to write later;
-- blocks other UPDLOCK readers but not plain readers
SELECT * FROM orders WITH (UPDLOCK) WHERE order_id = 123;

-- Table-level lock (PostgreSQL/Oracle syntax); blocks concurrent writers
LOCK TABLE orders IN EXCLUSIVE MODE;

-- Row-level lock (PostgreSQL):
-- FOR UPDATE blocks writers and other FOR UPDATE readers of the row;
-- FOR SHARE allows other FOR SHARE readers but blocks writers
SELECT * FROM orders WHERE order_id = 123 FOR UPDATE;
SELECT * FROM orders WHERE order_id = 123 FOR SHARE;

Advanced Aggregations

GROUPING SETS

-- ========== Multiple Grouping Levels ==========
-- One pass produces four aggregation levels; NULLs in department/job_title
-- mark the subtotal rows (use GROUPING() to distinguish them from real NULLs).
SELECT 
    department,
    job_title,
    COUNT(*) AS employee_count,
    AVG(salary) AS avg_salary
FROM employees
GROUP BY GROUPING SETS (
    (department, job_title),  -- By department and job
    (department),             -- By department only
    (job_title),              -- By job only
    ()                        -- Grand total
);

-- ========== ROLLUP (Hierarchical aggregation) ==========
-- Subtotals at each prefix level: (y,q,m), (y,q), (y), and grand total.
SELECT 
    year,
    quarter,
    month,
    SUM(sales) AS total_sales
FROM sales_data
GROUP BY ROLLUP (year, quarter, month);

-- ========== CUBE (All combinations) ==========
-- All 2^n grouping combinations of the listed columns.
SELECT 
    region,
    product_category,
    SUM(sales) AS total_sales
FROM sales
GROUP BY CUBE (region, product_category);

FILTER Clause

-- ========== Conditional Aggregation ==========
-- FILTER is ANSI, supported by PostgreSQL/SQLite/DuckDB; elsewhere use
-- SUM(CASE WHEN ... THEN ... END) as the portable equivalent.
SELECT 
    department,
    COUNT(*) AS total_employees,
    COUNT(*) FILTER (WHERE salary > 100000) AS high_earners,
    AVG(salary) FILTER (WHERE hire_date >= '2023-01-01') AS avg_new_hire_salary
FROM employees
GROUP BY department;

JSON Operations (PostgreSQL)

JSON Queries

-- ========== Extract JSON Data ==========
-- -> returns json/jsonb, ->> returns text.
SELECT 
    id,
    data->>'name' AS name,
    data->>'email' AS email,
    data->'address'->>'city' AS city
FROM users;

-- ========== JSON Array Operations ==========
-- Expands the 'tags' array to one output row per element.
SELECT 
    id,
    jsonb_array_elements(data->'tags') AS tag
FROM posts;

-- ========== Query JSON Fields ==========
-- @> containment can use a GIN index on the jsonb column.
SELECT * FROM products
WHERE data @> '{"category": "electronics"}';

-- Cast before comparing: ->> returns text, and comparing strings sorts
-- lexicographically ('20' > '100'), giving wrong answers for numbers.
SELECT * FROM products
WHERE (data->>'price')::numeric > 100;

-- ========== Update JSON Fields ==========
-- jsonb_set path is an array of keys; the new value must be valid JSON,
-- hence the double-quoted string.
UPDATE users
SET data = jsonb_set(data, '{address,city}', '"Jakarta"')
WHERE id = 1;
-- ========== Create Text Search Vector (PostgreSQL full-text search) ==========
-- NOTE(review): a one-off UPDATE leaves search_vector stale after edits;
-- prefer a trigger or a GENERATED ALWAYS AS ... STORED column (PG 12+).
ALTER TABLE articles 
ADD COLUMN search_vector tsvector;

UPDATE articles
SET search_vector = to_tsvector('english', title || ' ' || content);

-- GIN index makes @@ matches fast
CREATE INDEX idx_search ON articles USING GIN(search_vector);

-- ========== Search ==========
-- '&' means AND in tsquery syntax; ts_rank orders results by relevance.
SELECT 
    title,
    ts_rank(search_vector, query) AS rank
FROM articles,
     to_tsquery('english', 'kubernetes & docker') AS query
WHERE search_vector @@ query
ORDER BY rank DESC;

-- ========== Highlight Results ==========
-- ts_headline returns the matching fragment with the search terms highlighted.
SELECT 
    ts_headline('english', content, query) AS snippet
FROM articles,
     to_tsquery('english', 'kubernetes') AS query
WHERE search_vector @@ query;

Database Design Patterns

Slowly Changing Dimensions (SCD Type 2)

-- ========== Track Historical Changes ==========
-- SCD Type 2: one row per version of a customer, with a validity window.
-- Convention: valid_to IS NULL (and is_current = TRUE) marks the open row.
CREATE TABLE customer_history (
    id SERIAL PRIMARY KEY,
    customer_id INT NOT NULL,
    name VARCHAR(100),
    email VARCHAR(255),
    address TEXT,
    valid_from DATE NOT NULL,
    valid_to DATE,
    is_current BOOLEAN DEFAULT TRUE
);

-- ========== Insert New Record ==========
-- Close-then-insert must be atomic, otherwise a concurrent reader can
-- observe a customer with no current row.
BEGIN TRANSACTION;

-- 1. Close old record
UPDATE customer_history
SET valid_to = CURRENT_DATE,
    is_current = FALSE
WHERE customer_id = 123 AND is_current = TRUE;

-- 2. Insert new record
INSERT INTO customer_history (customer_id, name, email, address, valid_from)
VALUES (123, 'John Doe', 'john@example.com', 'New Address', CURRENT_DATE);

COMMIT;

-- ========== Query Current State ==========
SELECT * FROM customer_history
WHERE is_current = TRUE;

-- ========== Query Historical State ==========
-- Half-open window [valid_from, valid_to): BETWEEN would match BOTH the
-- closed row and its replacement on the changeover date, because the old
-- row's valid_to equals the new row's valid_from.
SELECT * FROM customer_history
WHERE customer_id = 123
AND valid_from <= '2023-06-15'
AND (valid_to IS NULL OR valid_to > '2023-06-15');

Temporal Tables (SQL Server)

-- ========== System-Versioned Table ==========
-- SQL Server maintains the period columns and keeps prior row versions
-- automatically.
-- NOTE(review): production setups usually name the history table explicitly:
-- WITH (SYSTEM_VERSIONING = ON (HISTORY_TABLE = dbo.products_history)).
CREATE TABLE products (
    product_id INT PRIMARY KEY,
    product_name VARCHAR(100),
    price DECIMAL(10, 2),
    valid_from DATETIME2 GENERATED ALWAYS AS ROW START,
    valid_to DATETIME2 GENERATED ALWAYS AS ROW END,
    PERIOD FOR SYSTEM_TIME (valid_from, valid_to)
)
WITH (SYSTEM_VERSIONING = ON);

-- ========== Query Historical Data ==========
-- AS OF: the row version that was current at that instant
SELECT * FROM products
FOR SYSTEM_TIME AS OF '2024-01-01'
WHERE product_id = 1;

-- BETWEEN: every version that existed at any point in the range
SELECT * FROM products
FOR SYSTEM_TIME BETWEEN '2024-01-01' AND '2024-12-31'
WHERE product_id = 1;

Performance Monitoring

Query Performance

-- ========== PostgreSQL ==========
-- Enable query timing (\timing is a psql meta-command, not SQL)
\timing

-- Analyze query plan (ANALYZE executes the query to get real timings)
EXPLAIN ANALYZE
SELECT * FROM orders
WHERE customer_id = 123;

-- View slow queries
-- Requires the pg_stat_statements extension (shared_preload_libraries).
-- NOTE(review): in PostgreSQL 13+ these columns are named
-- total_exec_time / mean_exec_time.
SELECT 
    query,
    calls,
    total_time,
    mean_time
FROM pg_stat_statements
ORDER BY mean_time DESC
LIMIT 10;

-- ========== MySQL ==========
-- Enable slow query log; log statements slower than 2 seconds
SET GLOBAL slow_query_log = 'ON';
SET GLOBAL long_query_time = 2;

-- Analyze query
EXPLAIN FORMAT=JSON
SELECT * FROM orders
WHERE customer_id = 123;

-- View query profile
-- NOTE(review): SHOW PROFILE is deprecated; prefer the Performance Schema.
SHOW PROFILE FOR QUERY 1;

Index Analysis

-- ========== Find Missing Indexes ==========
-- PostgreSQL: tables scanned sequentially more often than via index are
-- candidates for a new index (heuristic — tiny tables legitimately
-- prefer sequential scans).
SELECT 
    schemaname,
    tablename,
    seq_scan,
    seq_tup_read,
    idx_scan
FROM pg_stat_user_tables
WHERE seq_scan > idx_scan
ORDER BY seq_tup_read DESC;

-- ========== Find Unused Indexes ==========
-- Never-used indexes still cost write overhead and disk; candidates to
-- drop (check replicas and recent stats resets before dropping).
SELECT 
    schemaname,
    tablename,
    indexname,
    idx_scan
FROM pg_stat_user_indexes
WHERE idx_scan = 0;

Tips

  • Use CTEs for complex queries instead of nested subqueries
  • Add indexes on columns used in WHERE, JOIN, and ORDER BY
  • Avoid SELECT * — specify columns explicitly
  • Use EXISTS instead of COUNT for existence checks
  • Partition large tables for better performance
  • Analyze query plans regularly with EXPLAIN
  • Use appropriate data types (smaller is better)
  • Normalize database but denormalize for read-heavy workloads
  • Use connection pooling in applications
  • Monitor slow queries and optimize them
Common Anti-Patterns

    -- ❌ BAD: N+1 Query Problem
    -- Fetching orders in loop
    SELECT * FROM customers;
    -- Then for each customer:
    SELECT * FROM orders WHERE customer_id = ?;

    -- ✅ GOOD: Single query with JOIN
    -- One round trip; LEFT JOIN keeps customers with no orders
    SELECT 
        c.*,
        o.order_id,
        o.order_date,
        o.total
    FROM customers c
    LEFT JOIN orders o ON c.id = o.customer_id;

    -- ❌ BAD: Using DISTINCT to fix bad join
    -- DISTINCT here only hides the row multiplication caused by the join
    SELECT DISTINCT c.* FROM customers c
    JOIN orders o ON c.id = o.customer_id;

    -- ✅ GOOD: Use proper aggregation or EXISTS
    SELECT c.* FROM customers c
    WHERE EXISTS (SELECT 1 FROM orders o WHERE o.customer_id = c.id);

    -- ❌ BAD: Implicit type conversion
    -- Comparing an INT column to a string can suppress index use
    SELECT * FROM products WHERE product_id = '123';  -- id is INT

    -- ✅ GOOD: Use correct type
    SELECT * FROM products WHERE product_id = 123;

    Resources

  • PostgreSQL Documentation
  • MySQL Performance Blog
  • Use The Index, Luke
  • SQL Performance Explained
  • Modern SQL
Topics

    SQL · Database · Performance · Advanced

    Found This Helpful?

    If you have questions or suggestions for improving these notes, I'd love to hear from you.