Advanced SQL
Quick reference guide for advanced SQL techniques and optimization.
Window Functions
Perform calculations across rows related to the current row:
ROW_NUMBER, RANK, DENSE_RANK
-- ========== ROW_NUMBER (Unique sequential number) ==========
SELECT
employee_id,
department,
salary,
ROW_NUMBER() OVER (PARTITION BY department ORDER BY salary DESC) AS row_num
FROM employees;
-- ========== RANK (Same rank for ties, gaps in sequence) ==========
SELECT
employee_id,
department,
salary,
RANK() OVER (PARTITION BY department ORDER BY salary DESC) AS rank
FROM employees;
-- Result: 1, 2, 2, 4, 5 (gap after tie)
-- ========== DENSE_RANK (Same rank for ties, no gaps) ==========
SELECT
employee_id,
department,
salary,
DENSE_RANK() OVER (PARTITION BY department ORDER BY salary DESC) AS dense_rank
FROM employees;
-- Result: 1, 2, 2, 3, 4 (no gap after tie)
-- ========== Get Top N Per Group ==========
WITH ranked_employees AS (
SELECT
*,
ROW_NUMBER() OVER (PARTITION BY department ORDER BY salary DESC) AS rn
FROM employees
)
SELECT * FROM ranked_employees
WHERE rn <= 3; -- Top 3 earners per department
Aggregate Window Functions
-- ========== Running Totals ==========
SELECT
order_date,
amount,
SUM(amount) OVER (ORDER BY order_date) AS running_total
FROM orders;
-- ========== Moving Average ==========
SELECT
order_date,
amount,
AVG(amount) OVER (
ORDER BY order_date
ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
) AS moving_avg_7_days
FROM orders;
-- ========== Cumulative Distribution ==========
SELECT
product_name,
sales,
CUME_DIST() OVER (ORDER BY sales) AS cumulative_dist,
PERCENT_RANK() OVER (ORDER BY sales) AS percent_rank
FROM products;
-- ========== Compare with Previous/Next Row ==========
SELECT
order_date,
amount,
LAG(amount) OVER (ORDER BY order_date) AS previous_amount,
LEAD(amount) OVER (ORDER BY order_date) AS next_amount,
amount - LAG(amount) OVER (ORDER BY order_date) AS difference
FROM orders;
-- ========== First and Last Values ==========
SELECT
employee_id,
department,
salary,
FIRST_VALUE(salary) OVER (
PARTITION BY department
ORDER BY salary DESC
) AS highest_salary_in_dept,
LAST_VALUE(salary) OVER (
PARTITION BY department
ORDER BY salary DESC
ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
) AS lowest_salary_in_dept
FROM employees;
Common Table Expressions (CTEs)
Temporary named result sets for complex queries:
Basic CTE
-- ========== Single CTE ==========
WITH high_earners AS (
SELECT * FROM employees
WHERE salary > 100000
)
SELECT
department,
COUNT(*) AS high_earner_count,
AVG(salary) AS avg_salary
FROM high_earners
GROUP BY department;
-- ========== Multiple CTEs ==========
WITH
sales_2023 AS (
SELECT * FROM sales
WHERE YEAR(sale_date) = 2023
),
top_products AS (
SELECT
product_id,
SUM(quantity) AS total_quantity
FROM sales_2023
GROUP BY product_id
HAVING SUM(quantity) > 100
)
SELECT
p.product_name,
tp.total_quantity,
p.price * tp.total_quantity AS total_revenue
FROM top_products tp
JOIN products p ON tp.product_id = p.id;
Recursive CTE
-- ========== Hierarchical Data (Employee-Manager) ==========
WITH RECURSIVE employee_hierarchy AS (
-- Base case: Top-level employees (no manager)
SELECT
employee_id,
name,
manager_id,
1 AS level,
name AS path
FROM employees
WHERE manager_id IS NULL
UNION ALL
-- Recursive case: Employees with managers
SELECT
e.employee_id,
e.name,
e.manager_id,
eh.level + 1,
eh.path || ' > ' || e.name
FROM employees e
INNER JOIN employee_hierarchy eh ON e.manager_id = eh.employee_id
)
SELECT * FROM employee_hierarchy
ORDER BY level, name;
-- ========== Number Series ==========
WITH RECURSIVE numbers AS (
SELECT 1 AS n
UNION ALL
SELECT n + 1 FROM numbers WHERE n < 100
)
SELECT * FROM numbers;
-- ========== Date Series ==========
WITH RECURSIVE date_range AS (
SELECT DATE('2024-01-01') AS date
UNION ALL
SELECT DATE(date, '+1 day')
FROM date_range
WHERE date < '2024-12-31'
)
SELECT * FROM date_range;
Advanced Joins
Self Join
-- ========== Find Employees Earning More Than Their Manager ==========
SELECT
e.name AS employee,
e.salary AS employee_salary,
m.name AS manager,
m.salary AS manager_salary
FROM employees e
INNER JOIN employees m ON e.manager_id = m.employee_id
WHERE e.salary > m.salary;
-- ========== Compare Sequential Records ==========
SELECT
curr.order_id AS current_order,
curr.order_date AS current_date,
prev.order_id AS previous_order,
prev.order_date AS previous_date,
DATEDIFF(curr.order_date, prev.order_date) AS days_between
FROM orders curr
LEFT JOIN orders prev ON curr.order_id = prev.order_id + 1;
Cross Join with Filtering
-- ========== All Possible Combinations ==========
SELECT
p.product_name,
c.category_name
FROM products p
CROSS JOIN categories c
WHERE NOT EXISTS (
SELECT 1 FROM product_categories pc
WHERE pc.product_id = p.id
AND pc.category_id = c.id
);
Lateral Join (PostgreSQL)
-- ========== Get Top 3 Orders Per Customer ==========
SELECT
c.customer_name,
o.order_id,
o.order_date,
o.total
FROM customers c
CROSS JOIN LATERAL (
SELECT *
FROM orders
WHERE customer_id = c.id
ORDER BY order_date DESC
LIMIT 3
) o;
Subqueries
Correlated Subquery
-- ========== Find Above-Average Salaries Per Department ==========
SELECT
employee_id,
name,
department,
salary
FROM employees e1
WHERE salary > (
SELECT AVG(salary)
FROM employees e2
WHERE e2.department = e1.department
);
-- ========== Find Customers with Above-Average Orders ==========
SELECT
customer_id,
customer_name,
(SELECT COUNT(*)
FROM orders o
WHERE o.customer_id = c.id) AS order_count
FROM customers c
WHERE (
SELECT COUNT(*)
FROM orders o
WHERE o.customer_id = c.id
) > (
SELECT AVG(order_count)
FROM (
SELECT COUNT(*) AS order_count
FROM orders
GROUP BY customer_id
) AS counts
);
EXISTS vs IN
-- ========== EXISTS (Better for large datasets) ==========
SELECT *
FROM customers c
WHERE EXISTS (
SELECT 1
FROM orders o
WHERE o.customer_id = c.id
AND o.order_date >= '2024-01-01'
);
-- ========== IN (Good for small lists) ==========
SELECT *
FROM products
WHERE category_id IN (
SELECT id FROM categories
WHERE category_name IN ('Electronics', 'Computers')
);
-- ========== NOT EXISTS (Find customers without orders) ==========
SELECT *
FROM customers c
WHERE NOT EXISTS (
SELECT 1
FROM orders o
WHERE o.customer_id = c.id
);
Pivot and Unpivot
Pivot (Rows to Columns)
-- ========== Manual Pivot ==========
SELECT
product_id,
SUM(CASE WHEN MONTH(order_date) = 1 THEN quantity ELSE 0 END) AS Jan,
SUM(CASE WHEN MONTH(order_date) = 2 THEN quantity ELSE 0 END) AS Feb,
SUM(CASE WHEN MONTH(order_date) = 3 THEN quantity ELSE 0 END) AS Mar
FROM orders
GROUP BY product_id;
-- ========== Dynamic Pivot (SQL Server) ==========
SELECT *
FROM (
SELECT product_id, MONTH(order_date) AS month, quantity
FROM orders
) AS source_data
PIVOT (
SUM(quantity)
FOR month IN ([1], [2], [3], [4], [5], [6], [7], [8], [9], [10], [11], [12])
) AS pivoted_data;
Unpivot (Columns to Rows)
-- ========== Manual Unpivot ==========
SELECT product_id, 'Jan' AS month, jan_sales AS sales FROM sales_summary
UNION ALL
SELECT product_id, 'Feb' AS month, feb_sales FROM sales_summary
UNION ALL
SELECT product_id, 'Mar' AS month, mar_sales FROM sales_summary;
-- ========== UNPIVOT (SQL Server) ==========
SELECT product_id, month, sales
FROM sales_summary
UNPIVOT (
sales FOR month IN (jan_sales, feb_sales, mar_sales)
) AS unpivoted_data;
Query Optimization
Using Indexes Effectively
-- ========== Create Indexes ==========
CREATE INDEX idx_orders_customer_date ON orders(customer_id, order_date);
CREATE INDEX idx_products_category ON products(category_id);
CREATE INDEX idx_employees_dept_salary ON employees(department, salary DESC);
-- ========== Covering Index ==========
CREATE INDEX idx_orders_covering ON orders(customer_id, order_date)
INCLUDE (total, status);
-- ========== Partial Index (PostgreSQL) ==========
CREATE INDEX idx_active_orders ON orders(customer_id)
WHERE status = 'active';
-- ========== Check Index Usage ==========
EXPLAIN SELECT * FROM orders WHERE customer_id = 123;
EXPLAIN ANALYZE SELECT * FROM orders WHERE customer_id = 123;
Query Optimization Techniques
-- ❌ BAD: Avoid functions on indexed columns
SELECT * FROM orders
WHERE YEAR(order_date) = 2024;
-- ✅ GOOD: Use range instead
SELECT * FROM orders
WHERE order_date >= '2024-01-01'
AND order_date < '2025-01-01';
-- ❌ BAD: Avoid OR with different columns
SELECT * FROM products
WHERE category_id = 1 OR price > 100;
-- ✅ GOOD: Use UNION if needed
SELECT * FROM products WHERE category_id = 1
UNION
SELECT * FROM products WHERE price > 100;
-- ❌ BAD: SELECT *
SELECT * FROM orders;
-- ✅ GOOD: Select only needed columns
SELECT order_id, order_date, total FROM orders;
-- ❌ BAD: NOT IN with NULL values
SELECT * FROM customers
WHERE id NOT IN (SELECT customer_id FROM orders);
-- ✅ GOOD: NOT EXISTS or IS NOT NULL
SELECT * FROM customers c
WHERE NOT EXISTS (
SELECT 1 FROM orders o WHERE o.customer_id = c.id
);
Transactions and Locking
Transaction Control
-- ========== Basic Transaction ==========
BEGIN TRANSACTION;
UPDATE accounts SET balance = balance - 100 WHERE account_id = 1;
UPDATE accounts SET balance = balance + 100 WHERE account_id = 2;
COMMIT; -- Save changes
-- ROLLBACK; -- Undo changes
-- ========== Transaction with Savepoint ==========
BEGIN TRANSACTION;
UPDATE inventory SET quantity = quantity - 10 WHERE product_id = 1;
SAVEPOINT after_inventory_update;
UPDATE orders SET status = 'shipped' WHERE order_id = 123;
-- If something goes wrong
ROLLBACK TO SAVEPOINT after_inventory_update;
COMMIT;
Isolation Levels
-- ========== Set Isolation Level ==========
SET TRANSACTION ISOLATION LEVEL READ COMMITTED;
SET TRANSACTION ISOLATION LEVEL REPEATABLE READ;
SET TRANSACTION ISOLATION LEVEL SERIALIZABLE;
-- ========== Example with Read Committed ==========
SET TRANSACTION ISOLATION LEVEL READ COMMITTED;
BEGIN TRANSACTION;
SELECT balance FROM accounts WHERE account_id = 1;
-- Other transactions can modify this row
SELECT balance FROM accounts WHERE account_id = 1; -- May be different
COMMIT;
Locking
-- ========== Explicit Locks ==========
-- Shared lock (read)
SELECT * FROM orders WITH (HOLDLOCK) WHERE order_id = 123;
-- Exclusive lock (write)
SELECT * FROM orders WITH (UPDLOCK) WHERE order_id = 123;
-- Table-level lock
LOCK TABLE orders IN EXCLUSIVE MODE;
-- Row-level lock (PostgreSQL)
SELECT * FROM orders WHERE order_id = 123 FOR UPDATE;
SELECT * FROM orders WHERE order_id = 123 FOR SHARE;
Advanced Aggregations
GROUPING SETS
-- ========== Multiple Grouping Levels ==========
SELECT
department,
job_title,
COUNT(*) AS employee_count,
AVG(salary) AS avg_salary
FROM employees
GROUP BY GROUPING SETS (
(department, job_title), -- By department and job
(department), -- By department only
(job_title), -- By job only
() -- Grand total
);
-- ========== ROLLUP (Hierarchical aggregation) ==========
SELECT
year,
quarter,
month,
SUM(sales) AS total_sales
FROM sales_data
GROUP BY ROLLUP (year, quarter, month);
-- ========== CUBE (All combinations) ==========
SELECT
region,
product_category,
SUM(sales) AS total_sales
FROM sales
GROUP BY CUBE (region, product_category);
FILTER Clause
-- ========== Conditional Aggregation ==========
SELECT
department,
COUNT(*) AS total_employees,
COUNT(*) FILTER (WHERE salary > 100000) AS high_earners,
AVG(salary) FILTER (WHERE hire_date >= '2023-01-01') AS avg_new_hire_salary
FROM employees
GROUP BY department;
JSON Operations (PostgreSQL)
JSON Queries
-- ========== Extract JSON Data ==========
SELECT
id,
data->>'name' AS name,
data->>'email' AS email,
data->'address'->>'city' AS city
FROM users;
-- ========== JSON Array Operations ==========
SELECT
id,
jsonb_array_elements(data->'tags') AS tag
FROM posts;
-- ========== Query JSON Fields ==========
SELECT * FROM products
WHERE data @> '{"category": "electronics"}';
SELECT * FROM products
WHERE data->>'price' > '100';
-- ========== Update JSON Fields ==========
UPDATE users
SET data = jsonb_set(data, '{address,city}', '"Jakarta"')
WHERE id = 1;
Full-Text Search
PostgreSQL Full-Text Search
-- ========== Create Text Search Vector ==========
ALTER TABLE articles
ADD COLUMN search_vector tsvector;
UPDATE articles
SET search_vector = to_tsvector('english', title || ' ' || content);
CREATE INDEX idx_search ON articles USING GIN(search_vector);
-- ========== Search ==========
SELECT
title,
ts_rank(search_vector, query) AS rank
FROM articles,
to_tsquery('english', 'kubernetes & docker') AS query
WHERE search_vector @@ query
ORDER BY rank DESC;
-- ========== Highlight Results ==========
SELECT
ts_headline('english', content, query) AS snippet
FROM articles,
to_tsquery('english', 'kubernetes') AS query
WHERE search_vector @@ query;
Database Design Patterns
Slowly Changing Dimensions (SCD Type 2)
-- ========== Track Historical Changes ==========
CREATE TABLE customer_history (
id SERIAL PRIMARY KEY,
customer_id INT NOT NULL,
name VARCHAR(100),
email VARCHAR(255),
address TEXT,
valid_from DATE NOT NULL,
valid_to DATE,
is_current BOOLEAN DEFAULT TRUE
);
-- ========== Insert New Record ==========
-- 1. Close old record
UPDATE customer_history
SET valid_to = CURRENT_DATE,
is_current = FALSE
WHERE customer_id = 123 AND is_current = TRUE;
-- 2. Insert new record
INSERT INTO customer_history (customer_id, name, email, address, valid_from)
VALUES (123, 'John Doe', 'john@example.com', 'New Address', CURRENT_DATE);
-- ========== Query Current State ==========
SELECT * FROM customer_history
WHERE is_current = TRUE;
-- ========== Query Historical State ==========
SELECT * FROM customer_history
WHERE customer_id = 123
AND '2023-06-15' BETWEEN valid_from AND COALESCE(valid_to, '9999-12-31');
Temporal Tables (SQL Server)
-- ========== System-Versioned Table ==========
CREATE TABLE products (
product_id INT PRIMARY KEY,
product_name VARCHAR(100),
price DECIMAL(10, 2),
valid_from DATETIME2 GENERATED ALWAYS AS ROW START,
valid_to DATETIME2 GENERATED ALWAYS AS ROW END,
PERIOD FOR SYSTEM_TIME (valid_from, valid_to)
)
WITH (SYSTEM_VERSIONING = ON);
-- ========== Query Historical Data ==========
SELECT * FROM products
FOR SYSTEM_TIME AS OF '2024-01-01'
WHERE product_id = 1;
SELECT * FROM products
FOR SYSTEM_TIME BETWEEN '2024-01-01' AND '2024-12-31'
WHERE product_id = 1;
Performance Monitoring
Query Performance
-- ========== PostgreSQL ==========
-- Enable query timing
\timing
-- Analyze query plan
EXPLAIN ANALYZE
SELECT * FROM orders
WHERE customer_id = 123;
-- View slow queries
SELECT
query,
calls,
total_time,
mean_time
FROM pg_stat_statements
ORDER BY mean_time DESC
LIMIT 10;
-- ========== MySQL ==========
-- Enable slow query log
SET GLOBAL slow_query_log = 'ON';
SET GLOBAL long_query_time = 2;
-- Analyze query
EXPLAIN FORMAT=JSON
SELECT * FROM orders
WHERE customer_id = 123;
-- View query profile
SHOW PROFILE FOR QUERY 1;
Index Analysis
-- ========== Find Missing Indexes ==========
-- PostgreSQL
SELECT
schemaname,
tablename,
seq_scan,
seq_tup_read,
idx_scan
FROM pg_stat_user_tables
WHERE seq_scan > idx_scan
ORDER BY seq_tup_read DESC;
-- ========== Find Unused Indexes ==========
SELECT
schemaname,
tablename,
indexname,
idx_scan
FROM pg_stat_user_indexes
WHERE idx_scan = 0;
Tips
Common Anti-Patterns
-- ❌ BAD: N+1 Query Problem
-- Fetching orders in loop
SELECT * FROM customers;
-- Then for each customer:
SELECT * FROM orders WHERE customer_id = ?;
-- ✅ GOOD: Single query with JOIN
SELECT
c.*,
o.order_id,
o.order_date,
o.total
FROM customers c
LEFT JOIN orders o ON c.id = o.customer_id;
-- ❌ BAD: Using DISTINCT to fix bad join
SELECT DISTINCT c.* FROM customers c
JOIN orders o ON c.id = o.customer_id;
-- ✅ GOOD: Use proper aggregation or EXISTS
SELECT c.* FROM customers c
WHERE EXISTS (SELECT 1 FROM orders o WHERE o.customer_id = c.id);
-- ❌ BAD: Implicit type conversion
SELECT * FROM products WHERE product_id = '123'; -- id is INT
-- ✅ GOOD: Use correct type
SELECT * FROM products WHERE product_id = 123;