System Design Fundamentals: Scalability
Deep dive into scalability principles, vertical vs horizontal scaling, and practical implementation strategies with real-world examples.
Scalability is the ability of a system to handle increasing amounts of work by adding resources. It's one of the most critical aspects of system design, especially for applications that need to serve millions of users or process large volumes of data.
Types of Scalability
1. Vertical Scaling (Scale Up)
Vertical scaling involves adding more power to existing machines by upgrading CPU, RAM, or storage.
Advantages:
- Simple to implement
- No code changes required
- Maintains data consistency
Disadvantages:
- Limited by hardware constraints
- Single point of failure
- Expensive at scale
2. Horizontal Scaling (Scale Out)
Horizontal scaling involves adding more machines to the pool of resources.
Advantages:
- Nearly unlimited scaling potential
- Better fault tolerance
- Cost-effective
Disadvantages:
- Complex implementation
- Data consistency challenges
- Network overhead
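Horizontal scaling typically places a load balancer in front of the added machines. As a minimal illustration, here is a round-robin reverse proxy sketch using Go's standard library (the backend addresses are hypothetical):

```go
package main

import (
	"log"
	"net/http"
	"net/http/httputil"
	"net/url"
	"sync/atomic"
)

func main() {
	// Hypothetical backend addresses; in practice these come from
	// service discovery or an auto-scaler like the one shown later.
	backends := []string{
		"http://app1.internal:8080",
		"http://app2.internal:8080",
		"http://app3.internal:8080",
	}

	proxies := make([]*httputil.ReverseProxy, len(backends))
	for i, b := range backends {
		u, err := url.Parse(b)
		if err != nil {
			log.Fatal(err)
		}
		proxies[i] = httputil.NewSingleHostReverseProxy(u)
	}

	var counter uint64
	http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
		// Round-robin: each request goes to the next backend in turn
		i := atomic.AddUint64(&counter, 1) % uint64(len(proxies))
		proxies[i].ServeHTTP(w, r)
	})

	log.Fatal(http.ListenAndServe(":8080", nil))
}
```

Real deployments add health checks and remove failed backends from rotation; dedicated load balancers (NGINX, HAProxy, or a cloud LB) handle this out of the box.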
Scalability Patterns
1. Database Scaling
Read Replicas
```python
# Python example using SQLAlchemy with read/write splitting
import random

from sqlalchemy import Boolean, Column, Integer, String, create_engine
from sqlalchemy.orm import declarative_base, sessionmaker

Base = declarative_base()


class User(Base):
    __tablename__ = "users"
    id = Column(Integer, primary_key=True)
    name = Column(String)
    email = Column(String)
    active = Column(Boolean, default=True)


class DatabaseManager:
    def __init__(self):
        # Primary database for writes
        self.write_engine = create_engine('postgresql://user:pass@primary-db:5432/mydb')
        # Read replicas for read operations
        self.read_engines = [
            create_engine('postgresql://user:pass@replica1-db:5432/mydb'),
            create_engine('postgresql://user:pass@replica2-db:5432/mydb'),
            create_engine('postgresql://user:pass@replica3-db:5432/mydb'),
        ]
        self.WriteSession = sessionmaker(bind=self.write_engine)
        self.ReadSessions = [sessionmaker(bind=engine) for engine in self.read_engines]

    def get_write_session(self):
        return self.WriteSession()

    def get_read_session(self):
        # Load balance across read replicas
        session_class = random.choice(self.ReadSessions)
        return session_class()


# Usage example
db_manager = DatabaseManager()

# For write operations
with db_manager.get_write_session() as session:
    user = User(name="John Doe", email="john@example.com")
    session.add(user)
    session.commit()

# For read operations
with db_manager.get_read_session() as session:
    users = session.query(User).filter(User.active == True).all()
```
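One caveat with read replicas: replication is typically asynchronous, so a read issued immediately after a write may return stale data. Queries that must see a user's own writes should be routed to the primary.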
Database Sharding in Go
```go
package main

import (
	"database/sql"
	"fmt"
	"hash/fnv"

	_ "github.com/lib/pq"
)

type ShardManager struct {
	shards    []*sql.DB
	numShards int
}

func NewShardManager(connectionStrings []string) (*ShardManager, error) {
	shards := make([]*sql.DB, len(connectionStrings))
	for i, connStr := range connectionStrings {
		db, err := sql.Open("postgres", connStr)
		if err != nil {
			return nil, err
		}
		shards[i] = db
	}
	return &ShardManager{
		shards:    shards,
		numShards: len(shards),
	}, nil
}

// GetShard hashes the user ID to pick a shard deterministically.
func (sm *ShardManager) GetShard(userID string) *sql.DB {
	hash := fnv.New32a()
	hash.Write([]byte(userID))
	// Take the modulo in uint32 space so the index can't go negative
	// on platforms where int is 32 bits
	shardIndex := int(hash.Sum32() % uint32(sm.numShards))
	return sm.shards[shardIndex]
}

func (sm *ShardManager) InsertUser(userID, name, email string) error {
	db := sm.GetShard(userID)
	query := `INSERT INTO users (id, name, email) VALUES ($1, $2, $3)`
	_, err := db.Exec(query, userID, name, email)
	return err
}

func (sm *ShardManager) GetUser(userID string) (User, error) {
	db := sm.GetShard(userID)
	query := `SELECT id, name, email FROM users WHERE id = $1`
	row := db.QueryRow(query, userID)
	var user User
	err := row.Scan(&user.ID, &user.Name, &user.Email)
	return user, err
}

type User struct {
	ID    string
	Name  string
	Email string
}

func main() {
	connectionStrings := []string{
		"postgres://user:pass@shard1:5432/db1",
		"postgres://user:pass@shard2:5432/db2",
		"postgres://user:pass@shard3:5432/db3",
	}
	shardManager, err := NewShardManager(connectionStrings)
	if err != nil {
		panic(err)
	}

	// Insert user - automatically routed to the correct shard
	err = shardManager.InsertUser("user123", "Alice Johnson", "alice@example.com")
	if err != nil {
		fmt.Printf("Error inserting user: %v\n", err)
	}

	// Retrieve user from the correct shard
	user, err := shardManager.GetUser("user123")
	if err != nil {
		fmt.Printf("Error getting user: %v\n", err)
	} else {
		fmt.Printf("Retrieved user: %+v\n", user)
	}
}
```
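One caveat with hash-modulo routing: it ties every key to the current shard count, so adding or removing a shard remaps most keys and forces a large data migration. Consistent hashing or a directory-based lookup reduces that churn and is worth considering if the shard count is expected to change.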
2. Application Server Scaling
Auto-scaling Implementation
```python
# Python auto-scaling logic
import time
import uuid
from threading import Thread

import requests


class AutoScaler:
    def __init__(self, min_instances=2, max_instances=10, target_cpu=70):
        self.min_instances = min_instances
        self.max_instances = max_instances
        self.target_cpu = target_cpu
        self.current_instances = []
        self.monitoring = True

    def get_average_cpu_usage(self):
        """Get average CPU usage across all instances"""
        if not self.current_instances:
            return 0
        total_cpu = 0
        active_instances = 0
        for instance in self.current_instances:
            try:
                response = requests.get(f"http://{instance}/health/cpu", timeout=5)
                if response.status_code == 200:
                    cpu_usage = response.json()['cpu_usage']
                    total_cpu += cpu_usage
                    active_instances += 1
            except requests.RequestException:
                continue
        return total_cpu / active_instances if active_instances > 0 else 0

    def scale_out(self):
        """Add a new instance"""
        if len(self.current_instances) >= self.max_instances:
            return False
        new_instance = self.create_new_instance()
        if new_instance:
            self.current_instances.append(new_instance)
            self.register_with_load_balancer(new_instance)
            print(f"Scaled out: Added instance {new_instance}")
            return True
        return False

    def scale_in(self):
        """Remove an instance"""
        if len(self.current_instances) <= self.min_instances:
            return False
        instance_to_remove = self.current_instances.pop()
        self.deregister_from_load_balancer(instance_to_remove)
        self.terminate_instance(instance_to_remove)
        print(f"Scaled in: Removed instance {instance_to_remove}")
        return True

    def create_new_instance(self):
        """Create new server instance (placeholder)"""
        # In reality, this would call cloud provider APIs
        instance_id = f"server-{uuid.uuid4().hex[:8]}"
        # Simulate instance creation
        return f"{instance_id}.internal:8080"

    def register_with_load_balancer(self, instance):
        """Register instance with load balancer"""
        try:
            requests.post("http://load-balancer/register",
                          json={"instance": instance})
        except requests.RequestException as e:
            print(f"Failed to register {instance}: {e}")

    def deregister_from_load_balancer(self, instance):
        """Deregister instance from load balancer"""
        try:
            requests.post("http://load-balancer/deregister",
                          json={"instance": instance})
        except requests.RequestException as e:
            print(f"Failed to deregister {instance}: {e}")

    def terminate_instance(self, instance):
        """Terminate server instance"""
        # In reality, this would call cloud provider APIs to terminate
        print(f"Terminating instance: {instance}")

    def monitor_and_scale(self):
        """Main monitoring and scaling loop"""
        while self.monitoring:
            try:
                avg_cpu = self.get_average_cpu_usage()
                print(f"Average CPU usage: {avg_cpu}%")
                if avg_cpu > self.target_cpu + 10:  # Scale-out threshold
                    self.scale_out()
                elif avg_cpu < self.target_cpu - 20:  # Scale-in threshold
                    self.scale_in()
                time.sleep(60)  # Check every minute
            except Exception as e:
                print(f"Error in monitoring: {e}")
                time.sleep(30)

    def start_monitoring(self):
        """Start monitoring in a background thread"""
        monitor_thread = Thread(target=self.monitor_and_scale)
        monitor_thread.daemon = True
        monitor_thread.start()


# Usage
if __name__ == "__main__":
    scaler = AutoScaler(min_instances=2, max_instances=10, target_cpu=70)
    scaler.current_instances = ["server1:8080", "server2:8080"]  # Initial instances
    scaler.start_monitoring()

    # Keep the main thread alive
    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        scaler.monitoring = False
        print("Shutting down auto-scaler...")
```
3. Microservices Scaling
```go
// Go microservice with built-in health checks and metrics
package main

import (
	"encoding/json"
	"log"
	"net/http"
	"runtime"
	"sync"
	"time"

	"github.com/gorilla/mux"
	"github.com/shirou/gopsutil/cpu"
	"github.com/shirou/gopsutil/mem"
)

type HealthStatus struct {
	Status      string    `json:"status"`
	Timestamp   time.Time `json:"timestamp"`
	CPUUsage    float64   `json:"cpu_usage"`
	MemoryUsage float64   `json:"memory_usage"`
	Goroutines  int       `json:"goroutines"`
	Uptime      string    `json:"uptime"`
}

type UserService struct {
	users     map[string]User
	mutex     sync.RWMutex
	startTime time.Time
}

type User struct {
	ID    string `json:"id"`
	Name  string `json:"name"`
	Email string `json:"email"`
}

func NewUserService() *UserService {
	return &UserService{
		users:     make(map[string]User),
		startTime: time.Now(),
	}
}

func (us *UserService) CreateUser(w http.ResponseWriter, r *http.Request) {
	var user User
	if err := json.NewDecoder(r.Body).Decode(&user); err != nil {
		http.Error(w, "Invalid JSON", http.StatusBadRequest)
		return
	}
	us.mutex.Lock()
	us.users[user.ID] = user
	us.mutex.Unlock()

	w.Header().Set("Content-Type", "application/json")
	w.WriteHeader(http.StatusCreated)
	json.NewEncoder(w).Encode(user)
}

func (us *UserService) GetUser(w http.ResponseWriter, r *http.Request) {
	vars := mux.Vars(r)
	userID := vars["id"]

	us.mutex.RLock()
	user, exists := us.users[userID]
	us.mutex.RUnlock()

	if !exists {
		http.Error(w, "User not found", http.StatusNotFound)
		return
	}
	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(user)
}

func (us *UserService) HealthCheck(w http.ResponseWriter, r *http.Request) {
	// Get CPU usage (cpu.Percent blocks for the sampling interval)
	cpuPercent, err := cpu.Percent(time.Second, false)
	var cpuUsage float64
	if err == nil && len(cpuPercent) > 0 {
		cpuUsage = cpuPercent[0]
	}

	// Get memory usage
	memStats, err := mem.VirtualMemory()
	var memUsage float64
	if err == nil {
		memUsage = memStats.UsedPercent
	}

	status := HealthStatus{
		Status:      "healthy",
		Timestamp:   time.Now(),
		CPUUsage:    cpuUsage,
		MemoryUsage: memUsage,
		Goroutines:  runtime.NumGoroutine(),
		Uptime:      time.Since(us.startTime).String(),
	}

	// Headers must be set before WriteHeader; mark as unhealthy if
	// resource usage is too high
	w.Header().Set("Content-Type", "application/json")
	if cpuUsage > 90 || memUsage > 95 {
		status.Status = "unhealthy"
		w.WriteHeader(http.StatusServiceUnavailable)
	}
	json.NewEncoder(w).Encode(status)
}

func (us *UserService) ListUsers(w http.ResponseWriter, r *http.Request) {
	us.mutex.RLock()
	users := make([]User, 0, len(us.users))
	for _, user := range us.users {
		users = append(users, user)
	}
	us.mutex.RUnlock()

	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(users)
}

func main() {
	userService := NewUserService()
	r := mux.NewRouter()

	// API routes
	api := r.PathPrefix("/api/v1").Subrouter()
	api.HandleFunc("/users", userService.CreateUser).Methods("POST")
	api.HandleFunc("/users/{id}", userService.GetUser).Methods("GET")
	api.HandleFunc("/users", userService.ListUsers).Methods("GET")

	// Health check routes (the /health/cpu endpoint is what the
	// auto-scaler above polls)
	r.HandleFunc("/health", userService.HealthCheck).Methods("GET")
	r.HandleFunc("/health/cpu", func(w http.ResponseWriter, r *http.Request) {
		cpuPercent, _ := cpu.Percent(time.Second, false)
		var cpuUsage float64
		if len(cpuPercent) > 0 {
			cpuUsage = cpuPercent[0]
		}
		response := map[string]float64{"cpu_usage": cpuUsage}
		w.Header().Set("Content-Type", "application/json")
		json.NewEncoder(w).Encode(response)
	}).Methods("GET")

	// HTTP server with timeouts so slow clients can't hold
	// connections open indefinitely
	srv := &http.Server{
		Addr:         ":8080",
		Handler:      r,
		ReadTimeout:  15 * time.Second,
		WriteTimeout: 15 * time.Second,
		IdleTimeout:  60 * time.Second,
	}

	log.Printf("User service starting on :8080")
	log.Fatal(srv.ListenAndServe())
}
```
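Note that this service keeps users in an in-process map for brevity, which makes each instance stateful. A horizontally scaled deployment would back the handlers with a shared datastore (see the database scaling patterns above) so that any replica can serve any request.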
Scalability Metrics and Monitoring
Key Metrics to Track
```python
# Python monitoring system for scalability metrics
import random
import threading
import time
from dataclasses import dataclass
from typing import Dict, List, Tuple

import psutil


@dataclass
class ScalabilityMetrics:
    timestamp: float
    requests_per_second: float
    response_time_p50: float
    response_time_p95: float
    response_time_p99: float
    cpu_usage: float
    memory_usage: float
    active_connections: int
    error_rate: float
    throughput_mbps: float


class MetricsCollector:
    def __init__(self):
        self.metrics_history: List[ScalabilityMetrics] = []
        # Each entry is (arrival timestamp, response time in ms)
        self.requests: List[Tuple[float, float]] = []
        self.error_count = 0
        self.total_requests = 0
        self.lock = threading.Lock()
        # Baseline network counters so throughput can be reported as a rate
        self.last_net = psutil.net_io_counters()
        self.last_net_time = time.time()

    def record_request(self, response_time: float, is_error: bool = False):
        """Record a request's response time and error status"""
        with self.lock:
            self.requests.append((time.time(), response_time))
            self.total_requests += 1
            if is_error:
                self.error_count += 1

    def calculate_percentiles(self, values: List[float]) -> Dict[str, float]:
        """Calculate percentiles for response times"""
        if not values:
            return {"p50": 0, "p95": 0, "p99": 0}
        sorted_values = sorted(values)
        length = len(sorted_values)
        return {
            "p50": sorted_values[min(int(length * 0.5), length - 1)],
            "p95": sorted_values[min(int(length * 0.95), length - 1)],
            "p99": sorted_values[min(int(length * 0.99), length - 1)],
        }

    def collect_system_metrics(self) -> ScalabilityMetrics:
        """Collect current system metrics"""
        current_time = time.time()
        with self.lock:
            # Keep only requests from the last minute, keyed by arrival time
            recent = [(ts, rt) for ts, rt in self.requests if current_time - ts < 60]
            rps = len(recent) / 60.0
            percentiles = self.calculate_percentiles([rt for _, rt in recent])
            error_rate = (self.error_count / self.total_requests * 100) if self.total_requests > 0 else 0
            # Reset counters for the next collection window
            self.requests = []
            self.error_count = 0
            self.total_requests = 0

        # System metrics, collected outside the lock (cpu_percent blocks for 1s)
        cpu_usage = psutil.cpu_percent(interval=1)
        memory = psutil.virtual_memory()
        network = psutil.net_io_counters()
        now = time.time()
        elapsed = max(now - self.last_net_time, 1e-6)
        prev = self.last_net
        bytes_delta = (network.bytes_sent + network.bytes_recv) - (prev.bytes_sent + prev.bytes_recv)
        throughput = bytes_delta / elapsed / (1024 * 1024)  # MB/s since the last sample
        self.last_net, self.last_net_time = network, now

        metrics = ScalabilityMetrics(
            timestamp=current_time,
            requests_per_second=rps,
            response_time_p50=percentiles["p50"],
            response_time_p95=percentiles["p95"],
            response_time_p99=percentiles["p99"],
            cpu_usage=cpu_usage,
            memory_usage=memory.percent,
            active_connections=len(psutil.net_connections()),
            error_rate=error_rate,
            throughput_mbps=throughput,
        )
        self.metrics_history.append(metrics)
        return metrics

    def should_scale_out(self, current_metrics: ScalabilityMetrics) -> bool:
        """Determine if the system should scale out based on metrics"""
        return (
            current_metrics.cpu_usage > 80 or
            current_metrics.memory_usage > 85 or
            current_metrics.response_time_p95 > 2000 or  # 2 seconds
            current_metrics.error_rate > 5  # 5%
        )

    def should_scale_in(self, current_metrics: ScalabilityMetrics) -> bool:
        """Determine if the system can scale in based on metrics"""
        recent_metrics = self.metrics_history[-5:]  # Last 5 measurements
        if len(recent_metrics) < 5:
            return False
        avg_cpu = sum(m.cpu_usage for m in recent_metrics) / len(recent_metrics)
        avg_memory = sum(m.memory_usage for m in recent_metrics) / len(recent_metrics)
        avg_response_time = sum(m.response_time_p95 for m in recent_metrics) / len(recent_metrics)
        return (
            avg_cpu < 30 and
            avg_memory < 40 and
            avg_response_time < 500 and  # 500ms
            current_metrics.error_rate < 1  # 1%
        )


# Usage example
collector = MetricsCollector()


# Simulate recording requests
def simulate_load():
    while True:
        response_time = random.uniform(100, 1000)  # 100ms to 1s
        is_error = random.random() < 0.02  # 2% error rate
        collector.record_request(response_time, is_error)
        time.sleep(0.1)


# Start load simulation
threading.Thread(target=simulate_load, daemon=True).start()

# Monitor and make scaling decisions
while True:
    metrics = collector.collect_system_metrics()
    print(f"Metrics: RPS={metrics.requests_per_second:.1f}, "
          f"P95={metrics.response_time_p95:.0f}ms, "
          f"CPU={metrics.cpu_usage:.1f}%, "
          f"Memory={metrics.memory_usage:.1f}%, "
          f"Error Rate={metrics.error_rate:.1f}%")

    if collector.should_scale_out(metrics):
        print("🚀 Recommendation: Scale OUT")
    elif collector.should_scale_in(metrics):
        print("📉 Recommendation: Scale IN")
    else:
        print("✅ Current capacity is optimal")

    time.sleep(10)
```
Best Practices for Scalability
1. Design for Horizontal Scaling from Day One
- Use stateless application servers
- Implement proper session management (see the session-store sketch below)
- Design database schema with sharding in mind
- Use microservices architecture when appropriate
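For example, session state can be pushed out of the application servers entirely, so any server can handle any user's request. A minimal sketch, assuming a Redis instance at localhost:6379 and the github.com/redis/go-redis/v9 client:

```go
package main

import (
	"context"
	"fmt"
	"time"

	"github.com/redis/go-redis/v9"
)

// SessionStore keeps session data in Redis instead of server memory,
// so application servers stay stateless and interchangeable.
type SessionStore struct {
	client *redis.Client
	ttl    time.Duration
}

func NewSessionStore(addr string, ttl time.Duration) *SessionStore {
	return &SessionStore{
		client: redis.NewClient(&redis.Options{Addr: addr}),
		ttl:    ttl,
	}
}

func (s *SessionStore) Set(ctx context.Context, sessionID, data string) error {
	return s.client.Set(ctx, "session:"+sessionID, data, s.ttl).Err()
}

func (s *SessionStore) Get(ctx context.Context, sessionID string) (string, error) {
	return s.client.Get(ctx, "session:"+sessionID).Result()
}

func main() {
	ctx := context.Background()
	store := NewSessionStore("localhost:6379", 30*time.Minute)

	if err := store.Set(ctx, "abc123", `{"user_id":"42"}`); err != nil {
		fmt.Println("set failed:", err)
		return
	}
	data, err := store.Get(ctx, "abc123")
	if err != nil {
		fmt.Println("get failed:", err)
		return
	}
	fmt.Println("session data:", data)
}
```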
2. Implement Proper Monitoring
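Instrument every tier and track the metrics covered in the previous section: request rate, latency percentiles (p50/p95/p99), error rate, and resource saturation. Scaling decisions should be driven by these numbers rather than intuition.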
3. Use Appropriate Data Storage
- Relational databases: For ACID transactions
- NoSQL databases: For flexible schemas and horizontal scaling
- Caching layers: For frequently accessed data (see the cache-aside sketch after this list)
- CDNs: For static content delivery
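To illustrate the caching layer, here is a minimal cache-aside sketch. The in-process TTL map stands in for a shared cache such as Redis or Memcached, and loadUserName stands in for an expensive database query:

```go
package main

import (
	"fmt"
	"sync"
	"time"
)

type entry struct {
	value     string
	expiresAt time.Time
}

// Cache is a minimal in-process TTL cache used only to illustrate the
// pattern; production systems would typically use a shared cache so
// entries survive restarts and are visible to all replicas.
type Cache struct {
	mu    sync.RWMutex
	items map[string]entry
	ttl   time.Duration
}

func NewCache(ttl time.Duration) *Cache {
	return &Cache{items: make(map[string]entry), ttl: ttl}
}

func (c *Cache) Get(key string) (string, bool) {
	c.mu.RLock()
	defer c.mu.RUnlock()
	e, ok := c.items[key]
	if !ok || time.Now().After(e.expiresAt) {
		return "", false
	}
	return e.value, true
}

func (c *Cache) Set(key, value string) {
	c.mu.Lock()
	defer c.mu.Unlock()
	c.items[key] = entry{value: value, expiresAt: time.Now().Add(c.ttl)}
}

// loadUserName stands in for an expensive database query.
func loadUserName(userID string) string {
	time.Sleep(100 * time.Millisecond) // simulate DB latency
	return "user-" + userID
}

func main() {
	cache := NewCache(5 * time.Minute)

	getUserName := func(userID string) string {
		// Cache-aside: check the cache first, fall back to the
		// database on a miss, then populate the cache
		if name, ok := cache.Get(userID); ok {
			return name
		}
		name := loadUserName(userID)
		cache.Set(userID, name)
		return name
	}

	fmt.Println(getUserName("42")) // miss: hits the "database"
	fmt.Println(getUserName("42")) // hit: served from cache
}
```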
4. Implement Circuit Breakers
```go
// Circuit breaker pattern in Go
package main

import (
	"errors"
	"sync"
	"time"
)

type CircuitState int

const (
	Closed CircuitState = iota
	Open
	HalfOpen
)

type CircuitBreaker struct {
	maxFailures     int
	resetTimeout    time.Duration
	state           CircuitState
	failures        int
	lastFailureTime time.Time
	mutex           sync.Mutex
}

func NewCircuitBreaker(maxFailures int, resetTimeout time.Duration) *CircuitBreaker {
	return &CircuitBreaker{
		maxFailures:  maxFailures,
		resetTimeout: resetTimeout,
		state:        Closed,
	}
}

// Execute runs fn through the breaker. For simplicity the lock is held
// while fn runs, which serializes calls; production implementations
// release it around the protected call.
func (cb *CircuitBreaker) Execute(fn func() error) error {
	cb.mutex.Lock()
	defer cb.mutex.Unlock()

	if cb.state == Open {
		if time.Since(cb.lastFailureTime) >= cb.resetTimeout {
			// Let a single trial request through
			cb.state = HalfOpen
		} else {
			return errors.New("circuit breaker is open")
		}
	}

	err := fn()
	if err != nil {
		cb.failures++
		cb.lastFailureTime = time.Now()
		// A failure during the half-open trial re-opens immediately;
		// in the closed state we trip only after maxFailures failures
		if cb.state == HalfOpen || cb.failures >= cb.maxFailures {
			cb.state = Open
		}
		return err
	}

	// Success - reset circuit breaker
	cb.failures = 0
	cb.state = Closed
	return nil
}
```
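A quick usage sketch, continuing the same package (flakyCall is a placeholder for a real downstream dependency, and "fmt" would need to be added to the imports):

```go
func flakyCall() error {
	return errors.New("downstream timeout")
}

func main() {
	cb := NewCircuitBreaker(3, 30*time.Second)
	for i := 0; i < 5; i++ {
		if err := cb.Execute(flakyCall); err != nil {
			fmt.Println("call failed:", err)
		}
	}
	// After 3 consecutive failures the breaker opens, so the remaining
	// calls fail fast without touching the downstream service.
}
```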
Common Scalability Pitfalls
- Database bottlenecks: Not optimizing queries or indices
- Session stickiness: Storing session data on application servers
- Synchronous processing: Blocking operations that don't scale (see the worker-queue sketch after this list)
- No caching strategy: Repeated expensive operations
- Ignoring network latency: Not considering geographic distribution
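To make the synchronous-processing pitfall concrete, here is a minimal sketch of moving slow work off the request path and onto a worker queue. The in-process channel stands in for a durable broker such as Kafka or RabbitMQ:

```go
package main

import (
	"fmt"
	"sync"
	"time"
)

// job represents work (e.g., sending an email) that should not block
// the request path.
type job struct {
	ID int
}

func main() {
	jobs := make(chan job, 100) // buffered queue decouples producers from consumers
	var wg sync.WaitGroup

	// A fixed pool of workers processes jobs concurrently; scaling
	// throughput is a matter of adding workers (or machines reading
	// from a shared broker).
	for w := 1; w <= 3; w++ {
		wg.Add(1)
		go func(worker int) {
			defer wg.Done()
			for j := range jobs {
				time.Sleep(50 * time.Millisecond) // simulate slow work
				fmt.Printf("worker %d processed job %d\n", worker, j.ID)
			}
		}(w)
	}

	// The "request handler" enqueues and returns immediately instead
	// of doing the slow work inline.
	for i := 1; i <= 10; i++ {
		jobs <- job{ID: i}
	}
	close(jobs)
	wg.Wait()
}
```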
Conclusion
Scalability is not just about handling more users; it's about building systems that can grow efficiently and cost-effectively. The key principles are:
- Plan for scale early: Design with horizontal scaling in mind
- Monitor everything: Use metrics to make informed scaling decisions
- Automate scaling: Implement auto-scaling based on real metrics
- Optimize continuously: Regular performance testing and optimization
- Consider the entire stack: From application code to database to infrastructure
Remember that premature optimization can be as harmful as not planning for scale. Start with simple solutions and add complexity only when needed, guided by real data and metrics.