Skip to main content

IoT & Microcontroller Deployment

TF Lite Micro, edge TPUs, Jetson deployment, real-world constraints, and OTA model updates

~40 min
Listen to this lesson

IoT & Microcontroller Deployment

The most extreme edge: deploying ML models on microcontrollers with kilobytes of RAM, milliwatts of power budget, and no operating system. Welcome to TinyML — where a model must fit in less memory than a single image takes on your phone.

Microcontroller Constraints

A typical microcontroller (Arduino Nano, ESP32, STM32) has 256KB-2MB of Flash memory and 32KB-520KB of RAM. Your entire model — weights, activation buffers, and inference engine — must fit within these limits. For comparison, a smartphone has 4-8 GIGABYTES of RAM, roughly 10,000x more.

TensorFlow Lite for Microcontrollers (TFLM)

A stripped-down version of TF Lite designed for bare-metal microcontrollers:

  • No OS required: Runs on bare metal
  • No dynamic memory allocation: All memory is pre-allocated at compile time
  • Tiny footprint: Core runtime is ~20KB
  • Supported ops: Subset of TF Lite operators optimized for MCUs
Target Hardware

    PlatformFlashRAMUse Cases
    Arduino Nano 33 BLE1 MB256 KBKeyword spotting, gesture recognition
    ESP324-16 MB520 KBWake word, anomaly detection
    STM32F7461 MB320 KBPredictive maintenance, simple vision
    Raspberry Pi Pico2 MB264 KBSensor classification

    The TinyML Pipeline

    Train model (PC/cloud)
      → Convert to TF Lite (INT8 quantized)
        → Convert to C array (xxd)
          → Compile into firmware
            → Flash to microcontroller
    

    python
    1# TinyML model preparation pipeline
    2import numpy as np
    3
    4class TinyMLPreparer:
    5    """Prepare a model for microcontroller deployment."""
    6
    7    def __init__(self, model_path: str, target_flash_kb: int,
    8                 target_ram_kb: int):
    9        self.model_path = model_path
    10        self.target_flash_kb = target_flash_kb
    11        self.target_ram_kb = target_ram_kb
    12
    13    def analyze_model(self, model_size_bytes: int,
    14                       peak_activation_bytes: int,
    15                       runtime_overhead_kb: int = 20):
    16        """Check if a model fits on the target microcontroller.
    17
    18        Args:
    19            model_size_bytes: Size of quantized model weights
    20            peak_activation_bytes: Peak memory for activations during inference
    21            runtime_overhead_kb: TFLM runtime overhead (~20KB)
    22        """
    23        model_kb = model_size_bytes / 1024
    24        activation_kb = peak_activation_bytes / 1024
    25
    26        total_flash_kb = model_kb + runtime_overhead_kb
    27        total_ram_kb = activation_kb + 2  # 2KB stack overhead
    28
    29        flash_ok = total_flash_kb <= self.target_flash_kb
    30        ram_ok = total_ram_kb <= self.target_ram_kb
    31
    32        print(f"=== TinyML Deployment Analysis ===")
    33        print(f"Target: Flash={self.target_flash_kb}KB, RAM={self.target_ram_kb}KB")
    34        print()
    35        print(f"Flash usage:")
    36        print(f"  Model weights: {model_kb:.1f} KB")
    37        print(f"  TFLM runtime:  {runtime_overhead_kb} KB")
    38        print(f"  Total:         {total_flash_kb:.1f} / {self.target_flash_kb} KB "
    39              f"({'OK' if flash_ok else 'EXCEEDS LIMIT'})")
    40        print()
    41        print(f"RAM usage:")
    42        print(f"  Activations:   {activation_kb:.1f} KB")
    43        print(f"  Stack:         2 KB")
    44        print(f"  Total:         {total_ram_kb:.1f} / {self.target_ram_kb} KB "
    45              f"({'OK' if ram_ok else 'EXCEEDS LIMIT'})")
    46        print()
    47
    48        if flash_ok and ram_ok:
    49            flash_util = total_flash_kb / self.target_flash_kb * 100
    50            ram_util = total_ram_kb / self.target_ram_kb * 100
    51            print(f"DEPLOYABLE! Flash: {flash_util:.0f}%, RAM: {ram_util:.0f}%")
    52        else:
    53            if not flash_ok:
    54                reduction = (total_flash_kb - self.target_flash_kb)
    55                print(f"Need to reduce model by {reduction:.0f} KB")
    56            if not ram_ok:
    57                reduction = (total_ram_kb - self.target_ram_kb)
    58                print(f"Need to reduce activations by {reduction:.0f} KB")
    59
    60        return flash_ok and ram_ok
    61
    62
    63def estimate_model_memory(layers, dtype_bytes=1):
    64    """Estimate model size and peak activation memory.
    65
    66    Args:
    67        layers: List of (type, params) tuples
    68            - ("dense", (input_dim, output_dim))
    69            - ("conv2d", (in_ch, out_ch, kernel_h, kernel_w))
    70        dtype_bytes: Bytes per weight (1 for INT8, 4 for FP32)
    71    """
    72    total_weights = 0
    73    peak_activation = 0
    74    current_activation = 0
    75
    76    print("Layer-by-layer analysis:")
    77    for i, (layer_type, params) in enumerate(layers):
    78        if layer_type == "dense":
    79            in_dim, out_dim = params
    80            weights = in_dim * out_dim + out_dim  # weights + bias
    81            activation = out_dim * dtype_bytes
    82        elif layer_type == "conv2d":
    83            in_ch, out_ch, kh, kw = params
    84            weights = in_ch * out_ch * kh * kw + out_ch
    85            activation = out_ch * 16 * 16 * dtype_bytes  # assume 16x16 output
    86
    87        total_weights += weights
    88        current_activation = activation
    89        peak_activation = max(peak_activation, current_activation)
    90
    91        weight_kb = weights * dtype_bytes / 1024
    92        act_kb = activation / 1024
    93        print(f"  Layer {i}: {layer_type} {params} -> "
    94              f"weights={weight_kb:.1f}KB, act={act_kb:.1f}KB")
    95
    96    model_bytes = total_weights * dtype_bytes
    97    print(f"\nTotal weights: {total_weights:,} ({model_bytes/1024:.1f} KB)")
    98    print(f"Peak activation: {peak_activation/1024:.1f} KB")
    99
    100    return model_bytes, peak_activation
    101
    102
    103# --- Example: Keyword spotting model for Arduino Nano ---
    104print("=" * 50)
    105print("Keyword Spotting Model ("Hey Device")")
    106print("=" * 50)
    107
    108layers = [
    109    ("conv2d", (1, 8, 3, 3)),      # 8 filters, 3x3
    110    ("conv2d", (8, 16, 3, 3)),     # 16 filters, 3x3
    111    ("dense", (16 * 16 * 16, 64)), # Flatten + dense
    112    ("dense", (64, 4)),            # 4 classes: hey_device, unknown, silence, noise
    113]
    114
    115model_bytes, peak_act = estimate_model_memory(layers, dtype_bytes=1)
    116
    117print()
    118preparer = TinyMLPreparer("keyword_model.tflite",
    119                           target_flash_kb=256,
    120                           target_ram_kb=64)
    121preparer.analyze_model(model_bytes, peak_act)

    Edge TPUs and Accelerators

    For workloads beyond what a CPU microcontroller can handle, dedicated edge ML accelerators provide 10-100x speedup:

    Google Coral Edge TPU

  • USB stick or module form factor
  • 4 TOPS (trillion operations per second)
  • ~2W power consumption
  • Runs INT8 TF Lite models only
  • Ideal for: real-time object detection, image classification
NVIDIA Jetson Family

    ModelGPU CoresAI PerformancePowerUse Case
    Jetson Nano128 CUDA472 GFLOPS5-10WHobbyist, prototype
    Jetson Xavier NX384 CUDA + Tensor21 TOPS10-15WDrones, robots
    Jetson Orin Nano1024 CUDA + Tensor40 TOPS7-15WProduction edge AI
    Jetson AGX Orin2048 CUDA + Tensor275 TOPS15-60WAutonomous vehicles

    Real-World Constraints

    Memory

  • Flash (non-volatile): stores model weights and code
  • RAM (volatile): stores activations during inference
  • Both are severely limited on MCUs
Power

  • Battery-powered devices need microjoules per inference
  • Always-on sensing requires aggressive duty cycling
  • Wake-on-event: run cheap detector, wake full model only when triggered
Latency

  • Safety-critical applications need sub-millisecond response
  • Network round-trip to cloud takes 50-500ms — unacceptable for autonomous driving
  • Edge inference provides consistent, predictable latency
OTA (Over-The-Air) Model Updates

    Deployed edge models need updates for:

  • Fixing bugs and accuracy regressions
  • Adapting to data drift
  • Adding new capabilities
OTA Update Pipeline

    1. Train new model version in the cloud
    2. Validate on held-out data and shadow deployment
    3. Package the model as a firmware update
    4. Distribute to devices (staged rollout: 1% → 10% → 100%)
    5. Verify successful update on each device
    6. Roll back if metrics degrade

    python
    1# OTA Model Update Manager
    2from dataclasses import dataclass, field
    3from typing import List, Dict, Optional
    4from datetime import datetime
    5import random
    6
    7@dataclass
    8class ModelVersion:
    9    version: str
    10    size_kb: float
    11    accuracy: float
    12    created_at: str
    13
    14@dataclass
    15class Device:
    16    device_id: str
    17    model_version: str
    18    hardware: str
    19    last_seen: str
    20    status: str = "online"  # online, offline, updating
    21
    22class OTAUpdateManager:
    23    def __init__(self):
    24        self.devices: Dict[str, Device] = {}
    25        self.model_versions: Dict[str, ModelVersion] = {}
    26        self.rollout_log: List[dict] = []
    27
    28    def register_device(self, device: Device):
    29        self.devices[device.device_id] = device
    30
    31    def add_model_version(self, version: ModelVersion):
    32        self.model_versions[version.version] = version
    33
    34    def staged_rollout(self, target_version: str,
    35                        stages: List[float] = [0.01, 0.1, 0.5, 1.0],
    36                        min_success_rate: float = 0.95):
    37        """Perform a staged rollout to all online devices."""
    38        model = self.model_versions.get(target_version)
    39        if not model:
    40            print(f"Model version {target_version} not found!")
    41            return
    42
    43        online_devices = [d for d in self.devices.values()
    44                          if d.status == "online"
    45                          and d.model_version != target_version]
    46
    47        print(f"Staged rollout: v{target_version}")
    48        print(f"Target devices: {len(online_devices)}")
    49        print(f"Stages: {[f'{s:.0%}' for s in stages]}")
    50        print()
    51
    52        updated_devices = []
    53
    54        for stage_pct in stages:
    55            n_target = int(len(online_devices) * stage_pct)
    56            n_remaining = n_target - len(updated_devices)
    57
    58            if n_remaining <= 0:
    59                continue
    60
    61            candidates = [d for d in online_devices
    62                          if d not in updated_devices][:n_remaining]
    63
    64            # Simulate update (some may fail)
    65            successes = 0
    66            failures = 0
    67            for device in candidates:
    68                success = random.random() < 0.97  # 97% success rate
    69                if success:
    70                    device.model_version = target_version
    71                    successes += 1
    72                else:
    73                    failures += 1
    74                updated_devices.append(device)
    75
    76            success_rate = successes / len(candidates) if candidates else 1
    77            print(f"Stage {stage_pct:.0%}: "
    78                  f"{successes}/{len(candidates)} succeeded "
    79                  f"({success_rate:.1%})")
    80
    81            if success_rate < min_success_rate:
    82                print(f"HALT: Success rate {success_rate:.1%} below "
    83                      f"threshold {min_success_rate:.1%}")
    84                print("Rolling back failed devices...")
    85                return False
    86
    87        total_updated = sum(
    88            1 for d in self.devices.values()
    89            if d.model_version == target_version
    90        )
    91        print(f"\nRollout complete: {total_updated}/{len(self.devices)} "
    92              f"devices on v{target_version}")
    93        return True
    94
    95    def fleet_status(self):
    96        versions = {}
    97        for d in self.devices.values():
    98            versions[d.model_version] = versions.get(d.model_version, 0) + 1
    99
    100        print("\n=== Fleet Status ===")
    101        print(f"Total devices: {len(self.devices)}")
    102        for v, count in sorted(versions.items()):
    103            pct = count / len(self.devices) * 100
    104            bar = "#" * int(pct / 2)
    105            print(f"  v{v}: {count:>4d} ({pct:>5.1f}%) {bar}")
    106
    107
    108# --- Simulate an IoT fleet ---
    109random.seed(42)
    110manager = OTAUpdateManager()
    111
    112# Register model versions
    113manager.add_model_version(ModelVersion("1.0", 45.2, 0.89, "2024-01-01"))
    114manager.add_model_version(ModelVersion("2.0", 42.8, 0.93, "2024-03-01"))
    115
    116# Register 100 devices
    117for i in range(100):
    118    device = Device(
    119        device_id=f"device_{i:03d}",
    120        model_version="1.0",
    121        hardware="ESP32",
    122        last_seen="2024-03-15",
    123        status="online" if random.random() > 0.05 else "offline",
    124    )
    125    manager.register_device(device)
    126
    127manager.fleet_status()
    128print()
    129manager.staged_rollout("2.0")
    130manager.fleet_status()

    Brick Prevention

    OTA updates on embedded devices carry a real risk of 'bricking' — rendering the device non-functional. Always implement: (1) dual-partition firmware with fallback, (2) checksum verification before applying updates, (3) watchdog timers that revert if the new firmware crashes, and (4) staged rollouts that catch issues before they affect the entire fleet.