Stopping systemd services under memory pressure

Do you have a "favourite" server that is responsible for just a few too many things?

I know I have one: it's my CI runner, but it also serves as a platform for experimenting with local LLMs, and it is a Nix builder for all the other machines I have. Lots of responsibilities! Luckily, I don't need it to do all of those things at the same time; it just didn't know that until now. I would run a local build, it would get sent to that machine, but the machine would have, e.g., an LLM loaded in memory and the build would run out of memory (OOM). I would then need to log in, stop the LLM service and restart the build. What if the computer could do all of that by itself?!

I present to you a simple service that monitors available memory and, based on that, stops and starts other services dynamically. The code is written in Nix, which helps with keeping the service code and its definition in one place, making sure paths make sense, etc. However, there is nothing fundamentally Nix-specific about the service itself; you can just take the Bash code and use it anywhere.

Speaking of usage: from Nix you can use it like so. Say you want to stop the LLM server when available memory drops below 8GB and start it again when it rises above whatever the LLM needs (in my case, 64GB):

# Example configuration: enable the memory manager and tell it what to manage.
services.custom.memoryManager = {
  enable = true;
  # Stop the managed services once available memory drops below 8192 MB (8 GB).
  stopThresholdMB = 8192;
  # Start them again once available memory rises above what the LLM needs.
  # NOTE: services.llm.memoryRequirementMB is not defined in this module;
  # presumably it is an option from the author's own configuration.
  startThresholdMB = config.services.llm.memoryRequirementMB;
  # Check available memory every 30 seconds.
  checkIntervalSeconds = 30;
  # systemd services to stop and start.
  managedServices = [ "llama-cpp-fast" ];
};

And this gives you:

Dec 15 12:05:22 memory-manager: Memory: 7543MB available | Decision: STOP (below 8192MB)
Dec 15 12:05:22 memory-manager: LOW MEMORY: Stopping llama-cpp-fast
Dec 15 12:05:23 memory-manager: llama-cpp-fast stopped successfully
Dec 15 12:08:53 memory-manager: Memory: 88902MB available | Decision: START (above 65536MB)
Dec 15 12:08:53 memory-manager: MEMORY OK: Starting llama-cpp-fast
Dec 15 12:08:54 memory-manager: llama-cpp-fast started successfully

And here is the module definition itself. The code is not the prettiest, but it already helps a lot!

# NixOS module providing services.custom.memoryManager: a systemd service
# that stops and starts other services based on available memory.
{
  config,
  lib,
  pkgs,
  ...
}:

# Bring lib's attributes (mkOption, mkIf, mkEnableOption, types,
# concatStringsSep, ...) into scope so they can be used unqualified below.
with lib;

let
  # Shorthand for this module's own option values.
  cfg = config.services.custom.memoryManager;

  # Bash script executed by the memory-manager systemd unit. The module's
  # option values are interpolated into the script as shell variables when the
  # script is built:
  #   STOP_THRESHOLD_MB  - from cfg.stopThresholdMB
  #   START_THRESHOLD_MB - from cfg.startThresholdMB
  #   CHECK_INTERVAL     - from cfg.checkIntervalSeconds (passed to sleep)
  #   MANAGED_SERVICES   - cfg.managedServices joined with single spaces
  #   STATE_DIR          - directory holding the "<service>.stopped" markers
  #   DEBUG              - "true" or "false", from cfg.debug
  # The script runs with errexit, nounset and pipefail enabled and creates
  # STATE_DIR on startup if it does not exist yet.
  monitorScript = pkgs.writeShellScript "memory-manager" ''
    set -euo pipefail

    STOP_THRESHOLD_MB=${toString cfg.stopThresholdMB}
    START_THRESHOLD_MB=${toString cfg.startThresholdMB}
    CHECK_INTERVAL=${toString cfg.checkIntervalSeconds}
    MANAGED_SERVICES="${concatStringsSep " " cfg.managedServices}"
    STATE_DIR="/run/memory-manager"
    DEBUG=${if cfg.debug then "true" else "false"}

    mkdir -p "$STATE_DIR"

    # Print one log message, followed by a newline, to stdout.
    #   $1 - message text
    # printf is used instead of echo so that a message which happens to look
    # like an echo option (for example "-n" or "-e") is printed verbatim
    # instead of being consumed as an option.
    log() {
      printf '%s\n' "$1"
    }

    # Print the MemAvailable value from /proc/meminfo divided by 1024
    # (kB to MB), truncated to an integer, with no trailing newline.
    # Prints nothing if the file has no MemAvailable line.
    get_available_memory_mb() {
      local meminfo_path="/proc/meminfo"
      ${pkgs.gawk}/bin/awk '/^MemAvailable:/ { mb = $2 / 1024; printf "%d", mb }' "$meminfo_path"
    }

    # Succeed (exit status 0) when the marker file "<service>.stopped" exists
    # as a regular file in STATE_DIR, i.e. mark_service_stopped was called for
    # this service and mark_service_started has not removed the marker since.
    #   $1 - service name
    is_service_stopped_by_us() {
      local marker="$STATE_DIR/$1.stopped"
      test -f "$marker"
    }

    # Record that this script stopped a service by creating (or refreshing)
    # the marker file "<service>.stopped" in STATE_DIR.
    #   $1 - service name
    mark_service_stopped() {
      local marker="$STATE_DIR/$1.stopped"
      touch "$marker"
    }

    # Forget that this script stopped a service by deleting its marker file
    # "<service>.stopped" from STATE_DIR. A missing marker is not an error.
    #   $1 - service name
    mark_service_started() {
      local marker="$STATE_DIR/$1.stopped"
      rm -f "$marker"
    }

    # Stop one managed service if it is currently active.
    #   $1 - service name
    # A marker is written (mark_service_stopped) only when the unit is
    # confirmed inactive afterwards; otherwise a warning is logged. A service
    # that is already inactive is left alone and gets no marker.
    stop_service() {
      local unit="$1"
      local systemctl="${pkgs.systemd}/bin/systemctl"

      # Not running: nothing to stop.
      "$systemctl" is-active --quiet "$unit" || return 0

      log "LOW MEMORY: Stopping $unit"

      # The exit status and stderr of this call are deliberately ignored;
      # the is-active check below decides whether the stop worked.
      "$systemctl" disable --runtime --now "$unit" 2>/dev/null || true

      if "$systemctl" is-active --quiet "$unit"; then
        log "WARNING: Failed to stop $unit"
        return 0
      fi

      mark_service_stopped "$unit"
      log "$unit stopped successfully"
    }

    # Start one managed service again, but only if this script stopped it
    # earlier (a marker exists according to is_service_stopped_by_us).
    #   $1 - service name
    # The marker is cleared (mark_service_started) only when the unit is
    # confirmed active afterwards; otherwise a warning is logged and the
    # marker is kept, so the start is attempted again on a later pass.
    start_service() {
      local unit="$1"
      local systemctl="${pkgs.systemd}/bin/systemctl"

      # Not stopped by us: leave the service alone.
      is_service_stopped_by_us "$unit" || return 0

      log "MEMORY OK: Starting $unit"

      # The exit status and stderr of both calls are deliberately ignored;
      # the is-active check below decides whether the start worked.
      "$systemctl" enable --runtime "$unit" 2>/dev/null || true
      "$systemctl" start "$unit" 2>/dev/null || true

      if ! "$systemctl" is-active --quiet "$unit"; then
        log "WARNING: Failed to start $unit"
        return 0
      fi

      mark_service_started "$unit"
      log "$unit started successfully"
    }

    # Startup banner: log the effective configuration once.
    log "Memory manager started"
    log "Stop threshold: ''${STOP_THRESHOLD_MB}MB, Start threshold: ''${START_THRESHOLD_MB}MB"
    log "Managed services: $MANAGED_SERVICES"
    log "Check interval: ''${CHECK_INTERVAL}s, Debug: $DEBUG"

    # Main loop: sample available memory every CHECK_INTERVAL seconds and
    #   - below STOP_THRESHOLD_MB:  stop every managed service that is active
    #   - above START_THRESHOLD_MB: start every service this script stopped
    #   - in between:               do nothing (logged only when DEBUG is true)
    while true; do
      AVAILABLE_MB=$(get_available_memory_mb)

      # An empty or non-numeric reading would make the numeric tests below
      # fail with "integer expression expected" and silently fall through to
      # the WAIT branch. Report it explicitly and retry on the next interval.
      case "$AVAILABLE_MB" in
        "" | *[!0-9]*)
          log "WARNING: Could not read available memory from /proc/meminfo (got: [$AVAILABLE_MB]); skipping this check"
          sleep "$CHECK_INTERVAL"
          continue
          ;;
      esac

      if [ "$AVAILABLE_MB" -lt "$STOP_THRESHOLD_MB" ]; then
        DECISION="STOP (below ''${STOP_THRESHOLD_MB}MB)"
        log "Memory: ''${AVAILABLE_MB}MB available | Decision: $DECISION"
        # MANAGED_SERVICES is intentionally unquoted: it is a space-separated
        # list and relies on word splitting.
        for service in $MANAGED_SERVICES; do
          stop_service "$service"
        done
      elif [ "$AVAILABLE_MB" -gt "$START_THRESHOLD_MB" ]; then
        DECISION="START (above ''${START_THRESHOLD_MB}MB)"
        # Only announce a START decision when at least one service is
        # actually waiting to be started again.
        NEEDS_START=false
        for service in $MANAGED_SERVICES; do
          if is_service_stopped_by_us "$service"; then
            NEEDS_START=true
            break
          fi
        done
        if [ "$NEEDS_START" = "true" ]; then
          log "Memory: ''${AVAILABLE_MB}MB available | Decision: $DECISION"
          for service in $MANAGED_SERVICES; do
            start_service "$service"
          done
        else
          log "Memory: ''${AVAILABLE_MB}MB available | Decision: OK (services running)"
        fi
      else
        # Between the two thresholds nothing is changed (hysteresis); the
        # per-service status line is only produced in debug mode.
        if [ "$DEBUG" = "true" ]; then
          DECISION="WAIT (between ''${STOP_THRESHOLD_MB}MB and ''${START_THRESHOLD_MB}MB)"
          SERVICE_STATUS=""
          for service in $MANAGED_SERVICES; do
            if ${pkgs.systemd}/bin/systemctl is-active --quiet "$service"; then
              SERVICE_STATUS="$SERVICE_STATUS $service=running"
            elif is_service_stopped_by_us "$service"; then
              SERVICE_STATUS="$SERVICE_STATUS $service=stopped-by-us"
            else
              SERVICE_STATUS="$SERVICE_STATUS $service=stopped"
            fi
          done
          log "Memory: ''${AVAILABLE_MB}MB available | Decision: $DECISION | Services:$SERVICE_STATUS"
        fi
      fi

      sleep "$CHECK_INTERVAL"
    done
  '';
in
{
  # Option declarations for services.custom.memoryManager.
  # The numeric options use the bounded integer types so that nonsensical
  # values are rejected at evaluation time instead of breaking the script at
  # runtime (a check interval of 0 would busy-loop, and a negative one would
  # make sleep fail and abort the script).
  options.services.custom.memoryManager = {
    enable = mkEnableOption "memory pressure manager that stops/starts services based on available memory";

    stopThresholdMB = mkOption {
      # 0 is allowed: available memory is never below 0, so nothing is ever stopped.
      type = types.ints.unsigned;
      default = 8192;
      description = "Stop managed services when available memory drops below this threshold (in MB)";
      example = 4096;
    };

    startThresholdMB = mkOption {
      type = types.ints.positive;
      default = 16384;
      description = "Restart managed services when available memory rises above this threshold (in MB). Should be higher than stopThresholdMB to prevent flapping.";
      example = 32768;
    };

    checkIntervalSeconds = mkOption {
      # Must be at least 1; the value is passed straight to sleep in the monitor loop.
      type = types.ints.positive;
      default = 10;
      description = "How often to check memory availability (in seconds)";
      example = 30;
    };

    managedServices = mkOption {
      type = types.listOf types.str;
      default = [ ];
      description = "List of systemd service names to manage (without .service suffix)";
      example = [ "llama-cpp-fast" ];
    };

    debug = mkOption {
      type = types.bool;
      default = false;
      description = "Enable debug logging (logs WAIT states on every check interval)";
    };
  };

  # Everything below only takes effect when the module is enabled.
  config = mkIf cfg.enable {
    # Reject configurations that cannot work before anything is built.
    assertions = [
      {
        assertion = cfg.startThresholdMB > cfg.stopThresholdMB;
        message = "services.custom.memoryManager.startThresholdMB must be greater than stopThresholdMB to prevent flapping";
      }
      {
        assertion = cfg.managedServices != [ ];
        message = "services.custom.memoryManager.managedServices must not be empty when enabled";
      }
    ];

    # Long-running unit that executes the monitor script.
    systemd.services.memory-manager = {
      description = "Memory Pressure Service Manager";
      wantedBy = [ "multi-user.target" ];
      after = [ "multi-user.target" ];

      serviceConfig = {
        Type = "simple";
        Restart = "always";
        RestartSec = "10s";
        ExecStart = "${monitorScript}";
        # /run/memory-manager is where the script keeps its
        # "<service>.stopped" marker files.
        RuntimeDirectory = "memory-manager";
        RuntimeDirectoryMode = "0755";
        # By default systemd removes the runtime directory whenever the
        # service stops, including on the automatic restarts triggered by
        # Restart=always. That would delete the markers, and services stopped
        # by the script would never be started again. Keep the directory
        # across stops and restarts; /run is cleared on reboot anyway.
        RuntimeDirectoryPreserve = "yes";
      };
    };
  };
}