Christian Theune 2d0a489125 nixos/acme: improve scalability - reduce superfluous unit activations
The previous setup caused all renewal units to be triggered upon
ever so slight changes in config. In larger setups (100+ certificates)
adding a new certificate caused high system load and/or large memory
consumption issues. The memory issues are already a alleviated with
the locking mechanism. However, this then causes long delays upwards
of multiple minutes depending on individual runs and also caused
superfluous activations.

In this change we streamline the overall setup of units:

1. The unit that other services can depend upon is 'acme-{cert}.service'.
We call this the 'base unit'. As this one as `RemainAfterExit` set
the `acme-finished-{cert}` targets are not required any longer.

2. We now always generate initial self-signed certificates to simplify
the dependency structure. This deprecates the `preliminarySelfsigned`
option.

3. The `acme-order-renew-{cert}` service gets activated after the base
unit and services using certificates have started and performs all acme
interactions. When it finishes others services (like web servers) will
be notified through the `reloadServices` option or they can use
`wantedBy` and `after` dependencies if they implement their own reload
units.

The renewal timer also triggers this unit.

4. The timer unit is explicitly blocked from being started by s-t-c.

5. Permission management has been cleaned up a bit: there was an
   inconsistency between having the .lego files set to 600 vs 640
   on the exposed side. This is unified to 640 now.

6. Exempt the account target from being restarted by s-t-c. This will
   happen automatically if something relevant to the account changes.
2025-08-08 16:28:42 +02:00

153 lines
5.0 KiB
Nix

{
config,
lib,
pkgs,
...
}:
with lib;
let
format = pkgs.formats.yaml { };
in
{
options.services.pomerium = {
enable = mkEnableOption "the Pomerium authenticating reverse proxy";
configFile = mkOption {
type = with types; nullOr path;
default = null;
description = "Path to Pomerium config YAML. If set, overrides services.pomerium.settings.";
};
useACMEHost = mkOption {
type = with types; nullOr str;
default = null;
description = ''
If set, use a NixOS-generated ACME certificate with the specified name.
Note that this will require you to use a non-HTTP-based challenge, or
disable Pomerium's in-built HTTP redirect server by setting
http_redirect_addr to null and use a different HTTP server for serving
the challenge response.
If you're using an HTTP-based challenge, you should use the
Pomerium-native autocert option instead.
'';
};
settings = mkOption {
description = ''
The contents of Pomerium's config.yaml, in Nix expressions.
Specifying configFile will override this in its entirety.
See [the Pomerium
configuration reference](https://pomerium.io/reference/) for more information about what to put
here.
'';
default = { };
type = format.type;
};
secretsFile = mkOption {
type = with types; nullOr path;
default = null;
description = ''
Path to file containing secrets for Pomerium, in systemd
EnvironmentFile format. See the {manpage}`systemd.exec(5)` man page.
'';
};
};
config =
let
cfg = config.services.pomerium;
cfgFile =
if cfg.configFile != null then cfg.configFile else (format.generate "pomerium.yaml" cfg.settings);
in
mkIf cfg.enable ({
systemd.services.pomerium = {
description = "Pomerium authenticating reverse proxy";
wants = [
"network.target"
]
++ (optional (cfg.useACMEHost != null) "acme-${cfg.useACMEHost}.service");
after = [
"network.target"
]
++ (optional (cfg.useACMEHost != null) "acme-${cfg.useACMEHost}.service");
wantedBy = [ "multi-user.target" ];
environment = optionalAttrs (cfg.useACMEHost != null) {
CERTIFICATE_FILE = "fullchain.pem";
CERTIFICATE_KEY_FILE = "key.pem";
};
startLimitIntervalSec = 60;
script = ''
if [[ -v CREDENTIALS_DIRECTORY ]]; then
cd "$CREDENTIALS_DIRECTORY"
fi
exec "${pkgs.pomerium}/bin/pomerium" -config "${cfgFile}"
'';
serviceConfig = {
DynamicUser = true;
StateDirectory = [ "pomerium" ];
PrivateUsers = false; # breaks CAP_NET_BIND_SERVICE
MemoryDenyWriteExecute = false; # breaks LuaJIT
NoNewPrivileges = true;
PrivateTmp = true;
PrivateDevices = true;
DevicePolicy = "closed";
ProtectSystem = "strict";
ProtectHome = true;
ProtectControlGroups = true;
ProtectKernelModules = true;
ProtectKernelTunables = true;
ProtectKernelLogs = true;
RestrictAddressFamilies = "AF_UNIX AF_INET AF_INET6 AF_NETLINK";
RestrictNamespaces = true;
RestrictRealtime = true;
RestrictSUIDSGID = true;
LockPersonality = true;
SystemCallArchitectures = "native";
EnvironmentFile = cfg.secretsFile;
AmbientCapabilities = [ "CAP_NET_BIND_SERVICE" ];
CapabilityBoundingSet = [ "CAP_NET_BIND_SERVICE" ];
LoadCredential = optionals (cfg.useACMEHost != null) [
"fullchain.pem:/var/lib/acme/${cfg.useACMEHost}/fullchain.pem"
"key.pem:/var/lib/acme/${cfg.useACMEHost}/key.pem"
];
};
};
# postRun hooks on cert renew can't be used to restart Nginx since renewal
# runs as the unprivileged acme user. sslTargets are added to wantedBy + before
# which allows the acme-order-renew-$cert.target to signify the successful updating
# of certs end-to-end.
systemd.services.pomerium-config-reload = mkIf (cfg.useACMEHost != null) {
# TODO(lukegb): figure out how to make config reloading work with credentials.
wantedBy = [
"acme-order-renew-${cfg.useACMEHost}.service"
"multi-user.target"
];
after = [ "acme-order-renew-${cfg.useACMEHost}.service" ];
# Block reloading if not all certs exist yet.
unitConfig.ConditionPathExists = [
"${config.security.acme.certs.${cfg.useACMEHost}.directory}/fullchain.pem"
];
serviceConfig = {
Type = "oneshot";
TimeoutSec = 60;
ExecCondition = "/run/current-system/systemd/bin/systemctl -q is-active pomerium.service";
ExecStart = "/run/current-system/systemd/bin/systemctl --no-block restart pomerium.service";
};
};
});
}