nixos/acme: improve scalability - reduce superfluous unit activations

The previous setup caused all renewal units to be triggered upon
ever so slight changes in config. In larger setups (100+ certificates)
adding a new certificate caused high system load and/or large memory
consumption issues. The memory issues are already alleviated with
the locking mechanism. However, this then causes long delays upwards
of multiple minutes depending on individual runs and also caused
superfluous activations.

In this change we streamline the overall setup of units:

1. The unit that other services can depend upon is 'acme-{cert}.service'.
We call this the 'base unit'. As this one has `RemainAfterExit` set,
the `acme-finished-{cert}` targets are not required any longer.

2. We now always generate initial self-signed certificates to simplify
the dependency structure. This deprecates the `preliminarySelfsigned`
option.

3. The `acme-order-renew-{cert}` service gets activated after the base
unit and services using certificates have started and performs all acme
interactions. When it finishes, other services (like web servers) will
be notified through the `reloadServices` option or they can use
`wantedBy` and `after` dependencies if they implement their own reload
units.

The renewal timer also triggers this unit.

4. The timer unit is explicitly blocked from being started by s-t-c.

5. Permission management has been cleaned up a bit: there was an
   inconsistency between having the .lego files set to 600 vs 640
   on the exposed side. This is unified to 640 now.

6. Exempt the account target from being restarted by s-t-c. This will
   happen automatically if something relevant to the account changes.
This commit is contained in:
Christian Theune 2025-08-08 16:28:42 +02:00
parent dfe6a41c36
commit 2d0a489125
14 changed files with 382 additions and 278 deletions

View File

@ -318,7 +318,7 @@ can be applied to any service.
# Now you must augment OpenSMTPD's systemd service to load
# the certificate files.
systemd.services.opensmtpd.requires = [ "acme-finished-mail.example.com.target" ];
systemd.services.opensmtpd.requires = [ "acme-mail.example.com.service" ];
systemd.services.opensmtpd.serviceConfig.LoadCredential =
let
certDir = config.security.acme.certs."mail.example.com".directory;

View File

@ -160,58 +160,49 @@ let
);
# This is defined with lib.mkMerge so that we can separate the config per function.
setupService = lib.mkMerge [
{
description = "Set up the ACME certificate renewal infrastructure";
script = lib.mkBefore ''
${lib.optionalString cfg.defaults.enableDebugLogs "set -x"}
set -euo pipefail
'';
serviceConfig = commonServiceConfig // {
# This script runs with elevated privileges, denoted by the +
# ExecStartPre is used instead of ExecStart so that the `script` continues to work.
ExecStartPre = "+${lib.getExe privilegedSetupScript}";
setupService = {
description = "Set up the ACME certificate renewal infrastructure";
path = [ pkgs.minica ];
# We don't want this to run every time a renewal happens
RemainAfterExit = true;
script = lib.mkBefore ''
${lib.optionalString cfg.defaults.enableDebugLogs "set -x"}
set -euo pipefail
test -e ca/key.pem || minica \
--ca-key ca/key.pem \
--ca-cert ca/cert.pem \
--domains selfsigned.local
'';
# StateDirectory entries are a cleaner, service-level mechanism
# for dealing with persistent service data
StateDirectory = [
"acme"
"acme/.lego"
"acme/.lego/accounts"
];
StateDirectoryMode = "0755";
serviceConfig = commonServiceConfig // {
# This script runs with elevated privileges, denoted by the +
# ExecStartPre is used instead of ExecStart so that the `script` continues to work.
ExecStartPre = "+${lib.getExe privilegedSetupScript}";
# Creates ${lockdir}. Earlier RemainAfterExit=true means
# it does not get deleted immediately.
RuntimeDirectory = "acme";
RuntimeDirectoryMode = "0700";
# We don't want this to run every time a renewal happens
RemainAfterExit = true;
# Generally, we don't write anything that should be group accessible.
# Group varies for most ACME units, and setup files are only used
# under the acme user.
UMask = "0077";
};
}
# StateDirectory entries are a cleaner, service-level mechanism
# for dealing with persistent service data
StateDirectory = [
"acme"
"acme/.lego"
"acme/.lego/accounts"
"acme/.minica"
];
BindPaths = "/var/lib/acme/.minica:/tmp/ca";
StateDirectoryMode = "0755";
# Avoid race conditions creating the CA for selfsigned certs
(lib.mkIf cfg.preliminarySelfsigned {
path = [ pkgs.minica ];
# Working directory will be /tmp
script = ''
test -e ca/key.pem || minica \
--ca-key ca/key.pem \
--ca-cert ca/cert.pem \
--domains selfsigned.local
'';
serviceConfig = {
StateDirectory = [ "acme/.minica" ];
BindPaths = "/var/lib/acme/.minica:/tmp/ca";
};
})
];
# Creates ${lockdir}. Earlier RemainAfterExit=true means
# it does not get deleted immediately.
RuntimeDirectory = "acme";
RuntimeDirectoryMode = "0700";
# Generally, we don't write anything that should be group accessible.
# Group varies for most ACME units, and setup files are only used
# under the acme user.
UMask = "0077";
};
};
certToConfig =
cert: data:
@ -219,7 +210,6 @@ let
acmeServer = data.server;
useDns = data.dnsProvider != null;
destPath = "/var/lib/acme/${cert}";
selfsignedDeps = lib.optionals (cfg.preliminarySelfsigned) [ "acme-selfsigned-${cert}.service" ];
# Minica and lego have a "feature" which replaces * with _. We need
# to make this substitution to reference the output files from both programs.
@ -339,16 +329,18 @@ let
certificateKey = if data.csrKey != null then "${data.csrKey}" else "certificates/${keyName}.key";
in
{
inherit accountHash cert selfsignedDeps;
inherit accountHash cert;
group = data.group;
renewTimer = {
description = "Renew ACME Certificate for ${cert}";
wantedBy = [ "timers.target" ];
# Avoid triggering certificate renewals accidentally when running s-t-c.
unitConfig."X-OnlyManualStart" = true;
timerConfig = {
OnCalendar = data.renewInterval;
Unit = "acme-${cert}.service";
Unit = "acme-order-renew-${cert}.service";
Persistent = "yes";
# Allow systemd to pick a convenient time within the day
@ -364,15 +356,29 @@ let
};
};
selfsignService = lockfileName: {
description = "Generate self-signed certificate for ${cert}";
baseService = lockfileName: {
description = "Ensure certificate for ${cert}";
wantedBy = [ "multi-user.target" ];
after = [ "acme-setup.service" ];
requires = [ "acme-setup.service" ];
# Whenever this service starts (on boot, through dependencies, through
# changes) we trigger the acme-order-renew service to give it a chance
# to catch up with the potentially changed config.
wants = [
"acme-setup.service"
"acme-order-renew-${cert}.service"
];
before = [ "acme-order-renew-${cert}.service" ];
restartTriggers = [
config.systemd.services."acme-order-renew-${cert}".script
];
path = [ pkgs.minica ];
unitConfig = {
ConditionPathExists = "!/var/lib/acme/${cert}/key.pem";
StartLimitIntervalSec = 0;
};
@ -380,11 +386,13 @@ let
Group = data.group;
UMask = "0027";
RemainAfterExit = true;
StateDirectory = "acme/${cert}";
BindPaths = [
"/var/lib/acme/.minica:/tmp/ca"
"/var/lib/acme/${cert}:/tmp/${keyName}"
"/var/lib/acme/${cert}:/tmp/out"
];
};
@ -392,40 +400,69 @@ let
# minica will output to a folder sharing the name of the first domain
# in the list, which will be ${data.domain}
script = (if (lockfileName == null) then lib.id else wrapInFlock "${lockdir}${lockfileName}") ''
set -ex
# Regenerate self-signed certificates (in case the SANs change) until we
# have seen a succesfull ACME certificate at least once.
if [ -e out/acme-success ]; then
exit 0
fi
minica \
--ca-key ca/key.pem \
--ca-cert ca/cert.pem \
--domains ${lib.escapeShellArg (builtins.concatStringsSep "," ([ data.domain ] ++ extraDomains))}
# Create files to match directory layout for real certificates
cd '${keyName}'
cp ../ca/cert.pem chain.pem
cat cert.pem chain.pem > fullchain.pem
cat key.pem fullchain.pem > full.pem
(
cd '${keyName}'
cp -vp cert.pem ../out/cert.pem
cp -vp key.pem ../out/key.pem
)
cat out/cert.pem ca/cert.pem > out/fullchain.pem
cp ca/cert.pem out/chain.pem
cat out/key.pem out/fullchain.pem > out/full.pem
# Group might change between runs, re-apply it
chown '${user}:${data.group}' -- *
# Fix up the output files to adhere to the group and
# have consistent permissions. This needs to be kept
# consistent with the acme-setup script above.
for fixpath in out certificates; do
if [ -d "$fixpath" ]; then
chmod -R u=rwX,g=rX,o= "$fixpath"
chown -R ${user}:${data.group} "$fixpath"
fi
done
# Default permissions make the files unreadable by group + anon
# Need to be readable by group
chmod 640 -- *
${lib.optionalString (data.webroot != null) ''
# Ensure the webroot exists. Fixing group is required in case configuration was changed between runs.
# Lego will fail if the webroot does not exist at all.
(
mkdir -p '${data.webroot}/.well-known/acme-challenge' \
&& chgrp '${data.group}' ${data.webroot}/.well-known/acme-challenge
) || (
echo 'Please ensure ${data.webroot}/.well-known/acme-challenge exists and is writable by acme:${data.group}' \
&& exit 1
)
''}
'';
};
renewService = lockfileName: {
description = "Renew ACME certificate for ${cert}";
orderRenewService = lockfileName: {
description = "Order (and renew) ACME certificate for ${cert}";
after = [
"network.target"
"network-online.target"
"acme-setup.service"
"nss-lookup.target"
]
++ selfsignedDeps;
wants = [ "network-online.target" ] ++ selfsignedDeps;
requires = [ "acme-setup.service" ];
# https://github.com/NixOS/nixpkgs/pull/81371#issuecomment-605526099
wantedBy = lib.optionals (!config.boot.isContainer) [ "multi-user.target" ];
"acme-${cert}.service"
];
wants = [
"network-online.target"
"acme-setup.service"
"acme-${cert}.service"
];
# Ensure that certificates are generated if people use `security.acme.certs`
# without having/declaring other systemd units that depend on the cert.
path = with pkgs; [
lego
@ -523,25 +560,12 @@ let
[[ $expiration_days -gt ${toString data.validMinDays} ]]
}
${lib.optionalString (data.webroot != null) ''
# Ensure the webroot exists. Fixing group is required in case configuration was changed between runs.
# Lego will fail if the webroot does not exist at all.
(
mkdir -p '${data.webroot}/.well-known/acme-challenge' \
&& chgrp '${data.group}' ${data.webroot}/.well-known/acme-challenge
) || (
echo 'Please ensure ${data.webroot}/.well-known/acme-challenge exists and is writable by acme:${data.group}' \
&& exit 1
)
''}
echo '${domainHash}' > domainhash.txt
# Check if we can renew.
# Check if a new order is needed
# We can only renew if the list of domains has not changed.
# We also need an account key. Avoids #190493
if cmp -s domainhash.txt certificates/domainhash.txt && [ -e '${certificateKey}' ] && [ -e 'certificates/${keyName}.crt' ] && [ -n "$(find accounts -name '${data.email}.key')" ]; then
# Even if a cert is not expired, it may be revoked by the CA.
# Try to renew, and silently fail if the cert is not expired.
# Avoids #85794 and resolves #129838
@ -553,13 +577,12 @@ let
exit 11
fi
fi
# Otherwise do a full run
# Do a full run
elif ! lego ${runOpts}; then
# Produce a nice error for those doing their first nixos-rebuild with these certs
echo Failed to fetch certificates. \
This may mean your DNS records are set up incorrectly. \
${lib.optionalString (cfg.preliminarySelfsigned) "Selfsigned certs are in place and dependant services will still start."}
Self-signed certs are in place and dependant services will still start.
# Exit 10 so that users can potentially amend SuccessExitStatus to ignore this error.
# High number to avoid Systemd reserved codes.
exit 10
@ -567,10 +590,12 @@ let
mv domainhash.txt certificates/
# Group might change between runs, re-apply it
chown '${user}:${data.group}' certificates/*
touch out/acme-success
# Copy all certs to the "real" certs directory
# lego has only an interesting subset of files available,
# construct reasonably compatible files that clients can consume
# as expected.
if ! cmp -s 'certificates/${keyName}.crt' out/fullchain.pem; then
touch out/renewed
echo Installing new certificate
@ -581,10 +606,13 @@ let
cat out/key.pem out/fullchain.pem > out/full.pem
fi
# By default group will have no access to the cert files.
# This chmod will fix that.
chmod 640 out/*
# Keep permissions consistent. Needs to be in sync with the other scripts.
for fixpath in out certificates; do
if [ -d "$fixpath" ]; then
chmod -R u=rwX,g=rX,o= "$fixpath"
chown -R ${user}:${data.group} "$fixpath"
fi
done
# Also ensure safer permissions on the account directory.
chmod -R u=rwX,g=,o= accounts/.
'';
@ -905,19 +933,6 @@ in
options = {
security.acme = {
preliminarySelfsigned = lib.mkOption {
type = lib.types.bool;
default = true;
description = ''
Whether a preliminary self-signed certificate should be generated before
doing ACME requests. This can be useful when certificates are required in
a webserver, but ACME needs the webserver to make its requests.
With preliminary self-signed certificate the webserver can be started and
can later reload the correct ACME certificates.
'';
};
acceptTerms = lib.mkOption {
type = lib.types.bool;
default = false;
@ -1003,10 +1018,13 @@ in
"ACME Directory is now hardcoded to /var/lib/acme and its permissions are managed by systemd. See https://github.com/NixOS/nixpkgs/issues/53852 for more info."
)
(lib.mkRemovedOptionModule [ "security" "acme" "preDelay" ]
"This option has been removed. If you want to make sure that something executes before certificates are provisioned, add a RequiredBy=acme-\${cert}.service to the service you want to execute before the cert renewal"
"This option has been removed. If you want to make sure that something executes before certificates are provisioned, add a RequiredBy=acme-\${cert}.service and Before=acme-\${cert}.service to the service you want to execute before the cert renewal"
)
(lib.mkRemovedOptionModule [ "security" "acme" "activationDelay" ]
"This option has been removed. If you want to make sure that something executes before certificates are provisioned, add a RequiredBy=acme-\${cert}.service to the service you want to execute before the cert renewal"
"This option has been removed. If you want to make sure that something executes before certificates are provisioned, add a RequiredBy=acme-\${cert}.service and Before=acme-\${cert}.service to the service you want to execute before the cert renewal"
)
(lib.mkRemovedOptionModule [ "security" "acme" "preliminarySelfsigned" ]
"This option has been removed. Preliminary self-signed certificates are now always generated to simplify the dependency structure."
)
(lib.mkChangedOptionModule
[ "security" "acme" "validMin" ]
@ -1161,45 +1179,35 @@ in
systemd.services =
let
renewServiceFunctions = lib.mapAttrs' (
cert: conf: lib.nameValuePair "acme-${cert}" conf.renewService
orderRenewServiceFunctions = lib.mapAttrs' (
cert: conf: lib.nameValuePair "acme-order-renew-${cert}" conf.orderRenewService
) certConfigs;
renewServices =
orderRenewServices =
if cfg.maxConcurrentRenewals > 0 then
roundRobinApplyAttrs renewServiceFunctions concurrencyLockfiles
roundRobinApplyAttrs orderRenewServiceFunctions concurrencyLockfiles
else
lib.mapAttrs (_: f: f null) renewServiceFunctions;
selfsignServiceFunctions = lib.mapAttrs' (
cert: conf: lib.nameValuePair "acme-selfsigned-${cert}" conf.selfsignService
lib.mapAttrs (_: f: f null) orderRenewServiceFunctions;
baseServiceFunctions = lib.mapAttrs' (
cert: conf: lib.nameValuePair "acme-${cert}" conf.baseService
) certConfigs;
selfsignServices =
baseServices =
if cfg.maxConcurrentRenewals > 0 then
roundRobinApplyAttrs selfsignServiceFunctions concurrencyLockfiles
roundRobinApplyAttrs baseServiceFunctions concurrencyLockfiles
else
lib.mapAttrs (_: f: f null) selfsignServiceFunctions;
lib.mapAttrs (_: f: f null) baseServiceFunctions;
in
{
acme-setup = setupService;
}
// renewServices
// lib.optionalAttrs cfg.preliminarySelfsigned selfsignServices;
// baseServices
// orderRenewServices;
systemd.timers = lib.mapAttrs' (
cert: conf: lib.nameValuePair "acme-${cert}" conf.renewTimer
cert: conf: lib.nameValuePair "acme-renew-${cert}" conf.renewTimer
) certConfigs;
systemd.targets =
let
# Create some targets which can be depended on to be "active" after cert renewals
finishedTargets = lib.mapAttrs' (
cert: conf:
lib.nameValuePair "acme-finished-${cert}" {
wantedBy = [ "default.target" ];
requires = [ "acme-${cert}.service" ];
after = [ "acme-${cert}.service" ];
}
) certConfigs;
# Create targets to limit the number of simultaneous account creations
# How it works:
# - Pick a "leader" cert service, which will be in charge of creating the account,
@ -1214,8 +1222,8 @@ in
let
dnsConfs = builtins.filter (conf: cfg.certs.${conf.cert}.dnsProvider != null) confs;
leaderConf = if dnsConfs != [ ] then builtins.head dnsConfs else builtins.head confs;
leader = "acme-${leaderConf.cert}.service";
followers = map (conf: "acme-${conf.cert}.service") (
leader = "acme-order-renew-${leaderConf.cert}.service";
followers = map (conf: "acme-order-renew-${conf.cert}.service") (
builtins.filter (conf: conf != leaderConf) confs
);
in
@ -1224,10 +1232,11 @@ in
before = followers;
requires = [ leader ];
after = [ leader ];
unitConfig.RefuseManualStart = true;
}
) (lib.groupBy (conf: conf.accountHash) (lib.attrValues certConfigs));
in
finishedTargets // accountTargets;
accountTargets;
})
];

View File

@ -156,7 +156,7 @@ in
"network.target"
]
++ lib.optional (cfg.useACMEHost != null) "acme-${cfg.useACMEHost}.service";
wants = lib.optional (cfg.useACMEHost != null) "acme-finished-${cfg.useACMEHost}.target";
wants = lib.optional (cfg.useACMEHost != null) "acme-${cfg.useACMEHost}.service";
wantedBy = [ "multi-user.target" ];
serviceConfig = {
AmbientCapabilities = "CAP_NET_BIND_SERVICE";

View File

@ -48,8 +48,6 @@ let
) (filter (hostOpts: hostOpts.enableACME || hostOpts.useACMEHost != null) vhosts);
vhostCertNames = unique (map (hostOpts: hostOpts.certName) acmeEnabledVhosts);
dependentCertNames = filter (cert: certs.${cert}.dnsProvider == null) vhostCertNames; # those that might depend on the HTTP server
independentCertNames = filter (cert: certs.${cert}.dnsProvider != null) vhostCertNames; # those that don't depend on the HTTP server
mkListenInfo =
hostOpts:
@ -914,13 +912,14 @@ in
systemd.services.httpd = {
description = "Apache HTTPD";
wantedBy = [ "multi-user.target" ];
wants = concatLists (map (certName: [ "acme-finished-${certName}.target" ]) vhostCertNames);
wants = concatLists (map (certName: [ "acme-${certName}.service" ]) vhostCertNames);
after = [
"network.target"
]
++ map (certName: "acme-selfsigned-${certName}.service") vhostCertNames
++ map (certName: "acme-${certName}.service") independentCertNames; # avoid loading self-signed key w/ real cert, or vice-versa
before = map (certName: "acme-${certName}.service") dependentCertNames;
# Ensure httpd runs with baseline certificates in place.
++ map (certName: "acme-${certName}.service") vhostCertNames;
# Ensure httpd runs (with current config) before the actual ACME jobs run
before = map (certName: "acme-order-renew-${certName}.service") vhostCertNames;
restartTriggers = [ cfg.configFile ];
path = [
@ -960,19 +959,17 @@ in
# postRun hooks on cert renew can't be used to restart Apache since renewal
# runs as the unprivileged acme user. sslTargets are added to wantedBy + before
# which allows the acme-finished-$cert.target to signify the successful updating
# which allows the acme-order-renew-$cert.service to signify the successful updating
# of certs end-to-end.
systemd.services.httpd-config-reload =
let
sslServices = map (certName: "acme-${certName}.service") vhostCertNames;
sslTargets = map (certName: "acme-finished-${certName}.target") vhostCertNames;
sslServices = map (certName: "acme-order-renew-${certName}.service") vhostCertNames;
in
mkIf (vhostCertNames != [ ]) {
wantedBy = sslServices ++ [ "multi-user.target" ];
# Before the finished targets, after the renew services.
# This service might be needed for HTTP-01 challenges, but we only want to confirm
# certs are updated _after_ config has been reloaded.
before = sslTargets;
after = sslServices;
restartTriggers = [ cfg.configFile ];
# Block reloading if not all certs exist yet.

View File

@ -14,13 +14,11 @@ let
virtualHosts = attrValues cfg.virtualHosts;
acmeEnabledVhosts = filter (hostOpts: hostOpts.useACMEHost != null) virtualHosts;
vhostCertNames = unique (map (hostOpts: hostOpts.useACMEHost) acmeEnabledVhosts);
dependentCertNames = filter (cert: certs.${cert}.dnsProvider == null) vhostCertNames; # those that might depend on the HTTP server
independentCertNames = filter (cert: certs.${cert}.dnsProvider != null) vhostCertNames; # those that don't depend on the HTTP server
mkVHostConf =
hostOpts:
let
sslCertDir = config.security.acme.certs.${hostOpts.useACMEHost}.directory;
sslCertDir = certs.${hostOpts.useACMEHost}.directory;
in
''
${hostOpts.hostName} ${concatStringsSep " " hostOpts.serverAliases} {
@ -392,7 +390,7 @@ in
++ map (
name:
mkCertOwnershipAssertion {
cert = config.security.acme.certs.${name};
cert = certs.${name};
groups = config.users.groups;
services = [ config.systemd.services.caddy ];
}
@ -412,11 +410,8 @@ in
systemd.packages = [ cfg.package ];
systemd.services.caddy = {
wants = map (certName: "acme-finished-${certName}.target") vhostCertNames;
after =
map (certName: "acme-selfsigned-${certName}.service") vhostCertNames
++ map (certName: "acme-${certName}.service") independentCertNames; # avoid loading self-signed key w/ real cert, or vice-versa
before = map (certName: "acme-${certName}.service") dependentCertNames;
wants = map (certName: "acme-${certName}.service") vhostCertNames;
after = map (certName: "acme-${certName}.service") vhostCertNames;
wantedBy = [ "multi-user.target" ];
startLimitIntervalSec = 14400;

View File

@ -434,14 +434,13 @@ in
systemd.services.h2o = {
description = "H2O HTTP server";
wantedBy = [ "multi-user.target" ];
wants = lib.concatLists (map (certName: [ "acme-finished-${certName}.target" ]) acmeCertNames.all);
wants = lib.concatLists (map (certName: [ "acme-${certName}.service" ]) acmeCertNames.all);
# Since H2O will be hosting the challenges, H2O must be started
before = builtins.map (certName: "acme-${certName}.service") acmeCertNames.dependent;
before = builtins.map (certName: "acme-order-renew-${certName}.service") acmeCertNames.all;
after = [
"network.target"
]
++ builtins.map (certName: "acme-selfsigned-${certName}.service") acmeCertNames.all
++ builtins.map (certName: "acme-${certName}.service") acmeCertNames.independent; # avoid loading self-signed key w/ real cert, or vice-versa
++ builtins.map (certName: "acme-${certName}.service") acmeCertNames.all;
serviceConfig = {
ExecStart = "${h2oExe} --mode 'master'";
@ -490,16 +489,14 @@ in
# This service waits for all certificates to be available before reloading
# H2O configuration. `tlsTargets` are added to `wantedBy` + `before` which
# allows the `acme-finished-$cert.target` to signify the successful updating
# allows the `acme-order-renew-$cert.service` to signify the successful updating
# of certs end-to-end.
systemd.services.h2o-config-reload =
let
tlsTargets = map (certName: "acme-${certName}.target") acmeCertNames.all;
tlsServices = map (certName: "acme-${certName}.service") acmeCertNames.all;
tlsServices = map (certName: "acme-order-renew-${certName}.service") acmeCertNames.all;
in
mkIf (acmeCertNames.all != [ ]) {
wantedBy = tlsServices ++ [ "multi-user.target" ];
before = tlsTargets;
after = tlsServices;
unitConfig = {
ConditionPathExists = map (

View File

@ -15,8 +15,6 @@ let
vhostConfig: vhostConfig.enableACME || vhostConfig.useACMEHost != null
) vhostsConfigs;
vhostCertNames = unique (map (hostOpts: hostOpts.certName) acmeEnabledVhosts);
dependentCertNames = filter (cert: certs.${cert}.dnsProvider == null) vhostCertNames; # those that might depend on the HTTP server
independentCertNames = filter (cert: certs.${cert}.dnsProvider != null) vhostCertNames; # those that don't depend on the HTTP server
virtualHosts = mapAttrs (
vhostName: vhostConfig:
let
@ -442,6 +440,7 @@ let
auth_basic off;
auth_request off;
proxy_pass http://${vhost.acmeFallbackHost};
proxy_set_header Host $host;
}
''}
'';
@ -1481,16 +1480,14 @@ in
systemd.services.nginx = {
description = "Nginx Web Server";
wantedBy = [ "multi-user.target" ];
wants = concatLists (map (certName: [ "acme-finished-${certName}.target" ]) vhostCertNames);
wants = concatLists (map (certName: [ "acme-${certName}.service" ]) vhostCertNames);
after = [
"network.target"
]
++ map (certName: "acme-selfsigned-${certName}.service") vhostCertNames
++ map (certName: "acme-${certName}.service") independentCertNames; # avoid loading self-signed key w/ real cert, or vice-versa
# Nginx needs to be started in order to be able to request certificates
# (it's hosting the acme challenge after all)
# This fixes https://github.com/NixOS/nixpkgs/issues/81842
before = map (certName: "acme-${certName}.service") dependentCertNames;
# Ensure nginx runs with baseline certificates in place.
++ map (certName: "acme-${certName}.service") vhostCertNames;
# Ensure nginx runs (with current config) before the actual ACME jobs run
before = map (certName: "acme-order-renew-${certName}.service") vhostCertNames;
stopIfChanged = false;
preStart = ''
${cfg.preStart}
@ -1585,26 +1582,24 @@ in
# This service waits for all certificates to be available
# before reloading nginx configuration.
# sslTargets are added to wantedBy + before
# which allows the acme-finished-$cert.target to signify the successful updating
# which allows the acme-order-renew-$cert.service to signify the successful updating
# of certs end-to-end.
systemd.services.nginx-config-reload =
let
sslServices = map (certName: "acme-${certName}.service") vhostCertNames;
sslTargets = map (certName: "acme-finished-${certName}.target") vhostCertNames;
sslOrderRenewServices = map (certName: "acme-order-renew-${certName}.service") vhostCertNames;
in
mkIf (cfg.enableReload || vhostCertNames != [ ]) {
wants = optionals cfg.enableReload [ "nginx.service" ];
wantedBy = sslServices ++ [ "multi-user.target" ];
# Before the finished targets, after the renew services.
wantedBy = sslOrderRenewServices ++ [ "multi-user.target" ];
# XXX Before the finished targets, after the renew services.
# This service might be needed for HTTP-01 challenges, but we only want to confirm
# certs are updated _after_ config has been reloaded.
before = sslTargets;
after = sslServices;
after = sslOrderRenewServices;
restartTriggers = optionals cfg.enableReload [ configFile ];
# Block reloading if not all certs exist yet.
# Happens when config changes add new vhosts/certs.
unitConfig = {
ConditionPathExists = optionals (sslServices != [ ]) (
ConditionPathExists = optionals (vhostCertNames != [ ]) (
map (certName: certs.${certName}.directory + "/fullchain.pem") vhostCertNames
);
# Disable rate limiting for this, because it may be triggered quickly a bunch of times

View File

@ -72,11 +72,11 @@ in
wants = [
"network.target"
]
++ (optional (cfg.useACMEHost != null) "acme-finished-${cfg.useACMEHost}.target");
++ (optional (cfg.useACMEHost != null) "acme-${cfg.useACMEHost}.service");
after = [
"network.target"
]
++ (optional (cfg.useACMEHost != null) "acme-finished-${cfg.useACMEHost}.target");
++ (optional (cfg.useACMEHost != null) "acme-${cfg.useACMEHost}.service");
wantedBy = [ "multi-user.target" ];
environment = optionalAttrs (cfg.useACMEHost != null) {
CERTIFICATE_FILE = "fullchain.pem";
@ -127,18 +127,16 @@ in
# postRun hooks on cert renew can't be used to restart Nginx since renewal
# runs as the unprivileged acme user. sslTargets are added to wantedBy + before
# which allows the acme-finished-$cert.target to signify the successful updating
# which allows the acme-order-renew-$cert.service to signify the successful updating
# of certs end-to-end.
systemd.services.pomerium-config-reload = mkIf (cfg.useACMEHost != null) {
# TODO(lukegb): figure out how to make config reloading work with credentials.
wantedBy = [
"acme-finished-${cfg.useACMEHost}.target"
"acme-order-renew-${cfg.useACMEHost}.service"
"multi-user.target"
];
# Before the finished targets, after the renew services.
before = [ "acme-finished-${cfg.useACMEHost}.target" ];
after = [ "acme-${cfg.useACMEHost}.service" ];
after = [ "acme-order-renew-${cfg.useACMEHost}.service" ];
# Block reloading if not all certs exist yet.
unitConfig.ConditionPathExists = [
"${config.security.acme.certs.${cfg.useACMEHost}.directory}/fullchain.pem"

View File

@ -85,33 +85,24 @@ in
ca_domain = "${nodes.acme.test-support.acme.caDomain}"
fqdn = "${nodes.caddy.networking.fqdn}"
with subtest("Boot and start with selfsigned certificates"):
caddy.start()
caddy.wait_for_unit("caddy.service")
check_issuer(caddy, fqdn, "minica")
# Check that the web server has picked up the selfsigned cert
check_connection(caddy, fqdn, minica=True)
acme.start()
wait_for_running(acme)
acme.wait_for_open_port(443)
with subtest("Boot and acquire a new cert"):
caddy.start()
wait_for_running(caddy)
with subtest("Acquire a new cert"):
caddy.succeed(f"systemctl restart acme-{fqdn}.service")
check_issuer(caddy, fqdn, "pebble")
check_domain(caddy, fqdn, fqdn)
download_ca_certs(caddy, ca_domain)
check_connection(caddy, fqdn)
with subtest("Can run on selfsigned certificates"):
# Switch to selfsigned first
caddy.succeed(f"systemctl clean acme-{fqdn}.service --what=state")
caddy.succeed(f"systemctl start acme-selfsigned-{fqdn}.service")
check_issuer(caddy, fqdn, "minica")
caddy.succeed("systemctl restart caddy.service")
# Check that the web server has picked up the selfsigned cert
check_connection(caddy, fqdn, minica=True)
caddy.succeed(f"systemctl start acme-{fqdn}.service")
# This may fail a couple of times before caddy is restarted
check_issuer(caddy, fqdn, "pebble")
check_connection(caddy, fqdn)
with subtest("security.acme changes reflect on caddy"):
check_connection(caddy, f"caddy-alt.{domain}", fail=True)
switch_to(caddy, "add_domain")

View File

@ -1,10 +1,14 @@
{ runTest }:
let
domain = "example.test";
in
{
http01-builtin = runTest ./http01-builtin.nix;
dns01 = runTest ./dns01.nix;
caddy = runTest ./caddy.nix;
nginx = runTest (
import ./webserver.nix {
inherit domain;
serverName = "nginx";
group = "nginx";
baseModule = {
@ -22,17 +26,17 @@
addSSL = true;
useACMEHost = "proxied.example.test";
acmeFallbackHost = "localhost:8080";
# lego will refuse the request if the host header is not correct
extraConfig = ''
proxy_set_header Host $host;
'';
};
};
specialisation.nullroot.configuration = {
services.nginx.virtualHosts."nullroot.${domain}".acmeFallbackHost = "localhost:8081";
};
};
}
);
httpd = runTest (
import ./webserver.nix {
inherit domain;
serverName = "httpd";
group = "wwwrun";
baseModule = {
@ -50,6 +54,16 @@
};
};
};
specialisation.nullroot.configuration = {
services.httpd.virtualHosts."nullroot.${domain}" = {
locations."/.well-known/acme-challenge" = {
proxyPass = "http://localhost:8081/.well-known/acme-challenge";
extraConfig = ''
ProxyPreserveHost On
'';
};
};
};
};
}
);

View File

@ -37,6 +37,12 @@ in
listenHTTP = ":80";
};
systemd.targets."renew-triggered" = {
wantedBy = [ "acme-order-renew-${config.networking.fqdn}.service" ];
after = [ "acme-order-renew-${config.networking.fqdn}.service" ];
unitConfig.RefuseManualStart = true;
};
specialisation = {
renew.configuration = {
# Pebble provides 5 year long certs,
@ -177,17 +183,29 @@ in
# old_hash will be used in the preservation tests later
old_hash = hash
builtin.succeed(f"systemctl start acme-{cert}.service")
builtin.succeed(f"systemctl start acme-order-renew-{cert}.service")
builtin.wait_for_unit("renew-triggered.target")
hash_after = builtin.succeed(f"sha256sum /var/lib/acme/{cert}/cert.pem")
assert hash == hash_after, "Certificate was unexpectedly changed"
builtin.succeed("systemctl stop renew-triggered.target")
switch_to(builtin, "renew")
builtin.wait_for_unit("renew-triggered.target")
check_issuer(builtin, cert, "pebble")
hash_after = builtin.succeed(f"sha256sum /var/lib/acme/{cert}/cert.pem | tee /dev/stderr")
assert hash != hash_after, "Certificate was not renewed"
check_permissions(builtin, cert, "acme")
with subtest("Handles email change correctly"):
hash = builtin.succeed(f"sha256sum /var/lib/acme/{cert}/cert.pem")
builtin.succeed("systemctl stop renew-triggered.target")
switch_to(builtin, "accountchange")
builtin.wait_for_unit("renew-triggered.target")
check_issuer(builtin, cert, "pebble")
# Check that there are now 2 account directories
builtin.succeed("test $(ls -1 /var/lib/acme/.lego/accounts | tee /dev/stderr | wc -l) -eq 2")
@ -202,58 +220,101 @@ in
# old_hash will be used in the preservation tests later
old_hash = hash_after
check_permissions(builtin, cert, "acme")
with subtest("Correctly implements OCSP stapling"):
check_stapling(builtin, cert, "${caDomain}", fail=True)
builtin.succeed("systemctl stop renew-triggered.target")
switch_to(builtin, "ocsp_stapling")
builtin.wait_for_unit("renew-triggered.target")
check_stapling(builtin, cert, "${caDomain}")
check_permissions(builtin, cert, "acme")
with subtest("Handles keyType change correctly"):
check_key_bits(builtin, cert, 256)
builtin.succeed("systemctl stop renew-triggered.target")
switch_to(builtin, "keytype")
builtin.wait_for_unit("renew-triggered.target")
check_key_bits(builtin, cert, 384)
# keyType is part of the accountHash, thus a new account will be created
builtin.succeed("test $(ls -1 /var/lib/acme/.lego/accounts | tee /dev/stderr | wc -l) -eq 2")
check_permissions(builtin, cert, "acme")
with subtest("Reuses generated, valid certs from previous configurations"):
# Right now, the hash should not match due to the previous test
hash = builtin.succeed(f"sha256sum /var/lib/acme/{cert}/cert.pem | tee /dev/stderr")
assert hash != old_hash, "Expected certificate to differ"
builtin.succeed("systemctl stop renew-triggered.target")
switch_to(builtin, "preservation")
builtin.wait_for_unit("renew-triggered.target")
hash = builtin.succeed(f"sha256sum /var/lib/acme/{cert}/cert.pem | tee /dev/stderr")
assert hash == old_hash, "Expected certificate to match from older configuration"
check_permissions(builtin, cert, "acme")
with subtest("Add a new cert, extend existing cert domains"):
check_domain(builtin, cert, f"builtin-alt.{domain}", fail=True)
builtin.succeed("systemctl stop renew-triggered.target")
switch_to(builtin, "add_cert_and_domain")
builtin.wait_for_unit("renew-triggered.target")
check_issuer(builtin, cert, "pebble")
check_domain(builtin, cert, f"builtin-alt.{domain}")
check_issuer(builtin, cert2, "pebble")
check_domain(builtin, cert2, cert2)
# There should not be a new account folder created
builtin.succeed("test $(ls -1 /var/lib/acme/.lego/accounts | tee /dev/stderr | wc -l) -eq 2")
check_permissions(builtin, cert, "acme")
check_permissions(builtin, cert2, "acme")
with subtest("Check account hashing compatibility with pre-24.05 settings"):
switch_to(builtin, "legacy_account_hash", fail=True)
builtin.succeed(f"stat {legacy_account_dir} > /dev/stderr && rm -rf {legacy_account_dir}")
builtin.succeed("systemctl stop renew-triggered.target")
switch_to(builtin, "legacy_account_hash"
)
builtin.wait_for_unit("renew-triggered.target")
with subtest("Ensure Concurrency limits work"):
builtin.succeed(f"stat {legacy_account_dir} > /dev/stderr && rm -rf {legacy_account_dir}")
check_permissions(builtin, cert, "acme")
with subtest("Ensure concurrency limits work"):
builtin.succeed("systemctl stop renew-triggered.target")
switch_to(builtin, "concurrency")
builtin.wait_for_unit("renew-triggered.target")
check_issuer(builtin, cert3, "pebble")
check_domain(builtin, cert3, cert3)
check_permissions(builtin, cert, "acme")
with subtest("Can renew using a CSR"):
builtin.succeed(f"systemctl stop acme-{cert}.service")
builtin.succeed(f"systemctl clean acme-{cert}.service --what=state")
builtin.succeed("systemctl stop renew-triggered.target")
switch_to(builtin, "csr")
builtin.wait_for_unit("renew-triggered.target")
check_issuer(builtin, cert, "pebble")
with subtest("Generate self-signed certs"):
acme.shutdown()
check_issuer(builtin, cert, "pebble")
builtin.succeed(f"systemctl stop acme-{cert}.service")
builtin.succeed(f"systemctl clean acme-{cert}.service --what=state")
builtin.succeed(f"systemctl start acme-selfsigned-{cert}.service")
builtin.succeed(f"systemctl start acme-{cert}.service")
check_issuer(builtin, cert, "minica")
check_domain(builtin, cert, cert)
with subtest("Validate permissions (self-signed)"):
check_permissions(builtin, cert, "acme")
with subtest("Can renew using a CSR"):
builtin.succeed(f"systemctl clean acme-{cert}.service --what=state")
switch_to(builtin, "csr")
check_issuer(builtin, cert, "pebble")
'';
}

View File

@ -3,6 +3,36 @@ import time
TOTAL_RETRIES = 20
# BackoffTracker provides a robust system for handling test retries
class BackoffTracker:
# Retry helper with a growing sleep between attempts.
# The delay state is kept on the tracker, so every function wrapped by the
# same tracker's protect() shares (and advances) the same delay.
# Seconds slept before the next retry.
delay = 1
# Amount added to `delay` the next time it grows; doubled on each growth.
increment = 1
# Record one failed attempt: abort via AssertionError (carrying `message`)
# once `retries` has reached TOTAL_RETRIES, otherwise sleep for the current
# delay and return the updated retry count.
def handle_fail(self, retries, message) -> int:
assert retries < TOTAL_RETRIES, message
print(f"Retrying in {self.delay}s, {retries + 1}/{TOTAL_RETRIES}")
time.sleep(self.delay)
# Only increment after the first try
# (retries == 0 is the first failure of the current protected call, so the
# delay grows once per failing call rather than once per retry).
if retries == 0:
self.delay += self.increment
self.increment *= 2
return retries + 1
# Decorator: returns a wrapper that re-invokes `func` with the same
# arguments whenever it raises an Exception.
def protect(self, func):
# `retries` is bookkeeping for the wrapper only; it is not forwarded to func.
def wrapper(*args, retries: int = 0, **kwargs):
try:
return func(*args, **kwargs)
except Exception as err:
# The caught exception's args become the assertion message if the
# retry budget is exhausted.
retries = self.handle_fail(retries, err.args)
# Retry by recursion; this call sits outside the `try`, so the
# AssertionError from handle_fail propagates instead of being retried.
return wrapper(*args, retries=retries, **kwargs)
return wrapper
backoff = BackoffTracker()
def run(node, cmd, fail=False):
if fail:
@ -39,6 +69,7 @@ def switch_to(node, name, fail=False) -> None:
# and matches the issuer we expect it to be.
# It's a good validation to ensure the cert.pem and fullchain.pem
# are not still selfsigned after verification
@backoff.protect
def check_issuer(node, cert_name, issuer) -> None:
for fname in ("cert.pem", "fullchain.pem"):
actual_issuer = node.succeed(
@ -102,9 +133,10 @@ def check_permissions(node, cert_name, group):
f"test $({stat} /var/lib/acme/{cert_name}/*.pem"
f" | tee /dev/stderr | grep -v '640 acme {group}' | wc -l) -eq 0"
)
node.execute(f"ls -lahR /var/lib/acme/.lego/{cert_name}/* > /dev/stderr")
node.succeed(
f"test $({stat} /var/lib/acme/.lego/{cert_name}/*/{cert_name}*"
f" | tee /dev/stderr | grep -v '600 acme {group}' | wc -l) -eq 0"
f" | tee /dev/stderr | grep -v '640 acme {group}' | wc -l) -eq 0"
)
node.succeed(
f"test $({stat} /var/lib/acme/{cert_name}"
@ -115,37 +147,6 @@ def check_permissions(node, cert_name, group):
f" | tee /dev/stderr | grep -v '600 acme {group}' | wc -l) -eq 0"
)
# BackoffTracker provides a robust system for handling test retries
class BackoffTracker:
delay = 1
increment = 1
def handle_fail(self, retries, message) -> int:
assert retries < TOTAL_RETRIES, message
print(f"Retrying in {self.delay}s, {retries + 1}/{TOTAL_RETRIES}")
time.sleep(self.delay)
# Only increment after the first try
if retries == 0:
self.delay += self.increment
self.increment *= 2
return retries + 1
def protect(self, func):
def wrapper(*args, retries: int = 0, **kwargs):
try:
return func(*args, **kwargs)
except Exception as err:
retries = self.handle_fail(retries, err.args)
return wrapper(*args, retries=retries, **kwargs)
return wrapper
backoff = BackoffTracker()
@backoff.protect
def download_ca_certs(node, ca_domain):

View File

@ -2,7 +2,7 @@
serverName,
group,
baseModule,
domain ? "example.test",
domain,
}:
{
config,
@ -18,6 +18,8 @@
timeout = 300;
};
interactive.sshBackdoor.enable = true;
nodes = {
# The fake ACME server which will respond to client requests
acme =
@ -45,6 +47,7 @@
"certchange.${domain}"
"zeroconf.${domain}"
"zeroconf2.${domain}"
"zeroconf3.${domain}"
"nullroot.${domain}"
];
@ -57,6 +60,7 @@
systemd.targets."renew-triggered" = {
wantedBy = [ "${serverName}-config-reload.service" ];
after = [ "${serverName}-config-reload.service" ];
unitConfig.RefuseManualStart = true;
};
security.acme.certs."proxied.${domain}" = {
@ -101,13 +105,42 @@
# Test that "acmeRoot = null" still results in
# valid cert generation by inheriting defaults.
nullroot.configuration = {
security.acme.defaults.listenHTTP = ":8080";
# The default.nix has the server-type dependent config statements
# to properly set up the proxying. We need a separate port here to
# avoid hostname issues with the proxy already running on :8080
security.acme.defaults.listenHTTP = ":8081";
services.${serverName}.virtualHosts."nullroot.${domain}" = {
onlySSL = true;
addSSL = true;
enableACME = true;
acmeRoot = null;
};
};
# Test that adding a second virtual host will not trigger
# other units (account and renewal service for first)
zeroconf3.configuration = {
services.${serverName}.virtualHosts = {
"zeroconf.${domain}" = {
addSSL = true;
enableACME = true;
serverAliases = [ "zeroconf2.${domain}" ];
};
"zeroconf3.${domain}" = {
addSSL = true;
enableACME = true;
};
};
# Keeping the service unit persistent is risky: it could end up with the
# timers not triggering properly. Show that the timers still have the
# desired effect.
systemd.timers."acme-renew-zeroconf3.${domain}".timerConfig = {
OnCalendar = lib.mkForce "*-*-* *:*:0/5";
AccuracySec = lib.mkForce 0;
# Force the random skew within the day (see https://letsencrypt.org/docs/integration-guide/)
# to zero so the timer fires on the OnCalendar schedule set above.
RandomizedDelaySec = lib.mkForce 0;
FixedRandomDelay = lib.mkForce 0;
};
};
};
};
};
@ -121,30 +154,24 @@
ca_domain = "${nodes.acme.test-support.acme.caDomain}"
fqdn = f"proxied.{domain}"
webserver.start()
webserver.wait_for_unit("${serverName}.service")
with subtest("Can run on self-signed certificates"):
check_issuer(webserver, fqdn, "minica")
# Check that the web server has picked up the selfsigned cert
check_connection(webserver, fqdn, minica=True)
acme.start()
wait_for_running(acme)
acme.wait_for_open_port(443)
with subtest("Acquire a cert through a proxied lego"):
webserver.start()
webserver.succeed("systemctl is-system-running --wait")
wait_for_running(webserver)
download_ca_certs(webserver, ca_domain)
check_connection(webserver, fqdn)
with subtest("Can run on selfsigned certificates"):
# Switch to selfsigned first
webserver.succeed(f"systemctl clean acme-{fqdn}.service --what=state")
webserver.succeed(f"systemctl start acme-selfsigned-{fqdn}.service")
check_issuer(webserver, fqdn, "minica")
webserver.succeed("systemctl restart ${serverName}-config-reload.service")
# Check that the web server has picked up the selfsigned cert
check_connection(webserver, fqdn, minica=True)
webserver.succeed("systemctl stop renew-triggered.target")
webserver.succeed(f"systemctl start acme-{fqdn}.service")
webserver.wait_for_unit("renew-triggered.target")
check_issuer(webserver, fqdn, "pebble")
check_connection(webserver, fqdn)
webserver.succeed(f"systemctl start acme-order-renew-{fqdn}.service")
webserver.wait_for_unit("renew-triggered.target")
download_ca_certs(webserver, ca_domain)
check_issuer(webserver, fqdn, "pebble")
check_connection(webserver, fqdn)
with subtest("security.acme changes reflect on web server part 1"):
check_connection(webserver, f"certchange.{domain}", fail=True)
@ -181,5 +208,23 @@
switch_to(webserver, "nullroot")
webserver.wait_for_unit("renew-triggered.target")
check_connection(webserver, f"nullroot.{domain}")
with subtest("Ensure that adding a second vhost does not trigger first vhost acme units"):
switch_to(webserver, "zeroconf")
webserver.wait_for_unit("renew-triggered.target")
webserver.succeed("journalctl --cursor-file=/tmp/cursor | grep acme")
switch_to(webserver, "zeroconf3")
webserver.wait_for_unit("renew-triggered.target")
output = webserver.succeed("journalctl --cursor-file=/tmp/cursor | grep acme")
# The new certificate unit gets triggered:
t.assertIn(f"acme-zeroconf3.{domain}-start", output)
# The account generation should not be triggered again:
t.assertNotIn("acme-account-d590213ed52603e9128d.target", output)
# The other certificates should also not be triggered:
t.assertNotIn(f"acme-zeroconf.{domain}-start", output)
t.assertNotIn(f"acme-proxied.{domain}-start", output)
# Ensure the timer works, due to our shenanigans with
# RemainAfterExit=true
webserver.wait_until_succeeds(f"journalctl --cursor-file=/tmp/cursor | grep 'Starting Order (and renew) ACME certificate for zeroconf3.{domain}...'")
'';
}

View File

@ -137,17 +137,18 @@ import ./make-test-python.nix (
caserver.wait_for_unit("step-ca.service")
caserver.wait_until_succeeds("journalctl -o cat -u step-ca.service | grep '${pkgs.step-ca.version}'")
caclient.wait_for_unit("acme-finished-caclient.target")
catester.succeed("curl https://caclient/ | grep \"Welcome to nginx!\"")
caclient.wait_for_unit("acme-caclient.service")
# The order is run asynchronously, keep trying.
catester.wait_until_succeeds("curl https://caclient/ | grep \"Welcome to nginx!\"")
caclientcaddy.wait_for_unit("caddy.service")
# It's hard to know when Caddy has finished the ACME dance with
# step-ca, so we keep trying cURL until success.
catester.wait_until_succeeds("curl https://caclientcaddy/ | grep \"Welcome to Caddy!\"")
caclienth2o.wait_for_unit("acme-finished-caclienth2o.target")
caclienth2o.wait_for_unit("acme-caclienth2o.service")
caclienth2o.wait_for_unit("h2o.service")
catester.succeed("curl https://caclienth2o/ | grep \"Welcome to H2O!\"")
catester.wait_until_succeeds("curl https://caclienth2o/ | grep \"Welcome to H2O!\"")
'';
}
)