1
0
mirror of https://git.FreeBSD.org/ports.git synced 2025-01-17 08:01:36 +00:00

Overhaul of the job scheduler. The new scheduler runs builds
synchronously instead of probabilistically scheduling jobs, which
means that the job load on a machine never exceeds a desired
threshold, and we can preferentially use faster machines when they are
available.  This has a dramatic effect on package build throughput,
although I don't yet have precise measurements of the performance
improvements.

Specifically, the changes are:

* Introduce the new variable maxjobs in portbuild.  This replaces the
build scheduling weights previously listed in the mlist file, which
now changes format to list the build machines only, ranked in order of
preference for job dispatches (i.e. faster machines first).

* The ${arch}/queue directory is used to list machines available for
jobs (file content is the number of jobs currently running on the
machine).  Changes to files in this directory are serialized using
lockf on the .lock file.

* Claim a machine with the getmachine script, with the .lock held.
This picks the machine with the fewest jobs running; if several
machines have equal load, the one listed highest in the mlist file is
chosen.  The job counter is incremented, and the file removed if
the counter reaches ${maxjobs} for that machine.  If all machines are
busy, sleep for 15 seconds and retry.

* After we have claimed a machine, we run claim-chroot on it to claim
an empty chroot, as before.  If the claim fails, release the job from
the queue with the releasemachine script and retry after a 15 second
wait.

* When the build is finished, decrement the job counter with the
releasemachine script, with .lock held.

* The checkmachines script now exists only to poll the load averages
for admin convenience (every 2 minutes), and to ping for unreachable
machines.  When a machine cannot be reached, remove the entry in the
queue directory to stop further job dispatches to it.  This needs more
work to deal with reinitialization of machines after they become
available again.

Additional changes to this file:

* Exit with an error if passed a null (empty) package name, to avoid failures later on.

* Send a nag-mail if pkg-plist errors are detected in the build log
(currently only on i386).
This commit is contained in:
Kris Kennaway 2004-12-28 05:40:15 +00:00
parent b6626b754a
commit e47e305126
Notes: svn2git 2021-03-31 03:12:20 +00:00
svn path=/head/; revision=125316

View File

@ -36,21 +36,12 @@ if grep -qxF ${pkgname} ${pb}/${arch}/${branch}/duds; then
exit 1
fi
args=${1+"$@"}
num=$(wc -w ${pb}/${arch}/ulist | awk '{print $1}')
random=$(jot -r 1 1 ${num})
mach=$(cat ${pb}/${arch}/ulist | cut -f ${random} -d ' ' )
# If ulist is empty, then all build machines are busy, so try again in 15 seconds.
if [ -z "${mach}" ]; then
echo "All machines busy, sleeping"
sleep 15
echo "Retrying build of ${pkgname}"
make ${pkgname}
exit 0
if [ -z "${pkgname}" ]; then
echo "null packagename"
exit 1
fi
set $mach
args=${1+"$@"}
flags=""
noclean=0
if [ "x$NOCLEAN" != "x" ]; then
@ -75,20 +66,27 @@ fi
if [ "x$TRYBROKEN" != "x" ]; then
flags="${flags} -trybroken"
fi
host=$1
. ${pb}/${arch}/portbuild.${host}
while [ -z "${chroot}" ]; do
echo "Claiming a directory for ${pkgname} on ${host}"
# May still fail if ssh times out?
chroot=$(ssh -a -n ${client_user}@${host} ${sudo_cmd} ${pb}/scripts/claim-chroot ${arch} ${branch} ${pkgname})
status=$?
if [ ! ${status} ]; then
echo "!!! Exiting from claim-chroot with status ${status} (${host} ${pkgname})"
exit ${status}
host=
chroot=
while [ -z "${host}" -o -z "${chroot}" ]; do
chroot=
host=$(lockf ${pb}/${arch}/queue/.lock ${pb}/scripts/getmachine ${pb} ${arch} ${branch})
# If getmachine returned no host, then all build machines are busy, so try again in 15 seconds.
if [ -z "${host}" ]; then
sleep 15
else
test -f ${pb}/${arch}/portbuild.${host} && . ${pb}/${arch}/portbuild.${host}
chroot=$(ssh -a -n ${client_user}@${host} ${sudo_cmd} ${pb}/scripts/claim-chroot ${arch} ${branch} ${pkgname})
if [ -z "${chroot}" ]; then
echo "Failed to claim chroot on ${host}"
lockf ${pb}/${arch}/queue/.lock ${pb}/scripts/releasemachine ${arch} ${host}
fi
fi
done
echo "--> got directory ${chroot}"
echo ${chroot}@${host}
test -f ${pb}/${arch}/portbuild.${host} && . ${pb}/${arch}/portbuild.${host}
echo "dispatching: ssh -a -t -n ${client_user}@${host} ${sudo_cmd} ${command} ${arch} ${branch} ${chroot} ${flags} \"$ED\" \"$PD\" \"$FD\" \"$BD\" \"$RD\" ${args}"
${pb}/scripts/ptimeout.host $timeout ssh -a -t -n ${client_user}@${host} ${sudo_cmd} ${command} ${arch} ${branch} ${chroot} ${flags} \"$ED\" \"$PD\" \"$FD\" \"$BD\" \"$RD\" ${args}
@ -113,9 +111,17 @@ if [ "${error}" = 0 ]; then
touch ${pb}/${arch}/${branch}/packages/All/${pkgname}${PKGSUFFIX}
rm -f ${pb}/${arch}/${branch}/errors/${pkgname}.log
lockf ${pb}/${arch}/${branch}/failure.lock ${pb}/scripts/buildsuccess ${arch} ${branch} ${pkgname}
if grep -q "even though it is marked BROKEN" ${pb}/${arch}/${branch}/logs/$pkgname.log; then
log=${pb}/${arch}/${branch}/logs/$pkgname.log
if grep -q "even though it is marked BROKEN" ${log}; then
echo | mail -s "${pkgname} BROKEN but built on ${arch} ${branch}" kris@FreeBSD.org
fi
if [ "${arch}" = "i386" ]; then
if grep -q "^list of .*file" ${log}; then
buildlogdir=$(realpath ${pb}/${arch}/${branch}/logs/)
baselogdir=$(basename ${buildlogdir})
(sed -e '/^build started/,$d' $log;echo;echo "For the full build log, see"; echo; echo " http://${master}/errorlogs/${arch}-errorlogs/${baselogdir}/$(basename $log)";echo;sed -e '1,/^=== Checking filesystem state/d' $log) | mail -s "${pkgname} pkg-plist errors on ${arch} ${branch}" kris@FreeBSD.org
fi
fi
else
log=${pb}/${arch}/${branch}/errors/${pkgname}.log
scp ${client_user}@${host}:${chroot}/tmp/${pkgname}.log ${log} || (echo ${chroot}@${host}; ssh -a -n ${client_user}@${host} ls -laR ${chroot}/tmp) | mail -s "${pkgname} logfile not found" kris@FreeBSD.org
@ -127,4 +133,5 @@ fi
ssh -a -n ${client_user}@${host} ${sudo_cmd} ${pb}/scripts/clean-chroot ${arch} ${branch} ${chroot} ${noclean}
lockf ${pb}/${arch}/queue/.lock ${pb}/scripts/releasemachine ${arch} ${host}
exit ${error}