test-driver: Use futures for OCR parallelization

This commit is contained in:
Jacek Galowicz 2025-07-02 11:43:13 +00:00
parent e6ea13f4ea
commit 819d304a39

View File

@@ -1,7 +1,7 @@
import multiprocessing
import os import os
import shutil import shutil
import subprocess import subprocess
from concurrent.futures import Future, ThreadPoolExecutor
from pathlib import Path from pathlib import Path
from test_driver.errors import MachineError from test_driver.errors import MachineError
@@ -33,17 +33,19 @@ def perform_ocr_variants_on_screenshot(
# Docs suggest to run it with OMP_THREAD_LIMIT=1 for hundreds of parallel # Docs suggest to run it with OMP_THREAD_LIMIT=1 for hundreds of parallel
# runs. Our average test run is somewhere inbetween. # runs. Our average test run is somewhere inbetween.
# https://github.com/tesseract-ocr/tesseract/issues/3109 # https://github.com/tesseract-ocr/tesseract/issues/3109
processes = max(1, int(os.process_cpu_count() / 4)) workers = max(1, int(os.process_cpu_count() / 4))
with multiprocessing.Pool(processes=processes) as pool: with ThreadPoolExecutor(max_workers=workers) as e:
image_paths: list[Path] = [screenshot_path] # The idea here is to let the first tesseract call run on the raw image
# while the other two are preprocessed + tesseracted in parallel
future_results: list[Future] = [e.submit(_run_tesseract, screenshot_path)]
if variants: if variants:
image_paths.extend(
pool.starmap( def tesseract_processed(inverted: bool) -> str:
_preprocess_screenshot, return _run_tesseract(_preprocess_screenshot(screenshot_path, inverted))
[(screenshot_path, False), (screenshot_path, True)],
) future_results.append(e.submit(tesseract_processed, False))
) future_results.append(e.submit(tesseract_processed, True))
return pool.map(_run_tesseract, image_paths) return [future.result() for future in future_results]
def _run_tesseract(image: Path) -> str: def _run_tesseract(image: Path) -> str: