"""Run RISC-V Torture against a compiled Chipyard/BOOM RTL simulator and diff the
Spike vs RTL architectural signatures."""
import fcntl
import logging
import os
import re
import stat
import subprocess
import uuid
from chia.chipyard.chisel_build_node import ChiselBuildNode
from chia.chipyard.state_def import (
BuildArtifact,
TortureMode,
TortureResult,
TortureTestRun,
)
from chia.base.ChiaFunction import ChiaFunction
# Drops -XX:MaxPermSize=128M (Unrecognized VM option on JDK 9+) and adds
# -Djava.security.manager=allow so the bundled sbt-launch.jar (1.4.4) can
# still call System.setSecurityManager on JDK 17+ without UnsupportedOperationException.
SBT_OVERRIDE = "java -Djava.security.manager=allow -Xmx2G -Xss8M -jar sbt-launch.jar"
[docs]
class TortureRunNode:
"""Runs RISC-V Torture against a compiled Chipyard/BOOM RTL simulator.
`Torture <https://github.com/ucb-bar/riscv-torture>`_ randomly generates
RISC-V assembly tests, runs each on both Spike (the ISA reference model)
and the RTL simulator under test (the DUT), and diffs their architectural
signatures. This node drives
the ``make`` flow in ``<chipyard_path>/tools/torture``: it writes the
simulator binary from a :class:`BuildArtifact` to disk, passes it as the
Torture ``R_SIM``, invokes the appropriate target for the requested
:class:`TortureMode`, and collects the generated test, disassembly, and
Spike/RTL signatures for each run.
Because Torture writes into a shared ``tools/torture/output`` directory,
concurrent runs on the same checkout are serialized with a file lock.
"""
logging_name = "TortureRunNode"
def __init__(
self,
chipyard_path: str,
sbt_override: str = SBT_OVERRIDE,
timeout_seconds: int = 1800,
logging_level: int = logging.DEBUG,
):
"""Configure a Torture runner against a Chipyard checkout.
Args:
chipyard_path: Absolute path to the Chipyard checkout on the build
node. Torture is driven out of ``<chipyard_path>/tools/torture``
(with its shared ``output/`` subdirectory). Path for the CHIA chipyard container is ``/home/ray/chipyard``
sbt_override: The ``SBT=`` command handed to ``make``, overriding the
Makefile's default launcher.
timeout_seconds: Wall-clock timeout for the ``make`` subprocess
(default 30 min). For :attr:`TortureMode.OVERNIGHT` the effective
timeout is raised to at least ``overnight_minutes * 60 + 600`` so
the make process outlives the overnight loop.
logging_level: Logging level for this node's logger.
"""
self.chipyard_path = chipyard_path
self.torture_path = os.path.join(chipyard_path, "tools", "torture")
self.output_path = os.path.join(self.torture_path, "output")
self.sbt_override = sbt_override
self.timeout_seconds = timeout_seconds
self.logger = logging.getLogger(self.logging_name)
self.logger.setLevel(logging_level)
def _setup(self, artifact: BuildArtifact, work_dir: str) -> tuple[str, str]:
os.makedirs(work_dir, exist_ok=True)
task_dir = os.path.join(work_dir, uuid.uuid4().hex[:8])
os.makedirs(task_dir, exist_ok=True)
sim_path = os.path.join(task_dir, artifact.simulator_binary_name)
with open(sim_path, "wb") as f:
f.write(artifact.simulator_binary_content)
os.chmod(sim_path, os.stat(sim_path).st_mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH)
self.logger.info(f"Setup complete. task_dir={task_dir}, sim_path={sim_path}")
return task_dir, sim_path
def _flock_torture(self) -> int:
lock_path = os.path.join(self.torture_path, ".chia-torture.lock")
lock_fd = os.open(lock_path, os.O_CREAT | os.O_RDWR)
fcntl.flock(lock_fd, fcntl.LOCK_EX)
return lock_fd
def _release(self, lock_fd: int) -> None:
fcntl.flock(lock_fd, fcntl.LOCK_UN)
os.close(lock_fd)
def _clean_output(self) -> None:
if not os.path.isdir(self.output_path):
return
try:
subprocess.run(
["make", "-C", self.output_path, "clean-all"],
capture_output=True, text=True, timeout=60,
)
except subprocess.TimeoutExpired:
self.logger.warning("torture output clean-all timed out")
def _run_make(self, args: list[str], timeout_seconds: int) -> subprocess.CompletedProcess:
cmd = ["make", "-C", self.torture_path] + args + [f"SBT={self.sbt_override}"]
self.logger.info(f"Running: {cmd}")
return subprocess.run(
cmd,
capture_output=True, text=True, timeout=timeout_seconds,
)
def _slurp(self, path: str) -> str:
try:
with open(path, "r", errors="replace") as f:
return f.read()
except (FileNotFoundError, IsADirectoryError):
return ""
def _parse_single_stdout(self, stdout: str) -> tuple[bool, list[str]]:
"""testrun emits '// All signatures match for <bin>' on success and
'// Simulation failed for <bin>:' / '// Mismatched sigs for <bin>:' on failure."""
fails = re.findall(r"//\s+Simulation failed for (\S+):", stdout)
mism = re.findall(r"//\s+Mismatched sigs for (\S+):", stdout)
if fails or mism:
# dedupe preserving order
return False, list(dict.fromkeys(fails + mism))
if "All signatures match" in stdout:
return True, []
# Neither marker — testrun did not reach the diff phase (compile error, generator crash).
return False, []
def _build_test_run(self, abs_bin: str, success: bool) -> TortureTestRun:
"""Slurp .S/.dump/.spike.sig/.rtlsim.sig (and any narrowed pseg) for one test."""
base = os.path.basename(abs_bin)
test_s = self._slurp(abs_bin + ".S")
test_dump = self._slurp(abs_bin + ".dump")
spike_sig = self._slurp(abs_bin + ".spike.sig")
rtlsim_sig = self._slurp(abs_bin + ".rtlsim.sig")
pseg_s: str | None = None
if not success:
output_dir = os.path.dirname(abs_bin)
if os.path.isdir(output_dir):
for fname in sorted(os.listdir(output_dir)):
if fname.startswith(base + "_pseg_") and fname.endswith(".S"):
pseg_s = self._slurp(os.path.join(output_dir, fname))
break
return TortureTestRun(
name=base, success=success,
test_s=test_s, test_dump=test_dump,
spike_sig=spike_sig, rtlsim_sig=rtlsim_sig,
pseg_test_s=pseg_s,
)
def _persist_test(self, test: TortureTestRun, task_dir: str) -> None:
"""Copy a test's artifacts into <task_dir>/tests/<name>/ so they survive cross-run cleanup."""
persist_dir = os.path.join(task_dir, "tests", test.name)
os.makedirs(persist_dir, exist_ok=True)
for ext, content in [
(".S", test.test_s), (".dump", test.test_dump),
(".spike.sig", test.spike_sig), (".rtlsim.sig", test.rtlsim_sig),
]:
if content:
with open(os.path.join(persist_dir, test.name + ext), "w") as f:
f.write(content)
if test.pseg_test_s is not None:
with open(os.path.join(persist_dir, test.name + "_pseg.S"), "w") as f:
f.write(test.pseg_test_s)
def _gather_single(self, stdout: str, all_match: bool, failing_binaries: list[str], task_dir: str) -> list[TortureTestRun]:
"""SINGLE/REPLAY: one test ran. Pull its artifacts whether it passed or failed."""
tests: list[TortureTestRun] = []
if all_match:
# On success, testrun emits the binary path in "All signatures match for <bin>".
success_bins = re.findall(r"//\s+All signatures match for (\S+)", stdout)
for bin_path in dict.fromkeys(success_bins):
abs_bin = bin_path if os.path.isabs(bin_path) else os.path.join(self.torture_path, bin_path)
t = self._build_test_run(abs_bin, success=True)
self._persist_test(t, task_dir)
tests.append(t)
else:
for bin_path in failing_binaries:
abs_bin = bin_path if os.path.isabs(bin_path) else os.path.join(self.torture_path, bin_path)
t = self._build_test_run(abs_bin, success=False)
self._persist_test(t, task_dir)
tests.append(t)
return tests
def _gather_overnight(self, stdout: str, failed_dir: str, task_dir: str) -> tuple[int, int, list[TortureTestRun]]:
"""OVERNIGHT: count passes from stdout (overnight deletes passing artifacts);
slurp every failure from failedtests/. Returns (num_tests, num_failures, tests)."""
passes = len(re.findall(r"All signatures match", stdout))
tests: list[TortureTestRun] = []
if os.path.isdir(failed_dir):
seen: set[str] = set()
for fname in sorted(os.listdir(failed_dir)):
base, _ = os.path.splitext(fname)
if base.endswith((".spike", ".rtlsim", ".csim")):
base = base.rsplit(".", 1)[0]
if base in seen:
continue
seen.add(base)
fbase = os.path.join(failed_dir, base)
t = self._build_test_run(fbase, success=False)
self._persist_test(t, task_dir)
tests.append(t)
return passes + len(tests), len(tests), tests
[docs]
@ChiaFunction(resources={"chipyard": 1})
def torture(
self,
artifact: BuildArtifact,
mode: TortureMode = TortureMode.SINGLE,
work_dir: str = "/tmp/chia-torture",
torture_config_file: str | None = None,
overnight_minutes: int = 30,
overnight_max_failures: int = 1,
replay_test_s: str | None = None,
) -> TortureResult:
"""Run Torture against a pre-built simulator and collect the results.
The simulator binary from ``artifact`` is written to a per-run task
directory under ``work_dir`` and passed to Torture as ``R_SIM``. The
make target invoked depends on ``mode``:
* :attr:`TortureMode.SINGLE` → ``make rgentest`` — generate one test,
run it on the DUT and Spike, and diff signatures.
* :attr:`TortureMode.OVERNIGHT` → ``make rnight`` — loop generating and
running tests until ``overnight_max_failures`` failures accumulate or
``overnight_minutes`` elapse; failing tests are saved under
``failedtests/``.
* :attr:`TortureMode.REPLAY` → ``make rtest`` — run a caller-supplied
assembly test (``replay_test_s``) instead of generating one.
If ``artifact.success`` is False the run is skipped and an unsuccessful
:class:`TortureResult` is returned immediately. Runs are serialized on a
per-checkout file lock.
Args:
artifact: Compiled RTL simulator to test, from a
:class:`ChiselBuildNode` build. Its ELF bytes become the Torture
``R_SIM`` (the DUT); Spike is the implicit reference model.
mode: Which Torture flow to run — see :class:`TortureMode`.
work_dir: Base directory under which a fresh per-run task directory
(random 8-hex name) is created to hold the simulator binary and
collected artifacts.
torture_config_file: Optional Torture generator config, passed as
``-C <file>`` (Torture's ``--config``). Controls the generated
instruction mix / sequences. ``None`` uses Torture's default.
overnight_minutes: OVERNIGHT only — how long the generate-and-test
loop runs, passed as ``-m <minutes>``. Also extends the make
subprocess timeout. Ignored in other modes.
overnight_max_failures: OVERNIGHT only — stop after this many failing
tests, passed as Torture's ``-t <count>`` threshold. Ignored in
other modes.
replay_test_s: REPLAY only — the assembly source to replay. Written
to ``replay.S`` and passed as ``TEST=`` (Torture's ``-a``).
Required for REPLAY; raises :class:`ValueError` if missing.
Returns:
A :class:`TortureResult` with overall ``success``, the test/failure
counts, per-test artifacts (:class:`TortureTestRun`), and the raw
make ``stdout``/``stderr``/``returncode``. ``build_artifact`` is left
``None`` here; :meth:`torture_from_config` populates it.
"""
if not artifact.success:
return TortureResult(
name="torture",
config=artifact.config,
config_package=artifact.config_package,
mode=mode,
success=False,
num_tests=0,
num_failures=0,
tests=[],
stdout=artifact.stdout,
stderr=artifact.stderr + "\nTorture skipped: build artifact was unsuccessful.",
returncode=artifact.returncode,
build_artifact=None,
)
task_dir, sim_path = self._setup(artifact, work_dir)
lock_fd = self._flock_torture()
try:
self._clean_output()
opts: list[str] = []
if torture_config_file:
opts += ["-C", torture_config_file]
run_timeout = self.timeout_seconds
if mode == TortureMode.SINGLE:
make_args = ["rgentest", f"R_SIM={sim_path}"]
if opts:
make_args.append(f"OPTIONS={' '.join(opts)}")
elif mode == TortureMode.OVERNIGHT:
failed_dir = os.path.join(task_dir, "failedtests")
os.makedirs(failed_dir, exist_ok=True)
onight_opts = opts + [
"-m", str(overnight_minutes),
"-t", str(overnight_max_failures),
"-p", failed_dir,
]
make_args = ["rnight", f"R_SIM={sim_path}", f"OPTIONS={' '.join(onight_opts)}"]
# Ensure the make subprocess doesn't time out before the overnight loop completes.
run_timeout = max(self.timeout_seconds, overnight_minutes * 60 + 600)
elif mode == TortureMode.REPLAY:
if not replay_test_s:
raise ValueError("REPLAY mode requires replay_test_s")
replay_path = os.path.join(task_dir, "replay.S")
with open(replay_path, "w") as f:
f.write(replay_test_s)
make_args = ["rtest", f"R_SIM={sim_path}", f"TEST={replay_path}"]
if opts:
make_args.append(f"OPTIONS={' '.join(opts)}")
else:
raise ValueError(f"Unsupported TortureMode: {mode}")
try:
proc = self._run_make(make_args, run_timeout)
stdout, stderr, rc = proc.stdout, proc.stderr, proc.returncode
except subprocess.TimeoutExpired as e:
stdout = e.stdout.decode(errors="replace") if isinstance(e.stdout, bytes) else (e.stdout or "")
stderr = e.stderr.decode(errors="replace") if isinstance(e.stderr, bytes) else (e.stderr or "")
stderr += f"\nTorture run timed out after {run_timeout}s"
rc = -1
if mode == TortureMode.OVERNIGHT:
num_tests, num_failures, tests = self._gather_overnight(
stdout, os.path.join(task_dir, "failedtests"), task_dir)
# overnight/run does System.exit(2) on errors; that's a *normal* completion for us.
run_completed = rc in (0, 2)
success = run_completed and num_failures == 0
else:
all_match, failing_bins = self._parse_single_stdout(stdout)
tests = self._gather_single(stdout, all_match, failing_bins, task_dir)
num_failures = sum(1 for t in tests if not t.success)
num_tests = max(len(tests), 1)
success = (rc == 0) and all_match
finally:
self._release(lock_fd)
return TortureResult(
name="torture",
config=artifact.config,
config_package=artifact.config_package,
mode=mode,
success=success,
num_tests=num_tests,
num_failures=num_failures,
tests=tests,
stdout=stdout,
stderr=stderr,
returncode=rc,
)
@ChiaFunction(resources={"chipyard": 1})
def torture_from_config(
self,
config: str,
config_package: str = "chipyard",
mode: TortureMode = TortureMode.SINGLE,
work_dir: str = "/tmp/chia-torture",
build_kwargs: dict | None = None,
torture_config_file: str | None = None,
overnight_minutes: int = 30,
overnight_max_failures: int = 1,
replay_test_s: str | None = None,
) -> TortureResult:
builder = ChiselBuildNode(
chipyard_path=self.chipyard_path,
config=config,
config_package=config_package,
**(build_kwargs or {}),
)
artifact = builder.build()
if not artifact.success:
return TortureResult(
name="torture",
config=config,
config_package=config_package,
mode=mode,
success=False,
num_tests=0,
num_failures=0,
tests=[],
stdout=artifact.stdout,
stderr=artifact.stderr + "\nTorture skipped: ChiselBuildNode failed.",
returncode=artifact.returncode,
build_artifact=artifact,
)
result = self.torture(
artifact=artifact,
mode=mode,
work_dir=work_dir,
torture_config_file=torture_config_file,
overnight_minutes=overnight_minutes,
overnight_max_failures=overnight_max_failures,
replay_test_s=replay_test_s,
)
result.build_artifact = artifact
return result