"""
ToxPrint fingerprinter: compute binary chemotype fingerprints for molecules
using ToxPrint v2.0 (729 bits) or TxP_PFAS v1.0 (129 bits) definitions.
Usage::

    from pyToxPrint.fingerprinter import ToxPrintFingerprinter, PFASFingerprinter
    from rdkit import Chem

    fp = ToxPrintFingerprinter()             # loads bundled ToxPrint v2 XML
    mol = Chem.MolFromSmiles("c1ccccc1")
    arr, names = fp.fingerprint(mol)         # numpy bool array + list of bit names

    fp_pfas = PFASFingerprinter()            # loads bundled TxP_PFAS XML
    arr_pfas, names_pfas = fp_pfas.fingerprint(mol)

Pattern matching strategy
--------------------------
Each chemotype is defined by:

1. A primary SMARTS pattern (substructureMatch molecule)
2. Zero or more exception SMARTS patterns (substructureException molecules)

A fingerprint bit is set to 1 if:

* The molecule contains a substructure match for the primary pattern, AND
* The molecule does NOT contain a substructure match for any exception pattern
  (exception patterns are only applied when the exception molecule contains
  matchingQueryAtom cross-references to the main pattern; otherwise the
  exception acts as a global exclusion)

Note: The exception logic is a reasonable approximation; the original ChemoTyper
tool may produce slightly different results for edge cases.
"""
from __future__ import annotations
import json
import os
import warnings
from functools import lru_cache
from pathlib import Path
from typing import Optional, Union
import numpy as np
# Directory (next to this module) that holds the bundled definition files.
_DATA_DIR = Path(__file__).parent.joinpath("data")

#: Location of the bundled ToxPrint v2.0 fingerprint definition (JSON).
#: Hand it to :class:`Fingerprinter` to load ToxPrint without parsing XML::
#:
#:     fp = Fingerprinter(TOXPRINT_PATH)
TOXPRINT_PATH: Path = _DATA_DIR.joinpath("toxprint_V2.0_r711.json")

#: Location of the bundled TxP_PFAS v1.0.4 fingerprint definition (JSON).
#: Hand it to :class:`Fingerprinter` to load TxP_PFAS without parsing XML::
#:
#:     fp = Fingerprinter(TXPPFAS_PATH)
TXPPFAS_PATH: Path = _DATA_DIR.joinpath("TxP_PFAS_v1.0.4.json")
# ---------------------------------------------------------------------------
# Optional RDKit import (falls back to a stub when RDKit is not installed)
# ---------------------------------------------------------------------------
try:
    from rdkit import Chem
    from rdkit.Chem import MolFromSmarts
except ImportError:

    class _FakeChem:  # type: ignore
        """Stand-in for ``rdkit.Chem`` whose parsers always return None."""

        @staticmethod
        def MolFromSmarts(s):
            return None

        @staticmethod
        def MolFromSmiles(s):
            return None

    # Keep the module importable without RDKit; the names below mirror the
    # ones the successful import would have bound.
    Chem = _FakeChem()  # type: ignore
    MolFromSmarts = Chem.MolFromSmarts
    _HAS_RDKIT = False
else:
    _HAS_RDKIT = True
# ---------------------------------------------------------------------------
# Compiled pattern cache
# ---------------------------------------------------------------------------
# Maps a SMARTS string to its compiled query mol (or None when compilation
# failed / RDKit is unavailable), shared by every caller in this module.
_SMARTS_CACHE: dict[str, object] = {}


def _compile_smarts(smarts: str):
    """Compile *smarts* into a query mol, memoising the outcome.

    Failures (and empty strings) are cached as ``None`` as well, so each
    distinct SMARTS string is handed to RDKit at most once.
    """
    try:
        return _SMARTS_CACHE[smarts]
    except KeyError:
        pass

    compiled = None
    if _HAS_RDKIT and smarts:
        try:
            compiled = Chem.MolFromSmarts(smarts)
        except Exception:
            compiled = None

    _SMARTS_CACHE[smarts] = compiled
    return compiled
# ---------------------------------------------------------------------------
# Load/build fingerprint spec
# ---------------------------------------------------------------------------
def _normalise_spec(raw: Union[dict, list]) -> dict:
    """
    Coerce a decoded JSON/YAML document into the wrapper-dict spec format.

    Accepts both the flat list format ``[{id, label, smarts, ...}, ...]`` and
    the wrapper dict format ``{"bits": [...], "id": ..., "title": ...}``.

    Parameters
    ----------
    raw : list or dict
        The decoded document.

    Returns
    -------
    dict
        For a list, a new wrapper dict with empty ``id`` / ``title`` /
        ``csrml_version``, ``n_bits`` set to the list length and ``bits`` set
        to the list itself. A dict that has a ``"bits"`` key is returned
        unchanged (same object).

    Raises
    ------
    ValueError
        If *raw* is neither a list nor a dict containing a ``"bits"`` key.
        This includes non-container values such as ``None`` (e.g. what an
        empty document may decode to) and plain strings.
    """
    if isinstance(raw, list):
        return {"id": "", "title": "", "csrml_version": "", "n_bits": len(raw), "bits": raw}
    # The isinstance guard keeps ``None`` from raising an opaque TypeError and
    # keeps a string that merely contains "bits" from passing as a valid spec.
    if not isinstance(raw, dict) or "bits" not in raw:
        raise ValueError(
            "JSON/YAML fingerprint spec must be a list of bit dicts or a dict with a 'bits' key."
        )
    return raw
def _load_from_json(path: Path) -> dict:
    """Read the UTF-8 JSON file at *path* and return it as a normalised spec."""
    with open(path, encoding="utf-8") as handle:
        decoded = json.load(handle)
    return _normalise_spec(decoded)
def _load_from_yaml(path: Path) -> dict:
    """Read the UTF-8 YAML file at *path* and return it as a normalised spec.

    PyYAML is imported on demand; when it is missing an ImportError carrying
    an installation hint is raised (chained to the original error).
    """
    try:
        import yaml  # noqa: PLC0415
    except ImportError as exc:
        message = (
            "PyYAML is required to load YAML fingerprint definitions. "
            "Install it with: pip install pyyaml"
        )
        raise ImportError(message) from exc

    with open(path, encoding="utf-8") as handle:
        decoded = yaml.safe_load(handle)
    return _normalise_spec(decoded)
def _load_from_xml(xml_path: Path, json_cache: Optional[Path]) -> dict:
    """
    Parse a CSRML XML file; use / populate a JSON cache when provided.

    The cache is used when it exists, is at least as new as the parser module
    ``_csrml.py`` located next to this file (if that file exists), and either
    the XML file is missing or the cache is at least as new as the XML.
    A cache that cannot be read, is not valid JSON, or is not a dict with a
    ``"bits"`` key is ignored and the XML is parsed instead.

    Parameters
    ----------
    xml_path : Path
        Path to the CSRML XML definition.
    json_cache : Path or None
        Optional path of the JSON cache file.

    Returns
    -------
    dict
        Spec with the keys ``id``, ``title``, ``csrml_version``, ``n_bits``
        and ``bits`` (each bit has ``id``, ``label``, ``smarts`` and
        ``exception_smarts``) when built from XML; the cached dict otherwise.
    """
    if json_cache and json_cache.exists():
        parser_src = Path(__file__).parent / "_csrml.py"
        parser_mtime = parser_src.stat().st_mtime if parser_src.exists() else 0.0
        cache_mtime = json_cache.stat().st_mtime
        cache_is_fresh = (
            not xml_path.exists() or cache_mtime >= xml_path.stat().st_mtime
        ) and cache_mtime >= parser_mtime
        if cache_is_fresh:
            try:
                with open(json_cache, encoding="utf-8") as f:
                    cached = json.load(f)
            except (OSError, ValueError):
                # Unreadable or corrupt cache (json.JSONDecodeError and
                # UnicodeDecodeError are ValueErrors): rebuild from the XML.
                cached = None
            if isinstance(cached, dict) and "bits" in cached:
                return cached

    # Parse XML
    from pyCSRML._csrml import parse_csrml_xml, ordered_bit_list  # noqa: PLC0415

    parsed = parse_csrml_xml(str(xml_path))
    bits = []
    for bit_id in ordered_bit_list(parsed):
        sg = parsed["subgraph_index"].get(bit_id)
        if sg is None:
            # Listed in the bit order but absent from the index: skip it.
            continue
        bits.append(
            {
                "id": sg["id"],
                "label": sg["label"],
                "smarts": sg["smarts"],
                "exception_smarts": sg["exception_smarts"],
            }
        )

    spec = {
        "id": parsed["id"],
        "title": parsed["title"],
        "csrml_version": parsed["csrml_version"],
        "n_bits": len(bits),
        "bits": bits,
    }

    if json_cache:
        # The cache is only an optimisation: a failure to write it must not
        # fail the load, so filesystem and serialisation errors are dropped.
        try:
            json_cache.parent.mkdir(parents=True, exist_ok=True)
            with open(json_cache, "w", encoding="utf-8") as f:
                json.dump(spec, f, ensure_ascii=False, indent=2)
        except (OSError, TypeError, ValueError):
            pass
    return spec
def _load_spec(source: Path, json_cache: Optional[Path]) -> dict:
    """
    Dispatch on the (case-insensitive) file extension of *source*.

    ``.json`` and ``.yaml`` / ``.yml`` files are loaded directly; every other
    extension is handed to the XML loader, the only one that receives
    *json_cache*.
    """
    ext = source.suffix.lower()
    if ext in (".yaml", ".yml"):
        return _load_from_yaml(source)
    if ext == ".json":
        return _load_from_json(source)
    return _load_from_xml(source, json_cache)
# ---------------------------------------------------------------------------
# Fingerprinter class
# ---------------------------------------------------------------------------
class Fingerprinter:
    """
    Compute binary chemotype fingerprints from a CSRML fingerprint definition.

    The definition file can be in any of these formats:

    * **XML** (``.xml``) — a CSRML XML file (ToxPrint v2 or TxP_PFAS). The
      parser converts the subgraph patterns to SMARTS on the fly. An optional
      JSON cache speeds up subsequent loads.
    * **JSON** (``.json``) — a pre-built spec file (see
      :doc:`../json_yaml_format` for the schema).
    * **YAML** (``.yaml`` / ``.yml``) — same schema as JSON but in YAML
      syntax. Requires ``pyyaml``.

    Parameters
    ----------
    source : str or Path
        Path to the fingerprint definition file (.xml, .json, .yaml, or .yml).
    json_cache : str or Path, optional
        Path to a JSON cache file. Only used when *source* is an XML file.
        If the cache is newer than the XML, it is loaded directly (faster).
    verbose : bool
        If True, emit a single summary warning when one or more primary
        patterns fail to compile.

    Raises
    ------
    ImportError
        If RDKit is not available.
    """

    def __init__(
        self,
        source: Union[str, Path],
        json_cache: Optional[Union[str, Path]] = None,
        verbose: bool = False,
    ):
        if not _HAS_RDKIT:
            raise ImportError("RDKit is required for fingerprint computation.")
        self._source = Path(source)
        self._json_cache = Path(json_cache) if json_cache else None
        self._verbose = verbose
        self._spec = _load_spec(self._source, self._json_cache)
        self._bits = self._spec["bits"]
        self._n_bits = len(self._bits)

        # Pre-compile SMARTS once so fingerprint() only performs matching.
        self._queries: list[tuple] = []  # (main_query, [exc_query, ...])
        self._valid: list[bool] = []
        for bit in self._bits:
            main_q = _compile_smarts(bit["smarts"]) if bit.get("smarts") else None
            # ``or ()`` also covers an explicit null/None value in the spec.
            compiled = (_compile_smarts(s) for s in bit.get("exception_smarts") or ())
            # Exception patterns that fail to compile are dropped.
            exc_qs = [q for q in compiled if q is not None]
            self._queries.append((main_q, exc_qs))
            self._valid.append(main_q is not None)

        n_failed = self._valid.count(False)
        if n_failed > 0 and self._verbose:
            warnings.warn(
                f"{n_failed}/{self._n_bits} patterns failed to compile and will "
                "always produce 0.",
                stacklevel=2,
            )

    # ------------------------------------------------------------------
    @property
    def n_bits(self) -> int:
        """Number of fingerprint bits."""
        return self._n_bits

    @property
    def bit_names(self) -> list[str]:
        """Ordered list of bit labels (one per bit)."""
        return [b["label"] for b in self._bits]

    @property
    def bit_ids(self) -> list[str]:
        """Ordered list of bit IDs (original subgraph IDs)."""
        return [b["id"] for b in self._bits]

    @property
    def title(self) -> str:
        """Title stored in the loaded spec, or ``""`` when it has none."""
        return self._spec.get("title", "")

    # ------------------------------------------------------------------
    def fingerprint(self, mol) -> tuple[np.ndarray, list[str]]:
        """
        Compute the binary fingerprint for a molecule.

        A bit is set when the molecule matches the bit's primary pattern and
        matches none of its compiled exception patterns. Bits whose primary
        pattern failed to compile stay 0.

        Parameters
        ----------
        mol : rdkit.Chem.Mol or None
            An RDKit molecule object (must be pre-sanitized). ``None`` yields
            an all-False fingerprint.

        Returns
        -------
        array : numpy.ndarray of dtype bool
            Binary fingerprint vector of length n_bits.
        names : list[str]
            Corresponding bit labels.
        """
        names = self.bit_names
        arr = np.zeros(self._n_bits, dtype=bool)
        if mol is None:
            return arr, names

        # Add explicit Hs so that SMARTS patterns with [#1] (e.g. CH2 groups
        # in fluorotelomer chains) and [#8;X2] (OH groups) match correctly.
        mol_h = Chem.AddHs(mol)
        for i, (main_q, exc_qs) in enumerate(self._queries):
            if main_q is None or not mol_h.HasSubstructMatch(main_q):
                continue
            # Any exception match anywhere in the molecule vetoes the bit.
            arr[i] = not any(mol_h.HasSubstructMatch(eq) for eq in exc_qs)
        return arr, names

    def fingerprint_smiles(self, smiles: str) -> tuple[np.ndarray, list[str]]:
        """
        Compute fingerprint from a SMILES string.

        Returns all-zeros array if SMILES is invalid.
        """
        mol = Chem.MolFromSmiles(smiles)
        return self.fingerprint(mol)

    def fingerprint_batch(
        self,
        mols=None,
        smiles_list: Optional[list[str]] = None,
    ) -> np.ndarray:
        """
        Compute fingerprints for a list of molecules (or SMILES strings).

        Parameters
        ----------
        mols : iterable of rdkit.Chem.Mol or None, optional
            If None is passed for a molecule, zeros are used. May be omitted
            when *smiles_list* is given.
        smiles_list : list of str, optional
            If provided, mols is ignored and this list of SMILES is used instead.

        Returns
        -------
        matrix : numpy.ndarray of shape (n_mols, n_bits), dtype bool

        Raises
        ------
        TypeError
            If neither *mols* nor *smiles_list* is supplied.
        """
        if smiles_list is not None:
            mols = [Chem.MolFromSmiles(s) for s in smiles_list]
        elif mols is None:
            raise TypeError("fingerprint_batch() requires either 'mols' or 'smiles_list'.")

        rows = [self.fingerprint(m)[0] for m in mols]
        if not rows:
            # np.vstack rejects an empty sequence; keep the column count.
            return np.empty((0, self._n_bits), dtype=bool)
        return np.vstack(rows)