Quick start
Computing fingerprints for a single molecule
from pyCSRML import Fingerprinter, TOXPRINT_PATH
from rdkit import Chem

fp = Fingerprinter(TOXPRINT_PATH)  # loads bundled ToxPrint v2.0 (729 bits)
mol = Chem.MolFromSmiles("c1ccccc1")  # benzene

# fingerprint() returns the bit vector together with one label per bit.
arr, names = fp.fingerprint(mol)
print(f"Bits set: {arr.sum()} / {fp.n_bits}")

# Pair each label with its bit rather than indexing both sequences by position.
on_bits = [name for name, bit in zip(names, arr) if bit]
print(on_bits[:5])
arr is a boolean numpy.ndarray of length fp.n_bits.
names is the list of chemotype labels (e.g. "ring:aro_6_C") in the same order, so names[i] is the label of bit arr[i].
Using the bundled TxP_PFAS definition
from pyCSRML import Fingerprinter, TXPPFAS_PATH
from rdkit import Chem

# PFOA
pfoa_smiles = "FC(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(=O)O"
mol = Chem.MolFromSmiles(pfoa_smiles)

fp = Fingerprinter(TXPPFAS_PATH)  # loads bundled TxP_PFAS v1.0.4 (129 bits)
arr, names = fp.fingerprint(mol)
print(f"Bits set: {arr.sum()} / {fp.n_bits}")
Batch processing
fingerprint_batch() processes a list of molecules
and returns a 2-D boolean NumPy matrix of shape (n_molecules, n_bits).
from pyCSRML import Fingerprinter, TXPPFAS_PATH
from rdkit import Chem

fp = Fingerprinter(TXPPFAS_PATH)

# Input structures, one SMILES string per molecule.
pfoa = "FC(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(=O)O"
pfos = "FC(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)S(=O)(=O)O"
ethanol = "CCO"  # negative control
smiles = [pfoa, pfos, ethanol]

# Parse every SMILES string into an RDKit molecule.
mols = list(map(Chem.MolFromSmiles, smiles))

matrix = fp.fingerprint_batch(mols)  # shape (3, 129), dtype bool
print(matrix.shape, matrix.dtype)
Using a custom CSRML XML file
from pyCSRML import Fingerprinter
from rdkit import Chem  # needed for Chem.MolFromSmiles below

# Point the fingerprinter at your own CSRML XML file instead of a bundled definition.
fp = Fingerprinter("path/to/my_fingerprints.xml")
mol = Chem.MolFromSmiles("CCO")
arr, names = fp.fingerprint(mol)