Skip to content

Configuration API

The configuration system provides type-safe configuration classes for data generation.

Configuration Schemas

Molecular Configuration

synthbiodata.config.schema.v1.molecular.MolecularConfig

Bases: BaseConfig

Configuration schema for molecular descriptor data.

This class defines the configuration options for generating synthetic molecular descriptor data, including ranges and distributions for molecular weight (MW), LogP, and TPSA, as well as target protein family probabilities.

Parameters

- mw_mean (float): Mean molecular weight of generated molecules.
- mw_std (float): Standard deviation of molecular weight.
- mw_min (float): Minimum allowed molecular weight.
- mw_max (float): Maximum allowed molecular weight.
- logp_mean (float): Mean LogP (octanol-water partition coefficient) value.
- logp_std (float): Standard deviation of LogP.
- logp_min (float): Minimum allowed LogP value.
- logp_max (float): Maximum allowed LogP value.
- tpsa_mean (float): Mean topological polar surface area (TPSA) value.
- tpsa_std (float): Standard deviation of TPSA.
- tpsa_min (float): Minimum allowed TPSA value.
- tpsa_max (float): Maximum allowed TPSA value.
- target_families (list of str): List of target protein families to sample from.
- target_family_probs (list of float): Probability distribution for selecting each target family.

Examples

```python
>>> config = MolecularConfig()
>>> config.mw_mean
350.0
>>> config.target_families
['GPCR', 'Kinase', 'Protease', 'Ion Channel', 'Nuclear Receptor']
```

Source code in src/synthbiodata/config/schema/v1/molecular.py
class MolecularConfig(BaseConfig):
    """
    Configuration schema for molecular descriptor data.

    This class defines the configuration options for generating synthetic molecular
    descriptor data, including ranges and distributions for molecular weight (MW),
    LogP, and TPSA, as well as target protein family probabilities.

    Parameters
    ----------
    mw_mean : float
        Mean molecular weight of generated molecules.
    mw_std : float
        Standard deviation of molecular weight.
    mw_min : float
        Minimum allowed molecular weight.
    mw_max : float
        Maximum allowed molecular weight.
    logp_mean : float
        Mean LogP (octanol-water partition coefficient) value.
    logp_std : float
        Standard deviation of LogP.
    logp_min : float
        Minimum allowed LogP value.
    logp_max : float
        Maximum allowed LogP value.
    tpsa_mean : float
        Mean topological polar surface area (TPSA) value.
    tpsa_std : float
        Standard deviation of TPSA.
    tpsa_min : float
        Minimum allowed TPSA value.
    tpsa_max : float
        Maximum allowed TPSA value.
    target_families : list of str
        List of target protein families to sample from.
    target_family_probs : list of float
        Probability distribution for selecting each target family.

    Raises
    ------
    RangeError
        If any ``*_min`` is not strictly below its ``*_max``, or any ``*_std``
        is not strictly positive.
    DistributionError
        If ``target_families`` and ``target_family_probs`` differ in length,
        if the probabilities do not sum to 1.0 (tolerance 1e-6), or if any
        single probability lies outside ``[0, 1]``.

    Examples
    --------
    >>> config = MolecularConfig()
    >>> config.mw_mean
    350.0
    >>> config.target_families
    ['GPCR', 'Kinase', 'Protease', 'Ion Channel', 'Nuclear Receptor']
    """
    # Molecular descriptor ranges
    mw_mean: float = Field(
        MOLECULAR_DEFAULTS["MW_MEAN"],
        description="Mean molecular weight",
    )
    mw_std: float = Field(
        MOLECULAR_DEFAULTS["MW_STD"],
        description="Standard deviation of molecular weight",
    )
    mw_min: float = Field(
        MOLECULAR_DEFAULTS["MW_MIN"],
        description="Minimum molecular weight",
    )
    mw_max: float = Field(
        MOLECULAR_DEFAULTS["MW_MAX"],
        description="Maximum molecular weight",
    )

    logp_mean: float = Field(
        MOLECULAR_DEFAULTS["LOGP_MEAN"],
        description="Mean LogP value",
    )
    logp_std: float = Field(
        MOLECULAR_DEFAULTS["LOGP_STD"],
        description="Standard deviation of LogP",
    )
    logp_min: float = Field(
        MOLECULAR_DEFAULTS["LOGP_MIN"],
        description="Minimum LogP value",
    )
    logp_max: float = Field(
        MOLECULAR_DEFAULTS["LOGP_MAX"],
        description="Maximum LogP value",
    )

    tpsa_mean: float = Field(
        MOLECULAR_DEFAULTS["TPSA_MEAN"],
        description="Mean TPSA value",
    )
    tpsa_std: float = Field(
        MOLECULAR_DEFAULTS["TPSA_STD"],
        description="Standard deviation of TPSA",
    )
    tpsa_min: float = Field(
        MOLECULAR_DEFAULTS["TPSA_MIN"],
        description="Minimum TPSA value",
    )
    tpsa_max: float = Field(
        MOLECULAR_DEFAULTS["TPSA_MAX"],
        description="Maximum TPSA value",
    )

    # Target protein families
    target_families: list[str] = Field(
        default=TARGET_FAMILIES,
        description="List of target protein families"
    )
    target_family_probs: list[float] = Field(
        default=TARGET_FAMILY_PROBS,
        description="Probability distribution for target families"
    )

    @model_validator(mode='after')
    def validate_ranges(self) -> 'MolecularConfig':
        """Validate descriptor ranges, standard deviations and target probabilities.

        Checks run in this order and stop at the first failure: min/max
        ranges, standard deviations, list-length match, probability sum,
        then each individual probability.

        Returns
        -------
        MolecularConfig
            The validated instance itself.

        Raises
        ------
        RangeError
            If a ``*_min`` is greater than or equal to its ``*_max``, or a
            ``*_std`` is less than or equal to zero.
        DistributionError
            If the family and probability lists differ in length, the
            probabilities do not sum to 1.0 within 1e-6, or any probability
            is outside ``[0, 1]``.
        """
        descriptors = ('mw', 'logp', 'tpsa')

        # Validate min/max ranges
        for param in descriptors:
            min_val = getattr(self, f"{param}_min")
            max_val = getattr(self, f"{param}_max")
            if min_val >= max_val:
                logger.error(f"Invalid {param} range: min={min_val}, max={max_val}")
                raise RangeError(f"{param}_min", min_val, max_val=max_val)

        # Validate standard deviations
        for param in descriptors:
            std_val = getattr(self, f"{param}_std")
            if std_val <= 0:
                logger.error(f"Invalid {param} standard deviation: {std_val}")
                raise RangeError(f"{param}_std", std_val, min_val=0)

        # Validate target distributions
        if len(self.target_families) != len(self.target_family_probs):
            logger.error(
                f"Mismatched lengths: target_families={len(self.target_families)}, "
                f"target_family_probs={len(self.target_family_probs)}"
            )
            raise DistributionError(
                f"Length mismatch: target_families ({len(self.target_families)}) "
                f"!= target_family_probs ({len(self.target_family_probs)})"
            )

        prob_sum = sum(self.target_family_probs)
        if abs(prob_sum - 1.0) > 1e-6:
            logger.error(f"Target family probabilities sum to {prob_sum}, should be 1.0")
            raise DistributionError(
                f"Target family probabilities must sum to 1.0, got {prob_sum}"
            )

        # The sum check alone accepts lists such as [1.5, -0.5], and a NaN
        # entry slips through it because every comparison with NaN is False.
        # Written as "not (0 <= p <= 1)" so NaN is rejected as well.
        for family, prob in zip(self.target_families, self.target_family_probs):
            if not 0.0 <= prob <= 1.0:
                logger.error(
                    f"Invalid probability for target family {family!r}: {prob}"
                )
                raise DistributionError(
                    f"Target family probabilities must each be in [0, 1], "
                    f"got {prob} for {family!r}"
                )

        logger.debug("Validated molecular descriptor ranges successfully")
        return self

Functions

validate_ranges() -> MolecularConfig

Validate molecular descriptor ranges and standard deviations.

Source code in src/synthbiodata/config/schema/v1/molecular.py
@model_validator(mode='after')
def validate_ranges(self) -> 'MolecularConfig':
    """Validate descriptor ranges, standard deviations and target probabilities.

    Checks run in this order and stop at the first failure: min/max
    ranges, standard deviations, list-length match, probability sum,
    then each individual probability.

    Returns
    -------
    MolecularConfig
        The validated instance itself.

    Raises
    ------
    RangeError
        If a ``*_min`` is greater than or equal to its ``*_max``, or a
        ``*_std`` is less than or equal to zero.
    DistributionError
        If the family and probability lists differ in length, the
        probabilities do not sum to 1.0 within 1e-6, or any probability
        is outside ``[0, 1]``.
    """
    descriptors = ('mw', 'logp', 'tpsa')

    # Validate min/max ranges
    for param in descriptors:
        min_val = getattr(self, f"{param}_min")
        max_val = getattr(self, f"{param}_max")
        if min_val >= max_val:
            logger.error(f"Invalid {param} range: min={min_val}, max={max_val}")
            raise RangeError(f"{param}_min", min_val, max_val=max_val)

    # Validate standard deviations
    for param in descriptors:
        std_val = getattr(self, f"{param}_std")
        if std_val <= 0:
            logger.error(f"Invalid {param} standard deviation: {std_val}")
            raise RangeError(f"{param}_std", std_val, min_val=0)

    # Validate target distributions
    if len(self.target_families) != len(self.target_family_probs):
        logger.error(
            f"Mismatched lengths: target_families={len(self.target_families)}, "
            f"target_family_probs={len(self.target_family_probs)}"
        )
        raise DistributionError(
            f"Length mismatch: target_families ({len(self.target_families)}) "
            f"!= target_family_probs ({len(self.target_family_probs)})"
        )

    prob_sum = sum(self.target_family_probs)
    if abs(prob_sum - 1.0) > 1e-6:
        logger.error(f"Target family probabilities sum to {prob_sum}, should be 1.0")
        raise DistributionError(
            f"Target family probabilities must sum to 1.0, got {prob_sum}"
        )

    # The sum check alone accepts lists such as [1.5, -0.5], and a NaN
    # entry slips through it because every comparison with NaN is False.
    # Written as "not (0 <= p <= 1)" so NaN is rejected as well.
    for family, prob in zip(self.target_families, self.target_family_probs):
        if not 0.0 <= prob <= 1.0:
            logger.error(
                f"Invalid probability for target family {family!r}: {prob}"
            )
            raise DistributionError(
                f"Target family probabilities must each be in [0, 1], "
                f"got {prob} for {family!r}"
            )

    logger.debug("Validated molecular descriptor ranges successfully")
    return self

ADME Configuration

synthbiodata.config.schema.v1.adme.ADMEConfig

Bases: BaseConfig

Configuration for ADME data generation.

Source code in src/synthbiodata/config/schema/v1/adme.py
class ADMEConfig(BaseConfig):
    """Settings that drive ADME data generation.

    Fields are grouped by absorption, distribution, metabolism and
    excretion. Every default is read from ``ADME_DEFAULTS``.
    """

    # Absorption
    absorption_mean: float = Field(
        ADME_DEFAULTS["ABSORPTION_MEAN"],
        description="Mean absorption percentage",
    )
    absorption_std: float = Field(
        ADME_DEFAULTS["ABSORPTION_STD"],
        description="Standard deviation of absorption",
    )

    # Distribution
    plasma_protein_binding_mean: float = Field(
        ADME_DEFAULTS["PROTEIN_BINDING_MEAN"],
        description="Mean plasma protein binding percentage",
    )
    plasma_protein_binding_std: float = Field(
        ADME_DEFAULTS["PROTEIN_BINDING_STD"],
        description="Standard deviation of plasma protein binding",
    )

    # Metabolism
    clearance_mean: float = Field(
        ADME_DEFAULTS["CLEARANCE_MEAN"],
        description="Mean clearance rate (L/h)",
    )
    clearance_std: float = Field(
        ADME_DEFAULTS["CLEARANCE_STD"],
        description="Standard deviation of clearance",
    )
    half_life_mean: float = Field(
        ADME_DEFAULTS["HALF_LIFE_MEAN"],
        description="Mean half-life (hours)",
    )
    half_life_std: float = Field(
        ADME_DEFAULTS["HALF_LIFE_STD"],
        description="Standard deviation of half-life",
    )

    # Excretion
    renal_clearance_ratio: float = Field(
        ADME_DEFAULTS["RENAL_CLEARANCE_RATIO"],
        description="Ratio of renal to total clearance",
    )

    @model_validator(mode='after')
    def validate_parameters(self) -> 'ADMEConfig':
        """Check every ADME field against its allowed range.

        Rules are applied in a fixed order and the first violation raises
        ``RangeError``: percentage means (0 to 100 inclusive), strictly
        positive means, ratios (0 to 1 inclusive), then strictly positive
        standard deviations. Returns the instance when all checks pass.
        """
        # Each rule: (field names, predicate flagging a bad value,
        # bounds forwarded to RangeError as keyword arguments).
        rules = (
            (
                ('absorption_mean', 'plasma_protein_binding_mean'),
                lambda v: v < 0 or v > 100,
                {'min_val': 0, 'max_val': 100},
            ),
            (
                ('clearance_mean', 'half_life_mean'),
                lambda v: v <= 0,
                {'min_val': 0},
            ),
            (
                ('renal_clearance_ratio',),
                lambda v: v < 0 or v > 1,
                {'min_val': 0, 'max_val': 1},
            ),
            (
                (
                    'absorption_std',
                    'plasma_protein_binding_std',
                    'clearance_std',
                    'half_life_std',
                ),
                lambda v: v <= 0,
                {'min_val': 0},
            ),
        )

        for fields, out_of_range, bounds in rules:
            for param in fields:
                value = getattr(self, param)
                if out_of_range(value):
                    logger.error(f"Invalid {param}: {value}")
                    raise RangeError(param, value, **bounds)

        logger.debug("Validated ADME parameters successfully")
        return self

Functions

validate_parameters() -> ADMEConfig

Validate ADME parameters and standard deviations.

Source code in src/synthbiodata/config/schema/v1/adme.py
@model_validator(mode='after')
def validate_parameters(self) -> 'ADMEConfig':
    """Check every ADME field against its allowed range.

    Rules are applied in a fixed order and the first violation raises
    ``RangeError``: percentage means (0 to 100 inclusive), strictly
    positive means, ratios (0 to 1 inclusive), then strictly positive
    standard deviations. Returns the instance when all checks pass.
    """
    # Each rule: (field names, predicate flagging a bad value,
    # bounds forwarded to RangeError as keyword arguments).
    rules = (
        (
            ('absorption_mean', 'plasma_protein_binding_mean'),
            lambda v: v < 0 or v > 100,
            {'min_val': 0, 'max_val': 100},
        ),
        (
            ('clearance_mean', 'half_life_mean'),
            lambda v: v <= 0,
            {'min_val': 0},
        ),
        (
            ('renal_clearance_ratio',),
            lambda v: v < 0 or v > 1,
            {'min_val': 0, 'max_val': 1},
        ),
        (
            (
                'absorption_std',
                'plasma_protein_binding_std',
                'clearance_std',
                'half_life_std',
            ),
            lambda v: v <= 0,
            {'min_val': 0},
        ),
    )

    for fields, out_of_range, bounds in rules:
        for param in fields:
            value = getattr(self, param)
            if out_of_range(value):
                logger.error(f"Invalid {param}: {value}")
                raise RangeError(param, value, **bounds)

    logger.debug("Validated ADME parameters successfully")
    return self

Base Configuration

synthbiodata.config.base.BaseConfig

Bases: BaseModel

Base configuration for all data types.

Source code in src/synthbiodata/config/base.py
class BaseConfig(BaseModel):
    """Base configuration for all data types.

    Holds the dataset-level options inherited by the data-type-specific
    schemas. Defaults are read from ``DATASET_DEFAULTS``.

    Parameters
    ----------
    schema_version : Literal["1.0"]
        Schema version tag; only ``"1.0"`` is accepted.
    n_samples : int
        Number of samples to generate.
    positive_ratio : float
        Ratio of positive samples; must lie strictly between 0 and 1.
    test_size : float
        Test set size ratio; must be non-negative.
    val_size : float
        Validation set size ratio; must be non-negative.
    random_state : int
        Random seed for reproducibility.
    imbalanced : bool
        Whether to generate an imbalanced dataset.

    Raises
    ------
    RangeError
        If ``test_size + val_size`` is 1 or more, if either size is
        negative, or if ``positive_ratio`` is not strictly inside (0, 1).
    """
    schema_version: Literal["1.0"] = "1.0"

    n_samples: int = Field(
        DATASET_DEFAULTS["DEFAULT_SAMPLES"],
        description="Number of samples to generate",
    )
    positive_ratio: float = Field(
        DATASET_DEFAULTS["IMBALANCED_RATIO"],
        description="Ratio of positive samples",
    )
    test_size: float = Field(
        DATASET_DEFAULTS["TEST_SIZE"],
        description="Test set size ratio",
    )
    val_size: float = Field(
        DATASET_DEFAULTS["VAL_SIZE"],
        description="Validation set size ratio",
    )
    random_state: int = Field(
        DATASET_DEFAULTS["RANDOM_SEED"],
        description="Random seed for reproducibility",
    )
    imbalanced: bool = Field(False, description="Whether to generate imbalanced dataset")

    @model_validator(mode='after')
    def validate_splits(self) -> 'BaseConfig':
        """Validate dataset split ratios and the positive-class ratio.

        Returns
        -------
        BaseConfig
            The validated instance itself.

        Raises
        ------
        RangeError
            If ``test_size + val_size >= 1``, if ``test_size`` or
            ``val_size`` is negative, or if ``positive_ratio`` is not
            strictly between 0 and 1.
        """
        total_split = self.test_size + self.val_size
        if total_split >= 1:
            logger.error(f"Invalid split ratios: test_size={self.test_size}, val_size={self.val_size}, total={total_split}")
            raise RangeError("total split ratio", total_split, max_val=1.0)

        # The total alone does not bound each part: test_size=-0.5 with
        # val_size=0.2 sums to -0.3 and would otherwise be accepted.
        for name in ('test_size', 'val_size'):
            size = getattr(self, name)
            if size < 0:
                logger.error(f"Invalid {name}: {size}")
                raise RangeError(name, size, min_val=0.0)

        if self.positive_ratio <= 0 or self.positive_ratio >= 1:
            logger.error(f"Invalid positive ratio: {self.positive_ratio}")
            raise RangeError("positive_ratio", self.positive_ratio, min_val=0.0, max_val=1.0)

        logger.debug("Validated dataset split ratios successfully")
        return self

Functions

validate_splits() -> BaseConfig

Validate dataset split ratios.

Source code in src/synthbiodata/config/base.py
@model_validator(mode='after')
def validate_splits(self) -> 'BaseConfig':
    """Validate dataset split ratios and the positive-class ratio.

    Returns
    -------
    BaseConfig
        The validated instance itself.

    Raises
    ------
    RangeError
        If ``test_size + val_size >= 1``, if ``test_size`` or
        ``val_size`` is negative, or if ``positive_ratio`` is not
        strictly between 0 and 1.
    """
    total_split = self.test_size + self.val_size
    if total_split >= 1:
        logger.error(f"Invalid split ratios: test_size={self.test_size}, val_size={self.val_size}, total={total_split}")
        raise RangeError("total split ratio", total_split, max_val=1.0)

    # The total alone does not bound each part: test_size=-0.5 with
    # val_size=0.2 sums to -0.3 and would otherwise be accepted.
    for name in ('test_size', 'val_size'):
        size = getattr(self, name)
        if size < 0:
            logger.error(f"Invalid {name}: {size}")
            raise RangeError(name, size, min_val=0.0)

    if self.positive_ratio <= 0 or self.positive_ratio >= 1:
        logger.error(f"Invalid positive ratio: {self.positive_ratio}")
        raise RangeError("positive_ratio", self.positive_ratio, min_val=0.0, max_val=1.0)

    logger.debug("Validated dataset split ratios successfully")
    return self