Vision AutoML¶

FastAPI app init for the vision AutoML service.

`AudioClassificationTask` ¶

Bases: BaseModel

Configuration for audio classification tasks.

audio_dir is the root directory containing audio files. labels_file is a CSV with audio_path and label columns.

Source code in app/vision_automl/models.py

class AudioClassificationTask(BaseModel):
    """Configuration for audio classification tasks.

    ``audio_dir`` is the root directory containing audio files.
    ``labels_file`` is a CSV with ``audio_path`` and ``label`` columns.

    Both path fields are declared without defaults, so they must be
    supplied when the model is constructed.
    """

    # The Literal type restricts this field to the single value below.
    task_type: Literal["audio_classification"] = "audio_classification"
    audio_dir: Path  # root directory of the audio files
    labels_file: Path  # CSV mapping audio_path -> label

    class Config:
        arbitrary_types_allowed = True

`CausalLMTask` ¶

Bases: TextTask

Configuration for causal language modelling tasks.

CSV must have a text column.

Source code in app/vision_automl/models.py

class CausalLMTask(TextTask):
    """Configuration for causal language modelling tasks.

    CSV must have a ``text`` column.

    The CSV itself is referenced by the ``data_file`` field inherited from
    ``TextTask``.
    """

    # The Literal type restricts this field to the single value below.
    task_type: Literal["causal_lm"] = "causal_lm"

`ImageClassificationTask` ¶

Bases: ImageTask

Configuration for single-label image classification tasks.

Source code in app/vision_automl/models.py

class ImageClassificationTask(ImageTask):
    """Configuration for single-label image classification tasks.

    Adds only the ``task_type`` marker; every other field (directories,
    label format, labels file) is inherited unchanged from ``ImageTask``.
    """

    # The Literal type restricts this field to the single value below.
    task_type: Literal["image_classification"] = "image_classification"

`ImageMultiLabelClassificationTask` ¶

Bases: ImageTask

Configuration for multi-label image classification tasks.

Source code in app/vision_automl/models.py

class ImageMultiLabelClassificationTask(ImageTask):
    """Configuration for multi-label image classification tasks.

    Overrides the inherited ``label_format`` so that only ``"csv"`` or
    ``"json"`` are accepted; the ``"folder"`` layout allowed by ``ImageTask``
    is not valid for this task.
    """

    # Pinned with Literal, like the sibling task models, so that only the
    # matching task identifier validates for this configuration.
    task_type: Literal["image_multilabel_classification"] = (
        "image_multilabel_classification"
    )
    # Optional field: falls back to "csv" when not supplied.
    label_format: Literal["csv", "json"] = "csv"

`ImageRegressionTask` ¶

Bases: ImageTask

Configuration for image regression tasks (predict numeric values).

Source code in app/vision_automl/models.py

class ImageRegressionTask(ImageTask):
    """Configuration for image regression tasks (predict numeric values).

    Overrides the inherited ``label_format`` so that only ``"csv"`` is
    accepted.
    """

    # Pinned with Literal, like the sibling task models, so that only the
    # matching task identifier validates for this configuration.
    task_type: Literal["image_regression"] = "image_regression"
    label_format: Literal["csv"] = "csv"  # regression needs exact values

`ImageSegmentationTask` ¶

Bases: ImageTask

Configuration for semantic/panoptic image segmentation tasks.

Source code in app/vision_automl/models.py

class ImageSegmentationTask(ImageTask):
    """Configuration for semantic/panoptic image segmentation tasks.

    Adds only the ``task_type`` marker; all other fields are inherited
    unchanged from ``ImageTask``.
    """

    # The Literal type restricts this field to the single value below.
    task_type: Literal["image_segmentation"] = "image_segmentation"

`ImageTask` ¶

Bases: BaseModel

Base Pydantic model describing common image task inputs.

Source code in app/vision_automl/models.py

class ImageTask(BaseModel):
    """Base Pydantic model describing common image task inputs.

    Only ``train_dir`` is required; the remaining fields have defaults.
    Subclasses add a ``task_type`` marker and may narrow ``label_format``.
    """

    train_dir: Path  # required: no default is provided
    test_dir: Path | None = None  # optional
    # Defaults to the "folder" layout; "csv" is the other accepted value here.
    label_format: Literal["folder", "csv"] = "folder"
    labels_file: Path | None = None  # used if label_format != 'folder'

    class Config:
        arbitrary_types_allowed = True

`KeypointDetectionTask` ¶

Bases: ImageTask

Configuration for keypoint detection tasks.

The labels CSV must include a keypoints column with JSON-encoded [x, y, visibility] lists.

Source code in app/vision_automl/models.py

class KeypointDetectionTask(ImageTask):
    """Configuration for keypoint detection tasks.

    The labels CSV must include a ``keypoints`` column with JSON-encoded
    ``[x, y, visibility]`` lists.

    Note that ``label_format`` is not overridden here, so it keeps the
    ``"folder"`` default inherited from ``ImageTask``.
    """

    # The Literal type restricts this field to the single value below.
    task_type: Literal["keypoint_detection"] = "keypoint_detection"

`MaskedLMTask` ¶

Bases: TextTask

Configuration for masked language modelling tasks.

CSV must have a text column.

Source code in app/vision_automl/models.py

class MaskedLMTask(TextTask):
    """Configuration for masked language modelling tasks.

    CSV must have a ``text`` column.

    The CSV itself is referenced by the ``data_file`` field inherited from
    ``TextTask``.
    """

    # The Literal type restricts this field to the single value below.
    task_type: Literal["masked_lm"] = "masked_lm"

`ObjectDetectionTask` ¶

Bases: ImageTask

Configuration for object detection tasks.

The labels CSV must include boxes and class_labels columns (JSON-encoded lists per row).

Source code in app/vision_automl/models.py

class ObjectDetectionTask(ImageTask):
    """Configuration for object detection tasks.

    The labels CSV must include ``boxes`` and ``class_labels`` columns
    (JSON-encoded lists per row).

    ``label_format`` is narrowed to ``"csv"`` only, replacing the
    ``"folder"``/``"csv"`` choice declared on ``ImageTask``.
    """

    # The Literal type restricts this field to the single value below.
    task_type: Literal["object_detection"] = "object_detection"
    label_format: Literal["csv"] = "csv"  # only CSV labels are accepted

`QuestionAnsweringTask` ¶

Bases: TextTask

Configuration for extractive question answering tasks.

CSV must have question, context, answer_start, and answer_text columns.

Source code in app/vision_automl/models.py

class QuestionAnsweringTask(TextTask):
    """Configuration for extractive question answering tasks.

    CSV must have ``question``, ``context``, ``answer_start``, and
    ``answer_text`` columns.

    The CSV itself is referenced by the ``data_file`` field inherited from
    ``TextTask``.
    """

    # The Literal type restricts this field to the single value below.
    task_type: Literal["question_answering"] = "question_answering"

`Seq2SeqLMTask` ¶

Bases: TextTask

Configuration for sequence-to-sequence tasks.

CSV must have input_text and target_text columns.

Source code in app/vision_automl/models.py

class Seq2SeqLMTask(TextTask):
    """Configuration for sequence-to-sequence tasks.

    CSV must have ``input_text`` and ``target_text`` columns.

    The CSV itself is referenced by the ``data_file`` field inherited from
    ``TextTask``.
    """

    # The Literal type restricts this field to the single value below.
    task_type: Literal["seq2seq_lm"] = "seq2seq_lm"

`SequenceClassificationTask` ¶

Bases: TextTask

Configuration for text sequence classification tasks.

CSV must have text and label columns.

Source code in app/vision_automl/models.py

class SequenceClassificationTask(TextTask):
    """Configuration for text sequence classification tasks.

    CSV must have ``text`` and ``label`` columns.

    The CSV itself is referenced by the ``data_file`` field inherited from
    ``TextTask``.
    """

    # NOTE: the identifier is "text_classification", which differs from the
    # class name; the Literal type restricts the field to that single value.
    task_type: Literal["text_classification"] = "text_classification"

`TextTask` ¶

Bases: BaseModel

Base Pydantic model for text-based tasks.

Source code in app/vision_automl/models.py

class TextTask(BaseModel):
    """Base Pydantic model for text-based tasks.

    Declares the single required ``data_file`` field; subclasses add a
    ``task_type`` marker and document which CSV columns they expect.
    """

    data_file: Path  # CSV with the required columns for the task type

    class Config:
        arbitrary_types_allowed = True

`VideoClassificationTask` ¶

Bases: ImageTask

Configuration for video classification tasks.

The labels CSV must include a video_path column pointing to video files relative to train_dir.

Source code in app/vision_automl/models.py

class VideoClassificationTask(ImageTask):
    """Configuration for video classification tasks.

    The labels CSV must include a ``video_path`` column pointing to video
    files relative to ``train_dir``.

    ``label_format`` is narrowed to ``"csv"`` only, replacing the
    ``"folder"``/``"csv"`` choice declared on ``ImageTask``.
    """

    # The Literal type restricts this field to the single value below.
    task_type: Literal["video_classification"] = "video_classification"
    label_format: Literal["csv"] = "csv"  # only CSV labels are accepted

Route definitions for the vision AutoML service.

`find_best_model_for_multimodal_vision(request, user_id, dataset_id, dataset_version=None, filename_column='filename', label_column='label', exclude_columns=None, time_budget=60, model_size='small', dataset_split=None)` `async` ¶

Fetch a multimodal vision dataset from AutoDW, run AutoML training using both image data and auxiliary tabular metadata columns, and upload the best model.

Auxiliary columns are auto-detected from the CSV: all columns except filename_column, label_column, and any columns listed in exclude_columns are used as tabular features. Numeric columns are standard-scaled and categorical columns are ordinal-encoded.

Steps:

1. Fetch dataset metadata from AutoDW.
2. Resolve the correct download URL (respecting splits if present).
3. Download the dataset ZIP to a temporary directory and extract it.
4. Auto-discover auxiliary columns and validate dataset structure.
5. Train a multimodal vision AutoML model within the given time budget.
6. Zip the model artifacts.
7. Upload the model and leaderboard back to AutoDW.

Returns:

Type	Description
`JSONResponse`	200 – success message and leaderboard summary.
`JSONResponse`	400 – validation error (bad inputs or unsupported dataset).
`JSONResponse`	502 – AutoDW communication failure.
`JSONResponse`	500 – unexpected runtime error.

Source code in app/vision_automl/router.py

@router.post("/multimodal_best_model/")
async def find_best_model_for_multimodal_vision(
    request: Request,
    user_id: Annotated[str, Form(..., description="User id from AutoDW")],
    dataset_id: Annotated[str, Form(..., description="Dataset id from AutoDW")],
    dataset_version: Annotated[
        str | None, Form(description="Optional dataset version")
    ] = None,
    filename_column: Annotated[
        str, Form(..., description="Filename column in labels.csv")
    ] = "filename",
    label_column: Annotated[
        str, Form(..., description="Label column in labels.csv")
    ] = "label",
    exclude_columns: Annotated[
        str | None,
        Form(
            description=(
                "Comma-separated list of CSV columns to exclude from auxiliary features "
                "(in addition to filename_column and label_column)."
            )
        ),
    ] = None,
    time_budget: Annotated[int, Form(..., description="Time budget in seconds")] = 60,
    model_size: Annotated[
        str, Form(..., description="Model size: small / medium / large")
    ] = "small",
    dataset_split: Annotated[
        str | None,
        Form(description="Dataset split to use for training (e.g., 'train')."),
    ] = None,
) -> JSONResponse:
    """
    Fetch a multimodal vision dataset from AutoDW, run AutoML training
    using both image data and auxiliary tabular metadata columns, and
    upload the best model.

    Auxiliary columns are auto-detected from the CSV: all columns except
    ``filename_column``, ``label_column``, and any columns listed in
    ``exclude_columns`` are used as tabular features.  Numeric columns
    are standard-scaled and categorical columns are ordinal-encoded.

    Steps:
      1. Fetch dataset metadata from AutoDW.
      2. Resolve the correct download URL (respecting splits if present).
      3. Download the dataset ZIP to a temporary directory and extract it.
      4. Auto-discover auxiliary columns and validate dataset structure.
      5. Train a multimodal vision AutoML model within the given time budget.
      6. Zip the model artifacts.
      7. Upload the model and leaderboard back to AutoDW.

    Args:
        request: Incoming request; only its ``X-Task-ID`` header is read and
            forwarded to the upload call.
        user_id: User id from AutoDW.
        dataset_id: Dataset id from AutoDW.
        dataset_version: Optional dataset version.
        filename_column: Filename column in labels.csv.
        label_column: Label column in labels.csv.
        exclude_columns: Comma-separated list of CSV columns to exclude from
            the auxiliary features; blank entries are dropped.
        time_budget: Time budget in seconds.
        model_size: Model size: small / medium / large.
        dataset_split: Dataset split to use for training (e.g. ``'train'``).

    Returns:
        200 – success message and leaderboard summary.
        400 – validation error (bad inputs or unsupported dataset).
        502 – AutoDW communication failure.
        500 – unexpected runtime error.

        If the model upload itself is rejected, the status code returned by
        the upload response is relayed as-is.
    """
    # The AutoDW base URL is read from the environment on every request.
    autodw_base = os.getenv("AUTODW_URL", "http://localhost:8000")
    upload_url = f"{autodw_base}/ai-models/upload/single/{user_id}"

    try:
        # 1. Metadata
        metadata = fetch_dataset_metadata(
            autodw_base, user_id, dataset_id, dataset_version
        )

        if metadata.get("file_type") != "zip":
            return JSONResponse(
                status_code=400,
                content={"error": "Vision AutoML requires a ZIP dataset."},
            )

        # 2. Download URL
        download_url = resolve_download_url(
            autodw_base, user_id, dataset_id, dataset_version, metadata, dataset_split
        )

        with dataset_workspace(f"multimodal_{dataset_id}") as workdir:
            # 3. Download & extract
            zip_path = download_dataset(
                download_url, workdir, metadata.get("original_filename", "dataset.zip")
            )
            csv_path, images_dir = extract_and_locate_dataset(zip_path, workdir)

            # Split the comma-separated form value into trimmed, non-empty
            # column names; None means "no extra exclusions".
            exclude_cols = (
                [c.strip() for c in exclude_columns.split(",") if c.strip()]
                if exclude_columns
                else None
            )

            # 4. Validate and discover auxiliary columns
            validation_error, auxiliary_columns = validate_multimodal_inputs(
                csv_path, images_dir, filename_column, label_column, exclude_cols
            )
            if validation_error:
                return JSONResponse(
                    status_code=400, content={"error": validation_error}
                )

            # 5. Train
            optuna_result = await train_automl_multimodal(
                csv_path,
                images_dir,
                filename_column,
                label_column,
                auxiliary_columns,
                time_budget,
                model_size,
                workdir=workdir,
            )

            # 6. Serialize (zip_path is rebound from the dataset archive to
            # the model archive here)
            zip_path = serialize_and_zip_model(workdir)
            leaderboard_json, leaderboard_str = convert_leaderboard_safely(
                optuna_result
            )

            # 7. Upload; the generated model id is discarded
            task_type = "image_classification_multimodal"
            _, payload = build_upload_payload(
                dataset_id, dataset_version, metadata, task_type, leaderboard_json
            )
            upload_resp = upload_model(
                upload_url, zip_path, payload, request.headers.get("X-Task-ID")
            )

            if upload_resp.status_code >= 400:
                logger.error("Model upload failed: %s", upload_resp.text)
                return JSONResponse(
                    status_code=upload_resp.status_code,
                    content={"error": f"Failed to upload model: {upload_resp.text}"},
                )

        # The workspace context has exited; only values computed inside it
        # (leaderboard_str, auxiliary_columns) are used from here on.
        logger.info(
            "Multimodal vision AutoML training completed and model uploaded successfully."
        )
        return JSONResponse(
            status_code=200,
            content={
                "message": "Multimodal vision AutoML training completed successfully and model uploaded to AutoDW",
                "leaderboard": leaderboard_str,
                "auxiliary_columns": auxiliary_columns,
            },
        )

    # Map the known failure classes onto the documented status codes.
    except AutoMLValidationError as e:
        return JSONResponse(status_code=400, content={"error": str(e)})
    except AutoDWDownloadError as e:
        return JSONResponse(status_code=502, content={"error": f"AutoDW error: {e}"})
    except Exception as e:
        logger.exception("Unexpected error during multimodal vision AutoML")
        return JSONResponse(status_code=500, content={"error": str(e)})

`find_best_model_for_vision(request, user_id, dataset_id, dataset_version='v1', filename_column='filename', label_column='label', task_type='image_classification', time_budget=60, model_size='small', dataset_split=None)` `async` ¶

Fetch a vision dataset from AutoDW, run AutoML training, and upload the best model.

Steps:

1. Fetch dataset metadata from AutoDW.
2. Resolve the correct download URL (respecting splits if present).
3. Download the dataset ZIP to a temporary directory and extract it.
4. Validate CSV structure and image file presence.
5. Train a vision AutoML model within the given time budget.
6. Zip the model artifacts.
7. Upload the model and leaderboard back to AutoDW.

Returns:

Type	Description
`JSONResponse`	200 – success message and leaderboard summary.
`JSONResponse`	400 – validation error (bad inputs or unsupported dataset).
`JSONResponse`	502 – AutoDW communication failure.
`JSONResponse`	500 – unexpected runtime error.

Source code in app/vision_automl/router.py

@router.post("/best_model/")
async def find_best_model_for_vision(
    request: Request,
    user_id: Annotated[str, Form(..., description="User id from AutoDW")],
    dataset_id: Annotated[str, Form(..., description="Dataset id from AutoDW")],
    dataset_version: Annotated[
        str | None, Form(description="Optional dataset version")
    ] = "v1",
    filename_column: Annotated[
        str, Form(..., description="Filename column in labels.csv")
    ] = "filename",
    label_column: Annotated[
        str, Form(..., description="Label column in labels.csv")
    ] = "label",
    task_type: Annotated[
        str,
        Form(
            description=(
                "Vision task type. One of: "
                + ", ".join(sorted(SUPPORTED_VISION_TASK_TYPES))
            )
        ),
    ] = "image_classification",
    time_budget: Annotated[int, Form(..., description="Time budget in seconds")] = 60,
    model_size: Annotated[
        str, Form(..., description="Model size: small / medium / large")
    ] = "small",
    dataset_split: Annotated[
        str | None,
        Form(description="Dataset split to use for training (e.g., 'train')."),
    ] = None,
) -> JSONResponse:
    """
    Fetch a vision dataset from AutoDW, run AutoML training, and upload the best model.

    Steps:
      0. Reject an unsupported ``task_type`` before any network traffic.
      1. Fetch dataset metadata from AutoDW.
      2. Resolve the correct download URL (respecting splits if present).
      3. Download the dataset ZIP to a temporary directory and extract it.
      4. Validate CSV structure and image file presence.
      5. Train a vision AutoML model within the given time budget.
      6. Zip the model artifacts.
      7. Upload the model and leaderboard back to AutoDW.

    Args:
        request: Incoming request; only its ``X-Task-ID`` header is read and
            forwarded to the upload call.
        user_id: User id from AutoDW.
        dataset_id: Dataset id from AutoDW.
        dataset_version: Optional dataset version (defaults to ``"v1"``).
        filename_column: Filename column in labels.csv.
        label_column: Label column in labels.csv.
        task_type: Vision task type; must be one of
            ``SUPPORTED_VISION_TASK_TYPES``.
        time_budget: Time budget in seconds.
        model_size: Model size: small / medium / large.
        dataset_split: Dataset split to use for training (e.g. ``'train'``).

    Returns:
        200 – success message and leaderboard summary.
        400 – validation error (bad inputs or unsupported dataset).
        502 – AutoDW communication failure.
        500 – unexpected runtime error.

        If the model upload itself is rejected, the status code returned by
        the upload response is relayed as-is.
    """
    autodw_base = os.getenv("AUTODW_URL", "http://localhost:8000")
    upload_url = f"{autodw_base}/ai-models/upload/single/{user_id}"

    try:
        # 0. Cheap local check first: an unsupported task type must not cost
        # a metadata round-trip, a dataset download and an extraction.
        if task_type not in SUPPORTED_VISION_TASK_TYPES:
            return JSONResponse(
                status_code=400,
                content={
                    "error": f"Unsupported task_type '{task_type}'. "
                    f"Supported: {sorted(SUPPORTED_VISION_TASK_TYPES)}"
                },
            )

        # 1. Metadata
        metadata = fetch_dataset_metadata(
            autodw_base, user_id, dataset_id, dataset_version
        )

        if metadata.get("file_type") != "zip":
            return JSONResponse(
                status_code=400,
                content={"error": "Vision AutoML requires a ZIP dataset."},
            )

        # 2. Download URL
        download_url = resolve_download_url(
            autodw_base, user_id, dataset_id, dataset_version, metadata, dataset_split
        )

        with dataset_workspace(f"automl_{dataset_id}") as workdir:
            # 3. Download & extract
            zip_path = download_dataset(
                download_url, workdir, metadata.get("original_filename", "dataset.zip")
            )
            csv_path, images_dir = extract_and_locate_dataset(zip_path, workdir)

            # 4. Validate
            validation_error = validate_vision_inputs(
                csv_path, images_dir, filename_column, label_column, task_type
            )
            if validation_error:
                return JSONResponse(
                    status_code=400, content={"error": validation_error}
                )

            # 5. Train
            optuna_result = await train_automl(
                csv_path,
                images_dir,
                filename_column,
                label_column,
                time_budget,
                model_size,
                workdir=workdir,
                task_type=task_type,
            )

            # 6. Serialize (zip_path is rebound from the dataset archive to
            # the model archive here)
            zip_path = serialize_and_zip_model(workdir)
            leaderboard_json, leaderboard_str = convert_leaderboard_safely(
                optuna_result
            )

            # 7. Upload; the generated model id is discarded
            _, payload = build_upload_payload(
                dataset_id, dataset_version, metadata, task_type, leaderboard_json
            )
            upload_resp = upload_model(
                upload_url, zip_path, payload, request.headers.get("X-Task-ID")
            )

            if upload_resp.status_code >= 400:
                logger.error("Model upload failed: %s", upload_resp.text)
                return JSONResponse(
                    status_code=upload_resp.status_code,
                    content={"error": f"Failed to upload model: {upload_resp.text}"},
                )

        logger.info("Vision AutoML training completed and model uploaded successfully.")
        return JSONResponse(
            status_code=200,
            content={
                "message": "Vision AutoML training completed successfully and model uploaded to AutoDW",
                "leaderboard": leaderboard_str,
            },
        )

    # Map the known failure classes onto the documented status codes.
    except AutoMLValidationError as e:
        return JSONResponse(status_code=400, content={"error": str(e)})
    except AutoDWDownloadError as e:
        return JSONResponse(status_code=502, content={"error": f"AutoDW error: {e}"})
    except Exception as e:
        logger.exception("Unexpected error during vision AutoML")
        return JSONResponse(status_code=500, content={"error": str(e)})

`show_accepted_format_instructions()` `async` ¶

Show accepted format instructions from a template

Source code in app/vision_automl/router.py

@router.post("/accepted_format/")
async def show_accepted_format_instructions() -> JSONResponse:
    """Show accepted format instructions from a template.

    Returns:
        200 – ``{"instructions": ...}`` with the output of
        ``vision_data_instructions()``.
        500 – ``{"error": ...}`` if building the response raises.
    """
    try:
        return JSONResponse(
            content={"instructions": vision_data_instructions()}, status_code=200
        )
    except Exception as e:
        # This is the vision router; the message previously said "tabular",
        # which pointed log readers at the wrong service.
        logger.exception(
            "Unexpected error in finding data format instructions in vision"
        )
        return JSONResponse(status_code=500, content={"error": str(e)})

`show_deployment_instructions()` `async` ¶

Show deployment instructions from a template

Source code in app/vision_automl/router.py

@router.post("/deployment_instructions/")
async def show_deployment_instructions() -> JSONResponse:
    """Return the deployment instructions rendered from a template.

    Returns:
        200 – ``{"instructions": ...}`` with the output of
        ``deployment_instructions()``.
        500 – ``{"error": ...}`` if building the response raises.
    """
    try:
        instructions = deployment_instructions()
        success_body = {"instructions": instructions}
        return JSONResponse(status_code=200, content=success_body)
    except Exception as e:
        logger.exception(
            "Unexpected error in finding deployment instructions in vision"
        )
        failure_body = {"error": str(e)}
        return JSONResponse(status_code=500, content=failure_body)

Service layer for vision AutoML workflows.

Mirrors the structure of tabular_automl/services.py so both pipelines share a consistent public API consumed by their respective main.py files.

`build_upload_payload(dataset_id, dataset_version, metadata, task_type, leaderboard_json)` ¶

Return (model_id, form_data_dict) for the AutoDW upload request.

Mirrors tabular's build_upload_payload.

Source code in app/vision_automl/services.py

def build_upload_payload(
    dataset_id: str,
    dataset_version: str | None,
    metadata: dict,
    task_type: str,
    leaderboard_json: dict,
) -> tuple[str, dict]:
    """
    Return (model_id, form_data_dict) for the AutoDW upload request.

    Mirrors tabular's ``build_upload_payload``.
    """
    model_id = (
        f"vision_automl_{dataset_id}_{int(datetime.datetime.utcnow().timestamp())}"
    )
    data = {
        "model_id": model_id,
        "name": f"Vision AutoML Model - {dataset_id}",
        "description": "AutoML trained vision model",
        "framework": "pytorch",
        "model_type": task_type,
        "training_dataset": str(dataset_id),
        "training_dataset_version": dataset_version or metadata.get("version", "v1"),
        "leaderboard": json.dumps(leaderboard_json),
    }
    return model_id, data

`collect_missing_files(df, images_dir, filename_col, label_col)` ¶

Return a list of filenames referenced in the CSV but absent on disk.

Source code in app/vision_automl/services.py

def collect_missing_files(
    df: pd.DataFrame, images_dir: Path, filename_col: str, label_col: str
) -> list[str]:
    """Return a list of filenames referenced in the CSV but absent on disk.

    A filename counts as present when it exists directly under
    ``images_dir`` or when exactly one file with that name is found by a
    recursive search. Zero matches, or several ambiguous matches (which are
    also logged), are reported as missing.

    Args:
        df: Table holding the filename column.
        images_dir: Directory searched for the files.
        filename_col: Name of the column containing the filenames.
        label_col: Unused; kept so existing callers keep working.

    Returns:
        The original column values whose files could not be located, in
        row order.
    """
    import glob  # local import: only needed to escape glob metacharacters

    missing = []
    # Iterate the column directly; iterrows() would build a Series per row.
    for filename in df[filename_col]:
        # Non-string cell values (e.g. numeric ids) cannot be joined to a
        # Path, so work with their text form.
        name = str(filename)

        if (images_dir / name).exists():
            continue

        try:
            # Escape so that names such as "img[1].jpg" are matched literally
            # rather than being interpreted as glob patterns.
            matches = list(images_dir.rglob(glob.escape(name)))
        except (ValueError, NotImplementedError):
            # rglob rejects empty and absolute patterns; such a name cannot
            # be located inside images_dir.
            matches = []

        if len(matches) == 1:
            continue
        if len(matches) > 1:
            logger.warning("Multiple matches for %s: %s", filename, matches)

        missing.append(filename)
    return missing

`convert_leaderboard_safely(optuna_result)` ¶

Extract leaderboard information from an Optuna result dict.

Returns (leaderboard_json, leaderboard_str) — mirrors the tabular convert_leaderboard_safely signature so main.py can treat both pipelines identically.

Source code in app/vision_automl/services.py

def convert_leaderboard_safely(optuna_result: dict) -> tuple[dict, str]:
    """
    Summarise an Optuna result dict as a leaderboard.

    Returns (leaderboard_json, leaderboard_str), matching the signature of
    the tabular ``convert_leaderboard_safely`` so that both pipelines can be
    handled the same way. Keys missing from ``optuna_result`` become
    ``None`` in the summary.
    """
    # (leaderboard key, key looked up in the Optuna result)
    key_pairs = (
        ("best_loss", "best_value"),
        ("best_params", "best_params"),
        ("trials", "n_trials"),
    )
    summary = {target: optuna_result.get(source) for target, source in key_pairs}
    return summary, json.dumps(summary, indent=2)

`download_dataset(download_url, workdir, original_filename)` ¶

Stream-download the ZIP dataset and return its local path.

Source code in app/vision_automl/services.py

def download_dataset(download_url: str, workdir: Path, original_filename: str) -> Path:
    """Stream-download the ZIP dataset and return its local path.

    Args:
        download_url: URL the archive is fetched from.
        workdir: Directory the archive is written into.
        original_filename: Suggested file name. Only its final path
            component is used; an unusable name falls back to
            ``dataset.zip``.

    Returns:
        Path of the saved archive inside ``workdir``.

    Raises:
        AutoDWDownloadError: If the HTTP request fails or the file cannot
            be written.
    """
    # The name originates from remote metadata, so never let it steer the
    # write outside workdir: normalise separators, keep the last component
    # only, and reject empty / dot-only results.
    safe_name = Path(str(original_filename or "").replace("\\", "/")).name
    if safe_name in ("", ".", ".."):
        safe_name = "dataset.zip"
    zip_path = workdir / safe_name
    try:
        with requests.get(
            download_url,
            stream=True,
            timeout=60,
            headers={"Accept-Encoding": "gzip, deflate"},
        ) as resp:
            resp.raise_for_status()
            with open(zip_path, "wb") as f:
                # Stream in 1 MiB chunks so the archive is never held in
                # memory as a whole.
                for chunk in resp.iter_content(chunk_size=1024 * 1024):
                    f.write(chunk)
    except requests.RequestException as e:
        logger.error(f"Failed to download dataset from {download_url}: {e}")
        raise AutoDWDownloadError(f"Failed to download dataset from AutoDW: {e}") from e
    except OSError as e:
        logger.error(f"Failed to write dataset to {zip_path}: {e}")
        raise AutoDWDownloadError(f"Failed to save dataset file: {e}") from e
    logger.info("Dataset ZIP saved to %s", zip_path)
    return zip_path

`extract_and_locate_dataset(zip_path, workdir)` ¶

Extract a vision dataset ZIP and return (csv_path, images_dir).

Raises DatasetValidationError for structural problems.

Source code in app/vision_automl/services.py

def extract_and_locate_dataset(zip_path: Path, workdir: Path) -> tuple[Path, Path]:
    """
    Extract a vision dataset ZIP and return (csv_path, images_dir).

    The archive is unpacked into ``workdir / "dataset"``; the dataset root,
    the CSV file and the images directory are then located by the private
    helpers of this module.

    Raises DatasetValidationError for structural problems.
    NOTE(review): the raise happens in the helpers, which are not visible
    here — confirm the exception type against them.
    """
    extract_dir = workdir / "dataset"
    extract_dir.mkdir(exist_ok=True)
    try:
        shutil.unpack_archive(zip_path, extract_dir)
    except shutil.ReadError:
        # unpack_archive picks the format from the file extension and fails
        # with "Unknown archive format" when the downloaded file was saved
        # under a name without ".zip". This pipeline only handles ZIP
        # datasets, so retry with the format stated explicitly; a genuinely
        # corrupt archive raises again here.
        shutil.unpack_archive(zip_path, extract_dir, "zip")

    dataset_root = _find_valid_dataset_root(extract_dir)
    csv_path = _find_csv_file(dataset_root)
    images_dir = _find_or_resolve_images_dir(dataset_root, csv_path)
    return csv_path, images_dir

`fetch_dataset_metadata(autodw_base, user_id, dataset_id, dataset_version)` ¶

Fetch and return dataset metadata from AutoDW.

Source code in app/vision_automl/services.py

def fetch_dataset_metadata(
    autodw_base: str,
    user_id: str,
    dataset_id: str,
    dataset_version: str | None,
) -> dict:
    """Fetch and return dataset metadata from AutoDW.

    Args:
        autodw_base: Base URL of the AutoDW service.
        user_id: Owner of the dataset.
        dataset_id: Dataset identifier.
        dataset_version: Optional dataset version.

    Returns:
        The decoded JSON object returned by AutoDW.

    Raises:
        AutoDWDownloadError: If the request fails, the response is not
            valid JSON, or the JSON payload is not an object.
    """
    metadata_url = _build_metadata_url(
        autodw_base, user_id, dataset_id, dataset_version
    )
    logger.debug("Fetching dataset metadata: %s", metadata_url)
    try:
        resp = requests.get(metadata_url, timeout=15)
        resp.raise_for_status()
    except requests.RequestException as e:
        logger.error(f"Failed to fetch dataset metadata: {e}")
        raise AutoDWDownloadError(
            f"Failed to fetch dataset metadata from AutoDW: {e}"
        ) from e

    try:
        metadata = resp.json()
    except ValueError as e:  # json.JSONDecodeError is a ValueError subclass
        logger.error(f"Failed to parse JSON response from AutoDW: {e}")
        raise AutoDWDownloadError(f"Invalid JSON response from AutoDW: {e}") from e

    # Callers treat the metadata as a mapping (``metadata.get(...)``); a JSON
    # list, string or null would otherwise fail later with an AttributeError
    # instead of a clear AutoDW error.
    if not isinstance(metadata, dict):
        logger.error(
            "Unexpected metadata payload type from AutoDW: %s",
            type(metadata).__name__,
        )
        raise AutoDWDownloadError(
            "Invalid JSON response from AutoDW: expected an object, "
            f"got {type(metadata).__name__}"
        )
    return metadata

`get_num_params_if_available(repo_id, revision=None)` ¶

Try to retrieve number of parameters for a HF model, if available.

Source code in app/vision_automl/services.py

def get_num_params_if_available(
    repo_id: str, revision: str | None = None
) -> int | None:
    """Look up the parameter count of a Hugging Face model, if it is published.

    The count is read from the ``total`` attribute of the model info's
    ``safetensors`` entry. ``None`` is returned when that entry is absent or
    when the lookup raises; the failure is logged as a warning.
    """
    logger.debug("Fetching parameter count for model %s", repo_id)
    hub = HfApi()
    try:
        model_info = hub.model_info(repo_id, revision=revision, files_metadata=True)
        safetensors_meta = getattr(model_info, "safetensors", None)
        if safetensors_meta is None:
            return None
        return safetensors_meta.total
    except Exception as e:
        logger.warning("Failed to retrieve num_params for %s: %s", repo_id, e)
        return None

`normalize_dataframe_filenames(df, filename_column, csv_path)` ¶

Normalize filenames to basenames and persist CSV back to disk.

Source code in app/vision_automl/services.py

def normalize_dataframe_filenames(
    df: pd.DataFrame, filename_column: str, csv_path: Path
) -> pd.DataFrame:
    """Reduce every filename to its basename and write the CSV back to disk.

    Backslashes are treated as path separators. When ``filename_column`` is
    not present, a warning is logged and nothing is modified or written.
    The (possibly mutated) input frame is returned.
    """
    logger.info("Normalizing filenames in column '%s'", filename_column)

    if filename_column not in df.columns:
        logger.warning(
            "Filename column '%s' not found during normalization", filename_column
        )
        return df

    def _basename(value):
        # Unify Windows-style separators before taking the last component.
        return os.path.basename(str(value).replace("\\", "/"))

    as_text = df[filename_column].astype(str)
    df[filename_column] = as_text.map(_basename)
    df.to_csv(csv_path, index=False)
    logger.debug("Normalized filenames saved to %s", csv_path)
    return df

`resolve_download_url(autodw_base, user_id, dataset_id, dataset_version, metadata, split)` ¶

Determine the correct dataset download URL, accounting for splits.

Source code in app/vision_automl/services.py

def resolve_download_url(
    autodw_base: str,
    user_id: str,
    dataset_id: str,
    dataset_version: str | None,
    metadata: dict,
    split: str | None,
) -> str:
    """Determine the correct dataset download URL, accounting for splits.

    Args:
        autodw_base: Base URL of the AutoDW service.
        user_id: Owner of the dataset.
        dataset_id: Dataset identifier.
        dataset_version: Optional dataset version.
        metadata: Dataset metadata; ``custom_metadata["split"]`` signals
            that the dataset has splits.
        split: Requested split, or ``None`` for the full dataset.

    Returns:
        The ``.../download`` URL, with a ``?split=<split>`` query appended
        only when a split was requested and the dataset has splits.
    """
    from urllib.parse import quote  # local import: only needed here

    base_url = _build_metadata_url(autodw_base, user_id, dataset_id, dataset_version)
    download_url = f"{base_url}/download"

    # "custom_metadata" may be present but null (or not a mapping at all);
    # treat every such case as "no splits" instead of raising.
    custom_metadata = metadata.get("custom_metadata")
    has_split = isinstance(custom_metadata, dict) and bool(
        custom_metadata.get("split")
    )
    if split and has_split:
        # The split name is caller-supplied form input: percent-encode it so
        # characters such as "&", "#" or spaces cannot alter the query.
        download_url = f"{download_url}?split={quote(split, safe='')}"
        logger.info(
            "Dataset has splits; downloading '%s' split from: %s", split, download_url
        )
    else:
        if split and not has_split:
            logger.warning(
                "split='%s' was requested but dataset has no splits; "
                "downloading full dataset.",
                split,
            )
        logger.debug("Downloading full dataset ZIP: %s", download_url)

    return download_url

`resolve_images_root(images_dir)` ¶

Resolve common nested packaging patterns inside uploaded image zips.

Source code in app/vision_automl/services.py

def resolve_images_root(images_dir: Path) -> Path:
    """Unwrap common nesting patterns found in uploaded image archives.

    Two adjustments are applied in order:

    1. If ``images_dir`` contains an ``images`` sub-directory, descend into it.
    2. If the resulting directory holds no files and exactly one
       sub-directory, descend into that sub-directory.

    Errors while listing the directory are logged and otherwise ignored.
    """
    logger.info("Resolving image directory structure at %s", images_dir)

    candidate = images_dir / "images"
    if candidate.exists() and candidate.is_dir():
        logger.debug("Detected nested 'images' folder, using it as root")
        images_dir = candidate

    try:
        subdirs = []
        files = []
        for entry in list(images_dir.iterdir()):
            # Checked independently: an entry may be neither a directory nor
            # a regular file.
            if entry.is_dir():
                subdirs.append(entry)
            if entry.is_file():
                files.append(entry)
        if not files and len(subdirs) == 1:
            sole_dir = subdirs[0]
            logger.debug("Detected single top-level directory: %s", sole_dir)
            images_dir = sole_dir
    except Exception as e:
        logger.warning("Error resolving image root: %s", e)

    return images_dir

`search_hf_for_pytorch_models_with_estimated_parameters(filter='image-classification', limit=3, sort='downloads')` ¶

Search HF for PyTorch image-classification models annotated with param counts.

Source code in app/vision_automl/services.py

def search_hf_for_pytorch_models_with_estimated_parameters(
    filter: str = "image-classification", limit: int = 3, sort: str = "downloads"
) -> list[dict[str, Any]]:
    """List PyTorch models from the Hugging Face Hub together with their parameter counts.

    Args:
        filter: Hub filter passed to ``HfApi.list_models``.
        limit: Maximum number of models requested from the Hub.
        sort: Hub attribute to sort by (requested in descending direction).

    Returns:
        One dict per model for which ``get_num_params_if_available`` returned a
        truthy value, with keys ``model_id``, ``downloads``, ``likes``,
        ``last_modified``, ``private`` and ``num_params``. Models without a
        parameter count are dropped, so the list may be shorter than ``limit``.
    """
    logger.info("Searching Hugging Face models for filter='%s'", filter)
    candidates = HfApi().list_models(
        filter=filter,
        library="pytorch",
        sort=sort,
        direction=-1,
        limit=limit,
    )

    annotated: list[dict[str, Any]] = []
    for model in candidates:
        param_count = get_num_params_if_available(model.id)
        if not param_count:
            # No usable parameter estimate for this model: skip it.
            continue
        entry = {
            "model_id": model.id,
            "downloads": getattr(model, "downloads", None),
            "likes": getattr(model, "likes", None),
            "last_modified": getattr(model, "lastModified", None),
            "private": getattr(model, "private", None),
            "num_params": param_count,
        }
        annotated.append(entry)

    logger.info("Found %d models with parameter info", len(annotated))
    return annotated

`serialize_and_zip_model(workdir)` ¶

Package the trained model directory into a ZIP archive.

Returns the path to the ZIP file. Mirrors tabular's serialize_and_zip_predictor.

Source code in app/vision_automl/services.py

def serialize_and_zip_model(workdir: Path) -> Path:
    """
    Package the trained model directory into a ZIP archive.

    The ``model`` sub-directory of ``workdir`` is created if missing and
    archived as ``<workdir>/vision_model.zip``. Deployment instructions are
    written, on a best-effort basis, to
    ``<workdir>/vision_deployment_instructions.md``.

    Returns the path to the ZIP file.
    Mirrors tabular's ``serialize_and_zip_predictor``.

    Raises:
        AutoMLSerializationError: If the archive cannot be created.
    """
    model_dir = workdir / "model"
    model_dir.mkdir(exist_ok=True)

    # NOTE(review): the instructions file lives next to the model directory,
    # not inside it, so it is not part of the archive - confirm this is intended.
    instructions_path = workdir / "vision_deployment_instructions.md"
    try:
        # Render first so a template failure does not leave an empty file behind,
        # then open for writing (the default read mode can never be written to).
        instructions_text = deployment_instructions()
        with open(instructions_path, "w", encoding="utf-8") as f:
            f.write(instructions_text)
    except Exception as e:
        # Best effort: missing instructions must not block model packaging.
        logger.debug(f"No deployment_instructions found, {e}")

    zip_base = workdir / "vision_model"
    try:
        shutil.make_archive(str(zip_base), "zip", model_dir)
    except Exception as e:
        logger.error(f"Failed to create zip archive: {e}")
        raise AutoMLSerializationError(f"Failed to zip model: {e}") from e
    zip_path = zip_base.with_suffix(".zip")
    logger.debug("Model artifacts zipped to %s", zip_path)
    return zip_path

`sort_models_by_size(models, size_tier)` ¶

Filter and sort models by size tier based on estimated parameter counts.

Source code in app/vision_automl/services.py

def sort_models_by_size(
    models: list[dict[str, Any]], size_tier: str
) -> list[dict[str, Any]]:
    """Filter and sort models by size tier based on estimated parameter counts.

    Tier bounds come from the ``MODEL_SMALL_MAX_PARAM_SIZE`` (default
    50,000,000) and ``MODEL_MEDIUM_MAX_PARAM_SIZE`` (default 200,000,000)
    environment variables:

    * ``small``  - ``0 <= num_params <= SMALL_MAX``
    * ``medium`` - ``SMALL_MAX < num_params <= MEDIUM_MAX``
    * ``large``  - ``num_params > MEDIUM_MAX``
    * any other tier keeps every model that has a parameter count.

    Args:
        models: Model dicts, each optionally carrying a ``num_params`` entry.
        size_tier: Tier name; compared case-insensitively after stripping.

    Returns:
        The matching models sorted by ascending ``num_params``. If no model
        matches the tier, all ``models`` are returned instead, with entries
        lacking a parameter count placed last.
    """
    logger.info("Sorting models by size tier: %s", size_tier)
    tier = str(size_tier).strip().lower()

    SMALL_MAX: int = int(os.getenv("MODEL_SMALL_MAX_PARAM_SIZE", 50_000_000))
    MEDIUM_MIN: int = SMALL_MAX + 1
    MEDIUM_MAX: int = int(os.getenv("MODEL_MEDIUM_MAX_PARAM_SIZE", 200_000_000))
    LARGE_MIN: int = MEDIUM_MAX + 1

    def in_tier(m: dict[str, Any]) -> bool:
        n = m.get("num_params")
        if n is None:
            return False
        if tier == "small":
            return 0 <= n <= SMALL_MAX
        if tier == "medium":
            return MEDIUM_MIN <= n <= MEDIUM_MAX
        if tier == "large":
            return n >= LARGE_MIN
        return True

    def size_key(m: dict[str, Any]) -> tuple[bool, Any]:
        # Treat a missing key and an explicit None identically so the second
        # tuple element is always comparable (None vs int raises TypeError).
        n = m.get("num_params")
        return (n is None, 0 if n is None else n)

    filtered = [m for m in models if in_tier(m)]
    if not filtered:
        logger.warning("No models matched tier '%s'; falling back to all models", tier)
        filtered = models

    return sorted(filtered, key=size_key)

`train_automl(csv_path, images_dir, filename_column, label_column, time_budget, model_size, workdir, task_type='image_classification')` `async` ¶

Run Optuna-based vision AutoML and return the result dict.

Source code in app/vision_automl/services.py

async def train_automl(
    csv_path: Path,
    images_dir: Path,
    filename_column: str,
    label_column: str,
    time_budget: int,
    model_size: str,
    workdir: Path,
    task_type: str = "image_classification",
) -> dict:
    """Run the Optuna search for a vision task in a worker thread.

    The number of trials is one per 60 units of ``time_budget``, clamped to
    the range 1..25; ``time_budget`` itself is forwarded as the search timeout.

    Returns:
        Whatever ``run_optuna_search`` returns for the given arguments.
    """
    # One trial per full minute of budget, never fewer than 1 nor more than 25.
    capped_trials = min(25, time_budget // 60)
    trial_count = max(1, capped_trials)

    search_kwargs = {
        "task_type": task_type,
        "csv_path": csv_path,
        "images_dir": images_dir,
        "filename_column": filename_column,
        "label_column": label_column,
        "n_trials": trial_count,
        "timeout": time_budget,
        "model_size": model_size,
        "workdir": workdir,
    }
    return await run_in_threadpool(run_optuna_search, **search_kwargs)

`train_automl_multimodal(csv_path, images_dir, filename_column, label_column, auxiliary_columns, time_budget, model_size, workdir)` `async` ¶

Run Optuna-based multimodal vision AutoML and return the result dict.

Source code in app/vision_automl/services.py

async def train_automl_multimodal(
    csv_path: Path,
    images_dir: Path,
    filename_column: str,
    label_column: str,
    auxiliary_columns: list[str],
    time_budget: int,
    model_size: str,
    workdir: Path,
) -> dict:
    """Run the Optuna search for multimodal image classification in a worker thread.

    The task type is fixed to ``image_classification_multimodal`` and
    ``auxiliary_columns`` is forwarded to the search. The number of trials is
    one per 60 units of ``time_budget``, clamped to the range 1..25.

    Returns:
        Whatever ``run_optuna_search`` returns for the given arguments.
    """
    # One trial per full minute of budget, never fewer than 1 nor more than 25.
    capped_trials = min(25, time_budget // 60)
    trial_count = max(1, capped_trials)

    search_kwargs = {
        "task_type": "image_classification_multimodal",
        "csv_path": csv_path,
        "images_dir": images_dir,
        "filename_column": filename_column,
        "label_column": label_column,
        "auxiliary_columns": auxiliary_columns,
        "n_trials": trial_count,
        "timeout": time_budget,
        "model_size": model_size,
        "workdir": workdir,
    }
    return await run_in_threadpool(run_optuna_search, **search_kwargs)

`upload_model(upload_url, zip_path, payload, task_id)` ¶

Upload the zipped model to AutoDW and return the raw response.

Source code in app/vision_automl/services.py

def upload_model(
    upload_url: str,
    zip_path: Path,
    payload: dict,
    task_id: str | None,
) -> requests.Response:
    """POST the zipped model to AutoDW as a multipart upload.

    Args:
        upload_url: Endpoint receiving the upload.
        zip_path: Archive to send; must exist.
        payload: Form fields sent alongside the file.
        task_id: When truthy, sent as the ``X-Task-ID`` header.

    Returns:
        The raw ``requests`` response; the status code is not inspected here.

    Raises:
        AutoMLDataError: If ``zip_path`` does not exist.
        AutoDWUploadError: If the request fails or the archive cannot be read.
    """
    if not zip_path.exists():
        raise AutoMLDataError(f"Zip file not found: {zip_path}")

    request_headers = {}
    if task_id:
        request_headers["X-Task-ID"] = task_id

    try:
        with zip_path.open("rb") as archive:
            logger.debug("Uploading vision model to %s", upload_url)
            multipart = {
                "file": (zip_path.name, archive, "application/octet-stream")
            }
            response = requests.post(
                upload_url,
                headers=request_headers,
                files=multipart,
                data=payload,
                timeout=120,
            )
    except requests.RequestException as e:
        # Must be handled before OSError: RequestException derives from it.
        logger.error(f"Failed to upload model to {upload_url}: {e}")
        raise AutoDWUploadError(f"Failed to upload model to AutoDW: {e}") from e
    except OSError as e:
        logger.error(f"Failed to read zip file {zip_path}: {e}")
        raise AutoDWUploadError(f"Failed to read zip file: {e}") from e
    return response

`validate_multimodal_inputs(csv_path, images_dir, filename_column, label_column, exclude_columns=None)` ¶

Validate dataset structure for multimodal image classification.

Auto-discovers auxiliary columns (all columns except filename_column, label_column, and exclude_columns), validates their presence and contents, and checks image file existence.

Returns:

Type	Description
`tuple[str \| None, list[str]]`	(error_string_or_None, auxiliary_columns_list).

Source code in app/vision_automl/services.py

def validate_multimodal_inputs(
    csv_path: Path,
    images_dir: Path,
    filename_column: str,
    label_column: str,
    exclude_columns: list[str] | None = None,
) -> tuple[str | None, list[str]]:
    """Check that a labels CSV and image folder are usable for multimodal training.

    Checks run in this order, stopping at the first failure: the CSV is
    readable; the filename and label columns exist; at least one auxiliary
    column is discovered (via ``_discover_auxiliary_columns``); no auxiliary
    column is entirely null; every referenced image file exists.

    Returns:
        ``(None, auxiliary_columns)`` when everything is valid, otherwise
        ``(error_message, [])``.
    """
    try:
        labels_df = pd.read_csv(csv_path)
    except Exception as exc:
        return f"Could not read labels CSV: {exc}", []

    required_columns = ((filename_column, "Filename"), (label_column, "Label"))
    for col, role in required_columns:
        if col in labels_df.columns:
            continue
        return f"{role} column '{col}' not found in labels CSV", []

    auxiliary_columns = _discover_auxiliary_columns(
        labels_df, filename_column, label_column, exclude_columns
    )
    if not auxiliary_columns:
        no_aux_message = (
            "No auxiliary columns found in the CSV. "
            "Use the standard /best_model/ endpoint for image-only classification."
        )
        return no_aux_message, []

    for col in auxiliary_columns:
        column_is_all_null = labels_df[col].isnull().all()
        if column_is_all_null:
            return f"Auxiliary column '{col}' is entirely null", []

    labels_df = normalize_dataframe_filenames(labels_df, filename_column, csv_path)

    missing = collect_missing_files(
        labels_df, images_dir, filename_column, label_column
    )
    if not missing:
        return None, auxiliary_columns

    # Report only the first five offenders to keep the message short.
    preview = missing[:5]
    suffix = ""
    if len(missing) > 5:
        suffix = "..."
    return f"Missing {len(missing)} image file(s): {preview}{suffix}", []

`validate_vision_inputs(csv_path, images_dir, filename_column, label_column, task_type='image_classification')` ¶

Validate dataset structure for the given task type.

Returns an error string on failure, or None if everything is valid. Mirrors the signature/contract of tabular's validate_tabular_inputs.

Parameters:

Name	Type	Description	Default
`csv_path`	`Path`	Path to the labels CSV.	required
`images_dir`	`Path`	Root directory containing image/audio/video files. Unused for pure text tasks.	required
`filename_column`	`str`	Column name containing file paths (image/audio tasks).	required
`label_column`	`str`	Column name containing labels (classification tasks).	required
`task_type`	`str`	One of the supported task type slugs.	`'image_classification'`

Source code in app/vision_automl/services.py

def validate_vision_inputs(
    csv_path: Path,
    images_dir: Path,
    filename_column: str,
    label_column: str,
    task_type: str = "image_classification",
) -> str | None:
    """Validate the dataset layout required by ``task_type``.

    Three families of tasks are handled:

    * ``audio_classification`` - the CSV and the audio directory must exist
      and the CSV must contain the filename and label columns.
    * text tasks (keys of ``_TEXT_REQUIRED_COLUMNS``) - the CSV must contain
      the columns required for that task.
    * everything else (image tasks) - the CSV must contain the filename and
      label columns, any annotation columns listed in
      ``_DETECTION_EXTRA_COLUMNS`` for the task, and every referenced file
      must be present under ``images_dir``.

    Mirrors the signature/contract of tabular's ``validate_tabular_inputs``.

    Args:
        csv_path: Path to the labels CSV.
        images_dir: Root directory containing image/audio/video files.
            Unused for pure text tasks.
        filename_column: Column name containing file paths (image/audio tasks).
        label_column: Column name containing labels (classification tasks).
        task_type: One of the supported task type slugs.

    Returns:
        An error string describing the first problem found, or ``None`` when
        the inputs are valid.
    """

    def _load_labels():
        # Returns (dataframe, None) on success or (None, error message).
        try:
            return pd.read_csv(csv_path), None
        except Exception as exc:
            return None, f"Could not read labels CSV: {exc}"

    def _filename_and_label_error(frame):
        # First missing one of the two mandatory columns, filename first.
        for col, role in ((filename_column, "Filename"), (label_column, "Label")):
            if col not in frame.columns:
                return f"{role} column '{col}' not found in labels CSV"
        return None

    # Audio task: validate audio dir + CSV
    if task_type == "audio_classification":
        if not csv_path.exists():
            return f"Labels CSV not found: {csv_path}"
        if not images_dir.exists():
            return f"Audio directory not found: {images_dir}"
        df, read_error = _load_labels()
        if read_error is not None:
            return read_error
        return _filename_and_label_error(df)

    # Text tasks: validate CSV + required columns
    if task_type in _TEXT_REQUIRED_COLUMNS:
        df, read_error = _load_labels()
        if read_error is not None:
            return read_error
        absent_cols = [
            c for c in _TEXT_REQUIRED_COLUMNS[task_type] if c not in df.columns
        ]
        if absent_cols:
            return f"Required column(s) missing for {task_type}: {absent_cols}"
        return None

    # Image tasks: CSV + image presence checks
    df, read_error = _load_labels()
    if read_error is not None:
        return read_error

    column_error = _filename_and_label_error(df)
    if column_error is not None:
        return column_error

    # Detection/segmentation tasks: validate annotation columns
    if task_type in _DETECTION_EXTRA_COLUMNS:
        absent_cols = [
            c for c in _DETECTION_EXTRA_COLUMNS[task_type] if c not in df.columns
        ]
        if absent_cols:
            return (
                f"Required annotation column(s) missing for {task_type}: {absent_cols}"
            )

    df = normalize_dataframe_filenames(df, filename_column, csv_path)

    missing = collect_missing_files(df, images_dir, filename_column, label_column)
    if not missing:
        return None

    # Report only the first five offenders to keep the message short.
    preview = missing[:5]
    suffix = ""
    if len(missing) > 5:
        suffix = "..."
    return f"Missing {len(missing)} image file(s): {preview}{suffix}"

`vision_data_instructions()` ¶

Return instructions describing what kind of data the vision AutoML engine accepts.

Source code in app/vision_automl/services.py

def vision_data_instructions() -> str:
    """Return instructions describing the data formats the vision AutoML engine accepts.

    Renders the ``vision_accepted_format.md`` template. Falls back to a fixed
    message when no Jinja environment is configured or rendering fails.
    """
    if jinja_environment is None:
        logger.warning("jinja_environment is None, returning default formats")
        return "Ask the agent for help"

    try:
        rendered = render_template(jinja_environment, "vision_accepted_format.md")
    except Exception as e:
        logger.error(f"Failed to render accepted format instructions: {e}")
        return "No accepted format instructions available"
    return rendered

ML engine¶

Per-task hyperparameter and model config loader.

`load_task_config(task_type)` ¶

Load and return the JSON config for the given task type.

Parameters:

Name	Type	Description	Default
`task_type`	`str`	One of the supported task type slugs.	required

Returns:

Type	Description
`dict`	Dict with keys: small_models, medium_models, large_models, lr_low, lr_high, batch_sizes, weight_decay_low, weight_decay_high, max_epochs, early_stopping_patience.

Raises:

Type	Description
`ValueError`	If the task type is not supported.

Source code in app/vision_automl/ml_engine/configs/__init__.py

def load_task_config(task_type: str) -> dict:
    """Read the JSON configuration file belonging to ``task_type``.

    The file is looked up as ``<task_type>.json`` inside the package's
    configs directory.

    Args:
        task_type: One of the supported task type slugs.

    Returns:
        Dict with keys: small_models, medium_models, large_models,
        lr_low, lr_high, batch_sizes, weight_decay_low, weight_decay_high,
        max_epochs, early_stopping_patience.

    Raises:
        ValueError: If the task type is not supported.
    """
    if task_type in SUPPORTED_TASK_TYPES:
        config_file = _CONFIGS_DIR / f"{task_type}.json"
        with open(config_file) as handle:
            parsed = json.load(handle)
        return parsed

    supported = sorted(SUPPORTED_TASK_TYPES)
    raise ValueError(f"Unknown task type '{task_type}'. Supported: {supported}")

`AudioClassificationDataModule` ¶

Datamodule for audio classification tasks.

CSV columns: audio_path (relative to root_dir) and label. Audio is loaded with torchaudio (must be installed separately).

Source code in app/vision_automl/ml_engine/datamodule.py

class AudioClassificationDataModule:
    """Datamodule for audio classification tasks.

    CSV columns: ``audio_path`` (relative to ``root_dir``) and ``label``.
    Audio is loaded with ``torchaudio`` (must be installed separately).

    Construction is eager: ``__init__`` calls :meth:`setup`, which reads the
    CSV, encodes the labels as integer ids, creates stratified
    train/validation/test splits and loads the Hugging Face feature extractor
    for ``hf_model_id``. Any failure in those steps is logged and propagates
    out of the constructor.
    """

    def __init__(
        self,
        csv_file: Path,
        root_dir: Path,
        audio_col: str = "audio_path",
        label_col: str = "label",
        sampling_rate: int = 16000,
        batch_size: int = DEFAULT_BATCH_SIZE,
        num_workers: int = DEFAULT_NUM_WORKERS,
        val_split: float = DEFAULT_VAL_SPLIT,
        test_split: float = DEFAULT_TEST_SPLIT,
        seed: int = 42,
        hf_model_id: str = "facebook/wav2vec2-base",
    ) -> None:
        """Store the configuration and immediately prepare splits and extractor.

        Args:
            csv_file: Labels CSV to read.
            root_dir: Directory that audio paths in the CSV are joined onto.
            audio_col: CSV column holding the audio file path.
            label_col: CSV column holding the class label.
            sampling_rate: Target sampling rate; audio with a different rate
                is resampled to it when an item is loaded.
            batch_size: Batch size used by every dataloader.
            num_workers: Worker count passed to every dataloader.
            val_split: Fraction of the data used for validation.
            test_split: Fraction of the data used for testing.
            seed: ``random_state`` for both split steps.
            hf_model_id: Model id the feature extractor is loaded from.
        """
        self.csv_file = Path(csv_file)
        self.root_dir = Path(root_dir)
        self.audio_col = audio_col
        self.label_col = label_col
        self.sampling_rate = sampling_rate
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.val_split = val_split
        self.test_split = test_split
        self.seed = seed
        self.hf_model_id = hf_model_id
        # Populated by setup(): label vocabulary, extractor and the three splits.
        self.num_classes: int = 0
        self.id2label: dict[int, str] = {}
        self.label2id: dict[str, int] = {}
        self.feature_extractor: AutoFeatureExtractor | None = None
        self.train_df: pd.DataFrame | None = None
        self.val_df: pd.DataFrame | None = None
        self.test_df: pd.DataFrame | None = None
        self.setup()

    def setup(self) -> None:
        """Read the CSV, encode labels, split the data and load the extractor.

        Raises:
            AutoMLDataError: If the CSV file is empty.
            FileNotFoundError, pandas.errors.ParserError: Re-raised after
                logging when the CSV cannot be read or parsed.
            KeyError: If ``label_col`` is not a column of the CSV.
            ValueError: If either split step fails.
            Exception: Any error from loading the feature extractor is logged
                and re-raised unchanged.
        """
        try:
            df = pd.read_csv(self.csv_file)
        except FileNotFoundError as e:
            logger.error("Dataset file not found: %s", e)
            raise
        except pd.errors.EmptyDataError:
            logger.error("Dataset file is empty: %s", self.csv_file)
            raise AutoMLDataError(f"Dataset file is empty: {self.csv_file}")
        except pd.errors.ParserError as e:
            logger.error("Failed to parse dataset CSV: %s", e)
            raise
        except Exception as e:
            logger.error("Unexpected error reading dataset: %s", e)
            raise

        try:
            # Sorted unique labels give a deterministic label -> id assignment.
            classes = sorted(df[self.label_col].unique().tolist())
        except KeyError as e:
            logger.error(
                "Label column '%s' not found in dataset: %s", self.label_col, e
            )
            raise

        self.num_classes = len(classes)
        self.id2label = {i: c for i, c in enumerate(classes)}
        self.label2id = {c: i for i, c in enumerate(classes)}
        # Work on a copy and replace the raw labels with their integer ids.
        df = df.copy()
        df[self.label_col] = df[self.label_col].map(self.label2id)

        try:
            # Step 1: carve off validation + test together, stratified by label.
            train_df, temp_df = train_test_split(
                df,
                test_size=self.val_split + self.test_split,
                stratify=df[self.label_col],
                random_state=self.seed,
            )
            # Step 2: split the held-out part; relative_val is the share of it
            # that belongs to validation, the remainder becomes the test set.
            relative_val = self.val_split / (self.val_split + self.test_split)
            val_df, test_df = train_test_split(
                temp_df,
                test_size=1 - relative_val,
                stratify=temp_df[self.label_col],
                random_state=self.seed,
            )
        except ValueError as e:
            logger.error("Failed to split dataset: %s", e)
            raise

        # Reset indices so each split is addressable positionally from 0.
        self.train_df = train_df.reset_index(drop=True)
        self.val_df = val_df.reset_index(drop=True)
        self.test_df = test_df.reset_index(drop=True)

        try:
            self.feature_extractor = AutoFeatureExtractor.from_pretrained(
                self.hf_model_id
            )
        except Exception as e:
            logger.error(
                "Failed to load feature extractor from %s: %s", self.hf_model_id, e
            )
            raise

    def _make_dataset(self, df: pd.DataFrame) -> Dataset:
        """Wrap ``df`` in a Dataset yielding ``(waveform, label)`` pairs.

        ``torchaudio`` is imported lazily here so the module can be imported
        without it.

        Raises:
            ImportError: If ``torchaudio`` is not installed.
        """
        try:
            import torchaudio
        except ImportError as e:
            raise ImportError(
                "torchaudio is required for audio tasks. "
                "Install it with: pip install torchaudio"
            ) from e

        # Captured by the nested dataset class below via closure.
        root = self.root_dir
        audio_col = self.audio_col
        label_col = self.label_col
        target_sr = self.sampling_rate

        class _AudioDataset(Dataset):
            def __init__(self, df):
                self.df = df

            def __len__(self):
                return len(self.df)

            def __getitem__(self, idx):
                row = self.df.iloc[idx]
                # The path stored in the CSV is resolved against the root directory.
                waveform, sr = torchaudio.load(str(root / str(row[audio_col])))
                if sr != target_sr:
                    waveform = torchaudio.functional.resample(waveform, sr, target_sr)
                # Average over the first dimension to collapse to one channel.
                waveform = waveform.mean(0)  # mono
                return waveform, torch.tensor(int(row[label_col]), dtype=torch.long)

        return _AudioDataset(df)

    def _collate_fn(self, batch):
        """Turn a list of ``(waveform, label)`` pairs into a model input dict.

        Waveforms are converted to NumPy arrays and passed to the feature
        extractor with padding enabled.

        Returns:
            Dict with ``input_values`` (from the feature extractor) and
            ``labels`` (a ``torch.long`` tensor).

        Raises:
            AutoMLRuntimeError: If the feature extractor has not been loaded.
        """
        waveforms, labels = zip(*batch)
        if self.feature_extractor is None:
            raise AutoMLRuntimeError("Feature extractor not initialized.")
        inputs = self.feature_extractor(
            [w.numpy() for w in waveforms],
            sampling_rate=self.sampling_rate,
            return_tensors="pt",
            padding=True,
        )
        return {
            "input_values": inputs.input_values,
            "labels": torch.tensor(labels, dtype=torch.long),
        }

    def _make_loader(self, df: pd.DataFrame, shuffle: bool) -> DataLoader:
        """Build a DataLoader over ``df`` using the configured batch settings."""
        return DataLoader(
            self._make_dataset(df),
            batch_size=self.batch_size,
            shuffle=shuffle,
            num_workers=self.num_workers,
            collate_fn=self._collate_fn,
        )

    def train_dataloader(self) -> DataLoader:
        """Return a shuffled DataLoader over the training split."""
        return self._make_loader(self.train_df, shuffle=True)

    def val_dataloader(self) -> DataLoader:
        """Return an unshuffled DataLoader over the validation split."""
        return self._make_loader(self.val_df, shuffle=False)

    def test_dataloader(self) -> DataLoader:
        """Return an unshuffled DataLoader over the test split."""
        return self._make_loader(self.test_df, shuffle=False)

`CausalLMDataModule` ¶

Datamodule for causal language modelling tasks.

CSV column: text. Labels are produced by shifting input_ids right by one position (handled by the model internally when labels equals input_ids).

Source code in app/vision_automl/ml_engine/datamodule.py

class CausalLMDataModule:
    """Datamodule for causal language modelling tasks.

    CSV column: ``text``.  Labels are a copy of ``input_ids`` (the model
    shifts them internally); positions that are padding according to the
    attention mask are set to ``-100`` so they are ignored by the loss.

    Construction is eager: ``__init__`` calls :meth:`setup`, which reads the
    CSV, creates random (non-stratified) train/validation/test splits and
    loads the tokenizer for ``hf_model_id``.
    """

    def __init__(
        self,
        csv_file: Path,
        text_col: str = "text",
        max_length: int = 256,
        batch_size: int = DEFAULT_BATCH_SIZE,
        num_workers: int = DEFAULT_NUM_WORKERS,
        val_split: float = DEFAULT_VAL_SPLIT,
        test_split: float = DEFAULT_TEST_SPLIT,
        seed: int = 42,
        hf_model_id: str = "distilgpt2",
    ) -> None:
        """Store the configuration and immediately prepare splits and tokenizer.

        Args:
            csv_file: CSV file to read.
            text_col: CSV column holding the raw text.
            max_length: Maximum token length; longer sequences are truncated.
            batch_size: Batch size used by every dataloader.
            num_workers: Worker count passed to every dataloader.
            val_split: Fraction of the data used for validation.
            test_split: Fraction of the data used for testing.
            seed: ``random_state`` for both split steps.
            hf_model_id: Model id the tokenizer is loaded from.
        """
        self.csv_file = Path(csv_file)
        self.text_col = text_col
        self.max_length = max_length
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.val_split = val_split
        self.test_split = test_split
        self.seed = seed
        self.hf_model_id = hf_model_id
        # Populated by setup().
        self.tokenizer: AutoTokenizer | None = None
        self.train_dataset: CausalLMFromCSVDataset | None = None
        self.val_dataset: CausalLMFromCSVDataset | None = None
        self.test_dataset: CausalLMFromCSVDataset | None = None
        # Kept empty: language modelling has no class labels.
        self.id2label: dict[int, str] = {}
        self.label2id: dict[str, int] = {}
        self.setup()

    def setup(self) -> None:
        """Read the CSV, split it, build the datasets and load the tokenizer.

        Raises:
            AutoMLDataError: If the CSV file is empty.
            FileNotFoundError, pandas.errors.ParserError: Re-raised after
                logging when the CSV cannot be read or parsed.
            ValueError: If either split step fails.
            Exception: Errors from dataset creation or tokenizer loading are
                logged and re-raised unchanged.
        """
        try:
            df = pd.read_csv(self.csv_file)
        except FileNotFoundError as e:
            logger.error("Dataset file not found: %s", e)
            raise
        except pd.errors.EmptyDataError as e:
            logger.error("Dataset file is empty: %s", self.csv_file)
            raise AutoMLDataError(f"Dataset file is empty: {self.csv_file}") from e
        except pd.errors.ParserError as e:
            logger.error("Failed to parse dataset CSV: %s", e)
            raise
        except Exception as e:
            logger.error("Unexpected error reading dataset: %s", e)
            raise

        try:
            # Step 1: carve off validation + test together.
            train_df, temp_df = train_test_split(
                df, test_size=self.val_split + self.test_split, random_state=self.seed
            )
            # Step 2: relative_val is the share of the held-out part that
            # belongs to validation; the remainder becomes the test set.
            relative_val = self.val_split / (self.val_split + self.test_split)
            val_df, test_df = train_test_split(
                temp_df, test_size=1 - relative_val, random_state=self.seed
            )
        except ValueError as e:
            logger.error("Failed to split dataset: %s", e)
            raise

        try:
            self.train_dataset = CausalLMFromCSVDataset(train_df, self.text_col)
            self.val_dataset = CausalLMFromCSVDataset(val_df, self.text_col)
            self.test_dataset = CausalLMFromCSVDataset(test_df, self.text_col)
        except Exception as e:
            logger.error("Failed to create datasets: %s", e)
            raise

        try:
            self.tokenizer = AutoTokenizer.from_pretrained(self.hf_model_id)
        except Exception as e:
            logger.error("Failed to load tokenizer from %s: %s", self.hf_model_id, e)
            raise

        # Tokenizers without a pad token cannot pad a batch; reuse the EOS token.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

    def _collate_fn(self, batch: list[str]) -> dict[str, torch.Tensor]:
        """Tokenize a batch of texts into padded model inputs with labels.

        Returns:
            Dict with ``input_ids``, ``attention_mask`` and ``labels``.
            ``labels`` equals ``input_ids`` except at padded positions, which
            are set to ``-100``.

        Raises:
            AutoMLRuntimeError: If the tokenizer has not been loaded.
        """
        if self.tokenizer is None:
            raise AutoMLRuntimeError("Tokenizer not initialized.")
        encoding = self.tokenizer(
            batch,
            padding=True,
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        # For causal LM, labels = input_ids (model shifts internally).
        labels = encoding.input_ids.clone()
        # Mask padding via the attention mask rather than the pad token id:
        # the pad token may be the EOS token, and genuine EOS tokens inside
        # the text must still be learned.
        labels[encoding.attention_mask == 0] = -100
        return {
            "input_ids": encoding.input_ids,
            "attention_mask": encoding.attention_mask,
            "labels": labels,
        }

    def _make_loader(self, dataset, shuffle: bool) -> DataLoader:
        """Build a DataLoader over ``dataset`` using the configured batch settings."""
        return DataLoader(
            dataset,
            batch_size=self.batch_size,
            shuffle=shuffle,
            num_workers=self.num_workers,
            collate_fn=self._collate_fn,
        )

    def train_dataloader(self) -> DataLoader:
        """Return a shuffled DataLoader over the training split."""
        return self._make_loader(self.train_dataset, shuffle=True)

    def val_dataloader(self) -> DataLoader:
        """Return an unshuffled DataLoader over the validation split."""
        return self._make_loader(self.val_dataset, shuffle=False)

    def test_dataloader(self) -> DataLoader:
        """Return an unshuffled DataLoader over the test split."""
        return self._make_loader(self.test_dataset, shuffle=False)

`ImageClassificationDataModule` ¶

Handles dataset preparation and dataloaders for image classification tasks.

Source code in app/vision_automl/ml_engine/datamodule.py

class ImageClassificationDataModule:
    """Handles dataset preparation and dataloaders for image classification tasks.

    Construction is eager: ``__init__`` calls :meth:`setup`, which reads the
    CSV, creates stratified train/validation/test splits, builds one dataset
    per split, derives the label maps from the training dataset and loads the
    Hugging Face image processor for ``hf_model_id``.
    """

    def __init__(
        self,
        csv_file: Path,
        root_dir: Path,
        img_col: str = "filename",
        label_col: str = "label",
        batch_size: int = DEFAULT_BATCH_SIZE,
        num_workers: int = DEFAULT_NUM_WORKERS,
        transform: Callable | None = None,
        shuffle: bool = True,
        val_split: float = DEFAULT_VAL_SPLIT,
        test_split: float = DEFAULT_TEST_SPLIT,
        seed: int = 42,
        hf_model_id: str = DEFAULT_IMAGE_CLASSIFIER_HF_ID,
    ) -> None:
        """Store the configuration and immediately prepare splits and processor.

        Args:
            csv_file: Labels CSV to read.
            root_dir: Image root directory handed to each dataset.
            img_col: CSV column holding the image file name.
            label_col: CSV column holding the class label.
            batch_size: Batch size used by every dataloader.
            num_workers: Worker count passed to every dataloader.
            transform: Optional callable forwarded to each dataset.
            shuffle: Whether the training dataloader shuffles; validation and
                test dataloaders never shuffle.
            val_split: Fraction of the data used for validation.
            test_split: Fraction of the data used for testing.
            seed: ``random_state`` for both split steps.
            hf_model_id: Model id the image processor is loaded from.
        """
        self.csv_file = Path(csv_file)
        self.root_dir = Path(root_dir)
        self.img_col: str = img_col
        self.label_col: str = label_col
        self.batch_size: int = batch_size
        self.num_workers: int = num_workers
        self.transform: Callable | None = transform
        self.shuffle: bool = shuffle
        self.val_split: float = val_split
        self.test_split: float = test_split
        self.seed: int = seed
        self.hf_model_id: str = hf_model_id

        # Populated by setup(): datasets, processor and label vocabulary.
        self.num_classes: int = 0
        self.train_dataset: ImageClassificationFromCSVDataset | None = None
        self.val_dataset: ImageClassificationFromCSVDataset | None = None
        self.test_dataset: ImageClassificationFromCSVDataset | None = None
        self.processor: AutoImageProcessor | None = None
        self.id2label: dict[int, str] = {}
        self.label2id: dict[str, int] = {}

        logger.info("Initializing ImageClassificationDataModule with CSV: %s", csv_file)
        self.setup()

    def setup(self) -> None:
        """Create train/val/test splits, datasets, label maps, and processor.

        Raises:
            AutoMLDataError: If the CSV file is empty.
            FileNotFoundError, pandas.errors.ParserError: Re-raised after
                logging when the CSV cannot be read or parsed.
            ValueError: If either stratified split step fails.
            Exception: Errors from dataset creation or processor loading are
                logged and re-raised unchanged.
        """
        try:
            logger.info("Reading dataset from %s", self.csv_file)
            df: pd.DataFrame = pd.read_csv(self.csv_file)
        except FileNotFoundError as e:
            logger.error("Dataset file not found: %s", e)
            raise
        except pd.errors.EmptyDataError:
            logger.error("Dataset file is empty: %s", self.csv_file)
            raise AutoMLDataError(f"Dataset file is empty: {self.csv_file}")
        except pd.errors.ParserError as e:
            logger.error("Failed to parse dataset CSV: %s", e)
            raise
        except Exception as e:
            logger.error("Unexpected error reading dataset: %s", e)
            raise

        try:
            # Step 1: carve off validation + test together, stratified by label.
            train_df, temp_df = train_test_split(
                df,
                test_size=self.val_split + self.test_split,
                stratify=df[self.label_col],
                random_state=self.seed,
            )
        except ValueError as e:
            logger.error(
                "Failed to split dataset (insufficient samples or invalid stratification): %s",
                e,
            )
            raise

        try:
            # Step 2: relative_val is the share of the held-out part that
            # belongs to validation; the remainder becomes the test set.
            relative_val = self.val_split / (self.val_split + self.test_split)
            val_df, test_df = train_test_split(
                temp_df,
                test_size=1 - relative_val,
                stratify=temp_df[self.label_col],
                random_state=self.seed,
            )
        except ValueError as e:
            logger.error("Failed to split validation/test data: %s", e)
            raise

        logger.info(
            "Split completed: train=%d, val=%d, test=%d",
            len(train_df),
            len(val_df),
            len(test_df),
        )

        try:
            # The dataset's csv_file parameter receives the split DataFrame here.
            self.train_dataset = ImageClassificationFromCSVDataset(
                csv_file=train_df,
                root_dir=self.root_dir,
                img_col=self.img_col,
                label_col=self.label_col,
                transform=self.transform,
            )
            self.val_dataset = ImageClassificationFromCSVDataset(
                csv_file=val_df,
                root_dir=self.root_dir,
                img_col=self.img_col,
                label_col=self.label_col,
                transform=self.transform,
            )
            self.test_dataset = ImageClassificationFromCSVDataset(
                csv_file=test_df,
                root_dir=self.root_dir,
                img_col=self.img_col,
                label_col=self.label_col,
                transform=self.transform,
            )
        except Exception as e:
            logger.error("Failed to create datasets: %s", e)
            raise

        # The label vocabulary is taken from the training dataset only.
        self.num_classes = len(self.train_dataset.classes)
        self.id2label = {i: c for i, c in enumerate(self.train_dataset.classes)}
        self.label2id = {c: i for i, c in enumerate(self.train_dataset.classes)}

        try:
            self.processor = AutoImageProcessor.from_pretrained(self.hf_model_id)
        except Exception as e:
            logger.error("Failed to load processor from %s: %s", self.hf_model_id, e)
            raise
        logger.info("Loaded processor from: %s", self.hf_model_id)

    def _collate_fn(self, batch: list[tuple[Any, Any]]) -> dict[str, torch.Tensor]:
        """Turn a list of ``(image, label)`` pairs into a model input dict.

        Returns:
            Dict with ``pixel_values`` (from the image processor) and
            ``labels`` (a ``torch.long`` tensor).

        Raises:
            AutoMLRuntimeError: If the processor has not been loaded.
        """
        images, labels = zip(*batch)
        if self.processor is None:
            raise AutoMLRuntimeError("Processor not initialized. Call setup() first.")
        pixel_values = self.processor(
            images=list(images), return_tensors="pt"
        ).pixel_values
        return {
            "pixel_values": pixel_values,
            "labels": torch.tensor(labels, dtype=torch.long),
        }

    def train_dataloader(self) -> DataLoader:
        """Return the training DataLoader; shuffling follows ``self.shuffle``.

        Raises:
            AutoMLRuntimeError: If the training dataset is not initialized.
        """
        if self.train_dataset is None:
            raise AutoMLRuntimeError(
                "Train dataset not initialized. Call setup() first."
            )
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=self.shuffle,
            num_workers=self.num_workers,
            collate_fn=self._collate_fn,
        )

    def val_dataloader(self) -> DataLoader:
        """Return the unshuffled validation DataLoader.

        Raises:
            AutoMLRuntimeError: If the validation dataset is not initialized.
        """
        if self.val_dataset is None:
            raise AutoMLRuntimeError(
                "Validation dataset not initialized. Call setup() first."
            )
        return DataLoader(
            self.val_dataset,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=self.num_workers,
            collate_fn=self._collate_fn,
        )

    def test_dataloader(self) -> DataLoader:
        """Return the unshuffled test DataLoader.

        Raises:
            AutoMLRuntimeError: If the test dataset is not initialized.
        """
        if self.test_dataset is None:
            raise AutoMLRuntimeError(
                "Test dataset not initialized. Call setup() first."
            )
        return DataLoader(
            self.test_dataset,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=self.num_workers,
            collate_fn=self._collate_fn,
        )

`setup()` ¶

Create train/val/test splits, datasets, label maps, and processor.

Source code in app/vision_automl/ml_engine/datamodule.py

def setup(self) -> None:
    """Create train/val/test splits, datasets, label maps, and processor.

    Steps, in order:

    1. Read ``self.csv_file`` into a DataFrame.
    2. Split it (stratified on ``self.label_col``, seeded with ``self.seed``)
       into a training part and a hold-out part sized
       ``val_split + test_split``.
    3. Split the hold-out part (again stratified and seeded) into validation
       and test parts.
    4. Wrap each part in an ``ImageClassificationFromCSVDataset``.
    5. Derive ``num_classes``, ``id2label`` and ``label2id`` from the training
       dataset's ``classes``.
    6. Load the image processor for ``self.hf_model_id``.

    Raises:
        AutoMLDataError: If the CSV file is empty.
        Exception: Every other failure is logged and re-raised unchanged.
    """
    csv_path = self.csv_file
    try:
        logger.info("Reading dataset from %s", csv_path)
        frame: pd.DataFrame = pd.read_csv(csv_path)
    except FileNotFoundError as err:
        logger.error("Dataset file not found: %s", err)
        raise
    except pd.errors.EmptyDataError:
        logger.error("Dataset file is empty: %s", csv_path)
        raise AutoMLDataError(f"Dataset file is empty: {csv_path}")
    except pd.errors.ParserError as err:
        logger.error("Failed to parse dataset CSV: %s", err)
        raise
    except Exception as err:
        logger.error("Unexpected error reading dataset: %s", err)
        raise

    # First cut: training data versus everything held out for val + test.
    try:
        train_part, holdout_part = train_test_split(
            frame,
            test_size=self.val_split + self.test_split,
            stratify=frame[self.label_col],
            random_state=self.seed,
        )
    except ValueError as err:
        logger.error(
            "Failed to split dataset (insufficient samples or invalid stratification): %s",
            err,
        )
        raise

    # Second cut: divide the hold-out part according to the val/test ratio.
    try:
        val_share = self.val_split / (self.val_split + self.test_split)
        val_part, test_part = train_test_split(
            holdout_part,
            test_size=1 - val_share,
            stratify=holdout_part[self.label_col],
            random_state=self.seed,
        )
    except ValueError as err:
        logger.error("Failed to split validation/test data: %s", err)
        raise

    logger.info(
        "Split completed: train=%d, val=%d, test=%d",
        len(train_part),
        len(val_part),
        len(test_part),
    )

    def _dataset_for(split_frame: pd.DataFrame) -> ImageClassificationFromCSVDataset:
        # All three splits share every setting except the frame itself.
        return ImageClassificationFromCSVDataset(
            csv_file=split_frame,
            root_dir=self.root_dir,
            img_col=self.img_col,
            label_col=self.label_col,
            transform=self.transform,
        )

    try:
        self.train_dataset = _dataset_for(train_part)
        self.val_dataset = _dataset_for(val_part)
        self.test_dataset = _dataset_for(test_part)
    except Exception as err:
        logger.error("Failed to create datasets: %s", err)
        raise

    # Label maps are taken from the training dataset's class list.
    self.num_classes = len(self.train_dataset.classes)
    self.id2label = dict(enumerate(self.train_dataset.classes))
    self.label2id = {
        name: idx for idx, name in enumerate(self.train_dataset.classes)
    }

    try:
        self.processor = AutoImageProcessor.from_pretrained(self.hf_model_id)
    except Exception as err:
        logger.error("Failed to load processor from %s: %s", self.hf_model_id, err)
        raise
    logger.info("Loaded processor from: %s", self.hf_model_id)

`ImageSegmentationDataModule` ¶

Bases: ImageClassificationDataModule

Datamodule for image segmentation tasks.

Uses the same CSV + class-subdir image layout as image classification. The collate function runs the processor on the images and stacks the per-sample labels (pixel-level segmentation maps) into a single tensor. The labels CSV must contain a mask_filename column pointing to the segmentation mask image (same class-subdir layout).

Source code in app/vision_automl/ml_engine/datamodule.py

class ImageSegmentationDataModule(ImageClassificationDataModule):
    """Datamodule for image segmentation tasks.

    Reuses the CSV + class-subdir image layout of image classification.
    The labels CSV is expected to contain a ``mask_filename`` column pointing
    to the segmentation mask image (same class-subdir layout).

    The collate function sends only the images through the processor; the
    per-sample labels are converted to tensors where needed and stacked.

    NOTE(review): ``mask_col`` is stored on the instance but nothing in this
    class reads it - confirm where the mask column is consumed.
    """

    def __init__(
        self,
        csv_file: Path,
        root_dir: Path,
        img_col: str = "filename",
        label_col: str = "label",
        mask_col: str = "mask_filename",
        batch_size: int = DEFAULT_BATCH_SIZE,
        num_workers: int = DEFAULT_NUM_WORKERS,
        val_split: float = DEFAULT_VAL_SPLIT,
        test_split: float = DEFAULT_TEST_SPLIT,
        seed: int = 42,
        hf_model_id: str = DEFAULT_IMAGE_CLASSIFIER_HF_ID,
    ) -> None:
        """Record the mask column, then defer to the classification datamodule."""
        # Assigned before the parent initialiser runs, as in the original order.
        self.mask_col = mask_col
        parent_kwargs = {
            "csv_file": csv_file,
            "root_dir": root_dir,
            "img_col": img_col,
            "label_col": label_col,
            "batch_size": batch_size,
            "num_workers": num_workers,
            "val_split": val_split,
            "test_split": test_split,
            "seed": seed,
            "hf_model_id": hf_model_id,
        }
        super().__init__(**parent_kwargs)

    def _collate_fn(self, batch: list[tuple[Any, Any]]) -> dict[str, torch.Tensor]:
        """Collate ``(image, label)`` samples into a batch dict.

        Returns:
            A dict with ``"pixel_values"`` (from the processor) and
            ``"labels"`` (the stacked per-sample label tensors).

        Raises:
            AutoMLRuntimeError: If ``self.processor`` is ``None``.
        """
        sample_images, sample_labels = zip(*batch)
        processor = self.processor
        if processor is None:
            raise AutoMLRuntimeError("Processor not initialized.")
        pixel_values = processor(
            images=list(sample_images), return_tensors="pt"
        ).pixel_values

        label_tensors = []
        for label in sample_labels:
            # Labels that are already tensors are stacked untouched.
            if not isinstance(label, torch.Tensor):
                label = torch.tensor(label)
            label_tensors.append(label)

        return {"pixel_values": pixel_values, "labels": torch.stack(label_tensors)}

`KeypointDetectionDataModule` ¶

Bases: ImageClassificationDataModule

Datamodule for keypoint detection tasks.

Uses the same CSV + image layout as image classification. The keypoints_col should contain a JSON list of [x, y, visibility] entries (one per keypoint).

Source code in app/vision_automl/ml_engine/datamodule.py

class KeypointDetectionDataModule(ImageClassificationDataModule):
    """Datamodule for keypoint detection tasks.

    Reuses the CSV + image layout of image classification.  The
    ``keypoints_col`` column should contain a JSON list of
    ``[x, y, visibility]`` entries (one per keypoint).

    NOTE(review): ``keypoints_col`` is stored on the instance but nothing in
    this class reads it - confirm where the keypoints column is consumed.
    """

    def __init__(
        self,
        csv_file: Path,
        root_dir: Path,
        img_col: str = "filename",
        label_col: str = "label",
        keypoints_col: str = "keypoints",
        batch_size: int = DEFAULT_BATCH_SIZE,
        num_workers: int = DEFAULT_NUM_WORKERS,
        val_split: float = DEFAULT_VAL_SPLIT,
        test_split: float = DEFAULT_TEST_SPLIT,
        seed: int = 42,
        hf_model_id: str = DEFAULT_IMAGE_CLASSIFIER_HF_ID,
    ) -> None:
        """Record the keypoints column, then defer to the classification datamodule."""
        # Assigned before the parent initialiser runs, as in the original order.
        self.keypoints_col = keypoints_col
        parent_kwargs = {
            "csv_file": csv_file,
            "root_dir": root_dir,
            "img_col": img_col,
            "label_col": label_col,
            "batch_size": batch_size,
            "num_workers": num_workers,
            "val_split": val_split,
            "test_split": test_split,
            "seed": seed,
            "hf_model_id": hf_model_id,
        }
        super().__init__(**parent_kwargs)

    def _collate_fn(self, batch: list[tuple[Any, Any]]) -> dict[str, torch.Tensor]:
        """Collate ``(image, label)`` samples into a batch dict.

        Images go through ``self.processor``; each label is turned into a
        tensor unless it already is one, and the results are stacked.

        Raises:
            AutoMLRuntimeError: If ``self.processor`` is ``None``.
        """

        def _to_tensor(value: Any) -> torch.Tensor:
            return value if isinstance(value, torch.Tensor) else torch.tensor(value)

        sample_images, sample_labels = zip(*batch)
        if self.processor is None:
            raise AutoMLRuntimeError("Processor not initialized.")
        processed = self.processor(images=list(sample_images), return_tensors="pt")
        pixel_values = processed.pixel_values
        stacked_labels = torch.stack([_to_tensor(item) for item in sample_labels])
        return {"pixel_values": pixel_values, "labels": stacked_labels}

`MaskedLMDataModule` ¶

Datamodule for masked language modelling tasks.

CSV column: text. Uses DataCollatorForLanguageModeling to randomly mask tokens at runtime.

Source code in app/vision_automl/ml_engine/datamodule.py

class MaskedLMDataModule:
    """Datamodule for masked language modelling tasks.

    CSV column: ``text``.  Uses ``DataCollatorForLanguageModeling`` to
    randomly mask tokens at runtime.

    ``__init__`` calls :meth:`setup`, so the CSV is read and split and the
    tokenizer and collator are created during construction.
    """

    def __init__(
        self,
        csv_file: Path,
        text_col: str = "text",
        mlm_probability: float = 0.15,
        max_length: int = 256,
        batch_size: int = DEFAULT_BATCH_SIZE,
        num_workers: int = DEFAULT_NUM_WORKERS,
        val_split: float = DEFAULT_VAL_SPLIT,
        test_split: float = DEFAULT_TEST_SPLIT,
        seed: int = 42,
        hf_model_id: str = "bert-base-uncased",
    ) -> None:
        """Store the configuration and immediately run :meth:`setup`."""
        self.csv_file = Path(csv_file)
        self.text_col = text_col
        self.mlm_probability = mlm_probability
        self.max_length = max_length
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.val_split = val_split
        self.test_split = test_split
        self.seed = seed
        self.hf_model_id = hf_model_id
        self.tokenizer: AutoTokenizer | None = None
        self.data_collator: DataCollatorForLanguageModeling | None = None
        self.train_dataset: CausalLMFromCSVDataset | None = None
        self.val_dataset: CausalLMFromCSVDataset | None = None
        self.test_dataset: CausalLMFromCSVDataset | None = None
        # Left empty by this class; setup() never fills them.
        self.id2label: dict[int, str] = {}
        self.label2id: dict[str, int] = {}
        self.setup()

    def setup(self) -> None:
        """Read the CSV, split it, build datasets, tokenizer and collator.

        The splits are seeded with ``self.seed`` and are not stratified.

        Raises:
            AutoMLDataError: If the CSV file is empty.
            Exception: Every other failure is logged and re-raised unchanged.
        """
        try:
            df = pd.read_csv(self.csv_file)
        except FileNotFoundError as e:
            logger.error("Dataset file not found: %s", e)
            raise
        except pd.errors.EmptyDataError as e:
            logger.error("Dataset file is empty: %s", self.csv_file)
            raise AutoMLDataError(f"Dataset file is empty: {self.csv_file}") from e
        except pd.errors.ParserError as e:
            logger.error("Failed to parse dataset CSV: %s", e)
            raise
        except Exception as e:
            logger.error("Unexpected error reading dataset: %s", e)
            raise

        try:
            holdout_fraction = self.val_split + self.test_split
            train_df, temp_df = train_test_split(
                df, test_size=holdout_fraction, random_state=self.seed
            )
            # Share of the hold-out part that goes to validation.
            relative_val = self.val_split / holdout_fraction
            val_df, test_df = train_test_split(
                temp_df, test_size=1 - relative_val, random_state=self.seed
            )
        except ValueError as e:
            logger.error("Failed to split dataset: %s", e)
            raise

        try:
            # Reuse CausalLMFromCSVDataset as it just returns text strings
            self.train_dataset = CausalLMFromCSVDataset(train_df, self.text_col)
            self.val_dataset = CausalLMFromCSVDataset(val_df, self.text_col)
            self.test_dataset = CausalLMFromCSVDataset(test_df, self.text_col)
        except Exception as e:
            logger.error("Failed to create datasets: %s", e)
            raise

        try:
            self.tokenizer = AutoTokenizer.from_pretrained(self.hf_model_id)
        except Exception as e:
            logger.error("Failed to load tokenizer from %s: %s", self.hf_model_id, e)
            raise

        try:
            self.data_collator = DataCollatorForLanguageModeling(
                tokenizer=self.tokenizer,
                mlm=True,
                mlm_probability=self.mlm_probability,
            )
        except Exception as e:
            logger.error("Failed to create data collator: %s", e)
            raise

    def _tokenize(self, batch: list[str]) -> dict[str, torch.Tensor]:
        """Tokenize *batch* with padding and truncation to ``self.max_length``.

        Raises:
            AutoMLRuntimeError: If the tokenizer has not been loaded.
        """
        if self.tokenizer is None:
            raise AutoMLRuntimeError("Tokenizer not initialized.")
        return self.tokenizer(
            batch,
            padding=True,
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )

    def _collate_fn(self, batch: list[str]) -> dict[str, torch.Tensor]:
        """Tokenize *batch* and apply the masking collator.

        Returns:
            A dict with ``"input_ids"`` and ``"labels"`` from the collator and
            ``"attention_mask"`` from the tokenizer output.

        Raises:
            AutoMLRuntimeError: If the tokenizer or data collator is missing.
        """
        encoding = self._tokenize(batch)
        collator = self.data_collator
        if collator is None:
            # Mirrors the tokenizer guard instead of failing with
            # "'NoneType' object is not callable".
            raise AutoMLRuntimeError("Data collator not initialized.")
        # data_collator applies random masking and returns input_ids + labels
        collated = collator([{"input_ids": ids} for ids in encoding.input_ids])
        return {
            "input_ids": collated["input_ids"],
            "attention_mask": encoding.attention_mask,
            "labels": collated["labels"],
        }

    def _make_loader(self, dataset, shuffle: bool) -> DataLoader:
        """Wrap *dataset* in a ``DataLoader`` using this module's settings.

        Raises:
            AutoMLRuntimeError: If *dataset* is ``None``.
        """
        if dataset is None:
            raise AutoMLRuntimeError("Dataset not initialized. Call setup() first.")
        return DataLoader(
            dataset,
            batch_size=self.batch_size,
            shuffle=shuffle,
            num_workers=self.num_workers,
            collate_fn=self._collate_fn,
        )

    def train_dataloader(self) -> DataLoader:
        """Return a shuffled loader over the training split."""
        return self._make_loader(self.train_dataset, shuffle=True)

    def val_dataloader(self) -> DataLoader:
        """Return an unshuffled loader over the validation split."""
        return self._make_loader(self.val_dataset, shuffle=False)

    def test_dataloader(self) -> DataLoader:
        """Return an unshuffled loader over the test split."""
        return self._make_loader(self.test_dataset, shuffle=False)

`MultimodalClassificationDataModule` ¶

Handles dataset preparation and dataloaders for multimodal image classification tasks where the CSV contains auxiliary metadata columns alongside the filename and label.

Numeric auxiliary columns are standard-scaled; categorical/string columns are ordinal-encoded. Scalers/encoders are fit on the training split only and then applied to validation and test splits.

Source code in app/vision_automl/ml_engine/datamodule.py

class MultimodalClassificationDataModule:
    """Handles dataset preparation and dataloaders for multimodal image
    classification tasks where the CSV contains auxiliary metadata columns
    alongside the filename and label.

    Numeric auxiliary columns are standard-scaled; categorical/string columns
    are ordinal-encoded.  Scalers/encoders are fit on the **training split
    only** and then applied to validation and test splits.

    ``__init__`` calls :meth:`setup`, so all of this happens during
    construction.
    """

    def __init__(
        self,
        csv_file: Path,
        root_dir: Path,
        img_col: str = "filename",
        label_col: str = "label",
        auxiliary_columns: list[str] | None = None,
        batch_size: int = DEFAULT_BATCH_SIZE,
        num_workers: int = DEFAULT_NUM_WORKERS,
        transform: Callable | None = None,
        shuffle: bool = True,
        val_split: float = DEFAULT_VAL_SPLIT,
        test_split: float = DEFAULT_TEST_SPLIT,
        seed: int = 42,
        hf_model_id: str = DEFAULT_IMAGE_CLASSIFIER_HF_ID,
    ) -> None:
        """Store the configuration and immediately run :meth:`setup`."""
        self.csv_file = Path(csv_file)
        self.root_dir = Path(root_dir)
        self.img_col: str = img_col
        self.label_col: str = label_col
        self.auxiliary_columns: list[str] = auxiliary_columns or []
        self.batch_size: int = batch_size
        self.num_workers: int = num_workers
        self.transform: Callable | None = transform
        self.shuffle: bool = shuffle
        self.val_split: float = val_split
        self.test_split: float = test_split
        self.seed: int = seed
        self.hf_model_id: str = hf_model_id

        # Everything below is populated by setup().
        self.num_classes: int = 0
        self.aux_feature_dim: int = 0
        self.train_dataset: MultimodalClassificationDataset | None = None
        self.val_dataset: MultimodalClassificationDataset | None = None
        self.test_dataset: MultimodalClassificationDataset | None = None
        self.processor: AutoImageProcessor | None = None
        self.id2label: dict[int, str] = {}
        self.label2id: dict[str, int] = {}

        self.numeric_cols: list[str] = []
        self.categorical_cols: list[str] = []
        self.scaler: StandardScaler | None = None
        self.encoder: OrdinalEncoder | None = None

        logger.info(
            "Initializing MultimodalClassificationDataModule with CSV: %s", csv_file
        )
        self.setup()

    def setup(self) -> None:
        """Read the CSV, split it, encode auxiliary columns and build datasets.

        Both splits are stratified on ``self.label_col`` and seeded with
        ``self.seed``.  Auxiliary encoders are fit on the training split and
        reused for validation and test.  Label maps come from the training
        dataset's ``classes``; the image processor is loaded last.

        Raises:
            AutoMLDataError: If the CSV file is empty.
            Exception: Every other failure is logged and re-raised unchanged.
        """
        try:
            logger.info("Reading dataset from %s", self.csv_file)
            df: pd.DataFrame = pd.read_csv(self.csv_file)
        except FileNotFoundError as e:
            logger.error("Dataset file not found: %s", e)
            raise
        except pd.errors.EmptyDataError as e:
            logger.error("Dataset file is empty: %s", self.csv_file)
            raise AutoMLDataError(f"Dataset file is empty: {self.csv_file}") from e
        except pd.errors.ParserError as e:
            logger.error("Failed to parse dataset CSV: %s", e)
            raise
        except Exception as e:
            logger.error("Unexpected error reading dataset: %s", e)
            raise

        self.numeric_cols, self.categorical_cols = _infer_column_types(
            df, self.auxiliary_columns
        )
        self.aux_feature_dim = len(self.auxiliary_columns)
        logger.info(
            "Auxiliary columns — numeric: %s, categorical: %s (total dim=%d)",
            self.numeric_cols,
            self.categorical_cols,
            self.aux_feature_dim,
        )

        try:
            train_df, temp_df = train_test_split(
                df,
                test_size=self.val_split + self.test_split,
                stratify=df[self.label_col],
                random_state=self.seed,
            )
        except ValueError as e:
            logger.error(
                "Failed to split dataset (insufficient samples or invalid stratification): %s",
                e,
            )
            raise

        try:
            # Share of the hold-out part that goes to validation.
            relative_val = self.val_split / (self.val_split + self.test_split)
            val_df, test_df = train_test_split(
                temp_df,
                test_size=1 - relative_val,
                stratify=temp_df[self.label_col],
                random_state=self.seed,
            )
        except ValueError as e:
            logger.error("Failed to split validation/test data: %s", e)
            raise

        logger.info(
            "Split completed: train=%d, val=%d, test=%d",
            len(train_df),
            len(val_df),
            len(test_df),
        )

        # Fit on train only, then reuse the fitted transformers.
        train_df = self._encode_auxiliary(train_df, fit=True)
        val_df = self._encode_auxiliary(val_df, fit=False)
        test_df = self._encode_auxiliary(test_df, fit=False)

        try:
            self.train_dataset = self._make_dataset(train_df)
            self.val_dataset = self._make_dataset(val_df)
            self.test_dataset = self._make_dataset(test_df)
        except Exception as e:
            logger.error("Failed to create datasets: %s", e)
            raise

        self.num_classes = len(self.train_dataset.classes)
        self.id2label = {i: c for i, c in enumerate(self.train_dataset.classes)}
        self.label2id = {c: i for i, c in enumerate(self.train_dataset.classes)}

        try:
            self.processor = AutoImageProcessor.from_pretrained(self.hf_model_id)
        except Exception as e:
            logger.error("Failed to load processor from %s: %s", self.hf_model_id, e)
            raise
        logger.info("Loaded processor from: %s", self.hf_model_id)

    def _make_dataset(self, df: pd.DataFrame) -> MultimodalClassificationDataset:
        """Wrap one (already encoded) split in a dataset object."""
        return MultimodalClassificationDataset(
            csv_file=df,
            root_dir=self.root_dir,
            img_col=self.img_col,
            label_col=self.label_col,
            auxiliary_columns=self.auxiliary_columns,
            transform=self.transform,
        )

    def _encode_auxiliary(self, df: pd.DataFrame, fit: bool) -> pd.DataFrame:
        """Return a copy of *df* with its auxiliary columns encoded.

        Numeric columns have missing values replaced by ``0.0`` and are then
        standard-scaled.  Categorical columns are cast to strings, with
        missing values replaced by ``"missing"``, and ordinal-encoded;
        categories unseen at fit time encode to ``-1``.

        Args:
            df: The split to encode; the caller's frame is not modified.
            fit: If True, fit a new scaler/encoder on *df* (must be the
                training split); otherwise reuse the fitted ones.

        Raises:
            AutoMLRuntimeError: If ``fit`` is False and the required
                scaler/encoder has not been fitted.
        """
        df = df.copy()

        if self.numeric_cols:
            subset = df[self.numeric_cols].fillna(0.0)
            if fit:
                self.scaler = StandardScaler()
                df[self.numeric_cols] = self.scaler.fit_transform(subset)
            else:
                if self.scaler is None:
                    raise AutoMLRuntimeError(
                        "Scaler not fitted. Call setup() with training data first."
                    )
                df[self.numeric_cols] = self.scaler.transform(subset)

        if self.categorical_cols:
            raw = df[self.categorical_cols]
            # Take the missing mask from the raw values: casting to str first
            # can turn NaN into the text "nan", which fillna no longer sees.
            subset = raw.astype(str).mask(raw.isna(), "missing")
            if fit:
                self.encoder = OrdinalEncoder(
                    handle_unknown="use_encoded_value", unknown_value=-1
                )
                encoded = self.encoder.fit_transform(subset)
            else:
                if self.encoder is None:
                    raise AutoMLRuntimeError(
                        "OrdinalEncoder not fitted. Call setup() with training data first."
                    )
                encoded = self.encoder.transform(subset)
            for i, col in enumerate(self.categorical_cols):
                df[col] = encoded[:, i].astype(float)

        return df

    def _collate_fn(self, batch: list[tuple[Any, Any, Any]]) -> dict[str, torch.Tensor]:
        """Collate ``(image, aux_values, label)`` samples into a batch dict.

        Returns:
            A dict with ``"pixel_values"`` (from the processor),
            ``"aux_features"`` (stacked, ``float32``) and ``"labels"``
            (``torch.long``).

        Raises:
            AutoMLRuntimeError: If ``self.processor`` is ``None``.
        """
        images, aux_values, labels = zip(*batch)
        if self.processor is None:
            raise AutoMLRuntimeError("Processor not initialized. Call setup() first.")
        pixel_values = self.processor(
            images=list(images), return_tensors="pt"
        ).pixel_values
        return {
            "pixel_values": pixel_values,
            "aux_features": torch.tensor(np.stack(aux_values), dtype=torch.float32),
            "labels": torch.tensor(labels, dtype=torch.long),
        }

    def _make_loader(self, dataset: Dataset, shuffle: bool) -> DataLoader:
        """Wrap *dataset* in a ``DataLoader`` using this module's settings."""
        return DataLoader(
            dataset,
            batch_size=self.batch_size,
            shuffle=shuffle,
            num_workers=self.num_workers,
            collate_fn=self._collate_fn,
        )

    def train_dataloader(self) -> DataLoader:
        """Return the training loader; shuffling follows ``self.shuffle``.

        Raises:
            AutoMLRuntimeError: If the training dataset is ``None``.
        """
        if self.train_dataset is None:
            raise AutoMLRuntimeError(
                "Train dataset not initialized. Call setup() first."
            )
        return self._make_loader(self.train_dataset, shuffle=self.shuffle)

    def val_dataloader(self) -> DataLoader:
        """Return the unshuffled validation loader.

        Raises:
            AutoMLRuntimeError: If the validation dataset is ``None``.
        """
        if self.val_dataset is None:
            raise AutoMLRuntimeError(
                "Validation dataset not initialized. Call setup() first."
            )
        return self._make_loader(self.val_dataset, shuffle=False)

    def test_dataloader(self) -> DataLoader:
        """Return the unshuffled test loader.

        Raises:
            AutoMLRuntimeError: If the test dataset is ``None``.
        """
        if self.test_dataset is None:
            raise AutoMLRuntimeError(
                "Test dataset not initialized. Call setup() first."
            )
        return self._make_loader(self.test_dataset, shuffle=False)

`ObjectDetectionDataModule` ¶

Datamodule for object detection tasks.

CSV columns: filename (image file), boxes (JSON list of [x_min, y_min, x_max, y_max]), class_labels (JSON list of int class IDs). Images live in a class-neutral flat layout under root_dir/images/.

Source code in app/vision_automl/ml_engine/datamodule.py

class ObjectDetectionDataModule:
    """Datamodule for object detection tasks.

    CSV columns: ``filename`` (image file), ``boxes`` (JSON list of
    ``[x_min, y_min, x_max, y_max]``), ``class_labels`` (JSON list of
    int class IDs).

    Image paths are resolved as ``root_dir / <filename>``.
    NOTE(review): earlier documentation placed images under
    ``root_dir/images/``; this class does not add an ``images`` component,
    so confirm what callers pass as ``root_dir``.

    ``__init__`` calls :meth:`setup`, so the CSV is read and split and the
    processor is loaded during construction.
    """

    def __init__(
        self,
        csv_file: Path,
        root_dir: Path,
        img_col: str = "filename",
        boxes_col: str = "boxes",
        class_labels_col: str = "class_labels",
        batch_size: int = DEFAULT_BATCH_SIZE,
        num_workers: int = DEFAULT_NUM_WORKERS,
        val_split: float = DEFAULT_VAL_SPLIT,
        test_split: float = DEFAULT_TEST_SPLIT,
        seed: int = 42,
        hf_model_id: str = "facebook/detr-resnet-50",
    ) -> None:
        """Store the configuration and immediately run :meth:`setup`."""
        self.csv_file = Path(csv_file)
        self.root_dir = Path(root_dir)
        self.img_col = img_col
        self.boxes_col = boxes_col
        self.class_labels_col = class_labels_col
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.val_split = val_split
        self.test_split = test_split
        self.seed = seed
        self.hf_model_id = hf_model_id
        self.processor: AutoImageProcessor | None = None
        self.num_classes: int = 0
        self.id2label: dict[int, str] = {}
        self.label2id: dict[str, int] = {}
        self.train_df: pd.DataFrame | None = None
        self.val_df: pd.DataFrame | None = None
        self.test_df: pd.DataFrame | None = None
        self.setup()

    def setup(self) -> None:
        """Read the CSV, collect class IDs, split the rows, load the processor.

        Class IDs are gathered from every row of ``class_labels_col``;
        ``num_classes`` is the number of distinct IDs and ``id2label`` maps
        each ID to its string form.  The splits are seeded with ``self.seed``
        and are not stratified; each split frame is re-indexed from zero.

        Raises:
            AutoMLDataError: If the CSV file is empty.
            Exception: Every other failure (missing file, parse error, bad
                JSON, missing column, split error, processor load error) is
                logged and re-raised unchanged.
        """
        import json as _json

        try:
            df = pd.read_csv(self.csv_file)
        except FileNotFoundError as e:
            logger.error("Dataset file not found: %s", e)
            raise
        except pd.errors.EmptyDataError as e:
            logger.error("Dataset file is empty: %s", self.csv_file)
            raise AutoMLDataError(f"Dataset file is empty: {self.csv_file}") from e
        except pd.errors.ParserError as e:
            logger.error("Failed to parse dataset CSV: %s", e)
            raise
        except Exception as e:
            logger.error("Unexpected error reading dataset: %s", e)
            raise

        try:
            all_labels: set[int] = set()
            for row in df[self.class_labels_col]:
                all_labels.update(_json.loads(row))
        # Use the local alias: this method binds the module as ``_json`` only,
        # so naming ``json`` here could raise NameError during handling.
        except _json.JSONDecodeError as e:
            logger.error("Failed to parse JSON in class_labels column: %s", e)
            raise
        except TypeError as e:
            # json.loads rejects non-string cells (e.g. NaN from a blank
            # field), and set.update rejects a non-list JSON value.
            logger.error("Invalid value in class_labels column: %s", e)
            raise
        except KeyError as e:
            logger.error("Column not found in dataset: %s", e)
            raise

        self.num_classes = len(all_labels)
        self.id2label = {i: str(i) for i in sorted(all_labels)}
        self.label2id = {v: k for k, v in self.id2label.items()}

        try:
            holdout_fraction = self.val_split + self.test_split
            train_df, temp_df = train_test_split(
                df, test_size=holdout_fraction, random_state=self.seed
            )
            # Share of the hold-out part that goes to validation.
            relative_val = self.val_split / holdout_fraction
            val_df, test_df = train_test_split(
                temp_df, test_size=1 - relative_val, random_state=self.seed
            )
        except ValueError as e:
            logger.error("Failed to split dataset: %s", e)
            raise

        self.train_df = train_df.reset_index(drop=True)
        self.val_df = val_df.reset_index(drop=True)
        self.test_df = test_df.reset_index(drop=True)

        try:
            self.processor = AutoImageProcessor.from_pretrained(self.hf_model_id)
        except Exception as e:
            logger.error("Failed to load processor from %s: %s", self.hf_model_id, e)
            raise

    def _make_dataset(self, df: pd.DataFrame) -> Dataset:
        """Wrap *df* in a dataset yielding ``(image, target)`` pairs.

        Each item is an RGB PIL image plus a dict with ``"boxes"``
        (``float32`` tensor) and ``"class_labels"`` (``long`` tensor), both
        parsed from the row's JSON columns.
        """
        import json as _json

        from PIL import Image as _Image

        root = self.root_dir

        # NOTE(review): a class defined inside a function generally cannot be
        # pickled, which matters if worker processes are started by spawning -
        # confirm this works with num_workers > 0 on the target platform.
        class _DetectionDataset(Dataset):
            def __init__(self, df, root, img_col, boxes_col, class_labels_col):
                self.df = df
                self.root = root
                self.img_col = img_col
                self.boxes_col = boxes_col
                self.class_labels_col = class_labels_col

            def __len__(self):
                return len(self.df)

            def __getitem__(self, idx):
                row = self.df.iloc[idx]
                img = _Image.open(self.root / str(row[self.img_col])).convert("RGB")
                boxes = _json.loads(row[self.boxes_col])
                class_labels = _json.loads(row[self.class_labels_col])
                return img, {
                    "boxes": torch.tensor(boxes, dtype=torch.float32),
                    "class_labels": torch.tensor(class_labels, dtype=torch.long),
                }

        return _DetectionDataset(
            df, root, self.img_col, self.boxes_col, self.class_labels_col
        )

    def _collate_fn(self, batch):
        """Collate ``(image, target)`` pairs into a batch dict.

        Returns:
            A dict with ``"pixel_values"`` from the processor and ``"labels"``
            as the list of per-image target dicts (kept as a list rather than
            stacked).

        Raises:
            AutoMLRuntimeError: If ``self.processor`` is ``None``.
        """
        images, targets = zip(*batch)
        if self.processor is None:
            raise AutoMLRuntimeError("Processor not initialized. Call setup() first.")
        encoding = self.processor(images=list(images), return_tensors="pt")
        return {"pixel_values": encoding.pixel_values, "labels": list(targets)}

    def _make_loader(self, df: pd.DataFrame, shuffle: bool) -> DataLoader:
        """Build a ``DataLoader`` over a fresh dataset wrapping *df*."""
        return DataLoader(
            self._make_dataset(df),
            batch_size=self.batch_size,
            shuffle=shuffle,
            num_workers=self.num_workers,
            collate_fn=self._collate_fn,
        )

    def train_dataloader(self) -> DataLoader:
        """Return a shuffled loader over the training split.

        Raises:
            AutoMLRuntimeError: If the training split is ``None``.
        """
        if self.train_df is None:
            raise AutoMLRuntimeError(
                "Train dataset not initialized. Call setup() first."
            )
        return self._make_loader(self.train_df, shuffle=True)

    def val_dataloader(self) -> DataLoader:
        """Return an unshuffled loader over the validation split.

        Raises:
            AutoMLRuntimeError: If the validation split is ``None``.
        """
        if self.val_df is None:
            raise AutoMLRuntimeError(
                "Validation dataset not initialized. Call setup() first."
            )
        return self._make_loader(self.val_df, shuffle=False)

    def test_dataloader(self) -> DataLoader:
        """Return an unshuffled loader over the test split.

        Raises:
            AutoMLRuntimeError: If the test split is ``None``.
        """
        if self.test_df is None:
            raise AutoMLRuntimeError(
                "Test dataset not initialized. Call setup() first."
            )
        return self._make_loader(self.test_df, shuffle=False)

`QuestionAnsweringDataModule` ¶

Datamodule for extractive question answering tasks.

CSV columns: question, context, answer_start (char offset), answer_text.

Source code in app/vision_automl/ml_engine/datamodule.py

class QuestionAnsweringDataModule:
    """Datamodule for extractive question answering tasks.

    CSV columns: ``question``, ``context``, ``answer_start`` (char offset),
    ``answer_text``.

    The CSV is read, split into train/val/test datasets and the tokenizer is
    loaded eagerly from ``__init__`` (which calls :meth:`setup`).
    """

    def __init__(
        self,
        csv_file: Path,
        question_col: str = "question",
        context_col: str = "context",
        answer_start_col: str = "answer_start",
        answer_text_col: str = "answer_text",
        max_length: int = 384,
        batch_size: int = DEFAULT_BATCH_SIZE,
        num_workers: int = DEFAULT_NUM_WORKERS,
        val_split: float = DEFAULT_VAL_SPLIT,
        test_split: float = DEFAULT_TEST_SPLIT,
        seed: int = 42,
        hf_model_id: str = "distilbert-base-uncased-distilled-squad",
    ) -> None:
        """Store the configuration and immediately run :meth:`setup`."""
        self.csv_file = Path(csv_file)
        self.question_col = question_col
        self.context_col = context_col
        self.answer_start_col = answer_start_col
        self.answer_text_col = answer_text_col
        self.max_length = max_length
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.val_split = val_split
        self.test_split = test_split
        self.seed = seed
        self.hf_model_id = hf_model_id
        self.tokenizer: AutoTokenizer | None = None
        self.train_dataset: QuestionAnsweringFromCSVDataset | None = None
        self.val_dataset: QuestionAnsweringFromCSVDataset | None = None
        self.test_dataset: QuestionAnsweringFromCSVDataset | None = None
        # QA tasks do not use id2label
        self.id2label: dict[int, str] = {}
        self.label2id: dict[str, int] = {}
        self.setup()

    def setup(self) -> None:
        """Read the CSV, split it, build the datasets and load the tokenizer.

        Every failure is logged before it propagates.

        Raises:
            AutoMLDataError: If the CSV file is empty.
            Exception: Any other error from reading, splitting, dataset
                construction or tokenizer loading is re-raised unchanged.
        """
        try:
            df = pd.read_csv(self.csv_file)
        except FileNotFoundError as e:
            logger.error("Dataset file not found: %s", e)
            raise
        except pd.errors.EmptyDataError as e:
            logger.error("Dataset file is empty: %s", self.csv_file)
            raise AutoMLDataError(f"Dataset file is empty: {self.csv_file}") from e
        except pd.errors.ParserError as e:
            logger.error("Failed to parse dataset CSV: %s", e)
            raise
        except Exception as e:
            logger.error("Unexpected error reading dataset: %s", e)
            raise

        try:
            # First carve off val+test together, then divide that remainder.
            train_df, temp_df = train_test_split(
                df, test_size=self.val_split + self.test_split, random_state=self.seed
            )
            relative_val = self.val_split / (self.val_split + self.test_split)
            val_df, test_df = train_test_split(
                temp_df, test_size=1 - relative_val, random_state=self.seed
            )
        except ValueError as e:
            logger.error("Failed to split dataset: %s", e)
            raise

        try:
            self.train_dataset = QuestionAnsweringFromCSVDataset(
                train_df,
                self.question_col,
                self.context_col,
                self.answer_start_col,
                self.answer_text_col,
            )
            self.val_dataset = QuestionAnsweringFromCSVDataset(
                val_df,
                self.question_col,
                self.context_col,
                self.answer_start_col,
                self.answer_text_col,
            )
            self.test_dataset = QuestionAnsweringFromCSVDataset(
                test_df,
                self.question_col,
                self.context_col,
                self.answer_start_col,
                self.answer_text_col,
            )
        except Exception as e:
            logger.error("Failed to create datasets: %s", e)
            raise

        try:
            self.tokenizer = AutoTokenizer.from_pretrained(self.hf_model_id)
        except Exception as e:
            logger.error("Failed to load tokenizer from %s: %s", self.hf_model_id, e)
            raise

    @staticmethod
    def _char_span_to_token_span(
        offsets, sequence_ids, start_char, end_char
    ) -> tuple[int, int]:
        """Map a context character span to ``(start_token, end_token)``.

        Only tokens whose sequence id is ``1`` (the second sequence of the
        pair, i.e. the context) are considered: offsets restart at 0 for each
        sequence, so question tokens would otherwise produce false matches.

        Returns ``(0, 0)`` when the span is not fully contained in the encoded
        context (for example because the context was truncated).
        """
        token_start = None
        token_end = None
        for j, ((s, e), seq_id) in enumerate(zip(offsets, sequence_ids)):
            if seq_id != 1:
                continue
            if token_start is None and s <= start_char < e:
                token_start = j
            if s < end_char <= e:
                token_end = j
                break
        if token_start is None or token_end is None:
            return 0, 0
        return token_start, token_end

    def _collate_fn(self, batch: list[dict]) -> dict[str, torch.Tensor]:
        """Tokenize question/context pairs and derive answer token positions.

        Args:
            batch: Samples with ``question``, ``context``, ``answer_start``
                and ``answer_text`` keys.

        Returns:
            Dict with ``input_ids``, ``attention_mask``, ``start_positions``
            and ``end_positions`` tensors.

        Raises:
            AutoMLRuntimeError: If the tokenizer has not been loaded.
        """
        if self.tokenizer is None:
            raise AutoMLRuntimeError("Tokenizer not initialized.")
        questions = [b["question"] for b in batch]
        contexts = [b["context"] for b in batch]
        answer_starts = [b["answer_start"] for b in batch]
        answer_texts = [b["answer_text"] for b in batch]

        encoding = self.tokenizer(
            questions,
            contexts,
            padding=True,
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
            return_offsets_mapping=True,
        )
        offset_mapping = encoding.pop("offset_mapping")

        # Convert character-level answer positions to token positions
        start_positions = []
        end_positions = []
        for i, (start_char, answer) in enumerate(zip(answer_starts, answer_texts)):
            token_start, token_end = self._char_span_to_token_span(
                offset_mapping[i].tolist(),
                encoding.sequence_ids(i),
                start_char,
                start_char + len(answer),
            )
            start_positions.append(token_start)
            end_positions.append(token_end)

        return {
            "input_ids": encoding.input_ids,
            "attention_mask": encoding.attention_mask,
            "start_positions": torch.tensor(start_positions, dtype=torch.long),
            "end_positions": torch.tensor(end_positions, dtype=torch.long),
        }

    def _make_loader(self, dataset, shuffle: bool) -> DataLoader:
        """Wrap *dataset* in a ``DataLoader`` using this module's settings."""
        return DataLoader(
            dataset,
            batch_size=self.batch_size,
            shuffle=shuffle,
            num_workers=self.num_workers,
            collate_fn=self._collate_fn,
        )

    def train_dataloader(self) -> DataLoader:
        """Return a shuffling loader over the training dataset."""
        return self._make_loader(self.train_dataset, shuffle=True)

    def val_dataloader(self) -> DataLoader:
        """Return a non-shuffling loader over the validation dataset."""
        return self._make_loader(self.val_dataset, shuffle=False)

    def test_dataloader(self) -> DataLoader:
        """Return a non-shuffling loader over the test dataset."""
        return self._make_loader(self.test_dataset, shuffle=False)

`Seq2SeqLMDataModule` ¶

Datamodule for sequence-to-sequence tasks.

CSV columns: input_text and target_text.

Source code in app/vision_automl/ml_engine/datamodule.py

class Seq2SeqLMDataModule:
    """Data module that feeds sequence-to-sequence models from a CSV file.

    The CSV provides a source column (``input_text`` by default) and a
    target column (``target_text`` by default).
    """

    def __init__(
        self,
        csv_file: Path,
        input_col: str = "input_text",
        target_col: str = "target_text",
        max_source_length: int = 256,
        max_target_length: int = 128,
        batch_size: int = DEFAULT_BATCH_SIZE,
        num_workers: int = DEFAULT_NUM_WORKERS,
        val_split: float = DEFAULT_VAL_SPLIT,
        test_split: float = DEFAULT_TEST_SPLIT,
        seed: int = 42,
        hf_model_id: str = "t5-small",
    ) -> None:
        """Record the configuration, then run :meth:`setup` right away."""
        # Where the data lives and which columns to read.
        self.csv_file = Path(csv_file)
        self.input_col = input_col
        self.target_col = target_col
        # Tokenization limits.
        self.max_source_length = max_source_length
        self.max_target_length = max_target_length
        # Loader and split settings.
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.val_split = val_split
        self.test_split = test_split
        self.seed = seed
        self.hf_model_id = hf_model_id
        # Populated by setup().
        self.tokenizer: AutoTokenizer | None = None
        self.train_dataset: Seq2SeqFromCSVDataset | None = None
        self.val_dataset: Seq2SeqFromCSVDataset | None = None
        self.test_dataset: Seq2SeqFromCSVDataset | None = None
        # Kept empty; present for parity with the classification modules.
        self.id2label: dict[int, str] = {}
        self.label2id: dict[str, int] = {}
        self.setup()

    def setup(self) -> None:
        """Load the CSV, split it three ways, wrap the splits and load the tokenizer.

        Each failure is logged and then propagated; an empty CSV is reported
        as ``AutoMLDataError``.
        """
        try:
            frame = pd.read_csv(self.csv_file)
        except FileNotFoundError as err:
            logger.error("Dataset file not found: %s", err)
            raise
        except pd.errors.EmptyDataError:
            logger.error("Dataset file is empty: %s", self.csv_file)
            raise AutoMLDataError(f"Dataset file is empty: {self.csv_file}")
        except pd.errors.ParserError as err:
            logger.error("Failed to parse dataset CSV: %s", err)
            raise
        except Exception as err:
            logger.error("Unexpected error reading dataset: %s", err)
            raise

        try:
            # Hold out val+test in one go, then divide the hold-out.
            train_part, holdout = train_test_split(
                frame,
                test_size=self.val_split + self.test_split,
                random_state=self.seed,
            )
            val_share = self.val_split / (self.val_split + self.test_split)
            val_part, test_part = train_test_split(
                holdout,
                test_size=1 - val_share,
                random_state=self.seed,
            )
        except ValueError as err:
            logger.error("Failed to split dataset: %s", err)
            raise

        try:
            for attr_name, part in (
                ("train_dataset", train_part),
                ("val_dataset", val_part),
                ("test_dataset", test_part),
            ):
                setattr(
                    self,
                    attr_name,
                    Seq2SeqFromCSVDataset(part, self.input_col, self.target_col),
                )
        except Exception as err:
            logger.error("Failed to create datasets: %s", err)
            raise

        try:
            self.tokenizer = AutoTokenizer.from_pretrained(self.hf_model_id)
        except Exception as err:
            logger.error("Failed to load tokenizer from %s: %s", self.hf_model_id, err)
            raise

    def _collate_fn(self, batch: list[tuple[str, str]]) -> dict[str, torch.Tensor]:
        """Tokenize ``(input, target)`` pairs into model inputs and labels.

        Padding positions in the labels are set to ``-100`` so the loss
        skips them.
        """
        tok = self.tokenizer
        if tok is None:
            raise AutoMLRuntimeError("Tokenizer not initialized.")

        sources, references = zip(*batch)
        shared_options = {"padding": True, "truncation": True, "return_tensors": "pt"}
        encoded_sources = tok(
            list(sources), max_length=self.max_source_length, **shared_options
        )
        encoded_targets = tok(
            list(references), max_length=self.max_target_length, **shared_options
        )

        target_ids = encoded_targets.input_ids.clone()
        # Replace pad token id with -100 so it's ignored in loss
        target_ids[target_ids == tok.pad_token_id] = -100

        return {
            "input_ids": encoded_sources.input_ids,
            "attention_mask": encoded_sources.attention_mask,
            "labels": target_ids,
        }

    def _make_loader(self, dataset, shuffle: bool) -> DataLoader:
        """Create a ``DataLoader`` for *dataset* with this module's settings."""
        loader_options = {
            "batch_size": self.batch_size,
            "shuffle": shuffle,
            "num_workers": self.num_workers,
            "collate_fn": self._collate_fn,
        }
        return DataLoader(dataset, **loader_options)

    def train_dataloader(self) -> DataLoader:
        """Shuffling loader over the training dataset."""
        return self._make_loader(self.train_dataset, shuffle=True)

    def val_dataloader(self) -> DataLoader:
        """Non-shuffling loader over the validation dataset."""
        return self._make_loader(self.val_dataset, shuffle=False)

    def test_dataloader(self) -> DataLoader:
        """Non-shuffling loader over the test dataset."""
        return self._make_loader(self.test_dataset, shuffle=False)

`SequenceClassificationDataModule` ¶

Datamodule for text sequence classification tasks.

CSV columns: text and label.

Source code in app/vision_automl/ml_engine/datamodule.py

class SequenceClassificationDataModule:
    """Data module for text sequence classification backed by a CSV file.

    The CSV provides a text column (``text`` by default) and a label column
    (``label`` by default).
    """

    def __init__(
        self,
        csv_file: Path,
        text_col: str = "text",
        label_col: str = "label",
        max_length: int = 128,
        batch_size: int = DEFAULT_BATCH_SIZE,
        num_workers: int = DEFAULT_NUM_WORKERS,
        val_split: float = DEFAULT_VAL_SPLIT,
        test_split: float = DEFAULT_TEST_SPLIT,
        seed: int = 42,
        hf_model_id: str = "distilbert-base-uncased",
    ) -> None:
        """Record the configuration, then run :meth:`setup` right away."""
        # Data location and column names.
        self.csv_file = Path(csv_file)
        self.text_col = text_col
        self.label_col = label_col
        # Tokenization, loader and split settings.
        self.max_length = max_length
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.val_split = val_split
        self.test_split = test_split
        self.seed = seed
        self.hf_model_id = hf_model_id
        # Label metadata, filled in by setup() from the training dataset.
        self.num_classes: int = 0
        self.id2label: dict[int, str] = {}
        self.label2id: dict[str, int] = {}
        # Populated by setup().
        self.tokenizer: AutoTokenizer | None = None
        self.train_dataset: TextClassificationFromCSVDataset | None = None
        self.val_dataset: TextClassificationFromCSVDataset | None = None
        self.test_dataset: TextClassificationFromCSVDataset | None = None
        self.setup()

    def setup(self) -> None:
        """Load the CSV, make stratified splits, build datasets and tokenizer.

        Each failure is logged and then propagated; an empty CSV is reported
        as ``AutoMLDataError``.
        """
        try:
            frame = pd.read_csv(self.csv_file)
        except FileNotFoundError as err:
            logger.error("Dataset file not found: %s", err)
            raise
        except pd.errors.EmptyDataError:
            logger.error("Dataset file is empty: %s", self.csv_file)
            raise AutoMLDataError(f"Dataset file is empty: {self.csv_file}")
        except pd.errors.ParserError as err:
            logger.error("Failed to parse dataset CSV: %s", err)
            raise
        except Exception as err:
            logger.error("Unexpected error reading dataset: %s", err)
            raise

        try:
            # Stratify on the label column in both splitting stages.
            train_part, holdout = train_test_split(
                frame,
                test_size=self.val_split + self.test_split,
                stratify=frame[self.label_col],
                random_state=self.seed,
            )
            val_share = self.val_split / (self.val_split + self.test_split)
            val_part, test_part = train_test_split(
                holdout,
                test_size=1 - val_share,
                stratify=holdout[self.label_col],
                random_state=self.seed,
            )
        except ValueError as err:
            logger.error("Failed to split dataset: %s", err)
            raise

        try:
            for attr_name, part in (
                ("train_dataset", train_part),
                ("val_dataset", val_part),
                ("test_dataset", test_part),
            ):
                setattr(
                    self,
                    attr_name,
                    TextClassificationFromCSVDataset(
                        part, self.text_col, self.label_col
                    ),
                )
        except Exception as err:
            logger.error("Failed to create datasets: %s", err)
            raise

        # Label vocabulary is taken from the training dataset.
        class_names = self.train_dataset.classes
        self.num_classes = len(class_names)
        self.id2label = {}
        self.label2id = {}
        for position, name in enumerate(class_names):
            self.id2label[position] = str(name)
            self.label2id[str(name)] = position

        try:
            self.tokenizer = AutoTokenizer.from_pretrained(self.hf_model_id)
        except Exception as err:
            logger.error("Failed to load tokenizer from %s: %s", self.hf_model_id, err)
            raise

    def _collate_fn(self, batch: list[tuple[str, int]]) -> dict[str, torch.Tensor]:
        """Tokenize ``(text, label)`` pairs into a padded tensor batch."""
        sentences, targets = zip(*batch)
        tok = self.tokenizer
        if tok is None:
            raise AutoMLRuntimeError("Tokenizer not initialized.")

        tokenized = tok(
            list(sentences),
            padding=True,
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        label_tensor = torch.tensor(targets, dtype=torch.long)
        return {
            "input_ids": tokenized.input_ids,
            "attention_mask": tokenized.attention_mask,
            "labels": label_tensor,
        }

    def _make_loader(self, dataset, shuffle: bool) -> DataLoader:
        """Create a ``DataLoader`` for *dataset* with this module's settings."""
        loader_options = {
            "batch_size": self.batch_size,
            "shuffle": shuffle,
            "num_workers": self.num_workers,
            "collate_fn": self._collate_fn,
        }
        return DataLoader(dataset, **loader_options)

    def train_dataloader(self) -> DataLoader:
        """Shuffling loader over the training dataset."""
        return self._make_loader(self.train_dataset, shuffle=True)

    def val_dataloader(self) -> DataLoader:
        """Non-shuffling loader over the validation dataset."""
        return self._make_loader(self.val_dataset, shuffle=False)

    def test_dataloader(self) -> DataLoader:
        """Non-shuffling loader over the test dataset."""
        return self._make_loader(self.test_dataset, shuffle=False)

`VideoClassificationDataModule` ¶

Datamodule for video classification tasks.

CSV columns: video_path (relative to root_dir) and label. Frames are decoded using torchvision.io.read_video.

Source code in app/vision_automl/ml_engine/datamodule.py

class VideoClassificationDataModule:
    """Datamodule for video classification tasks.

    CSV columns: ``video_path`` (relative to ``root_dir``) and ``label``.
    Frames are decoded using ``torchvision.io.read_video``.

    The CSV is read, labels are encoded, the data is split and the image
    processor is loaded eagerly from ``__init__`` (which calls :meth:`setup`).
    """

    def __init__(
        self,
        csv_file: Path,
        root_dir: Path,
        video_col: str = "video_path",
        label_col: str = "label",
        num_frames: int = 8,
        batch_size: int = DEFAULT_BATCH_SIZE,
        num_workers: int = DEFAULT_NUM_WORKERS,
        val_split: float = DEFAULT_VAL_SPLIT,
        test_split: float = DEFAULT_TEST_SPLIT,
        seed: int = 42,
        hf_model_id: str = "MCG-NJU/videomae-base",
    ) -> None:
        """Store the configuration and immediately run :meth:`setup`."""
        self.csv_file = Path(csv_file)
        self.root_dir = Path(root_dir)
        self.video_col = video_col
        self.label_col = label_col
        self.num_frames = num_frames
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.val_split = val_split
        self.test_split = test_split
        self.seed = seed
        self.hf_model_id = hf_model_id
        self.num_classes: int = 0
        self.id2label: dict[int, str] = {}
        self.label2id: dict[str, int] = {}
        self.processor: AutoImageProcessor | None = None
        self.train_df: pd.DataFrame | None = None
        self.val_df: pd.DataFrame | None = None
        self.test_df: pd.DataFrame | None = None
        self.setup()

    def setup(self) -> None:
        """Read the CSV, encode labels, make stratified splits, load the processor.

        Every failure is logged before it propagates.

        Raises:
            AutoMLDataError: If the CSV file is empty.
            KeyError: If the label column is missing from the CSV.
            ValueError: If the stratified split cannot be performed.
            Exception: Any other read or processor-loading error, re-raised
                unchanged.
        """
        try:
            df = pd.read_csv(self.csv_file)
        except FileNotFoundError as e:
            logger.error("Dataset file not found: %s", e)
            raise
        except pd.errors.EmptyDataError as e:
            logger.error("Dataset file is empty: %s", self.csv_file)
            raise AutoMLDataError(f"Dataset file is empty: {self.csv_file}") from e
        except pd.errors.ParserError as e:
            logger.error("Failed to parse dataset CSV: %s", e)
            raise
        except Exception as e:
            logger.error("Unexpected error reading dataset: %s", e)
            raise

        try:
            classes = sorted(df[self.label_col].unique().tolist())
        except KeyError as e:
            logger.error(
                "Label column '%s' not found in dataset: %s", self.label_col, e
            )
            raise

        self.num_classes = len(classes)
        self.id2label = {i: c for i, c in enumerate(classes)}
        self.label2id = {c: i for i, c in enumerate(classes)}
        # Work on a copy so the raw labels in the frame read above are not
        # overwritten in place; the label column now holds integer class ids.
        df = df.copy()
        df[self.label_col] = df[self.label_col].map(self.label2id)

        try:
            # First carve off val+test together, then divide that remainder;
            # both stages are stratified on the encoded label.
            train_df, temp_df = train_test_split(
                df,
                test_size=self.val_split + self.test_split,
                stratify=df[self.label_col],
                random_state=self.seed,
            )
            relative_val = self.val_split / (self.val_split + self.test_split)
            val_df, test_df = train_test_split(
                temp_df,
                test_size=1 - relative_val,
                stratify=temp_df[self.label_col],
                random_state=self.seed,
            )
        except ValueError as e:
            logger.error("Failed to split dataset: %s", e)
            raise

        self.train_df = train_df.reset_index(drop=True)
        self.val_df = val_df.reset_index(drop=True)
        self.test_df = test_df.reset_index(drop=True)

        try:
            self.processor = AutoImageProcessor.from_pretrained(self.hf_model_id)
        except Exception as e:
            logger.error("Failed to load processor from %s: %s", self.hf_model_id, e)
            raise

    def _make_dataset(self, df: pd.DataFrame) -> Dataset:
        """Build a dataset that decodes and samples frames for each row of *df*.

        Each item is ``(frames, label)`` where ``frames`` holds ``num_frames``
        evenly spaced frames scaled to ``[0, 1]`` and ``label`` is a long
        tensor built from the (already encoded) label column.
        """
        from torchvision.io import read_video

        # Captured by the nested dataset class below.
        root = self.root_dir
        num_frames = self.num_frames
        video_col = self.video_col
        label_col = self.label_col

        class _VideoDataset(Dataset):
            def __init__(self, df):
                self.df = df

            def __len__(self):
                return len(self.df)

            def __getitem__(self, idx):
                row = self.df.iloc[idx]
                video_path = str(root / str(row[video_col]))
                frames, _, _ = read_video(
                    video_path, output_format="TCHW", pts_unit="sec"
                )
                # Sample num_frames evenly
                total = frames.shape[0]
                indices = torch.linspace(0, total - 1, num_frames).long()
                frames = frames[indices]  # (T, C, H, W)
                return frames.float() / 255.0, torch.tensor(
                    int(row[label_col]), dtype=torch.long
                )

        return _VideoDataset(df)

    def _collate_fn(self, batch):
        """Stack ``(clip, label)`` samples into a single batch dict."""
        clips, labels = zip(*batch)
        return {
            "pixel_values": torch.stack(clips),  # (B, T, C, H, W)
            "labels": torch.tensor(labels, dtype=torch.long),
        }

    def _make_loader(self, df: pd.DataFrame, shuffle: bool) -> DataLoader:
        """Wrap the dataset built from *df* in a ``DataLoader``."""
        return DataLoader(
            self._make_dataset(df),
            batch_size=self.batch_size,
            shuffle=shuffle,
            num_workers=self.num_workers,
            collate_fn=self._collate_fn,
        )

    def train_dataloader(self) -> DataLoader:
        """Return a shuffling loader over the training split."""
        return self._make_loader(self.train_df, shuffle=True)

    def val_dataloader(self) -> DataLoader:
        """Return a non-shuffling loader over the validation split."""
        return self._make_loader(self.val_df, shuffle=False)

    def test_dataloader(self) -> DataLoader:
        """Return a non-shuffling loader over the test split."""
        return self._make_loader(self.test_df, shuffle=False)

`CausalLMFromCSVDataset` ¶

Bases: Dataset

Dataset for causal language modelling tasks.

Expected CSV column: text. The datamodule tokenises and shifts labels automatically.

Source code in app/vision_automl/ml_engine/dataset.py

class CausalLMFromCSVDataset(Dataset):
    """Dataset for causal language modelling tasks.

    Expected CSV column: ``text``.  The datamodule tokenises and shifts
    labels automatically.

    Args:
        csv_file: Path to a CSV file (``str`` or any ``os.PathLike``) or an
            already-loaded DataFrame. A DataFrame is re-indexed from 0.
        text_col: Name of the column holding the text.

    Raises:
        AutoMLDataError: If the CSV file is empty.
        AutoMLValidationError: If ``csv_file`` is neither a path nor a
            DataFrame.
    """

    def __init__(
        self,
        csv_file: Union[str, Path, pd.DataFrame],
        text_col: str = "text",
    ):
        if isinstance(csv_file, pd.DataFrame):
            self.df = csv_file.reset_index(drop=True)
        elif isinstance(csv_file, (str, os.PathLike)):
            # Plain string paths are accepted alongside Path objects.
            try:
                self.df = pd.read_csv(csv_file)
            except FileNotFoundError:
                logger.error("Dataset CSV file not found: %s", csv_file)
                raise
            except pd.errors.EmptyDataError as e:
                logger.error("Dataset CSV file is empty: %s", csv_file)
                raise AutoMLDataError(f"Dataset CSV file is empty: {csv_file}") from e
            except pd.errors.ParserError as e:
                logger.error("Failed to parse dataset CSV file: %s", e)
                raise
            except Exception as e:
                logger.error("Unexpected error reading dataset CSV file: %s", e)
                raise
        else:
            raise AutoMLValidationError("csv_file must be a path or DataFrame")

        self.text_col = text_col

    def __len__(self) -> int:
        """Return the number of rows."""
        return len(self.df)

    def __getitem__(self, idx: int) -> str:
        """Return the text at positional index *idx* as a ``str``."""
        if torch.is_tensor(idx):
            idx = idx.item()
        # Select the column first: this avoids building a whole-row Series,
        # which would also upcast values across columns of mixed dtypes.
        return str(self.df[self.text_col].iloc[idx])

`ImageClassificationFromCSVDataset` ¶

Bases: Dataset

Torch dataset that reads image paths and labels from a CSV/DataFrame.

Source code in app/vision_automl/ml_engine/dataset.py

class ImageClassificationFromCSVDataset(Dataset):
    """Torch dataset that reads image paths and labels from a CSV/DataFrame.

    Labels are always re-encoded to contiguous integer indices. When the
    label column is non-numeric, images are looked up under
    ``root_dir / <label name> / <filename>``; otherwise directly under
    ``root_dir / <filename>``.

    Args:
        csv_file: Path to a CSV file (``str`` or any ``os.PathLike``) or an
            already-loaded DataFrame. A DataFrame is re-indexed from 0.
        root_dir: Directory the image paths are resolved against.
        img_col: Name of the column holding image file names.
        label_col: Name of the column holding labels.
        transform: Optional callable applied to each loaded image.

    Raises:
        AutoMLDataError: If the CSV file is empty.
        AutoMLValidationError: If ``csv_file`` is neither a path nor a
            DataFrame.
    """

    def __init__(
        self,
        csv_file: Union[str, Path, pd.DataFrame],
        root_dir: Path,
        img_col: str = "image",
        label_col: str = "label",
        transform: Optional[T.Compose] = None,
    ):
        if isinstance(csv_file, pd.DataFrame):
            self.label_csv = csv_file.reset_index(drop=True)
        elif isinstance(csv_file, (str, os.PathLike)):
            # Plain string paths are accepted alongside Path objects.
            try:
                self.label_csv = pd.read_csv(csv_file)
            except FileNotFoundError:
                logger.error("Dataset CSV file not found: %s", csv_file)
                raise
            except pd.errors.EmptyDataError as e:
                logger.error("Dataset CSV file is empty: %s", csv_file)
                raise AutoMLDataError(f"Dataset CSV file is empty: {csv_file}") from e
            except pd.errors.ParserError as e:
                logger.error("Failed to parse dataset CSV file: %s", e)
                raise
            except Exception as e:
                logger.error("Unexpected error reading dataset CSV file: %s", e)
                raise
        else:
            raise AutoMLValidationError("csv_file must be a path or DataFrame")

        self.root_dir = Path(root_dir)
        self.img_col = img_col
        self.label_col = label_col
        # By default, do not apply torchvision transforms so that a Hugging Face
        # AutoImageProcessor can handle preprocessing in a DataLoader collate_fn.
        self.transform = transform

        label_series = self.label_csv[self.label_col]
        self._use_label_subdir: bool = not pd.api.types.is_numeric_dtype(label_series)

        # Numeric labels ignore missing values when collecting the classes;
        # non-numeric labels use every distinct value as-is.
        if self._use_label_subdir:
            distinct = label_series.unique()
        else:
            distinct = label_series.dropna().unique()

        self.classes = sorted(distinct.tolist())
        self.class_to_idx = {value: idx for idx, value in enumerate(self.classes)}
        self.idx_to_class = {idx: value for value, idx in self.class_to_idx.items()}
        self.label_csv[self.label_col] = self.label_csv[self.label_col].map(
            self.class_to_idx
        )

    def __len__(self):
        """Return number of samples."""
        return len(self.label_csv)

    def __getitem__(self, idx):
        """Load the RGB image and label index for sample *idx*.

        Returns:
            ``(image, label)`` where ``image`` is the loaded image (after
            ``transform`` if one was given) and ``label`` is a long tensor.

        Raises:
            AutoMLDataError: If the resolved image path does not exist.
        """
        if torch.is_tensor(idx):
            idx = idx.item()

        row = self.label_csv.iloc[idx]
        label_idx = int(row[self.label_col])

        # Normalise Windows-style separators so the path resolves everywhere.
        filename = str(row[self.img_col]).strip().replace("\\", "/")

        if self._use_label_subdir:
            label_name = self.idx_to_class[label_idx]
            img_path = self.root_dir / str(label_name) / filename
        else:
            img_path = self.root_dir / filename

        if not img_path.exists():
            logger.error(
                "Image not found: root_dir=%s, use_label_subdir=%s, filename=%s",
                self.root_dir,
                self._use_label_subdir,
                filename,
            )
            # Directory listings are diagnostic only: a failure to list (for
            # example a missing directory) must not mask the real error below.
            probe_dirs = [self.root_dir]
            if self._use_label_subdir:
                probe_dirs.append(self.root_dir / str(label_name))
            for probe in probe_dirs:
                try:
                    logger.debug("Contents of %s: %s", probe, os.listdir(probe))
                except OSError as e:
                    logger.debug("Could not list %s: %s", probe, e)

            raise AutoMLDataError(
                f"Image not found\n"
                f"Expected path: {img_path}\n"
                f"root_dir: {self.root_dir}\n"
                f"use_label_subdir: {self._use_label_subdir}\n"
                f"filename: {repr(filename)}"
            )

        try:
            img = Image.open(img_path).convert("RGB")
        except Exception as e:
            logger.error("Failed to open or convert image %s: %s", img_path, e)
            raise

        if self.transform:
            img = self.transform(img)

        return img, torch.tensor(label_idx, dtype=torch.long)

`__len__()` ¶

Return number of samples.

Source code in app/vision_automl/ml_engine/dataset.py

def __len__(self):
    """Report the dataset size, i.e. the length of ``label_csv``."""
    table = self.label_csv
    return len(table)

`MultimodalClassificationDataset` ¶

Bases: Dataset

Torch dataset for multimodal image classification with auxiliary tabular features.

In addition to image + label (like ImageClassificationFromCSVDataset), this dataset also returns auxiliary feature values from extra CSV columns. Raw values are returned here; encoding/scaling is handled by the datamodule.

Returns (PIL.Image, aux_array, int_label) per sample.

Source code in app/vision_automl/ml_engine/dataset.py

class MultimodalClassificationDataset(Dataset):
    """Torch dataset for multimodal image classification with auxiliary tabular features.

    In addition to image + label (like ``ImageClassificationFromCSVDataset``),
    this dataset also returns auxiliary feature values from extra CSV columns.
    The values are only cast to ``float32`` here; encoding/scaling is handled
    by the datamodule.

    Images are resolved as ``root_dir / <class name> / <filename>``.

    Each sample is ``(image, aux_values, label)``: the RGB image (or the output
    of ``transform`` when one is set), a 1-D ``float32`` NumPy array with one
    entry per auxiliary column (empty when there are none), and the label
    index as a scalar ``torch.long`` tensor.
    """

    def __init__(
        self,
        csv_file: Union[Path, pd.DataFrame],
        root_dir: Path,
        img_col: str = "filename",
        label_col: str = "label",
        auxiliary_columns: list[str] | None = None,
        transform: Optional[T.Compose] = None,
    ):
        """Load the label table and build the class <-> index mappings.

        Args:
            csv_file: Path to a CSV file, or an already loaded DataFrame
                (its index is reset; the caller's frame is not modified).
            root_dir: Directory that contains one sub-directory per class.
            img_col: Column holding the image file name.
            label_col: Column holding the class label.
            auxiliary_columns: Extra columns returned as auxiliary features.
            transform: Optional callable applied to the loaded RGB image.

        Raises:
            AutoMLDataError: If the CSV file is empty.
            AutoMLValidationError: If ``csv_file`` is neither a ``Path`` nor a
                ``DataFrame``.
            FileNotFoundError, pandas.errors.ParserError: Re-raised from
                ``pd.read_csv`` after being logged.
        """
        if isinstance(csv_file, Path):
            try:
                self.label_csv = pd.read_csv(csv_file)
            except FileNotFoundError:
                logger.error("Dataset CSV file not found: %s", csv_file)
                raise
            except pd.errors.EmptyDataError:
                logger.error("Dataset CSV file is empty: %s", csv_file)
                raise AutoMLDataError(f"Dataset CSV file is empty: {csv_file}")
            except pd.errors.ParserError as e:
                logger.error("Failed to parse dataset CSV file: %s", e)
                raise
            except Exception as e:
                logger.error("Unexpected error reading dataset CSV file: %s", e)
                raise
        elif isinstance(csv_file, pd.DataFrame):
            self.label_csv = csv_file.reset_index(drop=True)
        else:
            raise AutoMLValidationError("csv_file must be a path or DataFrame")

        self.root_dir = root_dir
        self.img_col = img_col
        self.label_col = label_col
        self.auxiliary_columns = auxiliary_columns or []
        self.transform = transform

        if self.label_csv[self.label_col].dtype not in [int, float]:
            # Labels that are not plain int/float columns are encoded as their
            # position in the sorted list of unique values.
            self.classes = sorted(self.label_csv[self.label_col].unique().tolist())
            self.class_to_idx = {
                cls_name: idx for idx, cls_name in enumerate(self.classes)
            }
            self.idx_to_class = {
                idx: cls_name for cls_name, idx in self.class_to_idx.items()
            }
            self.label_csv[self.label_col] = self.label_csv[self.label_col].map(
                self.class_to_idx
            )
        else:
            # Numeric labels are used as-is, so both mappings are identities.
            self.classes = sorted(self.label_csv[self.label_col].unique().tolist())
            self.class_to_idx = {cls: cls for cls in self.classes}
            self.idx_to_class = {cls: cls for cls in self.classes}

    def __len__(self):
        """Return the number of rows in the label table."""
        return len(self.label_csv)

    def __getitem__(self, idx):
        """Return ``(image, aux_values, label)`` for the row at position ``idx``.

        Raises:
            AutoMLDataError: If the image file does not exist.
        """
        if torch.is_tensor(idx):
            idx = idx.item()

        row = self.label_csv.iloc[idx]
        label_idx = int(row[self.label_col])
        label_name = str(self.idx_to_class[label_idx])

        # Normalise Windows-style separators so CSVs written on Windows still
        # resolve on POSIX systems (same handling as the sibling image dataset).
        filename = str(row[self.img_col]).strip().replace("\\", "/")

        img_path = self.root_dir / label_name / filename
        if not img_path.exists():
            logger.error(
                "Image not found: root_dir=%s, label_name=%s, filename=%s",
                self.root_dir,
                label_name,
                filename,
            )
            raise AutoMLDataError(
                f"Image not found\n"
                f"Expected path: {img_path}\n"
                f"root_dir: {self.root_dir}\n"
                f"label_name: {label_name}\n"
                f"filename: {repr(filename)}"
            )

        try:
            img = Image.open(img_path).convert("RGB")
        except Exception as e:
            logger.error("Failed to open or convert image %s: %s", img_path, e)
            raise

        if self.transform:
            img = self.transform(img)

        if self.auxiliary_columns:
            # The float32 cast raises for values that cannot be parsed as numbers.
            aux_values = np.array(
                [row[col] for col in self.auxiliary_columns], dtype=np.float32
            )
        else:
            aux_values = np.array([], dtype=np.float32)

        return img, aux_values, torch.tensor(label_idx, dtype=torch.long)

`QuestionAnsweringFromCSVDataset` ¶

Bases: Dataset

Dataset for extractive QA tasks.

Expected CSV columns: question, context, answer_start (int), answer_text (str). Returns raw strings; the datamodule tokenises them.

Source code in app/vision_automl/ml_engine/dataset.py

class QuestionAnsweringFromCSVDataset(Dataset):
    """Extractive question-answering samples backed by a CSV file or DataFrame.

    The table is expected to provide ``question``, ``context``,
    ``answer_start`` (int) and ``answer_text`` (str) columns (names are
    configurable).  Items are returned as raw values; the datamodule
    tokenises them.
    """

    def __init__(
        self,
        csv_file: Union[Path, pd.DataFrame],
        question_col: str = "question",
        context_col: str = "context",
        answer_start_col: str = "answer_start",
        answer_text_col: str = "answer_text",
    ):
        # Reject unsupported inputs up front.
        if not isinstance(csv_file, (Path, pd.DataFrame)):
            raise AutoMLValidationError("csv_file must be a path or DataFrame")

        if isinstance(csv_file, Path):
            try:
                table = pd.read_csv(csv_file)
            except FileNotFoundError:
                logger.error("Dataset CSV file not found: %s", csv_file)
                raise
            except pd.errors.EmptyDataError:
                logger.error("Dataset CSV file is empty: %s", csv_file)
                raise AutoMLDataError(f"Dataset CSV file is empty: {csv_file}")
            except pd.errors.ParserError as parse_error:
                logger.error("Failed to parse dataset CSV file: %s", parse_error)
                raise
            except Exception as read_error:
                logger.error(
                    "Unexpected error reading dataset CSV file: %s", read_error
                )
                raise
        else:
            # Work on a re-indexed frame so positional access starts at 0.
            table = csv_file.reset_index(drop=True)

        self.df = table
        self.question_col = question_col
        self.context_col = context_col
        self.answer_start_col = answer_start_col
        self.answer_text_col = answer_text_col

    def __len__(self) -> int:
        """Return the number of rows in the table."""
        row_count = len(self.df)
        return row_count

    def __getitem__(self, idx: int) -> dict:
        """Return one sample as a dict of question, context and answer fields."""
        position = idx.item() if torch.is_tensor(idx) else idx
        record = self.df.iloc[position]
        question = str(record[self.question_col])
        context = str(record[self.context_col])
        answer_start = int(record[self.answer_start_col])
        answer_text = str(record[self.answer_text_col])
        return {
            "question": question,
            "context": context,
            "answer_start": answer_start,
            "answer_text": answer_text,
        }

`Seq2SeqFromCSVDataset` ¶

Bases: Dataset

Dataset for sequence-to-sequence tasks.

Expected CSV columns: input_text and target_text.

Source code in app/vision_automl/ml_engine/dataset.py

class Seq2SeqFromCSVDataset(Dataset):
    """Source/target text pairs for sequence-to-sequence tasks.

    Reads from a CSV file or DataFrame that provides ``input_text`` and
    ``target_text`` columns (names are configurable).
    """

    def __init__(
        self,
        csv_file: Union[Path, pd.DataFrame],
        input_col: str = "input_text",
        target_col: str = "target_text",
    ):
        if isinstance(csv_file, pd.DataFrame):
            # Re-index so positional lookups start at 0.
            loaded = csv_file.reset_index(drop=True)
        elif isinstance(csv_file, Path):
            try:
                loaded = pd.read_csv(csv_file)
            except FileNotFoundError:
                logger.error("Dataset CSV file not found: %s", csv_file)
                raise
            except pd.errors.EmptyDataError:
                logger.error("Dataset CSV file is empty: %s", csv_file)
                raise AutoMLDataError(f"Dataset CSV file is empty: {csv_file}")
            except pd.errors.ParserError as parse_error:
                logger.error("Failed to parse dataset CSV file: %s", parse_error)
                raise
            except Exception as read_error:
                logger.error(
                    "Unexpected error reading dataset CSV file: %s", read_error
                )
                raise
        else:
            raise AutoMLValidationError("csv_file must be a path or DataFrame")

        self.df = loaded
        self.input_col, self.target_col = input_col, target_col

    def __len__(self) -> int:
        """Return the number of text pairs."""
        pair_count = len(self.df)
        return pair_count

    def __getitem__(self, idx: int) -> tuple[str, str]:
        """Return ``(input_text, target_text)`` for the row at position ``idx``."""
        position = idx.item() if torch.is_tensor(idx) else idx
        record = self.df.iloc[position]
        source_text = str(record[self.input_col])
        target_text = str(record[self.target_col])
        return source_text, target_text

`TextClassificationFromCSVDataset` ¶

Bases: Dataset

Torch dataset that reads text and labels from a CSV/DataFrame.

Expected columns: text (str) and label (str or int). Returns (text, label_idx) tuples — the collate function in the datamodule applies the tokeniser.

Source code in app/vision_automl/ml_engine/dataset.py

class TextClassificationFromCSVDataset(Dataset):
    """Torch dataset that reads text and labels from a CSV/DataFrame.

    Expected columns: ``text`` (str) and ``label`` (str or int).
    Returns ``(text, label_idx)`` tuples — the collate function in the
    datamodule applies the tokeniser.

    Non-numeric labels are encoded as their position in the sorted list of
    unique label values; numeric labels are used as-is.
    """

    def __init__(
        self,
        csv_file: Union[Path, pd.DataFrame],
        text_col: str = "text",
        label_col: str = "label",
    ):
        """Load the table and build ``classes`` / ``class_to_idx``.

        Args:
            csv_file: Path to a CSV file, or an already loaded DataFrame
                (its index is reset; the caller's frame is not modified).
            text_col: Column holding the input text.
            label_col: Column holding the class label.

        Raises:
            AutoMLDataError: If the CSV file is empty.
            AutoMLValidationError: If ``csv_file`` is neither a ``Path`` nor a
                ``DataFrame``.
            FileNotFoundError, pandas.errors.ParserError: Re-raised from
                ``pd.read_csv`` after being logged.
        """
        if isinstance(csv_file, Path):
            try:
                self.df = pd.read_csv(csv_file)
            except FileNotFoundError:
                logger.error("Dataset CSV file not found: %s", csv_file)
                raise
            except pd.errors.EmptyDataError:
                logger.error("Dataset CSV file is empty: %s", csv_file)
                raise AutoMLDataError(f"Dataset CSV file is empty: {csv_file}")
            except pd.errors.ParserError as e:
                logger.error("Failed to parse dataset CSV file: %s", e)
                raise
            except Exception as e:
                logger.error("Unexpected error reading dataset CSV file: %s", e)
                raise
        elif isinstance(csv_file, pd.DataFrame):
            self.df = csv_file.reset_index(drop=True)
        else:
            raise AutoMLValidationError("csv_file must be a path or DataFrame")

        self.text_col = text_col
        self.label_col = label_col

        labels = self.df[self.label_col]
        self.classes = sorted(labels.unique().tolist())
        if pd.api.types.is_numeric_dtype(labels):
            # Numeric labels are already usable as indices.
            self.class_to_idx = {c: c for c in self.classes}
        else:
            # Test for "not numeric" rather than ``dtype == object``: text
            # columns may use pandas' dedicated string dtype (or categorical),
            # which would otherwise skip encoding and break ``int(label)``.
            self.class_to_idx = {c: i for i, c in enumerate(self.classes)}
            self.df = self.df.copy()
            self.df[self.label_col] = labels.map(self.class_to_idx)

    def __len__(self) -> int:
        """Return the number of rows in the table."""
        return len(self.df)

    def __getitem__(self, idx: int) -> tuple[str, int]:
        """Return ``(text, label_idx)`` for the row at position ``idx``."""
        if torch.is_tensor(idx):
            idx = idx.item()
        row = self.df.iloc[idx]
        return str(row[self.text_col]), int(row[self.label_col])

`AudioClassificationModel` ¶

Bases: Module

Thin nn.Module wrapping HF AutoModelForAudioClassification.

Source code in app/vision_automl/ml_engine/model.py

class AudioClassificationModel(nn.Module):
    """Thin nn.Module wrapping HF AutoModelForAudioClassification."""

    def __init__(
        self,
        model_id: str,
        num_classes: int = 2,
        id2label: dict | None = None,
        label2id: dict | None = None,
    ):
        super().__init__()
        try:
            self.model = AutoModelForAudioClassification.from_pretrained(
                model_id,
                ignore_mismatched_sizes=True,
                num_labels=num_classes,
                id2label=id2label or {i: str(i) for i in range(num_classes)},
                label2id=label2id or {str(i): i for i in range(num_classes)},
            )
        except Exception as e:
            logger.error(
                "Failed to load audio classification model from %s: %s", model_id, e
            )
            raise AutoMLTrainingError(
                f"Failed to load audio classification model from {model_id}: {e}"
            ) from e

    def forward(self, input_values: torch.Tensor) -> torch.Tensor:
        return self.model(input_values=input_values).logits

`CausalLMModel` ¶

Bases: Module

Thin nn.Module wrapping HF AutoModelForCausalLM.

Source code in app/vision_automl/ml_engine/model.py

class CausalLMModel(nn.Module):
    """Thin nn.Module wrapping HF AutoModelForCausalLM."""

    def __init__(self, model_id: str):
        super().__init__()
        try:
            self.model = AutoModelForCausalLM.from_pretrained(model_id)
        except Exception as e:
            logger.error("Failed to load causal LM model from %s: %s", model_id, e)
            raise AutoMLTrainingError(
                f"Failed to load causal LM model from {model_id}: {e}"
            ) from e

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor | None = None,
        labels: torch.Tensor | None = None,
    ) -> torch.Tensor:
        """Always returns the scalar language modelling loss."""
        return self.model(
            input_ids=input_ids, attention_mask=attention_mask, labels=labels
        ).loss

`forward(input_ids, attention_mask=None, labels=None)` ¶

Always returns the scalar language modelling loss.

Source code in app/vision_automl/ml_engine/model.py

def forward(
    self,
    input_ids: torch.Tensor,
    attention_mask: torch.Tensor | None = None,
    labels: torch.Tensor | None = None,
) -> torch.Tensor:
    """Always returns the scalar language modelling loss."""
    return self.model(
        input_ids=input_ids, attention_mask=attention_mask, labels=labels
    ).loss

`ImageClassificationModel` ¶

Bases: Module

Thin nn.Module wrapping HF AutoModelForImageClassification for single-label image classification.

Source code in app/vision_automl/ml_engine/model.py

class ImageClassificationModel(nn.Module):
    """Thin nn.Module wrapping HF AutoModelForImageClassification. This module is responsible for Image classification!!!"""

    def __init__(
        self,
        model_id: str = "google/vit-base-patch16-224",
        num_classes: int = 2,
        freeze_backbone: bool = True,
        id2label: dict | None = None,
        label2id: dict | None = None,
    ):
        super().__init__()
        config_kwargs = {
            "num_labels": num_classes,
            "id2label": id2label or {i: str(i) for i in range(num_classes)},
            "label2id": label2id or {str(i): i for i in range(num_classes)},
        }
        try:
            self.model = AutoModelForImageClassification.from_pretrained(
                model_id,
                ignore_mismatched_sizes=True,
                **config_kwargs,
            )
        except Exception as e:
            logger.error(
                "Failed to load image classification model from %s: %s", model_id, e
            )
            raise AutoMLTrainingError(
                f"Failed to load image classification model from {model_id}: {e}"
            ) from e
        if freeze_backbone:
            for param in self.model.parameters():
                param.requires_grad = False
            if hasattr(self.model, "classifier"):
                for param in self.model.classifier.parameters():
                    param.requires_grad = True

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        return self.model(pixel_values).logits

`ImageSegmentationModel` ¶

Bases: Module

Thin nn.Module wrapping HF AutoModelForImageSegmentation.

Source code in app/vision_automl/ml_engine/model.py

class ImageSegmentationModel(nn.Module):
    """Thin nn.Module wrapping HF AutoModelForImageSegmentation."""

    def __init__(
        self,
        model_id: str,
        num_classes: int = 2,
        id2label: dict | None = None,
        label2id: dict | None = None,
    ):
        super().__init__()
        try:
            self.model = AutoModelForImageSegmentation.from_pretrained(
                model_id,
                ignore_mismatched_sizes=True,
                num_labels=num_classes,
                id2label=id2label or {i: str(i) for i in range(num_classes)},
                label2id=label2id or {str(i): i for i in range(num_classes)},
            )
        except Exception as e:
            logger.error(
                "Failed to load image segmentation model from %s: %s", model_id, e
            )
            raise AutoMLTrainingError(
                f"Failed to load image segmentation model from {model_id}: {e}"
            ) from e

    def forward(self, pixel_values: torch.Tensor, labels: torch.Tensor | None = None):
        """Returns loss (scalar) when labels provided, else logits."""
        output = self.model(pixel_values=pixel_values, labels=labels)
        return output.loss if labels is not None else output.logits

`forward(pixel_values, labels=None)` ¶

Returns loss (scalar) when labels provided, else logits.

Source code in app/vision_automl/ml_engine/model.py

def forward(self, pixel_values: torch.Tensor, labels: torch.Tensor | None = None):
    """Returns loss (scalar) when labels provided, else logits."""
    output = self.model(pixel_values=pixel_values, labels=labels)
    return output.loss if labels is not None else output.logits

`KeypointDetectionModel` ¶

Bases: Module

Thin nn.Module wrapping HF AutoModelForKeypointDetection.

Source code in app/vision_automl/ml_engine/model.py

class KeypointDetectionModel(nn.Module):
    """Minimal ``nn.Module`` adapter around HF ``AutoModelForKeypointDetection``."""

    def __init__(self, model_id: str):
        super().__init__()
        try:
            self.model = AutoModelForKeypointDetection.from_pretrained(
                model_id,
                ignore_mismatched_sizes=True,
            )
        except Exception as load_error:
            logger.error(
                "Failed to load keypoint detection model from %s: %s",
                model_id,
                load_error,
            )
            raise AutoMLTrainingError(
                f"Failed to load keypoint detection model from {model_id}: {load_error}"
            ) from load_error

    def forward(self, pixel_values: torch.Tensor, labels=None):
        """Return the output's ``loss`` when ``labels`` is given, else the whole output."""
        outputs = self.model(pixel_values=pixel_values, labels=labels)
        if labels is None:
            return outputs
        return outputs.loss

`forward(pixel_values, labels=None)` ¶

Returns loss when labels provided, else raw output.

Source code in app/vision_automl/ml_engine/model.py

def forward(self, pixel_values: torch.Tensor, labels=None):
    """Return the output's ``loss`` when ``labels`` is given, else the whole output."""
    outputs = self.model(pixel_values=pixel_values, labels=labels)
    if labels is None:
        return outputs
    return outputs.loss

`MaskedLMModel` ¶

Bases: Module

Thin nn.Module wrapping HF AutoModelForMaskedLM.

Source code in app/vision_automl/ml_engine/model.py

class MaskedLMModel(nn.Module):
    """Thin nn.Module wrapping HF AutoModelForMaskedLM."""

    def __init__(self, model_id: str):
        super().__init__()
        try:
            self.model = AutoModelForMaskedLM.from_pretrained(model_id)
        except Exception as e:
            logger.error("Failed to load masked LM model from %s: %s", model_id, e)
            raise AutoMLTrainingError(
                f"Failed to load masked LM model from {model_id}: {e}"
            ) from e

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor | None = None,
        labels: torch.Tensor | None = None,
    ) -> torch.Tensor:
        """Returns the scalar masked language modelling loss."""
        return self.model(
            input_ids=input_ids, attention_mask=attention_mask, labels=labels
        ).loss

`forward(input_ids, attention_mask=None, labels=None)` ¶

Returns the scalar masked language modelling loss.

Source code in app/vision_automl/ml_engine/model.py

def forward(
    self,
    input_ids: torch.Tensor,
    attention_mask: torch.Tensor | None = None,
    labels: torch.Tensor | None = None,
) -> torch.Tensor:
    """Returns the scalar masked language modelling loss."""
    return self.model(
        input_ids=input_ids, attention_mask=attention_mask, labels=labels
    ).loss

`MultimodalClassificationModel` ¶

Bases: Module

Multimodal image classification model that fuses vision embeddings with auxiliary tabular features via early (concatenation) fusion.

Architecture

HF vision backbone (without classification head) produces image embeddings.
A small MLP processes tabular auxiliary features.
Image and tabular embeddings are concatenated and passed through a fusion classifier head.

Source code in app/vision_automl/ml_engine/model.py

class MultimodalClassificationModel(nn.Module):
    """Multimodal image classification model that fuses vision embeddings
    with auxiliary tabular features via early (concatenation) fusion.

    Architecture:
        1. HF vision backbone (without classification head) produces image embeddings.
        2. A small MLP processes tabular auxiliary features.
        3. Image and tabular embeddings are concatenated and passed through a
           fusion classifier head.
    """

    def __init__(
        self,
        model_id: str = "google/vit-base-patch16-224",
        num_classes: int = 2,
        aux_feature_dim: int = 0,
        freeze_backbone: bool = True,
        id2label: dict | None = None,
        label2id: dict | None = None,
        fusion_hidden_dim: int = 128,
    ):
        super().__init__()
        self.aux_feature_dim = aux_feature_dim

        from transformers import AutoConfig

        config_kwargs = {
            "num_labels": num_classes,
            "id2label": id2label or {i: str(i) for i in range(num_classes)},
            "label2id": label2id or {str(i): i for i in range(num_classes)},
        }
        try:
            hf_config = AutoConfig.from_pretrained(model_id, **config_kwargs)
            self.backbone = AutoModelForImageClassification.from_pretrained(
                model_id,
                config=hf_config,
                ignore_mismatched_sizes=True,
            )
        except Exception as e:
            logger.error("Failed to load vision backbone from %s: %s", model_id, e)
            raise AutoMLTrainingError(
                f"Failed to load vision backbone from {model_id}: {e}"
            ) from e

        if freeze_backbone:
            for param in self.backbone.parameters():
                param.requires_grad = False
            if hasattr(self.backbone, "classifier"):
                for param in self.backbone.classifier.parameters():
                    param.requires_grad = True

        vision_dim = self._get_vision_embed_dim()
        tabular_dim = max(aux_feature_dim, 1)

        self.tabular_mlp = (
            nn.Sequential(
                nn.Linear(tabular_dim, fusion_hidden_dim),
                nn.ReLU(),
                nn.Linear(fusion_hidden_dim, fusion_hidden_dim),
                nn.ReLU(),
            )
            if aux_feature_dim > 0
            else nn.Identity()
        )

        self.fusion_head = nn.Sequential(
            nn.Linear(
                vision_dim + fusion_hidden_dim if aux_feature_dim > 0 else vision_dim,
                fusion_hidden_dim,
            ),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(fusion_hidden_dim, num_classes),
        )

    def _get_vision_embed_dim(self) -> int:
        model = self.backbone
        if hasattr(model, "classifier") and hasattr(model.classifier, "in_features"):
            return model.classifier.in_features
        if hasattr(model, "fc") and hasattr(model.fc, "in_features"):
            return model.fc.in_features
        config = getattr(model, "config", None)
        if config is not None:
            hidden_size = getattr(config, "hidden_size", None)
            if hidden_size is not None:
                return hidden_size
        raise AutoMLConfigError(
            "Cannot determine vision embedding dimension from model config"
        )

    def _extract_vision_embeddings(self, pixel_values: torch.Tensor) -> torch.Tensor:
        if hasattr(self.backbone, "classifier"):
            original_classifier = self.backbone.classifier
            self.backbone.classifier = nn.Identity()
            try:
                embeddings = self.backbone(pixel_values)
                if hasattr(embeddings, "logits"):
                    embeddings = embeddings.logits
            finally:
                self.backbone.classifier = original_classifier
            return embeddings
        output = self.backbone(pixel_values)
        return output.logits if hasattr(output, "logits") else output

    def forward(
        self, pixel_values: torch.Tensor, aux_features: torch.Tensor | None = None
    ) -> torch.Tensor:
        vision_embeds = self._extract_vision_embeddings(pixel_values)

        if aux_features is not None and self.aux_feature_dim > 0:
            tabular_embeds = self.tabular_mlp(aux_features)
            combined = torch.cat([vision_embeds, tabular_embeds], dim=-1)
        else:
            combined = vision_embeds

        return self.fusion_head(combined)

`ObjectDetectionModel` ¶

Bases: Module

Thin nn.Module wrapping HF AutoModelForObjectDetection.

Source code in app/vision_automl/ml_engine/model.py

class ObjectDetectionModel(nn.Module):
    """Minimal ``nn.Module`` adapter around HF ``AutoModelForObjectDetection``."""

    def __init__(self, model_id: str, num_classes: int = 2):
        super().__init__()
        try:
            self.model = AutoModelForObjectDetection.from_pretrained(
                model_id,
                ignore_mismatched_sizes=True,
                num_labels=num_classes,
            )
        except Exception as load_error:
            logger.error(
                "Failed to load object detection model from %s: %s",
                model_id,
                load_error,
            )
            raise AutoMLTrainingError(
                f"Failed to load object detection model from {model_id}: {load_error}"
            ) from load_error

    def forward(self, pixel_values: torch.Tensor, labels=None):
        """Return the output's ``loss`` when ``labels`` (a list of dicts) is given,
        else the whole output."""
        outputs = self.model(pixel_values=pixel_values, labels=labels)
        if labels is None:
            return outputs
        return outputs.loss

`forward(pixel_values, labels=None)` ¶

Returns loss when labels provided (list of dicts), else raw output.

Source code in app/vision_automl/ml_engine/model.py

def forward(self, pixel_values: torch.Tensor, labels=None):
    """Return the output's ``loss`` when ``labels`` (a list of dicts) is given,
    else the whole output."""
    outputs = self.model(pixel_values=pixel_values, labels=labels)
    if labels is None:
        return outputs
    return outputs.loss

`QuestionAnsweringModel` ¶

Bases: Module

Thin nn.Module wrapping HF AutoModelForQuestionAnswering.

Source code in app/vision_automl/ml_engine/model.py

class QuestionAnsweringModel(nn.Module):
    """Thin nn.Module wrapping HF AutoModelForQuestionAnswering."""

    def __init__(self, model_id: str):
        super().__init__()
        try:
            self.model = AutoModelForQuestionAnswering.from_pretrained(model_id)
        except Exception as e:
            logger.error(
                "Failed to load question answering model from %s: %s", model_id, e
            )
            raise AutoMLTrainingError(
                f"Failed to load question answering model from {model_id}: {e}"
            ) from e

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor | None = None,
        start_positions: torch.Tensor | None = None,
        end_positions: torch.Tensor | None = None,
    ):
        """Returns loss scalar when start/end positions provided, else raw output."""
        output = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            start_positions=start_positions,
            end_positions=end_positions,
        )
        if start_positions is not None and end_positions is not None:
            return output.loss
        return output

`forward(input_ids, attention_mask=None, start_positions=None, end_positions=None)` ¶

Returns loss scalar when start/end positions provided, else raw output.

Source code in app/vision_automl/ml_engine/model.py

def forward(
    self,
    input_ids: torch.Tensor,
    attention_mask: torch.Tensor | None = None,
    start_positions: torch.Tensor | None = None,
    end_positions: torch.Tensor | None = None,
):
    """Returns loss scalar when start/end positions provided, else raw output."""
    output = self.model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        start_positions=start_positions,
        end_positions=end_positions,
    )
    if start_positions is not None and end_positions is not None:
        return output.loss
    return output

`Seq2SeqLMModel` ¶

Bases: Module

Thin nn.Module wrapping HF AutoModelForSeq2SeqLM.

Source code in app/vision_automl/ml_engine/model.py

class Seq2SeqLMModel(nn.Module):
    """Thin nn.Module wrapping HF AutoModelForSeq2SeqLM."""

    def __init__(self, model_id: str):
        super().__init__()
        try:
            self.model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
        except Exception as e:
            logger.error("Failed to load seq2seq LM model from %s: %s", model_id, e)
            raise AutoMLTrainingError(
                f"Failed to load seq2seq LM model from {model_id}: {e}"
            ) from e

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor | None = None,
        decoder_input_ids: torch.Tensor | None = None,
        labels: torch.Tensor | None = None,
    ) -> torch.Tensor:
        """Returns the scalar seq2seq loss."""
        return self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            labels=labels,
        ).loss

`forward(input_ids, attention_mask=None, decoder_input_ids=None, labels=None)` ¶

Returns the scalar seq2seq loss.

Source code in app/vision_automl/ml_engine/model.py

def forward(
    self,
    input_ids: torch.Tensor,
    attention_mask: torch.Tensor | None = None,
    decoder_input_ids: torch.Tensor | None = None,
    labels: torch.Tensor | None = None,
) -> torch.Tensor:
    """Returns the scalar seq2seq loss."""
    return self.model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        decoder_input_ids=decoder_input_ids,
        labels=labels,
    ).loss

`SequenceClassificationModel` ¶

Bases: Module

Thin nn.Module wrapping HF AutoModelForSequenceClassification.

Source code in app/vision_automl/ml_engine/model.py

class SequenceClassificationModel(nn.Module):
    """Thin nn.Module wrapping HF AutoModelForSequenceClassification."""

    def __init__(
        self,
        model_id: str,
        num_classes: int = 2,
        id2label: dict | None = None,
        label2id: dict | None = None,
    ):
        super().__init__()
        try:
            self.model = AutoModelForSequenceClassification.from_pretrained(
                model_id,
                ignore_mismatched_sizes=True,
                num_labels=num_classes,
                id2label=id2label or {i: str(i) for i in range(num_classes)},
                label2id=label2id or {str(i): i for i in range(num_classes)},
            )
        except Exception as e:
            logger.error(
                "Failed to load sequence classification model from %s: %s", model_id, e
            )
            raise AutoMLTrainingError(
                f"Failed to load sequence classification model from {model_id}: {e}"
            ) from e

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor | None = None,
    ) -> torch.Tensor:
        return self.model(input_ids=input_ids, attention_mask=attention_mask).logits

`VideoClassificationModel` ¶

Bases: Module

Thin nn.Module wrapping HF AutoModelForVideoClassification.

Source code in app/vision_automl/ml_engine/model.py

class VideoClassificationModel(nn.Module):
    """``nn.Module`` adapter around ``AutoModelForVideoClassification``.

    The wrapped Hugging Face model is stored on ``self.model``; ``forward``
    exposes only the ``logits`` of its output.
    """

    def __init__(
        self,
        model_id: str,
        num_classes: int = 2,
        id2label: dict | None = None,
        label2id: dict | None = None,
    ):
        """Load ``model_id`` with a head configured for ``num_classes`` labels.

        Missing or empty ``id2label`` / ``label2id`` mappings are replaced by
        generated ones that map each index ``i`` to ``str(i)`` and back.

        Raises:
            AutoMLTrainingError: If anything fails while preparing or loading
                the model; the original exception is chained as the cause.
        """
        super().__init__()
        try:
            # Fallback maps are built inside the ``try`` so that a failure
            # here is reported exactly like a failed model load.
            if not id2label:
                id2label = {idx: str(idx) for idx in range(num_classes)}
            if not label2id:
                label2id = {str(idx): idx for idx in range(num_classes)}
            self.model = AutoModelForVideoClassification.from_pretrained(
                model_id,
                ignore_mismatched_sizes=True,
                num_labels=num_classes,
                id2label=id2label,
                label2id=label2id,
            )
        except Exception as e:
            logger.error(
                "Failed to load video classification model from %s: %s", model_id, e
            )
            raise AutoMLTrainingError(
                f"Failed to load video classification model from {model_id}: {e}"
            ) from e

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        """Return the ``logits`` produced by the wrapped model."""
        model_output = self.model(pixel_values=pixel_values)
        return model_output.logits

`EarlyStopping` ¶

Simple early stopping callback based on monitored metric.

Source code in app/vision_automl/ml_engine/trainer.py

class EarlyStopping:
    """Callback that asks the trainer to stop when a metric stops decreasing.

    The monitored value is treated as "lower is better": an epoch counts as
    an improvement only when the value drops below ``best - min_delta``.
    """

    def __init__(
        self, monitor: str = "val_loss", patience: int = 3, min_delta: float = 0.0
    ) -> None:
        """Remember the settings and start with no best value recorded."""
        # ``best`` starts at +inf so the first reported value always improves.
        self.best: float = float("inf")
        self.counter: int = 0
        self.monitor: str = monitor
        self.patience: int = patience
        self.min_delta: float = min_delta

    def on_epoch_end(
        self, trainer: "FabricTrainer", epoch: int, logs: dict[str, float]
    ) -> None:
        """Record this epoch's metric and request a stop once patience runs out.

        A stop is requested by setting ``trainer.epochs`` to ``epoch + 1``.
        If ``logs`` has no entry for the monitored metric, the call only logs
        a warning and leaves all state untouched.
        """
        current: float | None = logs.get(self.monitor)
        if current is None:
            logger.warning(
                f"Metric '{self.monitor}' not found in logs. Skipping early stopping check."
            )
            return

        threshold = self.best - self.min_delta
        if current < threshold:
            # Improvement: remember it and restart the patience window.
            self.best = current
            self.counter = 0
            logger.info(f"New best {self.monitor}: {self.best:.4f}")
            return

        # No improvement this epoch.
        self.counter += 1
        logger.info(f"EarlyStopping counter: {self.counter}/{self.patience}")
        if self.counter >= self.patience:
            logger.info("Early stopping triggered!")
            trainer.epochs = epoch + 1

`on_epoch_end(trainer, epoch, logs)` ¶

Update state after epoch; may signal stopping on trainer.

Source code in app/vision_automl/ml_engine/trainer.py

def on_epoch_end(
    self, trainer: "FabricTrainer", epoch: int, logs: dict[str, float]
) -> None:
    """Record this epoch's metric and request a stop once patience runs out.

    A stop is requested by setting ``trainer.epochs`` to ``epoch + 1``.
    If ``logs`` has no entry for the monitored metric, the call only logs
    a warning and leaves all state untouched.
    """
    current: float | None = logs.get(self.monitor)
    if current is None:
        logger.warning(
            f"Metric '{self.monitor}' not found in logs. Skipping early stopping check."
        )
        return

    threshold = self.best - self.min_delta
    if current < threshold:
        # Improvement: remember it and restart the patience window.
        self.best = current
        self.counter = 0
        logger.info(f"New best {self.monitor}: {self.best:.4f}")
        return

    # No improvement this epoch.
    self.counter += 1
    logger.info(f"EarlyStopping counter: {self.counter}/{self.patience}")
    if self.counter >= self.patience:
        logger.info("Early stopping triggered!")
        trainer.epochs = epoch + 1

`FabricTrainer` ¶

Minimal trainer using Lightning Fabric.

Supports two kinds of task: (1) classification tasks, where the model returns logits and the trainer computes the loss via loss_fn (model_computes_loss=False); and (2) generative / structured-prediction tasks, where the model computes its own loss internally and returns a scalar tensor (model_computes_loss=True).

Source code in app/vision_automl/ml_engine/trainer.py

class FabricTrainer:
    """Minimal trainer using Lightning Fabric.

    Supports both:
    - Classification tasks where the model returns logits and the trainer
      computes the loss via ``loss_fn`` (``model_computes_loss=False``).
    - Generative / structured-prediction tasks where the model computes
      its own loss internally and returns a scalar tensor
      (``model_computes_loss=True``).

    Callbacks are invoked as ``cb.on_epoch_end(trainer, epoch, logs)`` after
    every epoch and may lower ``trainer.epochs`` to end training early.
    """

    def __init__(
        self,
        datamodule: Any,
        model_class: type[nn.Module],
        model_kwargs: dict[str, Any] | None = None,
        optimizer_class: type[optim.Optimizer] = optim.AdamW,
        optimizer_kwargs: dict[str, Any] | None = None,
        loss_fn: nn.Module | None = None,
        lr: float = 0.001,
        epochs: int = 1,
        time_limit: float | None = None,
        device: str = "auto",
        callbacks: list[Any] | None = None,
        input_dtype: torch.dtype = torch.float32,
        target_dtype: torch.dtype = torch.long,
        model_computes_loss: bool = False,
    ) -> None:
        """Store the configuration, create the Fabric runtime and build the model.

        Args:
            datamodule: Object providing ``train_dataloader()``,
                ``val_dataloader()`` and ``test_dataloader()``.
            model_class: Module class instantiated as
                ``model_class(**model_kwargs)``.
            model_kwargs: Keyword arguments for ``model_class``; omitted
                means no arguments.
            optimizer_class: Optimizer class built from the model parameters.
            optimizer_kwargs: Keyword arguments for the optimizer. When
                omitted or empty, ``{"lr": lr}`` is used instead.
            loss_fn: Loss applied to ``(outputs, labels)`` when
                ``model_computes_loss`` is false. Omitted means a new
                ``nn.CrossEntropyLoss()``.
            lr: Learning rate, used only when ``optimizer_kwargs`` is
                omitted or empty.
            epochs: Maximum number of epochs run by ``fit``.
            time_limit: Wall-clock budget in seconds; ``None`` or ``0``
                disables the check.
            device: Forwarded to ``L.Fabric(devices=...)``.
            callbacks: Objects exposing ``on_epoch_end``; omitted means none.
            input_dtype: dtype applied to floating-point non-target tensors.
            target_dtype: dtype applied to target tensors.
            model_computes_loss: Whether the model returns its own loss.
        """
        # ``None`` sentinels replace the former mutable / pre-built defaults,
        # which were created once and shared by every trainer instance.
        self.datamodule: Any = datamodule
        self.model_class: type[nn.Module] = model_class
        self.model_kwargs: dict[str, Any] = (
            model_kwargs if model_kwargs is not None else {}
        )
        self.optimizer_class: type[optim.Optimizer] = optimizer_class
        self.optimizer_kwargs: dict[str, Any] = optimizer_kwargs or {"lr": lr}
        self.loss_fn: nn.Module = (
            loss_fn if loss_fn is not None else nn.CrossEntropyLoss()
        )
        self.epochs: int = epochs
        self.time_limit: float | None = time_limit
        self.device: str = device
        self.callbacks: list[Any] = callbacks if callbacks is not None else []
        self.input_dtype: torch.dtype = input_dtype
        self.target_dtype: torch.dtype = target_dtype
        self.model_computes_loss: bool = model_computes_loss

        self.fabric: L.Fabric = L.Fabric(devices=self.device)
        self._setup_model_optimizer()

    def _setup_model_optimizer(self) -> None:
        """Instantiate model and optimizer and prepare loaders with Fabric."""
        logger.info("Setting up model and optimizer.")
        self.model: nn.Module = self.model_class(**self.model_kwargs)
        self.optimizer: optim.Optimizer = self.optimizer_class(
            self.model.parameters(), **self.optimizer_kwargs
        )

        train_loader: Any = self.datamodule.train_dataloader()
        val_loader: Any = self.datamodule.val_dataloader()
        # ``Fabric.setup`` accepts a module followed by optimizers only;
        # dataloaders are prepared through ``setup_dataloaders``.
        self.model, self.optimizer = self.fabric.setup(self.model, self.optimizer)
        self.train_loader, self.val_loader = self.fabric.setup_dataloaders(
            train_loader, val_loader
        )
        # The test loader is used as returned by the datamodule; batches are
        # moved to the device by ``_move_batch``.
        self.test_loader: Any = self.datamodule.test_dataloader()
        logger.info("Model and optimizer setup complete.")

    def _move_batch(self, batch: Any) -> dict[str, Any]:
        """Move batch to the Fabric device.

        Handles arbitrary dict batches (all modalities) and legacy
        ``(images, labels)`` tuple batches.  Non-tensor values (e.g. list
        of annotation dicts for object detection) are passed through as-is.
        Integer tensors (``input_ids``, etc.) are moved without dtype coercion.
        """
        if isinstance(batch, dict):
            moved: dict[str, Any] = {}
            for k, v in batch.items():
                if not isinstance(v, torch.Tensor):
                    moved[k] = v  # keep non-tensors (e.g. list of dicts)
                elif k in _TARGET_KEYS:
                    moved[k] = v.to(self.fabric.device, dtype=self.target_dtype)
                elif v.dtype.is_floating_point:
                    moved[k] = v.to(self.fabric.device, dtype=self.input_dtype)
                else:
                    # int/long tensors (input_ids, etc.) — preserve dtype
                    moved[k] = v.to(self.fabric.device)
            return moved

        # Legacy two-element batch: inputs first, targets second.
        imgs, batch_labels = batch
        return {
            "pixel_values": imgs.to(self.fabric.device, dtype=self.input_dtype),
            "labels": batch_labels.to(self.fabric.device, dtype=self.target_dtype),
        }

    def _check_time_limit(self, start_time: float) -> bool:
        """Return True if configured time limit has been exceeded."""
        elapsed: float = time.time() - start_time
        if self.time_limit and elapsed > self.time_limit:
            logger.warning(f"Time limit reached ({elapsed:.2f}s). Stopping training.")
            return True
        return False

    def _compute_loss_and_logits(
        self, moved: dict[str, Any]
    ) -> tuple[torch.Tensor, torch.Tensor | None]:
        """Run forward pass, return (loss, logits_or_None).

        When the trainer computes the loss, ``"labels"`` is popped from
        ``moved`` (mutating it) before the remaining entries are passed to
        the model.
        """
        if self.model_computes_loss:
            outputs = self.model(**moved)
            loss = outputs if isinstance(outputs, torch.Tensor) else outputs.loss
            return loss, None

        labels = moved.pop("labels")
        outputs = self.model(**moved)
        return self.loss_fn(outputs, labels), outputs

    def train_epoch(self, epoch: int, start_time: float) -> float:
        """Train for a single epoch and return average training loss.

        The average is taken over the batches actually processed, so an
        epoch cut short by the time limit is not diluted by skipped batches.
        Returns ``0.0`` when no batch was processed.
        """
        self.model.train()
        running_loss: float = 0.0
        batches_done: int = 0

        for batch in tqdm(
            self.train_loader, desc=f"Epoch {epoch+1} Training", leave=False
        ):
            if self._check_time_limit(start_time):
                break

            moved = self._move_batch(batch)
            self.optimizer.zero_grad()
            loss, _ = self._compute_loss_and_logits(moved)
            self.fabric.backward(loss)
            self.optimizer.step()
            running_loss += loss.item()
            batches_done += 1

        avg_loss: float = running_loss / max(1, batches_done)
        logger.info(f"Epoch {epoch+1} Training Loss: {avg_loss:.4f}")
        return avg_loss

    def validate(self, start_time: float) -> tuple[float, float]:
        """Evaluate on validation set; return (avg_loss, accuracy).

        The loss is averaged over the batches actually evaluated (the loop
        stops early once the time limit is exceeded).  Accuracy is only
        computed when the trainer receives logits; otherwise it is ``0.0``.
        """
        self.model.eval()
        val_loss: float = 0.0
        batches_done: int = 0
        correct: int = 0
        total: int = 0

        with torch.no_grad():
            for batch in tqdm(self.val_loader, desc="Validation", leave=False):
                if self._check_time_limit(start_time):
                    break

                moved = self._move_batch(batch)
                # Grab the targets first: the helper pops them from ``moved``.
                labels = moved.get("labels")
                loss, logits = self._compute_loss_and_logits(moved)
                val_loss += loss.item()
                batches_done += 1

                if logits is not None:
                    preds = logits.argmax(dim=1)
                    correct += (preds == labels).sum().item()
                    total += labels.size(0)

        avg_loss: float = val_loss / max(1, batches_done)
        accuracy: float = correct / max(1, total)
        logger.info(f"Validation - Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}")
        return avg_loss, accuracy

    def test(self) -> tuple[float, float]:
        """Evaluate on test set; return (avg_loss, accuracy).

        An empty test loader yields ``(0.0, 0.0)`` instead of raising
        ``ZeroDivisionError``.  Accuracy is only computed when the trainer
        receives logits; otherwise it is ``0.0``.
        """
        self.model.eval()
        test_loss: float = 0.0
        batches_done: int = 0
        correct: int = 0
        total: int = 0

        with torch.no_grad():
            for batch in tqdm(self.test_loader, desc="Testing"):
                moved = self._move_batch(batch)
                # Grab the targets first: the helper pops them from ``moved``.
                labels = moved.get("labels")
                loss, logits = self._compute_loss_and_logits(moved)
                test_loss += loss.item()
                batches_done += 1

                if logits is not None:
                    preds = logits.argmax(dim=1)
                    correct += (preds == labels).sum().item()
                    total += labels.size(0)

        avg_loss: float = test_loss / max(1, batches_done)
        accuracy: float = correct / max(1, total)
        logger.info(f"Test Results - Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}")
        return avg_loss, accuracy

    def fit(self, trial: optuna.Trial | None = None) -> tuple[float, float]:
        """Run the train/validate loop, then return ``self.test()``.

        After each epoch the validation loss is reported to ``trial`` (when
        given) and callbacks are notified.  Training ends when ``self.epochs``
        epochs have run, a callback lowers ``self.epochs`` to the number of
        epochs already completed, or the time limit is exceeded.

        Raises:
            optuna.TrialPruned: If ``trial`` reports that it should be pruned.
        """
        logger.info("Starting training loop.")
        start_time: float = time.time()

        # ``self.epochs`` is re-read on every iteration so that a callback
        # lowering it (e.g. early stopping) really ends the loop; iterating
        # ``range(self.epochs)`` would freeze the bound at its initial value.
        epoch = 0
        while epoch < self.epochs:
            train_loss = self.train_epoch(epoch, start_time)
            val_loss, val_acc = self.validate(start_time)

            if trial is not None:
                trial.report(val_loss, step=epoch)
                if trial.should_prune():
                    raise optuna.TrialPruned()

            logs = {
                "train_loss": train_loss,
                "val_loss": val_loss,
                "val_acc": val_acc,
            }

            for cb in self.callbacks:
                cb.on_epoch_end(self, epoch, logs)

            if self._check_time_limit(start_time):
                break

            epoch += 1

        return self.test()

`test()` ¶

Evaluate on test set; return (avg_loss, accuracy).

Source code in app/vision_automl/ml_engine/trainer.py

def test(self) -> tuple[float, float]:
    """Evaluate on test set; return (avg_loss, accuracy).

    An empty test loader yields ``(0.0, 0.0)`` instead of raising
    ``ZeroDivisionError``.  Accuracy is only computed when the trainer
    receives logits; otherwise it is ``0.0``.
    """
    self.model.eval()
    test_loss: float = 0.0
    batches_done: int = 0
    correct: int = 0
    total: int = 0

    with torch.no_grad():
        for batch in tqdm(self.test_loader, desc="Testing"):
            moved = self._move_batch(batch)
            # Grab the targets first: the helper pops them from ``moved``.
            labels = moved.get("labels")
            loss, logits = self._compute_loss_and_logits(moved)
            test_loss += loss.item()
            batches_done += 1

            if logits is not None:
                preds = logits.argmax(dim=1)
                correct += (preds == labels).sum().item()
                total += labels.size(0)

    avg_loss: float = test_loss / max(1, batches_done)
    accuracy: float = correct / max(1, total)
    logger.info(f"Test Results - Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}")
    return avg_loss, accuracy

`train_epoch(epoch, start_time)` ¶

Train for a single epoch and return average training loss.

Source code in app/vision_automl/ml_engine/trainer.py

def train_epoch(self, epoch: int, start_time: float) -> float:
    """Train for a single epoch and return average training loss.

    The average is taken over the batches actually processed, so an
    epoch cut short by the time limit is not diluted by skipped batches.
    Returns ``0.0`` when no batch was processed.
    """
    self.model.train()
    running_loss: float = 0.0
    batches_done: int = 0

    for batch in tqdm(
        self.train_loader, desc=f"Epoch {epoch+1} Training", leave=False
    ):
        if self._check_time_limit(start_time):
            break

        moved = self._move_batch(batch)
        self.optimizer.zero_grad()
        loss, _ = self._compute_loss_and_logits(moved)
        self.fabric.backward(loss)
        self.optimizer.step()
        running_loss += loss.item()
        batches_done += 1

    avg_loss: float = running_loss / max(1, batches_done)
    logger.info(f"Epoch {epoch+1} Training Loss: {avg_loss:.4f}")
    return avg_loss

`validate(start_time)` ¶

Evaluate on validation set; return (avg_loss, accuracy).

Source code in app/vision_automl/ml_engine/trainer.py

def validate(self, start_time: float) -> tuple[float, float]:
    """Evaluate on validation set; return (avg_loss, accuracy).

    The loss is averaged over the batches actually evaluated (the loop
    stops early once the time limit is exceeded).  Accuracy is only
    computed when the trainer receives logits; otherwise it is ``0.0``.
    """
    self.model.eval()
    val_loss: float = 0.0
    batches_done: int = 0
    correct: int = 0
    total: int = 0

    with torch.no_grad():
        for batch in tqdm(self.val_loader, desc="Validation", leave=False):
            if self._check_time_limit(start_time):
                break

            moved = self._move_batch(batch)
            # Grab the targets first: the helper pops them from ``moved``.
            labels = moved.get("labels")
            loss, logits = self._compute_loss_and_logits(moved)
            val_loss += loss.item()
            batches_done += 1

            if logits is not None:
                preds = logits.argmax(dim=1)
                correct += (preds == labels).sum().item()
                total += labels.size(0)

    avg_loss: float = val_loss / max(1, batches_done)
    accuracy: float = correct / max(1, total)
    logger.info(f"Validation - Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}")
    return avg_loss, accuracy

`run_optuna_search(*, task_type='image_classification', csv_path, images_dir=None, filename_column='filename', label_column='label', n_trials=3, timeout=None, model_size='small', workdir, **extra_kwargs)` ¶

Run an Optuna hyperparameter search for the given task type.

Dispatches to the appropriate per-task objective via OBJECTIVE_REGISTRY. extra_kwargs are forwarded to the objective (e.g. text_column for text tasks).

Raises:

Type	Description
`AutoMLConfigError`	If `task_type` is not in `OBJECTIVE_REGISTRY`.
`AutoMLTrainingError`	If no Optuna trial finishes in the `COMPLETE` state.

Source code in app/vision_automl/ml_engine/trainer.py

def run_optuna_search(
    *,
    task_type: str = "image_classification",
    csv_path: Path,
    images_dir: Path | None = None,
    filename_column: str = "filename",
    label_column: str = "label",
    n_trials: int = 3,
    timeout: int | None = None,
    model_size: str = "small",
    workdir: Path,
    **extra_kwargs,
) -> dict:
    """Run an Optuna hyperparameter search for the given task type.

    Dispatches to the appropriate per-task objective via ``OBJECTIVE_REGISTRY``.
    ``extra_kwargs`` are forwarded to the objective (e.g. ``text_column`` for
    text tasks).

    The study minimizes the objective value.  When ``timeout`` is set it
    bounds the whole study and is also split evenly across ``n_trials`` to
    give each trial its own ``timeout_per_trial``.

    Returns:
        A dict with ``best_value``, ``best_params``, ``n_trials`` (number of
        trials actually run) and ``model_dir``
        (``workdir / "optuna" / f"trial_{best_trial_number}"``).

    Raises:
        AutoMLConfigError: If ``task_type`` is not in ``OBJECTIVE_REGISTRY``.
        AutoMLTrainingError: If no trial finished in the ``COMPLETE`` state.
    """
    if task_type not in OBJECTIVE_REGISTRY:
        raise AutoMLConfigError(
            f"Unknown task type '{task_type}'. "
            f"Supported: {sorted(OBJECTIVE_REGISTRY)}"
        )

    config = load_task_config(task_type)
    objective_fn = OBJECTIVE_REGISTRY[task_type]

    run_dir = workdir / "optuna"
    # ``parents=True`` so a not-yet-created ``workdir`` does not abort the run.
    run_dir.mkdir(parents=True, exist_ok=True)

    pruner = optuna.pruners.SuccessiveHalvingPruner(
        min_resource=10,
        reduction_factor=3,
        min_early_stopping_rate=0,
    )
    # Fixed seed keeps the sampler's suggestions reproducible between runs.
    sampler = optuna.samplers.TPESampler(seed=42)
    study = optuna.create_study(direction="minimize", sampler=sampler, pruner=pruner)
    timeout_per_trial = timeout / max(n_trials, 1) if timeout else None

    # Build keyword arguments for the objective
    objective_kwargs: dict = {
        "csv_path": csv_path,
        "images_dir": images_dir,
        "filename_column": filename_column,
        "label_column": label_column,
        "model_size": model_size,
        "timeout_per_trial": timeout_per_trial,
        "config": config,
        **extra_kwargs,
    }

    study.optimize(
        functools.partial(objective_fn, **objective_kwargs),
        n_trials=n_trials,
        timeout=timeout,
    )

    # ``study.best_*`` is only meaningful when at least one trial completed.
    completed = [t for t in study.trials if t.state == optuna.trial.TrialState.COMPLETE]
    if not completed:
        raise AutoMLTrainingError(
            f"All {len(study.trials)} Optuna trial(s) failed or were pruned. "
            "Check your dataset, model IDs, and time budget."
        )

    return {
        "best_value": study.best_value,
        "best_params": study.best_params,
        "n_trials": len(study.trials),
        # NOTE(review): nothing in this function writes to this directory;
        # presumably the objective saves the model there — confirm.
        "model_dir": run_dir / f"trial_{study.best_trial.number}",
    }

Vision AutoML¶

AudioClassificationTask ¶

CausalLMTask ¶

ImageClassificationTask ¶

ImageMultiLabelClassificationTask ¶

ImageRegressionTask ¶

ImageSegmentationTask ¶

ImageTask ¶

KeypointDetectionTask ¶

MaskedLMTask ¶

ObjectDetectionTask ¶

QuestionAnsweringTask ¶

Seq2SeqLMTask ¶

SequenceClassificationTask ¶

TextTask ¶

VideoClassificationTask ¶

find_best_model_for_multimodal_vision(request, user_id, dataset_id, dataset_version=None, filename_column='filename', label_column='label', exclude_columns=None, time_budget=60, model_size='small', dataset_split=None) async ¶

find_best_model_for_vision(request, user_id, dataset_id, dataset_version='v1', filename_column='filename', label_column='label', task_type='image_classification', time_budget=60, model_size='small', dataset_split=None) async ¶

show_accepted_format_instructions() async ¶

show_deployment_instructions() async ¶

build_upload_payload(dataset_id, dataset_version, metadata, task_type, leaderboard_json) ¶

collect_missing_files(df, images_dir, filename_col, label_col) ¶

convert_leaderboard_safely(optuna_result) ¶

download_dataset(download_url, workdir, original_filename) ¶

extract_and_locate_dataset(zip_path, workdir) ¶

fetch_dataset_metadata(autodw_base, user_id, dataset_id, dataset_version) ¶

get_num_params_if_available(repo_id, revision=None) ¶

normalize_dataframe_filenames(df, filename_column, csv_path) ¶

resolve_download_url(autodw_base, user_id, dataset_id, dataset_version, metadata, split) ¶

resolve_images_root(images_dir) ¶

search_hf_for_pytorch_models_with_estimated_parameters(filter='image-classification', limit=3, sort='downloads') ¶

serialize_and_zip_model(workdir) ¶

sort_models_by_size(models, size_tier) ¶

train_automl(csv_path, images_dir, filename_column, label_column, time_budget, model_size, workdir, task_type='image_classification') async ¶

train_automl_multimodal(csv_path, images_dir, filename_column, label_column, auxiliary_columns, time_budget, model_size, workdir) async ¶

upload_model(upload_url, zip_path, payload, task_id) ¶

validate_multimodal_inputs(csv_path, images_dir, filename_column, label_column, exclude_columns=None) ¶

validate_vision_inputs(csv_path, images_dir, filename_column, label_column, task_type='image_classification') ¶

vision_data_instructions() ¶

ML engine¶

load_task_config(task_type) ¶

AudioClassificationDataModule ¶

CausalLMDataModule ¶

ImageClassificationDataModule ¶

setup() ¶

ImageSegmentationDataModule ¶

KeypointDetectionDataModule ¶

MaskedLMDataModule ¶

MultimodalClassificationDataModule ¶

ObjectDetectionDataModule ¶

QuestionAnsweringDataModule ¶

Seq2SeqLMDataModule ¶

SequenceClassificationDataModule ¶

VideoClassificationDataModule ¶

CausalLMFromCSVDataset ¶

ImageClassificationFromCSVDataset ¶

__len__() ¶

MultimodalClassificationDataset ¶

QuestionAnsweringFromCSVDataset ¶

Seq2SeqFromCSVDataset ¶

TextClassificationFromCSVDataset ¶

AudioClassificationModel ¶

CausalLMModel ¶

forward(input_ids, attention_mask=None, labels=None) ¶

ImageClassificationModel ¶

ImageSegmentationModel ¶

forward(pixel_values, labels=None) ¶

KeypointDetectionModel ¶

forward(pixel_values, labels=None) ¶

MaskedLMModel ¶

forward(input_ids, attention_mask=None, labels=None) ¶

MultimodalClassificationModel ¶

ObjectDetectionModel ¶

forward(pixel_values, labels=None) ¶

QuestionAnsweringModel ¶

forward(input_ids, attention_mask=None, start_positions=None, end_positions=None) ¶

Seq2SeqLMModel ¶

forward(input_ids, attention_mask=None, decoder_input_ids=None, labels=None) ¶

SequenceClassificationModel ¶

VideoClassificationModel ¶

`AudioClassificationTask` ¶

`CausalLMTask` ¶

`ImageClassificationTask` ¶

`ImageMultiLabelClassificationTask` ¶

`ImageRegressionTask` ¶

`ImageSegmentationTask` ¶

`ImageTask` ¶

`KeypointDetectionTask` ¶

`MaskedLMTask` ¶

`ObjectDetectionTask` ¶

`QuestionAnsweringTask` ¶

`Seq2SeqLMTask` ¶

`SequenceClassificationTask` ¶

`TextTask` ¶

`VideoClassificationTask` ¶

`find_best_model_for_multimodal_vision(request, user_id, dataset_id, dataset_version=None, filename_column='filename', label_column='label', exclude_columns=None, time_budget=60, model_size='small', dataset_split=None)` `async` ¶

`find_best_model_for_vision(request, user_id, dataset_id, dataset_version='v1', filename_column='filename', label_column='label', task_type='image_classification', time_budget=60, model_size='small', dataset_split=None)` `async` ¶

`show_accepted_format_instructions()` `async` ¶

`show_deployment_instructions()` `async` ¶

`build_upload_payload(dataset_id, dataset_version, metadata, task_type, leaderboard_json)` ¶

`collect_missing_files(df, images_dir, filename_col, label_col)` ¶

`convert_leaderboard_safely(optuna_result)` ¶

`download_dataset(download_url, workdir, original_filename)` ¶

`extract_and_locate_dataset(zip_path, workdir)` ¶

`fetch_dataset_metadata(autodw_base, user_id, dataset_id, dataset_version)` ¶

`get_num_params_if_available(repo_id, revision=None)` ¶

`normalize_dataframe_filenames(df, filename_column, csv_path)` ¶

`resolve_download_url(autodw_base, user_id, dataset_id, dataset_version, metadata, split)` ¶

`resolve_images_root(images_dir)` ¶

`search_hf_for_pytorch_models_with_estimated_parameters(filter='image-classification', limit=3, sort='downloads')` ¶

`serialize_and_zip_model(workdir)` ¶

`sort_models_by_size(models, size_tier)` ¶

`train_automl(csv_path, images_dir, filename_column, label_column, time_budget, model_size, workdir, task_type='image_classification')` `async` ¶

`train_automl_multimodal(csv_path, images_dir, filename_column, label_column, auxiliary_columns, time_budget, model_size, workdir)` `async` ¶

`upload_model(upload_url, zip_path, payload, task_id)` ¶

`validate_multimodal_inputs(csv_path, images_dir, filename_column, label_column, exclude_columns=None)` ¶

`validate_vision_inputs(csv_path, images_dir, filename_column, label_column, task_type='image_classification')` ¶

`vision_data_instructions()` ¶

`load_task_config(task_type)` ¶

`AudioClassificationDataModule` ¶

`CausalLMDataModule` ¶

`ImageClassificationDataModule` ¶

`setup()` ¶

`ImageSegmentationDataModule` ¶

`KeypointDetectionDataModule` ¶

`MaskedLMDataModule` ¶

`MultimodalClassificationDataModule` ¶

`ObjectDetectionDataModule` ¶

`QuestionAnsweringDataModule` ¶

`Seq2SeqLMDataModule` ¶

`SequenceClassificationDataModule` ¶

`VideoClassificationDataModule` ¶

`CausalLMFromCSVDataset` ¶

`ImageClassificationFromCSVDataset` ¶

`__len__()` ¶

`MultimodalClassificationDataset` ¶

`QuestionAnsweringFromCSVDataset` ¶

`Seq2SeqFromCSVDataset` ¶

`TextClassificationFromCSVDataset` ¶

`AudioClassificationModel` ¶

`CausalLMModel` ¶

`forward(input_ids, attention_mask=None, labels=None)` ¶

`ImageClassificationModel` ¶

`ImageSegmentationModel` ¶

`forward(pixel_values, labels=None)` ¶

`KeypointDetectionModel` ¶

`forward(pixel_values, labels=None)` ¶

`MaskedLMModel` ¶

`forward(input_ids, attention_mask=None, labels=None)` ¶

`MultimodalClassificationModel` ¶

`ObjectDetectionModel` ¶

`forward(pixel_values, labels=None)` ¶

`QuestionAnsweringModel` ¶

`forward(input_ids, attention_mask=None, start_positions=None, end_positions=None)` ¶

`Seq2SeqLMModel` ¶

`forward(input_ids, attention_mask=None, decoder_input_ids=None, labels=None)` ¶

`SequenceClassificationModel` ¶

`VideoClassificationModel` ¶

`EarlyStopping` ¶

`on_epoch_end(trainer, epoch, logs)` ¶