Skip to content

Tabular AutoML

FastAPI app init for the tabular AutoML service.

TabularSupervisedClassificationTask

Bases: TabularTask

Tabular classification task configuration.

Typical use-cases: churn prediction, loan approval, disease type, etc.

Source code in app/tabular_automl/models.py
25
26
27
28
29
30
31
class TabularSupervisedClassificationTask(TabularTask):
    """Configuration for a supervised tabular classification task.

    Typical use-cases: churn prediction, loan approval, disease type, etc.
    """

    # Discriminator consumed by the service when routing tasks to a trainer.
    task_type: str = "tabular_classification"

TabularSupervisedRegressionTask

Bases: TabularTask

Tabular regression task configuration.

Predicts continuous numeric values (e.g., price, salary, demand).

Source code in app/tabular_automl/models.py
34
35
36
37
38
39
40
class TabularSupervisedRegressionTask(TabularTask):
    """Configuration for a supervised tabular regression task.

    Predicts continuous numeric values (e.g., price, salary, demand).
    """

    # Discriminator consumed by the service when routing tasks to a trainer.
    task_type: str = "tabular_regression"

TabularSupervisedTimeSeriesTask

Bases: TabularTask

Time-series forecasting task configuration for tabular data.

Source code in app/tabular_automl/models.py
43
44
45
46
47
class TabularSupervisedTimeSeriesTask(TabularTask):
    """Configuration for a time-series forecasting task on tabular data."""

    # Discriminator consumed by the service when routing tasks to a trainer.
    task_type: str = "tabular_time_series"
    # Name of the column holding observation timestamps.
    time_stamp_col: str = "timestamp"

TabularTask

Bases: BaseModel

Base Pydantic model describing common tabular task inputs.

Source code in app/tabular_automl/models.py
13
14
15
16
17
18
19
20
21
22
class TabularTask(BaseModel):
    """Base Pydantic model describing common tabular task inputs.

    Attributes:
        target_feature: Name of the label column to predict.
        time_stamp_col: Optional name of the timestamp column.
        train_file_path: Path to the training table.
        test_file_path: Optional path to a held-out test table.
    """

    target_feature: str
    # FIX: this field holds a *column name*, not a DataFrame. The previous
    # ``pd.DataFrame | None`` annotation contradicted the time-series subclass,
    # which overrides the field with ``str = "timestamp"``.
    time_stamp_col: str | None = None
    train_file_path: Path
    test_file_path: Path | None = None

    class Config:
        # Kept for backward compatibility with callers that rely on relaxed
        # validation of non-pydantic types (e.g. pathlib.Path on older
        # pydantic versions).
        arbitrary_types_allowed: bool = True

AutoMLTrainer

Wrapper around AutoGluon Tabular training routines.

Source code in app/tabular_automl/modules.py
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
class AutoMLTrainer:
    """Wrapper around AutoGluon Tabular training routines.

    Handles input validation, train/test splitting, AutoGluon dataset
    construction, model fitting, deployment cloning, and leaderboard
    generation.
    """

    def __init__(
        self,
        save_model_path: Path,
        DatasetClass=TabularDataset,
        PredictorClass=TabularPredictor,
    ):
        """Initialize the trainer.

        Args:
            save_model_path: Directory where AutoGluon will save artifacts.
            DatasetClass: Injectable dataset factory (test seam).
            PredictorClass: Injectable predictor factory (test seam).

        Raises:
            ValueError: If save_model_path is None/empty, or exists but is
                not a directory.
        """
        # FIX: the old check (`save_model_path == ""`) only caught an empty
        # *string*; a None value crashed below in Path(None) with an opaque
        # TypeError even though the message promised None handling. Path
        # objects are always truthy, so `not save_model_path` rejects exactly
        # None and "".
        if not save_model_path:
            raise ValueError("save_model_path cannot be None or empty")

        self.save_model_path: Path = Path(save_model_path)

        if self.save_model_path.exists() and not self.save_model_path.is_dir():
            raise ValueError(
                f"save_model_path must be a directory, got: {self.save_model_path}"
            )

        self.DatasetClass = DatasetClass
        self.PredictorClass = PredictorClass
        logger.debug(f"Automl trainer, model path {self.save_model_path}")

    def train(
        self,
        train_df: pd.DataFrame,
        test_df: pd.DataFrame | None,
        target_column: str,
        time_limit: int,
    ) -> tuple[pd.DataFrame, TabularPredictor]:
        """Train AutoGluon Tabular and return leaderboard or error.

        Args:
            train_df: Training data containing ``target_column``.
            test_df: Optional held-out data; a split of ``train_df`` is
                created when None.
            target_column: Label column name; must exist in the data.
            time_limit: Training budget in seconds (positive int).

        Returns:
            Tuple of (leaderboard DataFrame, deployment-optimized predictor).

        Raises:
            ValueError: On invalid inputs or split/dataset failures.
            RuntimeError: If fitting, cloning, or leaderboard generation
                fails.
        """

        if train_df is None or train_df.empty:
            raise ValueError("train_df cannot be None or empty")

        if not target_column or not isinstance(target_column, str):
            raise ValueError("target_column must be a non-empty string")

        if target_column not in train_df.columns:
            raise ValueError(
                f"target_column '{target_column}' not found in train_df columns: {train_df.columns.tolist()}"
            )

        if test_df is not None and target_column not in test_df.columns:
            raise ValueError(
                f"target_column '{target_column}' not found in test_df columns: {test_df.columns.tolist()}"
            )

        if not isinstance(time_limit, int) or time_limit <= 0:
            raise ValueError(f"time_limit must be a positive integer, got {time_limit}")

        try:
            final_train_df, final_test_df = self.train_test_split(
                test_df=test_df, train_df=train_df
            )
        except Exception as e:
            logger.error(f"Failed to split train/test data: {e}")
            raise ValueError(f"Train/test split failed: {e}") from e

        try:
            train_dataset = self.DatasetClass(final_train_df)
            test_dataset = self.DatasetClass(final_test_df)
        except Exception as e:
            logger.error(f"Failed to create TabularDataset: {e}")
            raise ValueError(f"Dataset creation failed: {e}") from e

        try:
            predictor = self.PredictorClass(
                label=target_column, path=str(self.save_model_path)
            ).fit(train_data=train_dataset, time_limit=time_limit)
        except Exception as e:
            logger.error(f"AutoGluon training failed: {e}")
            raise RuntimeError(f"Model training failed: {e}") from e

        try:
            # FIX: `self.save_model_path / "-clone-opt"` previously cloned the
            # predictor into a *subdirectory* of the directory being cloned.
            # AutoGluon's documented convention is a sibling directory named
            # "<path>-clone-opt".
            save_path_clone_opt = Path(f"{self.save_model_path}-clone-opt")
            path_clone_opt = predictor.clone_for_deployment(
                path=str(save_path_clone_opt)
            )
            predictor_clone_opt = self.PredictorClass.load(path=str(path_clone_opt))
        except Exception as e:
            logger.error(f"Failed to clone model for deployment: {e}")
            raise RuntimeError(f"Model cloning failed: {e}") from e

        try:
            # Leaderboard is scored on the held-out test split; the returned
            # predictor is the deployment-optimized clone.
            leaderboard = predictor.leaderboard(test_dataset)
            return leaderboard, predictor_clone_opt
        except Exception as e:
            logger.error(f"Failed to generate leaderboard: {e}")
            raise RuntimeError(f"Leaderboard generation failed: {e}") from e

    def train_test_split(
        self, test_df: pd.DataFrame | None, train_df: pd.DataFrame | None = None
    ) -> tuple[pd.DataFrame, pd.DataFrame]:
        """Return (train, test) frames, splitting train_df when test_df is None.

        NOTE(review): DEFAULT_TABULAR_TRAIN_TEST_SPLIT_SIZE is used here as
        the *train* fraction (the sampled rows become the training set) —
        confirm the constant is defined with that meaning.
        """

        if train_df is None:
            raise ValueError("train_df cannot be None")

        if train_df.empty:
            raise ValueError("train_df cannot be empty")

        if (
            DEFAULT_TABULAR_TRAIN_TEST_SPLIT_SIZE <= 0
            or DEFAULT_TABULAR_TRAIN_TEST_SPLIT_SIZE >= 1
        ):
            raise ValueError(
                f"DEFAULT_TABULAR_TRAIN_TEST_SPLIT_SIZE must be between 0 and 1 (exclusive), got {DEFAULT_TABULAR_TRAIN_TEST_SPLIT_SIZE}"
            )

        if test_df is None:
            logger.debug("Test dataset not found, creating split")
            try:
                # Fixed random_state keeps the split reproducible across runs.
                final_train_df = train_df.sample(
                    frac=DEFAULT_TABULAR_TRAIN_TEST_SPLIT_SIZE, random_state=42
                )
                final_test_df = train_df.drop(index=final_train_df.index.tolist())
            except Exception as e:
                logger.error(f"Failed to create train/test split: {e}")
                raise ValueError(f"Train/test split failed: {e}") from e
        else:
            logger.debug("Test dataset found")

            if test_df.empty:
                raise ValueError("test_df cannot be empty")

            final_train_df = train_df
            final_test_df = test_df

        if final_train_df.empty:
            raise ValueError("Final training DataFrame is empty after split")

        if final_test_df.empty:
            raise ValueError("Final test DataFrame is empty after split")

        return final_train_df, final_test_df

train(train_df, test_df, target_column, time_limit)

Train AutoGluon Tabular and return leaderboard or error.

Source code in app/tabular_automl/modules.py
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
def train(
    self,
    train_df: pd.DataFrame,
    test_df: pd.DataFrame | None,
    target_column: str,
    time_limit: int,
) -> tuple[pd.DataFrame, TabularPredictor]:
    """Train AutoGluon Tabular and return leaderboard or error.

    Validates inputs, splits the data (when no test set is given),
    fits a TabularPredictor, clones it for deployment, and scores a
    leaderboard on the test split.

    Parameters:
        train_df: Training data; must contain ``target_column``.
        test_df: Optional held-out data; a split of ``train_df`` is
            created when None (via ``self.train_test_split``).
        target_column: Label column name.
        time_limit: Training budget in seconds (positive int).

    Returns:
        (leaderboard DataFrame, deployment-optimized predictor).

    Raises:
        ValueError: On invalid inputs, split failure, or dataset
            creation failure.
        RuntimeError: If fitting, cloning, or leaderboard generation
            fails.
    """

    # --- input validation -------------------------------------------------
    if train_df is None or train_df.empty:
        raise ValueError("train_df cannot be None or empty")

    if not target_column or not isinstance(target_column, str):
        raise ValueError("target_column must be a non-empty string")

    if target_column not in train_df.columns:
        raise ValueError(
            f"target_column '{target_column}' not found in train_df columns: {train_df.columns.tolist()}"
        )

    if test_df is not None and target_column not in test_df.columns:
        raise ValueError(
            f"target_column '{target_column}' not found in test_df columns: {test_df.columns.tolist()}"
        )

    if not isinstance(time_limit, int) or time_limit <= 0:
        raise ValueError(f"time_limit must be a positive integer, got {time_limit}")

    # --- obtain train/test frames (split created when test_df is None) ----
    try:
        final_train_df, final_test_df = self.train_test_split(
            test_df=test_df, train_df=train_df
        )
    except Exception as e:
        logger.error(f"Failed to split train/test data: {e}")
        raise ValueError(f"Train/test split failed: {e}") from e

    # --- wrap frames in the injected AutoGluon dataset type ---------------
    try:
        train_dataset = self.DatasetClass(final_train_df)
        test_dataset = self.DatasetClass(final_test_df)
    except Exception as e:
        logger.error(f"Failed to create TabularDataset: {e}")
        raise ValueError(f"Dataset creation failed: {e}") from e

    # --- fit the predictor, saving artifacts under save_model_path --------
    try:
        predictor = self.PredictorClass(
            label=target_column, path=str(self.save_model_path)
        ).fit(train_data=train_dataset, time_limit=time_limit)
    except Exception as e:
        logger.error(f"AutoGluon training failed: {e}")
        raise RuntimeError(f"Model training failed: {e}") from e

    # --- clone a deployment-optimized copy and reload it ------------------
    # NOTE(review): the "/" join places the clone *inside* the model
    # directory (a child named "-clone-opt"); AutoGluon examples use a
    # sibling "<path>-clone-opt" — confirm this location is intended.
    try:
        save_path_clone_opt = self.save_model_path / "-clone-opt"
        path_clone_opt = predictor.clone_for_deployment(
            path=str(save_path_clone_opt)
        )
        predictor_clone_opt = self.PredictorClass.load(path=str(path_clone_opt))
    except Exception as e:
        logger.error(f"Failed to clone model for deployment: {e}")
        raise RuntimeError(f"Model cloning failed: {e}") from e

    # --- score the leaderboard on the held-out split ----------------------
    try:
        leaderboard = predictor.leaderboard(test_dataset)
        return leaderboard, predictor_clone_opt
    except Exception as e:
        logger.error(f"Failed to generate leaderboard: {e}")
        raise RuntimeError(f"Leaderboard generation failed: {e}") from e

build_upload_payload(dataset_id, dataset_version, metadata, task_type, leaderboard_json)

Return (model_id, form_data_dict) for the AutoDW upload request.

Source code in app/tabular_automl/services.py
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
def build_upload_payload(
    dataset_id: str,
    dataset_version: str | None,
    metadata: dict,
    task_type: str,
    leaderboard_json: list | dict,
) -> tuple[str, dict]:
    """Return (model_id, form_data_dict) for the AutoDW upload request.

    Args:
        dataset_id: Source dataset identifier (non-empty string).
        dataset_version: Explicit dataset version; falls back to
            ``metadata["version"]``, then ``"v1"``.
        metadata: Dataset metadata dict fetched from AutoDW.
        task_type: Tabular task type string (non-empty).
        leaderboard_json: JSON-serializable leaderboard payload.

    Returns:
        Tuple of the generated model id and the form-data dict.

    Raises:
        ValueError: If dataset_id or task_type is missing or not a string.
        RuntimeError: If the model id or leaderboard cannot be produced.
    """
    import time  # local import: only needed for the epoch-based model id

    if not dataset_id or not isinstance(dataset_id, str):
        raise ValueError("dataset_id must be a non-empty string")

    if not task_type or not isinstance(task_type, str):
        raise ValueError("task_type must be a non-empty string")

    try:
        # FIX: the previous `datetime.utcnow().timestamp()` produced a wrong
        # epoch on non-UTC hosts — utcnow() returns a *naive* datetime and
        # .timestamp() interprets naive datetimes as local time (utcnow() is
        # also deprecated since Python 3.12). time.time() is the
        # timezone-independent Unix epoch.
        model_id = f"automl_{dataset_id}_{int(time.time())}"
    except Exception as e:
        logger.error(f"Failed to generate model_id: {e}")
        raise RuntimeError(f"Failed to generate model_id: {e}") from e

    try:
        leaderboard_str = json.dumps(leaderboard_json)
    except TypeError as e:
        logger.error(f"Failed to serialize leaderboard_json: {e}")
        raise RuntimeError(f"Failed to serialize leaderboard: {e}") from e

    # Resolve the dataset version: explicit argument > metadata > "v1".
    version = dataset_version or metadata.get("version", "v1")
    if not isinstance(version, str):
        version = "v1"

    data = {
        "model_id": model_id,
        "name": f"AutoML Model - {model_id}",
        "description": "AutoML trained model for tabular data",
        "framework": "sklearn",
        "model_type": task_type,
        "training_dataset": str(dataset_id),
        "training_dataset_version": version,
        "leaderboard": leaderboard_str,
        "deployment_instructions": deployment_instructions(),
    }

    # The upload endpoint expects a string; blank out anything else.
    if not isinstance(data["deployment_instructions"], str):
        data["deployment_instructions"] = ""

    return model_id, data

create_session_directory(upload_root=UPLOAD_ROOT)

Create and return a new session id and directory path.

Source code in app/tabular_automl/services.py
36
37
38
39
40
41
42
43
44
45
46
47
48
49
def create_session_directory(upload_root: Path = UPLOAD_ROOT) -> tuple[str, Path]:
    """Create and return a new session id and directory path."""

    # A fresh UUID keeps concurrent upload sessions isolated from each other.
    new_session_id = str(uuid.uuid4())
    new_session_dir = upload_root / new_session_id

    try:
        new_session_dir.mkdir(parents=True, exist_ok=True)
    except Exception as exc:
        logging.error(f"Failed to create session directory {new_session_dir}: {exc}")
        raise RuntimeError(f"Failed to create session directory: {exc}") from exc

    logging.debug(f"Session directory created at {new_session_dir}")
    return new_session_id, new_session_dir

download_dataset(download_url, dest_path)

Stream-download a dataset file to dest_path.

Source code in app/tabular_automl/services.py
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
def download_dataset(download_url: str, dest_path: Path) -> None:
    """Stream-download a dataset file to dest_path."""

    if not download_url or not isinstance(download_url, str):
        raise ValueError("download_url must be a non-empty string")

    # Make sure the destination directory exists before writing.
    if dest_path.parent and not dest_path.parent.exists():
        dest_path.parent.mkdir(parents=True, exist_ok=True)

    # Stream in 8 KiB chunks so large files never sit fully in memory.
    # Distinct except clauses replace the old isinstance dispatch; order
    # matters (Timeout before ConnectionError so ConnectTimeout maps the
    # same way as before, and RequestException before OSError since it
    # inherits IOError).
    try:
        with requests.get(download_url, stream=True, timeout=30) as resp:
            resp.raise_for_status()
            with open(dest_path, "wb") as out_file:
                for chunk in resp.iter_content(8192):
                    out_file.write(chunk)
    except requests.Timeout as exc:
        logger.error(f"Timeout downloading from {download_url}")
        raise RuntimeError("Timeout downloading dataset") from exc
    except requests.ConnectionError as exc:
        logger.error(f"Connection error downloading dataset: {exc}")
        raise RuntimeError(f"Failed to connect to download URL: {exc}") from exc
    except requests.HTTPError as exc:
        logger.error(f"HTTP error downloading dataset: {exc}")
        raise RuntimeError(f"HTTP error downloading dataset: {exc}") from exc
    except requests.RequestException as exc:
        logger.error(f"Request error downloading dataset: {exc}")
        raise RuntimeError(f"Request error downloading dataset: {exc}") from exc
    except OSError as exc:
        logger.error(f"Failed to write dataset to {dest_path}: {exc}")
        raise RuntimeError(f"Failed to save dataset file: {exc}") from exc
    except Exception as exc:
        logger.error(f"Unexpected error downloading dataset: {exc}")
        raise RuntimeError(f"Unexpected error downloading dataset: {exc}") from exc

    # Sanity-check the result on disk before declaring success.
    if not dest_path.exists():
        raise RuntimeError(f"Download completed but file not created at {dest_path}")

    if dest_path.stat().st_size == 0:
        raise RuntimeError(f"Downloaded file is empty: {dest_path}")

    logger.info(f"Dataset saved to {dest_path}")

fetch_dataset_metadata(autodw_base, user_id, dataset_id, dataset_version)

Fetch and return dataset metadata from AutoDW.

Source code in app/tabular_automl/services.py
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
def fetch_dataset_metadata(
    autodw_base: str,
    user_id: str,
    dataset_id: str,
    dataset_version: str | None,
) -> dict:
    """Fetch and return dataset metadata from AutoDW."""

    # All three identifiers share the same validation rule.
    for arg_name, arg_value in (
        ("autodw_base", autodw_base),
        ("user_id", user_id),
        ("dataset_id", dataset_id),
    ):
        if not arg_value or not isinstance(arg_value, str):
            raise ValueError(f"{arg_name} must be a non-empty string")

    metadata_url = _build_metadata_url(
        autodw_base, user_id, dataset_id, dataset_version
    )
    logger.debug(f"Fetching dataset metadata: {metadata_url}")

    try:
        response = requests.get(metadata_url, timeout=15)
        response.raise_for_status()
    except requests.Timeout:
        logger.error(f"Timeout fetching metadata from {metadata_url}")
        raise RuntimeError("Timeout fetching dataset metadata from AutoDW") from None
    except requests.ConnectionError as exc:
        logger.error(f"Connection error fetching metadata: {exc}")
        raise RuntimeError(f"Failed to connect to AutoDW: {exc}") from exc
    except requests.HTTPError as exc:
        logger.error(f"HTTP error fetching metadata: {exc}")
        raise RuntimeError(f"AutoDW returned HTTP error: {exc}") from exc
    except Exception as exc:
        logger.error(f"Unexpected error fetching metadata: {exc}")
        raise RuntimeError(f"Unexpected error fetching metadata: {exc}") from exc

    try:
        metadata = response.json()
    except json.JSONDecodeError as exc:
        logger.error(f"Failed to parse JSON response from AutoDW: {exc}")
        raise RuntimeError(f"Invalid JSON response from AutoDW: {exc}") from exc

    # The caller expects a mapping; anything else is a contract violation.
    if isinstance(metadata, dict):
        return metadata

    logger.error(f"Metadata is not a dict: {type(metadata)}")
    raise RuntimeError(
        f"Invalid metadata format: expected dict, got {type(metadata)}"
    )

load_table(file_path)

Load a table file into a DataFrame based on file extension.

Source code in app/tabular_automl/services.py
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
def load_table(file_path: Path) -> pd.DataFrame:
    """Load a table file into a DataFrame based on file extension."""
    if not file_path.exists():
        raise FileNotFoundError(f"File not found: {file_path}")

    if not file_path.is_file():
        raise ValueError(f"Path is not a file: {file_path}")

    # Dispatch table: suffix -> (debug message, reader callable).
    readers = {
        ".csv": ("csv file loaded", pd.read_csv),
        ".tsv": ("tsv file loaded", lambda p: pd.read_csv(p, sep="\t")),
        ".xls": ("excel file loaded", pd.read_excel),
        ".xlsx": ("excel file loaded", pd.read_excel),
        ".xlsm": ("excel file loaded", pd.read_excel),
        ".xlsb": ("excel file loaded", pd.read_excel),
        ".parquet": ("Parquet file loaded", pd.read_parquet),
        ".pq": ("Parquet file loaded", pd.read_parquet),
        ".json": ("Json file loaded", pd.read_json),
    }

    # Unknown suffixes fall back to CSV (no debug message), matching the
    # service's historical behavior.
    message, reader = readers.get(file_path.suffix.lower(), (None, pd.read_csv))

    try:
        if message is not None:
            logging.debug(message)
        return reader(file_path)
    except pd.errors.EmptyDataError:
        logging.error(f"File is empty: {file_path}")
        raise ValueError(f"File is empty: {file_path}") from None
    except pd.errors.ParserError as exc:
        logging.error(f"Failed to parse file {file_path}: {exc}")
        raise ValueError(f"Failed to parse file: {exc}") from exc
    except Exception as exc:
        logging.error(f"Unexpected error loading table from {file_path}: {exc}")
        raise RuntimeError(f"Failed to load table: {exc}") from exc

resolve_download_url(autodw_base, user_id, dataset_id, dataset_version, metadata, dataset_split)

Determine the correct dataset download URL, accounting for splits.

Source code in app/tabular_automl/services.py
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
def resolve_download_url(
    autodw_base: str,
    user_id: str,
    dataset_id: str,
    dataset_version: str | None,
    metadata: dict,
    dataset_split: str | None,
) -> str:
    """Determine the correct dataset download URL, accounting for splits."""
    download_url = (
        _build_metadata_url(autodw_base, user_id, dataset_id, dataset_version)
        + "/download"
    )

    # A split is only honored when the dataset actually declares splits AND
    # the requested split name is one we recognize.
    has_split = bool(metadata.get("custom_metadata", {}).get("split"))
    effective_split = None
    if has_split and dataset_split in ("train", "test", "drift"):
        effective_split = dataset_split

    if effective_split is None:
        # Requested a split on a dataset without splits: warn, fall back.
        if dataset_split and not has_split:
            logger.warning(
                f"dataset_split='{dataset_split}' was requested but dataset has no splits; "
                "downloading full dataset."
            )
        logger.debug(f"Downloading full dataset file: {download_url}")
        return download_url

    download_url = f"{download_url}?split={effective_split}"
    logger.info(
        f"Dataset has splits; downloading '{effective_split}' split from: {download_url}"
    )
    return download_url

save_upload(file, destination)

Persist an uploaded file to the given destination path.

Source code in app/tabular_automl/services.py
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
def save_upload(file: UploadFile, destination: Path) -> None:
    """Persist an uploaded file to the given destination path."""
    # Duck-typed check: anything exposing a readable ``.file`` works.
    if not hasattr(file, "file"):
        raise ValueError("file must have a 'file' attribute")

    if destination.parent:
        destination.parent.mkdir(parents=True, exist_ok=True)

    try:
        # Stream the upload straight to disk without buffering it in memory.
        with destination.open("wb") as sink:
            shutil.copyfileobj(file.file, sink)
        logging.debug(f"File saved to {destination}")
    except IOError as exc:
        logging.error(f"Failed to write file to {destination}: {exc}")
        raise RuntimeError(f"Failed to save uploaded file: {exc}") from exc
    except Exception as exc:
        logging.error(f"Unexpected error saving file to {destination}: {exc}")
        raise RuntimeError(f"Unexpected error saving file: {exc}") from exc

serialize_and_zip_predictor(predictor, save_model_path, tmp_path)

Pickle the predictor and zip the model directory. Returns the zip path.

Source code in app/tabular_automl/services.py
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
def serialize_and_zip_predictor(
    predictor, save_model_path: Path, tmp_path: Path
) -> Path:
    """Pickle the predictor and zip the model directory. Returns the zip path."""

    if predictor is None:
        raise ValueError("predictor cannot be None")

    if not save_model_path.exists():
        raise ValueError(f"save_model_path does not exist: {save_model_path}")

    predictor_path = save_model_path / "predictor.pkl"

    # Serialize the predictor alongside the AutoGluon artifacts.
    try:
        with predictor_path.open("wb") as pkl_file:
            pickle.dump(predictor, pkl_file)
        logger.debug(f"Predictor serialized to {predictor_path}")
    except IOError as exc:
        logger.error(f"Failed to write predictor pickle: {exc}")
        raise RuntimeError(f"Failed to serialize predictor: {exc}") from exc
    except pickle.PicklingError as exc:
        logger.error(f"Failed to pickle predictor: {exc}")
        raise RuntimeError(f"Failed to pickle predictor: {exc}") from exc

    # Best-effort: bundle deployment instructions; any failure is non-fatal.
    try:
        instructions_path = save_model_path / "tabular_deployment_instructions.md"
        with instructions_path.open("w") as md_file:
            md_file.write(deployment_instructions())
        logger.debug(f"Deployment instructions written to {instructions_path}")
    except Exception as exc:
        logger.debug(f"No deployment_instructions found, {exc}")

    zip_path = tmp_path / "automl_predictor.zip"

    # shutil.make_archive appends ".zip" itself, so pass the base name only.
    try:
        base_name = str(zip_path).replace(".zip", "")
        shutil.make_archive(
            base_name=base_name,
            format="zip",
            root_dir=save_model_path,
        )
        logger.debug(f"Model zipped to {zip_path}")
    except Exception as exc:
        logger.error(f"Failed to create zip archive: {exc}")
        raise RuntimeError(f"Failed to zip model: {exc}") from exc

    if zip_path.exists():
        return zip_path

    raise RuntimeError(f"Zip file was not created at {zip_path}")

train_automl(dataset_path, save_model_path, target_column_name, task_type, time_budget)

Train an AutoML model and return (leaderboard, predictor).

Source code in app/tabular_automl/services.py
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
def train_automl(
    dataset_path: Path,
    save_model_path: Path,
    target_column_name: str,
    task_type: str,
    time_budget: int,
):
    """Train an AutoML model and return (leaderboard, predictor)."""

    if not target_column_name or not isinstance(target_column_name, str):
        raise ValueError("target_column_name must be a non-empty string")

    if not isinstance(time_budget, int) or time_budget <= 0:
        raise ValueError("time_budget must be a positive integer")

    # Ensure the model output directory exists before AutoGluon writes to it.
    try:
        os.makedirs(save_model_path, exist_ok=True)
    except OSError as exc:
        logger.error(f"Failed to create model directory {save_model_path}: {exc}")
        raise RuntimeError(f"Failed to create model directory: {exc}") from exc

    try:
        automl_trainer = AutoMLTrainer(save_model_path=save_model_path)
    except ValueError as exc:
        logger.error(f"Failed to initialize AutoML trainer: {exc}")
        raise RuntimeError(f"Failed to initialize trainer: {exc}") from exc

    try:
        training_frame = load_table(dataset_path)
    except Exception as exc:
        logger.error(f"Failed to load training data: {exc}")
        raise RuntimeError(f"Failed to load training data: {exc}") from exc

    # Validation and runtime errors from the trainer are logged then
    # re-raised unchanged; anything unexpected is wrapped in RuntimeError.
    try:
        return automl_trainer.train(
            train_df=training_frame,
            test_df=None,
            target_column=target_column_name,
            time_limit=int(time_budget),
        )
    except ValueError as exc:
        logger.error(f"Training validation error: {exc}")
        raise
    except RuntimeError as exc:
        logger.error(f"Training runtime error: {exc}")
        raise
    except Exception as exc:
        logger.error(f"Unexpected error during training: {exc}")
        raise RuntimeError(f"Unexpected error during training: {exc}") from exc

upload_model(upload_url, zip_path, payload, task_id)

Upload the zipped model to AutoDW. Returns the raw response.

Source code in app/tabular_automl/services.py
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
def upload_model(
    upload_url: str,
    zip_path: Path,
    payload: dict,
    task_id: str | None,
) -> requests.Response:
    """Upload the zipped model to AutoDW. Returns the raw response."""

    if not upload_url or not isinstance(upload_url, str):
        raise ValueError("upload_url must be a non-empty string")

    if not zip_path.exists():
        raise FileNotFoundError(f"Zip file not found: {zip_path}")

    if not isinstance(payload, dict) or not payload:
        raise ValueError("payload must be a non-empty dict")

    # Forward the task id so AutoDW can correlate the upload with its task.
    headers = {"X-Task-ID": task_id} if task_id else {}
    if task_id:
        logger.debug(f"Including X-Task-ID header: {task_id}")

    # Distinct except clauses replace the old isinstance dispatch; order
    # matters (Timeout before ConnectionError so ConnectTimeout maps the
    # same way as before, and RequestException before OSError since it
    # inherits IOError).
    try:
        with zip_path.open("rb") as archive:
            files = {"file": (zip_path.name, archive, "application/octet-stream")}
            logger.debug(f"Uploading model to {upload_url}")
            return requests.post(
                upload_url, headers=headers, files=files, data=payload, timeout=120
            )
    except requests.Timeout as exc:
        logger.error(f"Timeout uploading to {upload_url}")
        raise RuntimeError("Timeout uploading model to AutoDW") from exc
    except requests.ConnectionError as exc:
        logger.error(f"Connection error uploading model: {exc}")
        raise RuntimeError(f"Failed to connect to upload URL: {exc}") from exc
    except requests.HTTPError as exc:
        logger.error(f"HTTP error uploading model: {exc}")
        raise RuntimeError(f"HTTP error uploading model: {exc}") from exc
    except requests.RequestException as exc:
        logger.error(f"Request error uploading model: {exc}")
        raise RuntimeError(f"Request error uploading model: {exc}") from exc
    except OSError as exc:
        logger.error(f"Failed to read zip file {zip_path}: {exc}")
        raise RuntimeError(f"Failed to read zip file: {exc}") from exc
    except Exception as exc:
        logger.error(f"Unexpected error uploading model: {exc}")
        raise RuntimeError(f"Unexpected error uploading model: {exc}") from exc

validate_tabular_inputs(train_path, target_column_name, time_stamp_column_name=None, task_type='tabular_classification')

Validate required columns and task type for tabular training.

Source code in app/tabular_automl/services.py
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
def validate_tabular_inputs(
    train_path: Path,
    target_column_name: str,
    time_stamp_column_name: str | None = None,
    task_type: str = "tabular_classification",
) -> str | None:
    """Validate required columns and task type for tabular training.

    Returns an error message string on failure, or None when valid.
    """

    if not train_path.exists():
        message = f"Training file not found: {train_path}"
        logger.error(message)
        return message

    if not target_column_name or not isinstance(target_column_name, str):
        message = "target_column_name must be a non-empty string"
        logger.error(message)
        return message

    if task_type not in SUPPORTED_TABULAR_TASK_TYPES:
        logger.error(f"Invalid task type {task_type}")
        return f"Invalid task_type '{task_type}'. Must be one of: {SUPPORTED_TABULAR_TASK_TYPES}"

    try:
        train_df = load_table(train_path)
    except FileNotFoundError as exc:
        message = f"Training file not found: {exc}"
        logging.error(message)
        return message
    except ValueError as exc:
        message = f"Could not read training data: {exc}"
        logging.error(message)
        return message
    except Exception as exc:
        message = f"Unexpected error reading training data: {exc}"
        logging.error(message)
        return message

    if train_df.empty:
        message = "Training dataframe is empty"
        logger.error(message)
        return message

    columns = train_df.columns.tolist()

    if target_column_name not in columns:
        message = (
            f"Target column '{target_column_name}' not found. "
            f"Available columns: {', '.join(columns)}"
        )
        logger.error(message)
        return message

    if time_stamp_column_name and time_stamp_column_name not in columns:
        message = (
            f"Timestamp column '{time_stamp_column_name}' not found. "
            f"Available columns: {', '.join(columns)}"
        )
        logger.error(message)
        return message

    # All checks passed.
    return None

Route definitions for the tabular AutoML service.

show_deployment_instructions() async

Show deployment instructions from a template

Source code in app/tabular_automl/router.py
33
34
35
36
37
38
39
40
41
42
43
44
@router.post("/deployment_instructions/")
async def show_deployment_instructions() -> JSONResponse:
    """Show deployment instructions from a template"""
    try:
        # Render the instructions template and return it verbatim.
        return JSONResponse(
            status_code=200,
            content={"instructions": deployment_instructions()},
        )
    except Exception as e:
        # Template rendering (or response serialization) failed — report it.
        logger.exception(
            "Unexpected error in finding deployment instructions in tabular"
        )
        return JSONResponse(status_code=500, content={"error": str(e)})