Source code for modep_client.automl

import glob
import io
import logging
import tempfile
import zipfile
from typing import List, Union

import pandas as pd

from modep_client.client import Client
from modep_client.tasks import INCOMPLETE_STATUSES, BaseTask

logger = logging.getLogger(__name__)


[docs]class Frameworks:
    def __init__(self, client: Client):
        """
        Initialize the Framworks class

        :param client: A :class:`modep_client.client.Client` object
        """
        self.client = client

[docs]    def info(self):
        """
        Get info about the AutoML frameworks available through the API

        :return: A :class:`pandas.DataFrame` with one row for each framework
        """
        url = self.client.url + "frameworks/tabular/info"
        resp = self.client.sess.get(url, headers=self.client.auth_header())
        if resp.ok:
            js = resp.json()
            df = pd.DataFrame(js)
            if len(df) > 0:
                # keep column order same as json
                df = df[list(js[0].keys())].set_index("framework_name")
            return df
        else:
            self.client.response_exception(resp)

[docs]    def list(self):
        """
        List all AutoML framework training runs

        :return: A :class:`pandas.DataFrame` with one row for each training run
        """
        url = self.client.url + "frameworks/tabular"
        resp = self.client.sess.get(url, headers=self.client.auth_header())
        if resp.ok:
            js = resp.json()
            df = pd.DataFrame(js)
            if len(df) > 0:
                # keep column order same as json
                df = df[list(js[0].keys())].set_index("id")
                df = df.sort_values(by="created", ascending=False)
            return df
        else:
            self.client.response_exception(resp)

[docs]    def train(
        self,
        framework_name: str,
        train_ids: Union[str, List[str]],
        test_ids: Union[str, List[str]],
        target: str,
        max_runtime_seconds: int,
    ):
        """
        Train an AutoML framework

        :param str framework_name: The name of the framework (ie. AutoGluon, AutoGluon_bestquality,
            autosklearn, autosklearn2, AutoWEKA, constantpredictor, DecisionTree, flaml, GAMA, H2OAutoML,
            hyperoptsklearn, mljarsupervised, mljarsupervised_compete, MLNet, RandomForest, TPOT, TunedRandomForest)
        :param train_ids: The id(s) of dataset(s) to train on (ie. `e1bc3d16b-6d67-43cd-af59-8d39d8cb2a02`)
        :param test_ids: The id(s) of dataset(s) to test on (ie. `1bc3d16b-6d67-43cd-af59-8d39d8cb2a02`)
        :param str target: The name of the target column in the training dataset(s)
        :param int max_runtime_seconds: The maximum amount of time in seconds to train per dataset(s)
        :return: A :class:`modep_client.tasks.BaseTask` object
        """

        url = self.client.url + "frameworks/tabular"

        train_ids = [train_ids] if isinstance(train_ids, str) else train_ids
        test_ids = [test_ids] if isinstance(test_ids, str) else test_ids

        data = dict(
            framework_name=framework_name,
            train_ids=train_ids,
            test_ids=test_ids,
            target=target,
            max_runtime_seconds=max_runtime_seconds,
            experiment_id="",
        )
        logger.info(data)

        resp = self.client.sess.post(url, json=data, headers=self.client.auth_header())
        if resp.ok:
            return BaseTask(resp.json(), self.get)
        else:
            self.client.response_exception(resp)

[docs]    def get(self, id: str):
        """
        Get an AutoML training run by id

        :param str id: The id of the training run
        :return: A dictionary containing the training run
        """
        url = self.client.url + "frameworks/tabular/" + str(id)
        resp = self.client.sess.get(url, headers=self.client.auth_header())
        if resp.ok:
            return resp.json()
        else:
            self.client.response_exception(resp)

[docs]    def stop(self, id):
        """
        Stop an AutoML training run

        :param str id: The id of the training run to stop
        :return: A dictionary containing the training run
        """
        url = self.client.url + f"frameworks/tabular/{id}/stop"
        resp = self.client.sess.get(url, headers=self.client.auth_header())
        if resp.ok:
            return resp.json()
        else:
            self.client.response_exception(resp)

[docs]    def delete(self, id):
        """
        Delete an AutoML training run

        :param str id: The id of the training run to delete
        :return: A dictionary containing info about the deletion
        """
        url = self.client.url + "frameworks/tabular/" + str(id)
        resp = self.client.sess.delete(url, headers=self.client.auth_header())
        if resp.ok:
            return resp.json()
        else:
            self.client.response_exception(resp)

[docs]    def predict(self, framework_id, dataset_id):
        """
        Start a job to get predictions from an AutoML framework on a new dataset

        :param str framework_id: The id of the framework to use
        :param str dataset_id: The id of the dataset to predict on
        :return: A :class:`modep_client.tasks.BaseTask` object
        """
        url = self.client.url + "frameworks/tabular/predict"

        data = dict(
            framework_id=framework_id,
            dataset_id=dataset_id,
        )

        resp = self.client.sess.post(url, json=data, headers=self.client.auth_header())
        if resp.ok:
            return BaseTask(resp.json(), self.get_predictions)
        else:
            self.client.response_exception(resp)

[docs]    def get_predictions(self, predictions_id):
        """
        Get the predictions created by an AutoML training or prediction job

        :param str predictions_id: The id of the predictions to get
        :return: A dictionary containing the predictions
        """
        url = self.client.url + f"frameworks/tabular/predictions/{predictions_id}"
        resp = self.client.sess.get(url, headers=self.client.auth_header())
        if resp.ok:
            return resp.json()
        else:
            self.client.response_exception(resp)

[docs]    def get_output(self, framework_id, target_dir=None):
        """
        Get the output files generated by the AutoML training run.

        :param str framework_id: The id of the training run
        :param str target_dir: The local directory to download the files to. If None,
            the files are downloaded to a temp directory.
        """
        url = self.client.url + f"frameworks/tabular/{framework_id}/output"
        resp = self.client.sess.get(url, headers=self.client.auth_header())

        if target_dir is None:
            target_dir = tempfile.NamedTemporaryFile(suffix=f"-modep-output").name

        if resp.ok:
            # download contents to directory
            z = zipfile.ZipFile(io.BytesIO(resp.content))
            z.extractall(path=target_dir)
            return target_dir
        else:
            self.client.response_exception(resp)

[docs]    def print_log(self, framework_id, target_dir=None):
        """
        Print the logs generated by the AutoML training run.

        :param str framework_id: The id of the training run
        :param str target_dir: The local directory to download the files to. If None,
            the files are downloaded to a temp directory.
        """
        target_dir = self.get_output(framework_id, target_dir=target_dir)

        logfile = glob.glob(target_dir + "/*/logs/*.full.log")
        if len(logfile) == 1:
            with open(logfile[0], "r") as f:
                logtxt = f.read()
            print(logtxt)
        else:
            raise Exception("Could not find log file")
        return logfile


[docs]class FrameworkFlights:
    def __init__(self, client: Client):
        """
        Initialize the FrameworkFlights class. A Flight is a set of AutoML
        frameworks trained on the same data for comparison purposes.

        :param client: A :class:`modep_client.client.Client` object
        """
        self.client = client

[docs]    def train(
        self,
        framework_names: List[str],
        train_ids: Union[str, List[str]],
        test_ids: Union[str, List[str]],
        target: str,
        max_runtime_seconds: int,
    ):
        """
        Start a job to train an AutoML framework flight.

        :param framework_names: A list of framework names to train. If empty, then all
            frameworks are trained. Use :class:`modep_client.frameworks.Frameworks.info()`
            to get a list of available frameworks. Available ones are AutoGluon, AutoGluon_bestquality,
            autosklearn, autosklearn2, AutoWEKA, constantpredictor, DecisionTree, flaml, GAMA, H2OAutoML,
            hyperoptsklearn, mljarsupervised, mljarsupervised_compete, MLNet, RandomForest, TPOT, TunedRandomForest.
        :type framework_names: str or list of str
        :param list train_ids: The ids of the dataset(s) to train on
        :param list test_ids: The ids of the dataset(s) to test on
        :param str target: The name of the target column
        :param int max_runtime_seconds: The maximum amount of time in seconds to train per dataset(s)
        :return: A :class:`modep_client.tasks.BaseTask` object
        """

        url = self.client.url + "frameworks/tabular/flight"

        # convert train_ids and test_ids to lists if they are not already
        train_ids = [train_ids] if isinstance(train_ids, str) else train_ids
        test_ids = [test_ids] if isinstance(test_ids, str) else test_ids

        data = dict(
            framework_names=framework_names,
            train_ids=train_ids,
            test_ids=test_ids,
            target=target,
            max_runtime_seconds=max_runtime_seconds,
        )

        resp = self.client.sess.post(url, json=data, headers=self.client.auth_header())
        if resp.ok:
            return BaseTask(resp.json(), self.get)
        else:
            self.client.response_exception(resp)

[docs]    def list(self):
        """
        List all flights

        :return: A :class:`pandas.DataFrame` containing the flights
        """
        url = self.client.url + "frameworks/tabular/flight"
        resp = self.client.sess.get(url, headers=self.client.auth_header())
        if resp.ok:
            js = resp.json()
            df = pd.DataFrame(js)
            if len(df) > 0:
                # keep column order same as json
                df = df[list(js[0].keys())].set_index("id")
                df = df.sort_values(by="created", ascending=False)
            return df
        else:
            self.client.response_exception(resp)

[docs]    def get(self, id):
        """
        Get a flight by id

        :return: A dictionary containing the flight
        """
        url = self.client.url + "frameworks/tabular/flight/" + str(id)
        resp = self.client.sess.get(url, headers=self.client.auth_header())
        if resp.ok:
            return resp.json()
        else:
            self.client.response_exception(resp)

[docs]    def delete(self, id):
        """
        Delete a flight by id

        :return: A dictionary containing deletion information
        """
        url = self.client.url + "frameworks/tabular/flight/" + str(id)
        resp = self.client.sess.delete(url, headers=self.client.auth_header())
        if resp.ok:
            return resp.json()
        else:
            self.client.response_exception(resp)

[docs]    def stop(self, id):
        """
        Stop a flight by id

        :return: A dictionary containing stopping information
        """
        url = self.client.url + f"frameworks/tabular/flight/{id}/stop"
        resp = self.client.sess.get(url, headers=self.client.auth_header())
        if resp.ok:
            return resp.json()
        else:
            self.client.response_exception(resp)

[docs]    def wait(self, id):
        """
        Wait for a flight to finish while printing out a DataFrame of the results.
        This version is for running in a Jupyter notebook, for the terminal version,
        see :func:`wait_terminal()`.

        :param str id: The id of the flight to wait for
        """
        import time

        from IPython.display import HTML, clear_output, display

        while True:
            flight = self.get(id)

            # make a DataFrame for the individual AutoML frameworks for this flight
            frameworks = pd.DataFrame(flight.pop("frameworks", []))

            # drop some columns for optimal viewing
            frameworks = frameworks.drop(
                ["fold_results", "fold_leaderboard", "fold_model_txt"], 1
            )

            print(f"flight status: {flight['status']}")
            print("flight members:")
            display(HTML(frameworks.to_html()))

            if flight["status"] not in INCOMPLETE_STATUSES:
                break

            time.sleep(5)
            clear_output(wait=True)

[docs]    def wait_terminal(self, id):
        """
        Wait for a flight to finish while printing out a DataFrame of the results.
        This version is for running in a terminal. Use :func:`wait()` if you are
        running in a Jupyter notebook.

        :param str id: The id of the flight to wait for
        """
        import time

        while True:
            flight = self.get(id)

            # make a DataFrame for the individual AutoML frameworks for this flight
            frameworks = pd.DataFrame(flight.pop("frameworks", []))

            # drop some columns for optimal viewing
            frameworks = frameworks.drop(
                ["fold_results", "fold_leaderboard", "fold_model_txt"], 1
            )

            print(f"flight status: {flight['status']}")
            print("flight members:")
            print(frameworks)

            if flight["status"] not in INCOMPLETE_STATUSES:
                break

            time.sleep(5)