Data
ArrowData
Bases: DataInterface
Source code in python/opsml/data/_data.pyi
__init__(data=None, data_splits=None, dependent_vars=None, sql_logic=None, data_profile=None)
Define a data interface

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| data | Table \| None | PyArrow Table | None |
| dependent_vars | DependentVars \| List[str] \| List[int] \| None | List of dependent variables to associate with data | None |
| data_splits | DataSplits \| List[DataSplit] | Optional list of `DataSplit` | None |
| sql_logic | SqlLogic \| None | Sql logic used to generate data, represented as a dictionary | None |
| data_profile | DataProfile \| None | Data profile | None |
Source code in python/opsml/data/_data.pyi
load(path, metadata, load_kwargs=None)
Load the data from a file

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| path | Path | Base path to load the data from | required |
| metadata | DataInterfaceSaveMetadata | Metadata associated with the data | required |
| load_kwargs | DataLoadKwargs | Additional kwargs to pass in | None |
Acceptable load kwargs
- columns (list | None): If not None, only these columns will be read from the file. A column name may be a prefix of a nested field, e.g. 'a' will select 'a.b', 'a.c', and 'a.d.e'. If empty, no columns will be read. Default is None.
- use_threads (bool): Perform multi-threaded column reads. Default is True.
- schema (Schema | None): Optionally provide the Schema for the parquet dataset, in which case it will not be inferred from the source. Default is None.
- use_pandas_metadata (bool): If True and file has custom pandas schema metadata, ensure that index columns are also loaded. Default is False.
- read_dictionary (list | None): List of names or column paths (for nested types) to read directly as DictionaryArray. Only supported for BYTE_ARRAY storage. Default is None.
- memory_map (bool): If the source is a file path, use a memory map to read file, which can improve performance in some environments. Default is False.
- buffer_size (int): If positive, perform read buffering when deserializing individual column chunks. Otherwise IO calls are unbuffered. Default is 0.
- partitioning (pyarrow.dataset.Partitioning | str | list of str): The partitioning scheme for a partitioned dataset. Default is 'hive'.
- filesystem (FileSystem | None): If nothing passed, will be inferred based on path. Default is None.
- filters (pyarrow.compute.Expression | list[tuple] | list[list[tuple]] | None): Rows which do not match the filter predicate will be removed from scanned data. Default is None.
- use_legacy_dataset (bool | None): Deprecated and has no effect from PyArrow version 15.0.0. Default is None.
- ignore_prefixes (list | None): Files matching any of these prefixes will be ignored by the discovery process. Default is ['.', '_'].
- pre_buffer (bool): Coalesce and issue file reads in parallel to improve performance on high-latency filesystems (e.g. S3). Default is True.
- coerce_int96_timestamp_unit (str | None): Cast timestamps that are stored in INT96 format to a particular resolution (e.g. 'ms'). Default is None.
- decryption_properties (FileDecryptionProperties | None): File-level decryption properties. Default is None.
- thrift_string_size_limit (int | None): If not None, override the maximum total string size allocated when decoding Thrift structures. Default is None.
- thrift_container_size_limit (int | None): If not None, override the maximum total size of containers allocated when decoding Thrift structures. Default is None.
- page_checksum_verification (bool): If True, verify the checksum for each page read from the file. Default is False.
Additional Information
https://arrow.apache.org/docs/python/generated/pyarrow.parquet.read_table.html
Source code in python/opsml/data/_data.pyi
save(path, save_kwargs=None)
Saves pyarrow table to parquet via write_table

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| path | Path | Base path to save the data to | required |
| save_kwargs | DataSaveKwargs | Additional kwargs to pass in | None |
Acceptable save kwargs
- row_group_size (int | None): Maximum number of rows in each written row group. If None, the row group size will be the minimum of the Table size and 1024 * 1024. Default is None.
- version ({'1.0', '2.4', '2.6'}): Determine which Parquet logical types are available for use. Default is '2.6'.
- use_dictionary (bool | list): Specify if dictionary encoding should be used in general or only for some columns. Default is True.
- compression (str | dict): Specify the compression codec, either on a general basis or per-column. Valid values: {'NONE', 'SNAPPY', 'GZIP', 'BROTLI', 'LZ4', 'ZSTD'}. Default is 'snappy'.
- write_statistics (bool | list): Specify if statistics should be written in general or only for some columns. Default is True.
- use_deprecated_int96_timestamps (bool | None): Write timestamps to INT96 Parquet format. Default is None.
- coerce_timestamps (str | None): Cast timestamps to a particular resolution. Valid values: {None, 'ms', 'us'}. Default is None.
- allow_truncated_timestamps (bool): Allow loss of data when coercing timestamps to a particular resolution. Default is False.
- data_page_size (int | None): Set a target threshold for the approximate encoded size of data pages within a column chunk (in bytes). Default is None.
- flavor ({'spark'} | None): Sanitize schema or set other compatibility options to work with various target systems. Default is None.
- filesystem (FileSystem | None): Filesystem object to use when reading the parquet file. Default is None.
- compression_level (int | dict | None): Specify the compression level for a codec, either on a general basis or per-column. Default is None.
- use_byte_stream_split (bool | list): Specify if the byte_stream_split encoding should be used in general or only for some columns. Default is False.
- column_encoding (str | dict | None): Specify the encoding scheme on a per column basis. Default is None.
- data_page_version ({'1.0', '2.0'}): The serialized Parquet data page format version to write. Default is '1.0'.
- use_compliant_nested_type (bool): Whether to write compliant Parquet nested type (lists). Default is True.
- encryption_properties (FileEncryptionProperties | None): File encryption properties for Parquet Modular Encryption. Default is None.
- write_batch_size (int | None): Number of values to write to a page at a time. Default is None.
- dictionary_pagesize_limit (int | None): Specify the dictionary page size limit per row group. Default is None.
- store_schema (bool): By default, the Arrow schema is serialized and stored in the Parquet file metadata. Default is True.
- write_page_index (bool): Whether to write a page index in general for all columns. Default is False.
- write_page_checksum (bool): Whether to write page checksums in general for all columns. Default is False.
- sorting_columns (Sequence[SortingColumn] | None): Specify the sort order of the data being written. Default is None.
- store_decimal_as_integer (bool): Allow decimals with 1 <= precision <= 18 to be stored as integers. Default is False.
- **kwargs: Additional options for ParquetWriter.
Additional Information
https://arrow.apache.org/docs/python/generated/pyarrow.parquet.write_table.html
Source code in python/opsml/data/_data.pyi
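A minimal usage sketch (not part of the generated reference): it assumes the classes are importable from `opsml.data`, matching the source path above, and that the `data` dictionary on `DataSaveKwargs` is forwarded to `pyarrow.parquet.write_table` as documented.

```python
from pathlib import Path

import pyarrow as pa
from opsml.data import ArrowData, DataSaveKwargs  # assumed import path

# Build a small PyArrow table and wrap it in the interface
table = pa.table({"feature": [1.0, 2.0, 3.0], "target": [0, 1, 0]})
interface = ArrowData(data=table, dependent_vars=["target"])

# Kwargs in the `data` dict are forwarded to pyarrow.parquet.write_table
interface.save(Path("artifacts"), DataSaveKwargs(data={"compression": "zstd"}))
```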
ColumnSplit
Source code in python/opsml/data/_data.pyi
__init__(column_name, column_value, column_type=ColType.Builtin, inequality=None)
Define a column split

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| column_name | str | The name of the column | required |
| column_value | Union[str, float, int] | The value of the column. Can be a string, float, or int. If timestamp, convert to isoformat (str) and specify timestamp coltype | required |
| column_type | ColType | The type of the column. Defaults to ColType.Builtin. If providing ColType.Timestamp, the column_value should be a float | Builtin |
| inequality | Optional[Union[str, Inequality]] | The inequality of the column | None |
Source code in python/opsml/data/_data.pyi
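A brief sketch of a column-based split, assuming the classes are importable from `opsml.data`:

```python
from opsml.data import ColumnSplit, DataSplit  # assumed import path

# Rows where `age` >= 30 form the "train" split in this sketch
col_split = ColumnSplit(
    column_name="age",
    column_value=30,
    inequality=">=",
)
split = DataSplit(label="train", column_split=col_split)
```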
DataInterface
Source code in python/opsml/data/_data.pyi
data (property, writable)
Returns the data

data_profile (property, writable)
Return the data profile

Returns:

| Type | Description |
|---|---|
| Optional[DataProfile] | The data profile |

data_splits (property, writable)
Returns the data splits.

data_type (property)
Return the data type.

dependent_vars (property, writable)
Returns the dependent variables.

schema (property, writable)
Returns the feature map.

sql_logic (property)
Returns the sql logic.
__init__(data=None, data_splits=None, dependent_vars=None, sql_logic=None, data_profile=None)
Define a data interface

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| data | Any | Data. Can be a pyarrow table, pandas dataframe, polars dataframe or numpy array | None |
| dependent_vars | DependentVars | List of dependent variables to associate with data | None |
| data_splits | DataSplits | Optional list of `DataSplit` | None |
| sql_logic | SqlLogic | SqlLogic class used to generate data | None |
| data_profile | DataProfile | Data profile | None |
Source code in python/opsml/data/_data.pyi
add_sql_logic(name, query=None, filepath=None)
Add sql logic to the data interface

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| name | str | The name of the sql logic | required |
| query | Optional[str] | The optional query to use | None |
| filepath | Optional[str] | The optional filepath to open the query from | None |
Source code in python/opsml/data/_data.pyi
create_data_profile(bin_size=20, compute_correlations=False)
Create a data profile

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| bin_size | int | The bin size for the data profile | 20 |
| compute_correlations | bool | Whether to compute correlations | False |
Source code in python/opsml/data/_data.pyi
load(path, metadata, load_kwargs=None)
Load the data from a file

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| path | Path | Base path to load the data from | required |
| metadata | DataInterfaceSaveMetadata | Metadata associated with the data | required |
| load_kwargs | DataLoadKwargs | Additional kwargs to pass in | None |
Source code in python/opsml/data/_data.pyi
save(path, save_kwargs=None)
Saves all data interface components to the given path. This is used as part of saving a DataCard.

Methods called in save:
- save_sql: Saves all sql logic to file(s)
- create_schema: Creates a FeatureSchema from the associated data
- save_data: Saves the data to a file

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| path | Path | The path to save the data interface components to | required |
| save_kwargs | DataSaveKwargs | The save kwargs to use | None |
Source code in python/opsml/data/_data.pyi
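The base-class methods are typically used through a concrete subclass. A short sketch, assuming `PandasData` is importable from `opsml.data` and behaves as documented above:

```python
import pandas as pd
from opsml.data import PandasData  # assumed import path; any DataInterface subclass works

df = pd.DataFrame({"x": [1, 2, 3], "y": [0.1, 0.2, 0.3]})
interface = PandasData(data=df)

# Attach SQL lineage and a profile before the interface is saved with a DataCard
interface.add_sql_logic(name="source_query", query="SELECT x, y FROM training_table")
interface.create_data_profile(bin_size=10, compute_correlations=True)
```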
DataInterfaceMetadata
Source code in python/opsml/data/_data.pyi
__init__(save_metadata, schema, extra_metadata, sql_logic, interface_type, data_splits, dependent_vars, data_type)
Instantiate DataInterfaceMetadata object

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| save_metadata | DataInterfaceSaveMetadata | The save metadata | required |
| schema | FeatureSchema | The schema | required |
| extra_metadata | dict[str, str] | Extra metadata | required |
| sql_logic | SqlLogic | Sql logic | required |
| interface_type | DataInterfaceType | The interface type | required |
| data_splits | DataSplits | The data splits | required |
| dependent_vars | DependentVars | Dependent variables | required |
| data_type | DataType | The data type | required |
Source code in python/opsml/data/_data.pyi
__str__()

model_dump_json()
DataInterfaceSaveMetadata
Source code in python/opsml/data/_data.pyi
__init__(data_uri, data_profile_uri=None, sql_uri=None, extra=None, save_kwargs=None)
Define interface save metadata

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| data_uri | Path | The data uri | required |
| data_profile_uri | Optional[Path] | The data profile uri | None |
| sql_uri | Optional[Path] | The sql uri | None |
| extra | Optional[ExtraMetadata] | Extra metadata | None |
| save_kwargs | Optional[DataSaveKwargs] | Save kwargs | None |
Source code in python/opsml/data/_data.pyi
DataLoadKwargs
Source code in python/opsml/data/_data.pyi
__init__(data=None)
Optional arguments to pass when loading data

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| data | Dict | Optional data arguments to use when loading | None |
DataSaveKwargs
Source code in python/opsml/data/_data.pyi
__init__(data=None)
Optional arguments to pass when saving data

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| data | Dict | Optional data arguments to use when saving | None |
DataSplit
Source code in python/opsml/data/_data.pyi
__init__(label, column_split=None, start_stop_split=None, indice_split=None)
Define a data split

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| label | str | The label of the split | required |
| column_split | Optional[ColumnSplit] | The column split | None |
| start_stop_split | Optional[StartStopSplit] | The start stop split | None |
| indice_split | Optional[IndiceSplit] | The indice split | None |
Source code in python/opsml/data/_data.pyi
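A small sketch of labeled splits, assuming the classes are importable from `opsml.data`:

```python
from opsml.data import DataSplit, IndiceSplit, StartStopSplit  # assumed import path

# Each DataSplit carries a label plus one split definition
train = DataSplit(label="train", start_stop_split=StartStopSplit(start=0, stop=800))
test = DataSplit(label="test", indice_split=IndiceSplit(indices=[800, 801, 802]))
```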
DataSplits
Source code in python/opsml/data/_data.pyi
splits (property, writable)
Return the splits

__init__(splits)

__str__()
split_data(data, data_type, dependent_vars)
Split the data

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| data | Any | The data to split | required |
| data_type | DataType | The data type | required |
| dependent_vars | DependentVars | Dependent variables to associate with the data | required |

Returns:

| Type | Description |
|---|---|
| Dict[str, Data] | A dictionary of data splits |
Source code in python/opsml/data/_data.pyi
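A sketch of splitting a dataframe, assuming the classes are importable from `opsml.data`; the `DataType` enum location (`opsml.types`) and the `DataType.Pandas` member name are assumptions for illustration.

```python
import pandas as pd
from opsml.data import DataSplit, DataSplits, DependentVars, StartStopSplit  # assumed import path
from opsml.types import DataType  # assumed location and member name

df = pd.DataFrame({"x": range(10), "y": range(10)})
splits = DataSplits(
    [
        DataSplit(label="train", start_stop_split=StartStopSplit(start=0, stop=8)),
        DataSplit(label="test", start_stop_split=StartStopSplit(start=8, stop=10)),
    ]
)

# Returns a dict keyed by split label, per the Returns table above
result = splits.split_data(df, DataType.Pandas, DependentVars(column_names=["y"]))
```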
DataSplitter
Source code in python/opsml/data/_data.pyi
split_data(split, data, data_type, dependent_vars)
staticmethod

Create a split

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| split | DataSplit | The data split to use to split the data | required |
| data | Any | The data to split | required |
| data_type | DataType | The data type | required |
| dependent_vars | DependentVars | Dependent variables to associate with the data | required |

Returns:

| Type | Description |
|---|---|
| Data | A Data object |
Source code in python/opsml/data/_data.pyi
DependentVars
Source code in python/opsml/data/_data.pyi
column_indices (property, writable)
Return the column indices

column_names (property, writable)
Return the column names
__init__(column_names=None, column_indices=None)
Define dependent variables for the data interface. User can specify either column names or column indices.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| column_names | Optional[List[str]] | The column names of the dependent variables | None |
| column_indices | Optional[List[int]] | The column indices of the dependent variables | None |
Source code in python/opsml/data/_data.pyi
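A brief sketch, assuming `DependentVars` is importable from `opsml.data`:

```python
from opsml.data import DependentVars  # assumed import path

# Name-based targets for dataframe-like data...
by_name = DependentVars(column_names=["target"])

# ...or index-based targets for positional data such as numpy arrays
by_index = DependentVars(column_indices=[3])
```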
IndiceSplit
Source code in python/opsml/data/_data.pyi
__init__(indices)
Define an indice split

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| indices | List[int] | The indices of the split | required |
NumpyData
Bases: DataInterface
Source code in python/opsml/data/_data.pyi
__init__(data=None, data_splits=None, dependent_vars=None, sql_logic=None, data_profile=None)
Define a data interface

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| data | NDArray \| None | Numpy array | None |
| dependent_vars | DependentVars \| List[str] \| List[int] \| None | List of dependent variables to associate with data | None |
| data_splits | DataSplits \| List[DataSplit] | Optional list of `DataSplit` | None |
| sql_logic | SqlLogic \| None | Sql logic used to generate data, represented as a dictionary | None |
| data_profile | DataProfile \| None | Data profile | None |
Source code in python/opsml/data/_data.pyi
load(path, metadata, load_kwargs=None)
Load the data via numpy.load

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| path | Path | Base path to load the data from | required |
| metadata | DataInterfaceSaveMetadata | Metadata associated with the data | required |
| load_kwargs | DataLoadKwargs | Additional kwargs to use when loading | None |
Acceptable load kwargs:
see: https://numpy.org/doc/stable/reference/generated/numpy.load.html
mmap_mode:
If not None, then memory-map the file, using the given mode
allow_pickle (bool):
Allow loading pickled object arrays stored in npy files
fix_imports (bool):
If fix_imports is True, pickle will try to map the old Python 2 names to the new names used in Python 3.
encoding (str):
What encoding to use when reading Python 2 strings. Only useful when py3k is True.
max_header_size (int):
The maximum size of the file header
Source code in python/opsml/data/_data.pyi
save(path, save_kwargs=None)
Save data using numpy save format

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| path | Path | Base path to save the data to | required |
| save_kwargs | DataSaveKwargs | Additional kwargs to pass in | None |
Acceptable save kwargs:
see: https://numpy.org/doc/stable/reference/generated/numpy.save.html
allow_pickle (bool):
Allow saving object arrays using Python pickles.
fix_imports (bool):
The fix_imports flag is deprecated and has no effect
Source code in python/opsml/data/_data.pyi
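A usage sketch, assuming the classes are importable from `opsml.data` and that the `data` dictionary is forwarded to `numpy.save` as documented above:

```python
from pathlib import Path

import numpy as np
from opsml.data import DataSaveKwargs, NumpyData  # assumed import path

array = np.random.rand(100, 4)
interface = NumpyData(data=array, dependent_vars=[3])  # last column as the target

# Saved with numpy.save; allow_pickle is forwarded as a save kwarg
interface.save(Path("artifacts"), DataSaveKwargs(data={"allow_pickle": False}))
```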
PandasData
Bases: DataInterface
Source code in python/opsml/data/_data.pyi
__init__(data=None, data_splits=None, dependent_vars=None, sql_logic=None, data_profile=None)
Define a data interface

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| data | DataFrame \| None | Pandas dataframe | None |
| dependent_vars | DependentVars \| List[str] \| List[int] \| None | List of dependent variables to associate with data | None |
| data_splits | DataSplits \| List[DataSplit] | Optional list of `DataSplit` | None |
| sql_logic | SqlLogic \| None | Sql logic used to generate data, represented as a dictionary | None |
| data_profile | DataProfile \| None | Data profile | None |
Source code in python/opsml/data/_data.pyi
load(path, metadata, load_kwargs=None)
Load the pandas dataframe from a parquet dataset via read_parquet

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| path | Path | Base path to load the data from | required |
| metadata | DataInterfaceSaveMetadata | Metadata associated with the data | required |
| load_kwargs | DataLoadKwargs | Additional kwargs to pass in | None |
Acceptable load kwargs
- engine ({'auto', 'pyarrow', 'fastparquet'}): Parquet library to use. If 'auto', then the option io.parquet.engine is used. The default io.parquet.engine behavior is to try 'pyarrow', falling back to 'fastparquet' if 'pyarrow' is unavailable. Default is 'auto'.
- columns (list | None): If not None, only these columns will be read from the file. Default is None.
- storage_options (dict | None): Extra options that make sense for a particular storage connection, e.g. host, port, username, password, etc. For HTTP(S) URLs the key-value pairs are forwarded to urllib.request.Request as header options. For other URLs (e.g. starting with "s3://" and "gcs://") the key-value pairs are forwarded to fsspec.open. Default is None.
- use_nullable_dtypes (bool): If True, use dtypes that use pd.NA as missing value indicator for the resulting DataFrame (only applicable for the pyarrow engine). As new dtypes are added that support pd.NA in the future, the output with this option will change to use those dtypes. Note: this is an experimental option, and behaviour (e.g. additional support dtypes) may change without notice. Default is False. Deprecated since version 2.0.
- dtype_backend ({'numpy_nullable', 'pyarrow'}): Back-end data type applied to the resultant DataFrame (still experimental). "numpy_nullable" returns a nullable-dtype-backed DataFrame (default); "pyarrow" returns a pyarrow-backed nullable ArrowDtype DataFrame. Default is 'numpy_nullable'.
- filesystem (fsspec | pyarrow filesystem | None): Filesystem object to use when reading the parquet file. Only implemented for engine="pyarrow". Default is None.
- filters (list[tuple] | list[list[tuple]] | None): To filter out data. Filter syntax: [[(column, op, val), …],…] where op is [==, =, >, >=, <, <=, !=, in, not in]. The innermost tuples are transposed into a set of filters applied through an AND operation. The outer list combines these sets of filters through an OR operation. A single list of tuples can also be used, meaning that no OR operation between set of filters is to be conducted. Using this argument will NOT result in row-wise filtering of the final partitions unless engine="pyarrow" is also specified. For other engines, filtering is only performed at the partition level, that is, to prevent the loading of some row-groups and/or files. Default is None.
- **kwargs: Any additional kwargs are passed to the engine.
Additional Information
https://pandas.pydata.org/docs/reference/api/pandas.read_parquet.html
Source code in python/opsml/data/_data.pyi
save(path, save_kwargs=None)
Saves pandas dataframe as parquet file via to_parquet

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| path | Path | Base path to save the data to | required |
| save_kwargs | DataSaveKwargs | Additional kwargs to pass in | None |
Acceptable save kwargs
- engine ({'auto', 'pyarrow', 'fastparquet'}): Parquet library to use. If 'auto', then the option io.parquet.engine is used. The default io.parquet.engine behavior is to try 'pyarrow', falling back to 'fastparquet' if 'pyarrow' is unavailable. Default is 'auto'.
- compression (str | None): Name of the compression to use. Use None for no compression. Supported options: 'snappy', 'gzip', 'brotli', 'lz4', 'zstd'. Default is 'snappy'.
- index (bool | None): If True, include the dataframe's index(es) in the file output. If False, they will not be written to the file. If None, similar to True the dataframe's index(es) will be saved. However, instead of being saved as values, the RangeIndex will be stored as a range in the metadata so it doesn't require much space and is faster. Other indexes will be included as columns in the file output. Default is None.
- partition_cols (list | None): Column names by which to partition the dataset. Columns are partitioned in the order they are given. Must be None if path is not a string. Default is None.
- storage_options (dict | None): Extra options that make sense for a particular storage connection, e.g. host, port, username, password, etc. For HTTP(S) URLs the key-value pairs are forwarded to urllib.request.Request as header options. For other URLs (e.g. starting with "s3://" and "gcs://") the key-value pairs are forwarded to fsspec.open. Default is None.
- **kwargs: Any additional kwargs are passed to the engine.
Additional Information
https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_parquet.html
Source code in python/opsml/data/_data.pyi
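A usage sketch, assuming the classes are importable from `opsml.data` and that the `data` dictionary is forwarded to `pandas.DataFrame.to_parquet` as documented above:

```python
from pathlib import Path

import pandas as pd
from opsml.data import ColumnSplit, DataSaveKwargs, DataSplit, PandasData  # assumed import path

df = pd.DataFrame({"age": [25, 32, 47], "label": [0, 1, 1]})
interface = PandasData(
    data=df,
    dependent_vars=["label"],
    data_splits=[
        DataSplit(
            label="train",
            column_split=ColumnSplit(column_name="age", column_value=40, inequality="<"),
        )
    ],
)

# The `data` dict is forwarded to pandas.DataFrame.to_parquet
interface.save(Path("artifacts"), DataSaveKwargs(data={"compression": "zstd", "index": False}))
```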
PolarsData
Bases: DataInterface
Source code in python/opsml/data/_data.pyi
__init__(data=None, data_splits=None, dependent_vars=None, sql_logic=None, data_profile=None)
Define a data interface

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| data | DataFrame \| None | Polars dataframe | None |
| dependent_vars | DependentVars \| List[str] \| List[int] \| None | List of dependent variables to associate with data | None |
| data_splits | DataSplits \| List[DataSplit] | Optional list of `DataSplit` | None |
| sql_logic | SqlLogic \| None | Sql logic used to generate data, represented as a dictionary | None |
| data_profile | DataProfile \| None | Data profile | None |
Source code in python/opsml/data/_data.pyi
load(path, metadata, load_kwargs=None)
Load the data from a file

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| path | Path | Base path to load the data from | required |
| metadata | DataInterfaceSaveMetadata | Metadata associated with the data | required |
| load_kwargs | DataLoadKwargs | Additional kwargs to pass in | None |
Acceptable load kwargs
- columns (list[int] | list[str] | None): Columns to load. Default is None.
- n_rows (int | None): Number of rows to load. Default is None.
- row_index_name (str | None): Name of the row index. Default is None.
- row_index_offset (int): Offset for the row index. Default is 0.
- parallel (ParallelStrategy): Parallel strategy to use. Default is 'auto'.
- use_statistics (bool): Whether to use statistics. Default is True.
- hive_partitioning (bool | None): Whether to use hive partitioning. Default is None.
- glob (bool): Whether to use glob pattern matching. Default is True.
- schema (SchemaDict | None): Schema to use. Default is None.
- hive_schema (SchemaDict | None): Hive schema to use. Default is None.
- try_parse_hive_dates (bool): Whether to try parsing hive dates. Default is True.
- rechunk (bool): Whether to rechunk the data. Default is False.
- low_memory (bool): Whether to use low memory mode. Default is False.
- storage_options (dict[str, Any] | None): Additional storage options. Default is None.
- credential_provider (CredentialProviderFunction | Literal['auto'] | None): Credential provider function. Default is 'auto'.
- retries (int): Number of retries for loading. Default is 2.
- use_pyarrow (bool): Whether to use PyArrow for loading. Default is False.
- pyarrow_options (dict[str, Any] | None): Additional options for PyArrow. Default is None.
- memory_map (bool): Whether to use memory mapping. Default is True.
- include_file_paths (str | None): File paths to include. Default is None.
- allow_missing_columns (bool): Whether to allow missing columns. Default is False.
See Also
https://docs.pola.rs/api/python/dev/reference/api/polars.read_parquet.html
Source code in python/opsml/data/_data.pyi
save(path, save_kwargs=None)
Saves polars dataframe to parquet dataset via write_parquet

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| path | Path | Base path to save the data to | required |
| save_kwargs | DataSaveKwargs | Additional kwargs to pass in | None |
Acceptable save kwargs
- compression (ParquetCompression): Compression codec to use for writing.
- compression_level (int | None): Compression level to use. Default is None.
- statistics (bool | str | dict[str, bool]): Whether to write statistics. Default is True.
- row_group_size (int | None): Number of rows per row group. Default is None.
- data_page_size (int | None): Size of data pages. Default is None.
- use_pyarrow (bool): Whether to use PyArrow for writing. Default is False.
- pyarrow_options (dict[str, Any] | None): Additional options for PyArrow. Default is None.
- partition_by (str | Sequence[str] | None): Columns to partition by. Default is None.
- partition_chunk_size_bytes (int): Size of partition chunks in bytes. Default is 4294967296.
- storage_options (dict[str, Any] | None): Additional storage options. Default is None.
- credential_provider (CredentialProviderFunction | Literal['auto'] | None): Credential provider function. Default is 'auto'.
- retries (int): Number of retries for writing. Default is 2.
See Also
https://docs.pola.rs/api/python/dev/reference/api/polars.DataFrame.write_parquet.html
Source code in python/opsml/data/_data.pyi
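A usage sketch, assuming the classes are importable from `opsml.data` and that the `data` dictionary is forwarded to `polars.DataFrame.write_parquet` as documented above:

```python
from pathlib import Path

import polars as pl
from opsml.data import DataSaveKwargs, PolarsData  # assumed import path

df = pl.DataFrame({"feature": [1.0, 2.0, 3.0], "target": [0, 1, 0]})
interface = PolarsData(data=df, dependent_vars=["target"])

# The `data` dict is forwarded to polars.DataFrame.write_parquet
interface.save(Path("artifacts"), DataSaveKwargs(data={"compression": "zstd", "statistics": True}))
```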
SqlData
Source code in python/opsml/data/_data.pyi
__init__(sql_logic, data_profile=None)
Define a sql data interface

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| sql_logic | SqlLogic | Sql logic used to generate data, represented as a dictionary | required |
| data_profile | DataProfile \| None | Data profile | None |
Source code in python/opsml/data/_data.pyi
save(path, save_kwargs=None)
Save the sql logic to a file

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| path | Path | The path to save the sql logic to | required |
| save_kwargs | DataSaveKwargs | Additional kwargs to pass in | None |
Source code in python/opsml/data/_data.pyi
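A brief sketch, assuming the classes are importable from `opsml.data`:

```python
from pathlib import Path

from opsml.data import SqlData, SqlLogic  # assumed import path

# A data interface that records only the SQL used to produce the data
interface = SqlData(sql_logic=SqlLogic(queries={"training": "SELECT * FROM training_table"}))
interface.save(Path("artifacts"))
```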
SqlLogic
Source code in python/opsml/data/_data.pyi
queries (property, writable)
Return the queries
__getitem__(key)
Get the query by key

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| key | str | The key to get the query by | required |

Returns:

| Type | Description |
|---|---|
| str | The query |
__init__(queries)
Define sql logic

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| queries | Dict[str, str] | Sql logic used to generate data, represented as a dictionary. Key is the name to assign to the sql logic and value is either a sql query or a path to a .sql file. | required |
Source code in python/opsml/data/_data.pyi
__str__()
add_sql_logic(name, query=None, filepath=None)
Add sql logic to existing queries

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| name | str | The name to associate with the sql logic | required |
| query | Optional[str] | SQL query | None |
| filepath | Optional[str] | Filepath to SQL query | None |
Source code in python/opsml/data/_data.pyi
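A brief sketch, assuming `SqlLogic` is importable from `opsml.data`; the query strings and the `queries/labels.sql` path are illustrative only:

```python
from opsml.data import SqlLogic  # assumed import path

sql = SqlLogic(queries={"features": "SELECT id, age FROM users"})

# Queries can also be registered from .sql files on disk
sql.add_sql_logic(name="labels", filepath="queries/labels.sql")

print(sql["features"])  # lookup by name via __getitem__
```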
StartStopSplit
Source code in python/opsml/data/_data.pyi
__init__(start, stop)
Define a start stop split

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| start | int | The start of the split | required |
| stop | int | The stop of the split | required |
TorchData
Bases: DataInterface
Source code in python/opsml/data/_data.pyi
__init__(data=None, data_splits=None, dependent_vars=None, sql_logic=None, data_profile=None)
Define a data interface

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| data | Tensor \| None | Torch tensor | None |
| dependent_vars | DependentVars \| List[str] \| List[int] \| None | List of dependent variables to associate with data | None |
| data_splits | DataSplits \| List[DataSplit] | Optional list of `DataSplit` | None |
| sql_logic | SqlLogic \| None | Sql logic used to generate data, represented as a dictionary | None |
| data_profile | DataProfile \| None | Data profile | None |
Source code in python/opsml/data/_data.pyi
load(path, metadata, load_kwargs=None)
Load the torch tensor from file

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| path | Path | Base path to load the data from | required |
| metadata | DataInterfaceSaveMetadata | Metadata associated with the data | required |
| load_kwargs | DataLoadKwargs | Additional kwargs to pass in | None |
Acceptable load kwargs
- map_location: A function, torch.device, string or a dict specifying how to remap storage locations.
- pickle_module: Module used for unpickling metadata and objects (has to match the pickle_module used to serialize file).
- weights_only: Indicates whether unpickler should be restricted to loading only tensors, primitive types, dictionaries and any types added via torch.serialization.add_safe_globals().
- mmap: Indicates whether the file should be mmaped rather than loading all the storages into memory. Typically, tensor storages in the file will first be moved from disk to CPU memory, after which they are moved to the location that they were tagged with when saving, or specified by map_location. This second step is a no-op if the final location is CPU. When the mmap flag is set, instead of copying the tensor storages from disk to CPU memory in the first step, f is mmaped.
- pickle_load_args: (Python 3 only) optional keyword arguments passed over to pickle_module.load() and pickle_module.Unpickler(), e.g., errors=....
Additional Information
https://pytorch.org/docs/stable/generated/torch.load.html
Source code in python/opsml/data/_data.pyi
save(path, save_kwargs=None)
Saves torch tensor to a file

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| path | Path | Base path to save the data to | required |
| save_kwargs | DataSaveKwargs | Additional kwargs to pass in | None |
Acceptable save kwargs
- pickle_module (Any): Module used for pickling metadata and objects.
- pickle_protocol (int): Can be specified to override the default protocol.
Additional Information
https://pytorch.org/docs/main/generated/torch.save.html
Source code in python/opsml/data/_data.pyi
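A usage sketch, assuming the classes are importable from `opsml.data` and that the `data` dictionary is forwarded to `torch.save` as documented above:

```python
from pathlib import Path

import torch
from opsml.data import DataSaveKwargs, TorchData  # assumed import path

tensor = torch.rand(100, 4)
interface = TorchData(data=tensor)

# The `data` dict is forwarded to torch.save
interface.save(Path("artifacts"), DataSaveKwargs(data={"pickle_protocol": 4}))
```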
generate_feature_schema(data, data_type)
Generate a feature schema

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| data | Any | Data to generate the feature schema from | required |
| data_type | DataType | The data type | required |

Returns:

| Type | Description |
|---|---|
| FeatureSchema | A feature map |
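A brief sketch, assuming `generate_feature_schema` is importable from `opsml.data`; the `DataType` enum location (`opsml.types`) and the `DataType.Pandas` member name are assumptions for illustration.

```python
import pandas as pd
from opsml.data import generate_feature_schema  # assumed import path
from opsml.types import DataType  # assumed location and member name

df = pd.DataFrame({"age": [25, 32], "score": [0.7, 0.4]})

# Produces a FeatureSchema describing the columns of the dataframe
schema = generate_feature_schema(df, DataType.Pandas)
```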