Tabular datasets#

Adult#

Class to describe features of the Adult dataset.

class Adult(discrete_only=False, invert_s=False, split=AdultSplits.SEX, binarize_nationality=False, binarize_race=False)#

Bases: ethicml.data.dataset.LoadableDataset

UCI Adult dataset.

Parameters
Return type

None

__len__()#

Number of elements in the dataset.

Return type

int

property class_labels: List[str]#

Get the list of class labels.

property continuous_features: List[str]#

List of features that are continuous.

property disc_feature_groups: Optional[Dict[str, List[str]]]#

Dictionary of feature groups.

property discrete_features: List[str]#

List of features that are discrete.

expand_labels(label, label_type)#

Expand a label in the form of an index into all the subfeatures.

Parameters
  • label (pandas.DataFrame) –

  • label_type (Literal['s', 'y']) –

Return type

pandas.DataFrame

property feature_split: ethicml.data.dataset.FeatureSplit#

Return a feature split dictionary.

This should have separate entries for the features, the labels and the sensitive attributes.

property features_to_remove: List[str]#

Features that have to be removed from x.

property filepath: pathlib.Path#

Filepath from which to load the data.

load(ordered=False, labels_as_features=False)#

Load dataset from its CSV file.

Parameters
  • ordered (bool) – if True, return features such that discrete come first, then continuous

  • labels_as_features (bool) – if True, the s and y labels are included in the x features

Returns

DataTuple with dataframes of features, labels and sensitive attributes

Return type

ethicml.utility.data_structures.DataTuple

load_aif()#

Load the dataset as an AIF360 dataset.

Experimental. Requires the aif360 library.

Ignores the type check as the return type is not yet defined.

property name: str#

Name of the dataset.

property ordered_features: ethicml.data.dataset.FeatureSplit#

Return an ordered features dictionary.

This should have separate entries for the features, the labels and the sensitive attributes, but the x features are ordered so first are the discrete features, then the continuous.

property sens_attrs: List[str]#

Get the list of sensitive attributes.

class AdultSplits(value)#

Bases: enum.Enum

Available dataset splits for the Adult dataset.

adult(split='Sex', discrete_only=False, binarize_nationality=False, binarize_race=False, invert_s=False)#

UCI Adult dataset.

Parameters
Return type

ethicml.data.tabular_data.adult.Adult

Compas#

Class to describe features of the Compas dataset.

class Compas(discrete_only=False, invert_s=False, split=CompasSplits.SEX)#

Bases: ethicml.data.dataset.LoadableDataset

Compas (or ProPublica) dataset.

Parameters
Return type

None

__len__()#

Number of elements in the dataset.

Return type

int

property class_labels: List[str]#

Get the list of class labels.

property continuous_features: List[str]#

List of features that are continuous.

property disc_feature_groups: Optional[Dict[str, List[str]]]#

Dictionary of feature groups.

property discrete_features: List[str]#

List of features that are discrete.

expand_labels(label, label_type)#

Expand a label in the form of an index into all the subfeatures.

Parameters
  • label (pandas.DataFrame) –

  • label_type (Literal['s', 'y']) –

Return type

pandas.DataFrame

property feature_split: ethicml.data.dataset.FeatureSplit#

Return a feature split dictionary.

This should have separate entries for the features, the labels and the sensitive attributes.

property features_to_remove: List[str]#

Features that have to be removed from x.

property filepath: pathlib.Path#

Filepath from which to load the data.

load(ordered=False, labels_as_features=False)#

Load dataset from its CSV file.

Parameters
  • ordered (bool) – if True, return features such that discrete come first, then continuous

  • labels_as_features (bool) – if True, the s and y labels are included in the x features

Returns

DataTuple with dataframes of features, labels and sensitive attributes

Return type

ethicml.utility.data_structures.DataTuple

load_aif()#

Load the dataset as an AIF360 dataset.

Experimental. Requires the aif360 library.

Ignores the type check as the return type is not yet defined.

property name: str#

Name of the dataset.

property ordered_features: ethicml.data.dataset.FeatureSplit#

Return an ordered features dictionary.

This should have separate entries for the features, the labels and the sensitive attributes, but the x features are ordered so first are the discrete features, then the continuous.

property sens_attrs: List[str]#

Get the list of sensitive attributes.

class CompasSplits(value)#

Bases: enum.Enum

Available dataset splits for the COMPAS dataset.

compas(split='Sex', discrete_only=False, invert_s=False)#

Compas (or ProPublica) dataset.

Parameters
Return type

ethicml.data.tabular_data.compas.Compas

Credit#

Class to describe features of the UCI Credit dataset.

class Credit(discrete_only=False, invert_s=False, split=CreditSplits.SEX)#

Bases: ethicml.data.dataset.LoadableDataset

UCI Credit Card dataset.

Parameters
Return type

None

__len__()#

Number of elements in the dataset.

Return type

int

property class_labels: List[str]#

Get the list of class labels.

property continuous_features: List[str]#

List of features that are continuous.

property disc_feature_groups: Optional[Dict[str, List[str]]]#

Dictionary of feature groups.

property discrete_features: List[str]#

List of features that are discrete.

expand_labels(label, label_type)#

Expand a label in the form of an index into all the subfeatures.

Parameters
  • label (pandas.DataFrame) –

  • label_type (Literal['s', 'y']) –

Return type

pandas.DataFrame

property feature_split: ethicml.data.dataset.FeatureSplit#

Return a feature split dictionary.

This should have separate entries for the features, the labels and the sensitive attributes.

property features_to_remove: List[str]#

Features that have to be removed from x.

property filepath: pathlib.Path#

Filepath from which to load the data.

load(ordered=False, labels_as_features=False)#

Load dataset from its CSV file.

Parameters
  • ordered (bool) – if True, return features such that discrete come first, then continuous

  • labels_as_features (bool) – if True, the s and y labels are included in the x features

Returns

DataTuple with dataframes of features, labels and sensitive attributes

Return type

ethicml.utility.data_structures.DataTuple

load_aif()#

Load the dataset as an AIF360 dataset.

Experimental. Requires the aif360 library.

Ignores the type check as the return type is not yet defined.

property name: str#

Name of the dataset.

property ordered_features: ethicml.data.dataset.FeatureSplit#

Return an ordered features dictionary.

This should have separate entries for the features, the labels and the sensitive attributes, but the x features are ordered so first are the discrete features, then the continuous.

property sens_attrs: List[str]#

Get the list of sensitive attributes.

class CreditSplits(value)#

Bases: enum.Enum

Splits for the Credit dataset.

credit(split='Sex', discrete_only=False, invert_s=False)#

UCI Credit Card dataset.

Parameters
Return type

ethicml.data.tabular_data.credit.Credit

Crime#

Class to describe features of the Communities and Crime dataset.

class Crime(discrete_only=False, invert_s=False, split=CrimeSplits.RACE_BINARY)#

Bases: ethicml.data.dataset.LoadableDataset

UCI Communities and Crime dataset.

Parameters
Return type

None

__len__()#

Number of elements in the dataset.

Return type

int

property class_labels: List[str]#

Get the list of class labels.

property continuous_features: List[str]#

List of features that are continuous.

property disc_feature_groups: Optional[Dict[str, List[str]]]#

Dictionary of feature groups.

property discrete_features: List[str]#

List of features that are discrete.

expand_labels(label, label_type)#

Expand a label in the form of an index into all the subfeatures.

Parameters
  • label (pandas.DataFrame) –

  • label_type (Literal['s', 'y']) –

Return type

pandas.DataFrame

property feature_split: ethicml.data.dataset.FeatureSplit#

Return a feature split dictionary.

This should have separate entries for the features, the labels and the sensitive attributes.

property features_to_remove: List[str]#

Features that have to be removed from x.

property filepath: pathlib.Path#

Filepath from which to load the data.

load(ordered=False, labels_as_features=False)#

Load dataset from its CSV file.

Parameters
  • ordered (bool) – if True, return features such that discrete come first, then continuous

  • labels_as_features (bool) – if True, the s and y labels are included in the x features

Returns

DataTuple with dataframes of features, labels and sensitive attributes

Return type

ethicml.utility.data_structures.DataTuple

load_aif()#

Load the dataset as an AIF360 dataset.

Experimental. Requires the aif360 library.

Ignores the type check as the return type is not yet defined.

property name: str#

Name of the dataset.

property ordered_features: ethicml.data.dataset.FeatureSplit#

Return an ordered features dictionary.

This should have separate entries for the features, the labels and the sensitive attributes, but the x features are ordered so first are the discrete features, then the continuous.

property sens_attrs: List[str]#

Get the list of sensitive attributes.

class CrimeSplits(value)#

Bases: enum.Enum

Splits for the Crime dataset.

crime(split='Race-Binary', discrete_only=False, invert_s=False)#

UCI Communities and Crime dataset.

Parameters
Return type

ethicml.data.tabular_data.crime.Crime

German#

Class to describe features of the German dataset.

class German(discrete_only=False, invert_s=False, split=GermanSplits.SEX)#

Bases: ethicml.data.dataset.LoadableDataset

German credit dataset.

Parameters
Return type

None

__len__()#

Number of elements in the dataset.

Return type

int

property class_labels: List[str]#

Get the list of class labels.

property continuous_features: List[str]#

List of features that are continuous.

property disc_feature_groups: Optional[Dict[str, List[str]]]#

Dictionary of feature groups.

property discrete_features: List[str]#

List of features that are discrete.

expand_labels(label, label_type)#

Expand a label in the form of an index into all the subfeatures.

Parameters
  • label (pandas.DataFrame) –

  • label_type (Literal['s', 'y']) –

Return type

pandas.DataFrame

property feature_split: ethicml.data.dataset.FeatureSplit#

Return a feature split dictionary.

This should have separate entries for the features, the labels and the sensitive attributes.

property features_to_remove: List[str]#

Features that have to be removed from x.

property filepath: pathlib.Path#

Filepath from which to load the data.

load(ordered=False, labels_as_features=False)#

Load dataset from its CSV file.

Parameters
  • ordered (bool) – if True, return features such that discrete come first, then continuous

  • labels_as_features (bool) – if True, the s and y labels are included in the x features

Returns

DataTuple with dataframes of features, labels and sensitive attributes

Return type

ethicml.utility.data_structures.DataTuple

load_aif()#

Load the dataset as an AIF360 dataset.

Experimental. Requires the aif360 library.

Ignores the type check as the return type is not yet defined.

property name: str#

Name of the dataset.

property ordered_features: ethicml.data.dataset.FeatureSplit#

Return an ordered features dictionary.

This should have separate entries for the features, the labels and the sensitive attributes, but the x features are ordered so first are the discrete features, then the continuous.

property sens_attrs: List[str]#

Get the list of sensitive attributes.

class GermanSplits(value)#

Bases: enum.Enum

Splits for the German dataset.

german(split='Sex', discrete_only=False, invert_s=False)#

German credit dataset.

Parameters
Return type

ethicml.data.tabular_data.german.German

Health#

Class to describe features of the Heritage Health dataset.

class Health(discrete_only=False, invert_s=False, split=HealthSplits.SEX)#

Bases: ethicml.data.dataset.LoadableDataset

Heritage Health dataset.

Parameters
Return type

None

__len__()#

Number of elements in the dataset.

Return type

int

property class_labels: List[str]#

Get the list of class labels.

property continuous_features: List[str]#

List of features that are continuous.

property disc_feature_groups: Optional[Dict[str, List[str]]]#

Dictionary of feature groups.

property discrete_features: List[str]#

List of features that are discrete.

expand_labels(label, label_type)#

Expand a label in the form of an index into all the subfeatures.

Parameters
  • label (pandas.DataFrame) –

  • label_type (Literal['s', 'y']) –

Return type

pandas.DataFrame

property feature_split: ethicml.data.dataset.FeatureSplit#

Return a feature split dictionary.

This should have separate entries for the features, the labels and the sensitive attributes.

property features_to_remove: List[str]#

Features that have to be removed from x.

property filepath: pathlib.Path#

Filepath from which to load the data.

load(ordered=False, labels_as_features=False)#

Load dataset from its CSV file.

Parameters
  • ordered (bool) – if True, return features such that discrete come first, then continuous

  • labels_as_features (bool) – if True, the s and y labels are included in the x features

Returns

DataTuple with dataframes of features, labels and sensitive attributes

Return type

ethicml.utility.data_structures.DataTuple

load_aif()#

Load the dataset as an AIF360 dataset.

Experimental. Requires the aif360 library.

Ignores the type check as the return type is not yet defined.

property name: str#

Name of the dataset.

property ordered_features: ethicml.data.dataset.FeatureSplit#

Return an ordered features dictionary.

This should have separate entries for the features, the labels and the sensitive attributes, but the x features are ordered so first are the discrete features, then the continuous.

property sens_attrs: List[str]#

Get the list of sensitive attributes.

class HealthSplits(value)#

Bases: enum.Enum

Splits for the Health dataset.

health(split='Sex', discrete_only=False, invert_s=False)#

Heritage Health dataset.

Parameters
Return type

ethicml.data.tabular_data.health.Health

Non-binary toy#

Class to describe features of the toy dataset with more than 2 classes.

class NonBinaryToy(discrete_only=False, invert_s=False)#

Bases: ethicml.data.dataset.LoadableDataset

Dataset with non-binary toy data for testing.

Parameters
  • discrete_only (bool) –

  • invert_s (bool) –

Return type

None

__len__()#

Number of elements in the dataset.

Return type

int

property class_labels: List[str]#

Get the list of class labels.

property continuous_features: List[str]#

List of features that are continuous.

property disc_feature_groups: Optional[Dict[str, List[str]]]#

Dictionary of feature groups.

property discrete_features: List[str]#

List of features that are discrete.

expand_labels(label, label_type)#

Expand a label in the form of an index into all the subfeatures.

Parameters
  • label (pandas.DataFrame) –

  • label_type (Literal['s', 'y']) –

Return type

pandas.DataFrame

property feature_split: ethicml.data.dataset.FeatureSplit#

Return a feature split dictionary.

This should have separate entries for the features, the labels and the sensitive attributes.

property features_to_remove: List[str]#

Features that have to be removed from x.

property filepath: pathlib.Path#

Filepath from which to load the data.

load(ordered=False, labels_as_features=False)#

Load dataset from its CSV file.

Parameters
  • ordered (bool) – if True, return features such that discrete come first, then continuous

  • labels_as_features (bool) – if True, the s and y labels are included in the x features

Returns

DataTuple with dataframes of features, labels and sensitive attributes

Return type

ethicml.utility.data_structures.DataTuple

load_aif()#

Load the dataset as an AIF360 dataset.

Experimental. Requires the aif360 library.

Ignores the type check as the return type is not yet defined.

property name: str#

Name of the dataset.

property ordered_features: ethicml.data.dataset.FeatureSplit#

Return an ordered features dictionary.

This should have separate entries for the features, the labels and the sensitive attributes, but the x features are ordered so first are the discrete features, then the continuous.

property sens_attrs: List[str]#

Get the list of sensitive attributes.

nonbinary_toy()#

Dataset with non-binary toy data for testing.

Return type

ethicml.data.tabular_data.non_binary_toy.NonBinaryToy

Stop, question, frisk#

Class to describe features of the SQF dataset.

class Sqf(discrete_only=False, invert_s=False, split=SqfSplits.SEX)#

Bases: ethicml.data.dataset.LoadableDataset

Stop, question and frisk dataset.

Parameters
Return type

None

__len__()#

Number of elements in the dataset.

Return type

int

property class_labels: List[str]#

Get the list of class labels.

property continuous_features: List[str]#

List of features that are continuous.

property disc_feature_groups: Optional[Dict[str, List[str]]]#

Dictionary of feature groups.

property discrete_features: List[str]#

List of features that are discrete.

expand_labels(label, label_type)#

Expand a label in the form of an index into all the subfeatures.

Parameters
  • label (pandas.DataFrame) –

  • label_type (Literal['s', 'y']) –

Return type

pandas.DataFrame

property feature_split: ethicml.data.dataset.FeatureSplit#

Return a feature split dictionary.

This should have separate entries for the features, the labels and the sensitive attributes.

property features_to_remove: List[str]#

Features that have to be removed from x.

property filepath: pathlib.Path#

Filepath from which to load the data.

load(ordered=False, labels_as_features=False)#

Load dataset from its CSV file.

Parameters
  • ordered (bool) – if True, return features such that discrete come first, then continuous

  • labels_as_features (bool) – if True, the s and y labels are included in the x features

Returns

DataTuple with dataframes of features, labels and sensitive attributes

Return type

ethicml.utility.data_structures.DataTuple

load_aif()#

Load the dataset as an AIF360 dataset.

Experimental. Requires the aif360 library.

Ignores the type check as the return type is not yet defined.

property name: str#

Name of the dataset.

property ordered_features: ethicml.data.dataset.FeatureSplit#

Return an ordered features dictionary.

This should have separate entries for the features, the labels and the sensitive attributes, but the x features are ordered so first are the discrete features, then the continuous.

property sens_attrs: List[str]#

Get the list of sensitive attributes.

class SqfSplits(value)#

Bases: enum.Enum

Splits for the SQF dataset.

sqf(split='Sex', discrete_only=False, invert_s=False)#

Stop, question and frisk dataset.

Parameters
Return type

ethicml.data.tabular_data.sqf.Sqf

Synthetic#

Class to describe features of the Synthetic dataset.

class Synthetic(discrete_only=False, invert_s=False, scenario=SyntheticScenarios.S1, target=SyntheticTargets.Y3, fair=False, num_samples=1000)#

Bases: ethicml.data.dataset.LoadableDataset

Dataset with synthetic data.

Notation: ⊥ means "is independent of" and ~ means "is an ancestor of" in the causal model used to generate the data.

Scenario 1 = X⊥S & Y⊥S.
  • This models completely fair data.

Scenario 2 = X_2⊥S & Y_2⊥S; X_1~S, Y_1~S & Y_3~S
  • This models data where the inputs are biased. This bias is propagated through to the target.

Scenario 3 = X⊥S, Y_1⊥S, Y_2⊥S; Y_3~S
  • This models data where the target is biased.

Scenario 4 = X_2⊥S, Y_2⊥S; X_1~S, Y_1~S, Y_3~S
  • This models data where both the input and target are directly biased.

Parameters
Return type

None

__len__()#

Number of elements in the dataset.

Return type

int

property class_labels: List[str]#

Get the list of class labels.

property continuous_features: List[str]#

List of features that are continuous.

property disc_feature_groups: Optional[Dict[str, List[str]]]#

Dictionary of feature groups.

property discrete_features: List[str]#

List of features that are discrete.

expand_labels(label, label_type)#

Expand a label in the form of an index into all the subfeatures.

Parameters
  • label (pandas.DataFrame) –

  • label_type (Literal['s', 'y']) –

Return type

pandas.DataFrame

property feature_split: ethicml.data.dataset.FeatureSplit#

Return a feature split dictionary.

This should have separate entries for the features, the labels and the sensitive attributes.

property features_to_remove: List[str]#

Features that have to be removed from x.

property filepath: pathlib.Path#

Filepath from which to load the data.

load(ordered=False, labels_as_features=False)#

Load dataset from its CSV file.

Parameters
  • ordered (bool) – if True, return features such that discrete come first, then continuous

  • labels_as_features (bool) – if True, the s and y labels are included in the x features

Returns

DataTuple with dataframes of features, labels and sensitive attributes

Return type

ethicml.utility.data_structures.DataTuple

load_aif()#

Load the dataset as an AIF360 dataset.

Experimental. Requires the aif360 library.

Ignores the type check as the return type is not yet defined.

property name: str#

Name of the dataset.

property ordered_features: ethicml.data.dataset.FeatureSplit#

Return an ordered features dictionary.

This should have separate entries for the features, the labels and the sensitive attributes, but the x features are ordered so first are the discrete features, then the continuous.

property sens_attrs: List[str]#

Get the list of sensitive attributes.

class SyntheticScenarios(value)#

Bases: enum.Enum

Scenarios for the synthetic dataset.

class SyntheticTargets(value)#

Bases: enum.Enum

Targets for the synthetic dataset.

synthetic(scenario=1, target=3, fair=False, num_samples=1000)#

Dataset with synthetic data.

Notation: ⊥ means "is independent of" and ~ means "is an ancestor of" in the causal model used to generate the data.

Scenario 1 = X⊥S & Y⊥S.
  • This models completely fair data.

Scenario 2 = X_2⊥S & Y_2⊥S; X_1~S, Y_1~S & Y_3~S
  • This models data where the inputs are biased. This bias is propagated through to the target.

Scenario 3 = X⊥S, Y_1⊥S, Y_2⊥S; Y_3~S
  • This models data where the target is biased.

Scenario 4 = X_2⊥S, Y_2⊥S; X_1~S, Y_1~S, Y_3~S
  • This models data where both the input and target are directly biased.

Parameters
Return type

ethicml.data.tabular_data.synthetic.Synthetic

Toy#

Class to describe features of the Toy dataset.

class Toy(discrete_only=False, invert_s=False)#

Bases: ethicml.data.dataset.LoadableDataset

Dataset with toy data for testing.

Parameters
  • discrete_only (bool) –

  • invert_s (bool) –

Return type

None

__len__()#

Number of elements in the dataset.

Return type

int

property class_labels: List[str]#

Get the list of class labels.

property continuous_features: List[str]#

List of features that are continuous.

property disc_feature_groups: Optional[Dict[str, List[str]]]#

Dictionary of feature groups.

property discrete_features: List[str]#

List of features that are discrete.

expand_labels(label, label_type)#

Expand a label in the form of an index into all the subfeatures.

Parameters
  • label (pandas.DataFrame) –

  • label_type (Literal['s', 'y']) –

Return type

pandas.DataFrame

property feature_split: ethicml.data.dataset.FeatureSplit#

Return a feature split dictionary.

This should have separate entries for the features, the labels and the sensitive attributes.

property features_to_remove: List[str]#

Features that have to be removed from x.

property filepath: pathlib.Path#

Filepath from which to load the data.

load(ordered=False, labels_as_features=False)#

Load dataset from its CSV file.

Parameters
  • ordered (bool) – if True, return features such that discrete come first, then continuous

  • labels_as_features (bool) – if True, the s and y labels are included in the x features

Returns

DataTuple with dataframes of features, labels and sensitive attributes

Return type

ethicml.utility.data_structures.DataTuple

load_aif()#

Load the dataset as an AIF360 dataset.

Experimental. Requires the aif360 library.

Ignores the type check as the return type is not yet defined.

property name: str#

Name of the dataset.

property ordered_features: ethicml.data.dataset.FeatureSplit#

Return an ordered features dictionary.

This should have separate entries for the features, the labels and the sensitive attributes, but the x features are ordered so first are the discrete features, then the continuous.

property sens_attrs: List[str]#

Get the list of sensitive attributes.

toy()#

Dataset with toy data for testing.

Return type

ethicml.data.tabular_data.toy.Toy