Data Profile
In addition to monitoring, Scouter also provides data profiling tools for creating feature distribution profiles that you can associate with your data.
Supported Data Types
Scouter supports a variety of data types, including:
- Pandas DataFrames: Scouter can handle Pandas DataFrames, making it easy to integrate with existing data processing pipelines.
- NumPy Arrays: 2D NumPy arrays are supported out of the box.
- Polars DataFrames: For users who prefer Polars, Scouter supports this data format as well, allowing for efficient data processing and analysis.
Create a Profile
import numpy as np
import pandas as pd
from scouter import DataProfile, DataProfiler # type: ignore[attr-defined]
def generate_data() -> pd.DataFrame:
    """Create a fake data frame for testing.

    Returns:
        A DataFrame with 10,000 rows: four numeric columns named
        ``feature_0`` through ``feature_3`` drawn from a normal
        distribution (mean -4, standard deviation 2.0), plus a string
        column ``categorical_feature`` sampled from the letters "a"
        through "j".
    """
    n = 10_000
    X_train = np.random.normal(-4, 2.0, size=(n, 4))

    # Name each numeric column after its positional index.
    col_names = [f"feature_{i}" for i in range(X_train.shape[1])]
    X = pd.DataFrame(X_train, columns=col_names)

    # create string column (with 10 unique values)
    X["categorical_feature"] = np.random.choice(
        ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"], size=n
    )

    return X
# Build the sample DataFrame to profile
data = generate_data()
# create data profiler
profiler = DataProfiler()
# create a data profile from the sample data
profile: DataProfile = profiler.create_data_profile(data)
# Save the profile as JSON (provide a path or leave blank)
profile.save_to_json()
# Print the profile; an example of the printed output is shown below
print(profile)
Output
{
"features": {
"categorical_feature": {
"id": "categorical_feature",
"numeric_stats": null,
"string_stats": {
"distinct": {
"count": 10,
"percent": 0.1
},
"char_stats": {
"min_length": 1,
"max_length": 1,
"median_length": 1,
"mean_length": 1.0
},
"word_stats": {
"words": {
"d": {
"count": 1021,
"percent": 0.1021
},
"b": {
"count": 998,
"percent": 0.0998
},
"i": {
"count": 989,
"percent": 0.0989
},
"c": {
"count": 1015,
"percent": 0.1015
},
"f": {
"count": 1007,
"percent": 0.1007
},
"a": {
"count": 987,
"percent": 0.0987
},
"h": {
"count": 982,
"percent": 0.0982
},
"g": {
"count": 988,
"percent": 0.0988
},
"j": {
"count": 1020,
"percent": 0.102
},
"e": {
"count": 993,
"percent": 0.0993
}
}
}
},
"timestamp": "2025-04-28T18:19:33.987594Z",
"correlations": null
},
"feature_0": {
"id": "feature_0",
"numeric_stats": {
"mean": -4.003965640199899,
"stddev": 2.0178515177174448,
"min": -11.084430177884318,
"max": 3.375571168173134,
"distinct": {
"count": 10000,
"percent": 1.0
},
"quantiles": {
"q25": -5.348060765665304,
"q50": -3.9960621892367625,
"q75": -2.6379431350112723,
"q99": 0.7695557247479057
},
"histogram": {
"bins": [
-11.084430177884318,
-10.361430110581445,
-9.638430043278573,
-8.9154299759757,
-8.192429908672828,
-7.469429841369955,
-6.7464297740670816,
-6.023429706764209,
-5.300429639461337,
-4.577429572158464,
-3.8544295048555917,
-3.1314294375527183,
-2.408429370249845,
-1.6854293029469734,
-0.9624292356441,
-0.2394291683412284,
0.48357089896164496,
1.2065709662645183,
1.92957103356739,
2.6525711008702633
],
"bin_counts": [
8,
14,
53,
120,
228,
444,
706,
1006,
1300,
1399,
1404,
1159,
914,
599,
352,
153,
81,
39,
17,
4
]
}
},
"string_stats": null,
"timestamp": "2025-04-28T18:19:34.035699Z",
"correlations": null
},
"feature_1": {
"id": "feature_1",
"numeric_stats": {
"mean": -3.997238711617275,
"stddev": 1.9969030204031852,
"min": -11.2143596931969,
"max": 3.5233542660216592,
"distinct": {
"count": 10000,
"percent": 1.0
},
"quantiles": {
"q25": -5.331856994963132,
"q50": -3.9985892307105217,
"q75": -2.6720412739258803,
"q99": 0.7018279512701495
},
"histogram": {
"bins": [
-11.2143596931969,
-10.477473995235973,
-9.740588297275044,
-9.003702599314117,
-8.266816901353188,
-7.529931203392261,
-6.7930455054313335,
-6.056159807470405,
-5.319274109509477,
-4.582388411548549,
-3.8455027135876207,
-3.1086170156266935,
-2.371731317665766,
-1.634845619704837,
-0.8979599217439098,
-0.16107422378298075,
0.5758114741779465,
1.3126971721388738,
2.049582870099803,
2.78646856806073
],
"bin_counts": [
7,
18,
42,
93,
218,
425,
706,
1015,
1345,
1431,
1430,
1216,
866,
580,
340,
158,
72,
20,
13,
5
]
}
},
"string_stats": null,
"timestamp": "2025-04-28T18:19:34.035702Z",
"correlations": null
},
"feature_2": {
"id": "feature_2",
"numeric_stats": {
"mean": -3.9985933460650362,
"stddev": 1.9872554303247896,
"min": -11.298088188584437,
"max": 3.7792120832159224,
"distinct": {
"count": 10000,
"percent": 1.0
},
"quantiles": {
"q25": -5.3669588658728955,
"q50": -3.991356640189083,
"q75": -2.6604527797988693,
"q99": 0.716291352460563
},
"histogram": {
"bins": [
-11.298088188584437,
-10.54422317499442,
-9.7903581614044,
-9.036493147814383,
-8.282628134224364,
-7.528763120634347,
-6.774898107044329,
-6.021033093454311,
-5.267168079864293,
-4.513303066274275,
-3.7594380526842563,
-3.005573039094239,
-2.2517080255042217,
-1.4978430119142025,
-0.7439779983241852,
0.00988701526583391,
0.7637520288558513,
1.5176170424458686,
2.2714820560358877,
3.025347069625905
],
"bin_counts": [
3,
11,
36,
109,
222,
423,
744,
1114,
1321,
1487,
1454,
1193,
853,
530,
284,
120,
52,
29,
13,
2
]
}
},
"string_stats": null,
"timestamp": "2025-04-28T18:19:34.035703Z",
"correlations": null
},
"feature_3": {
"id": "feature_3",
"numeric_stats": {
"mean": -3.996914164800711,
"stddev": 1.9930422779448296,
"min": -11.140741264279484,
"max": 2.876859924322919,
"distinct": {
"count": 10000,
"percent": 1.0
},
"quantiles": {
"q25": -5.332108894837108,
"q50": -3.9894123025713193,
"q75": -2.639771809727505,
"q99": 0.531352688455816
},
"histogram": {
"bins": [
-11.140741264279484,
-10.439861204849365,
-9.738981145419244,
-9.038101085989123,
-8.337221026559003,
-7.636340967128883,
-6.935460907698763,
-6.234580848268643,
-5.533700788838523,
-4.832820729408403,
-4.131940669978283,
-3.4310606105481627,
-2.7301805511180426,
-2.0293004916879234,
-1.3284204322578024,
-0.6275403728276814,
0.07333968660243784,
0.774219746032557,
1.475099805462678,
2.175979864892799
],
"bin_counts": [
3,
22,
37,
79,
218,
385,
557,
870,
1214,
1333,
1422,
1216,
1002,
759,
422,
251,
146,
42,
16,
6
]
}
},
"string_stats": null,
"timestamp": "2025-04-28T18:19:34.035704Z",
"correlations": null
}
}
}