Data profiling
In addition to monitoring, Scouter also provides data profiling tools to create feature distribution profiles to associate with your data.
Supported Data Types
Section titled “Supported Data Types”Scouter supports a variety of data types, including:
- Pandas DataFrames: Scouter can handle Pandas DataFrames, making it easy to integrate with existing data processing pipelines.
- Numpy Arrays: Out of the box support for 2D arrays.
- Polars DataFrames: For users who prefer Polars, Scouter supports this data format as well, allowing for efficient data processing and analysis.
Create a Profile
Section titled “Create a Profile”import numpy as npimport pandas as pdfrom scouter import DataProfile, DataProfiler # type: ignore[attr-defined]
def generate_data() -> pd.DataFrame: """Create a fake data frame for testing""" n = 10_000 X_train = np.random.normal(-4, 2.0, size=(n, 4)) col_names = [] for i in range(0, X_train.shape[1]): col_names.append(f"feature_{i}") X = pd.DataFrame(X_train, columns=col_names)
# create string column (with 10 unique values) X["categorical_feature"] = np.random.choice( ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"], size=n )
return X
data = generate_data()
# create data profilerprofiler = DataProfiler()
# create data profileprofile: DataProfile = profiler.create_data_profile(data)
# Save (provide a path or leave blank)profile.save_to_json()
print(profile)???success “Output”
```json { "features": { "categorical_feature": { "id": "categorical_feature", "numeric_stats": null, "string_stats": { "distinct": { "count": 10, "percent": 0.1 }, "char_stats": { "min_length": 1, "max_length": 1, "median_length": 1, "mean_length": 1.0 }, "word_stats": { "words": { "d": { "count": 1021, "percent": 0.1021 }, "b": { "count": 998, "percent": 0.0998 }, "i": { "count": 989, "percent": 0.0989 }, "c": { "count": 1015, "percent": 0.1015 }, "f": { "count": 1007, "percent": 0.1007 }, "a": { "count": 987, "percent": 0.0987 }, "h": { "count": 982, "percent": 0.0982 }, "g": { "count": 988, "percent": 0.0988 }, "j": { "count": 1020, "percent": 0.102 }, "e": { "count": 993, "percent": 0.0993 } } } }, "timestamp": "2025-04-28T18:19:33.987594Z", "correlations": null }, "feature_0": { "id": "feature_0", "numeric_stats": { "mean": -4.003965640199899, "stddev": 2.0178515177174448, "min": -11.084430177884318, "max": 3.375571168173134, "distinct": { "count": 10000, "percent": 1.0 }, "quantiles": { "q25": -5.348060765665304, "q50": -3.9960621892367625, "q75": -2.6379431350112723, "q99": 0.7695557247479057 }, "histogram": { "bins": [ -11.084430177884318, -10.361430110581445, -9.638430043278573, -8.9154299759757, -8.192429908672828, -7.469429841369955, -6.7464297740670816, -6.023429706764209, -5.300429639461337, -4.577429572158464, -3.8544295048555917, -3.1314294375527183, -2.408429370249845, -1.6854293029469734, -0.9624292356441, -0.2394291683412284, 0.48357089896164496, 1.2065709662645183, 1.92957103356739, 2.6525711008702633 ], "bin_counts": [ 8, 14, 53, 120, 228, 444, 706, 1006, 1300, 1399, 1404, 1159, 914, 599, 352, 153, 81, 39, 17, 4 ] } }, "string_stats": null, "timestamp": "2025-04-28T18:19:34.035699Z", "correlations": null }, "feature_1": { "id": "feature_1", "numeric_stats": { "mean": -3.997238711617275, "stddev": 1.9969030204031852, "min": -11.2143596931969, "max": 3.5233542660216592, "distinct": { "count": 10000, "percent": 1.0 }, "quantiles": { "q25": -5.331856994963132, "q50": -3.9985892307105217, "q75": -2.6720412739258803, "q99": 0.7018279512701495 }, "histogram": { "bins": [ -11.2143596931969, -10.477473995235973, -9.740588297275044, -9.003702599314117, -8.266816901353188, -7.529931203392261, -6.7930455054313335, -6.056159807470405, -5.319274109509477, -4.582388411548549, -3.8455027135876207, -3.1086170156266935, -2.371731317665766, -1.634845619704837, -0.8979599217439098, -0.16107422378298075, 0.5758114741779465, 1.3126971721388738, 2.049582870099803, 2.78646856806073 ], "bin_counts": [ 7, 18, 42, 93, 218, 425, 706, 1015, 1345, 1431, 1430, 1216, 866, 580, 340, 158, 72, 20, 13, 5 ] } }, "string_stats": null, "timestamp": "2025-04-28T18:19:34.035702Z", "correlations": null }, "feature_2": { "id": "feature_2", "numeric_stats": { "mean": -3.9985933460650362, "stddev": 1.9872554303247896, "min": -11.298088188584437, "max": 3.7792120832159224, "distinct": { "count": 10000, "percent": 1.0 }, "quantiles": { "q25": -5.3669588658728955, "q50": -3.991356640189083, "q75": -2.6604527797988693, "q99": 0.716291352460563 }, "histogram": { "bins": [ -11.298088188584437, -10.54422317499442, -9.7903581614044, -9.036493147814383, -8.282628134224364, -7.528763120634347, -6.774898107044329, -6.021033093454311, -5.267168079864293, -4.513303066274275, -3.7594380526842563, -3.005573039094239, -2.2517080255042217, -1.4978430119142025, -0.7439779983241852, 0.00988701526583391, 0.7637520288558513, 1.5176170424458686, 2.2714820560358877, 3.025347069625905 ], "bin_counts": [ 3, 11, 36, 109, 222, 423, 744, 1114, 1321, 1487, 1454, 1193, 853, 530, 284, 120, 52, 29, 13, 2 ] } }, "string_stats": null, "timestamp": "2025-04-28T18:19:34.035703Z", "correlations": null }, "feature_3": { "id": "feature_3", "numeric_stats": { "mean": -3.996914164800711, "stddev": 1.9930422779448296, "min": -11.140741264279484, "max": 2.876859924322919, "distinct": { "count": 10000, "percent": 1.0 }, "quantiles": { "q25": -5.332108894837108, "q50": -3.9894123025713193, "q75": -2.639771809727505, "q99": 0.531352688455816 }, "histogram": { "bins": [ -11.140741264279484, -10.439861204849365, -9.738981145419244, -9.038101085989123, -8.337221026559003, -7.636340967128883, -6.935460907698763, -6.234580848268643, -5.533700788838523, -4.832820729408403, -4.131940669978283, -3.4310606105481627, -2.7301805511180426, -2.0293004916879234, -1.3284204322578024, -0.6275403728276814, 0.07333968660243784, 0.774219746032557, 1.475099805462678, 2.175979864892799 ], "bin_counts": [ 3, 22, 37, 79, 218, 385, 557, 870, 1214, 1333, 1422, 1216, 1002, 759, 422, 251, 146, 42, 16, 6 ] } }, "string_stats": null, "timestamp": "2025-04-28T18:19:34.035704Z", "correlations": null } }}```