This guide provides practical instructions for downloading, loading, and managing machine learning datasets using ObjMLDatasets - the centralized dataset management system for ML workflows.
ObjMLDatasets provides a unified interface for working with machine learning datasets from multiple sources:
# Activate the development environment first
source dev-env/bin/activate
# Then, in Python:
from ObjMLDatasets import ObjMLDatasets
# Initialize datasets manager
datasets = ObjMLDatasets()
# List all available datasets
available = datasets.list_available_datasets()
for name, info in available.items():
    print(f"{name}:")
    print(f"  Source: {info['source']}")
    print(f"  Description: {info['description']}")
    print()
# Load German Credit dataset
df = datasets.load_dataset("german_credit")
print(f"Shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")
print(f"\nFirst 5 rows:")
print(df.head())
# Check target distribution
print(f"\nTarget distribution:")
print(df['class'].value_counts())
# First, download the dataset
datasets.download_data(dataset_name="credit_score_classification")
# Then load it
df = datasets.load_data_from_kaggle(
dataset_name="credit_score_classification",
kaggle_dataset="parisrohan/credit-score-classification",
kaggle_file="train.csv"
)
print(f"Loaded {len(df)} records from Kaggle")
print(f"Target classes: {df['Credit_Score'].unique()}")
Credit risk classification with 20 attributes.
# Load dataset
df = datasets.load_dataset("german_credit")
# Or use the specific method
df = datasets.load_data_from_german_credit()
# Dataset info
print(f"Samples: {len(df)}")
print(f"Features: {len(df.columns) - 1}")
print(f"Target: {df['class'].value_counts()}")
Features include credit_amount, duration_in_month, and age_in_years, among other applicant attributes; the target column is class.
Diabetes prediction based on diagnostic measurements.
df = datasets.load_dataset("pima_indians_diabetes")
# Features
print(df.columns.tolist())
# ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
# 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome']
# Target
print(df['Outcome'].value_counts())
# 0: No diabetes
# 1: Diabetes
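In the raw Pima data, zeros in several medical columns actually mean "missing", which is why the 0-to-NaN option exists (see Configuration below). A quick pandas check, assuming the column names above, shows whether any zeros remain after loading:
# Count zeros in columns where 0 is physiologically implausible
print((df[['Glucose', 'BloodPressure', 'BMI']] == 0).sum())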
Kaggle credit scoring dataset with 150,000 samples.
df = datasets.load_dataset("give_me_some_credit")
# Key features
print(df.columns.tolist())
# Includes: age, DebtRatio, MonthlyIncome,
# NumberOfOpenCreditLinesAndLoans, etc.
# Target: SeriousDlqin2yrs (serious delinquency)
print(df['SeriousDlqin2yrs'].value_counts())
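Serious delinquency is the rare class here, so proportions are more informative than raw counts:
# Show class balance as fractions
print(df['SeriousDlqin2yrs'].value_counts(normalize=True))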
Credit card default prediction.
df = datasets.load_dataset("taiwan_credit_card_default")
# Features include payment history, bill amounts, payment amounts
print(f"Features: {len(df.columns)}")
print(f"Target: {df['default.payment.next.month'].value_counts()}")
Multi-class credit score classification (Good, Standard, Poor).
df = datasets.load_dataset("credit_score_classification")
# Multi-class target
print(df['Credit_Score'].value_counts())
# Good
# Standard
# Poor
# Rich feature set for credit assessment
print(f"Features: {len(df.columns)}")
Get Kaggle API credentials: in your Kaggle account settings, create a new API token, which downloads kaggle.json.
Configure credentials:
# Place kaggle.json in the correct location
mkdir -p ~/.kaggle
cp kaggle.json ~/.kaggle/
chmod 600 ~/.kaggle/kaggle.json
# Verify the API works
kaggle datasets list | head -5
from ObjMLDatasets import ObjMLDatasets
datasets = ObjMLDatasets()
# Download a dataset
datasets.download_data(dataset_name="credit_score_classification")
# Dataset is cached in: local.documents/ml/credit_score_classification/
# Load from downloaded Kaggle dataset
df = datasets.load_data_from_kaggle(
dataset_name="credit_score_classification",
kaggle_dataset="parisrohan/credit-score-classification",
kaggle_file="train.csv"
)
# Automatic preprocessing based on configuration
# - Drops unwanted columns (ID fields, etc.)
# - Handles missing values
# - Identifies target column
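A quick sanity check with plain pandas (not part of the ObjMLDatasets API) confirms the preprocessing ran:
# Imputation should leave no missing values
print(f"Missing values: {df.isnull().sum().sum()}")
# Configured drop_columns (ID fields, etc.) should be gone
print(df.columns.tolist())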
# Download a specific dataset
dev-env/bin/python factory.core/ObjMLDatasets.py download-data credit_score_classification
# Download multiple datasets
dev-env/bin/python factory.core/ObjMLDatasets.py download-data german_credit
dev-env/bin/python factory.core/ObjMLDatasets.py download-data pima_indians_diabetes
Add to ObjMLDatasets.yaml:
datasets:
  my_custom_dataset:
    data_url: "https://example.com/data/my_dataset.csv"
    column_names: ["feature1", "feature2", "feature3", "target"]
    sep: ","
    has_header: true
    target_column: "target"
    missing_value_strategy: "median"
Add to ObjMLDatasets.yaml:
datasets:
  my_kaggle_dataset:
    source_type: "kaggle"
    kaggle_dataset: "username/dataset-name"
    kaggle_file: "train.csv"
    sep: ","
    has_header: true
    drop_columns: ["ID"]
    target_column: "target"
    missing_value_strategy: "mean"
import pandas as pd
from ObjMLDatasets import ObjMLDatasets
# Load from local file
df = pd.read_csv("local.documents/ml/my_data/data.csv")
# To register it in the catalog, add an entry to ObjMLDatasets.yaml (see above)
datasets = ObjMLDatasets()
# Process and use as needed
my_dataset:
  data_url: "https://example.com/data.csv"
  sep: ","                  # Separator: "," "\s+" "\t"
  has_header: true          # First row contains headers
  column_names: [...]       # Column names if no header
  target_column: "target"   # Target variable name
my_dataset:
  missing_value_strategy: "mean"                    # "mean", "median", "mode", "drop"
  missing_value_columns_0_to_nan: ["col1", "col2"]  # Replace 0 with NaN
my_dataset:
  drop_columns: ["ID", "timestamp"]  # Columns to drop
  target_column: "outcome"           # Target variable
my_dataset:
  target_column: "credit_score"
  classification_type: "multiclass"
  classes: ["Poor", "Standard", "Good"]
from ObjMLDatasets import ObjMLDatasets
datasets = ObjMLDatasets()
# Load dataset
df = datasets.load_dataset("german_credit")
# Extract features for a sample
sample_guid = "SAMPLE001"
feature_dict = {
"credit_amount": df.iloc[0]["credit_amount"],
"duration_in_month": df.iloc[0]["duration_in_month"],
"age_in_years": df.iloc[0]["age_in_years"]
}
# Store in feature store
datasets.store_feature_set(
name="loan_application_features",
sample_guid=sample_guid,
features=feature_dict
)
# Define a derived feature
datasets.define_feature(
feature_name="debt_to_income_ratio",
feature_type="derived",
computation_logic="total_debt / annual_income",
description="Ratio of total debt to annual income",
provenance="Calculated from application data"
)
# Define an original feature
datasets.define_feature(
feature_name="credit_score",
feature_type="original",
description="Customer credit score from bureau",
provenance="Pulled from credit bureau API"
)
# CLI command
dev-env/bin/python factory.core/ObjMLDatasets.py list-featurestore
# Or in Python
features = datasets.list_feature_store()
for feature in features:
    print(f"{feature['feature_name']}: {feature['description']}")
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from ObjMLDatasets import ObjMLDatasets
datasets = ObjMLDatasets()
# Train model (example)
# model.fit(X_train, y_train)
# y_pred = model.predict(X_test)
# Calculate metrics
metrics = {
"accuracy": accuracy_score(y_test, y_pred),
"precision": precision_score(y_test, y_pred, average='weighted'),
"recall": recall_score(y_test, y_pred, average='weighted'),
"f1_score": f1_score(y_test, y_pred, average='weighted'),
"training_date": "2025-12-26",
"model_type": "LogisticRegression"
}
# Store evaluation results
datasets.store_model_evaluation(
model_name="credit_risk_model_v1",
dataset_name="german_credit",
evaluation_metrics=metrics
)
# List all evaluations for a dataset
evaluations = datasets.list_model_evaluations(dataset_name="german_credit")
print("Model Performance Comparison:")
for evaluation in evaluations:
    print(f"\n{evaluation['model_name']}:")
    metrics = evaluation['evaluation_metrics']
    print(f"  Accuracy: {metrics['accuracy']:.4f}")
    print(f"  F1-Score: {metrics['f1_score']:.4f}")
    print(f"  Date: {metrics['training_date']}")
# List model evaluations
dev-env/bin/python factory.core/ObjMLDatasets.py list-model-evaluations \
--dataset german_credit
from ObjML import ObjML
from ObjMLDatasets import ObjMLDatasets
# Train model and build scorecard
obj_ml = ObjML(0)
datasets = ObjMLDatasets()
# ... train model ...
# Build scorecard
scorecard = obj_ml.scorecard_scaling(
model=model,
pdo=20,
base_odds=50.0,
base_score=600
)
# Store scorecard in database
datasets.store_scorecard(
scorecard_name="german_credit_scorecard_v1",
dataset_name="german_credit",
model_name="logistic_regression_v1",
scorecard_data=scorecard
)
# Build scorecard from CLI
dev-env/bin/python factory.core/ObjMLDatasets.py build-scorecard \
--model-name credit_model \
--dataset-name german_credit \
--pdo 20 \
--base-score 600
from ObjMLDatasets import ObjMLDatasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
# Step 1: Load dataset
datasets = ObjMLDatasets()
df = datasets.load_dataset("pima_indians_diabetes")
# Step 2: Handle missing values (already done by ObjMLDatasets)
print(f"Missing values: {df.isnull().sum().sum()}")
# Step 3: Separate features and target
X = df.drop("Outcome", axis=1)
y = df["Outcome"]
# Step 4: Train-test split
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.3, random_state=42, stratify=y
)
# Step 5: Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Step 6: Store preprocessing info
preprocessing_info = {
"scaler_mean": scaler.mean_.tolist(),
"scaler_std": scaler.scale_.tolist(),
"features": X.columns.tolist(),
"train_size": len(X_train),
"test_size": len(X_test)
}
# Store as feature set
datasets.store_feature_set(
name="diabetes_preprocessing",
sample_guid="PREPROCESSING_V1",
features=preprocessing_info
)
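Persisting the scaler parameters is what makes later scoring reproducible; a minimal sketch of reapplying them, assuming preprocessing_info has been retrieved back from the feature store:
import numpy as np

# Standardize new data with the persisted training-set statistics
means = np.array(preprocessing_info["scaler_mean"])
stds = np.array(preprocessing_info["scaler_std"])
X_new_scaled = (X_test[preprocessing_info["features"]] - means) / stds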
# Use consistent naming conventions
dataset_name = "company_product_type_date"
# Examples:
# - "acme_credit_scoring_2025"
# - "bank_loan_default_train"
# - "retail_churn_prediction"
# Document dataset purpose
description = "Credit scoring dataset for consumer loans, 2023-2025 data"
# Include version in dataset name
dataset_configs = {
"credit_scoring_v1": {...}, # Original dataset
"credit_scoring_v2": {...}, # With additional features
"credit_scoring_v3": {...} # Updated with recent data
}
# Track dataset lineage in feature store
datasets.define_feature(
feature_name="credit_score_v2",
feature_type="derived",
computation_logic="Updated algorithm as of 2025",
provenance="credit_score_v1 recalculated with new model"
)
# Download once, use many times
import os

if not os.path.exists("local.documents/ml/my_dataset"):
    datasets.download_data("my_dataset")
# Load from cache
df = datasets.load_data_from_kaggle(
dataset_name="my_dataset",
kaggle_dataset="owner/dataset-name",
kaggle_file="data.csv"
)
# Document all features in feature store
feature_definitions = {
"credit_utilization": {
"type": "derived",
"logic": "total_balance / total_limit",
"description": "Percentage of credit limit used",
"provenance": "Calculated from credit report data"
},
"payment_history_score": {
"type": "derived",
"logic": "Weighted average of on-time payments",
"description": "0-100 score based on payment timeliness",
"provenance": "Historical payment data"
}
}
for name, info in feature_definitions.items():
    datasets.define_feature(
        feature_name=name,
        feature_type=info["type"],
        computation_logic=info["logic"],
        description=info["description"],
        provenance=info["provenance"]
    )
def validate_dataset(df, dataset_name):
    """Validate dataset quality."""
    issues = []
    # Check for missing values
    missing_pct = (df.isnull().sum() / len(df)) * 100
    high_missing = missing_pct[missing_pct > 50]
    if len(high_missing) > 0:
        issues.append(f"High missing values: {high_missing.to_dict()}")
    # Check for duplicates
    duplicates = df.duplicated().sum()
    if duplicates > 0:
        issues.append(f"Found {duplicates} duplicate rows")
    # Check target balance
    if 'target' in df.columns:
        balance = df['target'].value_counts(normalize=True)
        if balance.max() > 0.95:
            issues.append(f"Highly imbalanced target: {balance.to_dict()}")
    if issues:
        print(f"⚠ Dataset quality issues for {dataset_name}:")
        for issue in issues:
            print(f"  - {issue}")
    else:
        print(f"✓ Dataset {dataset_name} passed quality checks")
    return len(issues) == 0
# Use validation
df = datasets.load_dataset("german_credit")
validate_dataset(df, "german_credit")
Check API credentials:
# Verify kaggle.json exists
ls -la ~/.kaggle/kaggle.json
# Test Kaggle API
kaggle datasets list | head -5
Solution:
# Reconfigure Kaggle API
mkdir -p ~/.kaggle
cp kaggle.json ~/.kaggle/
chmod 600 ~/.kaggle/kaggle.json
Check available datasets:
datasets = ObjMLDatasets()
available = datasets.list_available_datasets()
print(available.keys())
Verify configuration:
# Check ObjMLDatasets.yaml
grep -A 5 "your_dataset_name" factory.core/ObjMLDatasets.yaml
Check configuration:
my_dataset:
  missing_value_strategy: "mean"                      # Ensure this is set
  missing_value_columns_0_to_nan: ["Glucose", "BMI"]  # Convert 0 to NaN first
Verify in Python:
df = datasets.load_dataset("my_dataset")
print(f"Missing values per column:")
print(df.isnull().sum())
Optimize loading:
# Use chunked loading for large datasets
import pandas as pd

chunk_size = 10000
chunks = []
for chunk in pd.read_csv("large_dataset.csv", chunksize=chunk_size):
    # Process each chunk (preprocess_chunk is a placeholder for your own logic)
    processed = preprocess_chunk(chunk)
    chunks.append(processed)
df = pd.concat(chunks, ignore_index=True)
Cache processed datasets:
import pickle
# Save processed dataset
with open("local.documents/ml/processed_dataset.pkl", "wb") as f:
    pickle.dump(df, f)
# Load from cache
with open("local.documents/ml/processed_dataset.pkl", "rb") as f:
    df = pickle.load(f)
from ObjMLDatasets import ObjMLDatasets
from ObjML import ObjML
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report
import pandas as pd
# Step 1: Initialize
print("=== Initializing ===")
datasets = ObjMLDatasets()
obj_ml = ObjML(0)
# Step 2: Load Dataset
print("\n=== Loading Dataset ===")
df = datasets.load_dataset("german_credit")
print(f"Loaded {len(df)} records")
# Step 3: Prepare Data
print("\n=== Preparing Data ===")
X = df.drop("class", axis=1)
y = df["class"]
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.3, random_state=42, stratify=y
)
# Step 4: Train Model
print("\n=== Training Model ===")
model, metrics, y_pred, X_test_out, y_test_out = obj_ml.train_cost_sensitive_classifier(
X=X_train,
y=y_train,
class_weights={0: 1, 1: 5},
model_type="LogisticRegression"
)
# Step 5: Evaluate
print("\n=== Evaluating Model ===")
y_test_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Accuracy: {test_accuracy:.4f}")
# Step 6: Store Evaluation
print("\n=== Storing Evaluation ===")
evaluation_metrics = {
"accuracy": test_accuracy,
"precision": precision_score(y_test, y_test_pred),
"recall": recall_score(y_test, y_test_pred),
"classification_report": classification_report(y_test, y_test_pred, output_dict=True),
"training_date": "2025-12-26",
"model_type": "LogisticRegression",
"class_weights": "{0: 1, 1: 5}"
}
datasets.store_model_evaluation(
model_name="german_credit_model_v1",
dataset_name="german_credit",
evaluation_metrics=evaluation_metrics
)
# Step 7: Build and Store Scorecard
print("\n=== Building Scorecard ===")
scorecard = obj_ml.scorecard_scaling(
model=model,
pdo=20,
base_odds=50.0,
base_score=600
)
datasets.store_scorecard(
scorecard_name="german_credit_scorecard_v1",
dataset_name="german_credit",
model_name="german_credit_model_v1",
scorecard_data=scorecard
)
# Step 8: Store Features
print("\n=== Storing Feature Definitions ===")
for feature in X_train.columns:
    datasets.define_feature(
        feature_name=feature,
        feature_type="original",
        description="Feature from German Credit dataset",
        provenance="UCI ML Repository"
    )
print("\n=== Pipeline Complete ===")
print(f"Model: german_credit_model_v1")
print(f"Accuracy: {test_accuracy:.4f}")
print(f"Scorecard: german_credit_scorecard_v1")