In [ ]:
import importlib
import seaborn as sns
import matplotlib.pyplot as plt
import src.models.MobileNet.runner_scripts.trainer as trainer
import src.models.MobileNet.classifier as classifier
import src.models.MobileNet.data_loader as data_loader
import src.models.MobileNet.metrics as metrics
import os
import Notebooks.utils.utils as utils
import Notebooks.utils.error_analysis as error_analysis
import pandas as pd
from PIL import Image
from IPython.display import display, HTML, Image as IPImage
from matplotlib.image import imread
import numpy as np
import torch
import src.models.MobileNet.data_defs as data_defs
import warnings
In [3]:
utils.fix_cwd()
sns.set_theme(style="darkgrid", palette="pastel")
plt.style.use("fivethirtyeight")

VERBOSE = True

3.0. Performance Analysis¶

In this notebook, we'll evaluate two versions of our final production model:

  1. v1/baseline - tuned to minimize overall loss (a combined normalized loss for gender and age prediction) while maximizing gender prediction accuracy and minimizing age MAE (mean absolute error).

  2. v2/improved - This model was developed based on the error and misclassification analysis of v1. Specifically, we examined:

    • Performance across different bins of age, luminance (as a heuristic for skin color), image quality, and other factors
    • Individual sample analysis using LIME (Local Interpretable Model-agnostic Explanations)

    Based on this, we selected appropriate oversampling techniques, augmentations, and transforms to improve performance across these groups, particularly focusing on areas where the baseline model showed weaknesses.

The notebook is structured iteratively: in the first part we analyze only the v1 model and explain how those insights were used to build the v2 model (that analysis was done before training/evaluating the improved model).

3.1. V1/baseline Summary¶

After ~150-200 total runs (see the hyperparameter tuning notebook for details), we selected the following configuration:

Parameter Value
anneal_strategy "cos"
base_lr 0.0068893981577029285
batch_size 256
div_factor 24
dropout 0.1
final_div_factor 2,873
freeze_epochs 0
gender_loss_weight 0.9
l1_lambda 0.0001
lr_scheduler "one_cycle"
max_lr 0.012321315111072404
model_type "mobilenet_v3_small"
num_epochs 18
override_cycle_epoch_count 15
pct_start 0.36685557351085574
prefix "fixed_samples_final_full_split_15_cycle+3"
train_path "dataset/train_8_folds_first"
use_dynamic_augmentation false
val_path "dataset/test_2_folds_last"
weight_decay 0.00019323262043373016

Main parameters:

  • OneCycle with cosine annealing achieved considerably faster convergence and better generalization: only 15-20 epochs at batch size 256 were needed to reach optimal performance, compared to 25-35+ epochs for step-decay or reduce-on-plateau schedulers.

  • AdamW was used as an optimizer.

  • In addition, we've used L1 regularization (AdamW already applies decoupled weight decay, an L2-style penalty) and dropout (applied only to the final classifier/regression heads) to reduce overfitting; see the sketch below this list. We observed a relatively small impact on validation/training performance with the UTK dataset, but it should theoretically have a bigger impact on real-world/production data.

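A minimal sketch of how the optimizer, scheduler, and regularization wire together, using the tuned values from the table above (the real implementation lives in src/models/MobileNet/runner_scripts/trainer.py; `model`, `train_loader`, and `compute_combined_loss` are assumed stand-ins):

import torch

optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=0.0068893981577029285,             # base_lr from the table above
    weight_decay=0.00019323262043373016,  # AdamW's decoupled (L2-style) decay
)
scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer,
    max_lr=0.012321315111072404,
    epochs=15,                            # override_cycle_epoch_count
    steps_per_epoch=len(train_loader),
    pct_start=0.36685557351085574,
    anneal_strategy="cos",
    div_factor=24,
    final_div_factor=2873,
)

l1_lambda = 1e-4
for images, (gender_y, age_y) in train_loader:  # assumed batch structure
    loss = compute_combined_loss(model(images), gender_y, age_y)  # hypothetical helper
    # L1 penalty added on top of AdamW's built-in weight decay
    loss = loss + l1_lambda * sum(p.abs().sum() for p in model.parameters())
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    scheduler.step()  # OneCycleLR advances once per batch, not per epoch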
In [4]:
%%html
<iframe src="https://wandb.ai/qqwy/ag_classifier_main/reports/Best-Iteration-1-model-graceful-hill-257---Vmlldzo4ODIwODg5" style="border:none;height:1024px;width:100%"></iframe>
In [6]:
data = {
    "Parameter": [
        "model_type",
        "lr_scheduler",
        "anneal_strategy",
        "base_lr",
        "batch_size",
        "div_factor",
        "dropout",
        "final_div_factor",
        "freeze_epochs",
        "l1_lambda",
        "max_lr",
        "num_epochs",
        "override_cycle_epoch_count",
        "weight_decay",
        "pct_start",
        "train_path",
        "val_path",
    ],
    "Value": [
        "mobilenet_v3_small",
        "one_cycle",
        "cos",
        0.0068893981577029285,
        256,
        24,
        0.1,
        2873,
        0,
        0.0001,
        0.012321315111072404,
        18,
        15,
        0.00019323262043373016,
        0.36685557351085574,
        "dataset/train_8_folds_first",
        "dataset/test_2_folds_last",
    ],
}
3.1.1. Main Observations¶
  • Using one_cycle as our LR scheduler allowed us to reach convergence in only ~15 epochs while providing significantly better performance than reduce_on_plateau or step_lr achieved even after 30-40 epochs.

  • freeze_epochs (epochs spent training only the new age/gender classifier heads while the backbone stays frozen) had a relatively limited impact. Generally, freezing for 1-3 epochs achieved slightly lower age MAE at the cost of reduced gender accuracy.

  • MobileNet was fine-tuned using pretrained weights (IMAGENET1K_V1); a minimal sketch of this setup follows the list below. We've found that training MobileNet from scratch (using randomized initial weights) can provide comparable or only slightly inferior performance on the UTK dataset. We still chose the pretrained weights because:

    • the model still performs a bit better (0.015 higher accuracy, ~0.2 lower MAE)
    • the pretrained backbone has seen a much wider variety of images in different conditions, so it should still perform better (or at least no worse) on images of faces in real-world conditions.
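A minimal sketch of this pretrained setup (an assumption for illustration; the actual model definition lives in src/models/MobileNet/classifier.py and may differ in details):

import torch.nn as nn
from torchvision.models import mobilenet_v3_small, MobileNet_V3_Small_Weights

class AgeGenderNet(nn.Module):
    def __init__(self, dropout: float = 0.1):
        super().__init__()
        backbone = mobilenet_v3_small(weights=MobileNet_V3_Small_Weights.IMAGENET1K_V1)
        in_features = backbone.classifier[0].in_features  # 576 for v3_small
        backbone.classifier = nn.Identity()  # keep only the pretrained feature extractor
        self.backbone = backbone
        # dropout is applied only to the new task heads, as noted above
        self.gender_head = nn.Sequential(nn.Dropout(dropout), nn.Linear(in_features, 1))
        self.age_head = nn.Sequential(nn.Dropout(dropout), nn.Linear(in_features, 1))

    def forward(self, x):
        feats = self.backbone(x)
        return self.gender_head(feats), self.age_head(feats)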
In [11]:
BASE_MODEL_NAME = "final_prod_z5yxudkl_graceful-hill-257_19_0.9310.pth"
OVERSAMPLE_AUG_MODEL_NAME = "full_aug_small_production_v1.pth"
NOT_PRETRAINED_MODEL_NAME = (
    "NO_WEIGHTS_full_dynamic_aug_tune_18_cycle+3_sage-planet-309_20_0.9259.pth"
)
In [12]:
test_config = {
    "ds_path": "dataset/test_2_folds_last",
    "batch_size": 512,
}
In [ ]:
base_model = trainer.load_model(BASE_MODEL_NAME)
base_model.eval()
improved_model = trainer.load_model(OVERSAMPLE_AUG_MODEL_NAME)
improved_model.eval()
In [ ]:
data_module_base = data_loader.create_dataloaders(test_config, mode="test")
data_module_base.setup("test")
predictions_base = classifier.predict_with_model(base_model, data_module_base);
In [ ]:
data_module_improved = data_loader.create_dataloaders(test_config, mode="test")
data_module_improved.setup("test")
predictions_improved = classifier.predict_with_model(
    improved_model, data_module_improved
)
In [ ]:
importlib.reload(error_analysis)

image_data_path = "dataset/image_entropy_summary.csv"
image_data = pd.read_csv(image_data_path)

merged_data_base = error_analysis.sync_predictions_with_image_data(
    predictions_base, image_data
)
merged_data_improved = error_analysis.sync_predictions_with_image_data(
    predictions_improved, image_data
)


image_quality_metrics_base = error_analysis.evaluate_by_image_quality(merged_data_base)
image_quality_metrics_improved = error_analysis.evaluate_by_image_quality(
    merged_data_improved
);

3.2.0 Performance Metrics¶

3.2.1. Gender (binary classifier):¶
In [17]:
importlib.reload(metrics)
evaluation_results_improved = metrics.evaluate_predictions(predictions_improved)

evaluation_results_base = metrics.evaluate_predictions(predictions_base)
evaluation_results_base["gender_metrics"]
Out[17]:
Female Male Overall
Support 2353.000000 2387.000000 4740.000000
Accuracy 0.931013 0.931013 0.931013
Precision 0.924204 0.937925 0.931065
Recall 0.937952 0.924173 0.931062
F1-score 0.931027 0.930998 0.931013
AUC-ROC NaN NaN 0.980522
PR-AUC NaN NaN 0.977997
Log Loss NaN NaN 0.178862
Age estimation:¶
In [18]:
evaluation_results_base["age_metrics"]
Out[18]:
Value
MAE 5.105901
MSE 54.144762
RMSE 7.358312
R-squared 0.862191
MAPE 25.161557

The initial v1 model was able to achieve an accuracy of ~93% for gender predictions and an age MAE (Mean Absolute Error) of around 5.1 years, which is reasonably good compared to the baseline performance of some significantly more complex models like VGG:

In [19]:
utils.get_baselines_table()
Out[19]:
Model Age Estimation (MAE) Gender Classification (Accuracy)
0 XGBoost (+feat. extraction) 5.89 93.80
1 SVC(..) 5.49 94.64
2 VGG_f 4.86 93.42
3 ResNet50_f 4.65 94.64
4 SENet50_f 4.58 94.90

(*https://arxiv.org/pdf/2110.12633)

In [20]:
importlib.reload(utils)
utils.model_desc_table()
Out[20]:
VGG16 ResNet50 MobileNetV3-Small
Metric
Parameter Count ~138 million ~25.6 million ~2.5 million
Model Size (PyTorch, FP32) ~528 MB ~98 MB ~10 MB
Inference Speed (relative) 1x (baseline) ~2.5x faster ~10x faster
FLOPs ~15.5 billion ~4.1 billion ~56 million
Approx. Memory Usage (inference) 1x ~0.6x ~0.15x

Achieving this with a much smaller and less complex model is not necessarily exceptional: the UTKFace dataset is relatively small and specific compared to general image classification tasks (which can level the playing field for smaller models), and there are several other studies/benchmarks showing MobileNet variants performing competitively with larger models on simple tasks like this one (while performing significantly worse at more complex tasks such as emotion detection or face recognition):

E.g., according to Savchenko, A. V. (2024), arXiv, https://ar5iv.labs.arxiv.org/html/2103.17107, MobileNet without any fine-tuning on UTKFace (i.e., the full UTKFace dataset was used for testing) actually outperformed VGG-16 and ResNet-50.

Summary of Age Prediction:¶

The age predictions are slightly underestimated and biased toward younger ages:

In [21]:
evaluation_results_base["age_statistics"]
Out[21]:
True Age Predicted Age
Mean 33.308439 32.147823
Median 29.000000 28.514690
Min 1.000000 -2.139822
Max 116.000000 95.214233
Summary of Gender Prediction:¶

The model exhibits a minor bias towards female predictions despite a slightly male-skewed test sample:

In [90]:
importlib.reload(error_analysis)

error_analysis.confusion_matrix_plot_v2(
    merged_data_base,
    "true_gender",
    "gender_pred",
    title="Gender Classification",
    class_labels=["Male", "Female"],
)
Out[90]:
<Axes: title={'center': 'Gender Classification'}, xlabel='Predicted label', ylabel='True label'>

3.2.2. Performance by Binned Luminance (proxy for skin color)¶

Based on what we've discussed previously, we'll try to use image luminance to measure any biases our model might have related to skin color (i.e. whether it performs better or worse depending on whether the subject's skin is darker or brighter).

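For reference, a sketch of the luminance heuristic (the values actually used come precomputed from dataset/image_entropy_summary.csv and may have been derived slightly differently): mean per-pixel luma via the BT.601 weights.

import numpy as np
from PIL import Image

def mean_luminance(path: str) -> float:
    """Average per-pixel luma on a 0-255 scale: Y = 0.299 R + 0.587 G + 0.114 B."""
    rgb = np.asarray(Image.open(path).convert("RGB"), dtype=np.float64)
    return float((rgb @ [0.299, 0.587, 0.114]).mean())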
In [119]:
importlib.reload(metrics)
metrics.display_binned_samples(merged_data_base)
[Figure: sample images for each luminance bin]

Looking at gender prediction accuracy, we can see that the model performs more or less comparably on images that are average or darker than average; however, as luminance increases further, performance goes down. This is likely because pictures of children and babies are overrepresented in the higher luminance bins (a higher proportion of them were taken under bright studio lighting):

In [59]:
importlib.reload(metrics)

vars_to_bin = [
    ("luminance", metrics.DEFAULT_LUMINANCE_BINS),
    ("brisque_score", metrics.DEFAULT_BRISQUE_BINS),
]

t = metrics.calculate_binned_metrics(merged_data_base, ["gender", "age"], vars_to_bin)
lum_bins_improved = metrics.calculate_binned_metrics(
    merged_data_improved, ["gender", "age"], vars_to_bin
)

lum_bins_base = t
lum_bins_base["gender"]["luminance_binned"]
Out[59]:
sample_size mean gender (std) mean age (std) accuracy F1 log_loss
bin
0-85 412 0.45(0.50) 33.76(14.57) 0.951 0.947 0.156
85-105 748 0.49(0.50) 34.72(17.91) 0.934 0.933 0.188
105-120 846 0.51(0.50) 34.84(18.00) 0.937 0.939 0.151
120-135 892 0.49(0.50) 34.84(18.54) 0.959 0.958 0.142
135-150 805 0.51(0.50) 34.03(21.26) 0.911 0.911 0.179
150+ 912 0.50(0.50) 28.26(23.62) 0.904 0.905 0.227

There seems to be a lot of variance between bins when predicting age. It's hard to explain for the first four bins, since they have comparable actual average ages (MAE naturally shrinks as the average age decreases, so we should ignore the last bin):

In [67]:
lum_bins_base["age"]["luminance_binned"]
Out[67]:
sample_size MAE RMSE
bin
0-85 412 5.408 8.055
85-105 748 5.407 7.859
105-120 846 4.997 7.161
120-135 892 5.616 7.921
135-150 805 5.037 7.057
150+ 912 4.459 6.540

3.2.3. Accuracy of Gender Prediction by Age Group¶

In [16]:
evaluation_results_base["gender_accuracy_by_age"]
Out[16]:
Total Correct Accuracy
Age_Group
0-4 444 307 0.6914
4-14 261 215 0.8238
14-24 636 604 0.9497
24-30 1228 1187 0.9666
30-40 865 837 0.9676
40-50 399 393 0.9850
50-60 420 409 0.9738
60-70 229 218 0.9520
70-80 156 149 0.9551
80+ 102 94 0.9216

We can see that gender prediction accuracy is reasonably high across all ranges except young children. Realistically, it's unlikely we can do much about that: facial features of babies tend to be very different from those of adults. It might be worth investigating a separate model for them, but it's unlikely it would achieve very high performance either.

3.2.4. Age Prediction by Age Group¶

In [18]:
importlib.reload(metrics)
evaluation_results_base["performance_by_age_bin"]
Out[18]:
Age_Group Support Age_MAE Age_MSE Age_RMSE Age_R-squared Age_MAPE
0 0-4 444 1.588580 11.325658 3.365361 -9.241579 99.745904
1 4-14 261 4.011655 34.033093 5.833789 -3.743251 46.700869
2 14-24 636 4.171022 32.965802 5.741585 -2.937213 21.156784
3 24-30 1228 3.720786 30.006521 5.477821 -10.167695 13.674633
4 30-40 865 6.270144 63.924114 7.995256 -7.162335 17.644973
5 40-50 399 7.749943 96.742555 9.835779 -10.194667 16.942367
6 50-60 420 7.311122 91.486462 9.564856 -11.248783 13.271226
7 60-70 229 6.725516 80.393407 8.966237 -8.236708 10.369088
8 70-80 156 7.617475 105.892985 10.290432 -11.530508 10.082188
9 80+ 102 8.947648 173.258202 13.162758 -3.118748 9.777900

This table shows one of the flaws of using MAE as our target metric: it downplays inaccurate predictions for children and potentially exaggerates them as the subject's age increases.

I.e., misclassifying a newborn as a 5-year-old child (or the other way around) is a much bigger error than making the same 5-year mistake when the subject is over 70.

MAPE (Mean Absolute Percentage Error) would potentially be a better metric; however, it can be (and clearly is) problematic for very young ages (near zero), as it leads to extremely large or undefined percentages.

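A toy illustration of this trade-off: the same 5-year absolute error contributes identically to MAE but very differently to MAPE.

for true_age, pred_age in [(2, 7), (70, 75)]:
    abs_err = abs(pred_age - true_age)
    print(f"true={true_age:>2}  pred={pred_age:>2}  "
          f"MAE contribution={abs_err}  MAPE contribution={abs_err / true_age:.0%}")
# true= 2  pred= 7  MAE contribution=5  MAPE contribution=250%
# true=70  pred=75  MAE contribution=5  MAPE contribution=7%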
However, the MAE for subjects between roughly 50 and 70 (the 80+ bin mainly contains outliers which are hard to estimate correctly) is significantly higher than for subjects in their 20s. This is likely an outcome of images in the 20-30 range being highly overrepresented in the dataset.

To visualize this result, we've converted the continuous age variable into a binned multiclass target:

In [93]:
def process_age_groups(df, true_col, pred_col):
    # Sort bins numerically by their lower edge (a plain lexicographic sort would put "4-14" after "30-40")
    age_groups = sorted(df[true_col].unique(), key=lambda g: float(g.split("-")[0]))
    ranges = [
        (
            float(g.split("-")[0]),
            float("inf") if g.endswith("inf") else float(g.split("-")[1]),
        )
        for g in age_groups
    ]

    df["true_group_index"] = (
        pd.Categorical(df[true_col], categories=age_groups).codes
    ).astype(int)
    df["pred_group_index"] = pd.cut(
        df[pred_col].map(lambda x: max(x, 0.01)),
        bins=[r[0] for r in ranges] + [float("inf")],
        labels=False,
    ).astype(int)

    return df, age_groups


df, class_labels = process_age_groups(merged_data_base, "age_group", "age_pred")

importlib.reload(error_analysis)
error_analysis.confusion_matrix_plot_v2(
    df,
    "true_group_index",
    "pred_group_index",
    class_labels=class_labels,
    title="Accuracy of Binned Age Prediction",
)
Out[93]:
<Axes: title={'center': 'Accuracy of Binned Age Prediction'}, xlabel='Predicted label', ylabel='True label'>
In [23]:
merged_data_base["brisque_score"].describe()

DEFAULT_BRISQUE_BINS = [-np.inf, 25, 33, 41, np.inf]

3.2.5. Age/Gender Accuracy Relative to Image Quality¶

We'll evaluate our model's performance across images of different quality (using BRISQUE), similarly to how we did for luminance:

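As an aside, here is one way the BRISQUE scores could be computed, using the piq library (an assumption for illustration; the scores in image_entropy_summary.csv may come from a different implementation, and `path` is a placeholder):

import piq
from PIL import Image
from torchvision.transforms.functional import to_tensor

img = to_tensor(Image.open(path).convert("RGB")).unsqueeze(0)  # 1xCxHxW in [0, 1]
score = piq.brisque(img, data_range=1.0)  # lower scores indicate better perceived quality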
In [147]:
importlib.reload(metrics)
metrics.display_binned_samples(
    merged_data_base, column_to_bin="brisque_score", bins=metrics.DEFAULT_BRISQUE_BINS
)
[Figure: sample images for each BRISQUE-score bin]
In [148]:
t["gender"]["brisque_score_binned"]
Out[148]:
sample_size mean gender (std) mean age (std) accuracy F1 log_loss
bin
-inf-25 1125 0.51(0.50) 37.86(24.56) 0.938 0.940 0.158
25-35 1457 0.48(0.50) 33.14(18.53) 0.933 0.930 0.170
35-45 1201 0.49(0.50) 30.78(17.09) 0.930 0.929 0.177
45-55 597 0.50(0.50) 31.08(17.19) 0.926 0.928 0.190
55+ 235 0.54(0.50) 30.61(16.99) 0.898 0.905 0.255
In [149]:
t["age"]["brisque_score_binned"]
Out[149]:
sample_size MAE RMSE
bin
-inf-25 1125 5.324 7.454
25-35 1457 4.876 6.761
35-45 1201 4.874 7.078
45-55 597 5.524 8.368
55+ 235 5.899 9.296

Interestingly, while our model does struggle to predict gender when image quality is low, the pattern isn't as clear-cut for age.

However, if we exclude the first bin (which has a significantly higher average age), we can see a similar quality-related trend for age as well.

3.2.6. More Detailed Age Prediction Plots¶
In [26]:
importlib.reload(error_analysis)
warnings.filterwarnings("ignore")

error_analysis.evaluate_age_prediction(
    merged_data_base["true_age"],
    merged_data_base["age_pred"],
    bins=metrics.DEFAULT_AGE_BINS,
)
[Figure: age prediction error plots for the v1 model]

Some additional visualizations of the model's performance when predicting age. Main points to note:

  • MAE might not be the ideal primary metric because it significantly overestimates the model's performance for lower age groups (i.e. telling apart a newborn and a 2-year-old is much easier than telling apart a 65- and a 70-year-old).

  • The model is slightly biased toward underpredicting age (mean real age = 33.31 vs. predicted = 32.15), as the distribution of errors is slightly shifted to the left.

3.2.7. Analysing Individual Predictions with LIME¶
In [27]:
image_files = [
    "dataset/full/3_1_0_20170109193055962.jpg.chip.jpg",
    "dataset/full/15_0_0_20170104012346994.jpg.chip.jpg",
    "dataset/full/17_1_0_20170109214008165.jpg.chip.jpg",
    "dataset/full/31_1_4_20170117203039631.jpg.chip.jpg",
    "dataset/full/40_0_0_20170117151450653.jpg.chip.jpg",
    "dataset/full/50_0_0_20170111181750459.jpg.chip.jpg",
    "dataset/full/79_0_0_20170111222432817.jpg.chip.jpg",
    "dataset/full/110_0_0_20170112213500903.jpg.chip.jpg",
]

test_set = error_analysis.process_images(base_model, image_files)

We've selected some images semi-randomly across the entire age range; the plots show which parts of the image the model is estimated to rely on when deciding the predicted age/gender.

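For context, the grids below are produced by error_analysis.process_images / display_grid; a stripped-down version of what happens per image looks roughly like this (the preprocessing and the two-headed model output are assumptions based on this project's setup):

import numpy as np
import torch
from lime import lime_image
from skimage.segmentation import mark_boundaries
from torchvision import transforms

# Assumed test-time preprocessing; match whatever data_loader actually applies.
preprocess = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

def gender_probs(images: np.ndarray) -> np.ndarray:
    """LIME classifier_fn: batch of HxWx3 uint8 images -> (N, 2) class probabilities."""
    batch = torch.stack([preprocess(img) for img in images])
    with torch.no_grad():
        gender_logits, _age = base_model(batch)  # assumed two-headed forward pass
    p = torch.sigmoid(gender_logits).squeeze(1)
    return torch.stack([1 - p, p], dim=1).numpy()

explainer = lime_image.LimeImageExplainer()
explanation = explainer.explain_instance(
    np.array(Image.open(image_files[0]).convert("RGB")),
    gender_probs,
    top_labels=1,
    num_samples=500,  # matches the 0/500 progress bars LIME prints
)
img, mask = explanation.get_image_and_mask(
    explanation.top_labels[0], positive_only=True, num_features=5
)
plt.imshow(mark_boundaries(img / 255.0, mask))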
In [28]:
importlib.reload(error_analysis)
error_analysis.display_grid(test_set, scale=0.35)
Figure size: 840x2240 px

In a way this highlights some inherent flaws of using photographs to estimate age: it depends heavily on the subjects themselves, i.e. we can generally expect that some individuals, such as actors and other celebrities, will appear younger than random individuals of the same age.

In [29]:
importlib.reload(error_analysis)
misclassified_files = error_analysis.get_misclassified_from_predictions(
    predictions_base, data_module_base, test_config, n=8
)
In [153]:
results_combined = error_analysis.process_images(
    base_model, misclassified_files.combined[:5]
)
results_age = error_analysis.process_images(
    base_model,
    [p for p in misclassified_files.age if p not in misclassified_files.combined],
)
results_gender = error_analysis.process_images(
    base_model,
    [p for p in misclassified_files.gender if p not in misclassified_files.combined],
)
Most Misclassified Images (both gender/age)¶

We also looked at the samples the model struggled with the most (i.e. those where the difference between the real and estimated age, and between the predicted gender probability and the actual gender label, was the largest):

In [154]:
importlib.reload(error_analysis)
error_analysis.display_grid(results_combined)
Figure size: 840x1400 px
In [155]:
error_analysis.display_grid(results_age)
Figure size: 840x1400 px

Some of the results are probably unavoidable due to flaws in the samples themselves (i.e. low-quality images, or drawings/paintings rather than actual photographs). One way to tackle this would be to use additional models/heuristics to simply exclude such samples from the train/test sets.

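A sketch of one such heuristic (illustrative only, not part of our pipeline): drop samples where an off-the-shelf face detector finds no face.

import cv2

# OpenCV's bundled Haar cascade: crude, but enough to flag drawings and heavy crops.
detector = cv2.CascadeClassifier(
    cv2.data.haarcascades + "haarcascade_frontalface_default.xml"
)

def looks_like_a_photo_of_a_face(path: str) -> bool:
    gray = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
    faces = detector.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5)
    return len(faces) > 0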
Misclassified Gender¶

Looking at gender specifically, it's actually likely that our model performs better than the summarized results imply.

We can see that all except one are likely cases of data being mislabeled in the original dataset (or labeled accurately based on those individuals' self-identification).

In [156]:
error_analysis.display_grid(results_gender)
Figure size: 840x1960 px

We can see three main issues:

  1. Some images are poor quality or strongly cropped. It's possible to address this by using heuristics in preprocessing to exclude these samples from the train and test sets.

  2. We can see certain patterns related to race and age. The model has issues classifying faces of people who are non-white, possibly due to different facial features or skin color (although the grayscale transform should partially mitigate the latter). It also struggles with very old people and with children/babies, possibly because of small sample sizes and the relatively more "androgynous" facial features in those groups. We'll attempt to fix this using augmentation in combination with oversampling (i.e. we'll use transforms to create additional samples for underrepresented age bins; additionally, we'll use some of the color analysis from the EDA to also oversample images of underrepresented skin colors).

  3. Many samples are potentially mislabeled. It's possible that some samples are of people who self-identify as male/female while retaining facial features, hairstyles, etc. more typical of the opposite gender. Or they are simply mislabeled. In either case, this part would be the hardest to solve.

3.3 Building the Improved v2 Model¶

Summary of issues with the initial model:¶

  • Some samples appear to be mislabeled and/or don't conform with general facial attributes expected for certain genders or ages.

    • A less biased and more ethically considerate approach would be to not treat gender as a binary target but rather a continuous scale between "feminine" and "masculine" facial features. However, we can't achieve this while using a binary-labeled dataset.
    • Additional heuristics and/or models can be used to exclude "invalid" images (i.e., those that don't contain faces, are drawings instead of photographs, etc.).
  • The model is biased towards younger age groups.

  • There is some variance between different types of images (e.g., luminance/skin color, certain combinations of image quality/luminance and gender, etc.).

    • This is likely due to the imbalance in the training dataset, which should be addressed by using a weighted loss function, oversampling, other techniques, or combinations thereof (a sketch of one such weighted loss follows this list).

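A sketch of the weighted-loss alternative mentioned above (illustrative only; the decade bin counts are taken from the "Initial" column shown later in this notebook):

import torch

bin_edges = torch.tensor([10., 20, 30, 40, 50, 60, 70, 80])        # inner decade boundaries
bin_counts = torch.tensor([2452., 1268, 5816, 3586, 1837, 1845, 1068, 543, 541])
bin_weights = bin_counts.sum() / (len(bin_counts) * bin_counts)    # inverse frequency

def weighted_age_loss(age_pred: torch.Tensor, age_true: torch.Tensor) -> torch.Tensor:
    bins = torch.bucketize(age_true, bin_edges)  # decade bin index of each true age
    return (bin_weights[bins] * (age_pred - age_true) ** 2).mean()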
Augmentation-Based Oversampling¶

Instead of using a weighted loss function adjusted for age and gender imbalance, we chose augmentation-based oversampling: we used intensive transformations (as discussed in the preprocessing notebook) to generate additional samples for underrepresented age groups (a sketch of this offline pass follows the lists below). We made this decision for the following reasons:

  • Data diversity: Augmentation creates varied samples for underrepresented ages, improving model generalization across the age spectrum, unlike weighted loss which may overemphasize rare samples without increasing variety.
  • Age continuity: This method preserves the continuous nature of age, maintaining a natural distribution, whereas weighted loss might implicitly discretize age groups.
  • Flexibility and insights: Allows age-specific adjustments and provides insights into effective augmentations, informing future data strategies.
  • Versatility: Can be applied offline, reducing training complexity, and the augmented dataset is usable with various architectures, aiding transfer learning.
Potential issues:¶
  • Artificial data introduction: May create unrealistic samples that don't represent real-world data accurately.
  • Computational overhead: Generating and storing augmented samples can be resource-intensive.
  • Overfitting risk: Excessive augmentation might lead to overfitting on artificial patterns.
  • Tuning complexity: Finding the right balance of augmentations and oversampling ratios is quite challenging.
  • Uneven effectiveness: May not work equally well for all types of imbalances or data characteristics.
  • Augmentation bias: Certain augmentations may inadvertently introduce or amplify biases.
  • Validation challenges: Difficulties in assessing the quality and realism of augmented samples.
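A minimal sketch of the offline oversampling pass, with hypothetical multipliers and transforms (the real transform stack is described in the preprocessing notebook, and the multipliers were tuned per bin rather than hand-picked):

import random
from pathlib import Path
from PIL import Image
from torchvision import transforms

heavy_aug = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(15),
    transforms.ColorJitter(brightness=0.3, contrast=0.3, saturation=0.2),
    transforms.RandomResizedCrop(200, scale=(0.8, 1.0)),  # UTKFace chips are 200x200
])

def oversample_bin(files: list, multiplier: float, out_dir: Path) -> None:
    """Write enough augmented copies to grow an age bin by `multiplier`."""
    n_extra = int(len(files) * (multiplier - 1.0))
    for i in range(n_extra):
        src = random.choice(files)
        aug = heavy_aug(Image.open(src).convert("RGB"))
        aug.save(out_dir / f"aug_{i}_{Path(src).name}")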
Original vs Augmented Training Samples¶
In [56]:
sampling_data = {
    "Age_Group": [
        "0-9",
        "10-19",
        "20-29",
        "30-39",
        "40-49",
        "50-59",
        "60-69",
        "70-79",
        "80-89",
    ],
    "Initial": [2452, 1268, 5816, 3586, 1837, 1845, 1068, 543, 541],
    "After Augmentation": [4042, 3213, 6397, 4836, 3611, 3617, 3073, 2705, 2704],
}

sampling_data_df = pd.DataFrame(sampling_data)
sampling_data_df["Sample_Size_Change"] = (
    (sampling_data_df["After Augmentation"] - sampling_data_df["Initial"])
    / sampling_data_df["Initial"]
    * 100
)

fig, ax = plt.subplots(figsize=(12, 6))

x = range(len(sampling_data_df["Age_Group"]))
width = 0.35

ax.bar(
    [i - width / 2 for i in x],
    sampling_data_df["Initial"],
    width,
    label="Initial",
    color="blue",
    alpha=0.7,
)
ax.bar(
    [i + width / 2 for i in x],
    sampling_data_df["After Augmentation"],
    width,
    label="After Augmentation",
    color="red",
    alpha=0.7,
)

ax.set_ylabel("Count")
ax.set_title("Age Distribution: Initial vs After Augmentation")
ax.set_xticks(x)
ax.set_xticklabels(sampling_data_df["Age_Group"], rotation=45)
ax.legend()

plt.tight_layout()
plt.show()
[Figure: bar chart of the initial vs. post-augmentation age distribution]

Instead of equalizing the size of all age bins, we experimented with various multipliers during hyperparameter tuning. We chose this approach to avoid introducing an overly high number of augmented images, minimizing the aforementioned risk of introducing artificial biases and patterns that would disproportionately affect the underrepresented groups.

Comparing Both Models¶

In [189]:
combined_df = pd.DataFrame(
    {
        "v1 Overall": evaluation_results_base["gender_metrics"]["Overall"],
        "v2 Overall": evaluation_results_improved["gender_metrics"]["Overall"],
    }
)

combined_df["Improvement"] = combined_df["v2 Overall"] - combined_df["v1 Overall"]
combined_df["% Improvement"] = (
    combined_df["Improvement"] / combined_df["v1 Overall"]
) * 100
combined_df["% Improvement"] = combined_df["% Improvement"].apply(lambda x: f"{x:.2f}%")

combined_df.rename(columns={"index": "Metric"}, inplace=True)
combined_df
Out[189]:
v1 Overall v2 Overall Improvement % Improvement
Support 4740.000000 4740.000000 0.000000 0.00%
Accuracy 0.931013 0.938608 0.007595 0.82%
Precision 0.931065 0.938621 0.007556 0.81%
Recall 0.931062 0.938591 0.007529 0.81%
F1-score 0.931013 0.938603 0.007590 0.82%
AUC-ROC 0.980522 0.981255 0.000733 0.07%
PR-AUC 0.977997 0.976712 -0.001285 -0.13%
Log Loss 0.178862 0.192300 0.013438 7.51%
In [188]:
combined_df = pd.DataFrame(
    {
        "v1": evaluation_results_base["age_metrics"]["Value"],
        "v2": evaluation_results_improved["age_metrics"]["Value"],
    }
)

combined_df["Improvement"] = combined_df["v2"] - combined_df["v1"]
combined_df["% Improvement"] = (combined_df["Improvement"] / combined_df["v1"]) * 100
combined_df["% Improvement"] = combined_df["% Improvement"].apply(lambda x: f"{x:.2f}%")

combined_df.rename(columns={"index": "Metric"}, inplace=True)
combined_df
Out[188]:
v1 v2 Improvement % Improvement
MAE 5.105901 4.730945 -0.374956 -7.34%
MSE 54.144762 48.337331 -5.807431 -10.73%
RMSE 7.358312 6.952505 -0.405807 -5.51%
R-squared 0.862191 0.876972 0.014781 1.71%
MAPE 25.161557 20.222332 -4.939225 -19.63%
In [74]:
evaluation_results_improved["age_metrics"]
Out[74]:
Value
MAE 4.730945
MSE 48.337331
RMSE 6.952505
R-squared 0.876972
MAPE 20.222332
In [75]:
evaluation_results_improved["age_statistics"]
Out[75]:
True Age Predicted Age
Mean 33.308439 33.636082
Median 29.000000 29.757675
Min 1.000000 -0.380083
Max 116.000000 96.447899

Note that the bias toward lower ages which was noticeable in the original model has almost disappeared.

In [80]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 8))

error_analysis.confusion_matrix_plot_v2(
    merged_data_base,
    "true_gender",
    "gender_pred",
    title="Gender Prediction (v1/Base Model)",
    class_labels=["Male", "Female"],
    ax=ax1,
)

error_analysis.confusion_matrix_plot_v2(
    merged_data_improved,
    "true_gender",
    "gender_pred",
    title="Gender Prediction (v2/Improved Model)",
    class_labels=["Male", "Female"],
    ax=ax2,
)

plt.tight_layout()
plt.show()
[Figure: side-by-side gender confusion matrices, v1 vs. v2]
In [177]:
evaluation_results_improved["gender_metrics"]
Out[177]:
Female Male Overall
Support 2353.000000 2387.000000 4740.000000
Accuracy 0.938608 0.938608 0.938608
Precision 0.939846 0.937396 0.938621
Recall 0.936252 0.940930 0.938591
F1-score 0.938046 0.939160 0.938603
AUC-ROC NaN NaN 0.981255
PR-AUC NaN NaN 0.976712
Log Loss NaN NaN 0.192300
Gender Accuracy By Age Group:¶
In [46]:
result_gender = pd.merge(
    evaluation_results_base["gender_accuracy_by_age"]["Accuracy"],
    evaluation_results_improved["gender_accuracy_by_age"]["Accuracy"],
    left_index=True,
    right_index=True,
    suffixes=("_v1", "_v2"),
)

result_gender["Improvement"] = (
    (result_gender["Accuracy_v2"] - result_gender["Accuracy_v1"])
    / result_gender["Accuracy_v1"]
    * 100
)
result_gender["Improvement"] = result_gender["Improvement"].map("{:+.2f}%".format)
result_gender
Out[46]:
Accuracy_v1 Accuracy_v2 Improvement
Age_Group
0-4 0.6914 0.6892 -0.32%
4-14 0.8238 0.8467 +2.78%
14-24 0.9497 0.9670 +1.82%
24-30 0.9666 0.9674 +0.08%
30-40 0.9676 0.9780 +1.07%
40-50 0.9850 0.9875 +0.25%
50-60 0.9738 0.9786 +0.49%
60-70 0.9520 0.9738 +2.29%
70-80 0.9551 0.9551 +0.00%
80+ 0.9216 0.9412 +2.13%
Age Prediction Improvements By Age Group:¶
In [58]:
result_age = pd.merge(
    evaluation_results_base["performance_by_age_bin"][["Age_Group", "Age_MAE"]],
    evaluation_results_improved["performance_by_age_bin"][["Age_Group", "Age_MAE"]],
    on="Age_Group",
    suffixes=("_v1", "_v2"),
)

result_age["Improvement"] = (
    (result_age["Age_MAE_v1"] - result_age["Age_MAE_v2"])
    / result_age["Age_MAE_v1"]
    * 100
)
result_age["Improvement"] = result_age["Improvement"].map("{:+.2f}%".format)

result_age = result_age[["Age_Group", "Age_MAE_v1", "Age_MAE_v2", "Improvement"]]
result_age
Out[58]:
Age_Group Age_MAE_v1 Age_MAE_v2 Improvement
0 0-4 1.588580 1.014360 +36.15%
1 4-14 4.011655 3.195415 +20.35%
2 14-24 4.171022 3.587664 +13.99%
3 24-30 3.720786 4.186014 -12.50%
4 30-40 6.270144 6.002176 +4.27%
5 40-50 7.749943 6.352205 +18.04%
6 50-60 7.311122 6.273703 +14.19%
7 60-70 6.725516 6.505069 +3.28%
8 70-80 7.617475 6.595112 +13.42%
9 80+ 8.947648 8.218197 +8.15%
In [87]:
df_base, class_labels = process_age_groups(merged_data_base, "age_group", "age_pred")
df_improved, class_labels = process_age_groups(
    merged_data_improved, "age_group", "age_pred"
)

importlib.reload(error_analysis)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 8))

error_analysis.confusion_matrix_plot_v3(
    df_base,
    "true_group_index",
    "pred_group_index",
    class_labels=class_labels,
    title="Age Prediction (v1)",
    ax=ax1,
    simplified=True,
)
error_analysis.confusion_matrix_plot_v3(
    df_improved,
    "true_group_index",
    "pred_group_index",
    class_labels=class_labels,
    title="Accuracy of Binned Age Prediction (v1/improved)",
    ax=ax2,
    simplified=True,
)
plt.tight_layout()
plt.show()
[Figure: side-by-side binned age confusion matrices, v1 vs. v2]
By Luminance Bin¶
In [68]:
result_gender_lum = pd.merge(
    lum_bins_base["gender"]["luminance_binned"]["accuracy"],
    lum_bins_improved["gender"]["luminance_binned"]["accuracy"],
    left_index=True,
    right_index=True,
    suffixes=("_v1", "_v2"),
)

result_gender_lum["Improvement"] = (
    (result_gender_lum["accuracy_v2"] - result_gender_lum["accuracy_v1"])
    / result_gender_lum["accuracy_v1"]
    * 100
)
result_gender_lum["Improvement"] = result_gender_lum["Improvement"].map(
    "{:+.2f}%".format
)

result_gender_lum = result_gender_lum.reset_index()
result_gender_lum.columns = [
    "Luminance_Bin",
    "Accuracy_v1",
    "Accuracy_v2",
    "Improvement",
]
result_gender_lum
Out[68]:
Luminance_Bin Accuracy_v1 Accuracy_v2 Improvement
0 0-85 0.951 0.961 +1.05%
1 85-105 0.934 0.948 +1.50%
2 105-120 0.937 0.944 +0.75%
3 120-135 0.959 0.959 +0.00%
4 135-150 0.911 0.938 +2.96%
5 150+ 0.904 0.899 -0.55%
In [70]:
result_age_lum = pd.merge(
    lum_bins_base["age"]["luminance_binned"]["MAE"],
    lum_bins_improved["age"]["luminance_binned"]["MAE"],
    left_index=True,
    right_index=True,
    suffixes=("_v1", "_v2"),
)

result_age_lum["Improvement"] = (
    (result_age_lum["MAE_v1"] - result_age_lum["MAE_v2"])
    / result_age_lum["MAE_v1"]
    * 100
)
result_age_lum["Improvement"] = result_age_lum["Improvement"].map("{:+.2f}%".format)

result_age_lum = result_age_lum.reset_index()
result_age_lum.columns = ["Luminance_Bin", "MAE_v1", "MAE_v2", "Improvement"]
result_age_lum
Out[70]:
Luminance_Bin MAE_v1 MAE_v2 Improvement
0 0-85 5.408 5.231 +3.27%
1 85-105 5.407 5.169 +4.40%
2 105-120 4.997 4.818 +3.58%
3 120-135 5.616 4.899 +12.77%
4 135-150 5.037 4.694 +6.81%
5 150+ 4.459 4.002 +10.25%

By Image Quality¶

In [71]:
result_gender_lum = pd.merge(
    lum_bins_base["gender"]["brisque_score_binned"]["accuracy"],
    lum_bins_improved["gender"]["brisque_score_binned"]["accuracy"],
    left_index=True,
    right_index=True,
    suffixes=("_v1", "_v2"),
)

result_gender_lum["Improvement"] = (
    (result_gender_lum["accuracy_v2"] - result_gender_lum["accuracy_v1"])
    / result_gender_lum["accuracy_v1"]
    * 100
)
result_gender_lum["Improvement"] = result_gender_lum["Improvement"].map(
    "{:+.2f}%".format
)

result_gender_lum = result_gender_lum.reset_index()
result_gender_lum.columns = ["BRISQUE_Bin", "Accuracy_v1", "Accuracy_v2", "Improvement"]
result_gender_lum
Out[71]:
BRISQUE_Bin Accuracy_v1 Accuracy_v2 Improvement
0 -inf-25 0.938 0.943 +0.53%
1 25-35 0.933 0.942 +0.96%
2 35-45 0.930 0.945 +1.61%
3 45-55 0.926 0.926 +0.00%
4 55+ 0.898 0.902 +0.45%
In [72]:
result_age_lum = pd.merge(
    lum_bins_base["age"]["brisque_score_binned"]["MAE"],
    lum_bins_improved["age"]["brisque_score_binned"]["MAE"],
    left_index=True,
    right_index=True,
    suffixes=("_v1", "_v2"),
)

result_age_lum["Improvement"] = (
    (result_age_lum["MAE_v1"] - result_age_lum["MAE_v2"])
    / result_age_lum["MAE_v1"]
    * 100
)
result_age_lum["Improvement"] = result_age_lum["Improvement"].map("{:+.2f}%".format)

result_age_lum = result_age_lum.reset_index()
result_age_lum.columns = ["BRISQUE_Bin", "MAE_v1", "MAE_v2", "Improvement"]
result_age_lum
Out[72]:
BRISQUE_Bin MAE_v1 MAE_v2 Improvement
0 -inf-25 5.324 4.673 +12.23%
1 25-35 4.876 4.415 +9.45%
2 35-45 4.874 4.608 +5.46%
3 45-55 5.524 5.432 +1.67%
4 55+ 5.899 6.080 -3.07%

3.3.2. Individual Sample Analysis:¶

Let's look at the samples that were misclassified by the initial model but are now correct in the new model:

In [ ]:
base_data_wrong_pred_df_good_on_improved = merged_data_base[
    ((merged_data_base["gender_pred"] > 0.5) & (merged_data_base["true_gender"] == 0))
    | (
        (merged_data_base["gender_pred"] <= 0.5)
        & (merged_data_base["true_gender"] == 1)
    )
]

base_data_wrong_pred_df_good_on_improved = pd.merge(
    base_data_wrong_pred_df_good_on_improved,
    merged_data_improved[["image_path", "true_gender", "gender_pred"]],
    on="image_path",
    how="left",
)

base_data_wrong_pred_df_good_on_improved = base_data_wrong_pred_df_good_on_improved[
    (
        (
            (base_data_wrong_pred_df_good_on_improved["true_gender_x"] == 0)
            & (base_data_wrong_pred_df_good_on_improved["gender_pred_x"] >= 0.5)
        )
        | (
            (base_data_wrong_pred_df_good_on_improved["true_gender_x"] == 1)
            & (base_data_wrong_pred_df_good_on_improved["gender_pred_x"] < 0.5)
        )
    )
    & (
        (
            (base_data_wrong_pred_df_good_on_improved["true_gender_y"] == 0)
            & (base_data_wrong_pred_df_good_on_improved["gender_pred_y"] < 0.5)
        )
        | (
            (base_data_wrong_pred_df_good_on_improved["true_gender_y"] == 1)
            & (base_data_wrong_pred_df_good_on_improved["gender_pred_y"] >= 0.5)
        )
    )
]
# Calculate error magnitude
base_data_wrong_pred_df_good_on_improved["base_error"] = abs(
    base_data_wrong_pred_df_good_on_improved["gender_pred_x"]
    - base_data_wrong_pred_df_good_on_improved["true_gender_x"]
)

N = 5
top_N_wrong = base_data_wrong_pred_df_good_on_improved.sort_values(
    "base_error", ascending=False
).head(N)
improved_image_files = top_N_wrong["image_path"]

merged_data_base["age_error"] = abs(
    merged_data_base["age_pred"] - merged_data_base["true_age"]
)
merged_data_improved["age_error"] = abs(
    merged_data_improved["age_pred"] - merged_data_improved["true_age"]
)

age_comparison = pd.merge(
    merged_data_base[["image_path", "true_age", "age_pred", "age_error"]],
    merged_data_improved[["image_path", "age_pred", "age_error"]],
    on="image_path",
    suffixes=("_base", "_improved"),
)

age_comparison["error_reduction"] = (
    age_comparison["age_error_base"] - age_comparison["age_error_improved"]
)

N = 5
top_N_age_improved = age_comparison.sort_values(
    "error_reduction", ascending=False
).head(N)
improved_age_image_files = top_N_age_improved["image_path"]

We've specifically selected samples on which the new version of the model improved significantly:

In [165]:
results_gender_most_improved = [
    error_analysis.process_image_for_models(
        f"dataset/full/{img_file}", [base_model, improved_model]
    )
    for img_file in top_N_wrong["image_path"]
]
In [166]:
importlib.reload(error_analysis)
error_analysis.display_grid_comparison(
    results_gender_most_improved,
    ["Base Model", "Improved Model"],
    comparison_type="gender",
)
Figure size: 840x1400 px

Except for the second sample (which was presumably mislabeled in the dataset itself), the improvements look quite good.

In [168]:
results_age_most_improved = [
    error_analysis.process_image_for_models(
        f"dataset/full/{img_file}", [base_model, improved_model]
    )
    for img_file in top_N_age_improved["image_path"]
]
In [170]:
importlib.reload(error_analysis)
error_analysis.display_grid_comparison(
    results_age_most_improved,
    ["Base Model", "Improved Model"],
    comparison_type="age",
)
Figure size: 840x1400 px
Misclassified Age¶

In [158]:
base_worst_images_age = [
    "dataset/test_2_folds_last/111_1_0_20170120134646399.jpg.chip.jpg",
    "dataset/test_2_folds_last/9_0_0_20170110225030430.jpg.chip.jpg",
    "dataset/test_2_folds_last/41_1_1_20170117021604893.jpg.chip.jpg",
    "dataset/test_2_folds_last/8_0_1_20170114025855492.jpg.chip.jpg",
    "dataset/test_2_folds_last/80_1_0_20170110131953974.jpg.chip.jpg",
    "dataset/test_2_folds_last/15_0_0_20170116201332456.jpg.chip.jpg",
]

base_worst_images_gender = [
    "dataset/test_2_folds_last/26_1_1_20170116154712959.jpg.chip.jpg",
    "dataset/test_2_folds_last/111_1_0_20170120134646399.jpg.chip.jpg",
    "dataset/test_2_folds_last/9_0_0_20170110225030430.jpg.chip.jpg",
    "dataset/test_2_folds_last/8_0_1_20170114025855492.jpg.chip.jpg",
]

results_gender_worst_base = [
    error_analysis.process_image_for_models(f"{img_file}", [base_model, improved_model])
    for img_file in base_worst_images_gender
]

results_age_worst_base = [
    error_analysis.process_image_for_models(f"{img_file}", [base_model, improved_model])
    for img_file in base_worst_images_age
]

Of course, we have specifically selected the best-case examples (i.e. where the model's performance improved the most), which probably paints a far too optimistic picture of the overall improvement (relative to the change in accuracy/MAE, which isn't as dramatic).

Instead, we've selected some of the samples our initial model failed on that were unlikely to be mislabeled:

In [159]:
importlib.reload(error_analysis)
error_analysis.display_grid_comparison(
    results_gender_worst_base,
    ["Base Model", "Improved Model"],
    comparison_type="gender",
)
Figure size: 840x1120 px
In [160]:
importlib.reload(error_analysis)
error_analysis.display_grid_comparison(
    results_age_worst_base, ["Base Model", "Improved Model"], comparison_type="age"
)
Figure size: 840x1680 px
In [73]:
importlib.reload(error_analysis)
error_analysis.evaluate_age_prediction(
    merged_data_improved["true_age"],
    merged_data_improved["age_pred"],
    bins=metrics.DEFAULT_AGE_BINS,
)
[Figure: age prediction error plots for the v2/improved model]