import logging
import os
import sys
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
from fgvc.core.training import predict
from fgvc.datasets import get_dataloaders
from fgvc.special.threshold_analysis import (
class_wise_confidence_threshold_report,
estimate_optimal_confidence_thresholds,
)
from fgvc.utils.experiment import load_args, load_model, load_test_metadata
from fgvc.utils.utils import set_cuda_device
from fgvc.utils.wandb import log_summary_scores, resume_wandb, wandb
logger = logging.getLogger("script")
def add_arguments(parser):
"""Callback function that includes metadata args."""
parser.add_argument(
"--test-metadata",
help="Path to a test metadata file.",
type=str,
required=True,
)
parser.add_argument(
"--ignore-unfinished",
help="Do not check if the W&B run is finished and run evaluation either way.",
action="store_true",
)
parser.add_argument(
"--rerun",
help="Re-runs evaluation on test set even if the run already has test scores.",
action="store_true",
)
parser.add_argument(
"--label-col",
help="Name of column with target classes (labels) in the test set.",
type=str,
default=None,
)
def classification_report_df(
test_df: pd.DataFrame, preds: np.ndarray, targs: np.ndarray, label_col: str = None
) -> pd.DataFrame:
"""Create classification report with class-wise Precision, Recall, and F1 score metrics.
The method is based on the `sklearn.metrics.classification_report` method.
"""
report_df = pd.DataFrame.from_dict(
classification_report(
targs,
preds.argmax(1),
labels=np.arange(preds.shape[1]),
zero_division=0,
output_dict=True,
),
orient="index",
)
report_df.index.name = "class_id"
report_df = report_df.reset_index()
report_df = report_df.rename(
columns={
"precision": "Precision",
"recall": "Recall",
"f1-score": "F1",
"support": "Num Records",
}
)
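    # optionally map class ids to human-readable labels taken from the test metadata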
if label_col is not None:
id2label = dict(zip(test_df["class_id"].astype(str), test_df[label_col]))
report_df.insert(1, label_col, report_df["class_id"].apply(id2label.get))
return report_df
def threshold_analysis_report_df(
    test_df: pd.DataFrame, preds: np.ndarray, targs: np.ndarray, label_col: str = None
) -> pd.DataFrame:
    """Create a threshold analysis report with class-wise confidence thresholds and metrics."""
confidence_thresholds = estimate_optimal_confidence_thresholds(preds, targs)
report_df = class_wise_confidence_threshold_report(preds, targs, confidence_thresholds)
report_df.index.name = "class_id"
report_df = report_df.reset_index()
if label_col is not None:
id2label = dict(zip(test_df["class_id"].astype(str), test_df[label_col]))
report_df.insert(1, label_col, report_df["class_id"].apply(id2label.get))
return report_df
def test_clf(
*,
test_metadata: str = None,
wandb_run_path: str = None,
cuda_devices: str = None,
ignore_unfinished: bool = False,
rerun: bool = False,
label_col: str = None,
**kwargs,
):
"""Test model on the classification task and log test scores as a run summary in W&B."""
if wandb is None:
raise ImportError("Package wandb is not installed.")
if test_metadata is None or wandb_run_path is None:
# load script args
args = load_args(add_arguments_fn=add_arguments, test_args=True)
test_metadata = args.test_metadata
wandb_run_path = args.wandb_run_path
cuda_devices = args.cuda_devices
ignore_unfinished = args.ignore_unfinished
rerun = args.rerun
label_col = args.label_col
# set device
device = set_cuda_device(cuda_devices)
# load metadata
test_df = load_test_metadata(test_metadata)
if label_col is not None:
assert label_col in test_df, f"Test dataframe is missing column '{label_col}'."
# connect to wandb and load run
logger.info(f"Loading W&B experiment run: {wandb_run_path}")
api = wandb.Api()
run = api.run(wandb_run_path)
config = run.config
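    # the run counts as finished only when all configured epochs were logged
    # and W&B reports the "finished" state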
run_is_finished = len(run.history()) >= config["epochs"] and run.state == "finished"
if not run_is_finished and not ignore_unfinished:
logger.warning(f"Run '{run.name}' is not finished yet. Exiting.")
sys.exit(0)
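    # avoid overwriting previously logged test scores unless --rerun is passed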
has_test_scores = "Test. Accuracy" in run.summary
if has_test_scores and not rerun:
logger.warning(f"Run '{run.name}' already has test scores. Exiting.")
sys.exit(0)
# load model
model_filename = os.path.join(run.config["exp_path"], "best_loss.pth")
logger.info(f"Loading fine-tuned model. Using model checkpoint from the file: {model_filename}")
model, model_mean, model_std = load_model(config, checkpoint_path=model_filename)
# create dataloaders
logger.info("Creating DataLoaders.")
_, testloader, _, _ = get_dataloaders(
None,
test_df,
augmentations=config["augmentations"],
image_size=config["image_size"],
model_mean=model_mean,
model_std=model_std,
batch_size=config["batch_size"],
num_workers=config["workers"],
)
# run inference
logger.info("Evaluating the model.")
preds, targs, _, scores = predict(model, testloader, device=device)
# log scores
scores_str = "\t".join([f"{k}: {v:.2%}" for k, v in scores.items()])
logger.info(f"Scores - {scores_str}")
logger.info("Logging scores to wandb.")
log_summary_scores(wandb_run_path, test_scores=scores, allow_new=True, prefix="Test. ")
# resume W&B run and log classification report to W&B
resume_wandb(run_id=run.id, entity=run.entity, project=run.project)
clf_report_df = classification_report_df(test_df, preds, targs, label_col=label_col)
    th_report_df = threshold_analysis_report_df(test_df, preds, targs, label_col=label_col)
wandb.log(
{
"clf_report_table": wandb.Table(dataframe=clf_report_df),
"th_report_table": wandb.Table(dataframe=th_report_df),
}
)
# store predictions and targets in the experiment dir
eval_path = os.path.join(run.config["exp_path"], "evaluation")
os.makedirs(eval_path, exist_ok=True)
preds_filepath = os.path.join(eval_path, "predictions.npy")
logger.info(f"Storing predictions to: {preds_filepath}")
np.save(
preds_filepath,
{
"metadata_file": test_metadata,
"wandb_run_path": wandb_run_path,
"preds": preds,
"targs": targs,
},
)
if __name__ == "__main__":
test_clf()
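

# A minimal sketch (not part of the script) of reading the stored predictions back for
# offline analysis. The path is a hypothetical example of an experiment directory.
#
#   data = np.load("runs/example_exp/evaluation/predictions.npy", allow_pickle=True).item()
#   preds, targs = data["preds"], data["targs"]
#   top1_accuracy = (preds.argmax(1) == targs).mean()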