import gradio as gr import pandas as pd import json from pathlib import Path from constants import BANNER, INTRODUCTION_TEXT, CITATION_TEXT, METRICS_TAB_TEXT, DIR_OUTPUT_REQUESTS, LEADERBOARD_CSS, EU_LANGUAGES, MULTILINGUAL_TAB_TEXT, LONGFORM_TAB_TEXT, PRIVATE_DATA_TAB_TEXT, CHANGELOG_TEXT from init import is_model_on_hub, upload_file, load_all_info_from_dataset_hub from utils_display import AutoEvalColumn, MultilingualColumn, LongformColumn, PrivateDataColumn, fields, make_clickable_model, styled_error, styled_message import numpy as np import plotly.graph_objects as go import re from datetime import datetime, timezone # Extract the most recent date from the changelog (first bold date entry) _changelog_date_match = re.search(r"\*\*(.+?)\*\*", CHANGELOG_TEXT) LAST_UPDATED = _changelog_date_match.group(1) if _changelog_date_match else "19 April 2026" # Global variable to store detailed benchmark data benchmark_details = {} expanded_languages = set() # Track which languages are expanded column_names = { "MODEL": "Model", "RTFx": "RTFx ⬆️️", "AMI WER": "AMI", "Earnings22 WER": "Earnings22", "Gigaspeech WER": "Gigaspeech", "LS Clean WER": "LS Clean", "LS Other WER": "LS Other", "SPGISpeech WER": "SPGISpeech", "Voxpopuli WER": "Voxpopuli", } always_visible = ["model", "Average WER ⬇️", "Rank Δ"] default_datasets = ["AMI", "Earnings22", "Gigaspeech", "LS Clean", "LS Other", "SPGISpeech", "Voxpopuli"] AUDIO_LM_MODELS = { "nvidia/canary-qwen-2.5b", "mistralai/Voxtral-Small-24B-2507", "mistralai/Voxtral-Mini-3B-2507", "microsoft/VibeVoice-ASR-HF", "ibm-granite/granite-speech-4.1-2b", "ibm-granite/granite-4.0-1b-speech", "ibm-granite/granite-speech-3.3-8b", "ibm-granite/granite-speech-3.3-2b", "microsoft/Phi-4-multimodal-instruct", } METADATA_COLUMNS = { "Model", "License", "Size (B)", "# Languages", "Encoder", "Decoder", } AVG_COLUMNS = { "Average WER ⬇️", "RTFx ⬆️️", "Rank Δ", } EXCLUDED_AVG_COLS = METADATA_COLUMNS | AVG_COLUMNS eval_queue_repo, requested_models, csv_results, multilingual_csv_path, longform_csv_path, appen_csv_path, dataocean_csv_path = load_all_info_from_dataset_hub() if not csv_results.exists(): raise Exception(f"CSV file {csv_results} does not exist locally") # Get csv with data and parse columns original_df = pd.read_csv(csv_results) def _compute_average_wer_from_default_datasets(df): """Compute Average WER from the default leaderboard datasets.""" df = df.copy() wer_cols = [c for c in default_datasets if c in df.columns] if wer_cols: def compute_avg(row): values = [] for col in wer_cols: value = row[col] if value == "NA" or value is None: return "NA" try: values.append(float(value)) except (TypeError, ValueError): return "NA" return round(np.mean(values), 2) if values else "NA" df["Average WER ⬇️"] = df.apply(compute_avg, axis=1) else: df["Average WER ⬇️"] = "NA" return df # Formats the columns def formatter(x, col=None): # Special rule for "# Languages" if col == "# Languages": try: if pd.isna(x) or str(x).strip() in ["", "0", "0.0", "-1", "NA"]: return 1 return int(float(x)) # safer conversion except (ValueError, TypeError): return 1 # fallback if anything unexpected # Generic NA handling if x is None or pd.isna(x) or str(x).strip() in ["", "0", "0.0", "-1"]: return "NA" # Keep strings if isinstance(x, str): return x # Numeric return round(x, 2) for col in original_df.columns: if col == "model": original_df[col] = original_df[col].apply(lambda x: x.replace(x, make_clickable_model(x))) else: original_df[col] = original_df[col].apply(lambda x: formatter(x, col)) original_df.rename(columns=column_names, inplace=True) if "Avg. WER" in original_df.columns: original_df = original_df.drop(columns=["Avg. WER"]) original_df = _compute_average_wer_from_default_datasets(original_df) original_df = original_df.sort_values(by='Average WER ⬇️', key=lambda col: pd.to_numeric(col, errors="coerce"), na_position="last") COLS = [c.name for c in fields(AutoEvalColumn)] TYPES = [c.type for c in fields(AutoEvalColumn)] # Multilingual columns (dynamic based on expansion state) MULTILINGUAL_COLS = [c.name for c in fields(MultilingualColumn)] # Longform columns LONGFORM_COLS = [c.name for c in fields(LongformColumn)] LONGFORM_TYPES = [c.type for c in fields(LongformColumn)] # Private data columns PRIVATE_DATA_COLS = [c.name for c in fields(PrivateDataColumn)] PRIVATE_DATA_TYPES = [c.type for c in fields(PrivateDataColumn)] # Dataset classification for private data APPEN_SCRIPTED = ["Scripted-US", "Scripted-AU", "Scripted-CA", "Scripted-IN"] APPEN_CONVERSATIONAL = ["Conversational-US003", "Conversational-US004", "Conversational-IN"] APPEN_US = ["Scripted-US", "Conversational-US003", "Conversational-US004"] APPEN_NON_US = ["Scripted-AU", "Scripted-CA", "Scripted-IN", "Conversational-IN"] DATAOCEAN_SCRIPTED = ["Scripted-US", "Scripted-GB"] DATAOCEAN_CONVERSATIONAL = ["Conversational-US", "Conversational-GB"] DATAOCEAN_US = ["Scripted-US", "Conversational-US"] DATAOCEAN_NON_US = ["Scripted-GB", "Conversational-GB"] def create_multilingual_dataframe(): """Create multilingual dataframe with CoVoST, MLS, and FLEURS benchmark data""" global benchmark_details, expanded_languages if multilingual_csv_path is None or not multilingual_csv_path.exists(): raise Exception("Multilingual CSV file not found") # Load CSV data multilingual_raw_df = pd.read_csv(multilingual_csv_path) # Store detailed benchmark data for click functionality benchmark_details = {} multilingual_data = [] for _, row_data in multilingual_raw_df.iterrows(): model_name = row_data['model'] model_details = {} row = {"Model": make_clickable_model(model_name)} # Process data for each language and collect all individual datapoints all_datapoints = [] # Collect all individual dataset scores across all languages for lang_code, lang_info in EU_LANGUAGES.items(): # Get individual benchmark scores from CSV, using None for missing values # Special cases: de doesn't have MLS, pt doesn't have CoVoST if lang_code == "pt": covost_score = None # pt doesn't have CoVoST data else: covost_score = row_data.get(f"{lang_code}_covost", None) if lang_code == "de": mls_score = None # de doesn't have MLS data else: mls_score = row_data.get(f"{lang_code}_mls", None) fleurs_score = row_data.get(f"{lang_code}_fleurs", None) # Convert string zeros or empty values to None for score_name, score_val in [("covost", covost_score), ("mls", mls_score), ("fleurs", fleurs_score)]: if score_val is not None and (score_val == 0.0 or score_val == "" or str(score_val).strip() == "0" or str(score_val).strip() == ""): if score_name == "covost": covost_score = None elif score_name == "mls": mls_score = None elif score_name == "fleurs": fleurs_score = None # Add individual datapoints to the global list if covost_score is not None and covost_score > 0: all_datapoints.append(covost_score) if mls_score is not None and mls_score > 0: all_datapoints.append(mls_score) if fleurs_score is not None and fleurs_score > 0: all_datapoints.append(fleurs_score) # Calculate average only from available scores for this language (for display) available_scores = [s for s in [covost_score, mls_score, fleurs_score] if s is not None and s > 0] if available_scores: avg_score = round(sum(available_scores) / len(available_scores), 2) else: avg_score = None # Store individual scores for detailed view (only store existing datasets) lang_data = {"average": avg_score if avg_score is not None else "NA"} # Only store datasets that exist for this language if lang_code != "pt" and covost_score is not None: # pt doesn't have CoVoST lang_data["CoVoST"] = covost_score if lang_code != "de" and mls_score is not None: # de doesn't have MLS lang_data["MLS"] = mls_score if fleurs_score is not None: # All languages have FLEURS lang_data["FLEURS"] = fleurs_score model_details[lang_code] = lang_data # Calculate overall multilingual average from all individual datapoints if all_datapoints: row["Average WER ⬇️"] = round(np.mean(all_datapoints), 2) else: row["Average WER ⬇️"] = 0.0 # Add RTFx from the CSV (it should be a single value per model) rtfx_value = row_data.get("rtfx", row_data.get("RTFx", 0.0)) # Convert 0 or -1 values to "NA" like in the English leaderboard if rtfx_value == 0.0 or rtfx_value == -1 or rtfx_value == 0 or rtfx_value == "0" or rtfx_value == "0.0": row["RTFx ⬆️️"] = "NA" else: row["RTFx ⬆️️"] = rtfx_value # Add language columns based on expansion state for lang_code, lang_info in EU_LANGUAGES.items(): lang_col_name = f"{lang_info['flag']} {lang_info['name']}" model_data = model_details[lang_code] if lang_code in expanded_languages: # Show average column AND detailed columns row[f"{lang_col_name} Avg"] = model_data["average"] # Only show columns for datasets that actually exist in the data if "CoVoST" in model_data: row[f"{lang_col_name} CoVoST"] = model_data["CoVoST"] if "MLS" in model_data: row[f"{lang_col_name} MLS"] = model_data["MLS"] if "FLEURS" in model_data: row[f"{lang_col_name} FLEURS"] = model_data["FLEURS"] else: # Show only average column row[lang_col_name] = model_data["average"] # Store model details for click functionality benchmark_details[model_name] = model_details multilingual_data.append(row) multilingual_df = pd.DataFrame(multilingual_data) multilingual_df = multilingual_df.sort_values(by='Average WER ⬇️') return multilingual_df def get_multilingual_datatypes(df): """Generate appropriate datatypes for multilingual dataframe columns""" datatypes = [] for col in df.columns: if col == "Model": datatypes.append("markdown") # This allows HTML rendering else: datatypes.append("number") return datatypes def get_language_details(model, language_code): """Get detailed breakdown for a specific model and language""" global benchmark_details if model not in benchmark_details or language_code not in benchmark_details[model]: return None language_info = EU_LANGUAGES.get(language_code, {}) language_name = language_info.get("name", "Unknown") model_data = benchmark_details[model][language_code] details = { "Language": f"{language_info.get('flag', '')} {language_name}", "Model": model, "CoVoST WER": model_data["CoVoST"], "MLS WER": model_data["MLS"], "FLEURS WER": model_data["FLEURS"], "Average WER": model_data["average"] } return details def toggle_language_expansion(language_code): """Toggle expansion of language columns when button is clicked""" global expanded_languages # Toggle expansion state if language_code in expanded_languages: expanded_languages.remove(language_code) else: expanded_languages.add(language_code) # Recreate dataframe with new expansion state updated_df = create_multilingual_dataframe() updated_datatypes = get_multilingual_datatypes(updated_df) return gr.update(value=updated_df, datatype=updated_datatypes) # Initialize multilingual dataframe multilingual_df = create_multilingual_dataframe() def create_longform_dataframe(): """Create longform dataframe from CSV data""" if longform_csv_path is not None and longform_csv_path.exists(): longform_raw_df = pd.read_csv(longform_csv_path) longform_data = [] for _, row_data in longform_raw_df.iterrows(): model_name = row_data['model_id'] # Get values from CSV, similar to other tabs earnings21_wer = row_data.get('earnings21', -1) earnings22_wer = row_data.get('earnings22', -1) coraal_wer = row_data.get('coraal_avg', -1) rtfx_value = row_data.get('RTFx', 0) # Calculate average WER from available datasets available_wers = [w for w in [earnings21_wer, earnings22_wer, coraal_wer] if w != -1 and w > 0] avg_wer = round(np.mean(available_wers), 2) if available_wers else 0.0 row = { "Model": make_clickable_model(model_name), "Average WER ⬇️": avg_wer, "RTFx ⬆️️": rtfx_value if rtfx_value > 0 else "NA", "Earnings21": earnings21_wer if earnings21_wer != -1 else "NA", "Earnings22": earnings22_wer if earnings22_wer != -1 else "NA", "CORAAL": coraal_wer if coraal_wer != -1 else "NA", } longform_data.append(row) longform_df = pd.DataFrame(longform_data) longform_df = longform_df.sort_values(by='Average WER ⬇️') return longform_df # Initialize longform dataframe longform_df = create_longform_dataframe() def _safe_mean(values): """Compute mean of non-NA, positive values. Return None if empty.""" valid = [v for v in values if v is not None and v != "NA" and v > 0] return round(np.mean(valid), 2) if valid else None def create_private_data_dataframe(): """Create private data dataframe by merging Appen and DataoceanAI benchmarks. Averages are computed as follows: - Avg WER: macro-average of per-provider averages (equal weight per provider) - Avg Scripted: macro-average of all scripted datasets - Avg Conversational: macro-average of all conversational datasets - Avg US: macro-average of all US-accent datasets - Avg non-US: macro-average of all non-US-accent datasets """ appen_df = None dataocean_df = None # Only keep the model column and the raw per-dataset score columns; # ignore any pre-computed average columns in the CSVs. appen_keep_cols = ["model"] + APPEN_SCRIPTED + APPEN_CONVERSATIONAL dataocean_keep_cols = ["model"] + DATAOCEAN_SCRIPTED + DATAOCEAN_CONVERSATIONAL if appen_csv_path is not None and Path(appen_csv_path).exists(): appen_df = pd.read_csv(appen_csv_path) appen_df = appen_df[[c for c in appen_keep_cols if c in appen_df.columns]] if dataocean_csv_path is not None and Path(dataocean_csv_path).exists(): dataocean_df = pd.read_csv(dataocean_csv_path) dataocean_df = dataocean_df[[c for c in dataocean_keep_cols if c in dataocean_df.columns]] if appen_df is None and dataocean_df is None: raise Exception("No private data CSV files found") # Determine the set of models present in either provider models = set() if appen_df is not None: models |= set(appen_df["model"].tolist()) if dataocean_df is not None: models |= set(dataocean_df["model"].tolist()) private_data_rows = [] private_avg_wer_map = {} # model -> avg WER for the main leaderboard column private_scripted_map = {} # model -> avg scripted WER private_conversational_map = {} # model -> avg conversational WER for model_name in sorted(models): appen_row = None dataocean_row = None if appen_df is not None and model_name in appen_df["model"].values: appen_row = appen_df[appen_df["model"] == model_name].iloc[0] if dataocean_df is not None and model_name in dataocean_df["model"].values: dataocean_row = dataocean_df[dataocean_df["model"] == model_name].iloc[0] # Collect all scores per category all_scripted = [] all_conversational = [] all_us = [] all_non_us = [] appen_all = [] dataocean_all = [] if appen_row is not None: for col in APPEN_SCRIPTED: v = appen_row.get(col, None) if v is not None and v > 0: all_scripted.append(v) appen_all.append(v) for col in APPEN_CONVERSATIONAL: v = appen_row.get(col, None) if v is not None and v > 0: all_conversational.append(v) appen_all.append(v) for col in APPEN_US: v = appen_row.get(col, None) if v is not None and v > 0: all_us.append(v) for col in APPEN_NON_US: v = appen_row.get(col, None) if v is not None and v > 0: all_non_us.append(v) if dataocean_row is not None: for col in DATAOCEAN_SCRIPTED: v = dataocean_row.get(col, None) if v is not None and v > 0: all_scripted.append(v) dataocean_all.append(v) for col in DATAOCEAN_CONVERSATIONAL: v = dataocean_row.get(col, None) if v is not None and v > 0: all_conversational.append(v) dataocean_all.append(v) for col in DATAOCEAN_US: v = dataocean_row.get(col, None) if v is not None and v > 0: all_us.append(v) for col in DATAOCEAN_NON_US: v = dataocean_row.get(col, None) if v is not None and v > 0: all_non_us.append(v) # Avg WER = macro-average of per-provider averages provider_avgs = [] if appen_all: provider_avgs.append(np.mean(appen_all)) if dataocean_all: provider_avgs.append(np.mean(dataocean_all)) avg_wer = round(np.mean(provider_avgs), 2) if provider_avgs else "NA" avg_scripted = _safe_mean(all_scripted) if all_scripted else "NA" avg_conversational = _safe_mean(all_conversational) if all_conversational else "NA" avg_us = _safe_mean(all_us) if all_us else "NA" avg_non_us = _safe_mean(all_non_us) if all_non_us else "NA" private_data_rows.append({ "Model": make_clickable_model(model_name), "Average WER ⬇️": avg_wer, "Avg Scripted": avg_scripted, "Avg Conversational": avg_conversational, "Avg US": avg_us, "Avg non-US": avg_non_us, }) private_avg_wer_map[model_name] = avg_wer private_scripted_map[model_name] = avg_scripted private_conversational_map[model_name] = avg_conversational private_df = pd.DataFrame(private_data_rows) # Convert to numeric for sorting; "NA" strings become NaN and sort to the end private_df = private_df.sort_values( by="Average WER ⬇️", key=lambda col: pd.to_numeric(col, errors="coerce"), na_position="last", ) return private_df, private_avg_wer_map, private_scripted_map, private_conversational_map # Initialize private data dataframe private_data_df, private_avg_wer_map, private_scripted_map, private_conversational_map = create_private_data_dataframe() # Add "Private data (scripted)" and "Private data (conversational)" columns to main leaderboard # Match by plain model name (strip HTML from original_df's model column) def _get_plain_model_name(html_str): import re return re.sub(r"<[^>]+>", "", str(html_str)) original_df["Private data (scripted)"] = original_df[original_df.columns[0]].apply( lambda x: private_scripted_map.get(_get_plain_model_name(x), "NA") ) original_df["Private data (conversational)"] = original_df[original_df.columns[0]].apply( lambda x: private_conversational_map.get(_get_plain_model_name(x), "NA") ) def request_model(model_text, chbcoco2017): # Determine the selected checkboxes dataset_selection = [] if chbcoco2017: dataset_selection.append("ESB Datasets tests only") if len(dataset_selection) == 0: return styled_error("You need to select at least one dataset") base_model_on_hub, error_msg = is_model_on_hub(model_text) if not base_model_on_hub: return styled_error(f"Base model '{model_text}' {error_msg}") # Construct the output dictionary current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") required_datasets = ', '.join(dataset_selection) eval_entry = { "date": current_time, "model": model_text, "datasets_selected": required_datasets } # Prepare file path DIR_OUTPUT_REQUESTS.mkdir(parents=True, exist_ok=True) fn_datasets = '@ '.join(dataset_selection) filename = model_text.replace("/","@") + "@@" + fn_datasets if filename in requested_models: return styled_error(f"A request for this model '{model_text}' and dataset(s) was already made.") try: filename_ext = filename + ".txt" out_filepath = DIR_OUTPUT_REQUESTS / filename_ext # Write the results to a text file with open(out_filepath, "w") as f: f.write(json.dumps(eval_entry)) upload_file(filename, out_filepath) # Include file in the list of uploaded files requested_models.append(filename) # Remove the local file out_filepath.unlink() return styled_message("🤗 Your request has been submitted and will be evaluated soon!
") except Exception as e: return styled_error(f"Error submitting request!") # Default ranking is computed lazily after default_columns is known _default_rank = {} def _compute_default_rank(): """Compute the baseline ranking using the default dataset selection.""" global _default_rank if _default_rank: return # already computed # Use default_datasets as the source of truth for which WER columns define the baseline df = original_df.copy() wer_cols = [c for c in default_datasets if c in df.columns] if wer_cols: for c in wer_cols: df = df[df[c] != "NA"] def compute_avg(row): vals = [float(row[c]) for c in wer_cols] return round(np.mean(vals), 2) if vals else "NA" df["Average WER ⬇️"] = df.apply(compute_avg, axis=1) df = df.sort_values(by="Average WER ⬇️") model_col = df.columns[0] for rank, (_, row) in enumerate(df.iterrows(), start=1): _default_rank[row[model_col]] = rank def filter_main_table(search_query, show_proprietary, show_llm, selected_columns): # Uncomment show_llm parameter for show_llm feature filtered_df = original_df.copy() # Filter by model name search (case-insensitive, supports comma-separated terms) model_col = filtered_df.columns[0] if search_query: terms = [t.strip() for t in search_query.split(",") if t.strip()] if terms: mask = filtered_df[model_col].str.lower().apply( lambda cell: any(term.lower() in cell for term in terms) ) filtered_df = filtered_df[mask] # Filter proprietary models if needed if not show_proprietary and "License" in filtered_df.columns: filtered_df = filtered_df[filtered_df["License"].str.lower() != "proprietary"] # Uncomment below for show_llm feature # Filter to only audio language models if checked # if show_llm: # filtered_df = filtered_df[filtered_df[model_col].apply( # lambda x: strip_html(x) in AUDIO_LM_MODELS # )] # Hide toggleable columns that the user unchecked all_toggleable = [c for c in filtered_df.columns if c not in always_visible] columns_to_hide = set(all_toggleable) - set(selected_columns) filtered_df = filtered_df[[c for c in filtered_df.columns if c not in columns_to_hide]] # Recompute average WER from the selected WER benchmark columns (exclude RTFx) wer_cols = [c for c in selected_columns if c in filtered_df.columns and c not in EXCLUDED_AVG_COLS] if wer_cols: # Drop models that have NA for any selected WER benchmark column for c in wer_cols: filtered_df = filtered_df[filtered_df[c] != "NA"] def compute_avg(row): vals = [float(row[c]) for c in wer_cols] return round(np.mean(vals), 2) if vals else "NA" filtered_df["Average WER ⬇️"] = filtered_df.apply(compute_avg, axis=1) else: filtered_df["Average WER ⬇️"] = "NA" filtered_df = filtered_df.sort_values(by="Average WER ⬇️") # Compute Rank Δ compared to default ranking, but only among models # present in the current filtered view (apples-to-apples comparison). model_col = filtered_df.columns[0] current_models = set(filtered_df[model_col].tolist()) # Re-rank the default ordering using only the models in the current view default_subset_rank = {} rank = 1 for model_html in sorted(_default_rank, key=_default_rank.get): if model_html in current_models: default_subset_rank[model_html] = rank rank += 1 new_ranks = {} for new_rank, (_, row) in enumerate(filtered_df.iterrows(), start=1): new_ranks[row[model_col]] = new_rank def rank_delta(model_html): old = default_subset_rank.get(model_html) new = new_ranks.get(model_html) if old is None or new is None: return "—" delta = old - new # positive = model rose, negative = model dropped if delta > 0: return f"▲{delta}" elif delta < 0: return f"▼{abs(delta)}" return "—" filtered_df.insert(2, "Rank Δ", filtered_df[model_col].apply(rank_delta)) # Reorder columns: model, Avg WER, Rank Δ, RTFx, then metadata (if toggled on), # then non-default datasets, then default datasets. visible_cols = list(filtered_df.columns) model_col_name = filtered_df.columns[0] # "model" (lowercase, from CSV) header = [model_col_name, "Average WER ⬇️", "Rank Δ", "RTFx ⬆️️"] header_cols = [c for c in header if c in visible_cols] rest = [c for c in visible_cols if c not in header_cols] metadata = [c for c in rest if c in METADATA_COLUMNS] non_default_wer = [c for c in rest if c not in metadata and c not in default_datasets and c not in EXCLUDED_AVG_COLS] default_wer = [c for c in rest if c in default_datasets] reordered = header_cols + metadata + non_default_wer + default_wer filtered_df = filtered_df[reordered] return filtered_df def strip_html(text): """Remove HTML tags to get plain model name.""" return re.sub(r"<[^>]+>", "", str(text)) def create_pareto_plot(): """Create a Pareto plot of Average WER vs RTFx from the main leaderboard.""" df = original_df.copy() model_col = df.columns[0] # Extract plain model names and numeric WER/RTFx df["_name"] = df[model_col].apply(strip_html) df["_wer"] = pd.to_numeric(df["Average WER ⬇️"], errors="coerce") df["_rtfx"] = pd.to_numeric(df["RTFx ⬆️️"], errors="coerce") # Drop rows with missing WER or RTFx plot_df = df.dropna(subset=["_wer", "_rtfx"]).copy() plot_df = plot_df[plot_df["_rtfx"] > 0] # Compute Pareto front: no other model has both lower WER AND higher RTFx wer = plot_df["_wer"].values rtfx = plot_df["_rtfx"].values is_pareto = np.ones(len(wer), dtype=bool) for i in range(len(wer)): for j in range(len(wer)): if i != j and wer[j] <= wer[i] and rtfx[j] >= rtfx[i] and (wer[j] < wer[i] or rtfx[j] > rtfx[i]): is_pareto[i] = False break pareto_df = plot_df[is_pareto].sort_values("_wer") non_pareto_df = plot_df[~is_pareto] fig = go.Figure() # Non-Pareto models: no text labels, name shown on hover fig.add_trace(go.Scatter( x=non_pareto_df["_wer"], y=non_pareto_df["_rtfx"], mode="markers", marker=dict(size=8, color="lightblue", opacity=0.7), text=non_pareto_df["_name"], hovertemplate="%{text}