from pathlib import Path # Directory where requests by models are stored DIR_OUTPUT_REQUESTS = Path("requested_models") EVAL_REQUESTS_PATH = Path("eval_requests") ########################## # Text definitions # ########################## banner_url = "https://huggingface.co/datasets/reach-vb/random-images/resolve/main/asr_leaderboard.png" BANNER = f'

' TITLE = "

🤗 Open Automatic Speech Recognition Leaderboard " INTRODUCTION_TEXT = "📐 The 🤗 Open ASR Leaderboard evaluates open-source and proprietary speech recognition models on English and multiple European languages. \ \nWe report the Average WER (⬇️ lower the better) and RTFx (⬆️ higher the better). Models are ranked based on their Average WER, from lowest to highest. Check the '🤗 About' tab to understand how models are evaluated. \ \nTo promote transparency and reproducibility, evaluation scripts can be found on GitHub. Further trends and observations can be found in our paper. \ \n\nIf you want results for a model (or dataset) not listed here, you can directly open a pull request (PR) on GitHub!" CITATION_TEXT = """@misc{srivastav2025openasrleaderboardreproducible, title={Open ASR Leaderboard: Towards Reproducible and Transparent Multilingual and Long-Form Speech Recognition Evaluation}, author={Vaibhav Srivastav and Steven Zheng and Eric Bezzam and Eustache Le Bihan and Nithin Koluguri and Piotr Żelasko and Somshubra Majumdar and Adel Moumen and Sanchit Gandhi}, year={2026}, eprint={2510.06961}, archivePrefix={arXiv}, primaryClass={cs.CL}, url={https://arxiv.org/abs/2510.06961}, } """ # For new changes, add a bullet with the date at the start (it is extracted for the "Last updated..." info in the UI) CHANGELOG_TEXT = """ - 20 May 2026 - Removed Tedlium v3 from main and longform tabs due to license change in original data. Related commit for updated leaderboard results: https://huggingface.co/datasets/hf-audio/open-asr-leaderboard-results/commit/f3ff7c9d583f4beaf908f2b2c18f3055040e515b - 5 May 2026 — Added 🔒 Private Data tab with benchmarks from Appen Inc. and DataoceanAI (11 datasets covering scripted and conversational speech across US, British, Australian, Canadian, and Indian accents). Private data average WER is now available as a toggleable column in the main leaderboard. Added rank column to show how ordering changes. 
""" METRICS_TAB_TEXT = """ Here you will find details about the speech recognition metrics and datasets used in our leaderboard. ## Benchmark datasets For each task, we take a macro-average over all datasets. The "Leaderboard" tab, namely short-form English, uses the test split of the following datasets. For convenience, they are aggregated into a single dataset [here](https://huggingface.co/datasets/hf-audio/open-asr-leaderboard). | Dataset | Duration [h] | License | Source | Style | Transcriptions | | --------------------------------------------------------------------------------------- | ------------ | --------------- | --------------------------- | -------------------- | ------------------------------- | | [AMI Meeting Corpus](https://huggingface.co/datasets/edinburghcstr/ami) | 9 | CC-BY-4.0 | Meetings | Spontaneous | Punctuated, cased, disfluencies | | [Earnings22](https://huggingface.co/datasets/distil-whisper/earnings22) | 119 | CC-BY-SA-4.0 | Earnings calls | Oratory, spontaneous | Punctuated, cased, disfluencies | | [GigaSpeech](https://huggingface.co/datasets/speechcolab/gigaspeech) | 40 | Apache-2.0 | Audiobook, podcast, YouTube | Read, spontaneous | Punctuated, disfluencies | | [LibriSpeech (clean)](https://huggingface.co/datasets/openslr/librispeech_asr) | 5.4 | CC-BY-4.0 | Audiobooks | Read | Normalized | | [LibriSpeech (other)](https://huggingface.co/datasets/openslr/librispeech_asr) | 5.1 | CC-BY-4.0 | Audiobooks (noisier) | Read | Normalized | | [SPGISpeech](https://huggingface.co/datasets/kensho/spgispeech) | 100 | User Agreement | Financial meetings | Oratory, spontaneous | Punctuated, cased | | [VoxPopuli](https://huggingface.co/datasets/facebook/voxpopuli) | 5 | CC0 | European Parliament | Oratory | Punctuated | The "Multilingual" tab uses the test split of the following datasets. For convenience, they are aggregated into a single dataset [here](https://huggingface.co/datasets/nithinraok/asr-leaderboard-datasets). 
| Dataset | Languages | Duration [h] | License | Source | Style | Transcriptions | | ---------------------------------------------------------------- | ------------------ | ------------ | ------------ | ----------- | ----- | ----------------- | | [CoVoST-2](https://huggingface.co/datasets/facebook/covost2) | de, fr, it, es | 5.3–23 | CC-BY-NC-4.0 | Open domain | Read | Punctuated, cased | | [FLEURS](https://huggingface.co/datasets/google/fleurs) | de, fr, it, es, pt | 2.0–3.5 | CC-BY-4.0 | Wikipedia | Read | Punctuated, cased | | [MLS](https://www.openslr.org/94) | fr, it, es, pt | 0.8–6.3 | CC-BY-4.0 | Audiobooks | Read | Normalized | The "Longform" tab uses the test split of the following datasets, and the entire CORAAL dataset. For convenience, they are aggregated [here](https://huggingface.co/datasets/hf-audio/asr-leaderboard-longform). CORAAL is [separate](https://huggingface.co/datasets/bezzam/coraal) as each split has a different metadata structure. | Dataset | Duration [h] | License | Source | Style | Transcriptions | | ---------------------------------------------------------------------------- | ------------ | --------------- | -------------------------- | -------------------- | ------------------------------- | | [CORAAL](https://oraal.github.io/coraal) | 159 | CC-BY-NC-4.0 | Sociolinguistic interviews | Spontaneous | Punctuated, cased, disfluencies | | [Earnings21](https://huggingface.co/datasets/Revai/earnings21) | 39 | CC-BY-SA-4.0 | Earnings calls | Oratory, spontaneous | Punctuated, cased, disfluencies | | [Earnings22](https://huggingface.co/datasets/distil-whisper/earnings22) | 119 | CC-BY-SA-4.0 | Earnings calls | Oratory, spontaneous | Punctuated, cased, disfluencies | The "Private data" tab uses the following datasets, which are not publicly available. 
| Dataset | Accent | Duration [h] | Male (%) / Female (%) | Style | Transcription | | --- | --- | --- | --- | --- | --- | | Appen Scripted AU | Australian | 1.42 | 49 / 51 | Read | Punctuated, cased. | | Appen Scripted CA | Canadian | 1.53 | 52 / 48 | Read | Punctuated, cased. | | Appen Scripted IN | Indian | 1.02 | 49 / 51 | Read | Punctuated, cased. | | Appen Scripted US | American | 1.45 | 49 / 51 | Read | Punctuated, cased. | | Appen Conversational IN | Indian | 1.37 | 51 / 49 | Conversational, spontaneous | Punctuated, disfluencies. | | Appen Conversational US003 | American | 1.64 | 49 / 51 | Conversational, spontaneous | Punctuated, cased, disfluencies. | | Appen Conversational US004 | American | 1.65 | 49 / 51 | Conversational, spontaneous | Punctuated, disfluencies. | | DataoceanAI Scripted US | American | 2.43 | 54 / 46 | Read | Punctuated, cased (proper nouns), disfluencies. | | DataoceanAI Scripted GB | British | 2.43 | 47 / 53 | Read | Punctuated, disfluencies. | | DataoceanAI Conversational US | American | 8.82 | NA | Conversational, spontaneous | Punctuated, disfluencies. | | DataoceanAI Conversational GB | British | 5.96 | NA | Conversational, spontaneous | Punctuated, disfluencies. | Below are sample audio showing the variety of content (scripted, conversational, acronyms, disfluencies, proper nouns). ## Metrics Models are evaluated jointly using the Word Error Rate (WER) and Inverse Real Time Factor (RTFx) metrics. The WER metric is used to assess the accuracy of a system, and the RTFx the inference speed. Models are ranked in the leaderboard based on their WER, lowest to highest. Crucially, the WER and RTFx values are computed for the same inference run using a single script. The implication of this is two-fold: 1. The WER and RTFx values are coupled: for a given WER, one can expect to achieve the corresponding RTFx. This allows the proposer to trade-off lower WER for higher RTFx should they wish. 2. 
The WER and RTFx values are averaged over all audios in the benchmark (in the order of thousands of audios). For details on reproducing the benchmark numbers, refer to the [Open ASR GitHub repository](https://github.com/huggingface/open_asr_leaderboard#evaluate-a-model). ### Word Error Rate (WER) Word Error Rate is used to measure the accuracy of automatic speech recognition systems. It calculates the percentage of words in the system's output that differ from the reference (correct) transcript. A lower WER value indicates higher accuracy. Take the following example: | Reference: | the | cat | sat | on | the | mat | |-------------|-----|-----|---------|-----|-----|-----| | Prediction: | the | cat | sit | on | the | | | | Label: | ✅ | ✅ | S | ✅ | ✅ | D | Here, we have: * 1 substitution ("sit" instead of "sat") * 0 insertions * 1 deletion ("mat" is missing) This gives 2 errors in total. To get our word error rate, we divide the total number of errors (substitutions + insertions + deletions) by the total number of words in our reference (N), which for this example is 6: ``` WER = (S + I + D) / N = (1 + 0 + 1) / 6 = 0.333 ``` Giving a WER of 0.33, or 33%. Not all models produce transcripts with punctuation, casing, or disfluencies; in particular, some models explicitly remove the latter. To account for discrepancies between model outputs and dataset transcriptions, we normalize all text prior to computing WER. This normalization removes punctuation and casing, and applies a pipeline closely following that of Whisper. See [here](https://github.com/huggingface/open_asr_leaderboard/tree/main/normalizer) for the relevant code. The pipeline includes number normalization (e.g. "zero" to "0"), spelling standardization, and the removal of filler words. ### Inverse Real Time Factor (RTFx) Inverse Real Time Factor is a measure of the latency of automatic speech recognition systems, i.e. how long it takes a model to process a given amount of speech. 
It is defined as: ``` RTFx = (number of seconds of audio inferred) / (compute time in seconds) ``` Therefore, an RTFx of 1 means a system processes speech as fast as it's spoken, while an RTFx of 2 means it takes half the time. Thus, a higher RTFx value indicates lower latency. ## How to reproduce our results The Open ASR Leaderboard is a continuous and community effort to benchmark speech recognition models. Along with the leaderboard we've open-sourced the codebase used for running these evaluations. For more details head over to our repo at: https://github.com/huggingface/open_asr_leaderboard We'd love to know which other models (or datasets) you'd like us to benchmark next. Contributions are more than welcome! ♥️ """ # Multilingual benchmark definitions EU_LANGUAGES = { "de": {"name": "German", "flag": "🇩🇪", "datasets": ["fleurs", "covost"]}, "fr": {"name": "French", "flag": "🇫🇷", "datasets": ["mls", "fleurs", "covost"]}, "it": {"name": "Italian", "flag": "🇮🇹", "datasets": ["mls", "fleurs", "covost"]}, "es": {"name": "Spanish", "flag": "🇪🇸", "datasets": ["mls", "fleurs", "covost"]}, "pt": {"name": "Portuguese", "flag": "🇵🇹", "datasets": ["mls", "fleurs"]} } MULTILINGUAL_TAB_TEXT = """ ## 🌍 Multilingual ASR Evaluation """ LONGFORM_TAB_TEXT = """ ## 📝 Long-form ASR Evaluation """ PRIVATE_DATA_TAB_TEXT = """ ## 🔒 Private Data ASR Evaluation This tab evaluates models on private (non-public) speech datasets from multiple data providers. The purpose of this benchmark is to address risks of overfitting to public datasets, and to incentivize the development of models that generalize well to real-world data. For more information, check out our [blog post](https://huggingface.co/blog/open-asr-leaderboard-private-data). Scores are aggregated so that no individual data provider or split is exposed. - Average WER: Macro-average of per-provider averages (providers weighted equally). - Avg Scripted: Macro-average across all scripted-speech datasets. 
- Avg Conversational: Macro-average across all conversational-speech datasets. - Avg US: Macro-average across all US-accent datasets. - Avg non-US: Macro-average across all non-US-accent datasets. For details on the benchmark datasets used in this tab, see the 🤗 About tab. ### Acknowledgements We would like to thank [Appen Inc.](https://huggingface.co/AppenAIResearch) and [DataoceanAI](https://huggingface.co/DataoceanAI1) for generously providing private evaluation data for this benchmark 🙏 """ LEADERBOARD_CSS = """ #leaderboard-table th, #leaderboard-table td { min-width: 70px; } #leaderboard-table th .header-content { white-space: nowrap; } #leaderboard-table td { white-space: nowrap; overflow: visible !important; text-overflow: clip !important; max-width: none !important; } #leaderboard-table { width: 100% !important; table-layout: auto !important; } #multilingual-table th, #multilingual-table td { min-width: 70px; } #multilingual-table th .header-content { white-space: nowrap; } #multilingual-table td { white-space: nowrap; overflow: visible !important; text-overflow: clip !important; max-width: none !important; } #multilingual-table { width: 100% !important; table-layout: auto !important; } #multilingual-table th:hover { background-color: var(--table-row-focus); } #longform-table th, #longform-table td { min-width: 70px; } #longform-table th .header-content { white-space: nowrap; } #longform-table td { white-space: nowrap; overflow: visible !important; text-overflow: clip !important; max-width: none !important; } #longform-table { width: 100% !important; table-layout: auto !important; } #longform-table th:hover { background-color: var(--table-row-focus); } #private-data-table th, #private-data-table td { min-width: 70px; } #private-data-table th .header-content { white-space: nowrap; } #private-data-table td { white-space: nowrap; overflow: visible !important; text-overflow: clip !important; max-width: none !important; } #private-data-table { width: 100% 
!important; table-layout: auto !important; } #private-data-table th:hover { background-color: var(--table-row-focus); } .language-detail-modal { background: var(--background-fill-primary); border: 1px solid var(--border-color-primary); border-radius: 8px; padding: 1rem; margin: 1rem 0; } """