From 40dd08ea61b334e376bef2e78609dd8f5fe00f5f Mon Sep 17 00:00:00 2001 From: Vaibhav Srivastav Date: Wed, 4 Jan 2023 19:38:01 +0100 Subject: [PATCH] Created using Colaboratory --- Complete_guide_to_audio_datasets.ipynb | 1757 ++++++++++++++++++++++++ 1 file changed, 1757 insertions(+) create mode 100644 Complete_guide_to_audio_datasets.ipynb diff --git a/Complete_guide_to_audio_datasets.ipynb b/Complete_guide_to_audio_datasets.ipynb new file mode 100644 index 0000000..f75cecb --- /dev/null +++ b/Complete_guide_to_audio_datasets.ipynb @@ -0,0 +1,1757 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "include_colab_link": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + }, + "accelerator": "GPU", + "gpuClass": "standard", + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "36a167954b254a00a4e16d18294cf7c3": { + "model_module": "@jupyter-widgets/controls", + "model_name": "VBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "VBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "VBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_49d6d159e000483c9fe4c032a055f992", + "IPY_MODEL_5876f3247ae24ec5b8a7838f4d5bc2be", + "IPY_MODEL_68f09fd174f7462aab5510bee77fe160", + "IPY_MODEL_df79fe1fe5264c09a91260932317d38f", + "IPY_MODEL_b97b4f99d22d40a6b227bed5c20294ec" + ], + "layout": "IPY_MODEL_508c19158b644ab9a78773d02eee49ee" + } + }, + "49d6d159e000483c9fe4c032a055f992": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + 
"_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_3bbca4ffd06e445295a0f139aa3178c4", + "placeholder": "​", + "style": "IPY_MODEL_5d9776d2fe254ee4aa0d69ead623e9c8", + "value": "

Copy a token from your Hugging Face\ntokens page and paste it below.
Immediately click login after copying\nyour token or it might be stored in plain text in this notebook file.
" + } + }, + "5876f3247ae24ec5b8a7838f4d5bc2be": { + "model_module": "@jupyter-widgets/controls", + "model_name": "PasswordModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "PasswordModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "PasswordView", + "continuous_update": true, + "description": "Token:", + "description_tooltip": null, + "disabled": false, + "layout": "IPY_MODEL_ac4e7402cb0a49988e7460c2a70a6cf5", + "placeholder": "​", + "style": "IPY_MODEL_6f6bb7c449384917a94106a5f83ea3ca", + "value": "" + } + }, + "68f09fd174f7462aab5510bee77fe160": { + "model_module": "@jupyter-widgets/controls", + "model_name": "CheckboxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "CheckboxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "CheckboxView", + "description": "Add token as git credential?", + "description_tooltip": null, + "disabled": false, + "indent": true, + "layout": "IPY_MODEL_44028a87160a4f15ab0293a514a5668b", + "style": "IPY_MODEL_cfad88cb21394f8ea9509c05be6daef5", + "value": true + } + }, + "df79fe1fe5264c09a91260932317d38f": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ButtonModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ButtonModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ButtonView", + "button_style": "", + "description": "Login", + "disabled": false, + "icon": "", + "layout": 
"IPY_MODEL_4c8951ef8dda42bf84c70e9c8c94a954", + "style": "IPY_MODEL_d4ba888c575f4a3abd3b8e93c474efe4", + "tooltip": "" + } + }, + "b97b4f99d22d40a6b227bed5c20294ec": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_34707c463518415e8262ba32134db84a", + "placeholder": "​", + "style": "IPY_MODEL_cd33610fb5f749a6a52bf66b1d676a11", + "value": "\nPro Tip: If you don't already have one, you can create a dedicated\n'notebooks' token with 'write' access, that you can then easily reuse for all\nnotebooks. " + } + }, + "508c19158b644ab9a78773d02eee49ee": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": "center", + "align_self": null, + "border": null, + "bottom": null, + "display": "flex", + "flex": null, + "flex_flow": "column", + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + 
"object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": "50%" + } + }, + "3bbca4ffd06e445295a0f139aa3178c4": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "5d9776d2fe254ee4aa0d69ead623e9c8": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "ac4e7402cb0a49988e7460c2a70a6cf5": { + "model_module": "@jupyter-widgets/base", 
+ "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "6f6bb7c449384917a94106a5f83ea3ca": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "44028a87160a4f15ab0293a514a5668b": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": 
"1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "cfad88cb21394f8ea9509c05be6daef5": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "4c8951ef8dda42bf84c70e9c8c94a954": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + 
"grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d4ba888c575f4a3abd3b8e93c474efe4": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ButtonStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ButtonStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "button_color": null, + "font_weight": "" + } + }, + "34707c463518415e8262ba32134db84a": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": 
null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "cd33610fb5f749a6a52bf66b1d676a11": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + } + } + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Complete Guide to Audio Datasets [Colab Edition]\n", + "\n", + "by: [Vaibhav (VB) Srivastav](https://twitter.com/reach_vb) and [Sanchit Gandhi](https://huggingface.co/sanchit-gandhi)\n", + "\n", + "The objective of this Colab is to reinforce the 🤗 Datasets concepts covered in the accompanying [blog post](https://huggingface.co/blog/audio-datasets) through more 'hands-on' examples. The reader is advised to read this blog post prior to running this Colab. In this Colab, we'll extend the concepts from the blog post to build an end-to-end speech recognition pipeline.\n", + "\n", + "Automatic Speech Recognition (ASR) models are measured by their performance on unseen audio data. In this Colab we'll measure the performance of OpenAI's [Whisper model](https://openai.com/blog/whisper/) on **8 ASR datasets** with one script. Using streaming mode, we'll require no more than 20GB of disk space to achieve this." 
+ ], + "metadata": { + "id": "-sRoTHHrVlh5" + } + }, + { + "cell_type": "markdown", + "source": [ + "# Prepare Environment\n", + "\n", + "Let's begin by installing the packages we'll need to process audio datasets. We require the Unix package `ffmpeg` version 4. We'll also need the Python package `datasets`, as well as some other popular Hugging Face libraries like `transformers` and `evaluate` for our ASR pipeline.\n", + "\n", + "*Note*: Do make sure to select a GPU runtime if you haven't already!" + ], + "metadata": { + "id": "ecG9wrKjV1VA" + } + }, + { + "cell_type": "code", + "source": [ + "!add-apt-repository -y ppa:jonathonf/ffmpeg-4 && apt update && apt install -y ffmpeg\n", + "!pip install --quiet datasets transformers evaluate huggingface_hub jiwer" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "yF4v_-jQpziy", + "outputId": "1db8a6e4-2b83-4f23-b0c2-ae859da3dbd8" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\r0% [Working]\r \rGet:1 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]\n", + "Get:2 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]\n", + "Hit:3 http://archive.ubuntu.com/ubuntu bionic InRelease\n", + "Ign:4 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64 InRelease\n", + "Get:5 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64 InRelease [1,581 B]\n", + "Hit:6 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64 Release\n", + "Hit:7 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease\n", + "Get:8 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]\n", + "Hit:9 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease\n", + "Get:10 http://archive.ubuntu.com/ubuntu bionic-backports InRelease [83.3 kB]\n", + "Hit:11 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu 
bionic InRelease\n", + "Get:12 http://security.ubuntu.com/ubuntu bionic-security/universe amd64 Packages [1,561 kB]\n", + "Hit:13 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu bionic InRelease\n", + "Get:14 http://ppa.launchpad.net/jonathonf/ffmpeg-4/ubuntu bionic InRelease [15.9 kB]\n", + "Get:15 http://security.ubuntu.com/ubuntu bionic-security/main amd64 Packages [3,067 kB]\n", + "Get:16 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64 Packages [1,038 kB]\n", + "Get:18 http://archive.ubuntu.com/ubuntu bionic-updates/universe amd64 Packages [2,338 kB]\n", + "Get:19 http://archive.ubuntu.com/ubuntu bionic-updates/main amd64 Packages [3,493 kB]\n", + "Get:20 http://ppa.launchpad.net/jonathonf/ffmpeg-4/ubuntu bionic/main amd64 Packages [12.5 kB]\n", + "Fetched 11.8 MB in 9s (1,303 kB/s)\n", + "Reading package lists... Done\n", + "Hit:1 http://security.ubuntu.com/ubuntu bionic-security InRelease\n", + "Hit:2 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease\n", + "Ign:3 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64 InRelease\n", + "Hit:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64 InRelease\n", + "Hit:5 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease\n", + "Hit:6 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64 Release\n", + "Hit:7 http://archive.ubuntu.com/ubuntu bionic InRelease\n", + "Hit:8 http://archive.ubuntu.com/ubuntu bionic-updates InRelease\n", + "Hit:9 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease\n", + "Hit:10 http://archive.ubuntu.com/ubuntu bionic-backports InRelease\n", + "Hit:11 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu bionic InRelease\n", + "Hit:12 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu bionic InRelease\n", + "Hit:13 http://ppa.launchpad.net/jonathonf/ffmpeg-4/ubuntu bionic InRelease\n", + "Reading package lists... 
Done\n", + "Building dependency tree \n", + "Reading state information... Done\n", + "19 packages can be upgraded. Run 'apt list --upgradable' to see them.\n", + "Reading package lists... Done\n", + "Building dependency tree \n", + "Reading state information... Done\n", + "The following package was automatically installed and is no longer required:\n", + " libnvidia-common-460\n", + "Use 'apt autoremove' to remove it.\n", + "The following additional packages will be installed:\n", + " libaom0 libavcodec58 libavdevice58 libavfilter7 libavformat58 libavresample4\n", + " libavutil56 libcodec2-0.7 liblilv-0-0 libmysofa1 libpocketsphinx3\n", + " libpostproc55 librabbitmq4 libserd-0-0 libsord-0-0 libsphinxbase3\n", + " libsratom-0-0 libsrt1-gnutls libswresample3 libswscale5 libvidstab1.1\n", + " libx264-155 libx265-192 libzimg2\n", + "Suggested packages:\n", + " ffmpeg-doc serdi sordi\n", + "Recommended packages:\n", + " pocketsphinx-hmm-en-hub4wsj | pocketsphinx-hmm-zh-tdt\n", + " | pocketsphinx-hmm-en-tidigits pocketsphinx-lm-en-hub4\n", + " | pocketsphinx-lm-zh-hans-gigatdt | pocketsphinx-lm-zh-hant-gigatdt\n", + "The following NEW packages will be installed:\n", + " libaom0 libavcodec58 libavdevice58 libavfilter7 libavformat58 libavresample4\n", + " libavutil56 libcodec2-0.7 liblilv-0-0 libmysofa1 libpocketsphinx3\n", + " libpostproc55 librabbitmq4 libserd-0-0 libsord-0-0 libsphinxbase3\n", + " libsratom-0-0 libsrt1-gnutls libswresample3 libswscale5 libvidstab1.1\n", + " libx264-155 libx265-192 libzimg2\n", + "The following packages will be upgraded:\n", + " ffmpeg\n", + "1 upgraded, 24 newly installed, 0 to remove and 18 not upgraded.\n", + "Need to get 13.4 MB of archives.\n", + "After this operation, 49.3 MB of additional disk space will be used.\n", + "Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libcodec2-0.7 amd64 0.7-1 [202 kB]\n", + "Get:2 http://ppa.launchpad.net/jonathonf/ffmpeg-4/ubuntu bionic/main amd64 libaom0 amd64 
1.0.0.errata1-3~18.04.york0 [1,165 kB]\n", + "Get:3 http://archive.ubuntu.com/ubuntu bionic-updates/universe amd64 librabbitmq4 amd64 0.8.0-1ubuntu0.18.04.2 [33.9 kB]\n", + "Get:4 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libserd-0-0 amd64 0.28.0~dfsg0-1 [37.0 kB]\n", + "Get:5 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libsord-0-0 amd64 0.16.0~dfsg0-1 [20.2 kB]\n", + "Get:6 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libsratom-0-0 amd64 0.6.0~dfsg0-1 [15.8 kB]\n", + "Get:7 http://archive.ubuntu.com/ubuntu bionic/universe amd64 liblilv-0-0 amd64 0.24.2~dfsg0-1 [38.0 kB]\n", + "Get:8 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libsphinxbase3 amd64 0.8+5prealpha+1-1 [118 kB]\n", + "Get:9 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libpocketsphinx3 amd64 0.8.0+real5prealpha-1ubuntu2 [122 kB]\n", + "Get:10 http://ppa.launchpad.net/jonathonf/ffmpeg-4/ubuntu bionic/main amd64 libavutil56 amd64 7:4.3.2-0york0~18.04 [295 kB]\n", + "Get:11 http://ppa.launchpad.net/jonathonf/ffmpeg-4/ubuntu bionic/main amd64 libswresample3 amd64 7:4.3.2-0york0~18.04 [70.0 kB]\n", + "Get:12 http://ppa.launchpad.net/jonathonf/ffmpeg-4/ubuntu bionic/main amd64 libx264-155 amd64 2:0.155.2917+git0a84d98-2~18.04.york0 [529 kB]\n", + "Get:13 http://ppa.launchpad.net/jonathonf/ffmpeg-4/ubuntu bionic/main amd64 libx265-192 amd64 3.4-0york0~18.04 [1,086 kB]\n", + "Get:14 http://ppa.launchpad.net/jonathonf/ffmpeg-4/ubuntu bionic/main amd64 libavcodec58 amd64 7:4.3.2-0york0~18.04 [4,952 kB]\n", + "Get:15 http://ppa.launchpad.net/jonathonf/ffmpeg-4/ubuntu bionic/main amd64 libsrt1-gnutls amd64 1.4.1-5~18.04.york0 [235 kB]\n", + "Get:16 http://ppa.launchpad.net/jonathonf/ffmpeg-4/ubuntu bionic/main amd64 libavformat58 amd64 7:4.3.2-0york0~18.04 [1,043 kB]\n", + "Get:17 http://ppa.launchpad.net/jonathonf/ffmpeg-4/ubuntu bionic/main amd64 libmysofa1 amd64 1.0~dfsg0-2~18.04.york0 [39.3 kB]\n", + "Get:18 
http://ppa.launchpad.net/jonathonf/ffmpeg-4/ubuntu bionic/main amd64 libpostproc55 amd64 7:4.3.2-0york0~18.04 [65.0 kB]\n", + "Get:19 http://ppa.launchpad.net/jonathonf/ffmpeg-4/ubuntu bionic/main amd64 libswscale5 amd64 7:4.3.2-0york0~18.04 [171 kB]\n", + "Get:20 http://ppa.launchpad.net/jonathonf/ffmpeg-4/ubuntu bionic/main amd64 libvidstab1.1 amd64 1.1.0-2~18.04.york1 [36.6 kB]\n", + "Get:21 http://ppa.launchpad.net/jonathonf/ffmpeg-4/ubuntu bionic/main amd64 libzimg2 amd64 3.0.1-0york0~18.04 [183 kB]\n", + "Get:22 http://ppa.launchpad.net/jonathonf/ffmpeg-4/ubuntu bionic/main amd64 libavfilter7 amd64 7:4.3.2-0york0~18.04 [1,254 kB]\n", + "Get:23 http://ppa.launchpad.net/jonathonf/ffmpeg-4/ubuntu bionic/main amd64 libavdevice58 amd64 7:4.3.2-0york0~18.04 [90.0 kB]\n", + "Get:24 http://ppa.launchpad.net/jonathonf/ffmpeg-4/ubuntu bionic/main amd64 libavresample4 amd64 7:4.3.2-0york0~18.04 [67.3 kB]\n", + "Get:25 http://ppa.launchpad.net/jonathonf/ffmpeg-4/ubuntu bionic/main amd64 ffmpeg amd64 7:4.3.2-0york0~18.04 [1,556 kB]\n", + "Fetched 13.4 MB in 9s (1,456 kB/s)\n", + "Selecting previously unselected package libaom0:amd64.\n", + "(Reading database ... 
123991 files and directories currently installed.)\n", + "Preparing to unpack .../00-libaom0_1.0.0.errata1-3~18.04.york0_amd64.deb ...\n", + "Unpacking libaom0:amd64 (1.0.0.errata1-3~18.04.york0) ...\n", + "Selecting previously unselected package libavutil56:amd64.\n", + "Preparing to unpack .../01-libavutil56_7%3a4.3.2-0york0~18.04_amd64.deb ...\n", + "Unpacking libavutil56:amd64 (7:4.3.2-0york0~18.04) ...\n", + "Selecting previously unselected package libcodec2-0.7:amd64.\n", + "Preparing to unpack .../02-libcodec2-0.7_0.7-1_amd64.deb ...\n", + "Unpacking libcodec2-0.7:amd64 (0.7-1) ...\n", + "Selecting previously unselected package libswresample3:amd64.\n", + "Preparing to unpack .../03-libswresample3_7%3a4.3.2-0york0~18.04_amd64.deb ...\n", + "Unpacking libswresample3:amd64 (7:4.3.2-0york0~18.04) ...\n", + "Selecting previously unselected package libx264-155:amd64.\n", + "Preparing to unpack .../04-libx264-155_2%3a0.155.2917+git0a84d98-2~18.04.york0_amd64.deb ...\n", + "Unpacking libx264-155:amd64 (2:0.155.2917+git0a84d98-2~18.04.york0) ...\n", + "Selecting previously unselected package libx265-192:amd64.\n", + "Preparing to unpack .../05-libx265-192_3.4-0york0~18.04_amd64.deb ...\n", + "Unpacking libx265-192:amd64 (3.4-0york0~18.04) ...\n", + "Selecting previously unselected package libavcodec58:amd64.\n", + "Preparing to unpack .../06-libavcodec58_7%3a4.3.2-0york0~18.04_amd64.deb ...\n", + "Unpacking libavcodec58:amd64 (7:4.3.2-0york0~18.04) ...\n", + "Selecting previously unselected package librabbitmq4:amd64.\n", + "Preparing to unpack .../07-librabbitmq4_0.8.0-1ubuntu0.18.04.2_amd64.deb ...\n", + "Unpacking librabbitmq4:amd64 (0.8.0-1ubuntu0.18.04.2) ...\n", + "Selecting previously unselected package libsrt1-gnutls:amd64.\n", + "Preparing to unpack .../08-libsrt1-gnutls_1.4.1-5~18.04.york0_amd64.deb ...\n", + "Unpacking libsrt1-gnutls:amd64 (1.4.1-5~18.04.york0) ...\n", + "Selecting previously unselected package libavformat58:amd64.\n", + "Preparing to 
unpack .../09-libavformat58_7%3a4.3.2-0york0~18.04_amd64.deb ...\n", + "Unpacking libavformat58:amd64 (7:4.3.2-0york0~18.04) ...\n", + "Selecting previously unselected package libserd-0-0:amd64.\n", + "Preparing to unpack .../10-libserd-0-0_0.28.0~dfsg0-1_amd64.deb ...\n", + "Unpacking libserd-0-0:amd64 (0.28.0~dfsg0-1) ...\n", + "Selecting previously unselected package libsord-0-0:amd64.\n", + "Preparing to unpack .../11-libsord-0-0_0.16.0~dfsg0-1_amd64.deb ...\n", + "Unpacking libsord-0-0:amd64 (0.16.0~dfsg0-1) ...\n", + "Selecting previously unselected package libsratom-0-0:amd64.\n", + "Preparing to unpack .../12-libsratom-0-0_0.6.0~dfsg0-1_amd64.deb ...\n", + "Unpacking libsratom-0-0:amd64 (0.6.0~dfsg0-1) ...\n", + "Selecting previously unselected package liblilv-0-0.\n", + "Preparing to unpack .../13-liblilv-0-0_0.24.2~dfsg0-1_amd64.deb ...\n", + "Unpacking liblilv-0-0 (0.24.2~dfsg0-1) ...\n", + "Selecting previously unselected package libmysofa1:amd64.\n", + "Preparing to unpack .../14-libmysofa1_1.0~dfsg0-2~18.04.york0_amd64.deb ...\n", + "Unpacking libmysofa1:amd64 (1.0~dfsg0-2~18.04.york0) ...\n", + "Selecting previously unselected package libsphinxbase3:amd64.\n", + "Preparing to unpack .../15-libsphinxbase3_0.8+5prealpha+1-1_amd64.deb ...\n", + "Unpacking libsphinxbase3:amd64 (0.8+5prealpha+1-1) ...\n", + "Selecting previously unselected package libpocketsphinx3:amd64.\n", + "Preparing to unpack .../16-libpocketsphinx3_0.8.0+real5prealpha-1ubuntu2_amd64.deb ...\n", + "Unpacking libpocketsphinx3:amd64 (0.8.0+real5prealpha-1ubuntu2) ...\n", + "Selecting previously unselected package libpostproc55:amd64.\n", + "Preparing to unpack .../17-libpostproc55_7%3a4.3.2-0york0~18.04_amd64.deb ...\n", + "Unpacking libpostproc55:amd64 (7:4.3.2-0york0~18.04) ...\n", + "Selecting previously unselected package libswscale5:amd64.\n", + "Preparing to unpack .../18-libswscale5_7%3a4.3.2-0york0~18.04_amd64.deb ...\n", + "Unpacking libswscale5:amd64 (7:4.3.2-0york0~18.04) 
...\n", + "Selecting previously unselected package libvidstab1.1:amd64.\n", + "Preparing to unpack .../19-libvidstab1.1_1.1.0-2~18.04.york1_amd64.deb ...\n", + "Unpacking libvidstab1.1:amd64 (1.1.0-2~18.04.york1) ...\n", + "Selecting previously unselected package libzimg2.\n", + "Preparing to unpack .../20-libzimg2_3.0.1-0york0~18.04_amd64.deb ...\n", + "Unpacking libzimg2 (3.0.1-0york0~18.04) ...\n", + "Selecting previously unselected package libavfilter7:amd64.\n", + "Preparing to unpack .../21-libavfilter7_7%3a4.3.2-0york0~18.04_amd64.deb ...\n", + "Unpacking libavfilter7:amd64 (7:4.3.2-0york0~18.04) ...\n", + "Selecting previously unselected package libavdevice58:amd64.\n", + "Preparing to unpack .../22-libavdevice58_7%3a4.3.2-0york0~18.04_amd64.deb ...\n", + "Unpacking libavdevice58:amd64 (7:4.3.2-0york0~18.04) ...\n", + "Selecting previously unselected package libavresample4:amd64.\n", + "Preparing to unpack .../23-libavresample4_7%3a4.3.2-0york0~18.04_amd64.deb ...\n", + "Unpacking libavresample4:amd64 (7:4.3.2-0york0~18.04) ...\n", + "Preparing to unpack .../24-ffmpeg_7%3a4.3.2-0york0~18.04_amd64.deb ...\n", + "Unpacking ffmpeg (7:4.3.2-0york0~18.04) over (7:3.4.11-0ubuntu0.1) ...\n", + "Setting up libx264-155:amd64 (2:0.155.2917+git0a84d98-2~18.04.york0) ...\n", + "Setting up libsphinxbase3:amd64 (0.8+5prealpha+1-1) ...\n", + "Setting up libavutil56:amd64 (7:4.3.2-0york0~18.04) ...\n", + "Setting up libzimg2 (3.0.1-0york0~18.04) ...\n", + "Setting up libpocketsphinx3:amd64 (0.8.0+real5prealpha-1ubuntu2) ...\n", + "Setting up libpostproc55:amd64 (7:4.3.2-0york0~18.04) ...\n", + "Setting up libavresample4:amd64 (7:4.3.2-0york0~18.04) ...\n", + "Setting up libsrt1-gnutls:amd64 (1.4.1-5~18.04.york0) ...\n", + "Setting up libmysofa1:amd64 (1.0~dfsg0-2~18.04.york0) ...\n", + "Setting up libvidstab1.1:amd64 (1.1.0-2~18.04.york1) ...\n", + "Setting up libcodec2-0.7:amd64 (0.7-1) ...\n", + "Setting up librabbitmq4:amd64 (0.8.0-1ubuntu0.18.04.2) ...\n", + "Setting 
up libaom0:amd64 (1.0.0.errata1-3~18.04.york0) ...\n", + "Setting up libserd-0-0:amd64 (0.28.0~dfsg0-1) ...\n", + "Setting up libx265-192:amd64 (3.4-0york0~18.04) ...\n", + "Setting up libswscale5:amd64 (7:4.3.2-0york0~18.04) ...\n", + "Setting up libswresample3:amd64 (7:4.3.2-0york0~18.04) ...\n", + "Setting up libsord-0-0:amd64 (0.16.0~dfsg0-1) ...\n", + "Setting up libsratom-0-0:amd64 (0.6.0~dfsg0-1) ...\n", + "Setting up libavcodec58:amd64 (7:4.3.2-0york0~18.04) ...\n", + "Setting up liblilv-0-0 (0.24.2~dfsg0-1) ...\n", + "Setting up libavformat58:amd64 (7:4.3.2-0york0~18.04) ...\n", + "Setting up libavfilter7:amd64 (7:4.3.2-0york0~18.04) ...\n", + "Setting up libavdevice58:amd64 (7:4.3.2-0york0~18.04) ...\n", + "Setting up ffmpeg (7:4.3.2-0york0~18.04) ...\n", + "Removing obsolete conffile /etc/ffserver.conf ...\n", + "Processing triggers for libc-bin (2.27-3ubuntu1.6) ...\n", + "Processing triggers for man-db (2.8.3-2ubuntu0.1) ...\n", + "\u001b[K |████████████████████████████████| 451 kB 5.2 MB/s \n", + "\u001b[K |████████████████████████████████| 5.5 MB 58.7 MB/s \n", + "\u001b[K |████████████████████████████████| 72 kB 1.0 MB/s \n", + "\u001b[K |████████████████████████████████| 182 kB 60.6 MB/s \n", + "\u001b[K |████████████████████████████████| 115 kB 63.5 MB/s \n", + "\u001b[K |████████████████████████████████| 212 kB 62.8 MB/s \n", + "\u001b[K |████████████████████████████████| 127 kB 65.8 MB/s \n", + "\u001b[K |████████████████████████████████| 7.6 MB 48.5 MB/s \n", + "\u001b[K |████████████████████████████████| 1.4 MB 55.3 MB/s \n", + "\u001b[K |████████████████████████████████| 2.2 MB 45.2 MB/s \n", + "\u001b[?25h" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "We strongly advise you link the notebook to the [Hugging Face Hub](https://huggingface.co). 
This enables you to login and access \"gated\" datasets on the Hub.\n", + "\n", + "Linking the notebook to the Hub is straightforward - it simply requires entering your Hub authentication token when prompted. Find your Hub authentication token [here](https://huggingface.co/settings/tokens):" + ], + "metadata": { + "id": "DqO7h-G3LSE_" + } + }, + { + "cell_type": "code", + "source": [ + "from huggingface_hub import login\n", + "\n", + "login()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 304, + "referenced_widgets": [ + "36a167954b254a00a4e16d18294cf7c3", + "49d6d159e000483c9fe4c032a055f992", + "5876f3247ae24ec5b8a7838f4d5bc2be", + "68f09fd174f7462aab5510bee77fe160", + "df79fe1fe5264c09a91260932317d38f", + "b97b4f99d22d40a6b227bed5c20294ec", + "508c19158b644ab9a78773d02eee49ee", + "3bbca4ffd06e445295a0f139aa3178c4", + "5d9776d2fe254ee4aa0d69ead623e9c8", + "ac4e7402cb0a49988e7460c2a70a6cf5", + "6f6bb7c449384917a94106a5f83ea3ca", + "44028a87160a4f15ab0293a514a5668b", + "cfad88cb21394f8ea9509c05be6daef5", + "4c8951ef8dda42bf84c70e9c8c94a954", + "d4ba888c575f4a3abd3b8e93c474efe4", + "34707c463518415e8262ba32134db84a", + "cd33610fb5f749a6a52bf66b1d676a11" + ] + }, + "id": "9E7zQZ0kMtln", + "outputId": "4896b65e-b0dc-41ef-d279-ca4955e32c4f" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Token is valid.\n", + "Your token has been saved in your configured git credential helpers (store).\n", + "Your token has been saved to /root/.huggingface/token\n", + "Login successful\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Load & Prepare an Audio Dataset\n", + "\n", + "With 🤗 Datasets, we can load and prepare an audio dataset with just one line of Python code. \n", + "\n", + "In this section we'll load the [GigaSpeech](https://huggingface.co/datasets/speechcolab/gigaspeech) dataset from SpeechColab. 
Make sure you've accepted the dataset's terms of use if you haven't done so already: https://huggingface.co/datasets/speechcolab/gigaspeech" + ], + "metadata": { + "id": "qbUXUbP9Y1wU" + } + }, + { + "cell_type": "markdown", + "source": [ + "### Load the dataset\n", + "\n", + "Audio datasets are very large. This causes two issues:\n", + "1. They require a significant amount of *storage* to download\n", + "2. They take a significant amount of *time* to download and process\n", + "\n", + "The storage and time requirements present limitations to most speech researchers. However, both can be solved with 🤗 Datasets.\n", + "\n", + "With streaming mode, we can download and prepare chunks of the dataset at a time. Since the data is downloaded progressively as we iterate over the dataset, we can get started with a dataset without waiting for the entire dataset to download. Once we're done with a chunk, it's automatically deleted. This way, we only have the data when we need it, and not when we don't!\n", + "\n", + "Let's load the test split of the GigaSpeech dataset with streaming mode:" + ], + "metadata": { + "id": "uc7e4BhZamQ6" + } + }, + { + "cell_type": "code", + "source": [ + "from datasets import load_dataset\n", + "\n", + "dataset = load_dataset(\n", + " \"speechcolab/gigaspeech\", \"xs\", split=\"test\", streaming=True, use_auth_token=True\n", + ")" + ], + "metadata": { + "id": "y9XvdOSCauRA" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Great! We have the dataset ready to download the first chunk. 
Let's stream the first sample:" + ], + "metadata": { + "id": "aZ_WIQYRbZYG" + } + }, + { + "cell_type": "code", + "source": [ + "print(next(iter(dataset)))" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "88QtKUpvbcxX", + "outputId": "ca3b583e-7f45-4b59-cb72-63125f6287f0" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "{'segment_id': 'YOU1000000134_S0000042', 'speaker': 'N/A', 'text': 'ONE OF THEIR STANFORD PROFESSORS USED TO SAY WELL THE DIFFERENCE BETWEEN THE TWO OF THEM WAS THAT SERGEI WOULD JUST BURST INTO MY OFFICE WITHOUT ASKING LARRY WOULD KNOCK AND THEN BURST IN ', 'audio': {'path': 'test_chunks_0000/YOU1000000134_S0000042.wav', 'array': array([-0.00210571, -0.00164795, -0.00253296, ..., 0.00012207,\n", + " -0.00064087, -0.0012207 ]), 'sampling_rate': 16000}, 'begin_time': 223.662, 'end_time': 233.533, 'audio_id': 'YOU1000000134', 'title': 'YOU1000000134', 'url': 'N/A', 'source': 2, 'category': 10, 'original_full_path': 'audio/youtube/P0000/YOU1000000134.opus'}\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "Great! 
Now we can listen to the audio and print the text:" + ], + "metadata": { + "id": "WA3l0kQHhBRb" + } + }, + { + "cell_type": "code", + "source": [ + "import IPython.display as ipd\n", + "\n", + "sample = next(iter(dataset))\n", + "audio = sample[\"audio\"]\n", + "\n", + "print(sample[\"text\"])\n", + "ipd.Audio(data=audio[\"array\"], autoplay=True, rate=audio[\"sampling_rate\"])" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 89 + }, + "id": "958H4DCDhKbT", + "outputId": "2d5e6a9a-637d-4680-8854-dabfb2f2d84b" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "ONE OF THEIR STANFORD PROFESSORS USED TO SAY WELL THE DIFFERENCE BETWEEN THE TWO OF THEM WAS THAT SERGEI WOULD JUST BURST INTO MY OFFICE WITHOUT ASKING LARRY WOULD KNOCK AND THEN BURST IN \n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + " \n", + " " + ] + }, + "metadata": {}, + "execution_count": 68 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "Lovely! We can see that it matches the corresponding transcription as expected." + ], + "metadata": { + "id": "mcwnvAfnjTn-" + } + }, + { + "cell_type": "markdown", + "source": [ + "### Pre-Process the Dataset\n", + "\n", + "Most ASR systems expect the audio inputs to be sampled at 16kHz. We can set the sampling rate of our audio dataset through the [`cast_column`](https://huggingface.co/docs/datasets/package_reference/main_classes.html?highlight=cast_column#datasets.DatasetDict.cast_column) method. This doesn't change the dataset in-place, but resamples the dataset on the fly the first time a sample is loaded."
+ ], + "metadata": { + "id": "ca1YGACicMbe" + } + }, + { + "cell_type": "code", + "source": [ + "from datasets import Audio\n", + "\n", + "dataset = dataset.cast_column(\"audio\", Audio(sampling_rate=16000))" + ], + "metadata": { + "id": "RnhQA3lpcH30" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "We then define a 'helper function' that gets the correct transcription column from our dataset. We'll use this function to automatically get the right column names when we perform multi-dataset evaluation." + ], + "metadata": { + "id": "CSrpJNhUI9mI" + } + }, + { + "cell_type": "code", + "source": [ + "def get_text(sample):\n", + " if \"text\" in sample:\n", + " return sample[\"text\"]\n", + " elif \"sentence\" in sample:\n", + " return sample[\"sentence\"]\n", + " elif \"normalized_text\" in sample:\n", + " return sample[\"normalized_text\"]\n", + " elif \"transcript\" in sample:\n", + " return sample[\"transcript\"]\n", + " else:\n", + " raise ValueError(f\"Sample: {sample.keys()} has no transcript.\")" + ], + "metadata": { + "id": "FjBVrFve1iXF" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# Evaluate Whisper\n", + "\n", + "With the processed dataset ready, we can create an ASR evaluation pipeline using 🤗 Transformers [`pipeline`](https://huggingface.co/docs/transformers/main_classes/pipelines) method. `pipeline` will take care of the data pre-processing and the text generation. All we have to do is pass the audio inputs to pipeline and assess the returned predictions against the reference transcriptions!\n", + "\n", + "We'll evaluate the official OpenAI [Whisper tiny.en](https://huggingface.co/openai/whisper-tiny.en) checkpoint.\n", + "\n", + "P.S. 
You can use `pipeline` with any ASR model on the [Hugging Face Hub](https://huggingface.co/models?pipeline_tag=automatic-speech-recognition&sort=downloads), including different Whisper checkpoints or even Wav2Vec2. Simply switch the model identifier for the model checkpoint you wish to evaluate." + ], + "metadata": { + "id": "wd-MAGkAqiNf" + } + }, + { + "cell_type": "code", + "source": [ + "from transformers import pipeline\n", + "\n", + "whisper_asr = pipeline(\n", + " \"automatic-speech-recognition\", model=\"openai/whisper-tiny.en\", device=0\n", + ")" + ], + "metadata": { + "id": "1wk3yAmsYwoF" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "whisper_asr.model.config.suppress_tokens.remove(6)\n", + "whisper_asr.model.config.suppress_tokens.remove(12)" + ], + "metadata": { + "id": "VQ9FgkihyHCe" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "### Load the Word Error Rate metric\n", + "\n", + "We'll assess our system using the [Word Error Rate (WER)](https://huggingface.co/spaces/evaluate-metric/wer) metric, the 'de-facto' metric for assessing ASR systems. We'll load the WER metric from the 🤗 Evaluate library:" + ], + "metadata": { + "id": "9Qari1Lu0Dxl" + } + }, + { + "cell_type": "code", + "source": [ + "import evaluate\n", + "\n", + "wer_metric = evaluate.load(\"wer\")" + ], + "metadata": { + "id": "XXEe44U1z7Te" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Bonus: You can also try other evaluation methods like the [Character Error Rate (CER)](https://huggingface.co/spaces/evaluate-metric/cer). 
For the CER, update the above statement to `evaluate.load(\"cer\")`" + ], + "metadata": { + "id": "ia6-sRSClqqh" + } + }, + { + "cell_type": "markdown", + "source": [ + "### Normalisation\n", + "\n", + "The [Whisper paper](https://cdn.openai.com/papers/whisper.pdf) demonstrates the drastic effect that normalising the text outputs has on WER. The normalisation step is important as it removes errors unrelated to the speech recognition task, such as casing and punctuation. It also makes the formatting consistent between references and predictions by converting spelled out numbers to symbolic form (e.g. \"two\" -> \"2\") and British spellings to American (e.g. \"grey\" -> \"gray\").\n", + "\n", + "We first write a function to normalise the reference of a single sample according to the Whisper English text normaliser:" + ], + "metadata": { + "id": "63i2XhSSr3xk" + } + }, + { + "cell_type": "code", + "source": [ + "whisper_norm = whisper_asr.tokenizer._normalize\n", + "\n", + "def normalise(batch):\n", + " batch[\"norm_text\"] = whisper_norm(get_text(batch))\n", + " return batch" + ], + "metadata": { + "id": "TZ83oC4ZihEq" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "We then use 🤗 Datasets' [`map`](https://huggingface.co/docs/datasets/v2.6.1/en/process#map) method to apply our normalising function across the entire dataset:" + ], + "metadata": { + "id": "sCazNy16qHh1" + } + }, + { + "cell_type": "code", + "source": [ + "dataset = dataset.map(normalise)" + ], + "metadata": { + "id": "rJ9qZeI2owgG" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "We need to remove any empty reference transcriptions from our dataset, as these will give a divide by 0 error in the WER calculation.\n", + "\n", + "We write a function that indicates which samples to keep, and which to discard. 
This function, `is_target_text_in_range`, returns a boolean: reference transcriptions that are not empty return True, and those that are empty return False:" + ], + "metadata": { + "id": "cx4jbs1Epv4k" + } + }, + { + "cell_type": "code", + "source": [ + "def is_target_text_in_range(ref):\n", + " if ref.strip() == \"ignore time segment in scoring\":\n", + " return False\n", + " else:\n", + " return ref.strip() != \"\"" + ], + "metadata": { + "id": "p_IohuslpiTi" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "We can apply this filtering function to all of our examples using 🤗 Datasets' [`filter`](https://huggingface.co/docs/datasets/process#select-and-filter) \n", + "method, keeping all references that are not empty (True) and discarding those that are (False):" + ], + "metadata": { + "id": "nzagKJ5PqfNY" + } + }, + { + "cell_type": "code", + "source": [ + "dataset = dataset.filter(is_target_text_in_range, input_columns=[\"norm_text\"])" + ], + "metadata": { + "id": "FMI7bq1qpipo" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Single Dataset Evaluation\n", + "\n", + "Since we're in streaming mode, we won't run inference in place, but rather signal to 🤗 Datasets to perform inference _on the fly_ the first time the dataset is iterated.\n", + "\n", + "We first define a generator that iterates over the dataset and yields the audio samples and reference text:" + ], + "metadata": { + "id": "xoBfogS4u8tz" + } + }, + { + "cell_type": "code", + "source": [ + "def data(dataset):\n", + " for i, item in enumerate(dataset):\n", + " yield {**item[\"audio\"], \"reference\": item[\"norm_text\"]}" + ], + "metadata": { + "id": "H2tx5TUSabCz" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "We then set our batch size. We also restrict the number of samples for evaluation to 128 for the purpose of this blog. 
If you want to run on the full dataset to get the official results, comment out or remove the `dataset = dataset.take(128)` line from the following code cell!" + ], + "metadata": { + "id": "BPYdZQTSLGcK" + } + }, + { + "cell_type": "code", + "source": [ + "# set the batch size in accordance to your device\n", + "BATCH_SIZE = 16\n", + "\n", + "# only for debugging, restricts the number of rows to numeric value in brackets\n", + "dataset = dataset.take(128)" + ], + "metadata": { + "id": "H9tf38XkLLD9" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "We pass the generator to the pipeline to run inference:" + ], + "metadata": { + "id": "ovXwWPJovDw-" + } + }, + { + "cell_type": "code", + "source": [ + "predictions = []\n", + "references = []\n", + "\n", + "# run streamed inference\n", + "for out in whisper_asr(data(dataset), batch_size=BATCH_SIZE):\n", + " predictions.append(whisper_norm(out[\"text\"]))\n", + " references.append(out[\"reference\"][0])" + ], + "metadata": { + "id": "I6rjGMqOvA24" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "We can now pass on our list of references and predictions to the WER evaluate function to compute the WER:" + ], + "metadata": { + "id": "tU50pcw-drht" + } + }, + { + "cell_type": "code", + "source": [ + "wer = wer_metric.compute(references=references, predictions=predictions)\n", + "wer = round(100 * wer, 2)\n", + "\n", + "print(\"WER:\", wer)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "XgzMsEWKdA7u", + "outputId": "a31e39e4-9edd-4e0f-fd4d-0c5c356c7649" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "WER: 11.67\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "Pretty good! If we run the Whisper tiny.en model on the full test set we can expect to achieve a WER of 14.07%. 
State-of-the-art models achieve 10.5% WER on the same test set (_c.f._ [GigaSpeech Leaderboard](https://github.com/SpeechColab/GigaSpeech#leaderboard)).\n", + "\n", + "We could certainly improve our zero-shot result with fine-tuning. The [ESB paper](https://arxiv.org/abs/2210.13352) achieves 10.5% WER fine-tuning the [medium.en](https://huggingface.co/openai/whisper-medium.en) checkpoint on GigaSpeech, equalling state-of-the-art. See the blog post [\"Fine-Tune Whisper\"](https://huggingface.co/blog/fine-tune-whisper) for a guide to fine-tuning Whisper with 🤗 Transformers." + ], + "metadata": { + "id": "bZDaCnhH7VS_" + } + }, + { + "cell_type": "markdown", + "source": [ + "# Evaluation on 8 Datasets\n", + "\n", + "Compared to evaluating on a single dataset, multi-dataset evaluation gives a better metric for the generalisation abilities of a speech recognition system (_c.f._ [End-to-end Speech Benchmark (ESB)](https://arxiv.org/abs/2210.13352)). An ASR model should not only work well on one set of audio conditions (e.g. narrated audiobooks), but should be able to handle the full spectrum of background noise, speakers, accents and domains." 
+ ], + "metadata": { + "id": "Qq53b1A2tRz2" + } + }, + { + "cell_type": "markdown", + "source": [ + "First, we'll load the nine test sets from the ESB benchmark in streaming mode:" + ], + "metadata": { + "id": "dSXV-XPXM2wF" + } + }, + { + "cell_type": "code", + "source": [ + "librispeech_clean = load_dataset(\"librispeech_asr\", \"all\", split=\"test.clean\", streaming=True)\n", + "librispeech_other = load_dataset(\"librispeech_asr\", \"all\", split=\"test.other\", streaming=True)\n", + "\n", + "common_voice = load_dataset(\"mozilla-foundation/common_voice_11_0\", \"en\", revision=\"streaming\", split=\"test\", streaming=True, use_auth_token=True)\n", + "\n", + "voxpopuli = load_dataset(\"facebook/voxpopuli\", \"en\", split=\"test\", streaming=True)\n", + "\n", + "tedlium = load_dataset(\"LIUM/tedlium\", \"release3\", split=\"test\", streaming=True)\n", + "\n", + "gigaspeech = load_dataset(\"speechcolab/gigaspeech\", \"xs\", split=\"test\", streaming=True, use_auth_token=True)\n", + "\n", + "spgispeech = load_dataset(\"kensho/spgispeech\", \"S\", split=\"test\", streaming=True, use_auth_token=True)\n", + "\n", + "earnings22 = load_dataset(\"anton-l/earnings22_baseline_5_gram\", split=\"test\", streaming=True)\n", + "\n", + "ami = load_dataset(\"edinburghcstr/ami\", \"ihm\", split=\"test\", streaming=True)" + ], + "metadata": { + "id": "pkyYctmItQTF", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "f8e8bec0-8001-45ec-a3f1-7d6a93cd6e53" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "WARNING:datasets.builder:No config specified, defaulting to: earnings22_baseline_5_gram/chunked\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "Next, we create a dictionary of dataset names and dataset objects. This gives us an easy lookup table in our evaluation loop." 
+ ], + "metadata": { + "id": "109nsEXFF-sd" + } + }, + { + "cell_type": "code", + "source": [ + "esb_datasets = {\n", + " \"LibriSpeech Clean\": librispeech_clean,\n", + " \"LibriSpeech Other\": librispeech_other,\n", + " \"Common Voice\": common_voice,\n", + " \"VoxPopuli\": voxpopuli,\n", + " \"TEDLIUM\": tedlium,\n", + " \"GigaSpeech\": gigaspeech,\n", + " \"SPGISpeech\": spgispeech,\n", + " \"Earnings-22\": earnings22,\n", + " \"AMI\": ami\n", + "}" + ], + "metadata": { + "id": "BmcVXn4amR69" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Finally, we loop over the ESB datasets and compute the individual WER scores, combining the single-dataset evaluation steps into one loop. We store the WER results in a separate list to display later.\n", + "\n", + "Again, we only evaluate on the first 128 samples for each dataset. If you want to evaluate on the entire dataset, comment out or remove this line!" + ], + "metadata": { + "id": "rmaJ0TDPE0wg" + } + }, + { + "cell_type": "code", + "source": [ + "wer_results = []\n", + "\n", + "# loop over all the datasets in the ESB benchmark\n", + "for dataset_name, dataset in esb_datasets.items():\n", + " # only for debugging, restricts the number of rows to numeric value in brackets\n", + " dataset = dataset.take(128)\n", + "\n", + " # resample to 16kHz\n", + " dataset = dataset.cast_column(\"audio\", Audio(sampling_rate=16000))\n", + "\n", + " # normalise references\n", + " dataset = dataset.map(normalise)\n", + "\n", + " # remove any empty references\n", + " dataset = dataset.filter(is_target_text_in_range, input_columns=[\"norm_text\"])\n", + "\n", + " # placeholders for predictions and references\n", + " predictions = []\n", + " references = []\n", + "\n", + " # run streamed inference\n", + " for out in whisper_asr(data(dataset), batch_size=BATCH_SIZE):\n", + " predictions.append(whisper_norm(out[\"text\"]))\n", + " references.append(out[\"reference\"][0])\n", + "\n", + " 
# compute the WER\n", + " wer = wer_metric.compute(references=references, predictions=predictions)\n", + " wer = round(100 * wer, 2)\n", + "\n", + " wer_results.append(wer)" + ], + "metadata": { + "id": "8qLWCi_MoVrv", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "f8950beb-bcf7-4de9-c245-dd2368aa28a8" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Reading metadata...: 16354it [00:00, 25204.56it/s]\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "Alright! In one code cell we managed to evaluate over nine different test sets! Let's print the results in tabular form:" + ], + "metadata": { + "id": "flGyTQVUE-qg" + } + }, + { + "cell_type": "code", + "source": [ + "import pandas as pd\n", + "\n", + "df = pd.DataFrame({\"Dataset\": esb_datasets.keys(), \"WER\": wer_results})\n", + "df" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 328 + }, + "id": "2pw9sz3Xujdj", + "outputId": "21fadc58-c366-4bca-f8b5-21ae96b81766" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " Dataset WER\n", + "0 LibriSpeech Clean 4.73\n", + "1 LibriSpeech Other 16.17\n", + "2 Common Voice 63.27\n", + "3 VoxPopuli 10.22\n", + "4 TEDLIUM 5.16\n", + "5 GigaSpeech 10.62\n", + "6 SPGISpeech 6.67\n", + "7 Earnings-22 48.45\n", + "8 AMI 24.93" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
DatasetWER
0LibriSpeech Clean4.73
1LibriSpeech Other16.17
2Common Voice63.27
3VoxPopuli10.22
4TEDLIUM5.16
5GigaSpeech10.62
6SPGISpeech6.67
7Earnings-2248.45
8AMI24.93
\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " + ] + }, + "metadata": {}, + "execution_count": 46 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "We ran the above evaluation script for the Whisper tiny.en and small.en models on the full datasets from the ESB benchmark. The results of our run are as follows:\n", + "\n", + "| **Dataset name** | **Whisper tiny.en** | **Whisper small.en** |\n", + "|-------------------|---------------------|----------------------|\n", + "| LibriSpeech Clean | 5.66 | 3.05 |\n", + "| LibriSpeech Other | 15.38 | 7.53 |\n", + "| Common Voice | 31.17 | 15.20 |\n", + "| VoxPopuli | 12.58 | 8.45 |\n", + "| TEDLIUM | 14.28 | 12.21 |\n", + "| GigaSpeech | 14.07 | 11.36 |\n", + "| SPGISpeech | 5.82 | 3.63 |\n", + "| Earnings-22 | 13.79 | 16.40 |\n", + "| AMI | 24.68 | 17.88 |" + ], + "metadata": { + "id": "DQrte8nKPeZJ" + } + }, + { + "cell_type": "markdown", + "source": [ + "## Your chance now! \n", + "\n", + "Go ahead and repeat the above loop with a different ASR checkpoint and your choice of datasets. How does your model compare to Whisper?" + ], + "metadata": { + "id": "LiS-PA3vFJ2m" + } + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "aHc4o6crzm_K" + }, + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file