k2-fsa · kinanmartin · Apr 9, 2025 · Apr 11, 2025 · Apr 11, 2025 · Apr 13, 2025
diff --git a/egs/mls_english/ASR/README.md b/egs/mls_english/ASR/README.md
@@ -0,0 +1,19 @@
+# Introduction
+
+
+
+**Multilingual LibriSpeech (MLS)** is a large multilingual corpus suitable for speech research. The dataset is derived from read audiobooks from LibriVox and consists of 8 languages: English, German, Dutch, Spanish, French, Italian, Portuguese, and Polish. It includes about 44.5K hours of English and a total of about 6K hours for the other languages. This icefall training recipe was created for the restructured version of the English split of the dataset, which is available on Hugging Face (see the links below).
+
+
+The dataset is available on Hugging Face. For more details, please visit:
+
+- Dataset: https://huggingface.co/datasets/parler-tts/mls_eng
+- Original MLS dataset link: https://www.openslr.org/94
+
+
+## On-the-fly feature computation
+
+This recipe currently only supports on-the-fly filter bank (fbank) feature computation, since `lhotse` manifests and fbank features are not pre-computed in this recipe. In principle, this means that the dataset can be streamed from Hugging Face, but we have not tested this yet. We may add a version that supports pre-computing features to better match existing recipes.
+<br>
+
+[./RESULTS.md](./RESULTS.md) contains the latest results. This MLS English recipe was primarily developed for use in the `multi_ja_en` Japanese-English bilingual pipeline, which is based on MLS English and ReazonSpeech.
diff --git a/egs/mls_english/ASR/RESULTS.md b/egs/mls_english/ASR/RESULTS.md
@@ -0,0 +1,41 @@
+## Results
+
+### MLS-English training results (Non-streaming) on zipformer model
+
+#### Non-streaming
+
+**WER on Test Set (Epoch 20)**
+
+| Type          | Greedy | Beam search |
+|---------------|--------|-------------|
+| Non-streaming | 6.65   | 6.57        |
+
+
+The training command:
+
+```
+./zipformer/train.py \
+--world-size 8 \
+--num-epochs 20 \
+--start-epoch 9 \
+--use-fp16 1 \
+--exp-dir zipformer/exp \
+--lang-dir data/lang/bpe_2000/
+```
+
+The decoding command:
+
+```
+./zipformer/decode.py \
+    --epoch 20 \
+    --exp-dir ./zipformer/exp \
+    --lang-dir data/lang/bpe_2000/ \
+    --decoding-method greedy_search
+```
+
+
+The pre-trained model is available here:
+[reazon-research/mls-english](https://huggingface.co/reazon-research/mls-english)
+
+
+Please note that this recipe was developed primarily to provide the English data for the bilingual Japanese-English recipe `multi_ja_en`, which uses ReazonSpeech and MLS English.
diff --git a/...n/ASR/local/compute_fbank_reazonspeech.py → ...sh/ASR/local/compute_fbank_mls_english.py b/...n/ASR/local/compute_fbank_reazonspeech.py → ...sh/ASR/local/compute_fbank_mls_english.py
@@ -33,6 +33,7 @@
     RecordingSet,
     SupervisionSet,
 )
+from lhotse.utils import is_module_available
 
 # fmt: on
 
@@ -48,55 +49,54 @@
 
 
 def make_cutset_blueprints(
-    manifest_dir: Path,
+    mls_eng_hf_dataset_path: str = "parler-tts/mls_eng",
 ) -> List[Tuple[str, CutSet]]:
     cut_sets = []
 
+    if not is_module_available("datasets"):
+        raise ImportError(
+            "To process the MLS English HF corpus, please install optional dependency: pip install datasets"
+        )
+
+    from datasets import load_dataset
+
+    print(f"{mls_eng_hf_dataset_path=}")
+    dataset = load_dataset(str(mls_eng_hf_dataset_path))
+
     # Create test dataset
     logging.info("Creating test cuts.")
     cut_sets.append(
         (
             "test",
-            CutSet.from_manifests(
-                recordings=RecordingSet.from_file(
-                    manifest_dir / "reazonspeech_recordings_test.jsonl.gz"
-                ),
-                supervisions=SupervisionSet.from_file(
-                    manifest_dir / "reazonspeech_supervisions_test.jsonl.gz"
-                ),
-            ),
+            CutSet.from_huggingface_dataset(dataset["test"], text_key="transcript"),
         )
     )
 
     # Create dev dataset
     logging.info("Creating dev cuts.")
-    cut_sets.append(
-        (
-            "dev",
-            CutSet.from_manifests(
-                recordings=RecordingSet.from_file(
-                    manifest_dir / "reazonspeech_recordings_dev.jsonl.gz"
-                ),
-                supervisions=SupervisionSet.from_file(
-                    manifest_dir / "reazonspeech_supervisions_dev.jsonl.gz"
+    try:
+        cut_sets.append(
+            (
+                "dev",
+                CutSet.from_huggingface_dataset(dataset["dev"], text_key="transcript"),
+            )
+        )
+    except KeyError:
+        cut_sets.append(
+            (
+                "dev",
+                CutSet.from_huggingface_dataset(
+                    dataset["validation"], text_key="transcript"
                 ),
-            ),
+            )
         )
-    )
 
     # Create train dataset
     logging.info("Creating train cuts.")
     cut_sets.append(
         (
             "train",
-            CutSet.from_manifests(
-                recordings=RecordingSet.from_file(
-                    manifest_dir / "reazonspeech_recordings_train.jsonl.gz"
-                ),
-                supervisions=SupervisionSet.from_file(
-                    manifest_dir / "reazonspeech_supervisions_train.jsonl.gz"
-                ),
-            ),
+            CutSet.from_huggingface_dataset(dataset["train"], text_key="transcript"),
         )
     )
     return cut_sets
@@ -107,6 +107,8 @@ def get_args():
         formatter_class=argparse.ArgumentDefaultsHelpFormatter,
     )
     parser.add_argument("-m", "--manifest-dir", type=Path)
+    parser.add_argument("-a", "--audio-dir", type=Path)
+    parser.add_argument("-d", "--dl-dir", type=Path)
     return parser.parse_args()
 
 
@@ -120,26 +122,33 @@ def main():
 
     logging.basicConfig(format=formatter, level=logging.INFO)
 
-    if (args.manifest_dir / ".reazonspeech-fbank.done").exists():
+    if (args.manifest_dir / ".mls-eng-fbank.done").exists():
         logging.info(
-            "Previous fbank computed for ReazonSpeech found. "
-            f"Delete {args.manifest_dir / '.reazonspeech-fbank.done'} to allow recomputing fbank."
+            "Previous fbank computed for MLS English found. "
+            f"Delete {args.manifest_dir / '.mls-eng-fbank.done'} to allow recomputing fbank."
         )
         return
     else:
-        cut_sets = make_cutset_blueprints(args.manifest_dir)
+        mls_eng_hf_dataset_path = args.dl_dir  # "/root/datasets/parler-tts--mls_eng"
+        cut_sets = make_cutset_blueprints(mls_eng_hf_dataset_path)
         for part, cut_set in cut_sets:
             logging.info(f"Processing {part}")
+            cut_set = cut_set.save_audios(
+                num_jobs=num_jobs,
+                storage_path=(args.audio_dir / part).as_posix(),
+            )  # makes new cutset that loads audio from paths to actual audio files
+
             cut_set = cut_set.compute_and_store_features(
                 extractor=extractor,
                 num_jobs=num_jobs,
                 storage_path=(args.manifest_dir / f"feats_{part}").as_posix(),
                 storage_type=LilcomChunkyWriter,
             )
-            cut_set.to_file(args.manifest_dir / f"reazonspeech_cuts_{part}.jsonl.gz")
 
-        logging.info("All fbank computed for ReazonSpeech.")
-        (args.manifest_dir / ".reazonspeech-fbank.done").touch()
+            cut_set.to_file(args.manifest_dir / f"mls_eng_cuts_{part}.jsonl.gz")
+
+        logging.info("All fbank computed for MLS English.")
+        (args.manifest_dir / ".mls-eng-fbank.done").touch()
 
 
 if __name__ == "__main__":

diff --git a/egs/mls_english/ASR/local/compute_fbank_musan.py b/egs/mls_english/ASR/local/compute_fbank_musan.py
@@ -0,0 +1 @@
+../../../librispeech/ASR/local/compute_fbank_musan.py
diff --git a/.../ASR/local/display_manifest_statistics.py → .../ASR/local/display_manifest_statistics.py b/.../ASR/local/display_manifest_statistics.py → .../ASR/local/display_manifest_statistics.py
@@ -45,8 +45,8 @@ def get_parser():
 def main():
     args = get_parser()
 
-    for part in ["train", "dev"]:
-        path = args.manifest_dir / f"reazonspeech_cuts_{part}.jsonl.gz"
+    for part in ["dev", "test", "train"]:
+        path = args.manifest_dir / f"mls_eng_cuts_{part}.jsonl.gz"
         cuts: CutSet = load_manifest(path)
 
         print("\n---------------------------------\n")

diff --git a/egs/mls_english/ASR/local/train_bpe_model.py b/egs/mls_english/ASR/local/train_bpe_model.py
@@ -0,0 +1,114 @@
+#!/usr/bin/env python3
+# Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang)
+# Copyright    2024  Xiaomi Corp.        (authors: Xiaoyu Yang)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# You can install sentencepiece via:
+#
+#  pip install sentencepiece
+#
+# Due to an issue reported in
+# https://github.com/google/sentencepiece/pull/642#issuecomment-857972030
+#
+# Please install a version >=0.1.96
+
+import argparse
+import shutil
+from pathlib import Path
+
+import sentencepiece as spm
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--lang-dir",
+        type=str,
+        help="""Input and output directory.
+        The generated bpe.model is saved to this directory.
+        """,
+    )
+
+    parser.add_argument(
+        "--byte-fallback",
+        action="store_true",
+        help="""Whether to enable byte_fallback when training bpe.""",
+    )
+
+    parser.add_argument(
+        "--character-coverage",
+        type=float,
+        default=1.0,
+        help="Character coverage in vocabulary.",
+    )
+
+    parser.add_argument(
+        "--transcript",
+        type=str,
+        help="Training transcript.",
+    )
+
+    parser.add_argument(
+        "--vocab-size",
+        type=int,
+        help="Vocabulary size for BPE training",
+    )
+
+    return parser.parse_args()
+
+
+def main():
+    args = get_args()
+    vocab_size = args.vocab_size
+    lang_dir = Path(args.lang_dir)
+
+    model_type = "bpe"
+
+    model_prefix = f"{lang_dir}/{model_type}_{vocab_size}"
+    train_text = args.transcript
+    input_sentence_size = 100000000
+
+    user_defined_symbols = ["<blk>", "<sos/eos>"]
+    unk_id = len(user_defined_symbols)
+    # Note: unk_id is fixed to 2.
+    # If you change it, you should also change other
+    # places that are using it.
+
+    model_file = Path(model_prefix + ".model")
+    if not model_file.is_file():
+        spm.SentencePieceTrainer.train(
+            input=train_text,
+            vocab_size=vocab_size,
+            model_type=model_type,
+            model_prefix=model_prefix,
+            input_sentence_size=input_sentence_size,
+            character_coverage=args.character_coverage,
+            user_defined_symbols=user_defined_symbols,
+            byte_fallback=args.byte_fallback,
+            unk_id=unk_id,
+            bos_id=-1,
+            eos_id=-1,
+        )
+    else:
+        print(f"{model_file} exists - skipping")
+        return
+
+    shutil.copyfile(model_file, f"{lang_dir}/bpe.model")
+
+
-    model_file = Path(model_prefix + ".model")
-    if not model_file.is_file():
-        spm.SentencePieceTrainer.train(
-            input=train_text,
-            vocab_size=vocab_size,
-            model_type=model_type,
-            model_prefix=model_prefix,
-            input_sentence_size=input_sentence_size,
-            character_coverage=args.character_coverage,
-            user_defined_symbols=user_defined_symbols,
-            byte_fallback=args.byte_fallback,
-            unk_id=unk_id,
-            bos_id=-1,
-            eos_id=-1,
-        )
-    else:
-        print(f"{model_file} exists - skipping")
-        return
-
-    shutil.copyfile(model_file, f"{lang_dir}/bpe.model")
+    model_file = Path(model_prefix + ".model")
+    if not model_file.is_file():
+        spm.SentencePieceTrainer.train(
+            input=train_text,
+            vocab_size=vocab_size,
+            model_type=model_type,
+            model_prefix=model_prefix,
+            input_sentence_size=input_sentence_size,
+            character_coverage=args.character_coverage,
+            user_defined_symbols=user_defined_symbols,
+            byte_fallback=args.byte_fallback,
+            unk_id=unk_id,
+            bos_id=-1,
+            eos_id=-1,
+        )
+    else:
+        print(f"{model_file} exists - skipping training")
+    # Ensure canonical symlink/copy is updated
+    shutil.copyfile(model_file, f"{lang_dir}/bpe.model")
-    model_file = Path(model_prefix + ".model")
-    if not model_file.is_file():
-        spm.SentencePieceTrainer.train(
-            input=train_text,
-            vocab_size=vocab_size,
-            model_type=model_type,
-            model_prefix=model_prefix,
-            input_sentence_size=input_sentence_size,
-            character_coverage=args.character_coverage,
-            user_defined_symbols=user_defined_symbols,
-            byte_fallback=args.byte_fallback,
-            unk_id=unk_id,
-            bos_id=-1,
-            eos_id=-1,
-        )
-    else:
-        print(f"{model_file} exists - skipping")
-        return
-
-    shutil.copyfile(model_file, f"{lang_dir}/bpe.model")
+    model_file = Path(model_prefix + ".model")
+    if not model_file.is_file():
+        spm.SentencePieceTrainer.train(
+            input=train_text,
+            vocab_size=vocab_size,
+            model_type=model_type,
+            model_prefix=model_prefix,
+            input_sentence_size=input_sentence_size,
+            character_coverage=args.character_coverage,
+            user_defined_symbols=user_defined_symbols,
+            byte_fallback=args.byte_fallback,
+            unk_id=unk_id,
+            bos_id=-1,
+            eos_id=-1,
+        )
+    else:
+        print(f"{model_file} exists - skipping training")
+    # Ensure canonical symlink/copy is updated
+    shutil.copyfile(model_file, f"{lang_dir}/bpe.model")
+if __name__ == "__main__":
+    main()
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		../../../librispeech/ASR/local/compute_fbank_musan.py