first commit
commit 9dbcf5c730

97  README.md  Normal file
@@ -0,0 +1,97 @@
# Preface

Environment:

- Anaconda 3
- Python 3.8
- PyTorch 1.13.1
- Windows 10 or Ubuntu 18.04

# Features

1. Supported models: EcapaTdnn, TDNN, Res2Net, ResNetSE
2. Supported pooling layers: AttentiveStatsPool (ASP), SelfAttentivePooling (SAP), TemporalStatisticsPooling (TSP), TemporalAveragePooling (TAP)
3. Supported loss functions: AAMLoss, AMLoss, ARMLoss, CELoss
4. Supported preprocessing methods: MelSpectrogram, Spectrogram, MFCC

## Environment Setup

- First install the GPU version of PyTorch; if it is already installed, skip this step.

```shell
conda install pytorch==1.13.1 torchvision==0.14.1 torchaudio==0.13.1 pytorch-cuda=11.6 -c pytorch -c nvidia
```

- Install the mvector library.

Install with pip using the following command:

```shell
python -m pip install mvector -U -i https://pypi.tuna.tsinghua.edu.cn/simple
```
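
Before moving on, it is worth confirming that the GPU build is actually active. A minimal sanity check, assuming the installs above succeeded:

```python
# Sanity check: verify PyTorch sees the GPU and mvector is importable.
import torch
import mvector

print(torch.__version__)            # expect 1.13.1
print(torch.cuda.is_available())    # expect True on a CUDA machine
print(mvector.__version__)          # expect 0.3.9 for this repo
```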

# Usage Guide

## 1. Environment Setup

### 1.1 Install Dependencies

```shell
# Create a conda environment (optional)
conda create -n voiceprint python=3.8
conda activate voiceprint

# Install project dependencies
pip install -r requirements.txt
```

### 1.2 Prepare Audio Data

- Place enrollment audio in the `audio_db/` directory (16 kHz mono WAV format recommended)
- Place test audio in the `test_audio/` directory (a conversion sketch follows below)
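
If your recordings are not already 16 kHz mono WAV, a small helper can normalize them first. This is a minimal sketch built on soundfile and resampy (both already dependencies of this project); the file names are placeholders:

```python
# Convert an arbitrary audio file to 16 kHz mono WAV for enrollment.
import numpy as np
import resampy
import soundfile

def to_16k_mono(src_path, dst_path, target_sr=16000):
    samples, sr = soundfile.read(src_path, dtype='float32')
    if samples.ndim > 1:                 # average channels down to mono
        samples = np.mean(samples, axis=1)
    if sr != target_sr:                  # resample to the target rate
        samples = resampy.resample(samples, sr, target_sr)
    soundfile.write(dst_path, samples, target_sr, format='WAV', subtype='PCM_16')

to_16k_mono('raw/user1.flac', 'audio_db/user1.wav')
```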

## 2. Core Features

### 2.1 Train a Voiceprint Model

```shell
python train.py \
    --config_path configs/ecapa_tdnn.yml \
    --augmentation_config configs/augmentation.json \
    --save_dir models/
```

### 2.2 Enroll a Voiceprint

```python
from mvector import MVector

mvector = MVector()
mvector.register_user(name="user1", audio_path="audio_db/user1.wav")
```

### 2.3 Real-Time Voiceprint Recognition

```shell
python infer_recognition.py \
    --model_path models/ecapa_tdnn.pth \
    --audio_path test_audio/unknown.wav
```

### 2.4 Voiceprint Comparison

```shell
python infer_contrast.py \
    --audio1 audio_db/user1.wav \
    --audio2 test_audio/sample.wav \
    --threshold 0.7
```

## 3. Noise-Reduction Preprocessing

```python
from Reduction_Noise import NoiseReducer

reducer = NoiseReducer("Reduction_Noise/pytorch_model.bin")
clean_audio = reducer.process("noisy_audio.wav")
```

## 4. Model Evaluation

```shell
python eval.py \
    --model_path models/ecapa_tdnn.pth \
    --test_csv eval_samples.csv \
    --batch_size 32
```
1  Reduction_Noise  Submodule
@@ -0,0 +1 @@
Subproject commit cfc4b6a2433a4a6f0d2d1cda5f944d677e072ef4
BIN  audio_db/output1.wav  Normal file  (binary file not shown)
BIN  audio_db/output2.wav  Normal file  (binary file not shown)
BIN  audio_db/test.wav  Normal file  (binary file not shown)
BIN  audio_db/test_Re.wav  Normal file  (binary file not shown)
72  configs/augmentation.json  Normal file
@@ -0,0 +1,72 @@

[
    {
        "type": "noise",
        "aug_type": "audio",
        "params": {
            "min_snr_dB": 10,
            "max_snr_dB": 50,
            "repetition": 2,
            "noise_dir": "dataset/noise/"
        },
        "prob": 0.0
    },
    {
        "type": "resample",
        "aug_type": "audio",
        "params": {
            "new_sample_rate": [8000, 32000, 44100, 48000]
        },
        "prob": 0.0
    },
    {
        "type": "speed",
        "aug_type": "audio",
        "params": {
            "min_speed_rate": 0.9,
            "max_speed_rate": 1.1,
            "num_rates": 3
        },
        "prob": 0.5
    },
    {
        "type": "shift",
        "aug_type": "audio",
        "params": {
            "min_shift_ms": -5,
            "max_shift_ms": 5
        },
        "prob": 0.0
    },
    {
        "type": "volume",
        "aug_type": "audio",
        "params": {
            "min_gain_dBFS": -15,
            "max_gain_dBFS": 15
        },
        "prob": 0.5
    },
    {
        "type": "specaug",
        "aug_type": "feature",
        "params": {
            "inplace": true,
            "max_time_warp": 5,
            "max_t_ratio": 0.01,
            "n_freq_masks": 2,
            "max_f_ratio": 0.05,
            "n_time_masks": 2,
            "replace_with_zero": true
        },
        "prob": 0.5
    },
    {
        "type": "specsub",
        "aug_type": "feature",
        "params": {
            "max_t": 10,
            "num_t_sub": 2
        },
        "prob": 0.0
    }
]
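
All seven augmentors are declared here, but only those with a non-zero `prob` will ever fire at training time. A minimal sketch for checking which ones are active:

```python
# List which augmentations are active (prob > 0) in the config.
import json

with open('configs/augmentation.json', 'r', encoding='utf-8') as f:
    configs = json.load(f)

for conf in configs:
    state = 'active' if conf['prob'] > 0 else 'disabled'
    print(f"{conf['type']:<10} ({conf['aug_type']}): prob={conf['prob']} -> {state}")
```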
54  configs/ecapa_tdnn.yml  Normal file
@@ -0,0 +1,54 @@

# Dataset parameters
dataset_conf:
  # Training batch size
  batch_size: 256
  # Number of speakers, i.e. the classification size
  num_speakers: 3242
  # Number of data-loading workers
  num_workers: 12
  # Minimum audio duration; shorter clips are filtered out
  min_duration: 0.5
  # Maximum audio duration; longer clips are cropped
  max_duration: 6
  # Whether to trim silent segments
  do_vad: False
  # Audio sample rate
  sample_rate: 16000
  # Whether to apply volume normalization to the audio
  use_dB_normalization: False
  # Target volume in dB for normalization
  target_dB: -20
  # Path to the training data list
  train_list: 'dataset/train_list.txt'
  # Path to the test data list
  test_list: 'dataset/test_list.txt'
  # Label list
  label_list_path: 'dataset/label_list.txt'

# Preprocessing parameters
preprocess_conf:
  # Audio preprocessing method; supports MelSpectrogram, Spectrogram, MFCC, Fbank
  feature_method: 'Fbank'

feature_conf:
  sample_frequency: 16000
  num_mel_bins: 80

optimizer_conf:
  # Optimizer; supports Adam, AdamW, SGD
  optimizer: 'Adam'
  # Initial learning rate
  learning_rate: 0.001
  weight_decay: 1e-6

model_conf:
  embd_dim: 192
  channels: 512

train_conf:
  # Number of training epochs
  max_epoch: 30
  log_interval: 100

# Model to use
use_model: 'ecapa_tdnn'
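
To poke at these values programmatically, a minimal sketch (assuming PyYAML is available, which parsing this file requires anyway):

```python
# Load the training config and inspect a couple of fields.
import yaml

with open('configs/ecapa_tdnn.yml', 'r', encoding='utf-8') as f:
    configs = yaml.safe_load(f)

print(configs['use_model'])                    # 'ecapa_tdnn'
print(configs['dataset_conf']['batch_size'])   # 256
```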
104  create_data.py  Normal file
@@ -0,0 +1,104 @@

import json
import os
import time
from multiprocessing import Pool, cpu_count
from datetime import timedelta

from pydub import AudioSegment


# Build the data list
def get_data_list(infodata_path, zhvoice_path):
    print('Reading the annotation file...')
    with open(infodata_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    data = []
    speakers = []
    speakers_dict = {}
    for line in lines:
        line = json.loads(line.replace('\n', ''))
        duration_ms = line['duration_ms']
        # Skip clips shorter than 1.3 seconds
        if duration_ms < 1300:
            continue
        speaker = line['speaker']
        if speaker not in speakers:
            speakers_dict[speaker] = len(speakers)
            speakers.append(speaker)
        label = speakers_dict[speaker]
        sound_path = os.path.join(zhvoice_path, line['index'])
        data.append([sound_path.replace('\\', '/'), label])
    print(f'Found {len(data)} samples in total.')
    return data


def mp32wav(num, data_list):
    start = time.time()
    for i, data in enumerate(data_list):
        sound_path, label = data
        if os.path.exists(sound_path):
            save_path = sound_path.replace('.mp3', '.wav')
            if not os.path.exists(save_path):
                wav = AudioSegment.from_mp3(sound_path)
                wav.export(save_path, format="wav")
                os.remove(sound_path)
        if i % 100 == 0:
            eta_sec = ((time.time() - start) / 100 * (len(data_list) - i))
            start = time.time()
            eta_str = str(timedelta(seconds=int(eta_sec)))
            print(f'Worker {num} progress: [{i}/{len(data_list)}], time remaining: {eta_str}')


def split_data(list_temp, n):
    # Guard against n > len(list_temp), which would make the step size zero
    length = max(1, len(list_temp) // n)
    for i in range(0, len(list_temp), length):
        yield list_temp[i:i + length]


def main(infodata_path, list_path, zhvoice_path, to_wav=True, num_workers=2):
    if to_wav:
        text = input('Audio files will be converted to WAV format. This can take a long time, and the result is close to 100 GB. Continue? (y/n)')
        if text is None or text != 'y':
            return
    else:
        text = input('MP3 files will be used directly, but reading them is slower than WAV. Continue? (y/n)')
        if text is None or text != 'y':
            return
    data_all = []
    data = get_data_list(infodata_path=infodata_path, zhvoice_path=zhvoice_path)
    if to_wav:
        print('Converting MP3 files to WAV format...')
        split_d = split_data(data, num_workers)
        pool = Pool(num_workers)
        for i, d in enumerate(split_d):
            pool.apply_async(mp32wav, (i, d))
        pool.close()
        pool.join()
        for d in data:
            sound_path, label = d
            sound_path = sound_path.replace('.mp3', '.wav')
            if os.path.exists(sound_path):
                data_all.append([sound_path, label])
    else:
        for d in data:
            sound_path, label = d
            if os.path.exists(sound_path):
                data_all.append(d)
    f_train = open(os.path.join(list_path, 'train_list.txt'), 'w')
    f_test = open(os.path.join(list_path, 'test_list.txt'), 'w')
    # Every 200th sample goes to the test list; the rest go to the training list
    for i, d in enumerate(data_all):
        sound_path, label = d
        if i % 200 == 0:
            f_test.write(f'{sound_path}\t{label}\n')
        else:
            f_train.write(f'{sound_path}\t{label}\n')
    f_test.close()
    f_train.close()


if __name__ == '__main__':
    main(infodata_path='dataset/zhvoice/text/infodata.json',
         list_path='dataset',
         zhvoice_path='dataset/zhvoice',
         to_wav=False,
         num_workers=cpu_count())
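
For reference, the generated `train_list.txt` / `test_list.txt` are plain tab-separated files, one `path<TAB>label` pair per line; the paths below are illustrative only:

```
dataset/zhvoice/speaker_a/clip_0001.wav	0
dataset/zhvoice/speaker_a/clip_0002.wav	0
dataset/zhvoice/speaker_b/clip_0001.wav	1
```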
25  eval.py  Normal file
@@ -0,0 +1,25 @@

import argparse
import functools
import time

from mvector.trainer import MVectorTrainer
from mvector.utils.utils import add_arguments, print_arguments

parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
add_arg('configs',          str,  'configs/ecapa_tdnn.yml',             "Configuration file")
add_arg("use_gpu",          bool, True,                                 "Whether to evaluate the model on GPU")
add_arg('save_image_path',  str,  'output/images/',                     "Path for saving result plots")
add_arg('resume_model',     str,  'models/ecapa_tdnn_MFCC/best_model/', "Path to the model")
args = parser.parse_args()
print_arguments(args=args)

# Create the trainer
trainer = MVectorTrainer(configs=args.configs, use_gpu=args.use_gpu)

# Run evaluation
start = time.time()
tpr, fpr, eer, threshold = trainer.evaluate(resume_model=args.resume_model, save_image_path=args.save_image_path)
end = time.time()
print('Evaluation took {}s, threshold: {:.2f}, tpr: {:.5f}, fpr: {:.5f}, eer: {:.5f}'
      .format(int(end - start), threshold, tpr, fpr, eer))
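
The equal error rate reported here is the operating point where the false acceptance and false rejection rates meet. A minimal numpy sketch of that computation (the score/label arrays are placeholders, not this trainer's internals):

```python
# Compute an equal error rate (EER) from similarity scores and trial labels.
import numpy as np

def compute_eer(scores, labels):
    """scores: similarity per trial; labels: 1 = same speaker, 0 = different."""
    thresholds = np.sort(np.unique(scores))
    best_diff, eer, best_thr = np.inf, 1.0, thresholds[0]
    for thr in thresholds:
        far = np.mean(scores[labels == 0] >= thr)   # false acceptance rate
        frr = np.mean(scores[labels == 1] < thr)    # false rejection rate
        if abs(far - frr) < best_diff:
            best_diff, eer, best_thr = abs(far - frr), (far + frr) / 2, thr
    return eer, best_thr

scores = np.array([0.91, 0.45, 0.78, 0.30, 0.66])
labels = np.array([1, 0, 1, 0, 1])
print(compute_eer(scores, labels))
```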
52  infer_contrast.py  Normal file
@@ -0,0 +1,52 @@

import argparse
import functools
import itertools

from mvector.predict import MVectorPredictor
from mvector.utils.utils import add_arguments, print_arguments
from mvector.data_utils.audio import AudioSegment

parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
add_arg('configs',      str,   'configs/ecapa_tdnn.yml',            'Configuration file')
add_arg('use_gpu',      bool,  True,                                'Whether to predict on GPU')
add_arg('audio_path1',  str,   'dataset/source/王翔/wx1_5.wav',      'First audio file to compare')
add_arg('audio_path2',  str,   'dataset/source/刘云杰/lyj_no_5.wav', 'Second audio file to compare')
add_arg('threshold',    float, 0.7,                                 'Threshold for deciding whether two audios are the same person')
add_arg('model_path',   str,   'models/test_model',                 'Path to the exported prediction model')
args = parser.parse_args()
# print_arguments(args=args)


# Create the predictor
predictor = MVectorPredictor(configs=args.configs,
                             model_path=args.model_path,
                             use_gpu=args.use_gpu)


def load_audio_paths(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return [line.strip() for line in file if line.strip()]


def compare_audio_files(audio_paths, threshold):
    # itertools.combinations yields every unordered pair exactly once
    for audio1, audio2 in itertools.combinations(audio_paths, 2):
        dist = predictor.contrast(audio1, audio2)
        if dist > threshold:
            print(f"{audio1} and {audio2} are the same person, similarity: {dist}")
        # else:
        #     print(f"{audio1} and {audio2} are not the same person, similarity: {dist}")

file_path = 'dataset/ces.txt'  # assumes the audio paths are stored in this file

# Run the comparison
audio_paths = load_audio_paths(file_path)
compare_audio_files(audio_paths, args.threshold)
# # AudioSegment.silent_semoval(args.audio_path1, args.audio_path1)
# # AudioSegment.silent_semoval(args.audio_path2, args.audio_path2)
# dist = predictor.contrast(args.audio_path1, args.audio_path2)
# if dist > args.threshold:
#     print(f"{args.audio_path1} and {args.audio_path2} are the same person, similarity: {dist}")
# else:
#     print(f"{args.audio_path1} and {args.audio_path2} are not the same person, similarity: {dist}")
49  infer_recognition.py  Normal file
@@ -0,0 +1,49 @@

import argparse
import functools

from mvector.predict import MVectorPredictor
from mvector.utils.record import RecordAudio
from mvector.utils.utils import add_arguments, print_arguments

parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
add_arg('configs',        str,   'configs/ecapa_tdnn.yml', 'Configuration file')
add_arg('use_gpu',        bool,  True,                     'Whether to predict on GPU')
add_arg('audio_db_path',  str,   'audio_db/',              'Path to the audio database')
add_arg('record_seconds', int,   3,                        'Recording length')
add_arg('threshold',      float, 0.6,                      'Threshold for deciding whether two audios are the same person')
add_arg('model_path',     str,   'models/ecapa_tdnn_MelSpectrogram/best_model/', 'Path to the exported prediction model')
args = parser.parse_args()
print_arguments(args=args)

# Create the predictor
predictor = MVectorPredictor(configs=args.configs,
                             threshold=args.threshold,
                             audio_db_path=args.audio_db_path,
                             model_path=args.model_path,
                             use_gpu=args.use_gpu)

record_audio = RecordAudio()

while True:
    select_fun = int(input("Select a function: 0 to enroll audio into the voiceprint database, 1 to run voiceprint recognition, 2 to delete a user: "))
    if select_fun == 0:
        input(f"Press Enter to start a {args.record_seconds}-second recording:")
        audio_data = record_audio.record(record_seconds=args.record_seconds)
        name = input("Enter a name for this audio's user: ")
        if name == '': continue
        predictor.register(user_name=name, audio_data=audio_data, sample_rate=record_audio.sample_rate)
    elif select_fun == 1:
        input(f"Press Enter to start a {args.record_seconds}-second recording:")
        audio_data = record_audio.record(record_seconds=args.record_seconds)
        name = predictor.recognition(audio_data, sample_rate=record_audio.sample_rate)
        if name:
            print(f"Recognized speaker: {name}")
        else:
            print("No speaker recognized; they may not be enrolled.")
    elif select_fun == 2:
        name = input("Enter the name of the user to delete: ")
        if name == '': continue
        predictor.remove_user(user_name=name)
    else:
        print('Please select a valid function.')
40  main.py  Normal file
@@ -0,0 +1,40 @@

from fastapi import FastAPI, File, UploadFile
from fastapi.middleware.cors import CORSMiddleware
from mvector.predict import MVectorPredictor

app = FastAPI()

# Allow cross-origin requests
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)

predictor = MVectorPredictor(configs='configs/ecapa_tdnn.yml')

@app.post("/recognize")
async def recognize(file: UploadFile = File(...)):
    try:
        audio_bytes = await file.read()
        result = predictor.recognition(audio_bytes)
        return {"status": 200, "data": result}
    except Exception as e:
        return {"status": 500, "error": str(e)}

@app.post("/compare")
async def compare(file1: UploadFile = File(...), file2: UploadFile = File(...)):
    try:
        score = predictor.contrast(
            await file1.read(),
            await file2.read()
        )
        return {"similarity": float(score)}
    except Exception as e:
        return {"status": 500, "error": str(e)}

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=9001)
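
Once the service is running, the endpoints can be exercised from any HTTP client. A minimal sketch using the requests library (the file paths are placeholders taken from this repo's audio_db/):

```python
# Call the /recognize and /compare endpoints of the running service.
import requests

base = "http://127.0.0.1:9001"

with open("audio_db/test.wav", "rb") as f:
    r = requests.post(f"{base}/recognize", files={"file": f})
print(r.json())

with open("audio_db/output1.wav", "rb") as f1, open("audio_db/output2.wav", "rb") as f2:
    r = requests.post(f"{base}/compare", files={"file1": f1, "file2": f2})
print(r.json())
```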
BIN  models/.DS_Store  vendored  (binary file not shown)
BIN  models/pytorch_model.bin  Normal file  (binary file not shown)
3  mvector/__init__.py  Normal file
@@ -0,0 +1,3 @@

__version__ = "0.3.9"
# Models supported by this project
SUPPORT_MODEL = ['ecapa_tdnn', 'EcapaTdnn', 'Res2Net', 'ResNetSE', 'TDNN']
BIN  mvector/__pycache__/__init__.cpython-311.pyc  Normal file  (binary file not shown)
BIN  mvector/__pycache__/__init__.cpython-37.pyc  Normal file  (binary file not shown)
BIN  mvector/__pycache__/predict.cpython-37.pyc  Normal file  (binary file not shown)
BIN  mvector/__pycache__/trainer.cpython-311.pyc  Normal file  (binary file not shown)
BIN  mvector/__pycache__/trainer.cpython-37.pyc  Normal file  (binary file not shown)
0  mvector/data_utils/__init__.py  Normal file  (empty)
BIN  mvector/data_utils/__pycache__/__init__.cpython-37.pyc  Normal file  (binary file not shown)
BIN  mvector/data_utils/__pycache__/audio.cpython-37.pyc  Normal file  (binary file not shown)
BIN  mvector/data_utils/__pycache__/collate_fn.cpython-37.pyc  Normal file  (binary file not shown)
BIN  mvector/data_utils/__pycache__/featurizer.cpython-37.pyc  Normal file  (binary file not shown)
BIN  mvector/data_utils/__pycache__/reader.cpython-37.pyc  Normal file  (binary file not shown)
BIN  mvector/data_utils/__pycache__/utils.cpython-37.pyc  Normal file  (binary file not shown)
565  mvector/data_utils/audio.py  Normal file
@@ -0,0 +1,565 @@

import copy
import io
import os
import random

import numpy as np
import resampy
import soundfile

from mvector.data_utils.utils import buf_to_float, vad, decode_audio


class AudioSegment(object):
    """Monaural audio segment abstraction.

    :param samples: Audio samples [num_samples x num_channels].
    :type samples: ndarray.float32
    :param sample_rate: Audio sample rate.
    :type sample_rate: int
    :raises TypeError: If the sample data type is not float or int.
    """

    def __init__(self, samples, sample_rate):
        """Create audio segment from samples.

        Samples are converted to float32 internally, with int scaled to [-1, 1].
        """
        self._samples = self._convert_samples_to_float32(samples)
        self._sample_rate = sample_rate
        if self._samples.ndim >= 2:
            # Mix multi-channel audio down to mono
            self._samples = np.mean(self._samples, 1)

    def __eq__(self, other):
        """Return whether two instances are equal."""
        if type(other) is not type(self):
            return False
        if self._sample_rate != other._sample_rate:
            return False
        if self._samples.shape != other._samples.shape:
            return False
        if np.any(self.samples != other._samples):
            return False
        return True

    def __ne__(self, other):
        """Return whether two instances are not equal."""
        return not self.__eq__(other)

    def __str__(self):
        """Return a description of this audio segment."""
        return ("%s: num_samples=%d, sample_rate=%d, duration=%.2fsec, "
                "rms=%.2fdB" % (type(self), self.num_samples, self.sample_rate, self.duration, self.rms_db))

    @classmethod
    def from_file(cls, file):
        """Create an audio segment from an audio file.

        :param file: File path, or a file object.
        :type file: str, BufferedReader
        :return: Audio segment instance.
        :rtype: AudioSegment
        """
        assert os.path.exists(file), f'File does not exist, please check the path: {file}'
        try:
            samples, sample_rate = soundfile.read(file, dtype='float32')
        except Exception:
            # Fall back to decode_audio to support more formats
            sample_rate = 16000
            samples = decode_audio(file=file, sample_rate=sample_rate)
        return cls(samples, sample_rate)

    @classmethod
    def slice_from_file(cls, file, start=None, end=None):
        """Load only a slice of the audio, without loading the whole file into
        memory, which would be wasteful.

        :param file: Input audio file path or file object.
        :type file: str|file
        :param start: Start time in seconds. Negative values count from the end.
                      If not provided, reading starts from the very beginning.
        :type start: float
        :param end: End time in seconds. Negative values count from the end.
                    If not provided, the default is to read to the end of the file.
        :type end: float
        :return: AudioSegment instance of the specified slice of the input file.
        :rtype: AudioSegment
        :raise ValueError: If start or end is set incorrectly, e.g. out of bounds in time.
        """
        sndfile = soundfile.SoundFile(file)
        sample_rate = sndfile.samplerate
        duration = round(float(len(sndfile)) / sample_rate, 3)
        start = 0. if start is None else round(start, 3)
        end = duration if end is None else round(end, 3)
        # Negative positions count from the end
        if start < 0.0: start += duration
        if end < 0.0: end += duration
        # Keep positions within bounds
        if start < 0.0: start = 0.0
        if end > duration: end = duration
        if end < 0.0:
            raise ValueError("Slice end position (%f s) out of bounds" % end)
        if start > end:
            raise ValueError("Slice start position (%f s) is later than slice end position (%f s)" % (start, end))
        start_frame = int(start * sample_rate)
        end_frame = int(end * sample_rate)
        sndfile.seek(start_frame)
        data = sndfile.read(frames=end_frame - start_frame, dtype='float32')
        return cls(data, sample_rate)

    @classmethod
    def from_bytes(cls, data):
        """Create an audio segment from bytes containing audio samples.

        :param data: Bytes containing audio samples.
        :type data: bytes
        :return: Audio segment instance.
        :rtype: AudioSegment
        """
        samples, sample_rate = soundfile.read(io.BytesIO(data), dtype='float32')
        return cls(samples, sample_rate)

    @classmethod
    def from_pcm_bytes(cls, data, channels=1, samp_width=2, sample_rate=16000):
        """Create an audio segment from bytes of raw, headerless PCM audio.

        :param data: Bytes containing audio samples.
        :type data: bytes
        :param channels: Number of audio channels.
        :type channels: int
        :param samp_width: Sample width in bytes, e.g. 2 for np.int16.
        :type samp_width: int
        :param sample_rate: Audio sample rate.
        :type sample_rate: int
        :return: Audio segment instance.
        :rtype: AudioSegment
        """
        samples = buf_to_float(data, n_bytes=samp_width)
        if channels > 1:
            samples = samples.reshape(-1, channels)
        return cls(samples, sample_rate)

    @classmethod
    def from_ndarray(cls, data, sample_rate=16000):
        """Create an audio segment from a numpy.ndarray.

        :param data: Audio data as a numpy.ndarray.
        :type data: ndarray
        :param sample_rate: Audio sample rate.
        :type sample_rate: int
        :return: Audio segment instance.
        :rtype: AudioSegment
        """
        return cls(data, sample_rate)

    @classmethod
    def concatenate(cls, *segments):
        """Concatenate an arbitrary number of audio segments together.

        :param *segments: Input audio segments to concatenate.
        :type *segments: tuple of AudioSegment
        :return: Audio segment instance as concatenating results.
        :rtype: AudioSegment
        :raises ValueError: If the number of segments is zero, or if the
                            sample_rate of any segments does not match.
        :raises TypeError: If any segment is not AudioSegment instance.
        """
        # Perform basic sanity-checks.
        if len(segments) == 0:
            raise ValueError("No audio segments were given to concatenate")
        sample_rate = segments[0]._sample_rate
        for seg in segments:
            if sample_rate != seg._sample_rate:
                raise ValueError("Cannot concatenate segments with different sample rates")
            if type(seg) is not cls:
                raise TypeError("Only audio segments of the same type can be concatenated")
        samples = np.concatenate([seg.samples for seg in segments])
        return cls(samples, sample_rate)

    @classmethod
    def make_silence(cls, duration, sample_rate):
        """Create a silent audio segment of the given duration and sample rate.

        :param duration: Length of silence in seconds.
        :type duration: float
        :param sample_rate: Audio sample rate.
        :type sample_rate: float
        :return: Silent AudioSegment instance of the given duration.
        :rtype: AudioSegment
        """
        samples = np.zeros(int(duration * sample_rate))
        return cls(samples, sample_rate)

    def to_wav_file(self, filepath, dtype='float32'):
        """Save the audio segment to disk as a wav file.

        :param filepath: WAV file path or file object to save the audio segment to.
        :type filepath: str|file
        :param dtype: Subtype for audio file. Options: 'int16', 'int32',
                      'float32', 'float64'. Default is 'float32'.
        :type dtype: str
        :raises TypeError: If dtype is not supported.
        """
        samples = self._convert_samples_from_float32(self._samples, dtype)
        subtype_map = {
            'int16': 'PCM_16',
            'int32': 'PCM_32',
            'float32': 'FLOAT',
            'float64': 'DOUBLE'
        }
        soundfile.write(
            filepath,
            samples,
            self._sample_rate,
            format='WAV',
            subtype=subtype_map[dtype])

    def superimpose(self, other):
        """Add the samples of another segment to this segment's samples
        (sample-wise addition, not segment concatenation).

        :param other: Segment whose samples are added in.
        :type other: AudioSegment
        :raise TypeError: If the types of the two segments do not match.
        :raise ValueError: If the sample rates or lengths do not match.
        """
        if not isinstance(other, type(self)):
            raise TypeError("Cannot add segments of different types: %s and %s" % (type(self), type(other)))
        if self._sample_rate != other._sample_rate:
            raise ValueError("Sample rates must match to add segments")
        if len(self._samples) != len(other._samples):
            raise ValueError("Segment lengths must match to add segments")
        self._samples += other._samples

    def to_bytes(self, dtype='float32'):
        """Create a byte string containing the audio content.

        :param dtype: Data type for export samples. Options: 'int16', 'int32',
                      'float32', 'float64'. Default is 'float32'.
        :type dtype: str
        :return: Byte string containing audio content.
        :rtype: bytes
        """
        samples = self._convert_samples_from_float32(self._samples, dtype)
        # tostring() was removed from NumPy; tobytes() is the equivalent
        return samples.tobytes()

    def to(self, dtype='int16'):
        """Type conversion.

        :param dtype: Data type for export samples. Options: 'int16', 'int32',
                      'float32', 'float64'. Default is 'int16'.
        :type dtype: str
        :return: np.ndarray containing `dtype` audio content.
        :rtype: ndarray
        """
        samples = self._convert_samples_from_float32(self._samples, dtype)
        return samples

    def gain_db(self, gain):
        """Apply gain in decibels to the audio.

        Note that this is an in-place transformation.

        :param gain: Gain in decibels to apply to samples.
        :type gain: float|1darray
        """
        self._samples *= 10. ** (gain / 20.)

    def change_speed(self, speed_rate):
        """Change audio speed by linear interpolation.

        :param speed_rate: Rate of speed change:
                           speed_rate > 1.0, speed up the audio;
                           speed_rate = 1.0, unchanged;
                           speed_rate < 1.0, slow down the audio;
                           speed_rate <= 0.0, not allowed, raise ValueError.
        :type speed_rate: float
        :raises ValueError: If speed_rate <= 0.0.
        """
        if speed_rate == 1.0:
            return
        if speed_rate <= 0:
            raise ValueError("Speed rate should be greater than zero")
        old_length = self._samples.shape[0]
        new_length = int(old_length / speed_rate)
        old_indices = np.arange(old_length)
        new_indices = np.linspace(start=0, stop=old_length, num=new_length)
        self._samples = np.interp(new_indices, old_indices, self._samples).astype(np.float32)

    def normalize(self, target_db=-20, max_gain_db=300.0):
        """Normalize the audio to the desired RMS value in decibels.

        :param target_db: Target RMS value in decibels. This value should be
                          less than 0.0 as 0.0 is full-scale audio.
        :type target_db: float
        :param max_gain_db: Max amount of gain in dB that can be applied for
                            normalization. This is to prevent nans when
                            attempting to normalize a signal consisting of
                            all zeros.
        :type max_gain_db: float
        :raises ValueError: If the required gain to normalize the segment to
                            the target_db value exceeds max_gain_db.
        """
        if -np.inf == self.rms_db: return
        gain = target_db - self.rms_db
        if gain > max_gain_db:
            raise ValueError(
                "Unable to normalize segment to %f dB because the required gain exceeds max_gain_db (%f dB)" % (target_db, max_gain_db))
        self.gain_db(gain)

    def resample(self, target_sample_rate, filter='kaiser_best'):
        """Resample the audio to a target sample rate.

        Note that this is an in-place transformation.

        :param target_sample_rate: Target sample rate.
        :type target_sample_rate: int
        :param filter: The resampling filter to use one of {'kaiser_best', 'kaiser_fast'}.
        :type filter: str
        """
        self._samples = resampy.resample(self.samples, self.sample_rate, target_sample_rate, filter=filter)
        self._sample_rate = target_sample_rate

    def pad_silence(self, duration, sides='both'):
        """Pad this audio sample with a period of silence.

        Note that this is an in-place transformation.

        :param duration: Length of silence in seconds to pad.
        :type duration: float
        :param sides: Position for padding:
                      'beginning' - adds silence in the beginning;
                      'end' - adds silence in the end;
                      'both' - adds silence in both the beginning and the end.
        :type sides: str
        :raises ValueError: If sides is not supported.
        """
        if duration == 0.0:
            return self
        cls = type(self)
        silence = self.make_silence(duration, self._sample_rate)
        if sides == "beginning":
            padded = cls.concatenate(silence, self)
        elif sides == "end":
            padded = cls.concatenate(self, silence)
        elif sides == "both":
            padded = cls.concatenate(silence, self, silence)
        else:
            raise ValueError("Unknown value for the sides %s" % sides)
        self._samples = padded._samples

    def shift(self, shift_ms):
        """Shift the audio in time. If shift_ms is positive, shift with time
        advance; if negative, shift with time delay. Silence is padded to keep
        the duration unchanged.

        Note that this is an in-place transformation.

        :param shift_ms: Shift time in milliseconds. If positive, shift with
                         time advance; if negative, shift with time delay.
        :type shift_ms: float
        :raises ValueError: If shift_ms is longer than audio duration.
        """
        if abs(shift_ms) / 1000.0 > self.duration:
            raise ValueError("The absolute value of shift_ms should be less than the audio duration")
        shift_samples = int(shift_ms * self._sample_rate / 1000)
        if shift_samples > 0:
            # time advance
            self._samples[:-shift_samples] = self._samples[shift_samples:]
            self._samples[-shift_samples:] = 0
        elif shift_samples < 0:
            # time delay
            self._samples[-shift_samples:] = self._samples[:shift_samples]
            self._samples[:-shift_samples] = 0

    def subsegment(self, start_sec=None, end_sec=None):
        """Cut the audio segment between the given boundaries.

        Note that this is an in-place transformation.

        :param start_sec: Beginning of subsegment in seconds.
        :type start_sec: float
        :param end_sec: End of subsegment in seconds.
        :type end_sec: float
        :raise ValueError: If start_sec or end_sec is incorrectly set, e.g. out
                           of bounds in time.
        """
        start_sec = 0.0 if start_sec is None else start_sec
        end_sec = self.duration if end_sec is None else end_sec
        if start_sec < 0.0:
            start_sec = self.duration + start_sec
        if end_sec < 0.0:
            end_sec = self.duration + end_sec
        if start_sec < 0.0:
            raise ValueError("Slice start position (%f s) out of bounds" % start_sec)
        if end_sec < 0.0:
            raise ValueError("Slice end position (%f s) out of bounds" % end_sec)
        if start_sec > end_sec:
            raise ValueError("Slice start position (%f s) is later than end position (%f s)" % (start_sec, end_sec))
        if end_sec > self.duration:
            raise ValueError("Slice end position (%f s) out of bounds (> %f s)" % (end_sec, self.duration))
        start_sample = int(round(start_sec * self._sample_rate))
        end_sample = int(round(end_sec * self._sample_rate))
        self._samples = self._samples[start_sample:end_sample]

    def random_subsegment(self, subsegment_length):
        """Randomly cut a subsegment of the specified length.

        Note that this is an in-place transformation.

        :param subsegment_length: Subsegment length in seconds.
        :type subsegment_length: float
        :raises ValueError: If the length of subsegment is greater than
                            the original segment.
        """
        if subsegment_length > self.duration:
            raise ValueError("Length of subsegment must not be greater "
                             "than original segment.")
        start_time = random.uniform(0.0, self.duration - subsegment_length)
        self.subsegment(start_time, start_time + subsegment_length)

    def add_noise(self,
                  noise,
                  snr_dB,
                  max_gain_db=300.0):
        """Add the given noise segment at a specific signal-to-noise ratio.
        If the noise segment is longer than this segment, a random
        matching-length subsegment is sampled from it.

        Note that this is an in-place transformation.

        :param noise: Noise signal to add.
        :type noise: AudioSegment
        :param snr_dB: Signal-to-Noise Ratio, in decibels.
        :type snr_dB: float
        :param max_gain_db: Maximum amount of gain to apply to noise signal
                            before adding it in. This is to prevent attempting
                            to apply infinite gain to a zero signal.
        :type max_gain_db: float
        :raises ValueError: If the sample rate does not match between the two
                            audio segments, or if the duration of noise segments
                            is shorter than original audio segments.
        """
        if noise.sample_rate != self.sample_rate:
            raise ValueError("Noise sample rate (%d Hz) does not equal the base signal sample rate (%d Hz)" % (noise.sample_rate, self.sample_rate))
        if noise.duration < self.duration:
            raise ValueError("The noise signal (%f s) must be at least as long as the base signal (%f s)" % (noise.duration, self.duration))
        # Gain the noise so that signal RMS minus noise RMS matches the target SNR
        noise_gain_db = min(self.rms_db - noise.rms_db - snr_dB, max_gain_db)
        noise_new = copy.deepcopy(noise)
        noise_new.random_subsegment(self.duration)
        noise_new.gain_db(noise_gain_db)
        self.superimpose(noise_new)

    def vad(self, top_db=20, overlap=0):
        # Voice activity detection: drop silent frames
        self._samples = vad(wav=self._samples, top_db=top_db, overlap=overlap)

    def crop(self, duration, mode='eval'):
        # Crop to at most `duration` seconds: randomly in training, from the start in eval
        if self.duration > duration:
            if mode == 'train':
                self.random_subsegment(duration)
            else:
                self.subsegment(end_sec=duration)

    @property
    def samples(self):
        """Return the audio samples.

        :return: Audio samples.
        :rtype: ndarray
        """
        return self._samples.copy()

    @property
    def sample_rate(self):
        """Return the audio sample rate.

        :return: Audio sample rate.
        :rtype: int
        """
        return self._sample_rate

    @property
    def num_samples(self):
        """Return the number of samples.

        :return: Number of samples.
        :rtype: int
        """
        return self._samples.shape[0]

    @property
    def duration(self):
        """Return the audio duration.

        :return: Audio duration in seconds.
        :rtype: float
        """
        return self._samples.shape[0] / float(self._sample_rate)

    @property
    def rms_db(self):
        """Return the root-mean-square energy of the audio in decibels.

        :return: Root mean square energy in decibels.
        :rtype: float
        """
        # square root => multiply by 10 instead of 20 for dBs
        mean_square = np.mean(self._samples ** 2)
        return 10 * np.log10(mean_square)

    def _convert_samples_to_float32(self, samples):
        """Convert sample type to float32.

        Audio sample type is usually integer or float-point.
        Integers will be scaled to [-1, 1] in float32.
        """
        float32_samples = samples.astype('float32')
        if samples.dtype in np.sctypes['int']:
            bits = np.iinfo(samples.dtype).bits
            float32_samples *= (1. / 2 ** (bits - 1))
        elif samples.dtype in np.sctypes['float']:
            pass
        else:
            raise TypeError("Unsupported sample type: %s." % samples.dtype)
        return float32_samples

    def _convert_samples_from_float32(self, samples, dtype):
        """Convert sample type from float32 to dtype.

        Audio sample type is usually integer or float-point. For integer
        type, float32 will be rescaled from [-1, 1] to the maximum range
        supported by the integer type.

        This is for writing an audio file.
        """
        dtype = np.dtype(dtype)
        output_samples = samples.copy()
        if dtype in np.sctypes['int']:
            bits = np.iinfo(dtype).bits
            output_samples *= (2 ** (bits - 1) / 1.)
            min_val = np.iinfo(dtype).min
            max_val = np.iinfo(dtype).max
            output_samples[output_samples > max_val] = max_val
            output_samples[output_samples < min_val] = min_val
        elif samples.dtype in np.sctypes['float']:
            min_val = np.finfo(dtype).min
            max_val = np.finfo(dtype).max
            output_samples[output_samples > max_val] = max_val
            output_samples[output_samples < min_val] = min_val
        else:
            raise TypeError("Unsupported sample type: %s." % samples.dtype)
        return output_samples.astype(dtype)

    def save(self, path, dtype='float32'):
        """Save the audio segment to disk as a wav file.

        :param path: WAV file path or file object to save the audio segment to.
        :type path: str|file
        :param dtype: Subtype for audio file. Options: 'int16', 'int32',
                      'float32', 'float64'. Default is 'float32'.
        :type dtype: str
        :raises TypeError: If dtype is not supported.
        """
        self.to_wav_file(path, dtype)

    # Silence removal
    @classmethod
    def silent_semoval(cls, inputpath, outputpath):
        # Read the audio file
        audio = AudioSegment.from_file(inputpath)
        # Voice activity detection
        audio.vad()
        # Save the trimmed audio
        audio.save(outputpath)
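
For orientation, a short end-to-end example of this class (a minimal sketch; the input file is one of the WAVs committed under audio_db/):

```python
# Load a file, normalize loudness, resample, trim silence, and save.
from mvector.data_utils.audio import AudioSegment

audio = AudioSegment.from_file('audio_db/test.wav')
print(audio)                      # duration, sample rate, RMS level
audio.normalize(target_db=-20)    # bring RMS to -20 dB
audio.resample(16000)             # ensure 16 kHz
audio.vad(top_db=20)              # drop silent frames
audio.save('audio_db/test_clean.wav')
```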
0  mvector/data_utils/augmentor/__init__.py  Normal file  (empty)
BIN  mvector/data_utils/augmentor/__pycache__/__init__.cpython-37.pyc  Normal file  (binary file not shown)
BIN  mvector/data_utils/augmentor/__pycache__/base.cpython-37.pyc  Normal file  (binary file not shown)
BIN  mvector/data_utils/augmentor/__pycache__/resample.cpython-37.pyc  Normal file  (binary file not shown)
115  mvector/data_utils/augmentor/augmentation.py  Normal file
@@ -0,0 +1,115 @@

"""Contains the data augmentation pipeline."""

import json
import random

from mvector.data_utils.augmentor.noise_perturb import NoisePerturbAugmentor
from mvector.data_utils.augmentor.resample import ResampleAugmentor
from mvector.data_utils.augmentor.shift_perturb import ShiftPerturbAugmentor
from mvector.data_utils.augmentor.speed_perturb import SpeedPerturbAugmentor
from mvector.data_utils.augmentor.volume_perturb import VolumePerturbAugmentor
from mvector.utils.logger import setup_logger

logger = setup_logger(__name__)


class AugmentationPipeline(object):
    """Build a pre-processing pipeline with various augmentation models. Such a
    data augmentation pipeline is often leveraged to augment the training
    samples to make the model invariant to certain types of perturbations in the
    real world, improving the model's generalization ability.

    The pipeline is built according to the augmentation configuration in a JSON
    string, e.g.

    .. code-block::
        [
            {
                "type": "noise",
                "params": {
                    "min_snr_dB": 10,
                    "max_snr_dB": 50,
                    "noise_manifest_path": "dataset/manifest.noise"
                },
                "prob": 0.5
            },
            {
                "type": "speed",
                "params": {
                    "min_speed_rate": 0.9,
                    "max_speed_rate": 1.1,
                    "num_rates": 3
                },
                "prob": 1.0
            },
            {
                "type": "shift",
                "params": {
                    "min_shift_ms": -5,
                    "max_shift_ms": 5
                },
                "prob": 1.0
            },
            {
                "type": "volume",
                "params": {
                    "min_gain_dBFS": -15,
                    "max_gain_dBFS": 15
                },
                "prob": 1.0
            }
        ]

    This example configuration inserts several augmentation models into the
    pipeline, among them a NoisePerturbAugmentor and a SpeedPerturbAugmentor.
    "prob" indicates the probability of the current augmentor taking effect;
    if "prob" is zero, the augmentor does not take effect.

    :param augmentation_config: Augmentation configuration in json string.
    :type augmentation_config: str
    """

    def __init__(self, augmentation_config):
        self._augmentors, self._rates = self._parse_pipeline_from(augmentation_config, aug_type='audio')

    def transform_audio(self, audio_segment):
        """Run the pre-processing pipeline for data augmentation.

        Note that this is an in-place transformation.

        :param audio_segment: Audio segment to process.
        :type audio_segment: AudioSegment|SpeechSegment
        """
        for augmentor, rate in zip(self._augmentors, self._rates):
            if random.random() < rate:
                augmentor.transform_audio(audio_segment)

    def _parse_pipeline_from(self, config_json, aug_type):
        """Parse the config json to build an augmentation pipeline."""
        try:
            configs = []
            configs_temp = json.loads(config_json)
            for config in configs_temp:
                if config['aug_type'] != aug_type: continue
                logger.info('Data augmentation config: %s' % config)
                configs.append(config)
            augmentors = [self._get_augmentor(config["type"], config["params"]) for config in configs]
            rates = [config["prob"] for config in configs]
        except Exception as e:
            raise ValueError("Failed to parse the augmentation config json: %s" % str(e))
        return augmentors, rates

    def _get_augmentor(self, augmentor_type, params):
        """Return an augmentation model by the type name, and pass in params."""
        if augmentor_type == "volume":
            return VolumePerturbAugmentor(**params)
        elif augmentor_type == "shift":
            return ShiftPerturbAugmentor(**params)
        elif augmentor_type == "speed":
            return SpeedPerturbAugmentor(**params)
        elif augmentor_type == "resample":
            return ResampleAugmentor(**params)
        elif augmentor_type == "noise":
            return NoisePerturbAugmentor(**params)
        else:
            raise ValueError("Unknown augmentor type [%s]." % augmentor_type)
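
A sketch of driving this pipeline directly, using the configs/augmentation.json shown earlier and this repo's own classes:

```python
# Apply the audio-augmentation pipeline to one segment, in place.
from mvector.data_utils.audio import AudioSegment
from mvector.data_utils.augmentor.augmentation import AugmentationPipeline

with open('configs/augmentation.json', 'r', encoding='utf-8') as f:
    augment_pipeline = AugmentationPipeline(augmentation_config=f.read())

audio = AudioSegment.from_file('audio_db/test.wav')
augment_pipeline.transform_audio(audio)   # each augmentor fires with its "prob"
```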
30  mvector/data_utils/augmentor/base.py  Normal file
@@ -0,0 +1,30 @@

"""Contains the abstract base class for augmentation models."""

from abc import ABCMeta, abstractmethod


class AugmentorBase(object):
    """Abstract base class for augmentation model (augmentor) class.
    All augmentor classes should inherit from this class, and implement the
    following abstract methods.
    """

    __metaclass__ = ABCMeta

    @abstractmethod
    def __init__(self):
        pass

    @abstractmethod
    def transform_audio(self, audio_segment):
        """Adds various effects to the input audio segment. Such effects
        will augment the training data to make the model invariant to certain
        types of perturbations in the real world, improving the model's
        generalization ability.

        Note that this is an in-place transformation.

        :param audio_segment: Audio segment to add effects to.
        :type audio_segment: AudioSegment|SpeechSegment
        """
        pass
57  mvector/data_utils/augmentor/noise_perturb.py  Normal file
@@ -0,0 +1,57 @@

"""Contains the noise perturb augmentation model."""
import os
import random

import numpy as np

from mvector.data_utils.augmentor.base import AugmentorBase
from mvector.data_utils.audio import AudioSegment


class NoisePerturbAugmentor(AugmentorBase):
    """Augmentation model for adding background noise.

    :param min_snr_dB: Minimal signal noise ratio, in decibels.
    :type min_snr_dB: float
    :param max_snr_dB: Maximal signal noise ratio, in decibels.
    :type max_snr_dB: float
    :param repetition: Maximum number of times noise is added.
    :type repetition: int
    :param noise_dir: Noise audio file dir.
    :type noise_dir: str
    """

    def __init__(self, min_snr_dB, max_snr_dB, repetition, noise_dir):
        self._min_snr_dB = min_snr_dB
        self._max_snr_dB = max_snr_dB
        self.repetition = repetition
        self.noises_path = []
        if os.path.exists(noise_dir):
            for file in os.listdir(noise_dir):
                self.noises_path.append(os.path.join(noise_dir, file))

    def transform_audio(self, audio_segment: AudioSegment):
        """Add background noise audio.

        Note that this is an in-place transformation.

        :param audio_segment: Audio segment to add effects to.
        :type audio_segment: AudioSegment
        """
        if len(self.noises_path) > 0:
            for _ in range(random.randint(1, self.repetition)):
                # Randomly pick one file from noises_path
                noise_path = random.sample(self.noises_path, 1)[0]
                # Load the noise audio
                noise_segment = AudioSegment.from_file(noise_path)
                # Resample the noise if its sample rate differs from audio_segment's
                if noise_segment.sample_rate != audio_segment.sample_rate:
                    noise_segment.resample(audio_segment.sample_rate)
                # Draw a random snr_dB value
                snr_dB = random.uniform(self._min_snr_dB, self._max_snr_dB)
                # If the noise is shorter than audio_segment, pad it by wrapping around to its beginning
                if noise_segment.duration < audio_segment.duration:
                    diff_duration = audio_segment.num_samples - noise_segment.num_samples
                    noise_segment._samples = np.pad(noise_segment.samples, (0, diff_duration), 'wrap')
                # Add the noise to audio_segment at the drawn SNR
                audio_segment.add_noise(noise_segment, snr_dB)
31  mvector/data_utils/augmentor/resample.py  Normal file
@@ -0,0 +1,31 @@

"""Contains the resample augmentation model."""
import numpy as np

from mvector.data_utils.audio import AudioSegment

from mvector.data_utils.augmentor.base import AugmentorBase


class ResampleAugmentor(AugmentorBase):
    """Augmentation model for resampling.

    See more info here:
    https://ccrma.stanford.edu/~jos/resample/index.html

    :param new_sample_rate: New sample rates in Hz to choose from.
    :type new_sample_rate: list
    """

    def __init__(self, new_sample_rate: list):
        self._new_sample_rate = new_sample_rate

    def transform_audio(self, audio_segment: AudioSegment):
        """Resamples the input audio to a target sample rate.

        Note that this is an in-place transformation.

        :param audio_segment: Audio segment to add effects to.
        :type audio_segment: AudioSegment|SpeechSegment
        """
        _new_sample_rate = np.random.choice(self._new_sample_rate)
        audio_segment.resample(_new_sample_rate)
31  mvector/data_utils/augmentor/shift_perturb.py  Normal file
@@ -0,0 +1,31 @@

"""Contains the shift perturb augmentation model."""
import random

from mvector.data_utils.audio import AudioSegment

from mvector.data_utils.augmentor.base import AugmentorBase


class ShiftPerturbAugmentor(AugmentorBase):
    """Augmentation model that adds random shift perturbation.

    :param min_shift_ms: Minimal shift in milliseconds.
    :type min_shift_ms: float
    :param max_shift_ms: Maximal shift in milliseconds.
    :type max_shift_ms: float
    """

    def __init__(self, min_shift_ms, max_shift_ms):
        self._min_shift_ms = min_shift_ms
        self._max_shift_ms = max_shift_ms

    def transform_audio(self, audio_segment: AudioSegment):
        """Shift audio.

        Note that this is an in-place transformation.

        :param audio_segment: Audio segment to add effects to.
        :type audio_segment: AudioSegment|SpeechSegment
        """
        shift_ms = random.uniform(self._min_shift_ms, self._max_shift_ms)
        audio_segment.shift(shift_ms)
50  mvector/data_utils/augmentor/speed_perturb.py  Normal file
@@ -0,0 +1,50 @@

"""Contains the speed perturbation augmentation model."""
import random

import numpy as np
from mvector.data_utils.audio import AudioSegment

from mvector.data_utils.augmentor.base import AugmentorBase


class SpeedPerturbAugmentor(AugmentorBase):
    """Augmentation model that adds speed perturbation.

    See reference paper here:
    http://www.danielpovey.com/files/2015_interspeech_augmentation.pdf

    :param min_speed_rate: Lower bound of new speed rate to sample and should
                           not be smaller than 0.9.
    :type min_speed_rate: float
    :param max_speed_rate: Upper bound of new speed rate to sample and should
                           not be larger than 1.1.
    :type max_speed_rate: float
    """

    def __init__(self, min_speed_rate=0.9, max_speed_rate=1.1, num_rates=3):
        if min_speed_rate < 0.9:
            raise ValueError("Sampling speed below 0.9 can cause unnatural effects")
        if max_speed_rate > 1.1:
            raise ValueError("Sampling speed above 1.1 can cause unnatural effects")
        self._min_speed_rate = min_speed_rate
        self._max_speed_rate = max_speed_rate
        self._num_rates = num_rates
        if num_rates > 0:
            self._rates = np.linspace(self._min_speed_rate, self._max_speed_rate, self._num_rates, endpoint=True)

    def transform_audio(self, audio_segment: AudioSegment):
        """Sample a new speed rate from the given range and
        change the speed of the given audio clip.

        Note that this is an in-place transformation.

        :param audio_segment: Audio segment to add effects to.
        :type audio_segment: AudioSegment|SpeechSegment
        """
        # With num_rates <= 0 the rate is sampled continuously; otherwise it is
        # drawn from the precomputed discrete grid.
        if self._num_rates <= 0:
            speed_rate = random.uniform(self._min_speed_rate, self._max_speed_rate)
        else:
            speed_rate = random.choice(self._rates)

        if speed_rate == 1.0: return
        audio_segment.change_speed(speed_rate)
37
mvector/data_utils/augmentor/volume_perturb.py
Normal file
@ -0,0 +1,37 @@
"""Contains the volume perturb augmentation model."""
import random

from mvector.data_utils.audio import AudioSegment
from mvector.data_utils.augmentor.base import AugmentorBase


class VolumePerturbAugmentor(AugmentorBase):
    """Augmentation model that adds random volume perturbation.

    This is used for multi-loudness training of PCEN. See

    https://arxiv.org/pdf/1607.05666v1.pdf

    for more details.

    :param min_gain_dBFS: Minimal gain in dBFS.
    :type min_gain_dBFS: float
    :param max_gain_dBFS: Maximal gain in dBFS.
    :type max_gain_dBFS: float
    """

    def __init__(self, min_gain_dBFS, max_gain_dBFS):
        self._min_gain_dBFS = min_gain_dBFS
        self._max_gain_dBFS = max_gain_dBFS

    def transform_audio(self, audio_segment: AudioSegment):
        """Change audio loudness.

        Note that this is an in-place transformation.

        :param audio_segment: Audio segment to add effects to.
        :type audio_segment: AudioSegment|SpeechSegment
        """
        gain = random.uniform(self._min_gain_dBFS, self._max_gain_dBFS)
        audio_segment.gain_db(gain)
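The three augmentors above share the same in-place `transform_audio` interface, so they can be chained on one segment. A minimal usage sketch, assuming `AudioSegment.from_file` behaves as it is used in `reader.py` below and the wav path is just an example:

```python
from mvector.data_utils.audio import AudioSegment
from mvector.data_utils.augmentor.shift_perturb import ShiftPerturbAugmentor
from mvector.data_utils.augmentor.speed_perturb import SpeedPerturbAugmentor
from mvector.data_utils.augmentor.volume_perturb import VolumePerturbAugmentor

segment = AudioSegment.from_file('audio_db/test.wav')  # example path
for augmentor in [ShiftPerturbAugmentor(min_shift_ms=-5, max_shift_ms=5),
                  SpeedPerturbAugmentor(),
                  VolumePerturbAugmentor(min_gain_dBFS=-15, max_gain_dBFS=15)]:
    augmentor.transform_audio(segment)  # each call modifies the segment in place
```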
25
mvector/data_utils/collate_fn.py
Normal file
@ -0,0 +1,25 @@
import numpy as np
import torch


# Process one batch of data
def collate_fn(batch):
    # Sort by audio length, longest first
    batch = sorted(batch, key=lambda sample: sample[0].shape[0], reverse=True)
    max_audio_length = batch[0][0].shape[0]
    batch_size = len(batch)
    # Create a zero tensor sized to the longest audio
    inputs = np.zeros((batch_size, max_audio_length), dtype='float32')
    input_lens_ratio = []
    labels = []
    for x in range(batch_size):
        sample = batch[x]
        tensor = sample[0]
        labels.append(sample[1])
        seq_length = tensor.shape[0]
        # Copy the data into the zero tensor, which implements padding
        inputs[x, :seq_length] = tensor[:]
        input_lens_ratio.append(seq_length / max_audio_length)
    input_lens_ratio = np.array(input_lens_ratio, dtype='float32')
    labels = np.array(labels, dtype='int64')
    return torch.tensor(inputs), torch.tensor(labels), torch.tensor(input_lens_ratio)
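A quick sanity check of the padding behaviour, using toy numpy arrays in place of real audio samples:

```python
import numpy as np
from mvector.data_utils.collate_fn import collate_fn

batch = [(np.ones(5, dtype=np.float32), 0), (np.ones(3, dtype=np.float32), 1)]
inputs, labels, lens_ratio = collate_fn(batch)
print(inputs.shape)   # torch.Size([2, 5]); both clips padded to the longest
print(lens_ratio)     # tensor([1.0000, 0.6000])
```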
103
mvector/data_utils/featurizer.py
Normal file
@ -0,0 +1,103 @@
import torch
from torch import nn
from torchaudio.transforms import MelSpectrogram, Spectrogram, MFCC
import torchaudio.compliance.kaldi as Kaldi


class AudioFeaturizer(nn.Module):
    """Audio featurizer

    :param feature_method: the preprocessing method to use
    :type feature_method: str
    :param feature_conf: parameters of the preprocessing method
    :type feature_conf: dict
    """

    def __init__(self, feature_method='MelSpectrogram', feature_conf={}):
        super().__init__()
        self._feature_conf = feature_conf
        self._feature_method = feature_method
        if feature_method == 'MelSpectrogram':
            self.feat_fun = MelSpectrogram(**feature_conf)
        elif feature_method == 'Spectrogram':
            self.feat_fun = Spectrogram(**feature_conf)
        elif feature_method == 'MFCC':
            melkwargs = feature_conf.copy()
            del melkwargs['sample_rate']
            del melkwargs['n_mfcc']
            self.feat_fun = MFCC(sample_rate=self._feature_conf.sample_rate,
                                 n_mfcc=self._feature_conf.n_mfcc,
                                 melkwargs=melkwargs)
        elif feature_method == 'Fbank':
            self.feat_fun = KaldiFbank(**feature_conf)
        else:
            raise Exception(f'Preprocessing method {self._feature_method} does not exist!')

    def forward(self, waveforms, input_lens_ratio):
        """Extract audio features from an AudioSegment

        :param waveforms: Audio segment to extract features from.
        :type waveforms: AudioSegment
        :param input_lens_ratio: input length ratio
        :type input_lens_ratio: tensor
        :return: Spectrogram audio feature in 2darray.
        :rtype: ndarray
        """
        feature = self.feat_fun(waveforms)
        feature = feature.transpose(2, 1)
        # Normalize
        mean = torch.mean(feature, 1, keepdim=True)
        std = torch.std(feature, 1, keepdim=True)
        feature = (feature - mean) / (std + 1e-5)
        # Scale the length ratios up to frame counts for masking
        input_lens = (input_lens_ratio * feature.shape[1])
        mask_lens = torch.round(input_lens).long()
        mask_lens = mask_lens.unsqueeze(1)
        input_lens = input_lens.int()
        # Build the mask tensor
        idxs = torch.arange(feature.shape[1], device=feature.device).repeat(feature.shape[0], 1)
        mask = idxs < mask_lens
        mask = mask.unsqueeze(-1)
        # Apply the mask to the features
        feature_masked = torch.where(mask, feature, torch.zeros_like(feature))
        return feature_masked, input_lens

    @property
    def feature_dim(self):
        """Return the feature size

        :return: feature size
        :rtype: int
        """
        if self._feature_method == 'LogMelSpectrogram':
            return self._feature_conf.n_mels
        elif self._feature_method == 'MelSpectrogram':
            return self._feature_conf.n_mels
        elif self._feature_method == 'Spectrogram':
            return self._feature_conf.n_fft // 2 + 1
        elif self._feature_method == 'MFCC':
            return self._feature_conf.n_mfcc
        elif self._feature_method == 'Fbank':
            return self._feature_conf.num_mel_bins
        else:
            raise Exception('No preprocessing method named {}'.format(self._feature_method))


class KaldiFbank(nn.Module):
    def __init__(self, **kwargs):
        super(KaldiFbank, self).__init__()
        self.kwargs = kwargs

    def forward(self, waveforms):
        """
        :param waveforms: [Batch, Length]
        :return: [Batch, Feature, Length]
        """
        log_fbanks = []
        for waveform in waveforms:
            if len(waveform.shape) == 1:
                waveform = waveform.unsqueeze(0)
            log_fbank = Kaldi.fbank(waveform, **self.kwargs)
            log_fbank = log_fbank.transpose(0, 1)
            log_fbanks.append(log_fbank)
        log_fbank = torch.stack(log_fbanks)
        return log_fbank
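A minimal smoke test of the featurizer; the `feature_conf` values here are illustrative, not the project's shipped configuration:

```python
import torch
from mvector.data_utils.featurizer import AudioFeaturizer

featurizer = AudioFeaturizer(feature_method='MelSpectrogram',
                             feature_conf={'sample_rate': 16000, 'n_fft': 400, 'n_mels': 80})
waveforms = torch.randn(4, 16000)                 # four 1-second clips at 16 kHz
lens_ratio = torch.tensor([1.0, 1.0, 0.5, 0.25])  # padded clips get shorter ratios
features, input_lens = featurizer(waveforms, lens_ratio)
print(features.shape)                             # (4, num_frames, 80); padding frames are masked
```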
73
mvector/data_utils/reader.py
Normal file
@ -0,0 +1,73 @@
import numpy as np
from torch.utils.data import Dataset

from mvector.data_utils.audio import AudioSegment
from mvector.data_utils.augmentor.augmentation import AugmentationPipeline
from mvector.utils.logger import setup_logger

logger = setup_logger(__name__)


class CustomDataset(Dataset):
    def __init__(self,
                 data_list_path,
                 do_vad=True,
                 max_duration=6,
                 min_duration=0.5,
                 augmentation_config='{}',
                 mode='train',
                 sample_rate=16000,
                 use_dB_normalization=True,
                 target_dB=-20):
        """Audio data loader

        Args:
            data_list_path: path to the data list file containing audio paths and labels
            do_vad: whether to apply voice activity detection (VAD) to trim silence
            max_duration: maximum audio length; longer audio is cropped
            min_duration: filter out audio shorter than this length
            augmentation_config: configuration for audio augmentation
            mode: dataset mode; in training mode the dataset may apply augmentation preprocessing
            sample_rate: sample rate
            use_dB_normalization: whether to normalize the audio volume
            target_dB: target level for volume normalization
        """
        super(CustomDataset, self).__init__()
        self.do_vad = do_vad
        self.max_duration = max_duration
        self.min_duration = min_duration
        self.mode = mode
        self._target_sample_rate = sample_rate
        self._use_dB_normalization = use_dB_normalization
        self._target_dB = target_dB
        self._augmentation_pipeline = AugmentationPipeline(augmentation_config=augmentation_config)
        # Read the data list
        with open(data_list_path, 'r') as f:
            self.lines = f.readlines()

    def __getitem__(self, idx):
        # Split the audio path and label
        audio_path, label = self.lines[idx].replace('\n', '').split('\t')
        # Read the audio
        audio_segment = AudioSegment.from_file(audio_path)
        # Trim silence
        if self.do_vad:
            audio_segment.vad()
        # Audio that is too short is not useful for training
        if self.mode == 'train':
            if audio_segment.duration < self.min_duration:
                return self.__getitem__(idx + 1 if idx < len(self.lines) - 1 else 0)
        # Resample
        if audio_segment.sample_rate != self._target_sample_rate:
            audio_segment.resample(self._target_sample_rate)
        # Decibel normalization
        if self._use_dB_normalization:
            audio_segment.normalize(target_db=self._target_dB)
        # Crop to the required duration
        audio_segment.crop(duration=self.max_duration, mode=self.mode)
        # Audio augmentation
        self._augmentation_pipeline.transform_audio(audio_segment)
        return np.array(audio_segment.samples, dtype=np.float32), np.array(int(label), dtype=np.int64)

    def __len__(self):
        return len(self.lines)
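Each line of the data list pairs an audio path with an integer speaker label, separated by a tab. A wiring sketch with the `collate_fn` defined above; the list path is hypothetical:

```python
from torch.utils.data import DataLoader

from mvector.data_utils.collate_fn import collate_fn
from mvector.data_utils.reader import CustomDataset

# dataset/train_list.txt lines look like: path/to/audio.wav<TAB>0
dataset = CustomDataset(data_list_path='dataset/train_list.txt', mode='train')
loader = DataLoader(dataset, batch_size=32, collate_fn=collate_fn, shuffle=True)
for audio, label, lens_ratio in loader:
    print(audio.shape, label.shape, lens_ratio.shape)
    break
```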
179
mvector/data_utils/utils.py
Normal file
@ -0,0 +1,179 @@
import io
import itertools

import av
import librosa
import numpy as np
import torch


def vad(wav, top_db=10, overlap=200):
    """
    Remove the silent portions of an audio signal.

    Args:
        wav: audio data
        top_db: threshold (in dB) used to detect silence
        overlap: overlap length

    Returns:
        wav_output: audio data with silence removed
    """
    intervals = librosa.effects.split(wav, top_db=top_db)
    if len(intervals) == 0:
        return wav
    wav_output = [np.array([])]
    for sliced in intervals:
        seg = wav[sliced[0]:sliced[1]]
        if len(seg) < 2 * overlap:
            wav_output[-1] = np.concatenate((wav_output[-1], seg))
        else:
            wav_output.append(seg)
    wav_output = [x for x in wav_output if len(x) > 0]

    if len(wav_output) == 1:
        wav_output = wav_output[0]
    else:
        wav_output = concatenate(wav_output)
    return wav_output


def concatenate(wave, overlap=200):
    """
    Concatenate audio segments.

    Args:
        wave: audio data
        overlap: overlap length

    Returns:
        unfolded: the concatenated audio data
    """
    total_len = sum([len(x) for x in wave])
    unfolded = np.zeros(total_len)

    # Equal power crossfade
    window = np.hanning(2 * overlap)
    fade_in = window[:overlap]
    fade_out = window[-overlap:]

    end = total_len
    for i in range(1, len(wave)):
        prev = wave[i - 1]
        curr = wave[i]

        if i == 1:
            end = len(prev)
            unfolded[:end] += prev

        max_idx = 0
        max_corr = 0
        pattern = prev[-overlap:]
        # Slide the current segment to match the pattern of the previous one
        for j in range(overlap):
            match = curr[j:j + overlap]
            # Normalized cross-correlation (the original divided by a one-element
            # list here by mistake; parentheses are what was intended)
            corr = np.sum(pattern * match) / (np.sqrt(np.sum(pattern ** 2)) * np.sqrt(np.sum(match ** 2)) + 1e-8)
            if corr > max_corr:
                max_idx = j
                max_corr = corr

        # Apply the gain to the overlap samples
        start = end - overlap
        unfolded[start:end] *= fade_out
        end = start + (len(curr) - max_idx)
        curr[max_idx:max_idx + overlap] *= fade_in
        unfolded[start:end] += curr[max_idx:]
    return unfolded[:end]


def decode_audio(file, sample_rate: int = 16000):
    """Read audio; mainly used as a fallback reader that supports many formats

    Args:
        file: Path to the input file or a file-like object.
        sample_rate: Resample the audio to this sample rate.

    Returns:
        A float32 Numpy array.
    """
    resampler = av.audio.resampler.AudioResampler(format="s16", layout="mono", rate=sample_rate)

    raw_buffer = io.BytesIO()
    dtype = None

    with av.open(file, metadata_errors="ignore") as container:
        frames = container.decode(audio=0)
        frames = _ignore_invalid_frames(frames)
        frames = _group_frames(frames, 500000)
        frames = _resample_frames(frames, resampler)

        for frame in frames:
            array = frame.to_ndarray()
            dtype = array.dtype
            raw_buffer.write(array)

    audio = np.frombuffer(raw_buffer.getbuffer(), dtype=dtype)

    # Convert s16 back to f32.
    return audio.astype(np.float32) / 32768.0


def _ignore_invalid_frames(frames):
    iterator = iter(frames)

    while True:
        try:
            yield next(iterator)
        except StopIteration:
            break
        except av.error.InvalidDataError:
            continue


def _group_frames(frames, num_samples=None):
    fifo = av.audio.fifo.AudioFifo()

    for frame in frames:
        frame.pts = None  # Ignore timestamp check.
        fifo.write(frame)

        if num_samples is not None and fifo.samples >= num_samples:
            yield fifo.read()

    if fifo.samples > 0:
        yield fifo.read()


def _resample_frames(frames, resampler):
    # Add None to flush the resampler.
    for frame in itertools.chain(frames, [None]):
        yield from resampler.resample(frame)


# Convert an audio byte stream to numpy
def buf_to_float(x, n_bytes=2, dtype=np.float32):
    """Convert an integer buffer to floating point values.
    This is primarily useful when loading integer-valued wav data
    into numpy arrays.

    Parameters
    ----------
    x : np.ndarray [dtype=int]
        The integer-valued data buffer

    n_bytes : int [1, 2, 4]
        The number of bytes per sample in ``x``

    dtype : numeric type
        The target output type (default: 32-bit float)

    Returns
    -------
    x_float : np.ndarray [dtype=float]
        The input data buffer cast to floating point
    """

    # Invert the scale of the data
    scale = 1.0 / float(1 << ((8 * n_bytes) - 1))

    # Construct the format string
    fmt = "<i{:d}".format(n_bytes)

    # Rescale and format the data buffer
    return scale * np.frombuffer(x, fmt).astype(dtype)
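A quick check of `buf_to_float` on two extreme 16-bit PCM samples, which should map close to +1.0 and -1.0:

```python
import numpy as np
from mvector.data_utils.utils import buf_to_float

pcm_bytes = np.array([32767, -32768], dtype='<i2').tobytes()
print(buf_to_float(pcm_bytes))  # [ 0.99996948 -1.        ]
```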
0
mvector/metric/__init__.py
Normal file
BIN
mvector/metric/__pycache__/__init__.cpython-37.pyc
Normal file
Binary file not shown.
BIN
mvector/metric/__pycache__/metrics.cpython-37.pyc
Normal file
Binary file not shown.
63
mvector/metric/metrics.py
Normal file
@ -0,0 +1,63 @@
import numpy as np

from mvector.utils.logger import setup_logger

logger = setup_logger(__name__)


class TprAtFpr(object):
    def __init__(self, max_fpr=0.01):
        self.pos_score_list = []
        self.neg_score_list = []
        self.max_fpr = max_fpr

    def add(self, y_labels, y_scores):
        for y_label, y_score in zip(y_labels, y_scores):
            if y_label == 0:
                self.neg_score_list.append(y_score)
            else:
                self.pos_score_list.append(y_score)

    def reset(self):
        self.pos_score_list = []
        self.neg_score_list = []

    def calculate_eer(self, tprs, fprs):
        # Record every candidate EER value
        eer_list = []
        n = len(tprs)
        eer = 1.0
        index = 0
        for i in range(n):
            eer_list.append(fprs[i] + (1 - tprs[i]))
            if fprs[i] + (1 - tprs[i]) < eer:
                eer = fprs[i] + (1 - tprs[i])
                index = i
        return eer, index, eer_list

    def calculate(self):
        tprs, fprs, thresholds = [], [], []
        pos_score_list = np.array(self.pos_score_list)
        neg_score_list = np.array(self.neg_score_list)
        if len(pos_score_list) == 0:
            msg = "The number of positive samples is 0, please add positive samples."
            logger.warning(msg)
            return tprs, fprs, thresholds, None, None
        if len(neg_score_list) == 0:
            msg = "The number of negative samples is 0, please add negative samples."
            logger.warning(msg)
            return tprs, fprs, thresholds, None, None
        for i in range(0, 100):
            threshold = i / 100.
            tpr = np.sum(pos_score_list > threshold) / len(pos_score_list)
            fpr = np.sum(neg_score_list > threshold) / len(neg_score_list)
            tprs.append(tpr)
            fprs.append(fpr)
            thresholds.append(threshold)
        eer, index, eer_list = self.calculate_eer(fprs=fprs, tprs=tprs)

        # Print the metric value at every threshold in eer_list
        for i in range(len(eer_list)):
            print(f"threshold: {thresholds[i]}, eer: {eer_list[i]}")

        return tprs, fprs, thresholds, eer, index
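A toy run of the metric with made-up scores; note that `calculate` also prints one line per threshold:

```python
from mvector.metric.metrics import TprAtFpr

metric = TprAtFpr()
metric.add(y_labels=[1, 1, 0, 0], y_scores=[0.92, 0.71, 0.35, 0.64])
tprs, fprs, thresholds, eer, index = metric.calculate()
print(f'eer={eer:.2f} at threshold={thresholds[index]}')
```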
0
mvector/models/__init__.py
Normal file
BIN
mvector/models/__pycache__/__init__.cpython-37.pyc
Normal file
Binary file not shown.
BIN
mvector/models/__pycache__/ecapa_tdnn.cpython-37.pyc
Normal file
Binary file not shown.
BIN
mvector/models/__pycache__/fc.cpython-37.pyc
Normal file
Binary file not shown.
BIN
mvector/models/__pycache__/loss.cpython-37.pyc
Normal file
Binary file not shown.
BIN
mvector/models/__pycache__/pooling.cpython-37.pyc
Normal file
Binary file not shown.
189
mvector/models/ecapa_tdnn.py
Normal file
@ -0,0 +1,189 @@
import torch
import torch.nn as nn
import torch.nn.functional as F

from mvector.models.pooling import AttentiveStatsPool, TemporalAveragePooling
from mvector.models.pooling import SelfAttentivePooling, TemporalStatisticsPooling


class Res2Conv1dReluBn(nn.Module):
    def __init__(self, channels, kernel_size=1, stride=1, padding=0, dilation=1, bias=False, scale=4):
        super().__init__()
        assert channels % scale == 0, "{} % {} != 0".format(channels, scale)
        self.scale = scale
        self.width = channels // scale
        self.nums = scale if scale == 1 else scale - 1

        self.convs = []
        self.bns = []
        for i in range(self.nums):
            self.convs.append(nn.Conv1d(self.width, self.width, kernel_size, stride, padding, dilation, bias=bias))
            self.bns.append(nn.BatchNorm1d(self.width))
        self.convs = nn.ModuleList(self.convs)
        self.bns = nn.ModuleList(self.bns)

    def forward(self, x):
        out = []
        spx = torch.split(x, self.width, 1)
        # Iterate over each branch
        for i in range(self.nums):
            if i == 0:
                sp = spx[i]
            else:
                # Later branches add the current sub-feature to the previous output, forming a residual connection
                sp = sp + spx[i]
            # Order: conv -> relu -> bn
            sp = self.convs[i](sp)
            sp = self.bns[i](F.relu(sp))
            out.append(sp)
        if self.scale != 1:
            out.append(spx[self.nums])

        # Concatenate the results of all branches along the channel dimension
        out = torch.cat(out, dim=1)
        return out


class Conv1dReluBn(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, padding=0, dilation=1, bias=False):
        super().__init__()
        self.conv = nn.Conv1d(in_channels, out_channels, kernel_size, stride, padding, dilation, bias=bias)
        self.bn = nn.BatchNorm1d(out_channels)

    def forward(self, x):
        return self.bn(F.relu(self.conv(x)))


class SE_Connect(nn.Module):
    def __init__(self, channels, s=2):
        super().__init__()
        assert channels % s == 0, "{} % {} != 0".format(channels, s)
        self.linear1 = nn.Linear(channels, channels // s)
        self.linear2 = nn.Linear(channels // s, channels)

    def forward(self, x):
        out = x.mean(dim=2)
        out = F.relu(self.linear1(out))
        out = torch.sigmoid(self.linear2(out))
        out = x * out.unsqueeze(2)
        return out


def SE_Res2Block(channels, kernel_size, stride, padding, dilation, scale):
    """
    Build an SE-Res2 block.

    Parameters:
    - channels: number of channels.
    - kernel_size: convolution kernel size.
    - stride: convolution stride.
    - padding: convolution padding.
    - dilation: dilation rate of the dilated convolution.
    - scale: scale of the Res2 convolution.

    Returns:
    - An nn.Sequential of Conv1dReluBn, Res2Conv1dReluBn, Conv1dReluBn and SE_Connect.
    """
    return nn.Sequential(
        Conv1dReluBn(channels, channels, kernel_size=1, stride=1, padding=0),
        Res2Conv1dReluBn(channels, kernel_size, stride, padding, dilation, scale=scale),
        Conv1dReluBn(channels, channels, kernel_size=1, stride=1, padding=0),
        SE_Connect(channels)
    )


class EcapaTdnn(nn.Module):
    """
    ECAPA-TDNN speaker embedding model.

    Parameters:
    - input_size: input size; defaults to 80.
    - channels: number of channels; defaults to 512.
    - embd_dim: embedding dimension; defaults to 192.
    - pooling_type: pooling type; defaults to "ASP"; available values are "ASP", "SAP", "TAP" and "TSP".
    """
    def __init__(self, input_size=80, channels=512, embd_dim=192, pooling_type="ASP"):
        super().__init__()
        self.layer1 = Conv1dReluBn(input_size, channels, kernel_size=5, padding=2, dilation=1)
        self.layer2 = SE_Res2Block(channels, kernel_size=3, stride=1, padding=2, dilation=2, scale=8)
        self.layer3 = SE_Res2Block(channels, kernel_size=3, stride=1, padding=3, dilation=3, scale=8)
        self.layer4 = SE_Res2Block(channels, kernel_size=3, stride=1, padding=4, dilation=4, scale=8)

        cat_channels = channels * 3
        self.emb_size = embd_dim
        self.conv = nn.Conv1d(cat_channels, cat_channels, kernel_size=1)
        if pooling_type == "ASP":
            self.pooling = AttentiveStatsPool(cat_channels, 128)
            self.bn1 = nn.BatchNorm1d(cat_channels * 2)
            self.linear = nn.Linear(cat_channels * 2, embd_dim)
            self.bn2 = nn.BatchNorm1d(embd_dim)
        elif pooling_type == "SAP":
            self.pooling = SelfAttentivePooling(cat_channels, 128)
            self.bn1 = nn.BatchNorm1d(cat_channels)
            self.linear = nn.Linear(cat_channels, embd_dim)
            self.bn2 = nn.BatchNorm1d(embd_dim)
        elif pooling_type == "TAP":
            self.pooling = TemporalAveragePooling()
            self.bn1 = nn.BatchNorm1d(cat_channels)
            self.linear = nn.Linear(cat_channels, embd_dim)
            self.bn2 = nn.BatchNorm1d(embd_dim)
        elif pooling_type == "TSP":
            self.pooling = TemporalStatisticsPooling()
            self.bn1 = nn.BatchNorm1d(cat_channels * 2)
            self.linear = nn.Linear(cat_channels * 2, embd_dim)
            self.bn2 = nn.BatchNorm1d(embd_dim)
        else:
            raise Exception(f'There is no {pooling_type} pooling layer!')

    def forward(self, x):
        """
        Compute embeddings.

        Parameters:
            x (torch.Tensor): input data with shape (N, time, freq), where N is the number of samples,
                time is the time dimension and freq is the frequency dimension.

        Returns:
            torch.Tensor: output embeddings with shape (N, self.emb_size)
        """
        # Swap the frequency and time dimensions of the input
        x = x.transpose(2, 1)
        # Pass through the first convolution layer
        out1 = self.layer1(x)
        # Pass through the second layer and add the first layer's output
        out2 = self.layer2(out1) + out1
        # Pass through the third layer and add the outputs of the previous two layers
        out3 = self.layer3(out1 + out2) + out1 + out2
        # Pass through the fourth layer and add the outputs of the previous three layers
        out4 = self.layer4(out1 + out2 + out3) + out1 + out2 + out3

        # Concatenate the outputs of layers two, three and four along the feature dimension
        out = torch.cat([out2, out3, out4], dim=1)
        # Apply ReLU and pass through the 1x1 convolution
        out = F.relu(self.conv(out))
        # Batch normalization after pooling
        out = self.bn1(self.pooling(out))
        # Linear transform followed by batch normalization
        out = self.bn2(self.linear(out))
        return out
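A shape check of the backbone on random 80-dimensional features (98 frames, matching the summary call in `trainer.py` below):

```python
import torch
from mvector.models.ecapa_tdnn import EcapaTdnn

model = EcapaTdnn(input_size=80, embd_dim=192, pooling_type="ASP")
x = torch.randn(2, 98, 80)   # (N, time, freq)
print(model(x).shape)        # torch.Size([2, 192])
```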
90
mvector/models/fc.py
Normal file
@ -0,0 +1,90 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Parameter


class SpeakerIdetification(nn.Module):
    def __init__(
            self,
            backbone,
            num_class=1,
            loss_type='AAMLoss',
            lin_blocks=0,
            lin_neurons=192,
            dropout=0.1, ):

        """
        Initialize the speaker identification model, consisting of the speaker backbone network
        and a linear transform sized to the number of speaker classes used in training.

        Parameters:
            backbone (torch.nn.Module): the speaker embedding backbone model.
            num_class (int): the number of speaker classes in the training dataset.
            lin_blocks (int, optional): the number of linear transforms between the embedding and the final layer. Defaults to 0.
            lin_neurons (int, optional): the output dimension of the final linear layer. Defaults to 192.
            dropout (float, optional): the dropout factor applied to the embedding. Defaults to 0.1.
        """
        super(SpeakerIdetification, self).__init__()
        # Initialize the backbone network,
        # whose output is the target embedding
        self.backbone = backbone
        self.loss_type = loss_type
        if dropout > 0:
            self.dropout = nn.Dropout(dropout)
        else:
            self.dropout = None

        # Build the speaker classifier
        input_size = self.backbone.emb_size
        # Use nn.ModuleList (rather than a plain list) so the layers are registered as parameters
        self.blocks = nn.ModuleList()
        # Add linear transforms
        for i in range(lin_blocks):
            self.blocks.extend([
                nn.BatchNorm1d(input_size),
                nn.Linear(in_features=input_size, out_features=lin_neurons),
            ])
            input_size = lin_neurons

        # Final layer initialization
        if self.loss_type == 'AAMLoss':
            self.weight = Parameter(torch.FloatTensor(num_class, input_size), requires_grad=True)
            nn.init.xavier_normal_(self.weight, gain=1)
        elif self.loss_type == 'AMLoss' or self.loss_type == 'ARMLoss':
            self.weight = Parameter(torch.FloatTensor(input_size, num_class), requires_grad=True)
            nn.init.xavier_normal_(self.weight, gain=1)
        elif self.loss_type == 'CELoss':
            self.output = nn.Linear(input_size, num_class)
        else:
            raise Exception(f'There is no {self.loss_type} loss function!')

    def forward(self, x):
        """
        Run the forward pass of the speaker identification model,
        covering both the speaker embedding model and the classifier network.

        Parameters:
            x (torch.Tensor): input audio features,
                shape=[batch size, time, dimension]

        Returns:
            torch.Tensor: the logits of the features
        """
        # x.shape: (N, L, C)
        x = self.backbone(x)  # (N, emb_size)
        if self.dropout is not None:
            x = self.dropout(x)

        for fc in self.blocks:
            x = fc(x)
        if self.loss_type == 'AAMLoss':
            logits = F.linear(F.normalize(x), F.normalize(self.weight, dim=-1))
        elif self.loss_type == 'AMLoss' or self.loss_type == 'ARMLoss':
            x_norm = torch.norm(x, p=2, dim=1, keepdim=True).clamp(min=1e-12)
            x_norm = torch.div(x, x_norm)
            w_norm = torch.norm(self.weight, p=2, dim=0, keepdim=True).clamp(min=1e-12)
            w_norm = torch.div(self.weight, w_norm)
            logits = torch.mm(x_norm, w_norm)
        else:
            logits = self.output(x)

        return logits
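An end-to-end logits check combining the backbone with this head; with the default `AAMLoss` head the outputs are cosine similarities in [-1, 1]:

```python
import torch
from mvector.models.ecapa_tdnn import EcapaTdnn
from mvector.models.fc import SpeakerIdetification

model = SpeakerIdetification(backbone=EcapaTdnn(input_size=80), num_class=10)
logits = model(torch.randn(4, 98, 80))  # cosine logits
print(logits.shape)                     # torch.Size([4, 10])
```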
97
mvector/models/loss.py
Normal file
@ -0,0 +1,97 @@
import math

import torch
import torch.nn as nn
import torch.nn.functional as F


class AdditiveAngularMargin(nn.Module):
    def __init__(self, margin=0.0, scale=1.0, easy_margin=False):
        """The Implementation of Additive Angular Margin (AAM) proposed
        in the following paper: '''Margin Matters: Towards More Discriminative Deep Neural Network Embeddings for Speaker Recognition'''
        (https://arxiv.org/abs/1906.07317)

        Args:
            margin (float, optional): margin factor. Defaults to 0.0.
            scale (float, optional): scale factor. Defaults to 1.0.
            easy_margin (bool, optional): easy_margin flag. Defaults to False.
        """
        super(AdditiveAngularMargin, self).__init__()
        self.margin = margin
        self.scale = scale
        self.easy_margin = easy_margin

        self.cos_m = math.cos(self.margin)
        self.sin_m = math.sin(self.margin)
        self.th = math.cos(math.pi - self.margin)
        self.mm = math.sin(math.pi - self.margin) * self.margin

    def forward(self, outputs, targets):
        cosine = outputs.float()
        sine = torch.sqrt(1.0 - torch.pow(cosine, 2))
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin:
            phi = torch.where(cosine > 0, phi, cosine)
        else:
            phi = torch.where(cosine > self.th, phi, cosine - self.mm)
        outputs = (targets * phi) + ((1.0 - targets) * cosine)
        return self.scale * outputs


class AAMLoss(nn.Module):
    def __init__(self, margin=0.2, scale=30, easy_margin=False):
        super(AAMLoss, self).__init__()
        self.loss_fn = AdditiveAngularMargin(margin=margin, scale=scale, easy_margin=easy_margin)
        self.criterion = torch.nn.KLDivLoss(reduction="sum")

    def forward(self, outputs, targets):
        targets = F.one_hot(targets, outputs.shape[1]).float()
        predictions = self.loss_fn(outputs, targets)
        predictions = F.log_softmax(predictions, dim=1)
        loss = self.criterion(predictions, targets) / targets.sum()
        return loss


class AMLoss(nn.Module):
    def __init__(self, margin=0.2, scale=30):
        super(AMLoss, self).__init__()
        self.m = margin
        self.s = scale
        self.criterion = torch.nn.CrossEntropyLoss(reduction="sum")

    def forward(self, outputs, targets):
        label_view = targets.view(-1, 1)
        delt_costh = torch.zeros(outputs.size(), device=targets.device).scatter_(1, label_view, self.m)
        costh_m = outputs - delt_costh
        predictions = self.s * costh_m
        loss = self.criterion(predictions, targets) / targets.shape[0]
        return loss


class ARMLoss(nn.Module):
    def __init__(self, margin=0.2, scale=30):
        super(ARMLoss, self).__init__()
        self.m = margin
        self.s = scale
        self.criterion = torch.nn.CrossEntropyLoss(reduction="sum")

    def forward(self, outputs, targets):
        label_view = targets.view(-1, 1)
        delt_costh = torch.zeros(outputs.size(), device=targets.device).scatter_(1, label_view, self.m)
        costh_m = outputs - delt_costh
        costh_m_s = self.s * costh_m
        delt_costh_m_s = costh_m_s.gather(1, label_view).repeat(1, costh_m_s.size()[1])
        costh_m_s_reduct = costh_m_s - delt_costh_m_s
        predictions = torch.where(costh_m_s_reduct < 0.0, torch.zeros_like(costh_m_s), costh_m_s)
        loss = self.criterion(predictions, targets) / targets.shape[0]
        return loss


class CELoss(nn.Module):
    def __init__(self):
        super(CELoss, self).__init__()
        self.criterion = torch.nn.CrossEntropyLoss(reduction="sum")

    def forward(self, outputs, targets):
        loss = self.criterion(outputs, targets) / targets.shape[0]
        return loss
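A toy `AAMLoss` computation on random cosine-valued logits; values outside [-1, 1] would break the `sqrt(1 - cos^2)` step, so the inputs are drawn uniformly from that range:

```python
import torch
from mvector.models.loss import AAMLoss

criterion = AAMLoss(margin=0.2, scale=30)
logits = torch.empty(4, 10).uniform_(-1, 1)   # cosine similarities
labels = torch.tensor([0, 3, 7, 1])
print(criterion(logits, labels).item())
```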
75
mvector/models/pooling.py
Normal file
@ -0,0 +1,75 @@
import torch
import torch.nn as nn


class TemporalAveragePooling(nn.Module):
    def __init__(self):
        """TAP
        Paper: Multi-Task Learning with High-Order Statistics for X-vector based Text-Independent Speaker Verification
        Link: https://arxiv.org/pdf/1903.12058.pdf
        """
        super(TemporalAveragePooling, self).__init__()

    def forward(self, x):
        """Computes Temporal Average Pooling Module
        Args:
            x (torch.Tensor): Input tensor (#batch, channels, frames).
        Returns:
            torch.Tensor: Output tensor (#batch, channels)
        """
        x = torch.mean(x, dim=2)
        return x


class TemporalStatisticsPooling(nn.Module):
    def __init__(self):
        """TSP
        Paper: X-vectors: Robust DNN Embeddings for Speaker Recognition
        Link: http://www.danielpovey.com/files/2018_icassp_xvectors.pdf
        """
        super(TemporalStatisticsPooling, self).__init__()

    def forward(self, x):
        """Computes Temporal Statistics Pooling Module
        Args:
            x (torch.Tensor): Input tensor (#batch, channels, frames).
        Returns:
            torch.Tensor: Output tensor (#batch, channels*2)
        """
        mean = torch.mean(x, dim=2)
        var = torch.var(x, dim=2)
        x = torch.cat((mean, var), dim=1)
        return x


class SelfAttentivePooling(nn.Module):
    def __init__(self, in_dim, bottleneck_dim=128):
        # Use Conv1d with stride == 1 rather than Linear, then we don't need to transpose inputs.
        # attention dim = 128
        super(SelfAttentivePooling, self).__init__()
        self.linear1 = nn.Conv1d(in_dim, bottleneck_dim, kernel_size=1)  # equals W and b in the paper
        self.linear2 = nn.Conv1d(bottleneck_dim, in_dim, kernel_size=1)  # equals V and k in the paper

    def forward(self, x):
        # DON'T use ReLU here! In experiments, I find ReLU hard to converge.
        alpha = torch.tanh(self.linear1(x))
        alpha = torch.softmax(self.linear2(alpha), dim=2)
        mean = torch.sum(alpha * x, dim=2)
        return mean


class AttentiveStatsPool(nn.Module):
    def __init__(self, in_dim, bottleneck_dim=128):
        super().__init__()
        # Use Conv1d with stride == 1 rather than Linear, then we don't need to transpose inputs.
        self.linear1 = nn.Conv1d(in_dim, bottleneck_dim, kernel_size=1)  # equals W and b in the paper
        self.linear2 = nn.Conv1d(bottleneck_dim, in_dim, kernel_size=1)  # equals V and k in the paper

    def forward(self, x):
        # DON'T use ReLU here! In experiments, I find ReLU hard to converge.
        alpha = torch.tanh(self.linear1(x))
        alpha = torch.softmax(self.linear2(alpha), dim=2)
        mean = torch.sum(alpha * x, dim=2)
        residuals = torch.sum(alpha * x ** 2, dim=2) - mean ** 2
        std = torch.sqrt(residuals.clamp(min=1e-9))
        return torch.cat([mean, std], dim=1)
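The statistics-based poolings double the channel dimension, which is why `EcapaTdnn` sizes `bn1` and `linear` differently per pooling type. A quick output-dimension comparison on a random input:

```python
import torch
from mvector.models.pooling import (AttentiveStatsPool, SelfAttentivePooling,
                                    TemporalAveragePooling, TemporalStatisticsPooling)

x = torch.randn(2, 1536, 98)  # (#batch, channels, frames)
for pool in [TemporalAveragePooling(), TemporalStatisticsPooling(),
             SelfAttentivePooling(1536), AttentiveStatsPool(1536)]:
    print(type(pool).__name__, pool(x).shape)  # TAP/SAP -> 1536, TSP/ASP -> 3072
```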
189
mvector/predict.py
Normal file
@ -0,0 +1,189 @@
import os
import pickle
import shutil
from io import BufferedReader

import numpy as np
import torch
import yaml
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

from mvector import SUPPORT_MODEL
from mvector.data_utils.audio import AudioSegment
from mvector.data_utils.featurizer import AudioFeaturizer
from mvector.models.ecapa_tdnn import EcapaTdnn
from mvector.models.fc import SpeakerIdetification
from mvector.utils.logger import setup_logger
from mvector.utils.utils import dict_to_object, print_arguments

logger = setup_logger(__name__)


class MVectorPredictor:
    def __init__(self,
                 configs,
                 threshold=0.6,
                 model_path='models/ecapa_tdnn_FBank/best_model/',
                 use_gpu=True):
        """
        Voiceprint recognition prediction tool
        :param configs: configuration parameters
        :param threshold: threshold for deciding whether two voices belong to the same person
        :param model_path: path to the exported prediction model folder
        :param use_gpu: whether to use the GPU for prediction
        """
        if use_gpu:
            assert (torch.cuda.is_available()), 'GPU is not available'
            self.device = torch.device("cuda")
        else:
            os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
            self.device = torch.device("cpu")
        # Number of retrieval candidates
        self.cdd_num = 5
        self.threshold = threshold
        # Read the configuration file
        if isinstance(configs, str):
            with open(configs, 'r', encoding='utf-8') as f:
                configs = yaml.load(f.read(), Loader=yaml.FullLoader)
            # print_arguments(configs=configs)
        self.configs = dict_to_object(configs)
        assert 'max_duration' in self.configs.dataset_conf, \
            'Warning: you appear to be using an old configuration file. If you are also using an old model, this is an error; please re-download or retrain, otherwise roll back the code.'
        assert self.configs.use_model in SUPPORT_MODEL, f'Model does not exist: {self.configs.use_model}'
        self._audio_featurizer = AudioFeaturizer(feature_conf=self.configs.feature_conf, **self.configs.preprocess_conf)
        self._audio_featurizer.to(self.device)
        # Build the model
        if self.configs.use_model == 'EcapaTdnn' or self.configs.use_model == 'ecapa_tdnn':
            backbone = EcapaTdnn(input_size=self._audio_featurizer.feature_dim, **self.configs.model_conf)
        else:
            raise Exception(f'Model {self.configs.use_model} does not exist!')
        model = SpeakerIdetification(backbone=backbone, num_class=self.configs.dataset_conf.num_speakers)
        model.to(self.device)
        # Load the model
        if os.path.isdir(model_path):
            model_path = os.path.join(model_path, 'model.pt')
        assert os.path.exists(model_path), f"Model {model_path} does not exist!"
        if torch.cuda.is_available() and use_gpu:
            model_state_dict = torch.load(model_path)
        else:
            model_state_dict = torch.load(model_path, map_location='cpu')
        # Load the model parameters
        model.load_state_dict(model_state_dict)
        print(f"Successfully loaded model parameters: {model_path}")
        # Switch to evaluation mode
        model.eval()
        self.predictor = model.backbone
        # Voiceprint features of the enrollment database
        self.audio_feature = None

    def _load_audio(self, audio_data, sample_rate=16000):
        """Load audio
        :param audio_data: the data to recognize; supports file paths, file objects, bytes and numpy arrays. Bytes must be a complete byte file
        :param sample_rate: the sample rate, required when passing numpy data
        :return: the loaded and preprocessed audio segment
        """
        # Load the audio file and preprocess it
        if isinstance(audio_data, str):
            audio_segment = AudioSegment.from_file(audio_data)
        elif isinstance(audio_data, BufferedReader):
            audio_segment = AudioSegment.from_file(audio_data)
        elif isinstance(audio_data, np.ndarray):
            audio_segment = AudioSegment.from_ndarray(audio_data, sample_rate)
        elif isinstance(audio_data, bytes):
            audio_segment = AudioSegment.from_bytes(audio_data)
        else:
            raise Exception(f'Unsupported data type: {type(audio_data)}')
        assert audio_segment.duration >= self.configs.dataset_conf.min_duration, \
            f'The audio is too short; it should be at least {self.configs.dataset_conf.min_duration}s, but this audio is {audio_segment.duration}s'
        # Resample
        if audio_segment.sample_rate != self.configs.dataset_conf.sample_rate:
            audio_segment.resample(self.configs.dataset_conf.sample_rate)
        # Decibel normalization
        if self.configs.dataset_conf.use_dB_normalization:
            audio_segment.normalize(target_db=self.configs.dataset_conf.target_dB)
        return audio_segment

    def predict(self,
                audio_data,
                sample_rate=16000):
        """Predict the feature of one audio clip

        :param audio_data: the data to recognize; supports file paths, file objects, bytes and numpy arrays. Bytes must be a complete byte file with a format header
        :param sample_rate: the sample rate, required when passing numpy data
        :return: the voiceprint feature vector
        """
        # Load the audio file and preprocess it
        input_data = self._load_audio(audio_data=audio_data, sample_rate=sample_rate)
        input_data = torch.tensor(input_data.samples, dtype=torch.float32, device=self.device).unsqueeze(0)
        input_len_ratio = torch.tensor([1], dtype=torch.float32, device=self.device)
        audio_feature, _ = self._audio_featurizer(input_data, input_len_ratio)
        # Run prediction
        feature = self.predictor(audio_feature).data.cpu().numpy()[0]
        return feature

    def predict_batch(self, audios_data, sample_rate=16000):
        """Predict the features of a batch of audio clips

        :param audios_data: the data to recognize; supports file paths, file objects, bytes and numpy arrays. Bytes must be a complete byte file with a format header
        :param sample_rate: the sample rate, required when passing numpy data
        :return: the voiceprint feature vectors
        """
        audios_data1 = []
        for audio_data in audios_data:
            # Load the audio file and preprocess it
            input_data = self._load_audio(audio_data=audio_data, sample_rate=sample_rate)
            audios_data1.append(input_data.samples)
        # Find the longest audio
        batch = sorted(audios_data1, key=lambda a: a.shape[0], reverse=True)
        max_audio_length = batch[0].shape[0]
        batch_size = len(batch)
        # Create a zero tensor sized to the longest audio
        inputs = np.zeros((batch_size, max_audio_length), dtype='float32')
        input_lens_ratio = []
        for x in range(batch_size):
            tensor = audios_data1[x]
            seq_length = tensor.shape[0]
            # Copy the data into the zero tensor, which implements padding
            inputs[x, :seq_length] = tensor[:]
            input_lens_ratio.append(seq_length / max_audio_length)
        audios_data = torch.tensor(inputs, dtype=torch.float32, device=self.device)
        input_lens_ratio = torch.tensor(input_lens_ratio, dtype=torch.float32, device=self.device)
        audio_feature, _ = self._audio_featurizer(audios_data, input_lens_ratio)
        # Run prediction
        features = self.predictor(audio_feature).data.cpu().numpy()
        return features

    # Voiceprint comparison
    def contrast(self, audio_data1, audio_data2):
        feature1 = self.predict(audio_data1)
        feature2 = self.predict(audio_data2)
        # Cosine similarity
        dist = np.dot(feature1, feature2) / (np.linalg.norm(feature1) * np.linalg.norm(feature2))
        return dist

    def recognition(self, audio_data, threshold=None, sample_rate=16000):
        """Voiceprint recognition
        :param audio_data: the data to recognize; supports file paths, file objects, bytes and numpy arrays. Bytes must be a complete byte file
        :param threshold: decision threshold; if None, the threshold given when creating the object is used
        :param sample_rate: the sample rate, required when passing numpy data
        :return: the recognized user name, or None if no user was recognized
        """
        if threshold:
            self.threshold = threshold
        feature = self.predict(audio_data, sample_rate=sample_rate)
        name = self.__retrieval(np_feature=[feature])[0]
        return name

    def compare(self, feature1, feature2):
        """Voiceprint comparison

        :param feature1: feature 1
        :param feature2: feature 2
        :return: the cosine similarity of the two features
        """
        # Cosine similarity
        dist = np.dot(feature1, feature2) / (np.linalg.norm(feature1) * np.linalg.norm(feature2))
        return dist
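A sketch of predictor usage; the config path, model path and wav paths are placeholders, and the exact config fields depend on the YAML file you trained with:

```python
from mvector.predict import MVectorPredictor

predictor = MVectorPredictor(configs='configs/ecapa_tdnn.yml',       # placeholder config path
                             model_path='models/ecapa_tdnn_FBank/best_model/',
                             use_gpu=False)
score = predictor.contrast('audio_db/output1.wav', 'audio_db/output2.wav')
print('same speaker' if score > predictor.threshold else 'different speakers', score)
```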
483
mvector/trainer.py
Normal file
483
mvector/trainer.py
Normal file
@ -0,0 +1,483 @@
|
|||||||
|
import io
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import platform
|
||||||
|
import shutil
|
||||||
|
import time
|
||||||
|
from datetime import timedelta
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import torch
|
||||||
|
import torch.distributed as dist
|
||||||
|
import yaml
|
||||||
|
from torch.optim.lr_scheduler import CosineAnnealingLR
|
||||||
|
from torch.utils.data import DataLoader
|
||||||
|
from torch.utils.data.distributed import DistributedSampler
|
||||||
|
from torchinfo import summary
|
||||||
|
from tqdm import tqdm
|
||||||
|
from visualdl import LogWriter
|
||||||
|
|
||||||
|
from mvector import SUPPORT_MODEL, __version__
|
||||||
|
from mvector.data_utils.collate_fn import collate_fn
|
||||||
|
from mvector.data_utils.featurizer import AudioFeaturizer
|
||||||
|
from mvector.data_utils.reader import CustomDataset
|
||||||
|
from mvector.metric.metrics import TprAtFpr
|
||||||
|
from mvector.models.ecapa_tdnn import EcapaTdnn
|
||||||
|
from mvector.models.fc import SpeakerIdetification
|
||||||
|
from mvector.models.loss import AAMLoss, CELoss, AMLoss, ARMLoss
|
||||||
|
from mvector.utils.logger import setup_logger
|
||||||
|
from mvector.utils.utils import dict_to_object, print_arguments
|
||||||
|
|
||||||
|
logger = setup_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class MVectorTrainer(object):
|
||||||
|
def __init__(self, configs, use_gpu=True):
|
||||||
|
""" mvector集成工具类
|
||||||
|
|
||||||
|
:param configs: 配置字典
|
||||||
|
:param use_gpu: 是否使用GPU训练模型
|
||||||
|
"""
|
||||||
|
if use_gpu:
|
||||||
|
assert (torch.cuda.is_available()), 'GPU不可用'
|
||||||
|
self.device = torch.device("cuda")
|
||||||
|
else:
|
||||||
|
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
|
||||||
|
self.device = torch.device("cpu")
|
||||||
|
self.use_gpu = use_gpu
|
||||||
|
# 读取配置文件
|
||||||
|
if isinstance(configs, str):
|
||||||
|
with open(configs, 'r', encoding='utf-8') as f:
|
||||||
|
configs = yaml.load(f.read(), Loader=yaml.FullLoader)
|
||||||
|
print_arguments(configs=configs)
|
||||||
|
self.configs = dict_to_object(configs)
|
||||||
|
assert self.configs.use_model in SUPPORT_MODEL, f'没有该模型:{self.configs.use_model}'
|
||||||
|
self.model = None
|
||||||
|
self.test_loader = None
|
||||||
|
# 获取特征器
|
||||||
|
self.audio_featurizer = AudioFeaturizer(feature_conf=self.configs.feature_conf, **self.configs.preprocess_conf)
|
||||||
|
self.audio_featurizer.to(self.device)
|
||||||
|
|
||||||
|
if platform.system().lower() == 'windows':
|
||||||
|
self.configs.dataset_conf.num_workers = 0
|
||||||
|
logger.warning('Windows系统不支持多线程读取数据,已自动关闭!')
|
||||||
|
|
||||||
|
# 获取数据
|
||||||
|
def __setup_dataloader(self, augment_conf_path=None, is_train=False):
|
||||||
|
# 获取训练数据
|
||||||
|
if augment_conf_path is not None and os.path.exists(augment_conf_path) and is_train:
|
||||||
|
augmentation_config = io.open(augment_conf_path, mode='r', encoding='utf8').read()
|
||||||
|
else:
|
||||||
|
if augment_conf_path is not None and not os.path.exists(augment_conf_path):
|
||||||
|
logger.info('数据增强配置文件{}不存在'.format(augment_conf_path))
|
||||||
|
augmentation_config = '{}'
|
||||||
|
# 兼容旧的配置文件
|
||||||
|
if 'max_duration' not in self.configs.dataset_conf:
|
||||||
|
self.configs.dataset_conf.max_duration = self.configs.dataset_conf.chunk_duration
|
||||||
|
if is_train:
|
||||||
|
self.train_dataset = CustomDataset(data_list_path=self.configs.dataset_conf.train_list,
|
||||||
|
do_vad=self.configs.dataset_conf.do_vad,
|
||||||
|
max_duration=self.configs.dataset_conf.max_duration,
|
||||||
|
min_duration=self.configs.dataset_conf.min_duration,
|
||||||
|
augmentation_config=augmentation_config,
|
||||||
|
sample_rate=self.configs.dataset_conf.sample_rate,
|
||||||
|
use_dB_normalization=self.configs.dataset_conf.use_dB_normalization,
|
||||||
|
target_dB=self.configs.dataset_conf.target_dB,
|
||||||
|
mode='train')
|
||||||
|
train_sampler = None
|
||||||
|
if torch.cuda.device_count() > 1:
|
||||||
|
# 设置支持多卡训练
|
||||||
|
train_sampler = DistributedSampler(dataset=self.train_dataset)
|
||||||
|
self.train_loader = DataLoader(dataset=self.train_dataset,
|
||||||
|
collate_fn=collate_fn,
|
||||||
|
shuffle=(train_sampler is None),
|
||||||
|
batch_size=self.configs.dataset_conf.batch_size,
|
||||||
|
sampler=train_sampler,
|
||||||
|
num_workers=self.configs.dataset_conf.num_workers)
|
||||||
|
# 获取测试数据
|
||||||
|
self.test_dataset = CustomDataset(data_list_path=self.configs.dataset_conf.test_list,
|
||||||
|
do_vad=self.configs.dataset_conf.do_vad,
|
||||||
|
max_duration=self.configs.dataset_conf.max_duration,
|
||||||
|
min_duration=self.configs.dataset_conf.min_duration,
|
||||||
|
sample_rate=self.configs.dataset_conf.sample_rate,
|
||||||
|
use_dB_normalization=self.configs.dataset_conf.use_dB_normalization,
|
||||||
|
target_dB=self.configs.dataset_conf.target_dB,
|
||||||
|
mode='eval')
|
||||||
|
self.test_loader = DataLoader(dataset=self.test_dataset,
|
||||||
|
batch_size=self.configs.dataset_conf.batch_size,
|
||||||
|
collate_fn=collate_fn,
|
||||||
|
num_workers=self.configs.dataset_conf.num_workers)
|
||||||
|
|
||||||
|
def __setup_model(self, input_size, is_train=False):
|
||||||
|
|
||||||
|
use_loss = self.configs.get('use_loss', 'AAMLoss')
|
||||||
|
# 获取模型
|
||||||
|
if self.configs.use_model == 'EcapaTdnn' or self.configs.use_model == 'ecapa_tdnn':
|
||||||
|
backbone = EcapaTdnn(input_size=input_size, **self.configs.model_conf)
|
||||||
|
else:
|
||||||
|
raise Exception(f'{self.configs.use_model} 模型不存在!')
|
||||||
|
|
||||||
|
self.model = SpeakerIdetification(backbone=backbone,
|
||||||
|
num_class=self.configs.dataset_conf.num_speakers,
|
||||||
|
loss_type=use_loss)
|
||||||
|
self.model.to(self.device)
|
||||||
|
# 打印模型信息
|
||||||
|
summary(self.model, (1, 98, self.audio_featurizer.feature_dim))
|
||||||
|
# print(self.model)
|
||||||
|
# 获取损失函数
|
||||||
|
if use_loss == 'AAMLoss':
|
||||||
|
self.loss = AAMLoss()
|
||||||
|
elif use_loss == 'AMLoss':
|
||||||
|
self.loss = AMLoss()
|
||||||
|
elif use_loss == 'ARMLoss':
|
||||||
|
self.loss = ARMLoss()
|
||||||
|
elif use_loss == 'CELoss':
|
||||||
|
self.loss = CELoss()
|
||||||
|
else:
|
||||||
|
raise Exception(f'没有{use_loss}损失函数!')
|
||||||
|
if is_train:
|
||||||
|
# 获取优化方法
|
||||||
|
optimizer = self.configs.optimizer_conf.optimizer
|
||||||
|
if optimizer == 'Adam':
|
||||||
|
self.optimizer = torch.optim.Adam(params=self.model.parameters(),
|
||||||
|
lr=float(self.configs.optimizer_conf.learning_rate),
|
||||||
|
weight_decay=float(self.configs.optimizer_conf.weight_decay))
|
||||||
|
elif optimizer == 'AdamW':
|
||||||
|
self.optimizer = torch.optim.AdamW(params=self.model.parameters(),
|
||||||
|
lr=float(self.configs.optimizer_conf.learning_rate),
|
||||||
|
weight_decay=float(self.configs.optimizer_conf.weight_decay))
|
||||||
|
elif optimizer == 'SGD':
|
||||||
|
self.optimizer = torch.optim.SGD(params=self.model.parameters(),
|
||||||
|
momentum=self.configs.optimizer_conf.momentum,
|
||||||
|
lr=float(self.configs.optimizer_conf.learning_rate),
|
||||||
|
weight_decay=float(self.configs.optimizer_conf.weight_decay))
|
||||||
|
else:
|
||||||
|
raise Exception(f'不支持优化方法:{optimizer}')
|
||||||
|
# 学习率衰减函数
|
||||||
|
self.scheduler = CosineAnnealingLR(self.optimizer, T_max=int(self.configs.train_conf.max_epoch * 1.2))
|
||||||
|
|
||||||
|
def __load_pretrained(self, pretrained_model):
|
||||||
|
# 加载预训练模型
|
||||||
|
if pretrained_model is not None:
|
||||||
|
if os.path.isdir(pretrained_model):
|
||||||
|
pretrained_model = os.path.join(pretrained_model, 'model.pt')
|
||||||
|
assert os.path.exists(pretrained_model), f"{pretrained_model} 模型不存在!"
|
||||||
|
if isinstance(self.model, torch.nn.parallel.DistributedDataParallel):
|
||||||
|
model_dict = self.model.module.state_dict()
|
||||||
|
else:
|
||||||
|
model_dict = self.model.state_dict()
|
||||||
|
model_state_dict = torch.load(pretrained_model)
|
||||||
|
# 过滤不存在的参数
|
||||||
|
for name, weight in model_dict.items():
|
||||||
|
if name in model_state_dict.keys():
|
||||||
|
if list(weight.shape) != list(model_state_dict[name].shape):
|
||||||
|
logger.warning('{} not used, shape {} unmatched with {} in model.'.
|
||||||
|
format(name, list(model_state_dict[name].shape), list(weight.shape)))
|
||||||
|
model_state_dict.pop(name, None)
|
||||||
|
else:
|
||||||
|
logger.warning('Lack weight: {}'.format(name))
|
||||||
|
if isinstance(self.model, torch.nn.parallel.DistributedDataParallel):
|
||||||
|
self.model.module.load_state_dict(model_state_dict, strict=False)
|
||||||
|
else:
|
||||||
|
self.model.load_state_dict(model_state_dict, strict=False)
|
||||||
|
logger.info('成功加载预训练模型:{}'.format(pretrained_model))
|
||||||
|
|
||||||
|
    def __load_checkpoint(self, save_model_path, resume_model):
        # Resume training from a saved checkpoint
        last_epoch = -1
        best_eer = 1
        last_model_dir = os.path.join(save_model_path,
                                      f'{self.configs.use_model}_{self.configs.preprocess_conf.feature_method}',
                                      'last_model')
        if resume_model is not None or (os.path.exists(os.path.join(last_model_dir, 'model.pt'))
                                        and os.path.exists(os.path.join(last_model_dir, 'optimizer.pt'))):
            # Automatically pick up the most recently saved model
            if resume_model is None:
                resume_model = last_model_dir
            assert os.path.exists(os.path.join(resume_model, 'model.pt')), "Model parameter file does not exist!"
            assert os.path.exists(os.path.join(resume_model, 'optimizer.pt')), "Optimizer parameter file does not exist!"
            state_dict = torch.load(os.path.join(resume_model, 'model.pt'))
            if isinstance(self.model, torch.nn.parallel.DistributedDataParallel):
                self.model.module.load_state_dict(state_dict)
            else:
                self.model.load_state_dict(state_dict)
            self.optimizer.load_state_dict(torch.load(os.path.join(resume_model, 'optimizer.pt')))
            with open(os.path.join(resume_model, 'model.state'), 'r', encoding='utf-8') as f:
                json_data = json.load(f)
                last_epoch = json_data['last_epoch'] - 1
                best_eer = json_data['eer']
            logger.info('Successfully restored model and optimizer parameters: {}'.format(resume_model))
        return last_epoch, best_eer

    # Save a checkpoint
    def __save_checkpoint(self, save_model_path, epoch_id, best_eer=0., best_model=False):
        if isinstance(self.model, torch.nn.parallel.DistributedDataParallel):
            state_dict = self.model.module.state_dict()
        else:
            state_dict = self.model.state_dict()
        if best_model:
            model_path = os.path.join(save_model_path,
                                      f'{self.configs.use_model}_{self.configs.preprocess_conf.feature_method}',
                                      'best_model')
        else:
            model_path = os.path.join(save_model_path,
                                      f'{self.configs.use_model}_{self.configs.preprocess_conf.feature_method}',
                                      'epoch_{}'.format(epoch_id))
        os.makedirs(model_path, exist_ok=True)
        torch.save(self.optimizer.state_dict(), os.path.join(model_path, 'optimizer.pt'))
        torch.save(state_dict, os.path.join(model_path, 'model.pt'))
        with open(os.path.join(model_path, 'model.state'), 'w', encoding='utf-8') as f:
            data = {"last_epoch": epoch_id, "eer": best_eer, "version": __version__}
            f.write(json.dumps(data))
        if not best_model:
            last_model_path = os.path.join(save_model_path,
                                           f'{self.configs.use_model}_{self.configs.preprocess_conf.feature_method}',
                                           'last_model')
            shutil.rmtree(last_model_path, ignore_errors=True)
            shutil.copytree(model_path, last_model_path)
            # Delete stale checkpoints
            old_model_path = os.path.join(save_model_path,
                                          f'{self.configs.use_model}_{self.configs.preprocess_conf.feature_method}',
                                          'epoch_{}'.format(epoch_id - 3))
            if os.path.exists(old_model_path):
                shutil.rmtree(old_model_path)
        logger.info('Model saved: {}'.format(model_path))

    def __train_epoch(self, epoch_id, save_model_path, local_rank, writer, nranks=0):
        # Train for one epoch
        train_times, accuracies, loss_sum = [], [], []
        start = time.time()
        sum_batch = len(self.train_loader) * self.configs.train_conf.max_epoch
        for batch_id, (audio, label, input_lens_ratio) in enumerate(self.train_loader):
            if nranks > 1:
                audio = audio.to(local_rank)
                input_lens_ratio = input_lens_ratio.to(local_rank)
                label = label.to(local_rank).long()
            else:
                audio = audio.to(self.device)
                input_lens_ratio = input_lens_ratio.to(self.device)
                label = label.to(self.device).long()

            # Extract acoustic features (e.g. MFCC) from the raw audio
            features, _ = self.audio_featurizer(audio, input_lens_ratio)
            output = self.model(features)
            # Compute the loss
            los = self.loss(output, label)
            self.optimizer.zero_grad()
            los.backward()
            self.optimizer.step()

            # Compute the accuracy
            output = torch.nn.functional.softmax(output, dim=-1)
            output = output.data.cpu().numpy()
            output = np.argmax(output, axis=1)
            label = label.data.cpu().numpy()
            acc = np.mean((output == label).astype(int))
            accuracies.append(acc)
            loss_sum.append(los.item())  # detach the scalar so the graph is not kept for the whole epoch
            train_times.append((time.time() - start) * 1000)

            # In multi-GPU training, only one process logs
            if batch_id % self.configs.train_conf.log_interval == 0 and local_rank == 0:
                # Training throughput, in samples per second
                train_speed = self.configs.dataset_conf.batch_size / (sum(train_times) / len(train_times) / 1000)
                # Estimated time remaining
                eta_sec = (sum(train_times) / len(train_times)) * (
                        sum_batch - (epoch_id - 1) * len(self.train_loader) - batch_id)
                eta_str = str(timedelta(seconds=int(eta_sec / 1000)))
                logger.info(f'Train epoch: [{epoch_id}/{self.configs.train_conf.max_epoch}], '
                            f'batch: [{batch_id}/{len(self.train_loader)}], '
                            f'loss: {sum(loss_sum) / len(loss_sum):.5f}, '
                            f'accuracy: {sum(accuracies) / len(accuracies):.5f}, '
                            f'learning rate: {self.scheduler.get_last_lr()[0]:>.8f}, '
                            f'speed: {train_speed:.2f} data/sec, eta: {eta_str}')
                writer.add_scalar('Train/Loss', sum(loss_sum) / len(loss_sum), self.train_step)
                writer.add_scalar('Train/Accuracy', (sum(accuracies) / len(accuracies)), self.train_step)
                # Log the learning rate
                writer.add_scalar('Train/lr', self.scheduler.get_last_lr()[0], self.train_step)
                self.train_step += 1
                train_times = []
            # Also save a checkpoint every fixed number of steps
            if batch_id % 10000 == 0 and batch_id != 0 and local_rank == 0:
                self.__save_checkpoint(save_model_path=save_model_path, epoch_id=epoch_id)
            start = time.time()
        self.scheduler.step()

    def train(self,
              save_model_path='models/',
              resume_model=None,
              pretrained_model=None,
              augment_conf_path='configs/augmentation.json'):
        """
        Train the model
        :param save_model_path: directory to save models to
        :param resume_model: checkpoint to resume training from; if None, the most recent checkpoint is picked up automatically when one exists
        :param pretrained_model: path to a pretrained model; if None, no pretrained model is used
        :param augment_conf_path: data augmentation configuration file, in JSON format
        """
        # Number of GPUs available for training
        nranks = torch.cuda.device_count()
        local_rank = 0
        writer = None
        if local_rank == 0:
            # Logging writer
            writer = LogWriter(logdir='log')

        if nranks > 1 and self.use_gpu:
            # Initialize the NCCL environment
            dist.init_process_group(backend='nccl')
            local_rank = int(os.environ["LOCAL_RANK"])
        # Prepare the data
        self.__setup_dataloader(augment_conf_path=augment_conf_path, is_train=True)
        # Prepare the model
        self.__setup_model(input_size=self.audio_featurizer.feature_dim, is_train=True)

        # Support multi-GPU training
        if nranks > 1 and self.use_gpu:
            self.model.to(local_rank)
            self.audio_featurizer.to(local_rank)
            self.model = torch.nn.parallel.DistributedDataParallel(self.model, device_ids=[local_rank])
        logger.info('Training samples: {}'.format(len(self.train_dataset)))

        self.__load_pretrained(pretrained_model=pretrained_model)
        # Resume from a checkpoint if one exists
        last_epoch, best_eer = self.__load_checkpoint(save_model_path=save_model_path, resume_model=resume_model)
        if last_epoch > 0:
            self.optimizer.step()
            for _ in range(last_epoch):
                self.scheduler.step()

        test_step, self.train_step = 0, 0
        last_epoch += 1
        if local_rank == 0:
            writer.add_scalar('Train/lr', self.scheduler.get_last_lr()[0], last_epoch)
        # Start training
        for epoch_id in range(last_epoch, self.configs.train_conf.max_epoch):
            epoch_id += 1
            start_epoch = time.time()
            # Train one epoch
            self.__train_epoch(epoch_id=epoch_id, save_model_path=save_model_path, local_rank=local_rank,
                               writer=writer, nranks=nranks)
            # In multi-GPU training, only one process evaluates and saves models
            if local_rank == 0:
                logger.info('=' * 70)
                tpr, fpr, eer, threshold = self.evaluate(resume_model=None)
                logger.info('Test epoch: {}, time/epoch: {}, threshold: {:.2f}, tpr: {:.5f}, fpr: {:.5f}, '
                            'eer: {:.5f}'.format(epoch_id, str(timedelta(
                    seconds=(time.time() - start_epoch))), threshold, tpr, fpr, eer))
                logger.info('=' * 70)
                writer.add_scalar('Test/threshold', threshold, test_step)
                writer.add_scalar('Test/tpr', tpr, test_step)
                writer.add_scalar('Test/fpr', fpr, test_step)
                writer.add_scalar('Test/eer', eer, test_step)
                test_step += 1
                self.model.train()
                # Save the best model
                if eer <= best_eer:
                    best_eer = eer
                    self.__save_checkpoint(save_model_path=save_model_path, epoch_id=epoch_id, best_eer=eer,
                                           best_model=True)
                # Save a regular checkpoint
                self.__save_checkpoint(save_model_path=save_model_path, epoch_id=epoch_id, best_eer=eer)

    def evaluate(self, resume_model='models/EcapaTdnn_MFCC/best_model/', save_image_path=None):
        """
        Evaluate the model
        :param resume_model: model to evaluate
        :param save_image_path: directory to save the tpr/fpr result plot to
        :return: evaluation results
        """
        if self.test_loader is None:
            self.__setup_dataloader()
        if self.model is None:
            self.__setup_model(input_size=self.audio_featurizer.feature_dim)
        if resume_model is not None:
            if os.path.isdir(resume_model):
                resume_model = os.path.join(resume_model, 'model.pt')
            assert os.path.exists(resume_model), f"{resume_model} does not exist!"
            model_state_dict = torch.load(resume_model)
            self.model.load_state_dict(model_state_dict)
            logger.info(f'Successfully loaded model: {resume_model}')
        self.model.eval()
        if isinstance(self.model, torch.nn.parallel.DistributedDataParallel):
            eval_model = self.model.module
        else:
            eval_model = self.model

        features, labels = None, None
        losses = []
        with torch.no_grad():
            for batch_id, (audio, label, input_lens_ratio) in enumerate(tqdm(self.test_loader)):
                audio = audio.to(self.device)
                input_lens_ratio = input_lens_ratio.to(self.device)
                label = label.to(self.device).long()
                audio_features, _ = self.audio_featurizer(audio, input_lens_ratio)
                # logits = eval_model(audio_features)
                # loss = self.loss(logits, label)  # note: the loss takes the logits, not the extracted embeddings
                # losses.append(loss.item())
                feature = eval_model.backbone(audio_features).data.cpu().numpy()
                label = label.data.cpu().numpy()
                # Accumulate the embeddings
                features = np.concatenate((features, feature)) if features is not None else feature
                labels = np.concatenate((labels, label)) if labels is not None else label
        # print('Test loss: {:.5f}'.format(sum(losses) / len(losses)))
        self.model.train()
        metric = TprAtFpr()
        labels = labels.astype(np.int32)
        print('Comparing audio features pairwise...')
        for i in tqdm(range(len(features))):
            feature_1 = features[i]
            feature_1 = np.expand_dims(feature_1, 0).repeat(len(features) - i, axis=0)
            feature_2 = features[i:]
            feature_1 = torch.tensor(feature_1, dtype=torch.float32)
            feature_2 = torch.tensor(feature_2, dtype=torch.float32)
            score = torch.nn.functional.cosine_similarity(feature_1, feature_2, dim=-1).data.cpu().numpy().tolist()
            y_true = np.array(labels[i] == labels[i:]).astype(np.int32).tolist()
            metric.add(y_true, score)
        tprs, fprs, thresholds, eer, index = metric.calculate()
        tpr, fpr, threshold = tprs[index], fprs[index], thresholds[index]
        if save_image_path:
            import matplotlib.pyplot as plt
            plt.plot(thresholds, tprs, color='blue', linestyle='-', label='tpr')
            plt.plot(thresholds, fprs, color='red', linestyle='-', label='fpr')
            plt.plot(threshold, tpr, 'bo-')
            plt.text(threshold, tpr, (threshold, round(tpr, 5)), color='blue')
            plt.plot(threshold, fpr, 'ro-')
            plt.text(threshold, fpr, (threshold, round(fpr, 5)), color='red')
            plt.xlabel('threshold')
            plt.title('tpr and fpr')
            plt.grid(True)  # show grid lines
            # Save the plot
            os.makedirs(save_image_path, exist_ok=True)
            plt.savefig(os.path.join(save_image_path, 'result.png'))
            logger.info(f"Result plot saved to: {os.path.join(save_image_path, 'result.png')}")
        return tpr, fpr, eer, threshold

    def export(self, save_model_path='models/', resume_model='models/EcapaTdnn_MelSpectrogram/best_model/'):
        """
        Export an inference model
        :param save_model_path: directory to save the model to
        :param resume_model: path of the model to convert
        :return:
        """
        # Prepare the model
        self.__setup_model(input_size=self.audio_featurizer.feature_dim)
        # Load the trained weights
        if os.path.isdir(resume_model):
            resume_model = os.path.join(resume_model, 'model.pt')
        assert os.path.exists(resume_model), f"{resume_model} does not exist!"
        model_state_dict = torch.load(resume_model)
        self.model.load_state_dict(model_state_dict)
        logger.info('Successfully restored model parameters: {}'.format(resume_model))
        self.model.eval()
        # Build a static (TorchScript) model
        infer_model = torch.jit.script(self.model.backbone)
        infer_model_path = os.path.join(save_model_path,
                                        f'{self.configs.use_model}_{self.configs.preprocess_conf.feature_method}',
                                        'inference.pt')
        os.makedirs(os.path.dirname(infer_model_path), exist_ok=True)
        torch.jit.save(infer_model, infer_model_path)
        logger.info("Inference model saved: {}".format(infer_model_path))
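
The trainer above exposes `train`, `evaluate`, and `export` as its public surface. A minimal sketch of driving the latter two from a script, assuming a config at `configs/ecapa_tdnn.yml` and a trained checkpoint under `models/EcapaTdnn_MFCC/best_model/` (both paths are illustrative):

```python
from mvector.trainer import MVectorTrainer

# Build the trainer from the same YAML config used for training
trainer = MVectorTrainer(configs='configs/ecapa_tdnn.yml', use_gpu=True)

# Evaluate a trained checkpoint; also saves a tpr/fpr-vs-threshold plot
tpr, fpr, eer, threshold = trainer.evaluate(resume_model='models/EcapaTdnn_MFCC/best_model/',
                                            save_image_path='output/images/')
print(f'eer={eer:.5f} at threshold={threshold:.2f}')

# Export the backbone as a TorchScript model for deployment
trainer.export(save_model_path='models/', resume_model='models/EcapaTdnn_MFCC/best_model/')
```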
0
mvector/utils/__init__.py
Normal file
BIN
mvector/utils/__pycache__/__init__.cpython-37.pyc
Normal file
Binary file not shown.
BIN
mvector/utils/__pycache__/logger.cpython-37.pyc
Normal file
Binary file not shown.
BIN
mvector/utils/__pycache__/utils.cpython-37.pyc
Normal file
Binary file not shown.
89
mvector/utils/logger.py
Normal file
@ -0,0 +1,89 @@
import datetime
import logging
import os
import sys
import termcolor

__all__ = ['setup_logger']

logger_initialized = []


def setup_logger(name, output=None):
    """
    Initialize logger and set its verbosity level to INFO.
    Args:
        output (str): a file name or a directory to save log. If None, will not save log file.
            If ends with ".txt" or ".log", assumed to be a file name.
            Otherwise, logs will be saved to `output/log.txt`.
        name (str): the root module name of this logger

    Returns:
        logging.Logger: a logger
    """
    logger = logging.getLogger(name)
    if name in logger_initialized:
        return logger

    logger.setLevel(logging.INFO)
    logger.propagate = False

    formatter = ("[%(asctime2)s %(levelname2)s] %(module2)s:%(funcName2)s:%(lineno2)s - %(message2)s")
    color_formatter = ColoredFormatter(formatter, datefmt="%m/%d %H:%M:%S")

    ch = logging.StreamHandler(stream=sys.stdout)
    ch.setLevel(logging.DEBUG)
    ch.setFormatter(color_formatter)
    logger.addHandler(ch)

    # file logging: all workers
    if output is not None:
        if output.endswith(".txt") or output.endswith(".log"):
            filename = output
        else:
            filename = os.path.join(output, "log.txt")
        # Create the directory if needed; do not fail when it already exists
        dirname = os.path.dirname(filename)
        if dirname:
            os.makedirs(dirname, exist_ok=True)
        fh = logging.FileHandler(filename, mode='a')
        fh.setLevel(logging.DEBUG)
        fh.setFormatter(logging.Formatter())
        logger.addHandler(fh)
    logger_initialized.append(name)
    return logger


COLORS = {
    "WARNING": "yellow",
    "INFO": "white",
    "DEBUG": "blue",
    "CRITICAL": "red",
    "ERROR": "red",
}


class ColoredFormatter(logging.Formatter):
    def __init__(self, fmt, datefmt, use_color=True):
        logging.Formatter.__init__(self, fmt, datefmt=datefmt)
        self.use_color = use_color

    def format(self, record):
        levelname = record.levelname
        if self.use_color and levelname in COLORS:

            def colored(text):
                return termcolor.colored(
                    text,
                    color=COLORS[levelname],
                    attrs={"bold": True},
                )

            record.levelname2 = colored("{:<7}".format(record.levelname))
            record.message2 = colored(record.msg)

            asctime2 = datetime.datetime.fromtimestamp(record.created)
            record.asctime2 = termcolor.colored(asctime2, color="green")

            record.module2 = termcolor.colored(record.module, color="cyan")
            record.funcName2 = termcolor.colored(record.funcName, color="cyan")
            record.lineno2 = termcolor.colored(record.lineno, color="cyan")
        return logging.Formatter.format(self, record)
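
A short usage sketch for `setup_logger` (the logger name and output directory are illustrative):

```python
from mvector.utils.logger import setup_logger

# Console-only colored logging
logger = setup_logger(__name__)
logger.info('hello')

# Additionally append to a file; a bare directory becomes `<dir>/log.txt`
file_logger = setup_logger('train', output='log')
file_logger.warning('low disk space')
```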
31
mvector/utils/record.py
Normal file
@ -0,0 +1,31 @@
import os

import soundcard
import soundfile


class RecordAudio:
    def __init__(self, channels=1, sample_rate=16000):
        # Recording parameters
        self.channels = channels
        self.sample_rate = sample_rate

        # Get the default microphone
        self.default_mic = soundcard.default_microphone()

    def record(self, record_seconds=3, save_path=None):
        """Record audio from the microphone.

        :param record_seconds: recording duration in seconds, 3 by default
        :param save_path: path to save the recording to, with a .wav extension
        :return: the audio as numpy data
        """
        print("Recording started......")
        num_frames = int(record_seconds * self.sample_rate)
        data = self.default_mic.record(samplerate=self.sample_rate, numframes=num_frames, channels=self.channels)
        audio_data = data.squeeze()
        print("Recording finished!")
        if save_path is not None:
            os.makedirs(os.path.dirname(save_path), exist_ok=True)
            soundfile.write(save_path, data=data, samplerate=self.sample_rate)
        return audio_data
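
A quick sketch of recording a clip with `RecordAudio` (the save path is illustrative):

```python
from mvector.utils.record import RecordAudio

recorder = RecordAudio(channels=1, sample_rate=16000)
# Record 3 seconds from the default microphone and save it as a wav file
audio = recorder.record(record_seconds=3, save_path='audio_db/my_voice.wav')
print(audio.shape)  # 48000 samples for a 3-second mono clip at 16 kHz
```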
85
mvector/utils/utils.py
Normal file
@ -0,0 +1,85 @@
import distutils.util

import numpy as np
from tqdm import tqdm

from mvector.utils.logger import setup_logger

logger = setup_logger(__name__)


def print_arguments(args=None, configs=None):
    if args:
        logger.info("----------- Extra arguments -----------")
        for arg, value in sorted(vars(args).items()):
            logger.info("%s: %s" % (arg, value))
        logger.info("------------------------------------------------")
    if configs:
        logger.info("----------- Configuration file arguments -----------")
        for arg, value in sorted(configs.items()):
            if isinstance(value, dict):
                logger.info(f"{arg}:")
                for a, v in sorted(value.items()):
                    if isinstance(v, dict):
                        logger.info(f"\t{a}:")
                        for a1, v1 in sorted(v.items()):
                            logger.info("\t\t%s: %s" % (a1, v1))
                    else:
                        logger.info("\t%s: %s" % (a, v))
            else:
                logger.info("%s: %s" % (arg, value))
        logger.info("------------------------------------------------")


def add_arguments(argname, type, default, help, argparser, **kwargs):
    type = distutils.util.strtobool if type == bool else type
    argparser.add_argument("--" + argname,
                           default=default,
                           type=type,
                           help=help + ' Default: %(default)s.',
                           **kwargs)


class Dict(dict):
    __setattr__ = dict.__setitem__
    __getattr__ = dict.__getitem__


def dict_to_object(dict_obj):
    if not isinstance(dict_obj, dict):
        return dict_obj
    inst = Dict()
    for k, v in dict_obj.items():
        inst[k] = dict_to_object(v)
    return inst


# Compute the accuracy and the best threshold from cosine similarity scores
def cal_accuracy_threshold(y_score, y_true):
    y_score = np.asarray(y_score)
    y_true = np.asarray(y_true)
    best_accuracy = 0
    best_threshold = 0
    for i in tqdm(range(0, 100)):
        threshold = i * 0.01
        y_test = (y_score >= threshold)
        acc = np.mean((y_test == y_true).astype(int))
        if acc > best_accuracy:
            best_accuracy = acc
            best_threshold = threshold

    return best_accuracy, best_threshold


# Compute the accuracy at a fixed threshold from cosine similarity scores
def cal_accuracy(y_score, y_true, threshold=0.5):
    y_score = np.asarray(y_score)
    y_true = np.asarray(y_true)
    y_test = (y_score >= threshold)
    accuracy = np.mean((y_test == y_true).astype(int))
    return accuracy


# Compute the cosine similarity of two vectors
def cosin_metric(x1, x2):
    return np.dot(x1, x2) / (np.linalg.norm(x1) * np.linalg.norm(x2))
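
For reference, a tiny worked example of the similarity helpers above (the vectors and scores are toy data, not real embeddings):

```python
import numpy as np

from mvector.utils.utils import cal_accuracy_threshold, cosin_metric

# Cosine similarity of two toy "embeddings"
a = np.array([1.0, 0.0])
b = np.array([0.9, 0.1])
print(cosin_metric(a, b))  # close to 1.0: nearly the same direction

# Sweep thresholds 0.00..0.99 to find the one that maximizes accuracy
y_score = [0.92, 0.15, 0.78, 0.30]   # pairwise similarity scores
y_true = [1, 0, 1, 0]                # 1 = same speaker, 0 = different
best_acc, best_thr = cal_accuracy_threshold(y_score, y_true)
print(best_acc, best_thr)
```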
8
requirements.txt
Normal file
@ -0,0 +1,8 @@
numba>=0.52.0
librosa>=0.9.1
numpy>=1.19.2
tqdm>=4.59.0
visualdl>=2.1.1
resampy==0.2.2
soundfile>=0.12.1
soundcard>=0.4.2
45
setup.py
Normal file
@ -0,0 +1,45 @@
from setuptools import setup, find_packages

import mvector

VERSION = mvector.__version__


def readme():
    with open('README.md', encoding='utf-8') as f:
        content = f.read()
    return content


def parse_requirements():
    with open('./requirements.txt', encoding="utf-8") as f:
        requirements = f.readlines()
    return requirements


if __name__ == "__main__":
    setup(
        name='mvector',
        packages=find_packages(),
        author='yeyupiaoling',
        version=VERSION,
        install_requires=parse_requirements(),
        description='Voice Print Recognition toolkit on Pytorch',
        long_description=readme(),
        long_description_content_type='text/markdown',
        url='https://github.com/yeyupiaoling/VoiceprintRecognition_Pytorch',
        download_url='https://github.com/yeyupiaoling/VoiceprintRecognition_Pytorch.git',
        keywords=['Voice', 'Pytorch'],
        classifiers=[
            'Intended Audience :: Developers',
            'License :: OSI Approved :: Apache Software License',
            'Operating System :: OS Independent',
            'Natural Language :: Chinese (Simplified)',
            'Programming Language :: Python :: 3',
            'Programming Language :: Python :: 3.5',
            'Programming Language :: Python :: 3.6',
            'Programming Language :: Python :: 3.7',
            'Programming Language :: Python :: 3.8',
            'Programming Language :: Python :: 3.9', 'Topic :: Utilities'
        ],
        license='Apache License 2.0',
        ext_modules=[])
26
train.py
Normal file
@ -0,0 +1,26 @@
import argparse
import functools

from mvector.trainer import MVectorTrainer
from mvector.utils.utils import add_arguments, print_arguments

parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
add_arg('configs',          str,  'configs/ecapa_tdnn.yml',    'Configuration file')
add_arg("local_rank",       int,  0,                           'Argument required for multi-GPU training')
add_arg("use_gpu",          bool, True,                        'Whether to train on GPU')
add_arg('augment_conf_path',str,  'configs/augmentation.json', 'Data augmentation configuration file, in JSON format')
add_arg('save_model_path',  str,  'models/',                   'Directory to save models to')
add_arg('resume_model',     str,  None,                        'Checkpoint to resume training from; if None, training starts from scratch')
add_arg('save_image_path',  str,  'output/images/',            'Directory to save result plots to')
add_arg('pretrained_model', str,  'models/ecapa_tdnn_MFCC/best_model/', 'Path to a pretrained model; if None, no pretrained model is used')
args = parser.parse_args()
print_arguments(args=args)

# Build the trainer
trainer = MVectorTrainer(configs=args.configs, use_gpu=args.use_gpu)

trainer.train(save_model_path=args.save_model_path,
              resume_model=args.resume_model,
              pretrained_model=args.pretrained_model,
              augment_conf_path=args.augment_conf_path)
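
Launch-wise, the trainer branches on `torch.cuda.device_count()` and reads `LOCAL_RANK` when more than one GPU is visible, so a multi-GPU run needs a distributed launcher to set that variable. A hedged sketch (assuming `torchrun` from a recent PyTorch install; the flags shown are standard `torchrun` options, not defined by this repository):

```shell
# Single GPU (or CPU with --use_gpu=False)
python train.py --configs=configs/ecapa_tdnn.yml

# Two GPUs: torchrun spawns one process per card and sets LOCAL_RANK
torchrun --nproc_per_node=2 train.py --configs=configs/ecapa_tdnn.yml
```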