项目代码学习 ttsreal-CSDN博客

在basereal中有这样一句代码
from ttsreal import EdgeTTS,SovitsTTS,XTTS,CosyVoiceTTS,FishTTS,TencentTTS,DoubaoTTS

虽然 ttsreal 的代码很长，但其实只是因为 TTS 类太多了，只需要看基类 BaseTTS 和 EdgeTTS 就行。我把这部分代码单独拿出来进行分析。

###############################################################################
#  Copyright (C) 2024 LiveTalking@lipku https://github.com/lipku/LiveTalking
#  email: lipku@foxmail.com
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#       http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
###############################################################################
from __future__ import annotations
import time
import numpy as np
#用于读取和写入音频文件
import soundfile as sf
#音频重采样
import resampy
#用于编写异步程序
import asyncio
#微软edge的文本转语音库
import edge_tts
#操作系统接口 比如文件路径 环境变量
import os
#hmac hashlib 用于加密，签名验证
import hmac
import hashlib
#编码 解码二进制数据
import base64
import json
#uuid 生成唯一标识符
import uuid


from typing import Iterator

#请求
import requests

import queue
from queue import Queue
from io import BytesIO
#copy深拷贝对象 websockets用于websocket通信（实时数据传输） gzip压缩/解压数据
import copy, websockets, gzip

#thread创建并运行新线程 event 用于线程间的同步控制
from threading import Thread, Event
#创建枚举类
from enum import Enum

#只有在类型检查工具运行时 才导入BaseReal 在程序实际运行的时候不导入
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from basereal import BaseReal

from logger import logger

#定义一个叫state的状态类型 running pause
class State(Enum):
    RUNNING = 0
    PAUSE = 1


class BaseTTS:
    """Base class for text-to-speech backends.

    Text messages are queued with ``put_msg_txt`` and consumed by a worker
    thread (started by ``render``) that calls ``txt_to_audio`` for each one.
    Subclasses override ``txt_to_audio`` to do the actual synthesis.
    """

    # The annotation is quoted so that the class can be defined even when
    # BaseReal is only imported for type checking.
    def __init__(self, opt, parent: "BaseReal"):
        """Store the options/parent and set up buffers and the message queue.

        Args:
            opt: Options object; ``opt.fps`` is read here.
            parent: Owning object that receives the produced audio.
        """
        self.opt = opt
        self.parent = parent

        # Frames per second.
        self.fps = opt.fps  # 20 ms per frame
        # Output sample rate in Hz.
        self.sample_rate = 16000
        # Samples per frame, e.g. 320 samples (20 ms) when opt.fps is 50.
        self.chunk = self.sample_rate // self.fps
        # In-memory byte buffer available to backends for raw audio data.
        self.input_stream = BytesIO()

        # Queue of pending (text, eventpoint) messages.
        self.msgqueue = Queue()
        # Start in the running state.
        self.state = State.RUNNING

    def flush_talk(self):
        """Drop all pending messages and switch to the paused state."""
        # Hold the queue's own lock while clearing: otherwise the deque could
        # be emptied between a consumer's size check and its pop, raising an
        # IndexError inside Queue.get() in the worker thread.
        with self.msgqueue.mutex:
            self.msgqueue.queue.clear()
        self.state = State.PAUSE

    def put_msg_txt(self, msg: str, eventpoint=None):
        """Queue ``msg`` together with ``eventpoint``.

        Empty messages (and ``None``) are ignored.
        """
        if msg:
            self.msgqueue.put((msg, eventpoint))

    def render(self, quit_event):
        """Start a new thread that runs ``process_tts`` with ``quit_event``."""
        process_thread = Thread(target=self.process_tts, args=(quit_event,))
        process_thread.start()

    def process_tts(self, quit_event):
        """Worker loop: convert queued messages until ``quit_event`` is set.

        Args:
            quit_event: Object with an ``is_set()`` method; the loop ends once
                it returns true.
        """
        # Keep looping as long as the quit event has NOT been set.
        while not quit_event.is_set():
            try:
                # The 1 s timeout lets the loop re-check quit_event regularly.
                msg = self.msgqueue.get(block=True, timeout=1)
            except queue.Empty:
                continue
            # A new message was taken from the queue: resume the running state.
            self.state = State.RUNNING
            try:
                self.txt_to_audio(msg)
            except Exception:
                # One failed message must not terminate the worker thread,
                # otherwise every later message would be silently ignored.
                logger.exception('ttsreal txt_to_audio failed')
        logger.info('ttsreal thread stop')

    def txt_to_audio(self, msg):
        """Convert one queued ``(text, eventpoint)`` message into audio.

        The base implementation does nothing; subclasses override it.
        """
        pass


class EdgeTTS(BaseTTS):
    """TTS backend built on the ``edge_tts`` package.

    Each message is synthesized completely into ``self.input_stream``, decoded
    to a mono float32 array at ``self.sample_rate``, and then handed to
    ``self.parent.put_audio_frame`` in slices of ``self.chunk`` samples.
    """

    def txt_to_audio(self, msg):
        """Synthesize one message and forward it to the parent frame by frame.

        Args:
            msg: ``(text, textevent)`` tuple as queued by ``put_msg_txt``.

        The first forwarded frame carries a ``'start'`` event; a later frame
        after which less than one full frame remains carries an ``'end'``
        event. All other frames are sent with ``None`` as event.
        """
        # opt.REF_FILE holds the voice name passed to edge_tts.
        voicename = self.opt.REF_FILE  # "zh-CN-YunxiaNeural"
        text, textevent = msg
        try:
            t = time.time()
            # Run the async synthesis on a private event loop and always close
            # it afterwards; otherwise every message would leak a loop.
            loop = asyncio.new_event_loop()
            try:
                loop.run_until_complete(self.__main(voicename, text))
                loop.run_until_complete(loop.shutdown_asyncgens())
            finally:
                loop.close()
            logger.info(f'-------edge tts time:{time.time() - t:.4f}s')

            # Nothing was written: synthesis failed (or output was paused).
            if self.input_stream.getbuffer().nbytes <= 0:  # edgetts err
                logger.error('edgetts err!!!!!')
                return

            # Rewind so the decoder reads the buffer from its beginning.
            self.input_stream.seek(0)
            stream = self.__create_bytes_stream(self.input_stream)
            # Number of samples not yet forwarded.
            streamlen = stream.shape[0]
            # Offset of the next frame inside ``stream``.
            idx = 0
            # Forward full frames while output has not been paused.
            while streamlen >= self.chunk and self.state == State.RUNNING:
                eventpoint = None
                streamlen -= self.chunk
                if idx == 0:
                    eventpoint = {'status': 'start', 'text': text, 'msgevent': textevent}
                elif streamlen < self.chunk:
                    eventpoint = {'status': 'end', 'text': text, 'msgevent': textevent}
                self.parent.put_audio_frame(stream[idx:idx + self.chunk], eventpoint)
                idx += self.chunk
            # A trailing partial frame (shorter than self.chunk) is skipped.
        finally:
            # Always empty the buffer, even when decoding or forwarding
            # raised, so stale bytes never leak into the next message.
            self.input_stream.seek(0)
            self.input_stream.truncate()

    def __create_bytes_stream(self, byte_stream):
        """Decode ``byte_stream`` into a mono float32 array at ``self.sample_rate``.

        Args:
            byte_stream: File-like object containing encoded audio, positioned
                at the start of the data.

        Returns:
            1-D ``numpy.float32`` array of samples.
        """
        # sf.read returns (samples, sample_rate); samples are presumably
        # float64 by default -- verify against the soundfile documentation.
        stream, sample_rate = sf.read(byte_stream)  # [T*sample_rate,] float64
        logger.info(f'[INFO]tts audio stream {sample_rate}: {stream.shape}')
        stream = stream.astype(np.float32)

        # Multi-channel audio: keep only the first channel.
        if stream.ndim > 1:
            logger.info(f'[WARN] audio has {stream.shape[1]} channels, only use the first.')
            stream = stream[:, 0]

        # Resample non-empty audio whose rate differs from the target rate.
        if sample_rate != self.sample_rate and stream.shape[0] > 0:
            logger.info(f'[WARN] audio sample rate is {sample_rate}, resampling into {self.sample_rate}.')
            stream = resampy.resample(x=stream, sr_orig=sample_rate, sr_new=self.sample_rate)

        return stream

    async def __main(self, voicename: str, text: str):
        """Stream synthesized audio for ``text`` into ``self.input_stream``.

        Errors are logged and not re-raised; the caller detects a failure by
        finding the buffer empty.
        """
        try:
            communicate = edge_tts.Communicate(text, voicename)
            async for chunk in communicate.stream():
                # Only audio chunks are stored, and only while running; other
                # chunk types (e.g. "WordBoundary") are ignored.
                if chunk["type"] == "audio" and self.state == State.RUNNING:
                    self.input_stream.write(chunk["data"])
        except Exception:
            logger.exception('edgetts')