在 basereal 中有这样一句导入代码：from ttsreal import EdgeTTS,SovitsTTS,XTTS,CosyVoiceTTS,FishTTS,TencentTTS,DoubaoTTS
虽然 ttsreal 的代码很长，但这主要是因为其中的 TTS 类太多；其实只需要看基类 BaseTTS 和 EdgeTTS 就行。下面我把这部分代码单独拿出来进行分析。
###############################################################################
#  Copyright (C) 2024 LiveTalking@lipku https://github.com/lipku/LiveTalking
#  email: lipku@foxmail.com
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#       http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
###############################################################################
from __future__ import annotations

# Async support (edge_tts exposes an async streaming API).
import asyncio
# Encoding / decoding of binary data.
import base64
# Deep copies of objects.
import copy
# Compression / decompression.
import gzip
# hmac + hashlib: signing and signature verification.
import hashlib
import hmac
import json
# Operating-system interface (paths, environment variables).
import os
import queue
import time
# Unique identifiers.
import uuid
# Enumeration types.
from enum import Enum
from io import BytesIO
from queue import Queue
# Thread: run work on a new thread; Event: cross-thread signalling.
from threading import Thread, Event
from typing import Iterator
from typing import TYPE_CHECKING

# Microsoft Edge text-to-speech client.
import edge_tts
import numpy as np
# HTTP requests.
import requests
# Audio resampling.
import resampy
# Reading and writing audio data.
import soundfile as sf
# WebSocket communication.
import websockets

# BaseReal is only needed for annotations, so it is imported solely while a
# type checker is running and never at runtime.
if TYPE_CHECKING:
    from basereal import BaseReal

from logger import logger


class State(Enum):
    """Run state of a TTS worker."""

    RUNNING = 0
    PAUSE = 1


class BaseTTS:
    """Base class of the TTS backends.

    Text messages are queued with :meth:`put_msg_txt`; a worker thread started
    by :meth:`render` pulls them off the queue and passes each one to
    :meth:`txt_to_audio`, which subclasses override.
    """

    def __init__(self, opt, parent: BaseReal):
        """Store the options and owner and set up the queue and audio buffer.

        Args:
            opt: Options object; ``opt.fps`` is read here.
            parent: Owner object that receives the generated audio frames.
        """
        self.opt = opt
        self.parent = parent

        # Frames per second (50 fps corresponds to 20 ms per frame).
        self.fps = opt.fps
        # Output sample rate in Hz.
        self.sample_rate = 16000
        # Samples per frame, e.g. 16000 // 50 = 320 samples per 20 ms chunk.
        self.chunk = self.sample_rate // self.fps
        # In-memory byte buffer holding the audio data of the current message.
        self.input_stream = BytesIO()
        # Thread-safe queue of pending (text, eventpoint) messages.
        self.msgqueue = Queue()
        self.state = State.RUNNING

    def flush_talk(self):
        """Drop every pending message and switch to the PAUSE state."""
        self.msgqueue.queue.clear()
        self.state = State.PAUSE

    def put_msg_txt(self, msg: str, eventpoint=None):
        """Queue ``msg`` together with ``eventpoint``; empty text is ignored."""
        if msg:
            self.msgqueue.put((msg, eventpoint))

    def render(self, quit_event):
        """Start a new thread that runs :meth:`process_tts`."""
        process_thread = Thread(target=self.process_tts, args=(quit_event,))
        process_thread.start()

    def process_tts(self, quit_event):
        """Worker loop: convert queued messages until ``quit_event`` is set.

        Args:
            quit_event: Event-like object; the loop keeps running while
                ``quit_event.is_set()`` is false.
        """
        while not quit_event.is_set():
            try:
                # The 1 s timeout lets the loop re-check quit_event regularly.
                msg = self.msgqueue.get(block=True, timeout=1)
            except queue.Empty:
                continue
            # A newly dequeued message resumes a worker paused by flush_talk.
            self.state = State.RUNNING
            try:
                self.txt_to_audio(msg)
            except Exception:
                # One failed utterance must not terminate the worker thread.
                logger.exception('ttsreal txt_to_audio error')
        logger.info('ttsreal thread stop')

    def txt_to_audio(self, msg):
        """Convert one queued message to audio; overridden by subclasses."""
        pass


class EdgeTTS(BaseTTS):
    """TTS backend that synthesizes speech with ``edge_tts``."""

    def txt_to_audio(self, msg):
        """Synthesize ``msg`` and pass the audio to the parent frame by frame.

        Args:
            msg: ``(text, textevent)`` tuple as queued by ``put_msg_txt``.
        """
        # Voice name used for synthesis, e.g. "zh-CN-YunxiaNeural".
        voicename = self.opt.REF_FILE
        text, textevent = msg
        t = time.time()
        loop = asyncio.new_event_loop()
        try:
            loop.run_until_complete(self.__main(voicename, text))
        finally:
            # Close the per-call loop so its resources are released right away.
            loop.close()
        logger.info(f'-------edge tts time:{time.time() - t:.4f}s')

        try:
            # Release the buffer view immediately: a BytesIO cannot be resized
            # while an exported view of it is alive.
            with self.input_stream.getbuffer() as view:
                received = view.nbytes
            if received <= 0:  # edgetts err
                logger.error('edgetts err!!!!!')
                return

            # Rewind so the decoder reads from the first byte.
            self.input_stream.seek(0)
            stream = self.__create_bytes_stream(self.input_stream)
            streamlen = stream.shape[0]
            idx = 0
            # Emit whole frames only, and stop early once paused.
            while streamlen >= self.chunk and self.state == State.RUNNING:
                eventpoint = None
                streamlen -= self.chunk
                # Tag the first frame with 'start' and the last whole frame
                # with 'end'.
                if idx == 0:
                    eventpoint = {'status': 'start', 'text': text, 'msgevent': textevent}
                elif streamlen < self.chunk:
                    eventpoint = {'status': 'end', 'text': text, 'msgevent': textevent}
                self.parent.put_audio_frame(stream[idx:idx + self.chunk], eventpoint)
                idx += self.chunk
            # A trailing partial frame (shorter than one chunk) is skipped.
        finally:
            # Always empty the buffer, including when decoding fails, so that
            # stale bytes are never mixed into the next message.
            self.input_stream.seek(0)
            self.input_stream.truncate()

    def __create_bytes_stream(self, byte_stream):
        """Decode ``byte_stream`` into mono float32 samples at ``self.sample_rate``.

        Args:
            byte_stream: File-like object containing the encoded audio.

        Returns:
            One-dimensional ``numpy.float32`` array of samples.
        """
        stream, sample_rate = sf.read(byte_stream)  # [T*sample_rate,] float64
        logger.info(f'[INFO]tts audio stream {sample_rate}: {stream.shape}')
        stream = stream.astype(np.float32)

        # Keep only the first channel of multi-channel audio.
        if stream.ndim > 1:
            logger.info(f'[WARN] audio has {stream.shape[1]} channels, only use the first.')
            stream = stream[:, 0]

        # Resample when the decoded rate differs from the target rate.
        if sample_rate != self.sample_rate and stream.shape[0] > 0:
            logger.info(f'[WARN] audio sample rate is {sample_rate}, resampling into {self.sample_rate}.')
            stream = resampy.resample(x=stream, sr_orig=sample_rate, sr_new=self.sample_rate)

        return stream

    async def __main(self, voicename: str, text: str):
        """Stream synthesized audio for ``text`` into ``self.input_stream``.

        Failures are logged rather than raised; the caller detects them
        through an empty buffer.
        """
        try:
            communicate = edge_tts.Communicate(text, voicename)
            async for chunk in communicate.stream():
                # Only audio chunks are stored, and only while running; other
                # chunk types such as "WordBoundary" are ignored.
                if chunk["type"] == "audio" and self.state == State.RUNNING:
                    self.input_stream.write(chunk["data"])
        except Exception:
            logger.exception('edgetts')