""" FunASR 实时转录 GUI — 基于 FunASR-Nano-2512 用法: python transcribe_gui.py """ import datetime, queue, threading, time from collections import deque from pathlib import Path import tkinter as tk from tkinter import ttk, filedialog, messagebox import numpy as np, pyaudiowpatch as pyaudio, scipy.signal as sps, torch DEFAULT_MODEL_DIR = "./Fun-ASR-Nano-2512" DEFAULT_HOTWORDS_FILE = "hotwords.txt" DEFAULT_LANGUAGE = "\u4e2d\u6587" TARGET_SR = 16000 DEFAULT_CHUNK_SEC = 0.72 DEFAULT_TRIM_TOKENS = 5 DEFAULT_MAX_SEG_SEC = 20.0 DEFAULT_ITN = True LANGUAGES = ["\u4e2d\u6587","\u82f1\u6587","\u65e5\u6587","\u7ca4\u8bed","\u97e9\u6587"] BG="#1a1b26"; BG2="#16161e"; BG3="#1f2335" ACCENT="#7aa2f7"; ACCENT2="#bb9af7"; FG="#c0caf5"; FG_DIM="#565f89" GREEN="#9ece6a"; RED="#f7768e"; YELLOW="#e0af68" FONT_UI=("Segoe UI",10); FONT_MONO=("Consolas",11); FONT_SML=("Segoe UI",9) NL = "\n" # newline constant used in runtime strings to avoid Write-tool issues DEFAULT_VAD_THRESHOLD = 0.02 DEFAULT_SILENCE_SEC = 0.8 DEFAULT_MIN_SPEECH = 0.3 DEFAULT_MAX_SEG_SEC_VAD = 15.0 DEFAULT_PRE_ROLL = 0.3 def list_loopback_devices(): p = pyaudio.PyAudio(); out = [] try: for i in range(p.get_device_count()): info = p.get_device_info_by_index(i) if info.get("isLoopbackDevice"): label = info["name"] + " (" + str(int(info["defaultSampleRate"])) + " Hz)" out.append((i, label, int(info["defaultSampleRate"]), info["maxInputChannels"])) finally: p.terminate() return out def load_hotwords(path): p = Path(path) if not p.exists(): return [] words = [] with p.open(encoding="utf-8") as f: for line in f: w = line.strip() if w and not w.startswith("#"): words.append(w) return words def to_mono_float32(raw, channels): audio = np.frombuffer(raw, dtype=np.int16).astype(np.float32) / 32768.0 if channels > 1: audio = audio.reshape(-1, channels).mean(axis=1) return audio def resample_audio(audio, src, dst): if src == dst: return audio g = np.gcd(src, dst) return sps.resample_poly(audio, dst//g, src//g).astype(np.float32) def 
def run_inference(model, kwargs, audio_np, hotwords, prev_text, language, itn):
    """Run one ASR pass over *audio_np* (float32, 16 kHz mono).

    Returns the stripped recognized text, or "" on any failure —
    best-effort by design so inference errors never kill a worker loop.
    """
    try:
        results, _ = model.inference(
            data_in=[torch.tensor(audio_np)], hotwords=hotwords, language=language,
            itn=itn, prev_text=prev_text, **kwargs)
        return results[0].get("text", "").strip()
    except Exception:
        return ""


class TranscribeWorker(threading.Thread):
    """Streaming worker: re-infers a growing audio segment at a fixed interval.

    Communicates with the GUI exclusively through *gui_q* with messages:
    ("state", s), ("log", msg), ("error", msg), ("live", text),
    ("commit", text, timestamp).
    """

    def __init__(self, cfg, gui_q):
        super().__init__(daemon=True)
        self.cfg = cfg
        self.q = gui_q
        self._stop_evt = threading.Event()
        self._pause_evt = threading.Event()
        self._pause_evt.set()  # set == capturing; cleared == paused
        self._audio_lock = threading.Lock()
        self._audio_buf = deque()
        self._pa = None
        self._stream = None
        self._src_rate = TARGET_SR
        self._channels = 1

    def pause(self):
        self._pause_evt.clear()
        self.q.put(("state", "paused"))

    def resume(self):
        # Drop audio accumulated while paused so stale sound is not transcribed.
        self._clear_buf()
        self._pause_evt.set()
        self.q.put(("state", "running"))

    def stop(self):
        self._stop_evt.set()
        self._pause_evt.set()

    def _audio_cb(self, in_data, fc, ti, st):
        # PortAudio callback thread: convert + resample, then hand off to run().
        if self._pause_evt.is_set():
            raw = to_mono_float32(in_data, self._channels)
            res = resample_audio(raw, self._src_rate, TARGET_SR)
            with self._audio_lock:
                self._audio_buf.append(res)
        return (None, pyaudio.paContinue)

    def _pop_all(self):
        """Drain the callback buffer; returns a concatenated array or None."""
        with self._audio_lock:
            if not self._audio_buf:
                return None
            frames = list(self._audio_buf)
            self._audio_buf.clear()
        return np.concatenate(frames)

    def _clear_buf(self):
        with self._audio_lock:
            self._audio_buf.clear()

    def run(self):
        cfg = self.cfg
        self.q.put(("state", "loading"))
        # --- load the model (deferred project-local import) ---
        try:
            self.q.put(("log", "\u6b63\u5728\u52a0\u8f7d\u6a21\u578b\uff0c\u8bf7\u7a0d\u5019..."))
            from model import FunASRNano
            device = ("cuda:0" if torch.cuda.is_available()
                      else "mps" if torch.backends.mps.is_available() else "cpu")
            self.q.put(("log", "\u63a8\u7406\u8bbe\u5907: " + device))
            model, kwargs = FunASRNano.from_pretrained(model=cfg["model_dir"], device=device)
            model.eval()
            tokenizer = kwargs.get("tokenizer", None)
            self.q.put(("log", "\u6a21\u578b\u52a0\u8f7d\u5b8c\u6210\uff0c\u5f00\u59cb\u8f6c\u5f55\u3002"))
        except Exception as e:
            self.q.put(("error", "\u6a21\u578b\u52a0\u8f7d\u5931\u8d25: " + str(e)))
            self.q.put(("state", "error"))
            return
        # --- open the loopback capture stream ---
        try:
            self._pa = pyaudio.PyAudio()
            info = self._pa.get_device_info_by_index(cfg["dev_id"])
            self._src_rate = int(info["defaultSampleRate"])
            self._channels = info["maxInputChannels"]
            fpr = int(self._src_rate * 0.02)  # 20 ms hardware buffers
            self._stream = self._pa.open(
                format=pyaudio.paInt16, channels=self._channels, rate=self._src_rate,
                input=True, input_device_index=cfg["dev_id"],
                frames_per_buffer=fpr, stream_callback=self._audio_cb)
            self._stream.start_stream()
            self.q.put(("log", "\u97f3\u9891\u6355\u83b7: " + info["name"]))
        except Exception as e:
            self.q.put(("error", "\u97f3\u9891\u8bbe\u5907\u5f00\u542f\u5931\u8d25: " + str(e)))
            self.q.put(("state", "error"))
            if self._pa:
                self._pa.terminate()
            return
        self.q.put(("state", "running"))
        hotwords = load_hotwords(cfg["hotwords_file"])
        language = cfg["language"]
        itn = cfg["itn"]
        chunk_samp = int(cfg["chunk_sec"] * TARGET_SR)
        max_samp = int(cfg["max_seg_sec"] * TARGET_SR)
        trim_n = cfg["trim_tokens"]

        def trim(text):
            """Drop the last *trim_n* tokens so prev_text never locks in an
            unstable tail; strips the U+FFFD replacement char decode artifact."""
            if tokenizer is None or not text:
                return text
            ids = tokenizer.encode(text)
            if len(ids) <= trim_n:
                return ""
            return tokenizer.decode(ids[:-trim_n]).replace("\ufffd", "")

        seg_buf = np.array([], dtype=np.float32)   # current (uncommitted) segment
        new_buf = np.array([], dtype=np.float32)   # audio since last inference
        prev_text = ""
        try:
            while not self._stop_evt.is_set():
                if not self._pause_evt.is_set():
                    self._clear_buf()
                    time.sleep(0.05)
                    continue
                chunk = self._pop_all()
                if chunk is not None:
                    new_buf = np.append(new_buf, chunk)
                    seg_buf = np.append(seg_buf, chunk)
                has_new = len(new_buf) >= chunk_samp
                force_cut = len(seg_buf) >= max_samp
                if has_new or force_cut:
                    new_buf = np.array([], dtype=np.float32)
                    text = run_inference(model, kwargs, seg_buf, hotwords,
                                         prev_text, language, itn)
                    if force_cut:
                        # Segment hit the max length: commit and start fresh.
                        if text:
                            ts = datetime.datetime.now().strftime("%H:%M:%S")
                            self.q.put(("commit", text, ts))
                        seg_buf = np.array([], dtype=np.float32)
                        prev_text = ""
                    else:
                        if text:
                            self.q.put(("live", text))
                            prev_text = trim(text)
                else:
                    time.sleep(0.005)
        finally:
            # Flush whatever is left in the segment as a final commit.
            if seg_buf.size > 0:
                final = run_inference(model, kwargs, seg_buf, hotwords, "", language, itn)
                if final:
                    ts = datetime.datetime.now().strftime("%H:%M:%S")
                    self.q.put(("commit", final, ts))
            if self._stream:
                self._stream.stop_stream()
                self._stream.close()
            if self._pa:
                self._pa.terminate()
            self.q.put(("state", "stopped"))
# VAD-mode defaults (this is the group the App widgets actually read).
DEFAULT_VAD_THRESHOLD = 0.02
DEFAULT_SILENCE_SEC = 0.8
DEFAULT_MIN_SPEECH_SEC = 0.3
DEFAULT_MAX_SEG_SEC_VAD = 15.0
DEFAULT_PRE_ROLL_SEC = 0.3


# NOTE(review): the original file defined `class VadWorker` twice; the first
# definition (and its inline model.inference calls) was dead code, shadowed by
# this second definition. Only the effective definition is kept.
class VadWorker(threading.Thread):
    """VAD mode: energy-based VAD, infer only on detected speech segments.

    Emits the same GUI-queue messages as TranscribeWorker:
    ("state", s), ("log", msg), ("error", msg), ("commit", text, timestamp).
    """

    def __init__(self, cfg, gui_q):
        super().__init__(daemon=True)
        self.cfg = cfg
        self.q = gui_q
        self._stop_evt = threading.Event()
        self._pause_evt = threading.Event()
        self._pause_evt.set()  # set == capturing; cleared == paused
        self._audio_lock = threading.Lock()
        self._audio_buf = deque()
        self._pa = None
        self._stream = None
        self._src_rate = TARGET_SR
        self._channels = 1

    def pause(self):
        self._pause_evt.clear()
        self.q.put(("state", "paused"))

    def resume(self):
        # Drop audio accumulated while paused.
        self._clear_buf()
        self._pause_evt.set()
        self.q.put(("state", "running"))

    def stop(self):
        self._stop_evt.set()
        self._pause_evt.set()

    def _audio_cb(self, in_data, fc, ti, st):
        # PortAudio callback thread: convert + resample, then hand off to run().
        if self._pause_evt.is_set():
            raw = to_mono_float32(in_data, self._channels)
            res = resample_audio(raw, self._src_rate, TARGET_SR)
            with self._audio_lock:
                self._audio_buf.append(res)
        return (None, pyaudio.paContinue)

    def _pop_all(self):
        """Drain the callback buffer; returns a concatenated array or None."""
        with self._audio_lock:
            if not self._audio_buf:
                return None
            frames = list(self._audio_buf)
            self._audio_buf.clear()
        return np.concatenate(frames)

    def _clear_buf(self):
        with self._audio_lock:
            self._audio_buf.clear()

    def run(self):
        cfg = self.cfg
        self.q.put(("state", "loading"))
        # --- load the model (deferred project-local import) ---
        try:
            self.q.put(("log", "正在加载模型,请稍候..."))
            from model import FunASRNano
            device = ("cuda:0" if torch.cuda.is_available()
                      else "mps" if torch.backends.mps.is_available() else "cpu")
            self.q.put(("log", "推理设备: " + device))
            model, kwargs = FunASRNano.from_pretrained(model=cfg["model_dir"], device=device)
            model.eval()
            self.q.put(("log", "模型加载完成。"))
        except Exception as e:
            self.q.put(("error", "模型加载失败: " + str(e)))
            self.q.put(("state", "error"))
            return
        # --- open the loopback capture stream ---
        try:
            self._pa = pyaudio.PyAudio()
            info = self._pa.get_device_info_by_index(cfg["dev_id"])
            self._src_rate = int(info["defaultSampleRate"])
            self._channels = info["maxInputChannels"]
            fpr = int(self._src_rate * 0.02)  # 20 ms hardware buffers
            self._stream = self._pa.open(
                format=pyaudio.paInt16, channels=self._channels,
                rate=self._src_rate, input=True, input_device_index=cfg["dev_id"],
                frames_per_buffer=fpr, stream_callback=self._audio_cb)
            self._stream.start_stream()
            self.q.put(("log", "音频捕获: " + info["name"]))
        except Exception as e:
            self.q.put(("error", "音频设备开启失败: " + str(e)))
            self.q.put(("state", "error"))
            if self._pa:
                self._pa.terminate()
            return
        self.q.put(("state", "running"))
        hotwords = load_hotwords(cfg["hotwords_file"])
        language = cfg["language"]
        itn = cfg["itn"]
        vad_thr = cfg["vad_threshold"]
        sil_samp = int(cfg["silence_sec"] * TARGET_SR)
        max_samp = int(cfg["max_seg_sec_vad"] * TARGET_SR)
        min_samp = int(cfg["min_speech_sec"] * TARGET_SR)
        pre_samp = int(cfg["pre_roll_sec"] * TARGET_SR)
        speech_buf = np.array([], dtype=np.float32)  # current speech segment
        pre_roll = np.array([], dtype=np.float32)    # audio kept just before speech onset
        in_speech = False
        sil_cnt = 0                                  # consecutive silence samples
        hop = int(TARGET_SR * 0.02)                  # 20 ms VAD analysis frames
        try:
            while not self._stop_evt.is_set():
                if not self._pause_evt.is_set():
                    self._clear_buf()
                    time.sleep(0.05)
                    continue
                chunk = self._pop_all()
                if chunk is None:
                    time.sleep(0.005)
                    continue
                for i in range(0, len(chunk), hop):
                    frame = chunk[i:i + hop]
                    if len(frame) == 0:
                        continue
                    rms = float(np.sqrt(np.mean(frame ** 2)))
                    if rms > vad_thr:
                        if not in_speech:
                            # Speech onset: include the pre-roll context.
                            in_speech = True
                            sil_cnt = 0
                            speech_buf = np.concatenate([pre_roll, frame])
                        else:
                            sil_cnt = 0
                            speech_buf = np.append(speech_buf, frame)
                        pre_roll = np.array([], dtype=np.float32)
                    else:
                        pre_roll = np.append(pre_roll, frame)
                        if len(pre_roll) > pre_samp:
                            pre_roll = pre_roll[-pre_samp:]
                        if in_speech:
                            sil_cnt += len(frame)
                            speech_buf = np.append(speech_buf, frame)
                            if sil_cnt >= sil_samp:
                                # Long enough silence: close the segment.
                                in_speech = False
                                seg = speech_buf[:-sil_cnt]  # drop trailing silence
                                speech_buf = np.array([], dtype=np.float32)
                                sil_cnt = 0
                                if len(seg) >= min_samp:
                                    text = run_inference(model, kwargs, seg,
                                                         hotwords, "", language, itn)
                                    if text:
                                        ts = datetime.datetime.now().strftime("%H:%M:%S")
                                        self.q.put(("commit", text, ts))
                # Segment ran too long without a pause: force a commit.
                # NOTE(review): placed after the per-chunk frame loop; the
                # original file's indentation was mangled here — confirm intent.
                if in_speech and len(speech_buf) >= max_samp:
                    text = run_inference(model, kwargs, speech_buf,
                                         hotwords, "", language, itn)
                    if text:
                        ts = datetime.datetime.now().strftime("%H:%M:%S")
                        self.q.put(("commit", text, ts))
                    speech_buf = np.array([], dtype=np.float32)
                    sil_cnt = 0
        finally:
            # Flush any in-flight speech as a final commit.
            if in_speech and len(speech_buf) >= min_samp:
                text = run_inference(model, kwargs, speech_buf,
                                     hotwords, "", language, itn)
                if text:
                    ts = datetime.datetime.now().strftime("%H:%M:%S")
                    self.q.put(("commit", text, ts))
            if self._stream:
                self._stream.stop_stream()
                self._stream.close()
            if self._pa:
                self._pa.terminate()
            self.q.put(("state", "stopped"))
self._pause_evt.is_set(): self._clear_buf(); time.sleep(0.05); continue chunk=self._pop_all() if chunk is None: time.sleep(0.005); continue for i in range(0,len(chunk),hop): frame=chunk[i:i+hop] if len(frame)==0: continue rms=float(np.sqrt(np.mean(frame**2))) if rms>vad_thr: if not in_speech: in_speech=True; sil_cnt=0 speech_buf=np.concatenate([pre_roll,frame]) else: sil_cnt=0; speech_buf=np.append(speech_buf,frame) pre_roll=np.array([],dtype=np.float32) else: pre_roll=np.append(pre_roll,frame) if len(pre_roll)>pre_samp: pre_roll=pre_roll[-pre_samp:] if in_speech: sil_cnt+=len(frame); speech_buf=np.append(speech_buf,frame) if sil_cnt>=sil_samp: in_speech=False seg=speech_buf[:-sil_cnt]; speech_buf=np.array([],dtype=np.float32); sil_cnt=0 if len(seg)>=min_samp: text=run_inference(model,kwargs,seg,hotwords,"",language,itn) if text: ts=datetime.datetime.now().strftime("%H:%M:%S") self.q.put(("commit",text,ts)) if in_speech and len(speech_buf)>=max_samp: text=run_inference(model,kwargs,speech_buf,hotwords,"",language,itn) if text: ts=datetime.datetime.now().strftime("%H:%M:%S") self.q.put(("commit",text,ts)) speech_buf=np.array([],dtype=np.float32); sil_cnt=0 finally: if in_speech and len(speech_buf)>=min_samp: text=run_inference(model,kwargs,speech_buf,hotwords,"",language,itn) if text: ts=datetime.datetime.now().strftime("%H:%M:%S") self.q.put(("commit",text,ts)) if self._stream: self._stream.stop_stream(); self._stream.close() if self._pa: self._pa.terminate() self.q.put(("state","stopped")) class App(tk.Tk): def __init__(self): super().__init__() self.title("\u5b9e\u65f6\u8f6c\u5f55 \u2014 FunASR") self.geometry("960x660") self.minsize(720, 500) self.configure(bg=BG) self._worker=None; self._queue=queue.Queue() self._save_fh=None; self._devices=[] self._apply_theme(); self._build_ui() self._refresh_devices(); self._poll_queue() self.protocol("WM_DELETE_WINDOW", self._on_close) def _apply_theme(self): s=ttk.Style(self); s.theme_use("clam") 
s.configure(".",background=BG2,foreground=FG,font=FONT_UI,borderwidth=0) s.configure("TCombobox",fieldbackground=BG3,background=BG3,foreground=FG,arrowcolor=ACCENT,selectbackground=BG3,selectforeground=FG) s.map("TCombobox",fieldbackground=[("readonly",BG3)],foreground=[("readonly",FG)]) s.configure("TCheckbutton",background=BG2,foreground=FG) s.map("TCheckbutton",background=[("active",BG2)],foreground=[("active",ACCENT)]) s.configure("TEntry",fieldbackground=BG3,foreground=FG,insertcolor=FG) s.configure("TSpinbox",fieldbackground=BG3,foreground=FG,arrowcolor=FG_DIM) s.configure("TScrollbar",background=BG3,troughcolor=BG2,arrowcolor=FG_DIM) s.configure("Accent.TButton",background=ACCENT,foreground=BG,font=("Segoe UI Semibold",10),relief="flat",padding=(10,5)) s.map("Accent.TButton",background=[("active",ACCENT2)]) s.configure("Danger.TButton",background=RED,foreground=BG,font=("Segoe UI Semibold",10),relief="flat",padding=(10,5)) s.map("Danger.TButton",background=[("active","#ff9e9e")]) s.configure("Warn.TButton",background=YELLOW,foreground=BG,font=("Segoe UI Semibold",10),relief="flat",padding=(10,5)) s.map("Warn.TButton",background=[("active","#ffd08a")]) s.configure("Flat.TButton",background=BG3,foreground=FG,relief="flat",padding=(8,4)) s.map("Flat.TButton",background=[("active",BG2)]) def _build_ui(self): self.columnconfigure(0,weight=1); self.rowconfigure(2,weight=1) top=tk.Frame(self,bg=BG2) top.grid(row=0,column=0,sticky="ew") top.columnconfigure(1,weight=1) def lbl(p,t,r,c): tk.Label(p,text=t,bg=BG2,fg=FG_DIM,font=FONT_SML).grid(row=r,column=c,padx=(12,4),pady=3,sticky="w") lbl(top,"\u97f3\u9891\u8bbe\u5907",0,0) self._dev_var=tk.StringVar() self._dev_cb=ttk.Combobox(top,textvariable=self._dev_var,state="readonly",font=FONT_UI) self._dev_cb.grid(row=0,column=1,padx=(0,4),pady=3,sticky="ew") ttk.Button(top,text="\u21bb",style="Flat.TButton",width=3,command=self._refresh_devices).grid(row=0,column=2,padx=(0,12),pady=3) 
lbl(top,"\u6a21\u578b\u8def\u5f84",1,0) self._model_var=tk.StringVar(value=DEFAULT_MODEL_DIR) ttk.Entry(top,textvariable=self._model_var,font=FONT_UI).grid(row=1,column=1,padx=(0,4),pady=3,sticky="ew") ttk.Button(top,text="\u6d4f\u89c8",style="Flat.TButton",command=self._browse_model).grid(row=1,column=2,padx=(0,12),pady=3) lbl(top,"\u70ed\u8bcd\u6587\u4ef6",2,0) self._hw_var=tk.StringVar(value=DEFAULT_HOTWORDS_FILE) ttk.Entry(top,textvariable=self._hw_var,font=FONT_UI).grid(row=2,column=1,padx=(0,4),pady=3,sticky="ew") ttk.Button(top,text="\u6d4f\u89c8",style="Flat.TButton",command=self._browse_hotwords).grid(row=2,column=2,padx=(0,12),pady=3) pm=tk.Frame(top,bg=BG2) pm.grid(row=3,column=0,columnspan=3,padx=12,pady=(2,4),sticky="w") def sp(p,label,var,lo,hi,inc,w=6): tk.Label(p,text=label,bg=BG2,fg=FG_DIM,font=FONT_SML).pack(side="left") ttk.Spinbox(p,from_=lo,to=hi,increment=inc,textvariable=var,width=w,font=FONT_UI).pack(side="left",padx=(3,14)) tk.Label(pm,text="\u8bed\u8a00",bg=BG2,fg=FG_DIM,font=FONT_SML).pack(side="left") self._lang_var=tk.StringVar(value=DEFAULT_LANGUAGE) ttk.Combobox(pm,textvariable=self._lang_var,values=LANGUAGES,state="readonly",width=7,font=FONT_UI).pack(side="left",padx=(3,14)) self._itn_var=tk.BooleanVar(value=DEFAULT_ITN) ttk.Checkbutton(pm,text="ITN",variable=self._itn_var).pack(side="left",padx=(0,14)) self._chunk_var=tk.DoubleVar(value=DEFAULT_CHUNK_SEC) self._maxseg_var=tk.DoubleVar(value=DEFAULT_MAX_SEG_SEC) self._trim_var=tk.IntVar(value=DEFAULT_TRIM_TOKENS) sp(pm,"\u63a8\u7406\u95f4\u9694(s)",self._chunk_var,0.3,3.0,0.1) sp(pm,"\u6700\u957f\u65ad\u53e5(s)",self._maxseg_var,5.0,60.0,5.0) sp(pm,"\u622a\u65adToken",self._trim_var,0,20,1,w=4) # mode selector mr=tk.Frame(top,bg=BG2) mr.grid(row=4,column=0,columnspan=3,padx=12,pady=(0,2),sticky="w") tk.Label(mr,text="转录模式",bg=BG2,fg=FG_DIM,font=FONT_SML).pack(side="left") self._mode_var=tk.StringVar(value="stream") ttk.Radiobutton(mr,text="流式实时 
(高质量/耗资源)",variable=self._mode_var,value="stream",command=self._on_mode_change).pack(side="left",padx=(6,16)) ttk.Radiobutton(mr,text="VAD 实时 (省资源/按语音断句)",variable=self._mode_var,value="vad",command=self._on_mode_change).pack(side="left") self._vad_frame=tk.Frame(top,bg=BG2) self._vad_frame.grid(row=5,column=0,columnspan=3,padx=12,pady=(0,2),sticky="w") def vsp(p2,label,var,lo,hi,inc,w=6): tk.Label(p2,text=label,bg=BG2,fg=FG_DIM,font=FONT_SML).pack(side="left") ttk.Spinbox(p2,from_=lo,to=hi,increment=inc,textvariable=var,width=w,font=FONT_UI).pack(side="left",padx=(3,14)) self._vad_thr_var=tk.DoubleVar(value=DEFAULT_VAD_THRESHOLD) self._vad_sil_var=tk.DoubleVar(value=DEFAULT_SILENCE_SEC) self._vad_minseg_var=tk.DoubleVar(value=DEFAULT_MIN_SPEECH_SEC) self._vad_maxseg_var=tk.DoubleVar(value=DEFAULT_MAX_SEG_SEC_VAD) vsp(self._vad_frame,"能量阈值",self._vad_thr_var,0.001,0.5,0.005) vsp(self._vad_frame,"静音断句(s)",self._vad_sil_var,0.2,3.0,0.1) vsp(self._vad_frame,"最短语音(s)",self._vad_minseg_var,0.1,2.0,0.1) vsp(self._vad_frame,"最长段落(s)",self._vad_maxseg_var,5.0,30.0,5.0) self._vad_frame.grid_remove() br=tk.Frame(top,bg=BG2) br.grid(row=6,column=0,columnspan=3,padx=12,pady=(4,10),sticky="w") self._btn_start=ttk.Button(br,text="\u25b6 \u5f00\u59cb\u8f6c\u5f55",style="Accent.TButton",command=self._on_start) self._btn_start.pack(side="left",padx=(0,6)) self._btn_pause=ttk.Button(br,text="\u23f8 \u6682\u505c",style="Warn.TButton",command=self._on_pause,state="disabled") self._btn_pause.pack(side="left",padx=(0,6)) self._btn_stop=ttk.Button(br,text="\u23f9 \u505c\u6b62",style="Danger.TButton",command=self._on_stop,state="disabled") self._btn_stop.pack(side="left",padx=(0,20)) self._save_var=tk.BooleanVar(value=False) ttk.Checkbutton(br,text="\u4fdd\u5b58\u5230\u6587\u4ef6",variable=self._save_var,command=self._on_save_toggle).pack(side="left",padx=(0,4)) self._save_path_var=tk.StringVar(value="transcript.txt") 
self._save_entry=ttk.Entry(br,textvariable=self._save_path_var,width=22,font=FONT_UI,state="disabled") self._save_entry.pack(side="left",padx=(0,4)) self._btn_bsave=ttk.Button(br,text="\u6d4f\u89c8",style="Flat.TButton",command=self._browse_save,state="disabled") self._btn_bsave.pack(side="left") self._status_var=tk.StringVar(value="\u5c31\u7eea") tk.Label(self,textvariable=self._status_var,bg=BG3,fg=FG_DIM,font=FONT_SML,anchor="w",padx=12,pady=3).grid(row=1,column=0,sticky="ew") tf=tk.Frame(self,bg=BG) tf.grid(row=2,column=0,sticky="nsew",padx=8,pady=(4,0)) tf.columnconfigure(0,weight=1); tf.rowconfigure(0,weight=1) self._text=tk.Text(tf,bg=BG,fg=FG,font=FONT_MONO,wrap="word",relief="flat",bd=0,padx=14,pady=10,insertbackground=FG,selectbackground=ACCENT,selectforeground=BG,spacing3=5,state="disabled") self._text.grid(row=0,column=0,sticky="nsew") sb=ttk.Scrollbar(tf,command=self._text.yview) sb.grid(row=0,column=1,sticky="ns") self._text["yscrollcommand"]=sb.set self._text.tag_configure("ts",foreground=FG_DIM,font=("Consolas",10)) self._text.tag_configure("commit",foreground=FG) self._text.tag_configure("live",foreground=YELLOW) self._text.tag_configure("log",foreground=FG_DIM,font=("Consolas",10)) self._text.tag_configure("err",foreground=RED) tb=tk.Frame(self,bg=BG2) tb.grid(row=3,column=0,sticky="ew") ttk.Button(tb,text="\u6e05\u7a7a",style="Flat.TButton",command=self._clear_text).pack(side="left",padx=6,pady=4) ttk.Button(tb,text="\u590d\u5236\u5168\u90e8",style="Flat.TButton",command=self._copy_all).pack(side="left",pady=4) def _refresh_devices(self): self._devices=list_loopback_devices() self._dev_cb["values"]=[d[1] for d in self._devices] if self._devices: self._dev_cb.current(0) else: self._dev_var.set("\u672a\u627e\u5230\u73af\u56de\u8bbe\u5907") def _selected_dev_id(self): idx=self._dev_cb.current() return self._devices[idx][0] if 0<=idx