"""Transcribe a WAV file to text (and optionally SRT subtitles) with faster-whisper."""

import argparse
import os
import platform
import sys
from types import SimpleNamespace

from faster_whisper import WhisperModel
from tqdm import tqdm


def detect_best_compute_type():
    """Pick a reasonable default compute type for the current machine."""
    machine = platform.machine()
    system = platform.system()
    if system == "Darwin" and machine in ("arm64", "arm"):
        # Probe with the tiny model: CTranslate2 raises ValueError when a
        # compute type is unsupported on this hardware.
        try:
            test_model = WhisperModel("tiny", compute_type="int8_float16")
            del test_model
            return "int8_float16"
        except ValueError:
            return "int8"
    return "int8"


def parse_args():
    parser = argparse.ArgumentParser(
        description="Transcribe WAV to text using faster-whisper."
    )
    parser.add_argument("input_file", help="Path to input WAV file")
    parser.add_argument("output_file", help="Path to output TXT file")
    parser.add_argument(
        "--model",
        default="medium",
        choices=["tiny", "small", "small.en", "medium", "medium.en",
                 "large-v1", "large-v2"],
        help="Model size to use (default: medium)",
    )
    parser.add_argument("--chunk", type=int, default=None,
                        help="Optional chunk length in seconds")
    parser.add_argument("--compute_type",
                        choices=["int8", "int8_float16", "float16", "float32"],
                        help="Override compute type (default: auto)")
    parser.add_argument("--srt", action="store_true",
                        help="Also save SRT subtitle file alongside TXT")
    return parser.parse_args()


def load_model(model_size, compute_type):
    print(f"Loading model '{model_size}' with compute_type='{compute_type}'...")
    return WhisperModel(model_size, compute_type=compute_type)


def format_srt(segments):
    """Render segments as SRT blocks: index, 'HH:MM:SS,mmm --> HH:MM:SS,mmm', text."""

    def srt_timestamp(seconds):
        h = int(seconds // 3600)
        m = int((seconds % 3600) // 60)
        s = int(seconds % 60)
        ms = int((seconds - int(seconds)) * 1000)
        return f"{h:02}:{m:02}:{s:02},{ms:03}"

    lines = []
    for i, segment in enumerate(segments, start=1):
        start = srt_timestamp(segment.start)
        end = srt_timestamp(segment.end)
        text = segment.text.strip()
        lines.extend([str(i), f"{start} --> {end}", text, ""])
    return "\n".join(lines)


def split_audio(input_file, chunk_length_sec):
    """Split the WAV into fixed-length chunk files; returns the chunk file names."""
    from pydub import AudioSegment  # optional dependency, only needed for --chunk

    audio = AudioSegment.from_wav(input_file)
    chunk_length_ms = chunk_length_sec * 1000
    chunks = []
    for i in range(0, len(audio), chunk_length_ms):  # pydub slices by milliseconds
        chunk = audio[i:i + chunk_length_ms]
        chunk_filename = f"chunk_{i // 1000}.wav"  # named by start offset in seconds
        chunk.export(chunk_filename, format="wav")
        chunks.append(chunk_filename)
    return chunks


def transcribe_audio(model, file_path, time_offset=0.0):
    """Transcribe one file; returns (joined text, segments shifted by time_offset)."""
    segments, _ = model.transcribe(file_path)
    texts = []
    collected = []
    # transcribe() returns a lazy generator, so tqdm shows progress without a total.
    for segment in tqdm(segments, desc=f"Transcribing {file_path}"):
        texts.append(segment.text.strip())
        # Shift timestamps so chunked transcriptions stay aligned with the
        # original file; otherwise every chunk's SRT times would restart at 0.
        collected.append(SimpleNamespace(
            start=segment.start + time_offset,
            end=segment.end + time_offset,
            text=segment.text,
        ))
    return " ".join(texts), collected


def main():
    args = parse_args()

    if not os.path.isfile(args.input_file):
        print(f"Error: Input file '{args.input_file}' does not exist.")
        sys.exit(1)

    compute_type = args.compute_type or detect_best_compute_type()
    model = load_model(args.model, compute_type)

    full_text = ""
    full_segments = []

    if args.chunk:
        print(f"Splitting audio into {args.chunk} second chunks...")
        chunks = split_audio(args.input_file, args.chunk)
        for idx, chunk_file in enumerate(chunks):
            # Offset each chunk's timestamps by its position in the original audio.
            text, segments = transcribe_audio(model, chunk_file,
                                              time_offset=idx * args.chunk)
            full_text += text + "\n"
            full_segments.extend(segments)
            os.remove(chunk_file)
    else:
        full_text, full_segments = transcribe_audio(model, args.input_file)

    with open(args.output_file, "w", encoding="utf-8") as f:
        f.write(full_text)

    if args.srt:
        srt_path = os.path.splitext(args.output_file)[0] + ".srt"
        with open(srt_path, "w", encoding="utf-8") as f:
            f.write(format_srt(full_segments))
        print(f"āœ… Subtitle saved to '{srt_path}'")

    print(f"\nāœ… Transcription saved to '{args.output_file}'")


if __name__ == "__main__":
    main()
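
# Example invocations (hypothetical file names; assumes this script is saved
# as transcribe.py):
#   python transcribe.py interview.wav interview.txt
#   python transcribe.py lecture.wav lecture.txt --model small.en --srt
#   python transcribe.py long_recording.wav notes.txt --chunk 300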