# Python script to convert text to an audiobook

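'''
prerequisites:
  on ubuntu 22.04 or older:
      pip3 install pydub
      pip3 install coqui-tts
      pip3 install psutil
      pip3 install chardet
  on newer operating systems, run the commands in a python virtual environment
      and use 'pip' instead of 'pip3'
  pydub also needs ffmpeg installed on the system to read and write ogg files
'''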

import sys
import os
import psutil
import chardet  # You may need to install this library: pip install chardet
import tempfile
import shutil
from pydub import AudioSegment
import subprocess

# Conditional import of TTS
#TTS = None

def is_script_already_running(script_name):
    """Check if another instance of the script is running.

    Scans every process visible to psutil and looks for ``script_name`` as
    one of the command-line arguments of a process other than this one.

    Args:
        script_name: The script path exactly as it appears on the other
            process's command line (the caller passes ``sys.argv[0]``).

    Returns:
        True if another process has ``script_name`` among its arguments,
        otherwise False.
    """
    current_pid = os.getpid()
    for proc in psutil.process_iter(attrs=['pid', 'name', 'cmdline']):
        try:
            if proc.info['pid'] == current_pid:
                continue
            # psutil stores None for attributes it could not read (for
            # example zombie or access-denied processes); treat that as an
            # empty command line instead of failing on `in None`.
            cmdline = proc.info['cmdline'] or []
            if script_name in cmdline:
                return True
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            continue
    return False

def detect_encoding(file_path):
    """Guess the character encoding of a text file.

    Only the first 100000 bytes are handed to chardet, so the guess is
    based on the beginning of the file.

    Returns:
        The value of the 'encoding' entry in chardet's detection result.
    """
    sample_size = 100000
    with open(file_path, 'rb') as raw_file:
        sample = raw_file.read(sample_size)
    detection = chardet.detect(sample)
    return detection['encoding']

def split_text_into_chunks(text, max_words=50, max_overflow=25):
    """Split text into chunks of roughly max_words words on sentence boundaries.

    Words are accumulated until at least ``max_words`` have been collected;
    the chunk is then closed at the next word that ends a sentence. If no
    sentence end shows up, the chunk is closed anyway once it reaches
    ``max_words + max_overflow`` words, so a chunk never grows without bound.

    Args:
        text: The text to split. Whitespace is normalised to single spaces.
        max_words: Target minimum number of words per chunk.
        max_overflow: How many extra words a chunk may take while waiting
            for a sentence end before it is split mid-sentence.

    Returns:
        A list of chunk strings; an empty list if ``text`` has no words.
    """
    sentence_endings = ('.', '!', '?', '\u2026')
    # Closing quotes/brackets that may follow the sentence terminator,
    # as in: He said "go."
    closing_marks = '"\')]}\u201d\u2019'

    chunks = []
    current = []
    for word in text.split():
        current.append(word)
        if len(current) < max_words:
            continue

        ends_sentence = word.rstrip(closing_marks).endswith(sentence_endings)
        # The budget is measured in words, so compare the chunk's word
        # count (not the length of the current word) against it.
        too_long = len(current) >= max_words + max_overflow
        if ends_sentence or too_long:
            chunks.append(' '.join(current))
            current = []

    if current:
        chunks.append(' '.join(current))

    return chunks

def create_audio_chunk(chunk_text, chunk_path, model, speaker):
    """Generate TTS audio for a single chunk by running the `tts` command.

    Args:
        chunk_text: The text to synthesise.
        chunk_path: Path passed to `tts` as the output file.
        model: Model name passed as `--model_name`.
        speaker: Speaker id passed as `--speaker_idx`. Pass an empty string
            or None for a single speaker model; the option is then omitted.

    Raises:
        subprocess.CalledProcessError: If `tts` exits with a non-zero status.

    Note:
        If you are using '--speaker_idx "ED"' and you get an error, you may
        need to correct a bug in the downloaded model files:
            open file: nano /root/.local/share/tts/tts_models--en--vctk--vits/speaker_ids.json
            change '"ED\\n": 0,' to '"ED": 0,'
            save and exit
    """
    command = [
        'tts',
        '--text', chunk_text,
        '--model_name', model,
    ]
    if speaker:
        command += ['--speaker_idx', speaker]
    command += ['--out_path', chunk_path]

    result = subprocess.run(command, capture_output=True, text=True)
    # Show everything the tool reported, whether it succeeded or not.
    print(result.stdout + result.stderr)

    # Fail right away instead of letting a missing chunk file surface later
    # as a confusing error when the chunks are joined.
    result.check_returncode()

def join_audio_files(audio_files, output_file):
    """Concatenate the given audio files, in order, into one ogg file.

    Each file is loaded with pydub and appended to a running segment, which
    is then exported to ``output_file``. Always returns True on completion.
    """
    merged = AudioSegment.empty()
    for path in audio_files:
        segment = AudioSegment.from_file(path)
        merged = merged + segment
    merged.export(output_file, format="ogg")
    return True

def convert_text_to_audio(input_file, output_file, model, speaker):
    """Convert one text file to a single audio file.

    The text is read using its detected encoding, split into chunks, each
    chunk is synthesised into a temporary directory, and the chunk audio is
    joined into ``output_file``.

    Args:
        input_file: Path of the text file to read.
        output_file: Path of the audio file to write.
        model: TTS model name, forwarded to create_audio_chunk.
        speaker: Speaker id, forwarded to create_audio_chunk.

    Returns:
        True if the final audio file was created, otherwise False (including
        when the input contains no text). On failure the error is printed
        and any temporary chunk files are left in place for debugging.

    Note:
        Only ``Exception`` subclasses are turned into a False result;
        KeyboardInterrupt and SystemExit propagate so a batch run can be
        stopped.
    """
    temp_dir = None
    try:
        # Detect file encoding
        encoding = detect_encoding(input_file)
        if not encoding:
            raise ValueError("Could not detect file encoding")

        # Read the input text
        with open(input_file, 'r', encoding=encoding, errors='replace') as file:
            text = file.read()

        chunks = split_text_into_chunks(text)
        if not chunks:
            print("No text to convert.")
            return False

        # Temporary directory for storing audio chunks
        temp_dir = tempfile.mkdtemp()
        temp_audio_files = []
        for i, chunk in enumerate(chunks):
            temp_audio_path = os.path.join(temp_dir, f"chunk_{i}.ogg")
            create_audio_chunk(chunk, temp_audio_path, model, speaker)
            temp_audio_files.append(temp_audio_path)

        # Join all temporary audio files into the final output file
        final_audio_created = join_audio_files(temp_audio_files, output_file)

        print(f"Audio file successfully saved to {output_file}")
    except Exception as e:
        print(f"Error: {e}")
        if temp_dir:
            print(f"Temporary files are saved in {temp_dir} for debugging.")
        return False

    if not final_audio_created:
        return False

    # The output exists, so the chunks are no longer needed. A cleanup
    # problem should not turn a successful conversion into a failure.
    shutil.rmtree(temp_dir, ignore_errors=True)
    return True

def process_files(input_dir, output_dir, model="tts_models/en/vctk/vits", speaker="ED"):
    """Process all text and markdown files in the input directory.

    Walks ``input_dir`` recursively, converts every ``.txt`` and ``.md``
    file to an ``.ogg`` file under the matching sub-directory of
    ``output_dir``, and moves each successfully converted source file next
    to its audio file.

    Args:
        input_dir: Directory to search for text files.
        output_dir: Directory that receives audio and processed text files.
        model: TTS model name passed to convert_text_to_audio.
        speaker: Speaker id passed to convert_text_to_audio.

    Returns:
        None.
    """
    # Notes on model choice:
    # - "tts_models/en/jenny/jenny" (irish accent) is good, but is quite slow
    #   to process. It is a single speaker model and requires a slightly
    #   different command.
    # - model="tts_models/en/vctk/vits", speaker="ED" is quick to process.
    #   Voice quality is fairly good, but the speaker speaks fairly fast,
    #   which makes it harder to understand, and it is a british accent.
    # - You may want to go through the voice models and see which one works
    #   best.
    files_to_process = []

    for root, _, files in os.walk(input_dir):
        for file in files:
            if file.endswith(('.txt', '.md')):
                files_to_process.append(os.path.join(root, file))

    if not files_to_process:
        print("No files to process.")
        return

    for input_file in files_to_process:
        relative_path = os.path.relpath(os.path.dirname(input_file), input_dir)
        output_subdir = os.path.join(output_dir, relative_path)

        # Ensure the output subdirectory exists
        os.makedirs(output_subdir, exist_ok=True)

        # Ensure unique output file names
        base_name = os.path.splitext(os.path.basename(input_file))[0]
        output_file = os.path.join(output_subdir, f"{base_name}.ogg")
        counter = 1
        while os.path.exists(output_file):
            output_file = os.path.join(output_subdir, f"{base_name}_{counter}.ogg")
            counter += 1

        # Convert text to audio
        success = convert_text_to_audio(input_file, output_file, model, speaker)

        # Move the original text file to the output directory.
        # shutil.move is used because a plain rename fails when the input
        # and output directories are on different filesystems.
        if success:
            destination = os.path.join(output_subdir, os.path.basename(input_file))
            shutil.move(input_file, destination)

if __name__ == "__main__":
    # Command-line entry point: script.py <input_directory> <output_directory>

    # Refuse to start if this script already appears on the command line of
    # another process.
    if is_script_already_running(sys.argv[0]):
        print("Error: Another instance of the script is already running.")
        sys.exit(1)

    if len(sys.argv) != 3:
        print("Usage: python script.py <input_directory> <output_directory>")
        sys.exit(1)

    input_dir = sys.argv[1]
    output_dir = sys.argv[2]

    # Ensure the input directory exists. isdir (rather than exists) also
    # rejects a path that points at a regular file, which would otherwise be
    # walked as an empty tree and reported as "No files to process."
    if not os.path.isdir(input_dir):
        print(f"Error: The directory {input_dir} does not exist.")
        sys.exit(1)

    # Ensure the output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Process the files
    process_files(input_dir, output_dir)
# More posts
#
# voice to text python script for ubuntu 24.04 using whisper from openai
#
# 🎮 Level Up: Spoken Communication Skills Through Online Multiplayer Games
#
# Libertarian UK WWII Strategy
#
# wrist motion timer: app for fitbit versa 2