Blog

  • python script to convert text to audiobook

    '''
    prerequisites:
      on ubuntu 22.04 or older:
          pip3 install pydub
          pip3 install coqui-tts
          pip3 install psutil
          pip3 install chardet
      on newer operating systems, run the commands in a virtual python environment
          and use 'pip' instead of 'pip3' (see the setup sketch below)
    '''
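    # a minimal setup sketch for newer systems; the venv path '~/tts-venv' is just an example,
    # and pydub will likely also need ffmpeg installed for ogg files (e.g. 'sudo apt install ffmpeg'):
    #   python3 -m venv ~/tts-venv
    #   source ~/tts-venv/bin/activate
    #   pip install pydub coqui-tts psutil chardet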
    
    import sys
    import os
    import psutil
    import chardet  # You may need to install this library: pip install chardet
    import tempfile
    import shutil
    from pydub import AudioSegment
    import subprocess
    
    # Conditional import of TTS
    #TTS = None
    
    def is_script_already_running(script_name):
        """Check if another instance of the script is running."""
        current_pid = os.getpid()
        for proc in psutil.process_iter(attrs=['pid', 'name', 'cmdline']):
            try:
                cmdline = proc.info['cmdline'] or []  # cmdline can be None when access is denied
                if proc.info['pid'] != current_pid and any(script_name in arg for arg in cmdline):
                    return True
            except (psutil.NoSuchProcess, psutil.AccessDenied):
                continue
        return False
    
    def detect_encoding(file_path):
        """Detect the encoding of a text file."""
        with open(file_path, 'rb') as f:
            result = chardet.detect(f.read(100000))  # Analyze a portion of the file
        return result['encoding']
    
    def split_text_into_chunks(text, max_words=50):
        """Split text into chunks of approximately max_words, avoiding sentence splits."""
        words = text.split()
        chunks = []
        chunk = []
        word_count = 0
    
        for word in words:
            chunk.append(word)
            word_count += 1
            # break at a sentence end, or force a break once the chunk runs 25 words past the limit
            if word_count >= max_words and (word.endswith('.') or word_count >= max_words + 25):
                chunks.append(' '.join(chunk))
                chunk = []
                word_count = 0
    
        if chunk:
            chunks.append(' '.join(chunk))
    
        return chunks
    
    def create_audio_chunk(chunk_text, chunk_path, model, speaker):
        """Generate TTS audio for a single chunk."""
        # global TTS
        # if TTS is None:
        #     from TTS.api import TTS  # Import only if needed
        
        # tts = TTS(model_name=model)
        # tts.tts_to_file(text=chunk_text, file_path=chunk_path)
        
        '''
        note that if you are using a single speaker model, remove the "--speaker_idx" parameter
        (a sketch of that variant follows this comment)

        if you are using '--speaker_idx "ED"' and you get an error, you may need to correct a bug:
            open the file: nano /root/.local/share/tts/tts_models--en--vctk--vits/speaker_ids.json
            change '"ED\n": 0,' to '"ED": 0,'
            save and exit
        '''
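        # a minimal sketch of the same call for a single-speaker model such as
        # 'tts_models/en/jenny/jenny'; note that '--speaker_idx' is dropped:
        #   command = [
        #       'tts',
        #       '--text', chunk_text,
        #       '--model_name', 'tts_models/en/jenny/jenny',
        #       '--out_path', chunk_path
        #   ]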
        command = [
            'tts',
            '--text', chunk_text,
            '--model_name', model,
            '--speaker_idx', speaker,
            '--out_path', chunk_path
        ]
    
        result = subprocess.run(command, capture_output=True, text=True)
        output = result.stdout + result.stderr  # Capture both stdout and stderr
        print(output)
        if result.returncode != 0:
            # surface the failure here instead of waiting for a missing chunk file later
            raise RuntimeError(f"tts exited with code {result.returncode} for chunk '{chunk_path}'")
    
    def join_audio_files(audio_files, output_file):
        """Join multiple audio files into one final output file. Returns True when finished."""
        combined = AudioSegment.empty()
        for file in audio_files:
            combined += AudioSegment.from_file(file)
        combined.export(output_file, format="ogg")
        return True
    
    def convert_text_to_audio(input_file, output_file, model, speaker):
        """Convert text to audio and save as a file. Returns True if it thinks that it has succeeded, otherwise False."""
        temp_dir = None
        final_audio_created = False
        try:
            # Detect file encoding
            encoding = detect_encoding(input_file)
            if not encoding:
                raise ValueError("Could not detect file encoding")
    
            # Read the input text
            with open(input_file, 'r', encoding=encoding, errors='replace') as file:
                text = file.read()
    
            # Split text into chunks if necessary
            chunks = split_text_into_chunks(text)
    
            if not chunks:
                print("No text to convert.")
                return False
    
            # Temporary directory for storing audio chunks
            temp_dir = tempfile.mkdtemp()
            temp_audio_files = []
    
            for i, chunk in enumerate(chunks):
                temp_audio_path = os.path.join(temp_dir, f"chunk_{i}.ogg")
                create_audio_chunk(chunk, temp_audio_path, model, speaker)
                temp_audio_files.append(temp_audio_path)
    
            # Join all temporary audio files into the final output file
            final_audio_created = join_audio_files(temp_audio_files, output_file)
    
            print(f"Audio file successfully saved to {output_file}")
        except Exception as e:
            print(f"Error: {e}")
            if temp_dir:
                print(f"Temporary files are saved in {temp_dir} for debugging.")
        finally:
            # Clean up temporary audio files only if the final audio file was created
            if temp_dir and final_audio_created:
                shutil.rmtree(temp_dir)
        return final_audio_created
    
    def process_files(input_dir, output_dir, model="tts_models/en/vctk/vits", speaker="ED"):
        """Process all text and markdown files in the input directory."""
        '''
        model "tts_models/en/jenny/jenny" (irish accent) sounds good, but it is quite slow to process.
            note that it is a single speaker model and requires a slightly different command.
        model="tts_models/en/vctk/vits", speaker="ED" is quick to process. the voice quality is fairly good,
            but the speaker talks somewhat fast, which can make it harder to understand, and it is a british accent.
        you may want to go through the available voice models (see the sketch after this comment) and pick the one that works best
        '''
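        # a quick way to browse the voice models that coqui-tts ships with (run this in the
        # same environment where the 'tts' command is installed):
        #   tts --list_models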
        files_to_process = []
    
        for root, _, files in os.walk(input_dir):
            for file in files:
                if file.endswith(('.txt', '.md')):
                    files_to_process.append(os.path.join(root, file))
    
        if not files_to_process:
            print("No files to process.")
            return
    
        for input_file in files_to_process:
            relative_path = os.path.relpath(os.path.dirname(input_file), input_dir)
            output_subdir = os.path.join(output_dir, relative_path)
    
            # Ensure the output subdirectory exists
            os.makedirs(output_subdir, exist_ok=True)
    
            # Ensure unique output file names
            base_name = os.path.splitext(os.path.basename(input_file))[0]
            output_file = os.path.join(output_subdir, f"{base_name}.ogg")
            counter = 1
            while os.path.exists(output_file):
                output_file = os.path.join(output_subdir, f"{base_name}_{counter}.ogg")
                counter += 1
    
            # Convert text to audio
            success = convert_text_to_audio(input_file, output_file, model, speaker)
    
            # Move the original text file to the output directory
            if success:
                # shutil.move also works across filesystems, unlike os.rename
                shutil.move(input_file, os.path.join(output_subdir, os.path.basename(input_file)))
    
    if __name__ == "__main__":
        if is_script_already_running(os.path.basename(sys.argv[0])):
            print("Error: Another instance of the script is already running.")
            sys.exit(1)
    
        if len(sys.argv) != 3:
            print("Usage: python script.py <input_directory> <output_directory>")
            sys.exit(1)
    
        input_dir = sys.argv[1]
        output_dir = sys.argv[2]
    
        # Ensure the input directory exists
        if not os.path.exists(input_dir):
            print(f"Error: The directory {input_dir} does not exist.")
            sys.exit(1)
    
        # Ensure the output directory exists
        os.makedirs(output_dir, exist_ok=True)
    
        # Process the files
        process_files(input_dir, output_dir)
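
    to run the script, save it under any name (for example 'script.py', matching the usage message above) and pass an input directory and an output directory; the paths below are just examples:

        python3 script.py ~/texts ~/audiobooks

    text and markdown files found under the input directory are converted to .ogg files under the output directory, and each original file is moved next to its audio once the conversion succeeds.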