tools/tts.sh


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115

#!/bin/bash

# Print the command-line help on stdout, then terminate the script.
# Always exits with status 1; $0 is shown as the program name.
usage() {
	printf '%s\n' \
		"Usage: $0 -l <lang> -v <voice> -s <speed> [--play] \"<text>\"" \
		"  -l|--lang       : Specify the language (french|english)" \
		"  -v|--voice      : Specify the voice" \
		"  -s|--speed      : Specify the speed (0.0 > 3.0, default is 1.0)" \
		"  --play          : Play the generated audio file using ffplay" \
		"  <text>          : The text to synthesize"
	exit 1
}

# Check whether a value is a plain decimal number in the inclusive range
# [0, 3.0].
#
# Arguments: $1 - candidate string, e.g. "1", "0.75", "3.0"
# Returns:   0 when $1 consists of digits with an optional ".digits" fraction
#            and 0 <= $1 <= 3.0 (a leading "-" is tolerated only on zero,
#            e.g. "-0.0"); 1 otherwise.
#
# The comparison is done with string operations only, so the check does not
# depend on an external calculator such as `bc` being installed.
is_valid_float() {
	local value=$1
	local sign int_part frac_part zeros

	# Syntax: optional minus, integer digits, optional fractional digits.
	[[ $value =~ ^(-?)([0-9]+)(\.([0-9]+))?$ ]] || return 1
	sign=${BASH_REMATCH[1]}
	int_part=${BASH_REMATCH[2]}
	frac_part=${BASH_REMATCH[4]}

	# Normalize: drop leading zeros of the integer part ("03" -> "3",
	# "000" -> "") and trailing zeros of the fraction ("500" -> "5",
	# "000" -> ""). An empty part therefore means "numerically zero".
	zeros=${int_part%%[!0]*}
	int_part=${int_part#"$zeros"}
	zeros=${frac_part##*[!0]}
	frac_part=${frac_part%"$zeros"}

	# A negative sign is only in range when the magnitude is zero.
	if [[ -n $sign ]]; then
		if [[ -z $int_part && -z $frac_part ]]; then
			return 0
		fi
		return 1
	fi

	case $int_part in
	"" | 1 | 2)
		# 0.x, 1.x and 2.x are always below the upper bound.
		return 0
		;;
	3)
		# 3 is the upper bound itself: any non-zero fraction exceeds it.
		if [[ -z $frac_part ]]; then
			return 0
		fi
		return 1
		;;
	*)
		return 1
		;;
	esac
}

# Check for required environment variable: abort early with a hint on how to
# load it when LLM_API_KEY is unset or empty.
if [[ -z "${LLM_API_KEY}" ]]; then
	echo "The environment variable LLM_API_KEY is not set."
	# The hint is single-quoted on purpose so that $(...) is printed, not run.
	echo 'You can use the following command: export $(xargs < ../.env)'
	exit 1
fi

# Defaults applied when the matching option or environment variable is absent:
# playback is off, speed is 1.0, and the API host falls back to localhost
# unless TTS_API_HOST is set and non-empty.
play_audio=false
speed=1.0
host="${TTS_API_HOST:-http://localhost:8000}"

# Parse command line arguments.
#
# Globals written: lang, voice, speed, play_audio (only for options that are
#                  present), remaining_args (array holding every argument
#                  left after the options, i.e. the text to synthesize).
# Arguments:       the script's command line ("$@").
# Behaviour:       -h/--help, an unknown option, or a value option given
#                  without its value all end in usage. "--" stops option
#                  parsing so that the text may start with a dash.
parse_args() {
	remaining_args=()
	while [[ $# -gt 0 ]]; do
		case $1 in
		-l | --lang | -v | --voice | -s | --speed)
			# `shift 2` fails WITHOUT shifting when only one argument is
			# left, which would make this loop spin forever; reject a
			# missing value explicitly instead.
			if [[ $# -lt 2 ]]; then
				echo "Error: Option $1 requires a value."
				usage
			fi
			case $1 in
			-l | --lang) lang="$2" ;;
			-v | --voice) voice="$2" ;;
			-s | --speed) speed="$2" ;;
			esac
			shift 2
			;;
		--play)
			play_audio=true
			shift 1
			;;
		-h | --help)
			usage
			;;
		--)
			# Explicit end of options.
			shift 1
			break
			;;
		-*)
			echo "Unknown option $1"
			usage
			;;
		*)
			# First non-option word: everything from here on is text.
			break
			;;
		esac
	done
	remaining_args=("$@")
}

parse_args "$@"
# Leave only the non-option arguments in the positional parameters.
set -- "${remaining_args[@]}"

# Everything left on the command line after the options is the text to
# synthesize; the words are joined into a single string. No words at all is
# an error that ends in the usage message.
case $# in
0)
	echo "Error: Text to synthesize is required."
	usage
	;;
*)
	text="$*"
	;;
esac

# Name the output file after the current local date and time
# (speech_YYYYmmdd_HHMMSS.wav).
timestamp="$(date '+%Y%m%d_%H%M%S')"
printf -v filename 'speech_%s.wav' "$timestamp"

# Both the language and the voice must have been supplied and be non-empty;
# otherwise report the problem and show the usage message.
if ! [[ -n "$lang" && -n "$voice" ]]; then
	echo "Error: Language (-l) and voice (-v) options are required."
	usage
fi

# Reject a speed that is_valid_float does not accept and stop with status 1.
is_valid_float "$speed" || {
	echo "Error: Speed must be a float between 0.0 and 3.0."
	exit 1
}

# Escape a string so it can be embedded inside a double-quoted JSON string.
# Handles backslash, double quote and the common control characters
# (newline, carriage return, tab, backspace, form feed); other control
# characters are passed through unchanged.
# Arguments: $1 - raw string
# Outputs:   the escaped string on stdout (no trailing newline)
json_escape() {
	local s=$1
	s=${s//\\/\\\\} # backslashes first, so the escapes added below stay intact
	s=${s//\"/\\\"}
	s=${s//$'\n'/\\n}
	s=${s//$'\r'/\\r}
	s=${s//$'\t'/\\t}
	s=${s//$'\b'/\\b}
	s=${s//$'\f'/\\f}
	printf '%s' "$s"
}

# Build the JSON request body. The user-supplied text and voice are escaped so
# that quotes, backslashes or newlines in them cannot break or alter the JSON.
payload=$(printf '{"model": "tts-1","input": "%s","voice": "%s","response_format": "wav","speed": %s}' \
	"$(json_escape "$text")" "$(json_escape "$voice")" "$speed")

# Fetch the audio file from the API: the response body is written to
# ${filename} and only the HTTP status code is captured.
http_status_code=$(curl -s "${host}/v1/audio/speech" \
	-o "${filename}" \
	-w "%{http_code}" \
	-H "Authorization: Bearer ${LLM_API_KEY}" \
	-H "Content-Type: application/json" \
	-d "${payload}")

# Check the response code for successful HTTP request
if [[ "$http_status_code" -ne 200 ]]; then
	echo "Error: Failed to fetch audio file. Received HTTP status code: $http_status_code"
	# Whatever was written is not audio (error payload or nothing at all);
	# do not leave it behind under a .wav name.
	rm -f -- "${filename}"
	exit 1
fi

# Optionally play the generated WAV file with ffplay (requested via --play).
if [[ "$play_audio" == true ]]; then
	if ! command -v ffplay &>/dev/null; then
		# ffplay is distributed as part of FFmpeg, so that is what to install.
		echo "Error: ffplay is not installed. Please install ffmpeg to play audio files."
		exit 1
	fi
	# No video window, no banner or statistics, quit when playback finishes.
	ffplay "${filename}" -nodisp -nostats -hide_banner -autoexit -v quiet
fi

echo "Audio file '$filename' generated successfully."