import os
import discord
from discord.ext import commands
import interpreter
import dotenv
import whisper

# Credentials and identifiers are read from a local .env file.
dotenv.load_dotenv(".env")

bot_id = os.getenv("BOT_ID")
bot_token = os.getenv("DISCORD_TOKEN")
interpreter.api_key = os.getenv("OPENAI_API_KEY")
# Optional interpreter settings (disabled):
# interpreter.api_base = os.getenv("API_BASE")
# interpreter.auto_run = True


def split_text(text, chunk_size=1500):
    """Split *text* into consecutive pieces of at most *chunk_size* characters.

    Args:
        text: The string to split.
        chunk_size: Maximum length of each piece.

    Returns:
        A list of substrings that concatenate back to *text*; empty when
        *text* is empty.
    """
    return [text[i : i + chunk_size] for i in range(0, len(text), chunk_size)]


# discord initial
intents = discord.Intents.all()
intents.message_content = True
client = commands.Bot(command_prefix="$", intents=intents)

message_chunks = []
send_image = False

# Loaded once at import time and shared by every transcribe() call.
model = whisper.load_model("base")


def transcribe(audio):
    """Transcribe an audio file with the module-level whisper model.

    Args:
        audio: Audio source handed to ``whisper.load_audio``.

    Returns:
        The decoded text.
    """
    # load audio and pad/trim it to fit 30 seconds
    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)

    # make log-Mel spectrogram and move to the same device as the model
    mel = whisper.log_mel_spectrogram(audio).to(model.device)

    # detect the spoken language (the probabilities are not used below)
    _, _language_probs = model.detect_language(mel)

    # decode the audio
    options = whisper.DecodingOptions()
    result = whisper.decode(model, mel, options)
    return result.text


@client.event
async def on_message(message):
    """Forward a chat message to the interpreter and post its last reply.

    Commands are dispatched first via ``client.process_commands``; the
    message content is then passed to ``interpreter.chat`` and the final
    "message" entry it produces is sent back to the channel in pieces that
    respect Discord's message length limit.
    """
    # Ignore our own messages: each reply we post arrives here as a new
    # message and would otherwise be fed back to the interpreter forever.
    if message.author == client.user:
        return

    await client.process_commands(message)

    # NOTE(review): interpreter.chat is iterated synchronously inside this
    # coroutine, so the event loop is busy until it finishes.
    replies = [
        chunk["message"]
        for chunk in interpreter.chat(message.content, display=False, stream=False)
        if "message" in chunk
    ]

    # Nothing to post when no chunk carried a "message" entry or the last
    # one is empty (indexing an empty list would raise IndexError).
    if not replies or not replies[-1]:
        return

    max_message_length = 2000  # Discord's max message length is 2000 characters

    # Sending each chunk as a separate message
    for part in split_text(replies[-1], max_message_length):
        await message.channel.send(part)
@client.command()
async def join(ctx):
    """Connect the bot to the voice channel the command author is in."""
    if ctx.author.voice:
        channel = ctx.message.author.voice.channel
        print("joining..")
        await channel.connect()
        print("joined.")
    else:
        print("not in a voice channel!")


@client.command()
async def leave(ctx):
    """Disconnect the bot from its current voice channel, if any."""
    if ctx.voice_client:
        await ctx.voice_client.disconnect()
    else:
        print("not in a voice channel!")


@client.command()
async def listen(ctx):
    """Start recording the voice channel; `callback` runs when recording stops."""
    if ctx.voice_client:
        print("trying to listen..")
        ctx.voice_client.start_recording(discord.sinks.WaveSink(), callback, ctx)
        print("listening..")
    else:
        print("not in a voice channel!")


async def callback(sink: discord.sinks.Sink, ctx):
    """Transcribe the command author's recording and post the interpreter's reply.

    Args:
        sink: The recording sink; its ``audio_data`` maps user ids to the
            audio captured for that user.
        ctx: The command context passed to ``start_recording`` by `listen`.
    """
    print("in callback..")

    # Only the audio of the user who issued the command is processed.
    user_id = ctx.author.id
    audio = sink.audio_data.get(user_id)
    if audio is None:
        return

    print("saving audio..")
    print(user_id)
    filename = "audio.wav"
    with open(filename, "wb") as f:
        f.write(audio.file.getvalue())
    print("audio saved.")

    transcription = transcribe(filename)
    print(transcription)

    response = [
        chunk["message"]
        for chunk in interpreter.chat(transcription, display=False, stream=True)
        if "message" in chunk
    ]

    # NOTE(review): with stream=True the chunks may be partial fragments, in
    # which case joining with a space splits words - confirm the chunk shape.
    reply = " ".join(response)

    # Discord rejects empty messages, so skip the send when there is no text.
    if not reply.strip():
        return

    # split_text's default chunk size (1500) keeps every piece below
    # Discord's 2000-character message limit.
    for part in split_text(reply):
        await ctx.message.channel.send(part)


@client.command()
async def stop(ctx):
    """Stop the current recording, which triggers the recording callback."""
    # Same guard as `leave`/`listen`: voice_client is None when the bot is
    # not connected, and calling stop_recording on it would raise.
    if ctx.voice_client:
        ctx.voice_client.stop_recording()
    else:
        print("not in a voice channel!")


@client.event
async def on_ready():
    """Log the account the bot is logged in as once the client is ready."""
    print(f"We have logged in as {client.user}")


client.run(bot_token)