import os
import re

import litellm

from swarms import Agent

# LiteLLM option: silently drop request parameters the target model does not
# support (e.g., o1 models reject a non-default temperature).
litellm.drop_params = True


def extract_code_blocks(text: str) -> str:
    """
    Extracts code blocks enclosed in triple backticks from the given text.

    Args:
        text (str): The input text containing code blocks.

    Returns:
        str: The extracted code blocks joined together as a single string.
    """
    # Regular expression to match code blocks enclosed in triple backticks
    code_block_pattern = re.compile(
        r"```(?:[a-zA-Z]*)\n(.*?)```", re.DOTALL
    )

    # Find all matches and join them into a single string
    matches = code_block_pattern.findall(text)
    return "\n".join(matches)
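
# Illustrative behavior of extract_code_blocks: for the input
#   "Intro\n```python\nx = 1\n```\nOutro"
# it returns "x = 1\n" (the fenced body, trailing newline included).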


# Prompt #1: Translate PyTorch code into CUDA (extensive instructions)
translate_pytorch_to_cuda_prompt = """
You are an AI agent specialized in converting PyTorch code into efficient,
well-structured, and properly functioning CUDA code. Your role is to transform
the given PyTorch code (which may include Python-based tensor operations,
layers, and training loops) into a CUDA codebase capable of running directly on
NVIDIA GPUs without relying on the high-level abstractions of PyTorch.

Detailed Instructions:
1. Read and thoroughly understand every line of the provided PyTorch code, 
   including model definitions, forward passes, loss functions, backward passes, 
   and optimization steps.

2. Rewrite the code in pure CUDA C/C++ (an illustrative skeleton follows these
   instructions):
   - Convert high-level tensor operations to raw GPU kernel operations and
     memory management.
   - Allocate and deallocate memory on the GPU using CUDA APIs like cudaMalloc 
     and cudaFree.
   - Handle data transfers between the host (CPU) and the device (GPU) 
     appropriately (e.g., cudaMemcpy).
   - Convert PyTorch's autograd-based gradient calculations into manual CUDA 
     kernel operations, or implement them in your own backward pass where 
     needed.
   - Replace Pythonic loops and array operations with explicit CUDA kernels.
   - Implement or port any custom CUDA kernels for special functionality.

3. Optimize the CUDA code for parallel execution:
   - Use grid and block dimensions that take advantage of the target GPU's 
     compute capabilities and memory bandwidth.
   - Leverage shared memory, constant memory, or registers when beneficial.
   - Unroll loops and reduce warp divergence where possible.
   - Use efficient memory access patterns (e.g., coalesced memory access).

4. Preserve the overall logic and functionality:
   - The final CUDA code must produce numerically equivalent results to the 
     original PyTorch code for the same inputs and initialization.
   - Keep the data types, precision, and numerical stability in mind while 
     performing computations in CUDA.

5. Ensure readability and maintainability:
   - Add comments where necessary to explain tricky parts of the CUDA kernels 
     and memory management.
   - Use clear function or file organization to separate different components 
     (e.g., forward pass, backward pass, kernel definitions, helper utilities).

6. Remember that the goal is to create an extensive and fully functional .cu 
   file or files that can be compiled with NVCC or integrated into a larger 
   project.

7. Ensure PyTorch Integration:
   - Implement proper PyTorch C++ extension interfaces via <torch/extension.h>
   - Include necessary PyTorch headers and macros for tensor operations
   - Create Python bindings using pybind11 to expose CUDA functions to PyTorch
   - Handle PyTorch tensor types and conversions appropriately
   - Ensure compatibility with PyTorch's autograd system
   - Follow PyTorch's extension conventions for error handling and memory management
   - Make the code loadable as a PyTorch extension module
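
For reference, a minimal illustrative skeleton of the expected extension shape
(kernel and function names are placeholders, not a required API):

    #include <torch/extension.h>

    __global__ void scale_kernel(const float* in, float* out, float s, int n) {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) out[i] = in[i] * s;  // guard against out-of-bounds threads
    }

    torch::Tensor scale(torch::Tensor input, float s) {
        // Assumes a contiguous float32 CUDA tensor
        auto out = torch::empty_like(input);
        int n = static_cast<int>(input.numel());
        int threads = 256;
        int blocks = (n + threads - 1) / threads;
        scale_kernel<<<blocks, threads>>>(
            input.data_ptr<float>(), out.data_ptr<float>(), s, n);
        return out;
    }

    PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
        m.def("scale", &scale, "Scale a tensor by a scalar (CUDA)");
    }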

Output Requirements:
- Provide only the final CUDA code. 
- Do not include any explanatory text outside of comments in the code itself.
- The CUDA code should stand on its own as a complete solution that can be 
  compiled or integrated without additional references.
- The code must be fully compatible with PyTorch's extension system and
  able to be imported and used seamlessly within PyTorch models.
- The code should be long and extensive, covering all required functionality.
"""

# Prompt #2: Make the CUDA code super reliable and fast (extensive instructions)
make_cuda_super_reliable_and_fast_prompt = """
You are an advanced AI agent whose goal is to take a fully functional CUDA codebase
and optimize it to be extraordinarily robust, reliable, and efficient. You must
focus on performance improvements at both the kernel level and architectural
level, ensuring that the code is streamlined and built to handle potential edge
cases gracefully.

Detailed Instructions:
1. Dive deeply into the CUDA kernels and identify performance bottlenecks:
   - Look for any uncoalesced memory accesses, unnecessary global memory reads,
     or writes that could be optimized.
   - Identify opportunities to reduce shared memory usage where it yields 
     little benefit, or to increase it where doing so can cut down on global 
     memory accesses.

2. Implement advanced optimization techniques (an illustrative snippet follows
   these instructions):
   - Use loop unrolling where beneficial to reduce overhead.
   - Employ occupancy analysis to find the ideal block size and grid size for 
     maximum parallelism.
   - Consider the use of constant memory for frequently accessed read-only data.
   - Evaluate the benefits of pinned (page-locked) memory for host-device
     transfers.

3. Improve reliability and error handling:
   - Insert meaningful checks for CUDA API function calls (e.g., cudaMalloc,
     cudaMemcpy, cudaFree, kernel launches). Ensure proper cleanup if any call
     fails or returns an error code.
   - Handle edge cases where input sizes might be too large or too small, 
     preventing invalid memory accesses or out-of-bounds kernel launches.
   - Ensure that any macros, compilation flags, or library calls align with
     best practices for portability across different GPU architectures.

4. Document any advanced tricks or strategies used:
   - In-line commentary is crucial. Briefly explain why a specific optimization
     was chosen and how it impacts performance.

5. Maintain numerical equivalence:
   - All transformations must preserve the original numerical outcomes within
     reasonable floating-point precision limits. Do not alter the algorithmic
     results unexpectedly.

6. Provide a clean final output:
   - The output must be only the improved and optimized CUDA source code.
   - All extraneous explanation beyond code comments should be avoided.
   - The code must be very long and extensive, containing all necessary functionality.
   - Output only the complete code file with no additional text.
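
For reference, an illustrative occupancy-based launch configuration of the
kind described in step 2 (my_kernel, d_in, d_out, and n are placeholders):

    int minGridSize = 0, blockSize = 0;
    cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, my_kernel, 0, 0);
    int gridSize = (n + blockSize - 1) / blockSize;
    my_kernel<<<gridSize, blockSize>>>(d_in, d_out, n);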

Goal:
- Deliver high-performance CUDA code that is not only efficient in runtime
  but also resilient, capable of handling unexpected conditions, and
  thoroughly commented to allow easy maintenance and future modifications.
- Ensure the output is a complete, extensive codebase with all functionality included.
"""

# Prompt #3: Cleanup errors and add extensive documentation (extensive instructions)
cleanup_and_document_cuda_code_prompt = """
You are a specialized AI agent focused on thoroughly debugging, cleaning up, and
documenting a CUDA codebase. Your mission is to ensure the code compiles cleanly,
handles errors gracefully, follows best coding practices, and includes detailed
documentation for maintainability and educational purposes.

Detailed Instructions:
1. Debug and error cleanup:
   - Identify any compilation or runtime errors and fix them. 
   - Check for mismatched types, undeclared variables, improper memory usage, or
     incorrect kernel configurations. 
   - Make sure that each kernel launch has correct grid and block dimensions and
     that indexing inside kernels is handled properly (avoid out-of-bounds 
     threads).

2. Strengthen error handling (a sketch of one such pattern follows these
   instructions):
   - Wrap all CUDA library calls (cudaMalloc, cudaMemcpy, kernel launches, etc.)
     with macros or functions that check for and report errors in a consistent 
     manner (e.g., using cudaGetErrorString). 
   - Ensure resources are deallocated in case of failure and that the program 
     can exit safely without memory leaks or GPU lockups.

3. Add thorough documentation:
   - Provide high-level explanations near the top of the file describing the
     overall code structure and flow.
   - Within each function, write docstrings or block comments to explain the
     function's purpose, inputs, outputs, and major steps.
   - For each kernel, add comments describing how threads are mapped to data,
     how memory is accessed, and what the main loop or computational logic 
     accomplishes.

4. Check performance remains robust:
   - Ensure that none of the debugging or cleanup processes introduce unnecessary
     slowdowns. Where possible, maintain or improve previous optimizations.

5. Provide final cleaned, well-documented CUDA source code:
   - The output must contain only the updated source code, with no additional
     explanation outside of code comments and docstrings.
   - The code must be ready to compile, fully functional, and in a polished 
     state that one can confidently integrate into a production environment.
   - The code must be very long and extensive, containing all necessary functionality.
   - Output only the complete code file with no additional text.
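
For reference, a sketch of one common error-checking pattern of the kind
described in step 2 (cudaCheck and CUDA_CHECK are placeholder names; requires
<cstdio> and <cstdlib>):

    static inline void cudaCheck(cudaError_t err, const char* file, int line) {
        if (err != cudaSuccess) {
            fprintf(stderr, "CUDA error %s at %s:%d\\n",
                    cudaGetErrorString(err), file, line);
            exit(EXIT_FAILURE);
        }
    }
    #define CUDA_CHECK(call) cudaCheck((call), __FILE__, __LINE__)

    CUDA_CHECK(cudaMalloc((void**)&d_buf, bytes));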

Goal:
- Deliver a thoroughly cleaned, expertly documented CUDA code file. The
  readability, reliability, and educational clarity of the code should be 
  enhanced without sacrificing computational performance.
- Ensure the output is a complete, extensive codebase with all functionality included.
"""

# Prompt #4: Produce a final, extensive CUDA file (extensive instructions)
produce_extensive_cuda_file_prompt = """
You are an AI agent tasked with producing a final, extensive .cu or .cuh file
containing all functionality needed to run the code originally derived from 
PyTorch. This final file should encapsulate:
- The core CUDA kernels.
- The initialization and teardown logic for GPU resources.
- Optimized computations and any specialized routines (e.g., custom backward 
  passes, advanced math operations).
- Comprehensive yet concise in-code documentation.

Detailed Instructions:
1. Merge all relevant kernels, utility functions, and structures into a cohesive
   single file or a well-structured set of files.
2. Ensure you apply all previous optimizations and cleanup efforts, reflecting
   them in this combined final output.
3. Use robust function prototypes for any utility routines, paying attention to
   scope and avoiding unnecessary global variables.
4. Keep the code easily navigable with clear sections (an illustrative layout
   follows these instructions):
   - Data structures and utility definitions.
   - Kernel definitions.
   - Host functions for kernel launches.
   - Initialization and cleanup logic.
   - Document each section thoroughly for ease of reference.
5. Ensure it compiles error-free under standard NVCC compilation.
6. Provide only the CUDA code, including all documentation within comments:
   - No additional external explanation should be outside these comments.
   - The code must be very long and extensive, containing all necessary functionality.
   - Output only the complete code file with no additional text.
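
For reference, an illustrative section layout for the final file (ordering is
a suggestion, not a requirement):

    // ===== Data structures and utility definitions =====
    // ===== Kernel definitions ==========================
    // ===== Host functions for kernel launches ==========
    // ===== Initialization and cleanup logic ============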

Goal:
- A single or modular set of .cu/.cuh files that stand as the definitive version
  of the CUDA codebase, balancing performance, reliability, and maintainability.
- Ensure the output is a complete, extensive codebase with all functionality included.
"""


# Create one agent for each prompt, all sharing the same model configuration.
translate_agent = Agent(
    agent_name="Translate-PyTorch-To-CUDA-Agent",
    system_prompt=translate_pytorch_to_cuda_prompt,
    model_name="openai/o1",
    max_loops=1,
    max_tokens=10000,
    output_type="str",
    temperature=0,
)

super_fast_agent = Agent(
    agent_name="Make-CUDA-Code-Super-Fast-Agent",
    system_prompt=make_cuda_super_reliable_and_fast_prompt,
    model_name="openai/o1",
    max_loops=1,
    max_tokens=10000,
    output_type="str",
    temperature=0,
)

cleanup_agent = Agent(
    agent_name="Cleanup-and-Document-CUDA-Agent",
    system_prompt=cleanup_and_document_cuda_code_prompt,
    model_name="openai/o1",
    max_loops=1,
    max_tokens=10000,
    output_type="str",
    temperature=0,
)

final_cuda_agent = Agent(
    agent_name="Produce-Final-Extensive-CUDA-File-Agent",
    system_prompt=produce_extensive_cuda_file_prompt,
    model_name="openai/o1",
    max_loops=1,
    max_tokens=10000,
    output_type="str",
    temperature=0,
)


class CudaSwarm:
    def __init__(
        self,
        name: str = "CudaSwarm",
        description: str = "A swarm of agents that convert PyTorch code into CUDA code",
        agents: list[Agent] | None = None,
        max_loops: int = 1,
        file_path: str = "cuda_code.cu",
    ):
        self.name = name
        self.description = description
        # Avoid the shared-mutable-default pitfall: build the list per instance
        self.agents = agents or [
            translate_agent,
            super_fast_agent,
            cleanup_agent,
            final_cuda_agent,
        ]
        self.max_loops = max_loops
        self.file_path = file_path

    def write_file(self, content: str = "") -> None:
        """
        Creates a new CUDA file or overwrites an existing one with the specified content.

        Args:
            content (str): The content to write into the file. Defaults to an empty string.
        """
        # Ensure the file has a .cu extension
        if not self.file_path.endswith(".cu"):
            self.file_path += ".cu"

        # Create the parent directory if it doesn't exist
        directory = os.path.dirname(self.file_path)
        if directory:
            os.makedirs(directory, exist_ok=True)

        try:
            # Write the content, overwriting any existing file
            with open(self.file_path, "w") as file:
                file.write(content)
            print(f"Content successfully written to {self.file_path}")
        except IOError as e:
            print(f"An error occurred while writing to the file: {e}")

    def run(self, task: str):
        """
        Runs the agents in sequence: the first agent translates the task into
        CUDA code, and each subsequent agent refines the code produced by the
        previous one. The current code is written to disk after every stage.
        """
        code = extract_code_blocks(self.agents[0].run(task))
        self.write_file(code)

        for agent in self.agents[1:]:
            output = agent.run(f"{task}\n\nCurrent CUDA code:\n{code}")
            # Fall back to the previous code if an agent returns no code block
            code = extract_code_blocks(output) or code
            self.write_file(code)

        return code


if __name__ == "__main__":
    swarm = CudaSwarm(file_path="cuda_code.cu")
    swarm.run(
        "Create the cuda code for a highly optimized liquid continous learner ssm model"
    )