import os
import re

import litellm
from swarms import Agent

# Module-level litellm switch; set before any agent is created below.
litellm.drop_params = True


def extract_code_blocks(text: str) -> str:
    """
    Extracts code blocks enclosed in triple backticks from the given text.

    A block starts with three backticks, an optional info string (for
    example a language tag such as ``cpp``, ``c++`` or ``python3``) and a
    line break, and ends at the next three backticks. Only the block
    bodies are returned; the fences and info strings are dropped.

    Args:
        text (str): The input text containing code blocks.

    Returns:
        str: The extracted code blocks joined together with newlines as a
        single string. Empty if the text contains no fenced block.
    """
    # The info string may contain anything except a newline or a backtick.
    # Restricting it to ASCII letters would skip fences such as "c++",
    # "python3", "cuda-c", tags followed by spaces, or CRLF line endings.
    code_block_pattern = re.compile(r"```[^\n`]*\n(.*?)```", re.DOTALL)

    # DOTALL lets the non-greedy body span several lines.
    matches = code_block_pattern.findall(text)
    return "\n".join(matches)


# Prompt #1: Translate PyTorch code into CUDA (extensive instructions)
translate_pytorch_to_cuda_prompt = """ You are an AI agent specialized in converting PyTorch code into efficient, well-structured, and properly functioning CUDA code. Your role is to transform the given PyTorch code (which may include Python-based tensor operations, layers, and training loops) into a CUDA codebase capable of running directly on NVIDIA GPUs without relying on the high-level abstractions of PyTorch. Detailed Instructions: 1. Read and thoroughly understand every line of the provided PyTorch code, including model definitions, forward passes, loss functions, backward passes, and optimization steps. 2. Rewrite the code in pure CUDA C/C++: - Convert high-level tensor operations to raw GPU kernel operations and memory management. - Allocate and deallocate memory on the GPU using CUDA APIs like cudaMalloc and cudaFree. - Handle data transfers between the host (CPU) and the device (GPU) appropriately (e.g., cudaMemcpy). - Convert PyTorch's autograd-based gradient calculations into manual CUDA kernel operations or unify them within your own backward pass implementation if needed. - Replace Pythonic loops and array operations with explicit CUDA kernels. - Implement or port any custom CUDA kernels for special functionality. 3. 
Optimize the CUDA code for parallel execution: - Use grid and block dimensions that take advantage of the target GPU's compute capabilities and memory bandwidth. - Leverage shared memory, constant memory, or registers when beneficial. - Unroll loops and reduce warp divergence where possible. - Use efficient memory access patterns (e.g., coalesced memory access). 4. Preserve the overall logic and functionality: - The final CUDA code must produce equivalent numerical results as the original PyTorch code for the same inputs and initialization. - Keep the data types, precision, and numerical stability in mind while performing computations in CUDA. 5. Ensure readability and maintainability: - Add comments where necessary to explain tricky parts of the CUDA kernels and memory management. - Use clear function or file organization to separate different components (e.g., forward pass, backward pass, kernel definitions, helper utilities). 6. Remember that the goal is to create an extensive and fully functional .cu file or files that can be compiled with NVCC or integrated into a larger project. 7. Ensure PyTorch Integration: - Implement proper PyTorch C++ extension interfaces using torch::extension - Include necessary PyTorch headers and macros for tensor operations - Create Python bindings using pybind11 to expose CUDA functions to PyTorch - Handle PyTorch tensor types and conversions appropriately - Ensure compatibility with PyTorch's autograd system - Follow PyTorch's extension conventions for error handling and memory management - Make the code loadable as a PyTorch extension module Output Requirements: - Provide only the final CUDA code. - Do not include any explanatory text outside of comments in the code itself. - The CUDA code should stand on its own as a complete solution that can be compiled or integrated without additional references. 
- The code must be fully compatible with PyTorch's extension system and able to be imported and used seamlessly within PyTorch models. - Ensure the code is fully compatible with PyTorch's extension system. - And, the code should be very long and extensive """

# Prompt #2: Make the CUDA code super reliable and fast (extensive instructions)
make_cuda_super_reliable_and_fast_prompt = """ You are an advanced AI agent whose goal is to take a fully functional CUDA codebase and optimize it to be extraordinarily robust, reliable, and efficient. You must focus on performance improvements at both the kernel level and architectural level, ensuring that the code is streamlined and built to handle potential edge cases gracefully. Detailed Instructions: 1. Dive deeply into the CUDA kernels and identify performance bottlenecks: - Look for any uncoalesced memory accesses, unnecessary global memory reads, or writes that could be optimized. - Identify any opportunities to reduce shared memory usage, if that memory usage does not yield significant benefit, or to increase it if it can help reduce global memory accesses. 2. Implement advanced optimization techniques: - Use loop unrolling where beneficial to reduce overhead. - Employ occupancy analysis to find the ideal block size and grid size for maximum parallelism. - Consider the use of constant memory for frequently accessed read-only data. - Evaluate the benefits of pinned (page-locked) memory for host-device transfers. 3. Improve reliability and error handling: - Insert meaningful checks for CUDA API function calls (e.g., cudaMalloc, cudaMemcpy, cudaFree, kernel launches). Ensure proper cleanup if any call fails or returns an error code. - Handle edge cases where input sizes might be too large or too small, preventing invalid memory accesses or out-of-bounds kernel launches. - Ensure that any macros, compilation flags, or library calls align with best practices for portability across different GPU architectures. 4. 
Document any advanced tricks or strategies used: - In-line commentary is crucial. Briefly explain why a specific optimization was chosen and how it impacts performance. 5. Maintain numerical equivalence: - All transformations must preserve the original numerical outcomes within reasonable floating-point precision limits. Do not alter the algorithmic results unexpectedly. 6. Provide a clean final output: - The output must be only the improved and optimized CUDA source code. - All extraneous explanation beyond code comments should be avoided. - The code must be very long and extensive, containing all necessary functionality. - Output only the complete code file with no additional text. Goal: - Deliver a high-performance CUDA code that is not only efficient in running time but is also resilient, capable of handling unexpected conditions, and thoroughly commented to allow easy maintenance and future modifications. - Ensure the output is a complete, extensive codebase with all functionality included. """

# Prompt #3: Cleanup errors and add extensive documentation (extensive instructions)
cleanup_and_document_cuda_code_prompt = """ You are a specialized AI agent focused on thoroughly debugging, cleaning up, and documenting a CUDA codebase. Your mission is to ensure the code compiles cleanly, handles errors gracefully, follows best coding practices, and includes detailed documentation for maintainability and educational purposes. Detailed Instructions: 1. Debug and error cleanup: - Identify any compilation or runtime errors and fix them. - Check for mismatched types, undeclared variables, improper memory usage, or incorrect kernel configurations. - Make sure that each kernel launch has correct grid and block dimensions and that indexing inside kernels is handled properly (avoid out-of-bounds threads). 2. Strengthen error handling: - Wrap all CUDA library calls (cudaMalloc, cudaMemcpy, kernel launches, etc.) 
with macros or functions that check for and report errors in a consistent manner (e.g., using cudaGetErrorString). - Ensure resources are deallocated in case of failure and that the program can exit safely without memory leaks or GPU lockups. 3. Add thorough documentation: - Provide high-level explanations near the top of the file describing the overall code structure and flow. - Within each function, write docstrings or block comments to explain the function's purpose, inputs, outputs, and major steps. - For each kernel, add comments describing how threads are mapped to data, how memory is accessed, and what the main loop or computational logic accomplishes. 4. Check performance remains robust: - Ensure that none of the debugging or cleanup processes introduce unnecessary slowdowns. Where possible, maintain or improve previous optimizations. 5. Provide final cleaned, well-documented CUDA source code: - The output must contain only the updated source code, with no additional explanation outside of code comments and docstrings. - The code must be ready to compile, fully functional, and in a polished state that one can confidently integrate into a production environment. - The code must be very long and extensive, containing all necessary functionality. - Output only the complete code file with no additional text. Goal: - Deliver a thoroughly cleaned, expertly documented CUDA code file. The readability, reliability, and educational clarity of the code should be enhanced without sacrificing computational performance. - Ensure the output is a complete, extensive codebase with all functionality included. """

# Prompt #4: Produce a final, extensive CUDA file (extensive instructions)
produce_extensive_cuda_file_prompt = """ You are an AI agent tasked with producing a final, extensive .cu or .cuh file containing all functionality needed to run the code originally derived from PyTorch. This final file should encapsulate: - The core CUDA kernels. 
- The initialization and teardown logic for GPU resources. - Optimized computations and any specialized routines (e.g., custom backward passes, advanced math operations). - Comprehensive yet concise in-code documentation. Detailed Instructions: 1. Merge all relevant kernels, utility functions, and structures into a cohesive single file or a well-structured set of files. 2. Ensure you apply all previous optimizations and cleanup efforts, reflecting them in this combined final output. 3. Use robust function prototypes for any utility routines, paying attention to scope and avoiding unnecessary global variables. 4. Keep the code easily navigable with clear sections: - Data structures and utility definitions. - Kernel definitions. - Host functions for kernel launches. - Initialization and cleanup logic. - Document each section thoroughly for ease of reference. 5. Ensure it compiles error-free under standard NVCC compilation. 6. Provide only the CUDA code, including all documentation within comments: - No additional external explanation should be outside these comments. - The code must be very long and extensive, containing all necessary functionality. - Output only the complete code file with no additional text. Goal: - A single or modular set of .cu/.cuh files that stand as the definitive version of the CUDA codebase, balancing performance, reliability, and maintainability. - Ensure the output is a complete, extensive codebase with all functionality included. """

# Now create one agent for each prompt.
translate_agent = Agent(
    agent_name="Translate-PyTorch-To-CUDA-Agent",
    system_prompt=translate_pytorch_to_cuda_prompt,
    model_name="openai/o1",
    max_loops=1,
    max_tokens=10000,
    output_type="str",
    temperature=0,
)

super_fast_agent = Agent(
    agent_name="Make-CUDA-Code-Super-Fast-Agent",
    system_prompt=make_cuda_super_reliable_and_fast_prompt,
    model_name="openai/o1",
    max_loops=1,
    max_tokens=10000,
    output_type="str",
    temperature=0,
)

cleanup_agent = Agent(
    agent_name="Cleanup-and-Document-CUDA-Agent",
    system_prompt=cleanup_and_document_cuda_code_prompt,
    model_name="openai/o1",
    max_loops=1,
    max_tokens=10000,
    output_type="str",
    temperature=0,
)

final_cuda_agent = Agent(
    agent_name="Produce-Final-Extensive-CUDA-File-Agent",
    system_prompt=produce_extensive_cuda_file_prompt,
    model_name="openai/o1",
    max_loops=1,
    max_tokens=10000,
    output_type="str",
    temperature=0,
)


class CudaSwarm:
    """
    A small pipeline object that asks an agent for CUDA code and saves it.

    Attributes are plain copies of the constructor arguments:
        name: Display name of the swarm.
        description: Short description of what the swarm does.
        agents: The agents available to the swarm, in pipeline order.
        max_loops: Stored on the instance; not read by any method of
            this class.
        file_path: Destination of the generated CUDA file.
    """

    def __init__(
        self,
        name: str = "CudaSwarm",
        description: str = "A swarm of agents that convert PyTorch code into CUDA code",
        agents: list[Agent] = [
            translate_agent,
            super_fast_agent,
            cleanup_agent,
            final_cuda_agent,
        ],
        max_loops: int = 1,
        file_path: str = "cuda_code.cu",
    ):
        """
        Stores the swarm configuration.

        Args:
            name (str): Display name of the swarm.
            description (str): Short description of the swarm.
            agents (list[Agent]): Agents in pipeline order. Defaults to the
                four module-level agents.
            max_loops (int): Stored as-is on the instance.
            file_path (str): Where ``write_file`` saves its content.
        """
        self.name = name
        self.description = description
        # Copy the list: the default is a single list object shared by every
        # call, so keeping a reference would let one instance's mutations
        # leak into all later instances (and into the caller's own list).
        self.agents = list(agents)
        self.max_loops = max_loops
        self.file_path = file_path

    def write_file(self, content: str = "") -> None:
        """
        Creates a new CUDA file or overwrites an existing one with the specified content.

        If ``self.file_path`` does not end in ``.cu`` the suffix is appended
        and the updated path is kept on the instance. Missing parent
        directories are created. Filesystem errors are reported with
        ``print`` rather than raised.

        Args:
            content (str): The content to write into the file. Defaults to an empty string.
        """
        # Ensure the file has a .cu extension
        if not self.file_path.endswith(".cu"):
            self.file_path += ".cu"

        directory = os.path.dirname(self.file_path)

        try:
            # exist_ok avoids the check-then-create race of testing
            # os.path.exists() before calling makedirs().
            if directory:
                os.makedirs(directory, exist_ok=True)

            # Explicit UTF-8 so non-ASCII characters in generated code or
            # comments do not fail on platforms whose default encoding
            # cannot represent them.
            with open(self.file_path, "w", encoding="utf-8") as file:
                file.write(content)
        except OSError as e:
            print(f"An error occurred while writing to the file: {e}")
        else:
            print(f"Content successfully written to {self.file_path}")

    def run(self, task: str):
        """
        Runs the first agent on the task and saves the code it produces.

        NOTE: only ``self.agents[0]`` is executed. The remaining agents are
        held in ``self.agents`` but are not invoked here.

        Args:
            task (str): The task description handed to the agent.

        Returns:
            The code extracted from the agent's fenced code blocks, or the
            agent's raw output when it contains no fenced block.
        """
        raw_output = self.agents[0].run(task)

        # The system prompt asks for "only the final CUDA code", so the
        # answer may arrive without Markdown fences. Fall back to the raw
        # output in that case instead of writing an empty file.
        cuda_code = extract_code_blocks(raw_output) or raw_output

        self.write_file(cuda_code)
        return cuda_code


if __name__ == "__main__":
    swarm = CudaSwarm(file_path="cuda_code.cu")
    swarm.run(
        "Create the cuda code for a highly optimized liquid continous learner ssm model"
    )