import os
import re

import litellm

from swarms import Agent

litellm.drop_params = True


def extract_code_blocks(text: str) -> str:
    """
    Extracts code blocks enclosed in triple backticks from the given text.

    Args:
        text (str): The input text containing code blocks.

    Returns:
        str: The extracted code blocks joined together as a single string.
    """
    # Regular expression to match code blocks enclosed in triple backticks
    code_block_pattern = re.compile(
        r"```(?:[a-zA-Z]*)\n(.*?)```", re.DOTALL
    )

    # Find all matches and join them into a single string
    matches = code_block_pattern.findall(text)
    return "\n".join(matches)


# Prompt #1: Translate PyTorch code into CUDA (extensive instructions)
translate_pytorch_to_cuda_prompt = """
You are an AI agent specialized in converting PyTorch code into efficient,
well-structured, and properly functioning CUDA code. Your role is to transform
the given PyTorch code (which may include Python-based tensor operations,
layers, and training loops) into a CUDA codebase capable of running directly on
NVIDIA GPUs without relying on the high-level abstractions of PyTorch.

Detailed Instructions:
1. Read and thoroughly understand every line of the provided PyTorch code,
   including model definitions, forward passes, loss functions, backward
   passes, and optimization steps.

2. Rewrite the code in pure CUDA C/C++:
   - Convert high-level tensor operations to raw GPU kernel operations and
     memory management.
   - Allocate and deallocate memory on the GPU using CUDA APIs like cudaMalloc
     and cudaFree.
   - Handle data transfers between the host (CPU) and the device (GPU)
     appropriately (e.g., cudaMemcpy).
   - Convert PyTorch's autograd-based gradient calculations into manual CUDA
     kernel operations, or unify them within your own backward pass
     implementation if needed.
   - Replace Pythonic loops and array operations with explicit CUDA kernels.
   - Implement or port any custom CUDA kernels for special functionality.

3. Optimize the CUDA code for parallel execution:
   - Use grid and block dimensions that take advantage of the target GPU's
     compute capabilities and memory bandwidth.
   - Leverage shared memory, constant memory, or registers when beneficial.
   - Unroll loops and reduce warp divergence where possible.
   - Use efficient memory access patterns (e.g., coalesced memory access).

4. Preserve the overall logic and functionality:
   - The final CUDA code must produce numerical results equivalent to the
     original PyTorch code for the same inputs and initialization.
   - Keep data types, precision, and numerical stability in mind while
     performing computations in CUDA.

5. Ensure readability and maintainability:
   - Add comments where necessary to explain tricky parts of the CUDA kernels
     and memory management.
   - Use clear function or file organization to separate different components
     (e.g., forward pass, backward pass, kernel definitions, helper utilities).

6. Remember that the goal is to create an extensive and fully functional .cu
   file or files that can be compiled with NVCC or integrated into a larger
   project.

7. Ensure PyTorch integration:
   - Implement proper PyTorch C++ extension interfaces using torch::extension.
   - Include the necessary PyTorch headers and macros for tensor operations.
   - Create Python bindings using pybind11 to expose CUDA functions to PyTorch.
   - Handle PyTorch tensor types and conversions appropriately.
   - Ensure compatibility with PyTorch's autograd system.
   - Follow PyTorch's extension conventions for error handling and memory
     management.
   - Make the code loadable as a PyTorch extension module.

Output Requirements:
- Provide only the final CUDA code.
- Do not include any explanatory text outside of comments in the code itself.
- The CUDA code should stand on its own as a complete solution that can be
  compiled or integrated without additional references.
- The code must be fully compatible with PyTorch's extension system and
  importable and usable seamlessly within PyTorch models.
- The code should be long and extensive, covering all required functionality.
"""


# Prompt #2: Make the CUDA code super reliable and fast (extensive instructions)
make_cuda_super_reliable_and_fast_prompt = """
You are an advanced AI agent whose goal is to take a fully functional CUDA
codebase and optimize it to be extraordinarily robust, reliable, and efficient.
You must focus on performance improvements at both the kernel level and the
architectural level, ensuring that the code is streamlined and built to handle
potential edge cases gracefully.

Detailed Instructions:
1. Dive deeply into the CUDA kernels and identify performance bottlenecks:
   - Look for uncoalesced memory accesses and unnecessary global memory reads
     or writes that could be optimized.
   - Identify opportunities to reduce shared memory usage where it yields no
     significant benefit, or to increase it where doing so can cut global
     memory accesses.

2. Implement advanced optimization techniques:
   - Use loop unrolling where beneficial to reduce overhead.
   - Employ occupancy analysis to find the ideal block and grid sizes for
     maximum parallelism.
   - Consider constant memory for frequently accessed read-only data.
   - Evaluate the benefits of pinned (page-locked) memory for host-device
     transfers.

3. Improve reliability and error handling:
   - Insert meaningful checks for CUDA API calls (e.g., cudaMalloc, cudaMemcpy,
     cudaFree, kernel launches). Ensure proper cleanup if any call fails or
     returns an error code.
   - Handle edge cases where input sizes might be too large or too small,
     preventing invalid memory accesses or out-of-bounds kernel launches.
   - Ensure that any macros, compilation flags, or library calls align with
     best practices for portability across different GPU architectures.

4. Document any advanced tricks or strategies used:
   - In-line commentary is crucial. Briefly explain why a specific optimization
     was chosen and how it impacts performance.

5. Maintain numerical equivalence:
   - All transformations must preserve the original numerical outcomes within
     reasonable floating-point precision limits. Do not alter the algorithmic
     results unexpectedly.

6. Provide a clean final output:
   - The output must contain only the improved and optimized CUDA source code.
   - Avoid any explanation beyond code comments.
   - The code must be long and extensive, containing all necessary
     functionality.
   - Output only the complete code file, with no additional text.

Goal:
- Deliver high-performance CUDA code that is not only efficient at runtime but
  also resilient, capable of handling unexpected conditions, and thoroughly
  commented to allow easy maintenance and future modification.
- Ensure the output is a complete, extensive codebase with all functionality
  included.
"""


# Prompt #3: Clean up errors and add extensive documentation (extensive instructions)
cleanup_and_document_cuda_code_prompt = """
You are a specialized AI agent focused on thoroughly debugging, cleaning up,
and documenting a CUDA codebase. Your mission is to ensure the code compiles
cleanly, handles errors gracefully, follows best coding practices, and includes
detailed documentation for maintainability and educational purposes.

Detailed Instructions:
1. Debug and clean up errors:
   - Identify any compilation or runtime errors and fix them.
   - Check for mismatched types, undeclared variables, improper memory usage,
     and incorrect kernel configurations.
   - Make sure that each kernel launch has correct grid and block dimensions
     and that indexing inside kernels is handled properly (avoid out-of-bounds
     threads).

2. Strengthen error handling:
   - Wrap all CUDA library calls (cudaMalloc, cudaMemcpy, kernel launches,
     etc.) with macros or functions that check for and report errors in a
     consistent manner (e.g., using cudaGetErrorString).
   - Ensure resources are deallocated on failure and that the program can exit
     safely without memory leaks or GPU lockups.

3. Add thorough documentation:
   - Provide high-level explanations near the top of the file describing the
     overall code structure and flow.
   - Within each function, write docstrings or block comments explaining the
     function's purpose, inputs, outputs, and major steps.
   - For each kernel, add comments describing how threads are mapped to data,
     how memory is accessed, and what the main loop or computational logic
     accomplishes.

4. Keep performance robust:
   - Ensure that none of the debugging or cleanup work introduces unnecessary
     slowdowns. Where possible, maintain or improve previous optimizations.

5. Provide the final cleaned, well-documented CUDA source code:
   - The output must contain only the updated source code, with no explanation
     outside of code comments and docstrings.
   - The code must be ready to compile, fully functional, and polished enough
     to integrate confidently into a production environment.
   - The code must be long and extensive, containing all necessary
     functionality.
   - Output only the complete code file, with no additional text.

Goal:
- Deliver a thoroughly cleaned, expertly documented CUDA code file. The
  readability, reliability, and educational clarity of the code should be
  enhanced without sacrificing computational performance.
- Ensure the output is a complete, extensive codebase with all functionality
  included.
"""


# Prompt #4: Produce a final, extensive CUDA file (extensive instructions)
produce_extensive_cuda_file_prompt = """
You are an AI agent tasked with producing a final, extensive .cu or .cuh file
containing all functionality needed to run the code originally derived from
PyTorch. This final file should encapsulate:
- The core CUDA kernels.
- The initialization and teardown logic for GPU resources.
- Optimized computations and any specialized routines (e.g., custom backward
  passes, advanced math operations).
- Comprehensive yet concise in-code documentation.

Detailed Instructions:
1. Merge all relevant kernels, utility functions, and structures into a
   cohesive single file or a well-structured set of files.
2. Apply all previous optimization and cleanup efforts, reflecting them in this
   combined final output.
3. Use robust function prototypes for any utility routines, paying attention to
   scope and avoiding unnecessary global variables.
4. Keep the code easily navigable with clear, thoroughly documented sections:
   - Data structures and utility definitions.
   - Kernel definitions.
   - Host functions for kernel launches.
   - Initialization and cleanup logic.
5. Ensure the code compiles error-free under standard NVCC compilation.
6. Provide only the CUDA code, with all documentation inside comments:
   - No additional explanation should appear outside these comments.
   - The code must be long and extensive, containing all necessary
     functionality.
   - Output only the complete code file, with no additional text.

Goal:
- A single file or modular set of .cu/.cuh files that stands as the definitive
  version of the CUDA codebase, balancing performance, reliability, and
  maintainability.
- Ensure the output is a complete, extensive codebase with all functionality
  included.
"""


# Now create one agent for each prompt, similar to the Financial-Analysis-Agent example.
translate_agent = Agent(
    agent_name="Translate-PyTorch-To-CUDA-Agent",
    system_prompt=translate_pytorch_to_cuda_prompt,
    model_name="openai/o1",
    max_loops=1,
    max_tokens=10000,
    output_type="str",
    temperature=0,
)

super_fast_agent = Agent(
    agent_name="Make-CUDA-Code-Super-Fast-Agent",
    system_prompt=make_cuda_super_reliable_and_fast_prompt,
    model_name="openai/o1",
    max_loops=1,
    max_tokens=10000,
    output_type="str",
    temperature=0,
)

cleanup_agent = Agent(
    agent_name="Cleanup-and-Document-CUDA-Agent",
    system_prompt=cleanup_and_document_cuda_code_prompt,
    model_name="openai/o1",
    max_loops=1,
    max_tokens=10000,
    output_type="str",
    temperature=0,
)

final_cuda_agent = Agent(
    agent_name="Produce-Final-Extensive-CUDA-File-Agent",
    system_prompt=produce_extensive_cuda_file_prompt,
    model_name="openai/o1",
    max_loops=1,
    max_tokens=10000,
    output_type="str",
    temperature=0,
)


class CudaSwarm:
    def __init__(
        self,
        name: str = "CudaSwarm",
        description: str = "A swarm of agents that convert PyTorch code into CUDA code",
        agents: list[Agent] | None = None,
        max_loops: int = 1,
        file_path: str = "cuda_code.cu",
    ):
        self.name = name
        self.description = description
        # Avoid a mutable default argument; fall back to the four-stage
        # pipeline defined above.
        if agents is None:
            agents = [
                translate_agent,
                super_fast_agent,
                cleanup_agent,
                final_cuda_agent,
            ]
        self.agents = agents
        self.max_loops = max_loops
        self.file_path = file_path

    def write_file(self, content: str = "") -> None:
        """
        Creates a new CUDA file or overwrites an existing one with the
        specified content.

        Args:
            content (str): The content to write into the file. Defaults to an
                empty string.
        """
        # Ensure the file has a .cu extension
        if not self.file_path.endswith(".cu"):
            self.file_path += ".cu"

        # Create the parent directory if it doesn't exist
        directory = os.path.dirname(self.file_path)
        if directory:
            os.makedirs(directory, exist_ok=True)

        try:
            # Overwrite the file with the new content
            with open(self.file_path, "w") as file:
                file.write(content)
            print(f"Content successfully written to {self.file_path}")
        except IOError as e:
            print(f"An error occurred while writing to the file: {e}")

    def run(self, task: str):
        """
        Runs the swarm of agents to complete the task.

        Currently only the translation agent is executed; the optimization,
        cleanup, and finalization stages are left disabled below.
        """
        first_iteration = self.agents[0].run(task)
        first_iteration = extract_code_blocks(first_iteration)
        self.write_file(first_iteration)

        # Later stages should chain on the previous stage's output, not the
        # original task:
        # second_iteration = self.agents[1].run(first_iteration)
        # second_iteration = extract_code_blocks(second_iteration)
        # self.write_file(second_iteration)

        # third_iteration = self.agents[2].run(second_iteration)
        # third_iteration = extract_code_blocks(third_iteration)
        # self.write_file(third_iteration)

        # final_iteration = self.agents[3].run(third_iteration)
        # final_iteration = extract_code_blocks(final_iteration)
        # self.write_file(final_iteration)

        return first_iteration


if __name__ == "__main__":
    swarm = CudaSwarm(file_path="cuda_code.cu")
    swarm.run(
        "Create the CUDA code for a highly optimized liquid continuous-learner SSM model"
    )