anthropic + kosmos2 + fastvit

Former-commit-id: 6b4c2d45d3
2 years ago · 0c4dd88f98
parent da120e1aef
commit 0c4dd88f98
7 changed files with 1202 additions and 14 deletions
--- a/demos/accountant_team/accountant_team.py
+++ b/demos/accountant_team/accountant_team.py
@ -1,7 +1,6 @@
 # !pip install --upgrade swarms==2.0.6
 from swarms.models import OpenAIChat
 from swarms.models.nougat import Nougat
 from swarms.structs import Flow
@ -11,7 +10,10 @@ from swarms.structs.sequential_workflow import SequentialWorkflow
 IMAGE_OF_FINANCIAL_DOC_URL = "bank_statement_2.jpg"
 # Example usage
-api_key = "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"  # Your actual API key here
+api_key = (
    "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"  # Your actual API key here
 )
 # Initialize the OCR model
 def ocr_model(img: str):
@ -19,12 +21,14 @@ def ocr_model(img: str):
    analyze_finance_docs = ocr(img)
    return str(analyze_finance_docs)
 # Initialize the language flow
 llm = OpenAIChat(
    openai_api_key=api_key,
    temperature=0.5,
 )
 # Create a prompt for the language model
 def summary_agent_prompt(analyzed_doc: str):
    analyzed_doc = ocr_model(img=analyzed_doc)
@ -36,6 +40,7 @@ def summary_agent_prompt(analyzed_doc: str):
    {analyzed_doc}
    """
 # Initialize the Flow with the language flow
 flow1 = Flow(llm=llm, max_loops=1, dashboard=False)
@ -49,7 +54,10 @@ workflow = SequentialWorkflow(max_loops=1)
 workflow.add(summary_agent_prompt(IMAGE_OF_FINANCIAL_DOC_URL), flow1)
 # Suppose the next task takes the output of the first task as input
-workflow.add("Provide an actionable step by step plan on how to cut costs from the analyzed financial document.", flow2)
+workflow.add(
    "Provide an actionable step by step plan on how to cut costs from the analyzed financial document.",
    flow2,
 )
 # Run the workflow
 workflow.run()
--- a/playground/models/anthropic_example.py
+++ b/playground/models/anthropic_example.py
@ -1,9 +1,7 @@
 from swarms.models.anthropic import Anthropic
-model = Anthropic(
+model = Anthropic(anthropic_api_key="")
    anthropic_api_key=""
 )
 task = "Say hello to"
--- a/swarms/models/fast_vit_classes.json
+++ b/swarms/models/fast_vit_classes.json
--- a/swarms/models/fastvit.py
+++ b/swarms/models/fastvit.py
@ -0,0 +1,80 @@
 import json
 import os
 from typing import List
 import numpy as np
 import timm
 import torch
 from PIL import Image
 from pydantic import BaseModel, StrictFloat, StrictInt, validator
 # Run on the GPU when CUDA is available, otherwise fall back to the CPU.
 DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 # Load the classes for image classification from the JSON file that sits next
 # to this module. The encoding is given explicitly so the file is read the
 # same way regardless of the platform's default locale encoding.
 with open(
    os.path.join(os.path.dirname(__file__), "fast_vit_classes.json"),
    encoding="utf-8",
 ) as f:
    FASTVIT_IMAGENET_1K_CLASSES = json.load(f)
 class ClassificationResult(BaseModel):
    """
    Result of one classification call.

    ``class_id`` and ``confidence`` are parallel lists: entry ``i`` of
    ``confidence`` is the probability reported for entry ``i`` of ``class_id``.
    """

    class_id: List[StrictInt]
    confidence: List[StrictFloat]

    @validator("class_id", "confidence", pre=True, each_item=True)
    def check_list_contents(cls, v):
        """
        Reject any list element that is neither an int nor a float.

        Runs before the field's own strict type validation (``pre=True``) and
        once per element (``each_item=True``).

        Raises:
            ValueError: if ``v`` is not an int or a float.
        """
        # Raise explicitly instead of using ``assert``: assertions are removed
        # when Python runs with -O, which would silently disable this check.
        if not isinstance(v, (int, float)):
            raise ValueError("must be integer or float")
        return v
 class FastViT:
    """
    FastViT model for image classification

    The constructor loads the pretrained ``timm/fastvit_s12.apple_in1k``
    checkpoint, moves it to ``DEVICE``, builds the matching inference
    transforms and switches the model to eval mode. Calling the instance
    classifies a single image file.

    Args:
        img (str): path to the input image
        confidence_threshold (float): a class is reported only when its
            softmax probability is strictly greater than this value

    Returns:
        ClassificationResult: a pydantic BaseModel containing the class ids and
        confidences of the model's predictions, ordered from most to least
        probable

    Example:
        >>> fastvit = FastViT()
        >>> result = fastvit(img="path_to_image.jpg", confidence_threshold=0.5)

    Note:
        This module reads ``fast_vit_classes.json`` from its own directory at
        import time, so that file must exist next to it.
    """

    def __init__(self):
        self.model = timm.create_model(
            "hf_hub:timm/fastvit_s12.apple_in1k", pretrained=True
        ).to(DEVICE)
        # Derive the preprocessing pipeline from the model's own data config.
        data_config = timm.data.resolve_model_data_config(self.model)
        self.transforms = timm.data.create_transform(**data_config, is_training=False)
        self.model.eval()

    def __call__(
        self, img: str, confidence_threshold: float = 0.5
    ) -> ClassificationResult:
        """Classify ``img`` and return every class whose probability exceeds the threshold."""
        # The context manager closes the underlying file once the pixels have
        # been copied into the converted RGB image.
        with Image.open(img) as raw_image:
            rgb_image = raw_image.convert("RGB")
        img_tensor = self.transforms(rgb_image).unsqueeze(0).to(DEVICE)

        with torch.no_grad():
            output = self.model(img_tensor)
            probabilities = torch.nn.functional.softmax(output, dim=1)

        # ``torch.topk`` requires an integer ``k``. Requesting one entry per
        # output class returns all classes sorted by descending probability;
        # the threshold below then decides how many are kept.
        top_probs, top_classes = torch.topk(probabilities, k=probabilities.shape[1])

        # Filter by confidence threshold
        mask = top_probs > confidence_threshold
        top_probs, top_classes = top_probs[mask], top_classes[mask]

        # Convert to plain Python lists for the pydantic result model.
        # NOTE(review): the ids presumably index into FASTVIT_IMAGENET_1K_CLASSES
        # if human-readable labels are needed -- confirm against the JSON layout.
        return ClassificationResult(
            class_id=top_classes.cpu().tolist(),
            confidence=top_probs.cpu().tolist(),
        )
--- a/swarms/models/kosmos2.py
+++ b/swarms/models/kosmos2.py
@ -0,0 +1,100 @@
 from typing import List, Tuple
 import numpy as np
 from PIL import Image
 from pydantic import BaseModel, root_validator, validator
 from transformers import AutoModelForVision2Seq, AutoProcessor
 # Assuming the Detections class represents the output of the model prediction
 class Detections(BaseModel):
    """
    Parallel lists describing detected objects.

    Entry ``i`` of ``xyxy`` (a 4-float box), ``class_id`` and ``confidence``
    all refer to the same detection, so the three lists must be equally long.
    """

    xyxy: List[Tuple[float, float, float, float]]
    class_id: List[int]
    confidence: List[float]

    # ``skip_on_failure=True`` keeps this check from running when a field has
    # already failed validation (the field would then be missing from
    # ``values``); pydantic v2 also requires it for post root validators.
    @root_validator(skip_on_failure=True)
    def check_length(cls, values):
        """
        Ensure the three lists have the same number of entries.

        Raises:
            ValueError: if the list lengths differ.
        """
        lengths = {
            len(values[name]) for name in ("xyxy", "class_id", "confidence")
        }
        # Explicit raise rather than ``assert`` so the check survives python -O.
        if len(lengths) > 1:
            raise ValueError("All fields must have the same length.")
        return values

    @validator("xyxy", "class_id", "confidence", pre=True, each_item=True)
    def check_not_empty(cls, v):
        """
        Reject list elements that are themselves empty lists.

        Runs per element before type coercion, so an empty outer list (as
        produced by ``empty()``) never reaches this check.

        Raises:
            ValueError: if an element is an empty list.
        """
        if isinstance(v, list) and len(v) == 0:
            raise ValueError("List must not be empty")
        return v

    @classmethod
    def empty(cls):
        """Return a ``Detections`` instance with no detections."""
        return cls(xyxy=[], class_id=[], confidence=[])
 class Kosmos2(BaseModel):
    """
    Wrapper around the ``ydshieh/kosmos-2-patch14-224`` vision-to-sequence model.

    Build an instance with ``Kosmos2.initialize()`` and call it with an image
    path to obtain a ``Detections`` object. Entity extraction is still a
    placeholder (``extract_entities`` returns an empty list), so calls
    currently always produce ``Detections.empty()``.
    """

    # The fields are typed as ``object`` instead of the Auto* factory classes:
    # ``from_pretrained`` returns instances of the concrete architecture
    # classes, not of ``AutoModelForVision2Seq`` / ``AutoProcessor``, and
    # pydantic cannot build a validator for those factory classes anyway.
    model: object  # model returned by AutoModelForVision2Seq.from_pretrained
    processor: object  # processor returned by AutoProcessor.from_pretrained

    @classmethod
    def initialize(cls):
        """Load the pretrained model and processor and wrap them in a ``Kosmos2``."""
        checkpoint = "ydshieh/kosmos-2-patch14-224"
        model = AutoModelForVision2Seq.from_pretrained(
            checkpoint, trust_remote_code=True
        )
        processor = AutoProcessor.from_pretrained(checkpoint, trust_remote_code=True)
        return cls(model=model, processor=processor)

    def __call__(self, img: str) -> Detections:
        """
        Run grounded generation on the image at ``img`` and return detections.

        Args:
            img (str): path to the input image

        Returns:
            Detections: boxes scaled by the image width/height, with
            placeholder class ids and confidences.
        """
        prompt = "<grounding>An image of"
        # Keep the file open only for as long as the image is being used.
        with Image.open(img) as image:
            inputs = self.processor(text=prompt, images=image, return_tensors="pt")
            outputs = self.model.generate(
                **inputs, use_cache=True, max_new_tokens=64
            )
            generated_text = self.processor.batch_decode(
                outputs, skip_special_tokens=True
            )[0]

            # The actual processing of generated_text to entities would go here;
            # for now ``extract_entities`` is a stub.
            entities = self.extract_entities(generated_text)

            # Convert entities to detections format
            return self.process_entities_to_detections(entities, image)

    def extract_entities(
        self, text: str
    ) -> List[Tuple[str, Tuple[float, float, float, float]]]:
        """
        Placeholder for entity extraction; always returns an empty list.

        This should be replaced with the actual method of extracting
        ``(label, (x1, y1, x2, y2))`` entities from the generated text.
        """
        return []

    def process_entities_to_detections(
        self,
        entities: List[Tuple[str, Tuple[float, float, float, float]]],
        image: Image.Image,
    ) -> Detections:
        """
        Convert ``(label, box)`` entities into a ``Detections`` object.

        Each box's x values are multiplied by ``image.width`` and its y values
        by ``image.height``. Class ids are all ``0`` and confidences all
        ``1.0`` until real extraction logic exists.
        """
        if not entities:
            return Detections.empty()

        width, height = image.width, image.height
        boxes = [entity[1] for entity in entities]
        xyxys = [
            (box[0] * width, box[1] * height, box[2] * width, box[3] * height)
            for box in boxes
        ]
        class_ids = [0] * len(entities)  # Replace with actual class ID extraction logic
        confidences = [1.0] * len(entities)  # Placeholder confidence
        return Detections(xyxy=xyxys, class_id=class_ids, confidence=confidences)
 # Usage:
 # kosmos2 = Kosmos2.initialize()
 # detections = kosmos2(img="path_to_image.jpg")
--- a/swarms/structs/sequential_workflow.py
+++ b/swarms/structs/sequential_workflow.py
@ -286,12 +286,14 @@ class SequentialWorkflow:
            )
        )
-        task = Task(description=task, flow=kwargs["flow"], args=list(kwargs["args"]), kwargs=kwargs["kwargs"])
+        task = Task(
            description=task,
            flow=kwargs["flow"],
            args=list(kwargs["args"]),
            kwargs=kwargs["kwargs"],
        )
        self.tasks.append(task)
    def load_workflow_state(self, filepath: str = None, **kwargs) -> None:
        """
        Loads the workflow state from a json file and restores the workflow state.