parent da120e1aef
commit 0c4dd88f98
@@ -0,0 +1,80 @@
import json
import os
from typing import List

import numpy as np
import timm
import torch
from PIL import Image
from pydantic import BaseModel, StrictFloat, StrictInt, validator

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the classes for image classification
with open(os.path.join(os.path.dirname(__file__), "fast_vit_classes.json")) as f:
    FASTVIT_IMAGENET_1K_CLASSES = json.load(f)


class ClassificationResult(BaseModel):
    class_id: List[StrictInt]
    confidence: List[StrictFloat]

    @validator("class_id", "confidence", pre=True, each_item=True)
    def check_list_contents(cls, v):
        assert isinstance(v, (int, float)), "must be integer or float"
        return v


class FastViT:
    """
    FastViT model for image classification.

    Args:
        img (str): path to the input image
        confidence_threshold (float): confidence threshold for the model's predictions

    Returns:
        ClassificationResult: a pydantic BaseModel containing the class ids
        and confidences of the model's predictions

    Example:
        >>> fastvit = FastViT()
        >>> result = fastvit(img="path_to_image.jpg", confidence_threshold=0.5)

    To use, create a JSON file named fast_vit_classes.json containing the
    ImageNet-1k class labels (a sketch for generating it appears at the end
    of this file).
    """

    def __init__(self):
        self.model = timm.create_model(
            "hf_hub:timm/fastvit_s12.apple_in1k", pretrained=True
        ).to(DEVICE)
        data_config = timm.data.resolve_model_data_config(self.model)
        self.transforms = timm.data.create_transform(**data_config, is_training=False)
        self.model.eval()

    def __call__(
        self, img: str, confidence_threshold: float = 0.5
    ) -> ClassificationResult:
        """Classifies the input image and returns the top k classes and their probabilities."""
        img = Image.open(img).convert("RGB")
        img_tensor = self.transforms(img).unsqueeze(0).to(DEVICE)
        with torch.no_grad():
            output = self.model(img_tensor)
            probabilities = torch.nn.functional.softmax(output, dim=1)

        # Get top k classes and their probabilities; k must be an integer, so
        # rank all classes and let the confidence threshold below do the filtering
        top_probs, top_classes = torch.topk(
            probabilities, k=len(FASTVIT_IMAGENET_1K_CLASSES)
        )

        # Filter by confidence threshold
        mask = top_probs > confidence_threshold
        top_probs, top_classes = top_probs[mask], top_classes[mask]

        # Convert to Python lists and map class indices to labels if needed
        top_probs = top_probs.cpu().numpy().tolist()
        top_classes = top_classes.cpu().numpy().tolist()
        # top_class_labels = [FASTVIT_IMAGENET_1K_CLASSES[i] for i in top_classes]  # Uncomment if class labels are needed

        return ClassificationResult(class_id=top_classes, confidence=top_probs)
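

# A minimal sketch (an assumption, not part of this diff) for producing the
# fast_vit_classes.json file required above. The PyTorch hub label list is one
# commonly used source of the 1000 ImageNet-1k class names; the URL below is
# this sketch's assumption, not something referenced by the original code:
#
#   import json
#   import urllib.request
#
#   URL = "https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt"
#   with urllib.request.urlopen(URL) as resp:
#       labels = resp.read().decode("utf-8").splitlines()
#   with open("fast_vit_classes.json", "w") as f:
#       json.dump(labels, f)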
@@ -0,0 +1,100 @@
from typing import List, Tuple

import numpy as np
from PIL import Image
from pydantic import BaseModel, root_validator, validator
from transformers import AutoModelForVision2Seq, AutoProcessor


# Assuming the Detections class represents the output of the model prediction
class Detections(BaseModel):
    xyxy: List[Tuple[float, float, float, float]]
    class_id: List[int]
    confidence: List[float]

    @root_validator
    def check_length(cls, values):
        assert (
            len(values.get("xyxy"))
            == len(values.get("class_id"))
            == len(values.get("confidence"))
        ), "All fields must have the same length."
        return values

    @validator("xyxy", "class_id", "confidence", pre=True, each_item=True)
    def check_not_empty(cls, v):
        if isinstance(v, list) and len(v) == 0:
            raise ValueError("List must not be empty")
        return v

    @classmethod
    def empty(cls):
        return cls(xyxy=[], class_id=[], confidence=[])
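
# For illustration (a sketch, not part of this diff): the root_validator above
# enforces that the three lists stay parallel, e.g.
#   Detections(xyxy=[(0, 0, 10, 10)], class_id=[3], confidence=[0.9])  # ok
#   Detections(xyxy=[(0, 0, 10, 10)], class_id=[], confidence=[])      # raises
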
class Kosmos2(BaseModel):
    model: AutoModelForVision2Seq
    processor: AutoProcessor

    class Config:
        # Hugging Face model/processor objects are not pydantic types, so
        # pydantic must be told to accept arbitrary types for these fields
        arbitrary_types_allowed = True

    @classmethod
    def initialize(cls):
        model = AutoModelForVision2Seq.from_pretrained(
            "ydshieh/kosmos-2-patch14-224", trust_remote_code=True
        )
        processor = AutoProcessor.from_pretrained(
            "ydshieh/kosmos-2-patch14-224", trust_remote_code=True
        )
        return cls(model=model, processor=processor)

    def __call__(self, img: str) -> Detections:
        image = Image.open(img)
        prompt = "<grounding>An image of"

        inputs = self.processor(text=prompt, images=image, return_tensors="pt")
        outputs = self.model.generate(**inputs, use_cache=True, max_new_tokens=64)

        generated_text = self.processor.batch_decode(outputs, skip_special_tokens=True)[
            0
        ]

        # The actual processing of generated_text to entities would go here.
        # For the purpose of this example, assume a mock function 'extract_entities' exists:
        entities = self.extract_entities(generated_text)

        # Convert entities to detections format
        detections = self.process_entities_to_detections(entities, image)
        return detections

    def extract_entities(
        self, text: str
    ) -> List[Tuple[str, Tuple[float, float, float, float]]]:
        # Placeholder function for entity extraction
        # This should be replaced with the actual method of extracting entities
        return []
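
        # A hedged sketch of what real extraction could look like (an
        # assumption, not taken from this diff): Kosmos-2 processors in recent
        # transformers releases expose post_process_generation(), which
        # returns the cleaned caption plus (phrase, char_span, [normalized
        # xyxy boxes]) entities:
        #
        #   _, entities = self.processor.post_process_generation(text)
        #   return [
        #       (phrase, box)
        #       for phrase, _span, boxes in entities
        #       for box in boxes
        #   ]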

    def process_entities_to_detections(
        self,
        entities: List[Tuple[str, Tuple[float, float, float, float]]],
        image: Image.Image,
    ) -> Detections:
        if not entities:
            return Detections.empty()

        class_ids = [0] * len(entities)  # Replace with actual class ID extraction logic
        # Entity boxes are normalized to [0, 1], so scale them to pixel coordinates
        xyxys = [
            (
                e[1][0] * image.width,
                e[1][1] * image.height,
                e[1][2] * image.width,
                e[1][3] * image.height,
            )
            for e in entities
        ]
        confidences = [1.0] * len(entities)  # Placeholder confidence

        return Detections(xyxy=xyxys, class_id=class_ids, confidence=confidences)


# Usage:
# kosmos2 = Kosmos2.initialize()
# detections = kosmos2(img="path_to_image.jpg")
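#
# A short follow-up sketch (hypothetical, extending the usage above) for
# reading the detections back out:
# for box, cid, conf in zip(
#     detections.xyxy, detections.class_id, detections.confidence
# ):
#     print(f"class={cid} conf={conf:.2f} box={box}")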