From c7f79f037a37691aca73d91b3983201478b90c3c Mon Sep 17 00:00:00 2001 From: Kye Date: Fri, 20 Oct 2023 03:32:06 -0400 Subject: [PATCH] nougat Former-commit-id: 40b4c9efd943cf2d56f7674462350e1cf3b1d850 --- README.md | 2 +- docs/swarms/models/nougat.md | 118 +++++++++++++++++++++++++++++++++++ mkdocs.yml | 1 + pyproject.toml | 3 + requirements.txt | 3 + swarms/__init__.py | 8 +-- swarms/models/__init__.py | 21 ++++++- swarms/models/anthropic.py | 10 +-- swarms/models/base.py | 5 +- swarms/models/bing_chat.py | 18 ++++-- swarms/models/idefics.py | 6 +- swarms/models/kosmos_two.py | 2 +- swarms/models/nougat.py | 68 ++++++++++++++++++++ swarms/models/phi.py | 1 + swarms/models/trocr.py | 19 ++++++ swarms/models/zephyr.py | 19 +++--- swarms/tools/edge_gpt.py | 10 ++- 17 files changed, 284 insertions(+), 30 deletions(-) create mode 100644 docs/swarms/models/nougat.md create mode 100644 swarms/models/nougat.py create mode 100644 swarms/models/phi.py create mode 100644 swarms/models/trocr.py diff --git a/README.md b/README.md index 9b7eeccf..8b1fc68a 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,7 @@ At Swarms, we're transforming the landscape of AI from siloed AI agents to a uni ----- # 🤝 Schedule a 1-on-1 Session -Book a [1-on-1 Session with Kye](https://calendly.com/apacai/agora), the Creator, to discuss any issues, provide feedback, or explore how we can improve Swarms for you. +Book a [1-on-1 Session with Kye](https://calendly.com/swarm-corp/30min), the Creator, to discuss any issues, provide feedback, or explore how we can improve Swarms for you. ---------- diff --git a/docs/swarms/models/nougat.md b/docs/swarms/models/nougat.md new file mode 100644 index 00000000..88945b5b --- /dev/null +++ b/docs/swarms/models/nougat.md @@ -0,0 +1,118 @@ +# Nougat Documentation + +## Introduction + +Welcome to the documentation for Nougat, a versatile model designed by Meta for transcribing scientific PDFs into user-friendly Markdown format, extracting information from PDFs, and extracting metadata from PDF documents. This documentation will provide you with a deep understanding of the Nougat class, its architecture, usage, and examples. + +## Overview + +Nougat is a powerful tool that combines language modeling and image processing capabilities to convert scientific PDF documents into Markdown format. It is particularly useful for researchers, students, and professionals who need to extract valuable information from PDFs quickly. With Nougat, you can simplify complex PDFs, making their content more accessible and easy to work with. + +## Class Definition + +```python +class Nougat: + def __init__( + self, + model_name_or_path="facebook/nougat-base", + min_length: int = 1, + max_new_tokens: int = 30, + ): +``` + +## Purpose + +The Nougat class serves the following primary purposes: + +1. **PDF Transcription**: Nougat is designed to transcribe scientific PDFs into Markdown format. It helps convert complex PDF documents into a more readable and structured format, making it easier to extract information. + +2. **Information Extraction**: It allows users to extract valuable information and content from PDFs efficiently. This can be particularly useful for researchers and professionals who need to extract data, figures, or text from scientific papers. + +3. **Metadata Extraction**: Nougat can also extract metadata from PDF documents, providing essential details about the document, such as title, author, and publication date. + +## Parameters + +- `model_name_or_path` (str): The name or path of the pretrained Nougat model. Default: "facebook/nougat-base". +- `min_length` (int): The minimum length of the generated transcription. Default: 1. +- `max_new_tokens` (int): The maximum number of new tokens to generate in the Markdown transcription. Default: 30. + +## Usage + +To use Nougat, follow these steps: + +1. Initialize the Nougat instance: + +```python +from swarms.models import Nougat + +nougat = Nougat() +``` + +### Example 1 - Initialization + +```python +nougat = Nougat() +``` + +2. Transcribe a PDF image using Nougat: + +```python +markdown_transcription = nougat("path/to/pdf_file.png") +``` + +### Example 2 - PDF Transcription + +```python +nougat = Nougat() +markdown_transcription = nougat("path/to/pdf_file.png") +``` + +3. Extract information from a PDF: + +```python +information = nougat.extract_information("path/to/pdf_file.png") +``` + +### Example 3 - Information Extraction + +```python +nougat = Nougat() +information = nougat.extract_information("path/to/pdf_file.png") +``` + +4. Extract metadata from a PDF: + +```python +metadata = nougat.extract_metadata("path/to/pdf_file.png") +``` + +### Example 4 - Metadata Extraction + +```python +nougat = Nougat() +metadata = nougat.extract_metadata("path/to/pdf_file.png") +``` + +## How Nougat Works + +Nougat employs a vision encoder-decoder model, along with a dedicated processor, to transcribe PDFs into Markdown format and perform information and metadata extraction. Here's how it works: + +1. **Initialization**: When you create a Nougat instance, you can specify the model to use, the minimum transcription length, and the maximum number of new tokens to generate. + +2. **Processing PDFs**: Nougat can process PDFs as input. You can provide the path to a PDF document. + +3. **Image Processing**: The processor converts PDF pages into images, which are then encoded by the model. + +4. **Transcription**: Nougat generates Markdown transcriptions of PDF content, ensuring a minimum length and respecting the token limit. + +5. **Information Extraction**: Information extraction involves parsing the Markdown transcription to identify key details or content of interest. + +6. **Metadata Extraction**: Metadata extraction involves identifying and extracting document metadata, such as title, author, and publication date. + +## Additional Information + +- Nougat leverages the "facebook/nougat-base" pretrained model, which is specifically designed for document transcription and extraction tasks. +- You can adjust the minimum transcription length and the maximum number of new tokens to control the output's length and quality. +- Nougat can be run on both CPU and GPU devices. + +That concludes the documentation for Nougat. We hope you find this tool valuable for your PDF transcription, information extraction, and metadata extraction needs. If you have any questions or encounter any issues, please refer to the Nougat documentation for further assistance. Enjoy using Nougat! \ No newline at end of file diff --git a/mkdocs.yml b/mkdocs.yml index 3446f2c0..24dda79d 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -100,6 +100,7 @@ nav: - Idefics: "swarms/models/idefics.md" - BingChat: "swarms/models/bingchat.md" - Kosmos: "swarms/models/kosmos.md" + - Nougat: "swarms/models/nougat.md" - swarms.structs: - Overview: "swarms/structs/overview.md" - Workflow: "swarms/structs/workflow.md" diff --git a/pyproject.toml b/pyproject.toml index 96a337fd..a17c6fa0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,12 +36,15 @@ langchain-experimental = "*" playwright = "*" duckduckgo-search = "*" faiss-cpu = "*" +datasets = "*" diffusers = "*" +sentencepiece = "*" wget = "*" griptape = "*" httpx = "*" ggl = "*" beautifulsoup4 = "*" +huggingface-hub = "*" pydantic = "*" tenacity = "*" redis = "*" diff --git a/requirements.txt b/requirements.txt index c92b868c..b2c28df0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,7 +18,10 @@ google-search-results==2.4.2 Pillow faiss-cpu openai +datasets +huggingface-hub google-generativeai +sentencepiece duckduckgo-search agent-protocol chromadb diff --git a/swarms/__init__.py b/swarms/__init__.py index e1dba262..a93fed25 100644 --- a/swarms/__init__.py +++ b/swarms/__init__.py @@ -8,14 +8,14 @@ warnings.filterwarnings("ignore", category=UserWarning) os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2" - - from swarms import workers from swarms.workers.worker import Worker + # from swarms import chunkers -from swarms import models +from swarms.models import * # import * only works when __all__ = [] is defined in __init__.py from swarms import structs from swarms import swarms from swarms import agents from swarms.logo import logo -print(logo) \ No newline at end of file + +print(logo) diff --git a/swarms/models/__init__.py b/swarms/models/__init__.py index fe66dee8..e9aba679 100644 --- a/swarms/models/__init__.py +++ b/swarms/models/__init__.py @@ -10,4 +10,23 @@ from swarms.models.zephyr import Zephyr from swarms.models.idefics import Idefics from swarms.models.kosmos_two import Kosmos from swarms.models.vilt import Vilt -# from swarms.models.fuyu import Fuyu \ No newline at end of file +from swarms.models.nougat import Nougat +# from swarms.models.fuyu import Fuyu # Not working, wait until they update + + +__all__ = [ + "Anthropic", + "Petals", + "Mistral", + "OpenAI", + "AzureOpenAI", + "OpenAIChat", + "Zephyr", + "Idefics", + "Kosmos", + "Vilt", + "Nougat", +] + + + diff --git a/swarms/models/anthropic.py b/swarms/models/anthropic.py index 453890b9..232ff647 100644 --- a/swarms/models/anthropic.py +++ b/swarms/models/anthropic.py @@ -4,13 +4,13 @@ import os class Anthropic: """ - + Anthropic large language models. - - + + Args: - - + + """ def __init__( diff --git a/swarms/models/base.py b/swarms/models/base.py index 57045165..d2b01695 100644 --- a/swarms/models/base.py +++ b/swarms/models/base.py @@ -1,14 +1,17 @@ import time from abc import ABC, abstractmethod + def count_tokens(text: str) -> int: return len(text.split()) + class AbstractModel(ABC): """ AbstractModel """ + # abstract base class for language models def __init__(self): self.start_time = None @@ -41,7 +44,7 @@ class AbstractModel(ABC): if elapsed_time == 0: return float("inf") return self._num_tokens() / elapsed_time - + def _num_tokens(self, text: str) -> int: """Number of tokens""" return count_tokens(text) diff --git a/swarms/models/bing_chat.py b/swarms/models/bing_chat.py index c91690e5..1d2eb503 100644 --- a/swarms/models/bing_chat.py +++ b/swarms/models/bing_chat.py @@ -29,14 +29,22 @@ class BingChat: self.cookies = json.loads(open(cookies_path, encoding="utf-8").read()) self.bot = asyncio.run(Chatbot.create(cookies=self.cookies)) - def __call__(self, prompt: str, style: ConversationStyle = ConversationStyle.creative) -> str: + def __call__( + self, prompt: str, style: ConversationStyle = ConversationStyle.creative + ) -> str: """ Get a text response using the EdgeGPT model based on the provided prompt. """ - response = asyncio.run(self.bot.ask(prompt=prompt, conversation_style=style, simplify_response=True)) - return response['text'] + response = asyncio.run( + self.bot.ask( + prompt=prompt, conversation_style=style, simplify_response=True + ) + ) + return response["text"] - def create_img(self, prompt: str, output_dir: str = "./output", auth_cookie: str = None) -> str: + def create_img( + self, prompt: str, output_dir: str = "./output", auth_cookie: str = None + ) -> str: """ Generate an image based on the provided prompt and save it in the given output directory. Returns the path of the generated image. @@ -48,7 +56,7 @@ class BingChat: images = image_generator.get_images(prompt) image_generator.save_images(images, output_dir=output_dir) - return Path(output_dir) / images[0]['path'] + return Path(output_dir) / images[0]["path"] @staticmethod def set_cookie_dir_path(path: str): diff --git a/swarms/models/idefics.py b/swarms/models/idefics.py index 747def16..3e8227c2 100644 --- a/swarms/models/idefics.py +++ b/swarms/models/idefics.py @@ -87,7 +87,7 @@ class Idefics: prompts : list A list of prompts. Each prompt is a list of text strings and images. batched_mode : bool, optional - Whether to process the prompts in batched mode. If True, all prompts are + Whether to process the prompts in batched mode. If True, all prompts are processed together. If False, only the first prompt is processed (default is True). Returns @@ -131,8 +131,8 @@ class Idefics: prompts : list A list of prompts. Each prompt is a list of text strings and images. batched_mode : bool, optional - Whether to process the prompts in batched mode. - If True, all prompts are processed together. + Whether to process the prompts in batched mode. + If True, all prompts are processed together. If False, only the first prompt is processed (default is True). Returns diff --git a/swarms/models/kosmos_two.py b/swarms/models/kosmos_two.py index 91118c77..b36affcb 100644 --- a/swarms/models/kosmos_two.py +++ b/swarms/models/kosmos_two.py @@ -20,7 +20,7 @@ class Kosmos: """ Args: - + # Initialize Kosmos diff --git a/swarms/models/nougat.py b/swarms/models/nougat.py new file mode 100644 index 00000000..4bd99f1a --- /dev/null +++ b/swarms/models/nougat.py @@ -0,0 +1,68 @@ +""" +Nougat by Meta + +Good for: +- transcribe Scientific PDFs into an easy to use markdown +format +- Extracting information from PDFs +- Extracting metadata from pdfs + +""" + +import torch +from PIL import Image +from transformers import NougatProcessor, VisionEncoderDecoderModel + + +class Nougat: + """ + Nougat + + ArgsS: + model_name_or_path: str, default="facebook/nougat-base" + min_length: int, default=1 + max_new_tokens: int, default=30 + + Usage: + >>> from swarms.models.nougat import Nougat + >>> nougat = Nougat() + >>> nougat("path/to/image.png") + + + """ + def __init__( + self, + model_name_or_path="facebook/nougat-base", + min_length: int = 1, + max_new_tokens: int = 30, + ): + self.model_name_or_path = model_name_or_path + self.min_length = min_length + self.max_new_tokens = max_new_tokens + + self.processor = NougatProcessor.from_pretrained(self.model_name_or_path) + self.model = VisionEncoderDecoderModel.from_pretrained(self.model_name_or_path) + self.device = "cuda" if torch.cuda.is_available() else "cpu" + self.model.to(self.device) + + def get_image(self, img_path: str): + """Get an image from a path""" + image = Image.open(img_path) + return image + + def __call__(self, img_path: str): + """Call the model with an image_path str as an input""" + image = Image.open(img_path) + pixel_values = self.processor(image, return_tensors="pt").pixel_values + + # Generate transcriptions, here we only generate 30 tokens + outputs = self.model.generate( + pixel_values.to(self.device), + min_length=self.min_length, + max_new_tokens=self.max_new_tokens, + bad_words_ids=[[self.processor.unk_token - id]], + ) + + sequence = self.processor.batch_decode(outputs, skip_special_tokens=True)[0] + sequence = self.processor.post_process_generation(sequence, fix_markdown=False) + return sequence diff --git a/swarms/models/phi.py b/swarms/models/phi.py new file mode 100644 index 00000000..90fca08e --- /dev/null +++ b/swarms/models/phi.py @@ -0,0 +1 @@ +"""Phi by Microsoft written by Kye""" diff --git a/swarms/models/trocr.py b/swarms/models/trocr.py new file mode 100644 index 00000000..f4a4156d --- /dev/null +++ b/swarms/models/trocr.py @@ -0,0 +1,19 @@ +""" + +TROCR for Multi-Modal OCR tasks + + +""" +from transformers import TrOCRProcessor, VisionEncoderDecoderModel +from PIL import Image +import requests + + +class TrOCR: + def __init__( + self, + ): + pass + + def __call__(self): + pass diff --git a/swarms/models/zephyr.py b/swarms/models/zephyr.py index 8ee12ed9..582bc740 100644 --- a/swarms/models/zephyr.py +++ b/swarms/models/zephyr.py @@ -1,16 +1,15 @@ """Zephyr by HF""" -import torch +import torch from transformers import pipeline - class Zephyr: """ Zehpyr model from HF Args: - max_new_tokens(int) = Number of max new tokens + max_new_tokens(int) = Number of max new tokens temperature(float) = temperature of the LLM top_k(float) = top k of the model set to 50 top_p(float) = top_p of the model set to 0.95 @@ -23,6 +22,7 @@ class Zephyr: """ + def __init__( self, max_new_tokens: int = 300, @@ -40,18 +40,23 @@ class Zephyr: "text-generation", model="HuggingFaceH4/zephyr-7b-alpha", torch_dtype=torch.bfloa16, - device_map="auto" + device_map="auto", ) self.messages = [ { "role": "system", "content": "You are a friendly chatbot who always responds in the style of a pirate", }, - {"role": "user", "content": "How many helicopters can a human eat in one sitting?"}, + { + "role": "user", + "content": "How many helicopters can a human eat in one sitting?", + }, ] def __call__(self, text: str): """Call the model""" - prompt = self.pipe.tokenizer.apply_chat_template(self.messages, tokenize=False, add_generation_prompt=True) + prompt = self.pipe.tokenizer.apply_chat_template( + self.messages, tokenize=False, add_generation_prompt=True + ) outputs = self.pipe(prompt, max_new_token=self.max_new_tokens) - print(outputs[0])["generated_text"] \ No newline at end of file + print(outputs[0])["generated_text"] diff --git a/swarms/tools/edge_gpt.py b/swarms/tools/edge_gpt.py index bef44cfb..9e8eac42 100644 --- a/swarms/tools/edge_gpt.py +++ b/swarms/tools/edge_gpt.py @@ -1,9 +1,15 @@ from swarms.tools.tool import BaseTool + class EdgeGPTTool(BaseTool): - def __init__(self, model, name="EdgeGPTTool", description="Tool that uses EdgeGPTModel to generate responses"): + def __init__( + self, + model, + name="EdgeGPTTool", + description="Tool that uses EdgeGPTModel to generate responses", + ): super().__init__(name=name, description=description) self.model = model def _run(self, prompt): - return self.model.__call__(prompt) \ No newline at end of file + return self.model.__call__(prompt)