From 92e9191e4f5aed977fd8d867bab2251e1a05deb9 Mon Sep 17 00:00:00 2001 From: Kye Date: Fri, 20 Oct 2023 03:47:46 -0400 Subject: [PATCH] layout llm Former-commit-id: 06a164ac5d87b398ffa78ad69700ba0bbab21ada --- docs/swarms/models/kosmos.md | 2 +- docs/swarms/models/layoutlm_document_qa.md | 88 ++++++++++++++++++++++ docs/swarms/models/nougat.md | 2 +- docs/swarms/models/openai_chat.md | 2 +- swarms/models/__init__.py | 2 + swarms/models/layoutlm_document_qa.py | 37 +++++++++ 6 files changed, 130 insertions(+), 3 deletions(-) create mode 100644 docs/swarms/models/layoutlm_document_qa.md create mode 100644 swarms/models/layoutlm_document_qa.py diff --git a/docs/swarms/models/kosmos.md b/docs/swarms/models/kosmos.md index 81e3ffd2..1735e153 100644 --- a/docs/swarms/models/kosmos.md +++ b/docs/swarms/models/kosmos.md @@ -1,4 +1,4 @@ -# Kosmos Documentation +# `Kosmos` Documentation ## Introduction diff --git a/docs/swarms/models/layoutlm_document_qa.md b/docs/swarms/models/layoutlm_document_qa.md new file mode 100644 index 00000000..4c6169d0 --- /dev/null +++ b/docs/swarms/models/layoutlm_document_qa.md @@ -0,0 +1,88 @@ +# `LayoutLMDocumentQA` Documentation + +## Introduction + +Welcome to the documentation for LayoutLMDocumentQA, a multimodal model designed for visual question answering (QA) on real-world documents, such as invoices, PDFs, and more. This comprehensive documentation will provide you with a deep understanding of the LayoutLMDocumentQA class, its architecture, usage, and examples. + +## Overview + +LayoutLMDocumentQA is a versatile model that combines layout-based understanding of documents with natural language processing to answer questions about the content of documents. It is particularly useful for automating tasks like invoice processing, extracting information from PDFs, and handling various document-based QA scenarios. 
+ +## Class Definition + +```python +class LayoutLMDocumentQA(AbstractModel): + def __init__( + self, + model_name: str = "impira/layoutlm-document-qa", + task: str = "document-question-answering", + ): +``` + +## Purpose + +The LayoutLMDocumentQA class serves the following primary purposes: + +1. **Document QA**: LayoutLMDocumentQA is specifically designed for document-based question answering. It can process both the textual content and the layout of a document to answer questions. + +2. **Multimodal Understanding**: It combines natural language understanding with document layout analysis, making it suitable for documents with complex structures. + +## Parameters + +- `model_name` (str): The name or path of the pretrained LayoutLMDocumentQA model. Default: "impira/layoutlm-document-qa". +- `task` (str): The specific task for which the model will be used. Default: "document-question-answering". + +## Usage + +To use LayoutLMDocumentQA, follow these steps: + +1. Initialize the LayoutLMDocumentQA instance: + +```python +from swarms.models import LayoutLMDocumentQA + +layout_lm_doc_qa = LayoutLMDocumentQA() +``` + +### Example 1 - Initialization + +```python +layout_lm_doc_qa = LayoutLMDocumentQA() +``` + +2. Ask a question about a document and provide the document's image path: + +```python +question = "What is the total amount?" +image_path = "path/to/document_image.png" +answer = layout_lm_doc_qa(question, image_path) +``` + +### Example 2 - Document QA + +```python +layout_lm_doc_qa = LayoutLMDocumentQA() +question = "What is the total amount?" +image_path = "path/to/document_image.png" +answer = layout_lm_doc_qa(question, image_path) +``` + +## How LayoutLMDocumentQA Works + +LayoutLMDocumentQA employs a multimodal approach to document QA. Here's how it works: + +1. **Initialization**: When you create a LayoutLMDocumentQA instance, you can specify the model to use and the task, which is "document-question-answering" by default. + +2. 
**Question and Document**: You provide a question about the document and the image path of the document to the LayoutLMDocumentQA instance. + +3. **Multimodal Processing**: LayoutLMDocumentQA processes both the question and the document image. It combines layout-based analysis with natural language understanding. + +4. **Answer Generation**: The model generates an answer to the question based on its analysis of the document layout and content. + +## Additional Information + +- LayoutLMDocumentQA uses the "impira/layoutlm-document-qa" pretrained model by default, which is specifically designed for document-based question answering. +- You can adapt this model to various document QA scenarios by changing the `model_name` or `task` arguments and providing relevant questions and documents. +- This model is particularly useful for automating document-based tasks and extracting valuable information from structured documents. + +That concludes the documentation for LayoutLMDocumentQA. We hope you find this tool valuable for your document-based question answering needs. If you have any questions or encounter any issues, please refer to the Hugging Face Transformers documentation for the document-question-answering pipeline for further assistance. Enjoy using LayoutLMDocumentQA! 
"""
LayoutLMDocumentQA is a multimodal model for visual question answering
on real-world documents such as invoices, PDFs, etc.
"""
from transformers import pipeline
from swarms.models.base import AbstractModel


class LayoutLMDocumentQA(AbstractModel):
    """
    LayoutLMDocumentQA for document question answering.

    Builds a ``transformers`` pipeline once at construction time and
    forwards each call (document image path + question) to it.

    Args:
        model_name (str, optional): Model identifier handed to
            ``transformers.pipeline`` as ``model``.
            Defaults to "impira/layoutlm-document-qa".
        task (str, optional): Pipeline task name handed to
            ``transformers.pipeline`` as its first argument.
            Defaults to "document-question-answering".

    Attributes:
        model_name: The ``model_name`` value given to the constructor.
        task: The ``task`` value given to the constructor.
        pipeline: The pipeline object returned by ``transformers.pipeline``.

    Usage:
    >>> from swarms.models import LayoutLMDocumentQA
    >>> model = LayoutLMDocumentQA()
    >>> out = model("What is the total amount?", "path/to/img.png")
    >>> print(out)

    """

    def __init__(
        self,
        model_name: str = "impira/layoutlm-document-qa",
        task: str = "document-question-answering",
    ):
        # NOTE(review): AbstractModel.__init__ is not invoked here (it was not
        # invoked before either); confirm the base class needs no set-up and
        # declares no abstract methods this class would have to implement.

        # Both attributes must exist before the pipeline is built, because the
        # pipeline call below reads them from the instance.
        self.model_name = model_name
        self.task = task
        self.pipeline = pipeline(
            self.task,
            model=self.model_name,
        )

    def __call__(self, task: str, img_path: str) -> str:
        """Answer a question about a document image.

        Args:
            task (str): The question to ask about the document.
            img_path (str): Path to the document image.

        Returns:
            str: The pipeline's output converted with ``str()``.
        """
        # The pipeline is called with the image first and the question second.
        out = self.pipeline(img_path, task)
        return str(out)