# gptv.py
import base64
import inspect
import os
import logging
import io
from dotenv import load_dotenv
from langchain.chains.transform import TransformChain
from langchain_core.messages import HumanMessage
from langchain_core.output_parsers import PydanticOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import chain
from langchain_openai import AzureChatOpenAI
from pydantic import BaseModel, Field
load_dotenv('.env')
# Requires the following environment variables (typically loaded from .env):
AZURE_OPENAI_API_DEPLOYMENT = os.getenv("AZURE_OPENAI_API_DEPLOYMENT")
AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
OPENAI_API_VERSION = os.getenv("OPENAI_API_VERSION")
DOC_INTEL_ENDPOINT = os.getenv("DOC_INTEL_ENDPOINT")
DOC_INTEL_KEY = os.getenv("DOC_INTEL_KEY")
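# A minimal .env sketch for reference; all values below are placeholders, not
# real credentials, and the API version shown is only an example:
#
#   AZURE_OPENAI_API_DEPLOYMENT=<your-vision-capable-deployment>
#   AZURE_OPENAI_API_KEY=<your-azure-openai-key>
#   AZURE_OPENAI_ENDPOINT=https://<your-resource>.openai.azure.com/
#   OPENAI_API_VERSION=2024-02-15-preview
#   DOC_INTEL_ENDPOINT=https://<your-resource>.cognitiveservices.azure.com/
#   DOC_INTEL_KEY=<your-document-intelligence-key>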
# The deployment must point at a vision-capable model (e.g. GPT-4 Turbo with Vision or GPT-4o).
llm = AzureChatOpenAI(
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    azure_deployment=AZURE_OPENAI_API_DEPLOYMENT,
    openai_api_version=OPENAI_API_VERSION,
    openai_api_key=AZURE_OPENAI_API_KEY,
    temperature=0,
    max_tokens=1000,
    verbose=True)
def load_image(inputs: dict) -> dict:
    """Load image from file and encode it as base64."""
    image_path = inputs["image_path"]

    def encode_image(image_path):
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode('utf-8')

    image_base64 = encode_image(image_path)
    return {"image": image_base64}
# A chain link used later: loads an image from a file and base64-encodes it with the function above.
load_image_chain = TransformChain(
    input_variables=["image_path"],
    output_variables=["image"],
    transform=load_image
)
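# A quick sketch of what this link produces on its own (illustrative only;
# assumes a local file named "image1.jpeg", matching the example at the bottom):
#
#   encoded = load_image_chain.invoke({"image_path": "image1.jpeg"})
#   assert "image" in encoded  # {"image": "<base64 string>", "image_path": ...}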
# Pydantic schema for the image information. The field descriptions matter: they are passed to the model via the format instructions.
class ImageInformation(BaseModel):
    """Information about a product image."""
    brand: str = Field(default="n/a", description="Name of the brand, or n/a")
    product_name: str = Field(default="n/a", description="Name of the product, or n/a")
    price: str = Field(default="n/a", description="Price of the product, or n/a")
    price_per_unit: str = Field(default="n/a", description="Price of the product per unit, or n/a")
    expiration_date: str = Field(default="n/a", description="Product's expiration/best-before date, or n/a")
    article_number: str = Field(default="n/a", description="Article number of the product, or n/a")
    bar_code_available: bool = Field(default=False, description="Is there a bar-code in the image?")
    bar_code_numbers: str = Field(default="n/a", description="Numbers of the bar-code, or n/a")
parser = PydanticOutputParser(pydantic_object=ImageInformation)
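# A sketch of the parser's two roles (illustrative only): it generates format
# instructions that are injected into the prompt, and it validates the model's
# raw reply back into an ImageInformation instance, with defaults for missing fields:
#
#   print(parser.get_format_instructions())  # JSON-schema text for the prompt
#   info = parser.parse('{"brand": "Acme", "bar_code_available": true}')
#   assert info.brand == "Acme" and info.price == "n/a"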
# Chain link that invokes the model with the image and prompt.
@chain
def gpt_vision(inputs: dict) -> str | list[str] | dict:
    """Invoke model with image and prompt."""
    model = llm
    msg = model.invoke(
        [HumanMessage(
            content=[
                {"type": "text", "text": inputs["prompt"]},
                {"type": "text", "text": parser.get_format_instructions()},
                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{inputs['image']}"}},
            ])]
    )
    return msg.content
def analyze_with_gpt(image_path: str, doc_int_results: str, doc_int_barcode: str) -> ImageInformation:
    vision_prompt = inspect.cleandoc("""
        Given the image, provide the following information:
        - Brand
        - Product Name
        - Price
        - Price per unit
        - Expiration Date (or best-before)
        - Article Number (if available, it will be multiple numbers separated by a period)
        - Is there a bar-code available?
        - Bar-code numbers (if available)
        We also found the following text with another OCR tool; you can use it to enhance the results:
        {doc_int_results}
        The other tool also found the following bar-code: {doc_int_barcode}
        Use it if you can.
        """)
    vision_chain = load_image_chain | gpt_vision | parser
    return vision_chain.invoke({'image_path': image_path,
                                'prompt': vision_prompt.format(doc_int_results=doc_int_results,
                                                               doc_int_barcode=doc_int_barcode)})
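# Illustrative call (the OCR text and barcode arguments below are hypothetical
# placeholders; see process_image at the bottom for the full end-to-end flow):
#
#   info = analyze_with_gpt("image1.jpeg", "<ocr text>", "<barcode digits>")
#   print(info.brand, info.price, info.bar_code_numbers)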
#############################################
# Setup Document intelligence for barcodes #
#############################################
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeResult, DocumentAnalysisFeature
from azure.core.credentials import AzureKeyCredential
kwargs = {"api_version": "2023-10-31-preview"}
client = DocumentIntelligenceClient(endpoint=DOC_INTEL_ENDPOINT,
                                    credential=AzureKeyCredential(DOC_INTEL_KEY),
                                    **kwargs)
def get_doc_int_results(file_path: str) -> AnalyzeResult:
    with open(file_path, "rb") as f:
        poller = client.begin_analyze_document("prebuilt-layout",
                                               analyze_request=f,
                                               locale="en-US",
                                               content_type="application/octet-stream",
                                               features=[DocumentAnalysisFeature.BARCODES],
                                               output_content_format="markdown")
    return poller.result()
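# A small sketch factoring out the barcode lookup used twice below; the guard
# mirrors the inline expression and only adds a check for an empty pages list:
def first_barcode_or_na(result: AnalyzeResult) -> str:
    """Return the first detected barcode value, or "n/a" if none was found."""
    if result.pages and result.pages[0].barcodes:
        return result.pages[0].barcodes[0].value
    return "n/a"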
"""
if __name__ == '__main__':
img1 = "image1.jpeg"
doci1 = get_doc_int_results(img1)
barcode = doci1.pages[0].barcodes[0].value if doci1.pages[0].barcodes else "n/a"
oimg1 = analyze_with_gpt(img1, doci1.content, barcode)
return(oimg1)
"""
def process_image(img_path):
    """Process an image and return the structured ImageInformation result."""
    doci1 = get_doc_int_results(img_path)
    barcode = doci1.pages[0].barcodes[0].value if doci1.pages[0].barcodes else "n/a"
    oimg1 = analyze_with_gpt(img_path, doci1.content, barcode)
    return oimg1
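# Example entry point, mirroring the commented-out block above (assumes an
# "image1.jpeg" file is present next to this script):
if __name__ == '__main__':
    print(process_image("image1.jpeg"))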