-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathocrNconvert.py
152 lines (122 loc) · 8.37 KB
/
ocrNconvert.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
import re, datetime, os, io, time
from pyperclip import copy as cp
from variousRegexChecks import *
import creds
def detectTextUsingAWSRekognition(path, imageURL):
    """
    Detect text in a local JPEG using AWS Rekognition.

    Input:
        path: image path WITHOUT the extension; the file "{path}.jpg" is read (str)
        imageURL: URL the image came from; used as the key in the usage ledger (str)
    Output: detected text, one detected LINE per row (str). Returns "" when the
        monthly usage cap is reached, when this URL was already sent to AWS,
        when too many words are detected, or when no text is found.

    Usage ledger ("nuOfTimesOCRused.txt" in the current working directory):
        line 1 = GCV usage count, line 2 = AWS Rekog usage count,
        every following line = "<imageURL> aws" or "<imageURL> gcv".

    LIMITATION: Rekog reports at most 100 words per image, so an image that
    hits (nearly) that many words is skipped rather than converted from
    possibly incomplete data.
    """
    ledgerFile = "nuOfTimesOCRused.txt"
    with open(ledgerFile) as f:
        ledgerLines = f.read().split("\n")

    # AWS Rekog usage is on the 2nd line, 1st line is GCV usage.
    awsUsageCount = int(ledgerLines[1])
    if awsUsageCount > 4800:  # stay safely below the free monthly quota
        print("AWS Rekog usage count reached 4800, suspending further processes until manual humanity override.")
        return ""

    # Compare against whole ledger lines, not the raw file text: a substring
    # search would wrongly flag a URL that is merely the tail of another
    # recorded URL (or an empty URL) as "already processed".
    ledgerEntry = f"{imageURL} aws"  # AWS/GCV operations are recorded separately
    if ledgerEntry in ledgerLines:
        print("AWS Rekog being re-used for same image, check ID above & sort it out.")
        return ""

    # Imported only once a request is really going to be made, so the guard
    # paths above do not depend on the AWS SDK.
    import boto3

    client = boto3.client(
        'rekognition',
        region_name=creds.region_name,
        aws_access_key_id=creds.aws_access_key_id,
        aws_secret_access_key=creds.aws_secret_access_key,
    )

    # Record the URL & usage count BEFORE making the call: a failing call halts
    # the script, and without the record the same URL would be retried forever
    # while the usage would go uncounted.
    ledgerLines[1] = str(awsUsageCount + 1)
    ledgerLines.append(ledgerEntry)
    with open(ledgerFile, "w") as f:
        f.write("\n".join(ledgerLines))

    with open(f"{path}.jpg", 'rb') as image:
        response = client.detect_text(Image={'Bytes': image.read()})

    # Results come back as "LINE" and "WORD" entries. WORDs repeat what the
    # LINEs already contain, so they are only counted, never collected.
    detections = response['TextDetections']
    wordCount = sum(1 for detection in detections if detection['Type'] == "WORD")
    if wordCount >= 99:  # at the service's word cap there may be more text that was dropped
        print(">= 99 words detected. Skipping this image.")
        return ""

    # New line is the right delimiter since whole LINES are being collected.
    extractedText = "".join(
        detection['DetectedText'] + "\n"
        for detection in detections
        if detection['Type'] == "LINE"
    )
    if extractedText == "":
        print(f'No text detected in {imageURL}')
    else:
        print("AWS Rekog Output:\n", extractedText)
    return extractedText
def detectTextUsingGoogleCloudVision(path, imageURL):
    """
    Detect text in a local JPEG using Google Cloud Vision.

    Input:
        path: image path WITHOUT the extension; the file "{path}.jpg" is read (str)
        imageURL: URL the image came from; used as the key in the usage ledger (str)
    Output: text found in the image (str).
        Returns the sentinel "&*2^$9 GCV Limit reached @0%*3" when the GCV
        usage count is above 980, so the caller can fall back to AWS Rekog.
        Returns "" when this URL was already sent to GCV, when GCV reports an
        error for the image (the error is printed and the image skipped), or
        when no text is found.

    Usage ledger ("nuOfTimesOCRused.txt" in the current working directory):
        line 1 = GCV usage count, line 2 = AWS Rekog usage count,
        every following line = "<imageURL> aws" or "<imageURL> gcv".
    """
    import os

    ledgerFile = "nuOfTimesOCRused.txt"
    with open(ledgerFile) as f:
        ledgerLines = f.read().split("\n")

    # GCV usage is on the 1st line of the ledger.
    gcvUsageCount = int(ledgerLines[0])
    if gcvUsageCount > 980:  # stay safely below the free monthly quota
        return "&*2^$9 GCV Limit reached @0%*3"

    # Compare against whole ledger lines, not the raw file text: a substring
    # search would wrongly flag a URL that is merely the tail of another
    # recorded URL (or an empty URL) as "already processed".
    ledgerEntry = f"{imageURL} gcv"  # AWS/GCV operations are recorded separately
    if ledgerEntry in ledgerLines:
        print("GCV being re-used for same image, check ID above & sort it out.")
        return ""

    # Imported only once a request is really going to be made, so the guard
    # paths above do not depend on the Google client library.
    from google.cloud import vision

    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = r"credsgcv.json"
    client = vision.ImageAnnotatorClient()

    with open(f"{path}.jpg", 'rb') as image_file:
        image = vision.Image(content=image_file.read())

    # Record the URL & usage count BEFORE making the call: a failing call halts
    # the script, and without the record the same URL would be retried forever
    # while the usage would go uncounted.
    ledgerLines[0] = str(gcvUsageCount + 1)
    ledgerLines.append(ledgerEntry)
    with open(ledgerFile, "w") as f:
        f.write("\n".join(ledgerLines))

    response = client.text_detection(image=image)
    texts = response.text_annotations

    if response.error.message:
        print(f"""GCV threw this error "{response.error.message}" \nBad image URL is this: {imageURL} \nSkipping this image.""")
        return ""

    # texts is empty when there is no text in the image; the first annotation
    # carries the full detected text.
    if texts:
        print("GCV output: ")
        print(texts[0].description)
        return texts[0].description  # returns a string

    print(f'No text detected in {imageURL}')
    return ""
def ocrTheImage(path, imageURL):
    """
    OCR a single downloaded image and build the comment text for it.

    Input:
        path: image name/path without extension; "{path}.jpg" is the file on disk (str)
        imageURL: URL of the image, handed on to the OCR functions (str)
    Output: comment for this single processed image (str), or empty str if nothing found
    """
    gcvLimitSentinel = "&*2^$9 GCV Limit reached @0%*3"

    ocrText = detectTextUsingGoogleCloudVision(path, imageURL)
    if ocrText == gcvLimitSentinel:
        # GCV quota is used up, so fall back to AWS Rekog, which only accepts
        # images up to 5 MB. Anything bigger is skipped so the caller moves on
        # to the next image in the post / the next post.
        if os.path.getsize(f"{path}.jpg") > 5_000_000:
            print("File size greater than 5 MB, AWS Rekog cannot accept it as it has a limit of 5 MB only. Continuing to next image/post.")
            return ""
        ocrText = detectTextUsingAWSRekognition(path, imageURL)

    # The splits/pace-list layout is checked first because it causes false
    # positives in the later checks if it is not handled beforehand.
    splitsComment = checkStravaOrGarminSplitsPaceList(ocrText)
    if splitsComment != "":
        return splitsComment

    # Run the remaining checks in a fixed order and glue their results together.
    commentParts = [
        checkStrava1stVariation(ocrText),
        checkGarmin1stVariation(ocrText),
        checkNike1stVariation(ocrText),
        checkForMPHorKPH(ocrText),
        checkNike4thVariation(ocrText),
        checkGarmin4thVariation(ocrText),
    ]
    return "".join(commentParts)