Image to text
Open an image and extract the text it contains.
"""Image to text
OpenCV (Open Source Computer Vision Library) is used to load images.
Tesseract is used for optical character recognition.
"""
import os
DIR = os.path.dirname(os.path.realpath(__file__))
import cv2, pytesseract
# Load the sample image that sits next to this script.
img_path = DIR + '/files/01.png'
img = cv2.imread(img_path)
# cv2.imread does not raise on a missing or undecodable file, it returns None;
# stop here with a clear message instead of failing later inside the OCR call.
if img is None:
    raise FileNotFoundError('Cannot read image: %s' % img_path)
# Run Tesseract OCR on the whole image and print the recognized text.
text = pytesseract.image_to_string(img)
print(text)
"""
I've also done a lot of testing since LiveJournal.
Once I started working with other people
especially. And once I realized that code I write
never fucking goes away and I'm going to be a
maintainer for life. I get comments about blog
posts that are almost 10 years old. “Hey, I found
this code. I found a bug,” and I'm suddenly
maintaining code.
"""
Highlighted
Converting to HSV makes color selection easier.
"""Image to text
Get only highlighted text
"""
import os
import cv2, pytesseract
import numpy as np
DIR = os.path.dirname(os.path.realpath(__file__))
def imread_highlighted(img, debug=False):
    """Return the inverted mask of the yellow (highlighted) pixels of *img*.

    The image is converted from BGR to HSV (which makes colour selection
    easier), pixels inside the yellow HSV range are selected with
    ``cv2.inRange`` and the resulting mask is inverted.

    Args:
        img: BGR image, as returned by ``cv2.imread``.
        debug: When true, display the colour image restricted to the mask
            ("img") and the inverted mask ("img2") and wait for a key press.
            Defaults to False, so existing callers are unaffected.

    Returns:
        The inverted mask produced by ``cv2.bitwise_not``.
    """
    # Convert BGR to HSV
    hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)

    # Range of yellow color in HSV
    lower = np.array([22, 93, 0])
    upper = np.array([45, 255, 255])

    # Mask selecting only the yellow pixels, then inverted for the OCR step
    mask = cv2.inRange(hsv, lower, upper)
    inverted = cv2.bitwise_not(mask)

    if debug:
        # The masked colour image is only needed for visual inspection, so it
        # is computed here rather than on every call.
        masked = cv2.bitwise_and(img, img, mask=mask)
        cv2.imshow("img", masked)
        cv2.imshow("img2", inverted)
        cv2.waitKey(0)
        cv2.destroyAllWindows()

    return inverted
# Load the sample image; cv2.imread returns None (it does not raise) when the
# file is missing or cannot be decoded.
img_path = DIR + '/files/01.png'
img = cv2.imread(img_path)
if img is None:
    raise FileNotFoundError('Cannot read image: %s' % img_path)
img2 = imread_highlighted(img)

# OCR the full image and the highlight mask separately.
text = pytesseract.image_to_string(img).strip()
highlighted = pytesseract.image_to_string(img2).strip()

# Wrap the highlighted passage in <i> tags. When nothing was recognized in the
# mask, keep the text as is: str.replace with an empty search string would
# insert the markup between every character.
if highlighted:
    replaced = text.replace(highlighted, '<i>%s</i>' % highlighted)
else:
    replaced = text

print('Text: \n' + text, '\n')
print('Highlighted: \n' + highlighted, '\n')
print('Replaced: \n' + replaced, '\n')
"""
Text:
I've also done a lot of testing since LiveJournal.
Once I started working with other people
especially. And once I realized that code I write
never fucking goes away and I'm going to be a
maintainer for life. I get comments about blog
posts that are almost 10 years old. “Hey, I found
this code. I found a bug,” and I'm suddenly
maintaining code.
Highlighted:
Once I started working with other people
especially. And once I realized that code I write
never fucking goes away and I'm going to be a
maintainer for life.
Replaced:
I've also done a lot of testing since LiveJournal.
<i>Once I started working with other people
especially. And once I realized that code I write
never fucking goes away and I'm going to be a
maintainer for life.</i> I get comments about blog
posts that are almost 10 years old. “Hey, I found
this code. I found a bug,” and I'm suddenly
maintaining code.
"""
Corrections
Some characters might not be correctly detected (more work to do).
"""Image to text - highlight mask
Some characters might not be correctly detected.
We can reduce noise, remove new lines, use string slices,
check punctuation, etc.
"""
import os, sys
import cv2, pytesseract, numpy as np
import re
DIR = os.path.dirname(os.path.realpath(__file__))
def imread_highlighted(img, debug=False):
    """Return the blurred, inverted mask of the yellow pixels of *img*.

    The image is converted from BGR to HSV, pixels inside the yellow HSV
    range are selected with ``cv2.inRange``, the mask is inverted and then
    smoothed with a 3x3 Gaussian blur to reduce noise before OCR.

    Args:
        img: BGR image, as returned by ``cv2.imread``.
        debug: When true, display the colour image restricted to the mask
            ("img") and the processed mask ("img2") and wait for a key press.
            Defaults to False, so existing callers are unaffected.

    Returns:
        The inverted and blurred mask.
    """
    hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)

    # Range of yellow color in HSV
    lower = np.array([22, 93, 0])
    upper = np.array([45, 255, 255])

    mask = cv2.inRange(hsv, lower, upper)
    inverted = cv2.bitwise_not(mask)
    inverted = cv2.GaussianBlur(inverted, (3, 3), 0)  # reduce noise

    if debug:
        # The masked colour image is only needed for visual inspection, so it
        # is computed here rather than on every call.
        masked = cv2.bitwise_and(img, img, mask=mask)
        cv2.imshow("img", masked)
        cv2.imshow("img2", inverted)
        cv2.waitKey(0)
        cv2.destroyAllWindows()

    return inverted
def highlighted_replaced(img, img2, debug=False):
    """OCR both images and wrap the highlighted passage of the text in <i> tags.

    Because OCR of the highlight mask may differ slightly from OCR of the
    full image, the passage is not matched as a whole. Only its first and
    last (up to) 10 characters are located in the full text: ``<i>`` is
    inserted before the first occurrence of the start fragment and ``</i>``
    after the first occurrence of the end fragment that follows it.

    Args:
        img: Image passed to ``pytesseract.image_to_string`` for the full text.
        img2: Image passed to ``pytesseract.image_to_string`` for the
            highlighted text (e.g. the result of ``imread_highlighted``).
        debug: When true, print the intermediate strings. Defaults to False,
            so existing callers are unaffected.

    Returns:
        The full text with the tags inserted. A tag whose fragment is empty
        or cannot be found is left out; a fragment that is not found is
        reported on standard output.
    """
    text = pytesseract.image_to_string(img).strip()
    highlighted = pytesseract.image_to_string(img2).strip()

    # Remove double new lines
    text = text.replace('\n\n', '\n')
    highlighted = highlighted.replace('\n\n', '\n')

    # A trailing comma is usually a misread end of the proposition: drop it
    highlighted = re.sub(r',$', '', highlighted)

    start = highlighted[:10].strip()   # start of highlighted text
    end = highlighted[-10:].strip()    # end of highlighted text

    replaced = text
    missing = None
    search_from = 0

    # Empty fragments are skipped: str.replace/find with an empty string
    # matches everywhere and would scatter tags over the whole text.
    if start:
        begin = text.find(start)
        if begin == -1:
            missing = start
        else:
            replaced = text[:begin] + '<i>' + text[begin:]
            # Look for the end fragment from the opening tag onwards so the
            # closing tag can never land before it.
            search_from = begin + len('<i>')

    if end:
        stop = replaced.find(end, search_from)
        if stop == -1:
            if missing is None:
                missing = end
        else:
            stop += len(end)
            replaced = replaced[:stop] + '</i>' + replaced[stop:]

    if debug:
        print('Text: \n' + text, '\n')
        print('Highlighted: \n' + highlighted, '\n')
        print('Start: \n' + start, '\n')
        print('End: \n' + end, '\n')
        print('Replaced: \n' + replaced, '\n')

    # Explicit check instead of assert, which is stripped under "python -O".
    if missing is not None:
        print('AssertionError:\n', '<%s> not in text' % missing)

    return replaced
# Process every image found under the "files" directory.
for root, dirs, files in os.walk(DIR + '/files/'):
    for file in files:
        # os.walk also descends into sub-directories, so the path has to be
        # built from the directory currently being visited.
        img_path = os.path.join(root, file)
        img = cv2.imread(img_path)
        # cv2.imread returns None for files it cannot decode (e.g. non-image
        # files); skip them instead of crashing in the colour conversion.
        if img is None:
            print('Skipping unreadable image: %s' % img_path, file=sys.stderr)
            continue
        img2 = imread_highlighted(img)
        replaced = highlighted_replaced(img, img2)
        print(replaced, '\n')
"""
where it was writing some big file. <i>We took really
good advantage of multithreading in Java, which
was less painful than I had expected it to be. It was
just really pleasant to work on.</i> From the API we
had designed we saw all these directions it could
grow.
I've also done a lot of testing since LiveJournal.
<i>Once I started working with other people
especially. And once I realized that code I write
never fucking goes away and I'm going to be a
maintainer for life.</i> I get comments about blog
posts that are almost 10 years old. “Hey, I found
this code. I found a bug,” and I'm suddenly
maintaining code.
Your competitor's six-month 1.0 has crap code and
<i>they're going to have to rewrite it in two years but</i>,
guess what: they can rewrite it because you don't
have ajob anymore.
"""
Last update: 415 days ago