Skip to content

Implement PNG alpha channel removal in OCR processing #310

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 1 commit into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions lib/OcrProcessors/IOcrProcessor.php
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,9 @@

namespace OCA\WorkflowOcr\OcrProcessors;

use OCA\WorkflowOcr\Exception\OcrAlreadyDoneException;
use OCA\WorkflowOcr\Exception\OcrNotPossibleException;
use OCA\WorkflowOcr\Exception\OcrResultEmptyException;
use OCA\WorkflowOcr\Model\GlobalSettings;
use OCA\WorkflowOcr\Model\WorkflowSettings;
use OCP\Files\File;
Expand All @@ -35,6 +37,8 @@ interface IOcrProcessor {
* @param WorkflowSettings $settings The settings to be used for this specific workflow
* @param GlobalSettings $globalSettings The global settings configured for all OCR workflows on this system
* @return OcrProcessorResult
* @throws OcrAlreadyDoneException
* @throws OcrResultEmptyException
* @throws OcrNotPossibleException
*/
public function ocrFile(File $file, WorkflowSettings $settings, GlobalSettings $globalSettings) : OcrProcessorResult;
Expand Down
35 changes: 8 additions & 27 deletions lib/OcrProcessors/Local/OcrMyPdfBasedProcessor.php
Original file line number Diff line number Diff line change
Expand Up @@ -23,34 +23,30 @@

namespace OCA\WorkflowOcr\OcrProcessors\Local;

use OCA\WorkflowOcr\Exception\OcrAlreadyDoneException;
use OCA\WorkflowOcr\Exception\OcrNotPossibleException;
use OCA\WorkflowOcr\Exception\OcrResultEmptyException;
use OCA\WorkflowOcr\Helper\ISidecarFileAccessor;
use OCA\WorkflowOcr\Model\GlobalSettings;
use OCA\WorkflowOcr\Model\WorkflowSettings;
use OCA\WorkflowOcr\OcrProcessors\ICommandLineUtils;
use OCA\WorkflowOcr\OcrProcessors\IOcrProcessor;
use OCA\WorkflowOcr\OcrProcessors\OcrProcessorResult;
use OCA\WorkflowOcr\OcrProcessors\OcrProcessorBase;
use OCA\WorkflowOcr\Wrapper\ICommand;
use OCP\Files\File;
use Psr\Log\LoggerInterface;

abstract class OcrMyPdfBasedProcessor implements IOcrProcessor {
abstract class OcrMyPdfBasedProcessor extends OcrProcessorBase {
public function __construct(
private ICommand $command,
private LoggerInterface $logger,
protected LoggerInterface $logger,
private ISidecarFileAccessor $sidecarFileAccessor,
private ICommandLineUtils $commandLineUtils,
) {
parent::__construct($logger);
}

public function ocrFile(File $file, WorkflowSettings $settings, GlobalSettings $globalSettings): OcrProcessorResult {
protected function doOcrProcessing($fileResource, string $fileName, WorkflowSettings $settings, GlobalSettings $globalSettings): array {
$additionalCommandlineArgs = $this->getAdditionalCommandlineArgs($settings, $globalSettings);
$sidecarFile = $this->sidecarFileAccessor->getOrCreateSidecarFile();
$commandStr = 'ocrmypdf ' . $this->commandLineUtils->getCommandlineArgs($settings, $globalSettings, $sidecarFile, $additionalCommandlineArgs) . ' - - || exit $? ; cat';

$inputFileContent = $file->getContent();
$inputFileContent = stream_get_contents($fileResource);

$this->command
->setCommand($commandStr)
Expand All @@ -64,11 +60,7 @@ public function ocrFile(File $file, WorkflowSettings $settings, GlobalSettings $
$exitCode = $this->command->getExitCode();

if (!$success) {
# Gracefully handle OCR_MODE_SKIP_FILE (ExitCode.already_done_ocr)
if ($exitCode === 6) {
throw new OcrAlreadyDoneException('File ' . $file->getPath() . ' appears to contain text so it may not need OCR. Message: ' . $errorOutput . ' ' . $stdErr);
}
throw new OcrNotPossibleException('OCRmyPDF exited abnormally with exit-code ' . $exitCode . ' for file ' . $file->getPath() . '. Message: ' . $errorOutput . ' ' . $stdErr);
return [false, null, null, $exitCode, $errorOutput . ' ' . $stdErr];
}

if ($stdErr !== '' || $errorOutput !== '') {
Expand All @@ -80,20 +72,9 @@ public function ocrFile(File $file, WorkflowSettings $settings, GlobalSettings $
}

$ocrFileContent = $this->command->getOutput();

if (!$ocrFileContent) {
throw new OcrResultEmptyException('OCRmyPDF did not produce any output for file ' . $file->getPath());
}

$recognizedText = $this->sidecarFileAccessor->getSidecarFileContent();

if (!$recognizedText) {
$this->logger->info('Temporary sidecar file at \'{path}\' was empty', ['path' => $sidecarFile]);
}

$this->logger->debug('OCR processing was successful');

return new OcrProcessorResult($ocrFileContent, 'pdf', $recognizedText);
return [true, $ocrFileContent, $recognizedText, $exitCode, null];
}

/**
Expand Down
125 changes: 125 additions & 0 deletions lib/OcrProcessors/OcrProcessorBase.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
<?php

declare(strict_types=1);

/**
* @copyright Copyright (c) 2025 Robin Windey <[email protected]>
*
* @license GNU AGPL version 3 or any later version
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

namespace OCA\WorkflowOcr\OcrProcessors;

use OCA\WorkflowOcr\Exception\OcrAlreadyDoneException;
use OCA\WorkflowOcr\Exception\OcrNotPossibleException;
use OCA\WorkflowOcr\Exception\OcrResultEmptyException;
use OCA\WorkflowOcr\Model\GlobalSettings;
use OCA\WorkflowOcr\Model\WorkflowSettings;
use OCP\Files\File;
use Psr\Log\LoggerInterface;

/**
 * Base class for all OCR processors.
 *
 * Implements the processor-independent part of ocrFile(): the input file is
 * opened (PNG images are stripped of their alpha channel first), handed to the
 * concrete processor via doOcrProcessing(), and the returned tuple is turned
 * into either an OcrProcessorResult or one of the OCR exceptions.
 */
abstract class OcrProcessorBase implements IOcrProcessor {
	/** MIME type of the files which get their alpha channel removed before OCR. */
	private const PNG_MIME_TYPE = 'image/png';

	/** Exit code treated as "OCR already done" (file appears to contain text). */
	private const EXIT_CODE_ALREADY_DONE_OCR = 6;

	public function __construct(
		protected LoggerInterface $logger,
	) {
	}

	/**
	 * Runs OCR for the given file.
	 *
	 * @param File $file The file to be processed
	 * @param WorkflowSettings $settings The settings to be used for this specific workflow
	 * @param GlobalSettings $globalSettings The global settings configured for all OCR workflows on this system
	 * @return OcrProcessorResult
	 * @throws OcrAlreadyDoneException If the processor reported exit code 6
	 * @throws OcrResultEmptyException If the processor succeeded but returned no file content
	 * @throws OcrNotPossibleException If the file could not be opened or the processor failed
	 */
	public function ocrFile(File $file, WorkflowSettings $settings, GlobalSettings $globalSettings): OcrProcessorResult {
		$fileName = $file->getName();
		$fileResource = $this->doFilePreprocessing($file);

		// fopen() is documented to return false on failure; fail early with a
		// meaningful OCR exception instead of passing false to the processor.
		if ($fileResource === false) {
			throw new OcrNotPossibleException('Could not open file ' . $fileName . ' for reading');
		}

		try {
			[$success, $fileContent, $recognizedText, $exitCode, $errorMessage] = $this->doOcrProcessing($fileResource, $fileName, $settings, $globalSettings);

			if (!$success) {
				$this->throwException($errorMessage, $exitCode);
			}

			if (!$recognizedText) {
				$this->logger->info('Recognized text was empty');
			}

			if (!$fileContent) {
				throw new OcrResultEmptyException('OCRmyPDF did not produce any output for file ' . $fileName);
			}

			// OcrProcessorResult requires a string; a missing text is reported as empty text.
			return new OcrProcessorResult($fileContent, is_string($recognizedText) ? $recognizedText : '');
		} finally {
			if (is_resource($fileResource)) {
				fclose($fileResource);
			}
		}
	}

	/**
	 * Perform the actual OCR processing. Implementation is specific to the OCR processor. Might be local or remote.
	 * Should return [$success, $fileContent, $recognizedText, $exitCode, $errorMessage]
	 *
	 * @param resource $fileResource Readable stream with the (preprocessed) file content
	 * @param string $fileName Name of the file being processed
	 * @param WorkflowSettings $settings
	 * @param GlobalSettings $globalSettings
	 * @return array{0: bool, 1: ?string, 2: ?string, 3: ?int, 4: ?string}
	 */
	abstract protected function doOcrProcessing($fileResource, string $fileName, WorkflowSettings $settings, GlobalSettings $globalSettings): array;

	/**
	 * Opens the file for reading. PNG images are passed through the alpha
	 * channel removal first, provided the Imagick extension is available.
	 *
	 * @return resource|false
	 */
	private function doFilePreprocessing(File $file) {
		if ($file->getMimeType() !== self::PNG_MIME_TYPE) {
			return $file->fopen('rb');
		}

		if (!class_exists(\Imagick::class)) {
			// Without Imagick the image cannot be inspected; process it unmodified.
			$this->logger->debug('Imagick is not available, skipping PNG alpha channel check');
			return $file->fopen('rb');
		}

		return $this->removeAlphaChannelFromImage($file);
	}

	/**
	 * Returns a readable stream, positioned at its start, which contains the
	 * PNG without alpha channel. If the image has no alpha channel (or it
	 * could not be removed) the stream contains the unmodified file content.
	 *
	 * @return resource|false
	 */
	private function removeAlphaChannelFromImage(File $file) {
		$this->logger->debug('Checking if PNG has alpha channel');

		$original = $file->fopen('rb');
		if ($original === false) {
			return false;
		}

		$alphaFreeCopy = $this->createAlphaFreeCopy($original, $file->getName());
		if ($alphaFreeCopy !== null) {
			fclose($original);
			return $alphaFreeCopy;
		}

		// Imagick consumed the stream while inspecting the image, so it has to be
		// positioned at the start again before it can be handed to the processor.
		if (stream_get_meta_data($original)['seekable'] && rewind($original)) {
			return $original;
		}

		// Not seekable: drop the consumed handle and open a fresh one.
		fclose($original);
		return $file->fopen('rb');
	}

	/**
	 * Reads the image from the given stream and, if it has an alpha channel,
	 * writes an alpha-free version of it into a new temporary stream.
	 *
	 * @param resource $imageResource Readable stream with the PNG content
	 * @param string $imageName File name passed to Imagick alongside the stream
	 * @return resource|null The new stream, or null if the image was left unchanged
	 */
	private function createAlphaFreeCopy($imageResource, string $imageName) {
		$image = null;
		try {
			$image = new \Imagick();
			$image->readImageFile($imageResource, $imageName);

			if (!$image->getImageAlphaChannel()) {
				$this->logger->debug('PNG does not have alpha channel, no need to remove it');
				return null;
			}

			$this->logger->debug('PNG has alpha channel, removing it');
			// ALPHACHANNEL_REMOVE composites the image over its background color.
			// mergeImageLayers() is not called here: it returns a new object
			// instead of modifying this one, so calling it and dropping the
			// result would not change the blob below.
			$image->setImageAlphaChannel(\Imagick::ALPHACHANNEL_REMOVE);
			$imageBlob = $image->getImageBlob();

			$stream = fopen('php://temp', 'r+');
			if ($stream === false) {
				$this->logger->warning('Could not create temporary stream for PNG without alpha channel, using original file');
				return null;
			}
			if (fwrite($stream, $imageBlob) === false || !rewind($stream)) {
				fclose($stream);
				$this->logger->warning('Could not write PNG without alpha channel to temporary stream, using original file');
				return null;
			}
			return $stream;
		} catch (\ImagickException $e) {
			// Best effort: let the OCR processor deal with the unmodified file.
			$this->logger->warning('Could not remove alpha channel from PNG, using original file: ' . $e->getMessage(), ['exception' => $e]);
			return null;
		} finally {
			// $image stays null if the constructor failed.
			$image?->clear();
		}
	}

	/**
	 * Throws an appropriate exception based on the error message and exit code.
	 * This method never returns normally.
	 *
	 * @param string|null $errorMessage Message reported by the OCR processor
	 * @param int|null $exitCode Exit code reported by the OCR processor
	 * @throws OcrAlreadyDoneException If the exit code is 6
	 * @throws OcrNotPossibleException For every other exit code
	 */
	private function throwException($errorMessage, $exitCode): void {
		// Exception messages must be strings under strict_types.
		$message = (string)$errorMessage;

		if ($exitCode === self::EXIT_CODE_ALREADY_DONE_OCR) {
			throw new OcrAlreadyDoneException('File appears to contain text so it may not need OCR. Message: ' . $message);
		}
		throw new OcrNotPossibleException($message);
	}
}
9 changes: 1 addition & 8 deletions lib/OcrProcessors/OcrProcessorResult.php
Original file line number Diff line number Diff line change
Expand Up @@ -30,25 +30,18 @@ class OcrProcessorResult {
/** @var string */
private $fileContent;
/** @var string */
private $fileExtension;
/** @var string */
private $recognizedText;


public function __construct(string $fileContent, string $fileExtension, string $recognizedText) {
public function __construct(string $fileContent, string $recognizedText) {
$this->fileContent = $fileContent;
$this->fileExtension = $fileExtension;
$this->recognizedText = $recognizedText;
}

public function getFileContent(): string {
return $this->fileContent;
}

public function getFileExtension(): string {
return $this->fileExtension;
}

public function getRecognizedText(): string {
return $this->recognizedText;
}
Expand Down
28 changes: 7 additions & 21 deletions lib/OcrProcessors/Remote/WorkflowOcrRemoteProcessor.php
Original file line number Diff line number Diff line change
Expand Up @@ -23,34 +23,28 @@

namespace OCA\WorkflowOcr\OcrProcessors\Remote;

use OCA\WorkflowOcr\Exception\OcrAlreadyDoneException;
use OCA\WorkflowOcr\Exception\OcrNotPossibleException;
use OCA\WorkflowOcr\Model\GlobalSettings;
use OCA\WorkflowOcr\Model\WorkflowSettings;
use OCA\WorkflowOcr\OcrProcessors\ICommandLineUtils;
use OCA\WorkflowOcr\OcrProcessors\IOcrProcessor;
use OCA\WorkflowOcr\OcrProcessors\OcrProcessorResult;
use OCA\WorkflowOcr\OcrProcessors\OcrProcessorBase;
use OCA\WorkflowOcr\OcrProcessors\Remote\Client\IApiClient;
use OCA\WorkflowOcr\OcrProcessors\Remote\Client\Model\ErrorResult;
use OCP\Files\File;
use Psr\Log\LoggerInterface;

/**
* OCR Processor which utilizes the Workflow OCR Backend remote service to perform OCR.
*/
class WorkflowOcrRemoteProcessor implements IOcrProcessor {
class WorkflowOcrRemoteProcessor extends OcrProcessorBase {
public function __construct(
private IApiClient $apiClient,
private ICommandLineUtils $commandLineUtils,
private LoggerInterface $logger,
protected LoggerInterface $logger,
) {

parent::__construct($logger);
}

public function ocrFile(File $file, WorkflowSettings $settings, GlobalSettings $globalSettings): OcrProcessorResult {
protected function doOcrProcessing($fileResource, string $fileName, WorkflowSettings $settings, GlobalSettings $globalSettings): array {
$ocrMyPdfParameters = $this->commandLineUtils->getCommandlineArgs($settings, $globalSettings);
$fileResource = $file->fopen('rb');
$fileName = $file->getName();

$this->logger->debug('Sending OCR request to remote backend');
$apiResult = $this->apiClient->processOcr($fileResource, $fileName, $ocrMyPdfParameters);
Expand All @@ -60,17 +54,9 @@ public function ocrFile(File $file, WorkflowSettings $settings, GlobalSettings $
$resultMessage = $apiResult->getMessage();
$exitCode = $apiResult->getOcrMyPdfExitCode();

# Gracefully handle OCR_MODE_SKIP_FILE (ExitCode.already_done_ocr)
if ($exitCode === 6) {
throw new OcrAlreadyDoneException('File ' . $file->getPath() . ' appears to contain text so it may not need OCR. Message: ' . $resultMessage);
}
throw new OcrNotPossibleException($resultMessage);
return [false, null, null, $exitCode, $resultMessage];
}

return new OcrProcessorResult(
base64_decode($apiResult->getFileContent()),
pathinfo($apiResult->getFilename(), PATHINFO_EXTENSION),
$apiResult->getRecognizedText()
);
return [true, base64_decode($apiResult->getFileContent()), $apiResult->getRecognizedText(), 0, null];
}
}
13 changes: 6 additions & 7 deletions lib/Service/OcrService.php
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@
class OcrService implements IOcrService {
private const FILE_VERSION_LABEL_KEY = 'label';
private const FILE_VERSION_LABEL_VALUE = 'Before OCR';
private const PDF_FILE_EXTENSION = 'pdf';

/** @var IOcrProcessorFactory */
private $ocrProcessorFactory;
Expand Down Expand Up @@ -323,7 +324,6 @@ private function doPostProcessing(Node $file, string $uid, WorkflowSettings $set
$fileId = $file->getId();
$fileContent = $result->getFileContent();
$originalFileExtension = $file->getExtension();
$newFileExtension = $result->getFileExtension();

// Only create a new file version if the file OCR result was not empty #130
if ($result->getRecognizedText() !== '') {
Expand All @@ -332,7 +332,7 @@ private function doPostProcessing(Node $file, string $uid, WorkflowSettings $set
$this->setFileVersionsLabel($file, $uid, self::FILE_VERSION_LABEL_VALUE);
}

$newFilePath = $this->determineNewFilePath($file, $originalFileExtension, $newFileExtension);
$newFilePath = $this->determineNewFilePath($file, $originalFileExtension);
$this->createNewFileVersion($newFilePath, $fileContent, $fileMtime);
}

Expand All @@ -349,19 +349,18 @@ private function doPostProcessing(Node $file, string $uid, WorkflowSettings $set
*
* @param Node $file The original file node for which the OCR processing has been succeeded.
* @param string $originalFileExtension The original file extension.
* @param string $newFileExtension The new file extension to be applied.
* @return string The new file path with the updated extension.
*/
private function determineNewFilePath(Node $file, string $originalFileExtension, string $newFileExtension): string {
private function determineNewFilePath(Node $file, string $originalFileExtension): string {
$filePath = $file->getPath();
if ($originalFileExtension !== $newFileExtension) {
if ($originalFileExtension !== self::PDF_FILE_EXTENSION) {
// If the extension changed, will create a new file with the new extension
return $filePath . '.' . $newFileExtension;
return "$filePath." . self::PDF_FILE_EXTENSION;
}
if (!$file->isUpdateable()) {
// Add suffix '_OCR' if original file cannot be updated
$fileInfo = pathinfo($filePath);
return $fileInfo['dirname'] . '/' . $fileInfo['filename'] . '_OCR.' . $newFileExtension;
return $fileInfo['dirname'] . '/' . $fileInfo['filename'] . '_OCR.' . self::PDF_FILE_EXTENSION;
}
// By returning the original file path, we will create a new file version of the original file
return $filePath;
Expand Down
24 changes: 24 additions & 0 deletions tests/Integration/LocalBackendTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -123,4 +123,28 @@ public function testWorkflowOcrLocalBackendSkipText(): void {
$this->assertInstanceOf(TextRecognizedEvent::class, $textRecognizedEvent, 'Expected TextRecognizedEvent instance');
$this->assertEquals('[OCR skipped on page(s) 1]', trim($textRecognizedEvent->getRecognizedText()), 'Expected recognized text');
}

/**
 * Processes the fixture 'png-with-alpha-channel.png' with an 'image/png'
 * operation and expects exactly one TextRecognizedEvent carrying the text
 * 'PNG with alpha channel', without any request to the OCR Backend Service.
 */
public function testWorkflowOcrLocalBackendPngWithAlphaChannel(): void {
	// Arrange + act
	$this->addOperation('image/png');
	$this->uploadTestFile('png-with-alpha-channel.png');
	$this->runOcrBackgroundJob();

	// The remote backend must not have been contacted
	$backendRequests = $this->apiClient->getRequests();
	$this->assertEmpty($backendRequests, 'Expected no OCR Backend Service requests');

	// Exactly one event of the expected type must have been captured
	$eventCount = count($this->capturedEvents);
	$this->assertEquals(1, $eventCount, 'Expected 1 TextRecognizedEvent');
	$event = $this->capturedEvents[0];
	$this->assertInstanceOf(TextRecognizedEvent::class, $event, 'Expected TextRecognizedEvent instance');

	// The event must carry the text contained in the fixture image
	$recognizedText = trim($event->getRecognizedText());
	$this->assertEquals('PNG with alpha channel', $recognizedText, 'Expected recognized text');
}

/**
 * Processes the fixture 'png-without-alpha-channel.png' with an 'image/png'
 * operation and expects exactly one TextRecognizedEvent carrying the text
 * 'PNG without alpha channel', without any request to the OCR Backend Service.
 *
 * NOTE(review): despite the "RegularJpg" in the method name, this test uses
 * a PNG fixture and the 'image/png' MIME type - consider renaming it.
 */
public function testWorkflowOcrLocalBackendRegularJpg(): void {
	// Arrange + act
	$this->addOperation('image/png');
	$this->uploadTestFile('png-without-alpha-channel.png');
	$this->runOcrBackgroundJob();

	// The remote backend must not have been contacted
	$backendRequests = $this->apiClient->getRequests();
	$this->assertEmpty($backendRequests, 'Expected no OCR Backend Service requests');

	// Exactly one event of the expected type must have been captured
	$eventCount = count($this->capturedEvents);
	$this->assertEquals(1, $eventCount, 'Expected 1 TextRecognizedEvent');
	$event = $this->capturedEvents[0];
	$this->assertInstanceOf(TextRecognizedEvent::class, $event, 'Expected TextRecognizedEvent instance');

	// The event must carry the text contained in the fixture image
	$recognizedText = trim($event->getRecognizedText());
	$this->assertEquals('PNG without alpha channel', $recognizedText, 'Expected recognized text');
}
}
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Loading