Dave Jarvis' Repositories

git clone https://repo.autonoma.ca/repo/recipe-fiddle.git
<?php
namespace com\whitemagicsoftware;

require "constants.php";

use Imagick;

/**
 * Runs OCR on image files and reflows the recognized text.
 */
class Scan extends Obj {
  /**
   * Runs OCR on the given image and interprets the recognized text.
   *
   * @see recognize
   * @see interpret
   * @param $path The path to the image file to perform OCR on.
   * @return The interpreted text, as a single string.
   */
  public function distill( $path ) {
    return $this->interpret( $this->recognize( $path ) );
  }

  /**
   * Given a path to an image file, this will run the OCR executable on the
   * image and return the raw text that was extracted.
   *
   * The image file is rewritten in place (see resample) before the OCR
   * executable runs. The exit status of the executable is not inspected:
   * only the lines it wrote to standard output are returned.
   *
   * @param $path The fully-qualified path to the file to perform OCR.
   * @return An array of raw, non-empty text lines after performing OCR;
   * the keys are the original line indexes (not renumbered).
   */
  public function recognize( $path ) {
    global $EXECUTABLE_OCR;

    $language = "eng";

    // Rewrite the image in place as PNG (upscaling small images) so that
    // the OCR has a better chance of finding the text.
    $this->resample( $path );

    // The same path is handed to Imagick as a plain file name, so it must be
    // quoted before being placed on a shell command line; otherwise spaces
    // or shell metacharacters in the path would break (or hijack) the call.
    $file = escapeshellarg( $path );

    $command = "$EXECUTABLE_OCR $file stdout -l $language -psm 1";

    //$this->log( "exec: $command" );

    $output = array();
    exec( $command, $output );

    // Strip out blank lines and return the result. An explicit comparison
    // is used because the default array_filter test would also discard a
    // line consisting solely of "0".
    return array_filter( $output, function( $line ) {
      return $line !== "";
    } );
  }

  /**
   * Given a path to an image file this will trim the image and write it
   * back to the same path in PNG format, read at a resolution of 300dpi.
   * Images narrower than 1024 pixels are additionally desaturated and
   * scaled to three times their width. This helps the OCR find the text
   * easier.
   *
   * @param $path The path to the image file; its contents are overwritten.
   */
  private function resample( $path ) {
    $im = new Imagick();
    $im->setResolution( 300, 300 );
    $im->readImage( $path );
    $im->trimImage( 0 );

    $w = $im->getImageWidth();

    // Scale and resample the image if it is too small.
    if( $w < 1024 ) {
      // Use grayscale colour space.
      $im->setImageColorspace( Imagick::COLORSPACE_REC601LUMA );

      // Desaturate the image.
      $im->modulateImage( 100, 0, 100 );

      // Scale the image up; a height of 0 keeps the aspect ratio.
      // NOTE(review): resizeImage() documents its third argument as one of
      // the FILTER_* constants, yet an INTERPOLATE_* constant is passed, so
      // the filter actually applied depends on that constant's integer
      // value -- confirm which filter was intended.
      $im->resizeImage( $w * 3, 0, Imagick::INTERPOLATE_NEARESTNEIGHBOR, 1 );
    }

    // Should set the image to TIFF...
    // @see http://stackoverflow.com/a/4728905/59087
    $im->setFormat("png24");

    // The Imagick object is converted to a string (the encoded image) when
    // written; the original file is replaced regardless of its extension.
    file_put_contents( $path, $im );
    $im->destroy();
  }

  /**
   * Interprets the text extracted from a call to recognize.
   *
   * Each line is trimmed and appended to the result, separated from the
   * previous text by a single space (which means the result begins with a
   * space when the first line is not blank). A line that ends with a hyphen
   * has the hyphen removed and is joined to the next line without a space.
   * A blank line contributes a newline character.
   *
   * @see recognize
   * @param $rawOcr An array of text.
   * @return The lines of text merged into a single string.
   */
  public function interpret( $rawOcr ) {
    $result = "";
    $space = " ";

    foreach( $rawOcr as $text ) {
      // \todo make multibyte safe: http://php.net/manual/en/ref.mbstring.php
      $text = trim( $text );

      // Compare against the empty string rather than using empty(), which
      // would also treat the line "0" as blank and lose it.
      if( $text === "" ) {
        $result .= "\n";
        continue;
      }

      // If the text is hyphenated, join the hyphens together. The
      // spell check should extract the hyphenated words correctly.
      $hyphenated = mb_substr( $text, -1 ) === "-";

      if( $hyphenated ) {
        $text = mb_substr( $text, 0, -1 );
      }

      $result .= $space . $text;

      // No separator before the continuation of a hyphenated word.
      $space = $hyphenated ? "" : " ";
    }

    return $result;
  }
}