Dave Jarvis' Repositories

git clone https://repo.autonoma.ca/repo/redirect.git
<?php
final class ArithCoder {
  private const BITS = 31;
  private const TOP  = 0x7FFFFFFF;
  private const HALF = 0x40000000;
  private const QTR  = 0x20000000;

  private const ALPHABET =
      'abcdefghijklmnopqrstuvwxyz'
    . 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    . '0123456789'
    . '-._~:/?#[]@!$&\'()*+,;=%'
    . "\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F"
    . "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F";

  private const DICT_ENTRIES = [
    "\x01" => '.com/',    "\x02" => '.org/',    "\x03" => '.net/',
    "\x04" => '.edu/',    "\x05" => '.gov/',    "\x06" => '.co.',
    "\x07" => '.html',    "\x08" => '.htm',     "\x09" => '.php',
    "\x0A" => '.json',    "\x0B" => '.xml',     "\x0C" => '.jpg',
    "\x0D" => '.png',     "\x0E" => '.pdf',     "\x0F" => '.asp',
    "\x10" => 'tion',     "\x11" => 'ment',     "\x12" => 'news/',
    "\x13" => 'wiki/',    "\x14" => 'index',    "\x15" => 'article',
    "\x16" => 'https://www.',
    "\x17" => 'https://',
    "\x18" => 'http://www.',
    "\x19" => 'http://',
    "\x1A" => '-the-',
    "\x1B" => '-and-',
    "\x1C" => '-to-',
    "\x1D" => '-of-',
    "\x1E" => '-in-',
    "\x1F" => '-for-',
  ];

  private const BASE_FREQ = [
    'a' =>  80, 'b' =>  28, 'c' =>  42, 'd' =>  35, 'e' => 105,
    'f' =>  20, 'g' =>  25, 'h' =>  35, 'i' =>  70, 'j' =>   5,
    'k' =>  15, 'l' =>  58, 'm' =>  38, 'n' =>  65, 'o' =>  68,
    'p' =>  32, 'q' =>   4, 'r' =>  60, 's' =>  62, 't' =>  72,
    'u' =>  28, 'v' =>  12, 'w' =>  22, 'x' =>   5, 'y' =>  18,
    'z' =>   4,
    'A' =>  15, 'B' =>  12, 'C' =>  14, 'D' =>  10, 'E' =>  10,
    'F' =>   9, 'G' =>   8, 'H' =>  10, 'I' =>  10, 'J' =>   5,
    'K' =>   9, 'L' =>  10, 'M' =>  12, 'N' =>  10, 'O' =>   9,
    'P' =>  12, 'Q' =>   3, 'R' =>  10, 'S' =>  12, 'T' =>  12,
    'U' =>   7, 'V' =>   5, 'W' =>   9, 'X' =>   3, 'Y' =>   5,
    'Z' =>   3,
    '0' =>  24, '1' =>  26, '2' =>  20, '3' =>  15, '4' =>  14,
    '5' =>  16, '6' =>  14, '7' =>  11, '8' =>  11, '9' =>  15,
    '-' =>  58, '.' =>  42, '_' =>  16, '~' =>   2,
    ':' =>   5, '/' =>  78, '?' =>   8, '#' =>   3,
    '[' =>   1, ']' =>   1, '@' =>   2, '!' =>   1,
    '$' =>   1, '&' =>   8, "'" =>   1, '(' =>   2,
    ')' =>   2, '*' =>   1, '+' =>   3, ',' =>   2,
    ';' =>   2, '=' =>  12, '%' =>   8,
    "\x01" => 12, "\x02" =>  6, "\x03" =>  6, "\x04" =>  3,
    "\x05" =>  3, "\x06" =>  3, "\x07" =>  5, "\x08" =>  3,
    "\x09" =>  4, "\x0A" =>  3, "\x0B" =>  2, "\x0C" =>  3,
    "\x0D" =>  3, "\x0E" =>  3, "\x0F" =>  2, "\x10" =>  5,
    "\x11" =>  4, "\x12" =>  4, "\x13" =>  3, "\x14" =>  3,
    "\x15" =>  3,
    "\x16" => 30, "\x17" => 25, "\x18" =>  5, "\x19" =>  5,
    "\x1A" =>  8, "\x1B" =>  6, "\x1C" =>  8, "\x1D" =>  7,
    "\x1E" =>  7, "\x1F" =>  5,
  ];
  private const EOF_FREQ = 1;

  private const CTX_START = 0;
  private const CTX_SLASH = 1;
  private const CTX_DOT   = 2;
  private const CTX_DIGIT = 3;
  private const CTX_HYPH  = 4;
  private const CTX_LOWER = 5;
  private const CTX_UPPER = 6;
  private const CTX_OTHER = 7;
  private const CTX_COUNT = 8;

  private const CLS_LOWER = 0;
  private const CLS_UPPER = 1;
  private const CLS_DIGIT = 2;
  private const CLS_SLASH = 3;
  private const CLS_DOT   = 4;
  private const CLS_HYPH  = 5;
  private const CLS_OTHER = 6;

  private const CTX_CLASS_MULT = [
    [180,  30,  50,   5,  20,  10,  30],
    [160,  80, 140,  10,  20,  15,  40],
    [ 40,  10,  60,  60,  30,  10,  20],
    [ 30,  15, 400, 100,  80, 120,  40],
    [200, 120,  80,  15,  10,  15,  30],
    [160,  20,  40,  90,  70, 140,  40],
    [160, 150,  30,  40,  30, 120,  30],
    [100, 100, 100, 100, 100, 100, 100],
  ];

  private const CTX_DOT_OVERRIDES = [
    'c' => 500, 'o' => 400, 'h' => 300, 'n' => 300, 'p' => 300,
    'j' => 200, 's' => 150, 't' => 150, 'g' => 150, 'e' => 150,
    'x' => 120, 'a' => 120, 'i' => 120,
  ];

  private const CTX_START_OVERRIDES = [
    "\x16" => 3000,
    "\x17" => 2000,
    "\x18" => 500,
    "\x19" => 500,
  ];

  private const DICT_NEXT_CTX = [
    "\x01" => self::CTX_SLASH, "\x02" => self::CTX_SLASH,
    "\x03" => self::CTX_SLASH, "\x04" => self::CTX_SLASH,
    "\x05" => self::CTX_SLASH, "\x06" => self::CTX_DOT,
    "\x07" => self::CTX_LOWER, "\x08" => self::CTX_LOWER,
    "\x09" => self::CTX_LOWER, "\x0A" => self::CTX_LOWER,
    "\x0B" => self::CTX_LOWER, "\x0C" => self::CTX_LOWER,
    "\x0D" => self::CTX_LOWER, "\x0E" => self::CTX_LOWER,
    "\x0F" => self::CTX_LOWER, "\x10" => self::CTX_LOWER,
    "\x11" => self::CTX_LOWER, "\x12" => self::CTX_SLASH,
    "\x13" => self::CTX_SLASH, "\x14" => self::CTX_LOWER,
    "\x15" => self::CTX_LOWER,
    "\x16" => self::CTX_DOT,   "\x17" => self::CTX_SLASH,
    "\x18" => self::CTX_DOT,   "\x19" => self::CTX_SLASH,
    "\x1A" => self::CTX_HYPH,  "\x1B" => self::CTX_HYPH,
    "\x1C" => self::CTX_HYPH,  "\x1D" => self::CTX_HYPH,
    "\x1E" => self::CTX_HYPH,  "\x1F" => self::CTX_HYPH,
  ];

  private static array $model   = [];
  private static array $dictEnc = [];
  private static array $dictDec = [];
  private static bool  $init    = false;

  private static function charClass( string $ch ): int {
    return match( true ) {
      $ch >= 'a' && $ch <= 'z'   => self::CLS_LOWER,
      $ch >= 'A' && $ch <= 'Z'   => self::CLS_UPPER,
      $ch >= '0' && $ch <= '9'   => self::CLS_DIGIT,
      $ch === '/'                => self::CLS_SLASH,
      $ch === '.'                => self::CLS_DOT,
      $ch === '-' || $ch === '_' => self::CLS_HYPH,
      default                    => self::CLS_OTHER,
    };
  }

  private static function charCtx( string $ch ): int {
    return isset( self::DICT_NEXT_CTX[$ch] )
      ? self::DICT_NEXT_CTX[$ch]
      : match( self::charClass( $ch ) ) {
        self::CLS_LOWER => self::CTX_LOWER,
        self::CLS_UPPER => self::CTX_UPPER,
        self::CLS_DIGIT => self::CTX_DIGIT,
        self::CLS_SLASH => self::CTX_SLASH,
        self::CLS_DOT   => self::CTX_DOT,
        self::CLS_HYPH  => self::CTX_HYPH,
        default         => self::CTX_OTHER,
      };
  }

  private static function initModel(): void {
    if( !self::$init ) {
      $chars    = str_split( self::ALPHABET );
      $chars[]  = "\x00";
      $symCount = count( $chars );
      $c2i      = [];
      $i2c      = [];

      foreach( $chars as $i => $ch ) {
        $c2i[$ch] = $i;
        $i2c[$i]  = $ch;
      }

      $contexts = [];

      for( $ctx = 0; $ctx < self::CTX_COUNT; $ctx++ ) {
        $cum     = [0];
        $running = 0;

        foreach( $chars as $ch ) {
          if( $ch === "\x00" ) {
            $w = self::EOF_FREQ;
          } else {
            $base = self::BASE_FREQ[$ch] ?? 1;
            $cls  = self::charClass( $ch );
            $mult = self::CTX_CLASS_MULT[$ctx][$cls];

            if( $ctx === self::CTX_DOT
              && isset( self::CTX_DOT_OVERRIDES[$ch] ) ) {
              $mult = self::CTX_DOT_OVERRIDES[$ch];
            }

            if( $ctx === self::CTX_START
              && isset( self::CTX_START_OVERRIDES[$ch] ) ) {
              $mult = self::CTX_START_OVERRIDES[$ch];
            }

            $w = max( 1, intdiv( $base * $mult, 100 ) );
          }

          $running += $w;
          $cum[]    = $running;
        }

        $contexts[$ctx] = ['cum' => $cum, 'total' => $running];
      }

      $enc = [];
      $dec = [];

      foreach( self::DICT_ENTRIES as $token => $substr ) {
        $enc[$substr] = $token;
        $dec[$token]  = $substr;
      }

      $sortedEnc = [];

      foreach( $enc as $substr => $token ) {
        $sortedEnc[] = [$substr, $token];
      }

      usort( $sortedEnc, function( $a, $b ) {
        return strlen( $b[0] ) - strlen( $a[0] );
      });

      self::$model   = [
        'contexts' => $contexts,
        'c2i'      => $c2i,
        'i2c'      => $i2c,
        'size'     => $symCount,
      ];
      self::$dictEnc = $sortedEnc;
      self::$dictDec = $dec;
      self::$init    = true;
    }
  }

  private static function dictEncode( string $s ): string {
    foreach( self::$dictEnc as [$substr, $token] ) {
      $s = str_replace( $substr, $token, $s );
    }

    return $s;
  }

  private static function dictDecode( string $s ): string {
    foreach( self::$dictDec as $token => $substr ) {
      $s = str_replace( $token, $substr, $s );
    }

    return $s;
  }

  private static function needsRenorm( int $low, int $high ): bool {
    return $high < self::HALF
      || $low >= self::HALF
      || ($low >= self::QTR && $high < 3 * self::QTR);
  }

  public static function encode( string $input ): string {
    self::initModel();

    $m       = self::$model;
    $input   = self::dictEncode( $input );
    $low     = 0;
    $high    = self::TOP;
    $pending = 0;
    $bits    = [];
    $ctx     = self::CTX_START;
    $len     = strlen( $input );

    for( $pos = 0; $pos <= $len; $pos++ ) {
      $ch  = $pos < $len ? $input[$pos] : "\x00";
      $idx = $m['c2i'][$ch] ?? -1;

      if( $idx !== -1 ) {
        $ct    = $m['contexts'][$ctx];
        $range = $high - $low + 1;
        $high  = $low + intdiv( $range * $ct['cum'][$idx + 1], $ct['total'] ) - 1;
        $low   = $low + intdiv( $range * $ct['cum'][$idx], $ct['total'] );

        while( self::needsRenorm( $low, $high ) ) {
          if( $high < self::HALF ) {
            self::emitBit( 0, $pending, $bits );
          } elseif( $low >= self::HALF ) {
            self::emitBit( 1, $pending, $bits );

            $low  -= self::HALF;
            $high -= self::HALF;
          } else {
            $pending++;

            $low  -= self::QTR;
            $high -= self::QTR;
          }

          $low  = $low << 1;
          $high = ($high << 1) | 1;
        }

        if( $ch !== "\x00" ) {
          $ctx = self::charCtx( $ch );
        }
      }
    }

    $pending++;

    self::emitBit( $low < self::QTR ? 0 : 1, $pending, $bits );

    return self::packBits( $bits );
  }

  public static function decode( string $data ): string|false {
    self::initModel();

    $m        = self::$model;
    $bits     = self::unpackBits( $data );
    $bitCount = count( $bits );
    $bitPos   = 0;
    $value    = 0;

    for( $i = 0; $i < self::BITS; $i++ ) {
      $value = ($value << 1) | ($bitPos < $bitCount ? $bits[$bitPos++] : 0);
    }

    $low    = 0;
    $high   = self::TOP;
    $result = '';
    $eof    = false;
    $ctx    = self::CTX_START;

    while( !$eof && strlen( $result ) <= MAX_URL_LENGTH ) {
      $ct     = $m['contexts'][$ctx];
      $range  = $high - $low + 1;
      $scaled = intdiv( ($value - $low + 1) * $ct['total'] - 1, $range );
      $lo     = 0;
      $hi     = $m['size'] - 1;

      while( $lo < $hi ) {
        $mid = ($lo + $hi) >> 1;

        if( $ct['cum'][$mid + 1] <= $scaled ) {
          $lo = $mid + 1;
        } else {
          $hi = $mid;
        }
      }

      $idx = $lo;
      $ch  = $m['i2c'][$idx];

      if( $ch === "\x00" ) {
        $eof = true;
      } else {
        $result .= $ch;
        $high    = $low + intdiv( $range * $ct['cum'][$idx + 1], $ct['total'] ) - 1;
        $low     = $low + intdiv( $range * $ct['cum'][$idx], $ct['total'] );

        while( self::needsRenorm( $low, $high ) ) {
          if( $high < self::HALF ) {
            // low half
          } elseif( $low >= self::HALF ) {
            $value -= self::HALF;
            $low   -= self::HALF;
            $high  -= self::HALF;
          } else {
            $value -= self::QTR;
            $low   -= self::QTR;
            $high  -= self::QTR;
          }

          $low   = $low << 1;
          $high  = ($high << 1) | 1;
          $value = ($value << 1) | ($bitPos < $bitCount ? $bits[$bitPos++] : 0);
        }

        $ctx = self::charCtx( $ch );
      }
    }

    return $eof ? self::dictDecode( $result ) : false;
  }

  private static function emitBit( int $bit, int &$pending, array &$bits ): void {
    $bits[]   = $bit;
    $opposite = $bit ^ 1;

    while( $pending > 0 ) {
      $bits[] = $opposite;
      $pending--;
    }
  }

  private static function packBits( array $bits ): string {
    $out = '';

    while( count( $bits ) % 8 !== 0 ) {
      $bits[] = 0;
    }

    for( $i = 0, $n = count( $bits ); $i < $n; $i += 8 ) {
      $byte = 0;

      for( $j = 0; $j < 8; $j++ ) {
        $byte = ($byte << 1) | $bits[$i + $j];
      }

      $out .= chr( $byte );
    }

    return $out;
  }

  private static function unpackBits( string $data ): array {
    $bits = [];

    for( $i = 0, $len = strlen( $data ); $i < $len; $i++ ) {
      $byte = ord( $data[$i] );

      for( $j = 7; $j >= 0; $j-- ) {
        $bits[] = ($byte >> $j) & 1;
      }
    }

    return $bits;
  }
}