<?php
final class ArithCoder {
private const BITS = 31;
private const TOP = 0x7FFFFFFF;
private const HALF = 0x40000000;
private const QTR = 0x20000000;
private const ALPHABET =
'abcdefghijklmnopqrstuvwxyz'
. 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
. '0123456789'
. '-._~:/?#[]@!$&\'()*+,;=%'
. "\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F"
. "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F";
private const DICT_ENTRIES = [
"\x01" => '.com/', "\x02" => '.org/', "\x03" => '.net/',
"\x04" => '.edu/', "\x05" => '.gov/', "\x06" => '.co.',
"\x07" => '.html', "\x08" => '.htm', "\x09" => '.php',
"\x0A" => '.json', "\x0B" => '.xml', "\x0C" => '.jpg',
"\x0D" => '.png', "\x0E" => '.pdf', "\x0F" => '.asp',
"\x10" => 'tion', "\x11" => 'ment', "\x12" => 'news/',
"\x13" => 'wiki/', "\x14" => 'index', "\x15" => 'article',
"\x16" => 'https://www.',
"\x17" => 'https://',
"\x18" => 'http://www.',
"\x19" => 'http://',
"\x1A" => '-the-',
"\x1B" => '-and-',
"\x1C" => '-to-',
"\x1D" => '-of-',
"\x1E" => '-in-',
"\x1F" => '-for-',
];
private const BASE_FREQ = [
'a' => 80, 'b' => 28, 'c' => 42, 'd' => 35, 'e' => 105,
'f' => 20, 'g' => 25, 'h' => 35, 'i' => 70, 'j' => 5,
'k' => 15, 'l' => 58, 'm' => 38, 'n' => 65, 'o' => 68,
'p' => 32, 'q' => 4, 'r' => 60, 's' => 62, 't' => 72,
'u' => 28, 'v' => 12, 'w' => 22, 'x' => 5, 'y' => 18,
'z' => 4,
'A' => 15, 'B' => 12, 'C' => 14, 'D' => 10, 'E' => 10,
'F' => 9, 'G' => 8, 'H' => 10, 'I' => 10, 'J' => 5,
'K' => 9, 'L' => 10, 'M' => 12, 'N' => 10, 'O' => 9,
'P' => 12, 'Q' => 3, 'R' => 10, 'S' => 12, 'T' => 12,
'U' => 7, 'V' => 5, 'W' => 9, 'X' => 3, 'Y' => 5,
'Z' => 3,
'0' => 24, '1' => 26, '2' => 20, '3' => 15, '4' => 14,
'5' => 16, '6' => 14, '7' => 11, '8' => 11, '9' => 15,
'-' => 58, '.' => 42, '_' => 16, '~' => 2,
':' => 5, '/' => 78, '?' => 8, '#' => 3,
'[' => 1, ']' => 1, '@' => 2, '!' => 1,
'$' => 1, '&' => 8, "'" => 1, '(' => 2,
')' => 2, '*' => 1, '+' => 3, ',' => 2,
';' => 2, '=' => 12, '%' => 8,
"\x01" => 12, "\x02" => 6, "\x03" => 6, "\x04" => 3,
"\x05" => 3, "\x06" => 3, "\x07" => 5, "\x08" => 3,
"\x09" => 4, "\x0A" => 3, "\x0B" => 2, "\x0C" => 3,
"\x0D" => 3, "\x0E" => 3, "\x0F" => 2, "\x10" => 5,
"\x11" => 4, "\x12" => 4, "\x13" => 3, "\x14" => 3,
"\x15" => 3,
"\x16" => 30, "\x17" => 25, "\x18" => 5, "\x19" => 5,
"\x1A" => 8, "\x1B" => 6, "\x1C" => 8, "\x1D" => 7,
"\x1E" => 7, "\x1F" => 5,
];
private const EOF_FREQ = 1;
private const CTX_START = 0;
private const CTX_SLASH = 1;
private const CTX_DOT = 2;
private const CTX_DIGIT = 3;
private const CTX_HYPH = 4;
private const CTX_LOWER = 5;
private const CTX_UPPER = 6;
private const CTX_OTHER = 7;
private const CTX_COUNT = 8;
private const CLS_LOWER = 0;
private const CLS_UPPER = 1;
private const CLS_DIGIT = 2;
private const CLS_SLASH = 3;
private const CLS_DOT = 4;
private const CLS_HYPH = 5;
private const CLS_OTHER = 6;
private const CTX_CLASS_MULT = [
[180, 30, 50, 5, 20, 10, 30],
[160, 80, 140, 10, 20, 15, 40],
[ 40, 10, 60, 60, 30, 10, 20],
[ 30, 15, 400, 100, 80, 120, 40],
[200, 120, 80, 15, 10, 15, 30],
[160, 20, 40, 90, 70, 140, 40],
[160, 150, 30, 40, 30, 120, 30],
[100, 100, 100, 100, 100, 100, 100],
];
private const CTX_DOT_OVERRIDES = [
'c' => 500, 'o' => 400, 'h' => 300, 'n' => 300, 'p' => 300,
'j' => 200, 's' => 150, 't' => 150, 'g' => 150, 'e' => 150,
'x' => 120, 'a' => 120, 'i' => 120,
];
private const CTX_START_OVERRIDES = [
"\x16" => 3000,
"\x17" => 2000,
"\x18" => 500,
"\x19" => 500,
];
private const DICT_NEXT_CTX = [
"\x01" => self::CTX_SLASH, "\x02" => self::CTX_SLASH,
"\x03" => self::CTX_SLASH, "\x04" => self::CTX_SLASH,
"\x05" => self::CTX_SLASH, "\x06" => self::CTX_DOT,
"\x07" => self::CTX_LOWER, "\x08" => self::CTX_LOWER,
"\x09" => self::CTX_LOWER, "\x0A" => self::CTX_LOWER,
"\x0B" => self::CTX_LOWER, "\x0C" => self::CTX_LOWER,
"\x0D" => self::CTX_LOWER, "\x0E" => self::CTX_LOWER,
"\x0F" => self::CTX_LOWER, "\x10" => self::CTX_LOWER,
"\x11" => self::CTX_LOWER, "\x12" => self::CTX_SLASH,
"\x13" => self::CTX_SLASH, "\x14" => self::CTX_LOWER,
"\x15" => self::CTX_LOWER,
"\x16" => self::CTX_DOT, "\x17" => self::CTX_SLASH,
"\x18" => self::CTX_DOT, "\x19" => self::CTX_SLASH,
"\x1A" => self::CTX_HYPH, "\x1B" => self::CTX_HYPH,
"\x1C" => self::CTX_HYPH, "\x1D" => self::CTX_HYPH,
"\x1E" => self::CTX_HYPH, "\x1F" => self::CTX_HYPH,
];
private static array $model = [];
private static array $dictEnc = [];
private static array $dictDec = [];
private static bool $init = false;
private static function charClass( string $ch ): int {
return match( true ) {
$ch >= 'a' && $ch <= 'z' => self::CLS_LOWER,
$ch >= 'A' && $ch <= 'Z' => self::CLS_UPPER,
$ch >= '0' && $ch <= '9' => self::CLS_DIGIT,
$ch === '/' => self::CLS_SLASH,
$ch === '.' => self::CLS_DOT,
$ch === '-' || $ch === '_' => self::CLS_HYPH,
default => self::CLS_OTHER,
};
}
private static function charCtx( string $ch ): int {
return isset( self::DICT_NEXT_CTX[$ch] )
? self::DICT_NEXT_CTX[$ch]
: match( self::charClass( $ch ) ) {
self::CLS_LOWER => self::CTX_LOWER,
self::CLS_UPPER => self::CTX_UPPER,
self::CLS_DIGIT => self::CTX_DIGIT,
self::CLS_SLASH => self::CTX_SLASH,
self::CLS_DOT => self::CTX_DOT,
self::CLS_HYPH => self::CTX_HYPH,
default => self::CTX_OTHER,
};
}
private static function initModel(): void {
if( !self::$init ) {
$chars = str_split( self::ALPHABET );
$chars[] = "\x00";
$symCount = count( $chars );
$c2i = [];
$i2c = [];
foreach( $chars as $i => $ch ) {
$c2i[$ch] = $i;
$i2c[$i] = $ch;
}
$contexts = [];
for( $ctx = 0; $ctx < self::CTX_COUNT; $ctx++ ) {
$cum = [0];
$running = 0;
foreach( $chars as $ch ) {
if( $ch === "\x00" ) {
$w = self::EOF_FREQ;
} else {
$base = self::BASE_FREQ[$ch] ?? 1;
$cls = self::charClass( $ch );
$mult = self::CTX_CLASS_MULT[$ctx][$cls];
if( $ctx === self::CTX_DOT
&& isset( self::CTX_DOT_OVERRIDES[$ch] ) ) {
$mult = self::CTX_DOT_OVERRIDES[$ch];
}
if( $ctx === self::CTX_START
&& isset( self::CTX_START_OVERRIDES[$ch] ) ) {
$mult = self::CTX_START_OVERRIDES[$ch];
}
$w = max( 1, intdiv( $base * $mult, 100 ) );
}
$running += $w;
$cum[] = $running;
}
$contexts[$ctx] = ['cum' => $cum, 'total' => $running];
}
$enc = [];
$dec = [];
foreach( self::DICT_ENTRIES as $token => $substr ) {
$enc[$substr] = $token;
$dec[$token] = $substr;
}
$sortedEnc = [];
foreach( $enc as $substr => $token ) {
$sortedEnc[] = [$substr, $token];
}
usort( $sortedEnc, function( $a, $b ) {
return strlen( $b[0] ) - strlen( $a[0] );
});
self::$model = [
'contexts' => $contexts,
'c2i' => $c2i,
'i2c' => $i2c,
'size' => $symCount,
];
self::$dictEnc = $sortedEnc;
self::$dictDec = $dec;
self::$init = true;
}
}
private static function dictEncode( string $s ): string {
foreach( self::$dictEnc as [$substr, $token] ) {
$s = str_replace( $substr, $token, $s );
}
return $s;
}
private static function dictDecode( string $s ): string {
foreach( self::$dictDec as $token => $substr ) {
$s = str_replace( $token, $substr, $s );
}
return $s;
}
private static function needsRenorm( int $low, int $high ): bool {
return $high < self::HALF
|| $low >= self::HALF
|| ($low >= self::QTR && $high < 3 * self::QTR);
}
public static function encode( string $input ): string {
self::initModel();
$m = self::$model;
$input = self::dictEncode( $input );
$low = 0;
$high = self::TOP;
$pending = 0;
$bits = [];
$ctx = self::CTX_START;
$len = strlen( $input );
for( $pos = 0; $pos <= $len; $pos++ ) {
$ch = $pos < $len ? $input[$pos] : "\x00";
$idx = $m['c2i'][$ch] ?? -1;
if( $idx !== -1 ) {
$ct = $m['contexts'][$ctx];
$range = $high - $low + 1;
$high = $low + intdiv( $range * $ct['cum'][$idx + 1], $ct['total'] ) - 1;
$low = $low + intdiv( $range * $ct['cum'][$idx], $ct['total'] );
while( self::needsRenorm( $low, $high ) ) {
if( $high < self::HALF ) {
self::emitBit( 0, $pending, $bits );
} elseif( $low >= self::HALF ) {
self::emitBit( 1, $pending, $bits );
$low -= self::HALF;
$high -= self::HALF;
} else {
$pending++;
$low -= self::QTR;
$high -= self::QTR;
}
$low = $low << 1;
$high = ($high << 1) | 1;
}
if( $ch !== "\x00" ) {
$ctx = self::charCtx( $ch );
}
}
}
$pending++;
self::emitBit( $low < self::QTR ? 0 : 1, $pending, $bits );
return self::packBits( $bits );
}
public static function decode( string $data ): string|false {
self::initModel();
$m = self::$model;
$bits = self::unpackBits( $data );
$bitCount = count( $bits );
$bitPos = 0;
$value = 0;
for( $i = 0; $i < self::BITS; $i++ ) {
$value = ($value << 1) | ($bitPos < $bitCount ? $bits[$bitPos++] : 0);
}
$low = 0;
$high = self::TOP;
$result = '';
$eof = false;
$ctx = self::CTX_START;
while( !$eof && strlen( $result ) <= MAX_URL_LENGTH ) {
$ct = $m['contexts'][$ctx];
$range = $high - $low + 1;
$scaled = intdiv( ($value - $low + 1) * $ct['total'] - 1, $range );
$lo = 0;
$hi = $m['size'] - 1;
while( $lo < $hi ) {
$mid = ($lo + $hi) >> 1;
if( $ct['cum'][$mid + 1] <= $scaled ) {
$lo = $mid + 1;
} else {
$hi = $mid;
}
}
$idx = $lo;
$ch = $m['i2c'][$idx];
if( $ch === "\x00" ) {
$eof = true;
} else {
$result .= $ch;
$high = $low + intdiv( $range * $ct['cum'][$idx + 1], $ct['total'] ) - 1;
$low = $low + intdiv( $range * $ct['cum'][$idx], $ct['total'] );
while( self::needsRenorm( $low, $high ) ) {
if( $high < self::HALF ) {
} elseif( $low >= self::HALF ) {
$value -= self::HALF;
$low -= self::HALF;
$high -= self::HALF;
} else {
$value -= self::QTR;
$low -= self::QTR;
$high -= self::QTR;
}
$low = $low << 1;
$high = ($high << 1) | 1;
$value = ($value << 1) | ($bitPos < $bitCount ? $bits[$bitPos++] : 0);
}
$ctx = self::charCtx( $ch );
}
}
return $eof ? self::dictDecode( $result ) : false;
}
private static function emitBit( int $bit, int &$pending, array &$bits ): void {
$bits[] = $bit;
$opposite = $bit ^ 1;
while( $pending > 0 ) {
$bits[] = $opposite;
$pending--;
}
}
private static function packBits( array $bits ): string {
$out = '';
while( count( $bits ) % 8 !== 0 ) {
$bits[] = 0;
}
for( $i = 0, $n = count( $bits ); $i < $n; $i += 8 ) {
$byte = 0;
for( $j = 0; $j < 8; $j++ ) {
$byte = ($byte << 1) | $bits[$i + $j];
}
$out .= chr( $byte );
}
return $out;
}
private static function unpackBits( string $data ): array {
$bits = [];
for( $i = 0, $len = strlen( $data ); $i < $len; $i++ ) {
$byte = ord( $data[$i] );
for( $j = 7; $j >= 0; $j-- ) {
$bits[] = ($byte >> $j) & 1;
}
}
return $bits;
}
}