Dave Jarvis' Repositories

git clone https://repo.autonoma.ca/repo/treetrek.git

Uses finfo to detect media type

Author: Dave Jarvis <email>
Date: 2026-02-11 21:37:42 GMT-0800
Commit: 8353b2678ebbdee8c8c2bd929ff6f461695cd610
Parent: 00f6043
Delta: 44 lines added, 197 lines removed, 153-line decrease
MediaTypeSniffer.php
<?php
class MediaTypeSniffer {
- private const BUFFER = 12;
- private const ANY = -1;
-
- public const CAT_IMAGE = 'image';
- public const CAT_VIDEO = 'video';
- public const CAT_AUDIO = 'audio';
- public const CAT_TEXT = 'text';
+ public const CAT_IMAGE = 'image';
+ public const CAT_VIDEO = 'video';
+ public const CAT_AUDIO = 'audio';
+ public const CAT_TEXT = 'text';
public const CAT_ARCHIVE = 'archive';
- public const CAT_APP = 'application';
- public const CAT_BINARY = 'binary';
-
- private const FORMATS = [
- [self::CAT_IMAGE, [0x3C, 0x73, 0x76, 0x67, 0x20], 'image/svg+xml'],
- [self::CAT_IMAGE, [0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A], 'image/png'],
- [self::CAT_IMAGE, [0xFF, 0xD8, 0xFF, 0xE0], 'image/jpeg'],
- [self::CAT_IMAGE, [0xFF, 0xD8, 0xFF, 0xEE], 'image/jpeg'],
- [self::CAT_IMAGE, [0xFF, 0xD8, 0xFF, 0xE1, self::ANY, self::ANY, 0x45, 0x78, 0x69, 0x66, 0x00], 'image/jpeg'],
- [self::CAT_IMAGE, [0x47, 0x49, 0x46, 0x38], 'image/gif'],
- [self::CAT_IMAGE, [0x42, 0x4D], 'image/bmp'],
- [self::CAT_IMAGE, [0x49, 0x49, 0x2A, 0x00], 'image/tiff'],
- [self::CAT_IMAGE, [0x4D, 0x4D, 0x00, 0x2A], 'image/tiff'],
- [self::CAT_IMAGE, [0x52, 0x49, 0x46, 0x46, self::ANY, self::ANY, self::ANY, self::ANY, 0x57, 0x45, 0x42, 0x50], 'image/webp'],
- [self::CAT_IMAGE, [0x38, 0x42, 0x50, 0x53, 0x00, 0x01], 'image/vnd.adobe.photoshop'],
- [self::CAT_IMAGE, [0x23, 0x64, 0x65, 0x66], 'image/x-xbitmap'],
- [self::CAT_IMAGE, [0x21, 0x20, 0x58, 0x50, 0x4D, 0x32], 'image/x-xpixmap'],
- [self::CAT_VIDEO, [0x8A, 0x4D, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A], 'video/x-mng'],
- [self::CAT_VIDEO, [0x52, 0x49, 0x46, 0x46, self::ANY, self::ANY, self::ANY, self::ANY, 0x41, 0x56, 0x49, 0x20], 'video/x-msvideo'],
- [self::CAT_VIDEO, [self::ANY, self::ANY, self::ANY, self::ANY, 0x66, 0x74, 0x79, 0x70], 'video/mp4'],
- [self::CAT_VIDEO, [0x1A, 0x45, 0xDF, 0xA3], 'video/x-matroska'],
- [self::CAT_VIDEO, [0x00, 0x00, 0x01, 0xBA], 'video/mpeg'],
- [self::CAT_VIDEO, [0x46, 0x4C, 0x56, 0x01], 'video/x-flv'],
- [self::CAT_TEXT, [0x3C, 0x21], 'text/html'],
- [self::CAT_TEXT, [0x3C, 0x68, 0x74, 0x6D, 0x6C], 'text/html'],
- [self::CAT_TEXT, [0x3C, 0x68, 0x65, 0x61, 0x64], 'text/html'],
- [self::CAT_TEXT, [0x3C, 0x62, 0x6F, 0x64, 0x79], 'text/html'],
- [self::CAT_TEXT, [0x3C, 0x3F, 0x78, 0x6D, 0x6C, 0x20], 'text/xml'],
- [self::CAT_TEXT, [0x25, 0x50, 0x44, 0x46, 0x2D], 'application/pdf'],
- [self::CAT_TEXT, [0xEF, 0xBB, 0xBF], 'text/plain'],
- [self::CAT_TEXT, [0xFE, 0xFF], 'text/plain'],
- [self::CAT_TEXT, [0xFF, 0xFE], 'text/plain'],
- [self::CAT_TEXT, [0x00, 0x00, 0xFE, 0xFF], 'text/plain'],
- [self::CAT_TEXT, [0xFF, 0xFE, 0x00, 0x00], 'text/plain'],
- [self::CAT_AUDIO, [0xFF, 0xFB, self::ANY], 'audio/mpeg'],
- [self::CAT_AUDIO, [0x49, 0x44, 0x33], 'audio/mpeg'],
- [self::CAT_AUDIO, [0x52, 0x49, 0x46, 0x46, self::ANY, self::ANY, self::ANY, self::ANY, 0x57, 0x41, 0x56, 0x45], 'audio/wav'],
- [self::CAT_AUDIO, [0x4F, 0x67, 0x67, 0x53], 'audio/ogg'],
- [self::CAT_ARCHIVE, [0x50, 0x4B, 0x03, 0x04], 'application/zip'],
- [self::CAT_ARCHIVE, [0x1F, 0x8B, 0x08], 'application/gzip'],
- [self::CAT_APP, [0x7F, 0x45, 0x4C, 0x46], 'application/x-elf']
- ];
-
- private const EXTENSION_MAP = [
- // Documentation / markup
- 'md' => [self::CAT_TEXT, 'text/markdown'],
- 'rmd' => [self::CAT_TEXT, 'text/r-markdown'],
- 'txt' => [self::CAT_TEXT, 'text/plain'],
- 'tex' => [self::CAT_TEXT, 'application/x-tex'],
- 'lyx' => [self::CAT_TEXT, 'application/x-lyx'],
- 'rst' => [self::CAT_TEXT, 'text/x-rst'],
- 'asciidoc' => [self::CAT_TEXT, 'text/asciidoc'],
- 'adoc' => [self::CAT_TEXT, 'text/asciidoc'],
- 'org' => [self::CAT_TEXT, 'text/org'],
- 'latex' => [self::CAT_TEXT, 'application/x-tex'],
- 'csv' => [self::CAT_TEXT, 'text/csv'],
- 'tsv' => [self::CAT_TEXT, 'text/tab-separated-values'],
- 'psv' => [self::CAT_TEXT, 'text/plain'],
-
- 'json' => [self::CAT_TEXT, 'application/json'],
- 'xml' => [self::CAT_TEXT, 'application/xml'],
- 'gitignore' => [self::CAT_TEXT, 'text/plain'],
- 'ts' => [self::CAT_TEXT, 'application/typescript'],
- 'log' => [self::CAT_TEXT, 'text/plain'],
- 'ndjson' => [self::CAT_TEXT, 'application/x-ndjson'],
- 'conf' => [self::CAT_TEXT, 'text/plain'],
- 'ini' => [self::CAT_TEXT, 'text/plain'],
- 'yaml' => [self::CAT_TEXT, 'text/yaml'],
- 'yml' => [self::CAT_TEXT, 'text/yaml'],
- 'toml' => [self::CAT_TEXT, 'application/toml'],
- 'env' => [self::CAT_TEXT, 'text/plain'],
- 'cfg' => [self::CAT_TEXT, 'text/plain'],
- 'properties'=> [self::CAT_TEXT, 'text/plain'],
- 'dotenv' => [self::CAT_TEXT, 'text/plain'],
-
- // Programming languages
- 'gradle' => [self::CAT_TEXT, 'text/plain'],
- 'php' => [self::CAT_TEXT, 'application/x-php'],
- 'sql' => [self::CAT_TEXT, 'application/sql'],
- 'html' => [self::CAT_TEXT, 'text/html'],
- 'xhtml' => [self::CAT_TEXT, 'text/xhtml'],
- 'css' => [self::CAT_TEXT, 'text/css'],
- 'js' => [self::CAT_TEXT, 'application/javascript'],
- 'py' => [self::CAT_TEXT, 'text/x-python'],
- 'rb' => [self::CAT_TEXT, 'text/x-ruby'],
- 'java' => [self::CAT_TEXT, 'text/x-java-source'],
- 'c' => [self::CAT_TEXT, 'text/x-csrc'],
- 'cpp' => [self::CAT_TEXT, 'text/x-c++src'],
- 'h' => [self::CAT_TEXT, 'text/x-chdr'],
- 'cs' => [self::CAT_TEXT, 'text/x-csharp'],
- 'go' => [self::CAT_TEXT, 'text/x-go'],
- 'rs' => [self::CAT_TEXT, 'text/x-rust'],
- 'swift' => [self::CAT_TEXT, 'text/x-swift'],
- 'kt' => [self::CAT_TEXT, 'text/x-kotlin'],
- 'kts' => [self::CAT_TEXT, 'text/x-kotlin'],
- 'scala' => [self::CAT_TEXT, 'text/x-scala'],
- 'dart' => [self::CAT_TEXT, 'text/x-dart'],
- 'lua' => [self::CAT_TEXT, 'text/x-lua'],
- 'pl' => [self::CAT_TEXT, 'text/x-perl'],
- 'pm' => [self::CAT_TEXT, 'text/x-perl'],
- 'r' => [self::CAT_TEXT, 'text/x-r'],
- 'm' => [self::CAT_TEXT, 'text/x-matlab'],
- 'jl' => [self::CAT_TEXT, 'text/x-julia'],
+ public const CAT_BINARY = 'binary';
- // Shell / scripting
- 'sh' => [self::CAT_TEXT, 'application/x-sh'],
- 'bash' => [self::CAT_TEXT, 'application/x-sh'],
- 'zsh' => [self::CAT_TEXT, 'application/x-sh'],
- 'fish' => [self::CAT_TEXT, 'text/plain'],
- 'bat' => [self::CAT_TEXT, 'application/x-msdos-program'],
- 'ps1' => [self::CAT_TEXT, 'application/x-powershell']
+ private const ARCHIVE_EXTENSIONS = [
+ 'zip', 'tar', 'gz', '7z', 'rar', 'jar', 'lha', 'bz', 'tgz', 'cab',
+ 'iso', 'dmg', 'xz', 'z', 'ar', 'war', 'ear', 'pak', 'hqx', 'arj',
+ 'zoo', 'rpm', 'deb', 'apk'
];
-
- private static function getTypeInfo( string $data, string $filePath ): array {
- $info = [];
- $ext = strtolower( pathinfo( $filePath, PATHINFO_EXTENSION ) );
-
- if( $ext === 'svg' ){
- $info = [self::CAT_IMAGE, 'image/svg+xml'];
- }
-
- if( empty( $info ) ){
- $info = self::sniff( $data );
- }
-
- if( empty( $info ) && !empty( $filePath ) ){
- $info = self::getInfoByExtension( $filePath );
- }
-
- if( empty( $info ) ){
- $info = [self::CAT_BINARY, 'application/octet-stream'];
- }
-
- return $info;
- }
-
- private static function sniff( string $data ): array {
- $found = [];
- $dataLength = strlen( $data );
- $maxScan = min( $dataLength, self::BUFFER );
- $sourceBytes = [];
-
- for( $i = 0; $i < $maxScan; $i++ ){
- $sourceBytes[$i] = ord( $data[$i] ) & 0xFF;
- }
-
- foreach( self::FORMATS as [$category, $pattern, $type] ){
- $patternLength = count( $pattern );
-
- if( $patternLength > $dataLength ){
- continue;
- }
-
- $matches = true;
-
- for( $i = 0; $i < $patternLength; $i++ ){
- if( $pattern[$i] !== self::ANY && $pattern[$i] !== $sourceBytes[$i] ){
- $matches = false;
- break;
- }
- }
-
- if( $matches ){
- $found = [$category, $type];
- break;
- }
- }
-
- return $found;
- }
-
- private static function getInfoByExtension( string $filePath ): array {
- $ext = strtolower( pathinfo( $filePath, PATHINFO_EXTENSION ) );
- $info = self::EXTENSION_MAP[$ext] ?? [self::CAT_BINARY, 'application/octet-stream'];
- return $info;
+ public static function isMediaType(
+ string $buffer,
+ string $filename = ''
+ ): string {
+ $finfo = new finfo( FILEINFO_MIME_TYPE );
+ $mediaType = $finfo->buffer( $buffer );
+ return $mediaType ?: 'application/octet-stream';
}
- public static function isMediaType( string $data, string $filePath = '' ): string {
- $info = self::getTypeInfo( $data, $filePath );
+ public static function isCategory(
+ string $buffer,
+ string $filename = ''
+ ): string {
+ $mediaType = self::isMediaType( $buffer, $filename );
+ $parts = explode( '/', $mediaType );
- return $info[1];
+ return match( true ) {
+ $parts[0] === 'image' => self::CAT_IMAGE,
+ $parts[0] === 'video' => self::CAT_VIDEO,
+ $parts[0] === 'audio' => self::CAT_AUDIO,
+ $parts[0] === 'text' => self::CAT_TEXT,
+ self::isArchive( $filename ) => self::CAT_ARCHIVE,
+ str_contains( $mediaType, 'compressed' ) => self::CAT_ARCHIVE,
+ default => self::CAT_BINARY,
+ };
}
-
- public static function isCategory( string $data, string $filePath = '' ): string {
- $info = self::getTypeInfo( $data, $filePath );
- return $info[0];
+ public static function isBinary(
+ string $buffer,
+ string $filename = ''
+ ): bool {
+ return !str_starts_with(
+ self::isMediaType( $buffer, $filename ),
+ 'text/'
+ );
}
-
- public static function isBinary( string $data, string $filePath = '' ): bool {
- $info = self::getTypeInfo( $data, $filePath );
- $category = $info[0];
- $type = $info[1];
- return !(
- $category === self::CAT_TEXT ||
- str_starts_with( $type, 'text/' ) ||
- $type === 'image/svg+xml'
+ private static function isArchive( string $filename ): bool {
+ return in_array(
+ strtolower( pathinfo( $filename, PATHINFO_EXTENSION ) ),
+ self::ARCHIVE_EXTENSIONS,
+ true
);
}
}
-?>