<?php
/**
 * @copyright Copyright (c) 2021, ownCloud GmbH
 * @license AGPL-3.0
 *
 * This code is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License, version 3,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License, version 3,
 * along with this program. If not, see <http://www.gnu.org/licenses/>
 *
 */

namespace OC;

/**
 * Walks a UTF-8 encoded stream (or string) character by character and feeds each
 * character to a set of "processors" that collect statistics: characters per script,
 * per-character details, and line contents.
 */
class Utf8Analyzer {
    /**
     * https://en.wikipedia.org/wiki/UTF-8
     * Ranges to determine how many bytes we need to read based on the first byte read.
     */
    private $utf8Ranges = [
        "1b" => [0x00, 0x7f],
        "2b" => [0xc0, 0xdf],
        "3b" => [0xe0, 0xef],
        "4b" => [0xf0, 0xf7],
        // ranges [0x80, 0xbf] and [0xf8, 0xff] aren't defined
    ];

    /**
     * https://en.wikipedia.org/wiki/Unicode_block
     * Unicode block ranges
     * Keep the list sorted to be able to do a binary search
     * NOTE: The list is incomplete and might be inaccurate with some symbols
     */
    private $unicodeRanges = [
        ['range' => [0x0000, 0x0040], 'script' => 'Common'],
        ['range' => [0x0041, 0x005a], 'script' => 'Latin'],
        ['range' => [0x005b, 0x0060], 'script' => 'Common'],
        ['range' => [0x0061, 0x007a], 'script' => 'Latin'],
        ['range' => [0x007b, 0x00bf], 'script' => 'Common'],
        ['range' => [0x00c0, 0x00d6], 'script' => 'Latin'],
        ['range' => [0x00d7, 0x00d7], 'script' => 'Common'],
        ['range' => [0x00d8, 0x00f6], 'script' => 'Latin'],
        ['range' => [0x00f7, 0x00f7], 'script' => 'Common'],
        ['range' => [0x00f8, 0x024f], 'script' => 'Latin'],
        ['range' => [0x0370, 0x03e1], 'script' => 'Greek'],
        ['range' => [0x03f0, 0x03ff], 'script' => 'Greek'],
        ['range' => [0x0590, 0x05ff], 'script' => 'Hebrew'],
        ['range' => [0x0600, 0x06ff], 'script' => 'Arabic'],
        ['range' => [0x0750, 0x077f], 'script' => 'Arabic'],
        ['range' => [0x0870, 0x08ff], 'script' => 'Arabic'],
        ['range' => [0x0900, 0x097f], 'script' => 'Devanagari'],
        ['range' => [0x1100, 0x11ff], 'script' => 'Hangul'],
        ['range' => [0x1cd0, 0x1cff], 'script' => 'Common'],
        ['range' => [0x1e00, 0x1eff], 'script' => 'Latin'],
        ['range' => [0x1f00, 0x1fff], 'script' => 'Greek'],
        ['range' => [0x2000, 0x2070], 'script' => 'Common'],
        ['range' => [0x2071, 0x2071], 'script' => 'Latin'],
        ['range' => [0x2074, 0x207e], 'script' => 'Common'],
        ['range' => [0x207f, 0x207f], 'script' => 'Latin'],
        ['range' => [0x2080, 0x208e], 'script' => 'Common'],
        ['range' => [0x2090, 0x209c], 'script' => 'Latin'],
        ['range' => [0x2150, 0x215f], 'script' => 'Common'],
        ['range' => [0x2160, 0x2188], 'script' => 'Latin'],
        ['range' => [0x2189, 0x218b], 'script' => 'Common'],
        ['range' => [0x2190, 0x27ff], 'script' => 'Common'],
        ['range' => [0x2900, 0x2bff], 'script' => 'Common'],
        ['range' => [0x2c60, 0x2c7f], 'script' => 'Latin'],
        ['range' => [0x2e80, 0x2fdf], 'script' => 'Han'],
        ['range' => [0x3040, 0x309f], 'script' => 'Hiragana'],
        ['range' => [0x30a0, 0x30ff], 'script' => 'Katakana'],
        ['range' => [0x3130, 0x318f], 'script' => 'Hangul'],
        ['range' => [0x31f0, 0x31ff], 'script' => 'Katakana'],
        ['range' => [0x3200, 0x321e], 'script' => 'Hangul'],
        ['range' => [0x3260, 0x327f], 'script' => 'Hangul'],
        ['range' => [0x32d0, 0x3357], 'script' => 'Katakana'],
        ['range' => [0x3400, 0x4dbf], 'script' => 'Han'],
        ['range' => [0x4e00, 0x9fff], 'script' => 'Han'],
        ['range' => [0xa720, 0xa7ff], 'script' => 'Latin'],
        ['range' => [0xa8e0, 0xa8ff], 'script' => 'Devanagari'],
        ['range' => [0xa960, 0xa97f], 'script' => 'Hangul'],
        ['range' => [0xab30, 0xab6f], 'script' => 'Latin'],
        ['range' => [0xac00, 0xd7ff], 'script' => 'Hangul'],
        ['range' => [0xf900, 0xfaff], 'script' => 'Han'],
        ['range' => [0xfb1d, 0xfb4f], 'script' => 'Hebrew'],  // some unicode chars aren't assigned
        ['range' => [0xfb50, 0xfdff], 'script' => 'Arabic'],
        ['range' => [0xfe70, 0xfefc], 'script' => 'Arabic'],
    ];

    /**
     * Last line-break char ("\n" or "\r") seen by processLines that may still be
     * completed into a two-char line break ("\r\n" or "\n\r") by the next char.
     * It is null when the previous char wasn't a line break or when the previous char
     * already completed a pair. It is reset at the start of every analyzeStream call so
     * no state leaks between analyses.
     *
     * @var string|null
     */
    private $pendingLineBreak = null;

    /**
     * Analyze the stream to get statistics. The stream is assumed to be encoded in utf8,
     * otherwise, the function will return garbage.
     * The function will return data based on the "processors" requested. Currently, these are
     * the available processors:
     * - "count": to return the number of chars per script ("Latin", "Han", "Hangul", etc) found
     * in the stream
     * - "details": to return a list of detailed info per unicode char. The info includes
     * the byte range for the unicode char, the unicode char, the associated unicode code point
     * both as int and as hex string, and the detected script. This is mostly for debugging
     * purposes.
     * - "lines": to return line information found in the stream. It will return the number of
     * lines (it might be slightly inaccurate, the end of file could be counted as new line and
     * count an additional line), and a list containing the chars per line.
     *
     * Since the current script list is incomplete, characters that don't belong to any known
     * group will be grouped under the "_unknown" script. Byte sequences that can't be decoded
     * as utf8 are also grouped under "_unknown". Common punctuation symbols are grouped
     * under the "Common" script
     *
     * A possible result could be (assuming all the "processors" are activated)
     * {
     *   "count": {
     *     "Arabic": 3,
     *     "Common": 1
     *   },
     *   "details": [
     *     {
     *       "range": "0-1",
     *       "str": "ك",
     *       "unicode": 1603,
     *       "unicodeHex": "643",
     *       "script": "Arabic"
     *     },
     *     {
     *       "range": "2-3",
     *       "str": "ن",
     *       "unicode": 1606,
     *       "unicodeHex": "646",
     *       "script": "Arabic"
     *     },
     *     {
     *       "range": "4-4",
     *       "str": " ",
     *       "unicode": 32,
     *       "unicodeHex": "20",
     *       "script": "Common"
     *     },
     *     {
     *       "range": "5-6",
     *       "str": "و",
     *       "unicode": 1608,
     *       "unicodeHex": "648",
     *       "script": "Arabic"
     *     }
     *   ],
     *   "lines": {
     *     "linesNumber": 1,
     *     "lines": [
     *       [
     *         "ك",
     *         "ن",
     *         " ",
     *         "و"
     *       ]
     *     ]
     *   }
     * }
     *
     * Each processor will show its information under its own key (matching the processor's name)
     *
     * The analysis starts from the current stream pointer position, wherever it is,
     * until "maxBytes" have been read (PHP_INT_MAX by default) or the end of the stream.
     * Note that this function won't open nor close the stream, and won't rewind the stream
     * pointer position
     *
     * @param resource $stream the opened stream to be analyzed
     * @param array $processors a list containing the processor names to be used.
     * Known names are "count", "details" and "lines". Processors that aren't in the list
     * won't be used and won't appear in the result. Unknown names are ignored and
     * repeated names are used only once.
     * @param int $maxBytes the maximum number of bytes to read. Some additional bytes might
     * be read to fit a complete utf8 character
     * @return array a map as described above. Note that a processor that hasn't been activated
     * won't be part of the result. If no processor has been activated, this function will just
     * traverse the stream without showing any result, although some internal processing will
     * be performed anyway.
     */
    public function analyzeStream($stream, array $processors = [], int $maxBytes = PHP_INT_MAX) {
        $processorActions = [
            'count' => 'processCountChars',
            'details' => 'processDetails',
            'lines' => 'processLines',
        ];

        // Keep only the known processors, once each: an unknown name has no method to
        // dispatch to, and a repeated name would process every char twice.
        $map = [];
        $activeProcessors = [];
        foreach ($processors as $name) {
            if (!\is_string($name) || !isset($processorActions[$name]) || isset($map[$name])) {
                continue;
            }
            $map[$name] = [];
            $activeProcessors[] = $name;
        }

        // each analysis starts with a clean line-break state
        $this->pendingLineBreak = null;

        $byteCount = 0;
        while ($byteCount < $maxBytes && !\feof($stream)) {
            $lowerBytePos = $byteCount;
            $mbRead = $this->readMbChar($stream);
            if ($mbRead === false) {
                break;
            }

            [$str, $bytesRead] = $mbRead;
            $byteCount += $bytesRead;
            $upperBytePos = $byteCount - 1;

            // The stream is assumed to be utf8, so don't rely on the configured internal
            // encoding. Undecodable sequences have no code point and no known script.
            $unicodePoint = \mb_ord($str, 'UTF-8');
            if ($unicodePoint === false) {
                $unicodePoint = null;
                $index = null;
            } else {
                $index = $this->searchInUnicode($unicodePoint);
            }

            $params = [
                'range' => [$lowerBytePos, $upperBytePos],
                'str' => $str,
                'unicodeRangePos' => $index,
                'unicodePoint' => $unicodePoint,
            ];

            foreach ($activeProcessors as $processor) {
                $actionMethod = $processorActions[$processor];
                $this->$actionMethod($params, $map[$processor]);
            }
        }
        return $map;
    }

    /**
     * This is mainly a wrapper around the analyzeStream method in order to work easier with
     * a string.
     * @see analyzeStream
     * @param string $data the string to be analyzed. The whole string will be checked
     * @param array $processors a list containing the processor names to be used.
     * Known names are "count", "details" and "lines". Processors that aren't in the list
     * won't be used and won't appear in the result
     * @return array a map as described above (see analyzeStream).
     */
    public function analyzeString(string $data, array $processors = []) {
        $stream = \fopen('php://memory', 'r+');
        try {
            \fwrite($stream, $data);
            \rewind($stream);
            return $this->analyzeStream($stream, $processors);
        } finally {
            // release the temporary stream even if the analysis throws
            \fclose($stream);
        }
    }

    /**
     * Read a multibyte char from the stream. The stream is assumed to be utf8-encoded
     * The function returns an array with the first element being the multibyte char and the
     * second element the number of bytes actually read from the stream. [$str, $bytesRead]
     * If the stream ends in the middle of a multibyte sequence, the partial sequence is
     * returned along with its real length.
     * It will return false if there is no char to be read
     *
     * @param resource $stream the opened stream to read from
     * @return array|false
     */
    private function readMbChar($stream) {
        $byte = \fread($stream, 1);
        if ($byte === false || $byte === '') {
            return false;
        }

        $byteInt = \ord($byte);
        if ($this->inRange($byteInt, '4b')) {
            $remaining = 3;
        } elseif ($this->inRange($byteInt, '3b')) {
            $remaining = 2;
        } elseif ($this->inRange($byteInt, '2b')) {
            $remaining = 1;
        } else {
            // either not in a valid range (something broke) or in "1b" range.
            // in any case, use 1 byte
            $remaining = 0;
        }

        $str = $byte;
        // fread may return fewer bytes than requested, so keep reading until the
        // sequence is complete or nothing else can be read
        while ($remaining > 0) {
            $chunk = \fread($stream, $remaining);
            if ($chunk === false || $chunk === '') {
                break;
            }
            $str .= $chunk;
            $remaining -= \strlen($chunk);
        }

        return [$str, \strlen($str)];
    }

    /**
     * Check if the "byteInt" is in a range defined in the utf8Ranges attr.
     *
     * @param int $byteInt the byte value to check
     * @param string $range the key of the range inside utf8Ranges ("1b", "2b", "3b" or "4b")
     * @return bool true if the byte is inside the range (both limits included)
     */
    private function inRange($byteInt, $range) {
        [$lowerLimit, $upperLimit] = $this->utf8Ranges[$range];
        return $lowerLimit <= $byteInt && $byteInt <= $upperLimit;
    }

    /**
     * Search the unicodePoint in the list of unicodeRanges. It uses a binary search approach
     * so the list in the unicodeRanges attr must be sorted.
     *
     * @param int $unicodePoint the unicode code point to look for
     * @return int|null the index inside unicodeRanges of the range containing the code point,
     * or null if no range contains it
     */
    private function searchInUnicode($unicodePoint) {
        $left = 0;
        $right = \count($this->unicodeRanges) - 1;

        while ($left <= $right) {
            $midpoint = \intdiv($left + $right, 2);
            [$rangeStart, $rangeEnd] = $this->unicodeRanges[$midpoint]['range'];

            if ($unicodePoint < $rangeStart) {
                $right = $midpoint - 1;
            } elseif ($unicodePoint > $rangeEnd) {
                $left = $midpoint + 1;
            } else {
                return $midpoint;
            }
        }
        return null;
    }

    /**
     * Return a map containing the scripts found and the number of chars per script, such as
     * ["Han" => 57, "Katakana" => 6, "Common" => 34]
     * @param array $params a map with information about the character to be processed:
     * - "range" -> the byte range used by the char, as 2 integers [$lowerRange, $upperRange]
     * - "str" -> the string representing the multibyte char
     * - "unicodeRangePos" -> the index inside the unicodeRanges array where the char is placed,
     * or null if the char doesn't belong to any known range
     * - "unicodePoint" -> the unicode code point of the char, as integer, or null if the
     * bytes couldn't be decoded
     * @param array $data an array to place the result. The same array will be reused in
     * multiple calls, until the stream is processed.
     */
    private function processCountChars(array $params, array &$data) {
        $mapIndex = $this->scriptFor($params['unicodeRangePos']);
        if (!isset($data[$mapIndex])) {
            $data[$mapIndex] = 0;
        }
        $data[$mapIndex] += 1;
    }

    /**
     * Provide a list with information per char. See "processCountChars" for details on the
     * parameters
     * The list will be something like:
     * [
     *   ["range" => "0-1", "str" => "ن", "unicode" => 1606, "unicodeHex" => "646", "script" => "Arabic"],
     *   ["range" => "2-2", "str" => " ", "unicode" => 32, "unicodeHex" => "20", "script" => "Common"],
     *   .....
     * ]
     * For bytes that couldn't be decoded, both "unicode" and "unicodeHex" are null.
     */
    private function processDetails(array $params, array &$data) {
        $unicodePoint = $params['unicodePoint'];
        $data[] = [
            'range' => "{$params['range'][0]}-{$params['range'][1]}",
            'str' => $params['str'],
            'unicode' => $unicodePoint,
            'unicodeHex' => $unicodePoint === null ? null : \dechex($unicodePoint),
            'script' => $this->scriptFor($params['unicodeRangePos']),
        ];
    }

    /**
     * Provide information about the lines found. Note that each line will contain an array
     * with the chars in that line. The "\n" and "\r" chars will be excluded.
     * A "\r\n" (or "\n\r") pair counts as a single line break; consecutive pairs
     * count as one line break each.
     * For each line you can use the "implode('', $arrayLine)" to build the string, or use
     * "array_slice" to get a fixed number of chars before building the string.
     * See "processCountChars" for details on the parameters
     * Example of output:
     * [
     *   "linesNumber" => 2,
     *   "lines" => [
     *     ["a", "b", "c"],
     *     ["5", "6", "ى"]
     *   ]
     * ]
     */
    private function processLines(array $params, array &$data) {
        if (!isset($data['linesNumber'])) {
            $data = [
                'linesNumber' => 1,
                'lines' => [],
            ];
        }

        $lineIndex = $data['linesNumber'] - 1;
        if (!isset($data['lines'][$lineIndex])) {
            $data['lines'][$lineIndex] = [];
        }

        $char = $params['str'];
        if ($char !== "\n" && $char !== "\r") {
            $data['lines'][$lineIndex][] = $char;
            $this->pendingLineBreak = null;
            return;
        }

        $counterpart = ($char === "\n") ? "\r" : "\n";
        if ($this->pendingLineBreak === $counterpart) {
            // second half of a two-char line break that has already been counted.
            // Forget it so a following line break starts a new pair.
            $this->pendingLineBreak = null;
        } else {
            $data['linesNumber']++;
            $this->pendingLineBreak = $char;
        }
    }

    /**
     * Get the script name for a position inside the unicodeRanges attr.
     *
     * @param int|null $unicodeRangePos the index inside unicodeRanges, or null if unknown
     * @return string the script name, or "_unknown" if the position is null
     */
    private function scriptFor($unicodeRangePos) {
        if ($unicodeRangePos === null) {
            return '_unknown';
        }
        return $this->unicodeRanges[$unicodeRangePos]['script'];
    }
}