<?php
/**
 *
 * @author Juan Pablo Villafáñez Ramos <jvillafanez@owncloud.com>
 * @copyright Copyright (c) 2021, ownCloud GmbH
 * @license AGPL-3.0
 *
 * This code is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License, version 3,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License, version 3,
 * along with this program.  If not, see <http://www.gnu.org/licenses/>
 *
 */

namespace OC;

class Utf8Analyzer {
	/**
	 * https://en.wikipedia.org/wiki/UTF-8
	 * Ranges to determine how many bytes we need to read based on the first byte read.
	 */
	private $utf8Ranges = [
		"1b" => [0x00, 0x7f],
		"2b" => [0xc0, 0xdf],
		"3b" => [0xe0, 0xef],
		"4b" => [0xf0, 0xf7],
		// ranges [0x80, 0xbf] and [0xf8, 0xff] aren't defined
	];

	/**
	 * https://en.wikipedia.org/wiki/Unicode_block
	 * Unicode block ranges
	 * Keep the list sorted to be able to do a binary search
	 * NOTE: The list is incomplete and might be inaccurate with some symbols
	 */
	private $unicodeRanges = [
		['range' => [0x0000, 0x0040], 'script' => 'Common'],
		['range' => [0x0041, 0x005a], 'script' => 'Latin'],
		['range' => [0x005b, 0x0060], 'script' => 'Common'],
		['range' => [0x0061, 0x007a], 'script' => 'Latin'],
		['range' => [0x007b, 0x00bf], 'script' => 'Common'],
		['range' => [0x00c0, 0x00d6], 'script' => 'Latin'],
		['range' => [0x00d7, 0x00d7], 'script' => 'Common'],
		['range' => [0x00d8, 0x00f6], 'script' => 'Latin'],
		['range' => [0x00f7, 0x00f7], 'script' => 'Common'],
		['range' => [0x00f8, 0x024f], 'script' => 'Latin'],
		['range' => [0x0370, 0x03e1], 'script' => 'Greek'],
		['range' => [0x03f0, 0x03ff], 'script' => 'Greek'],
		['range' => [0x0590, 0x05ff], 'script' => 'Hebrew'],
		['range' => [0x0600, 0x06ff], 'script' => 'Arabic'],
		['range' => [0x0750, 0x077f], 'script' => 'Arabic'],
		['range' => [0x0870, 0x08ff], 'script' => 'Arabic'],
		['range' => [0x0900, 0x097f], 'script' => 'Devanagari'],
		['range' => [0x1100, 0x11ff], 'script' => 'Hangul'],
		['range' => [0x1cd0, 0x1cff], 'script' => 'Common'],
		['range' => [0x1e00, 0x1eff], 'script' => 'Latin'],
		['range' => [0x1f00, 0x1fff], 'script' => 'Greek'],
		['range' => [0x2000, 0x2070], 'script' => 'Common'],
		['range' => [0x2071, 0x2071], 'script' => 'Latin'],
		['range' => [0x2074, 0x207e], 'script' => 'Common'],
		['range' => [0x207f, 0x207f], 'script' => 'Latin'],
		['range' => [0x2080, 0x208e], 'script' => 'Common'],
		['range' => [0x2090, 0x209c], 'script' => 'Latin'],
		['range' => [0x2150, 0x215f], 'script' => 'Common'],
		['range' => [0x2160, 0x2188], 'script' => 'Latin'],
		['range' => [0x2189, 0x218b], 'script' => 'Common'],
		['range' => [0x2190, 0x27ff], 'script' => 'Common'],
		['range' => [0x2900, 0x2bff], 'script' => 'Common'],
		['range' => [0x2c60, 0x2c7f], 'script' => 'Latin'],
		['range' => [0x2e80, 0x2fdf], 'script' => 'Han'],
		['range' => [0x3040, 0x309f], 'script' => 'Hiragana'],
		['range' => [0x30a0, 0x30ff], 'script' => 'Katakana'],
		['range' => [0x3130, 0x318f], 'script' => 'Hangul'],
		['range' => [0x31f0, 0x31ff], 'script' => 'Katakana'],
		['range' => [0x3200, 0x321e], 'script' => 'Hangul'],
		['range' => [0x3260, 0x327f], 'script' => 'Hangul'],
		['range' => [0x32d0, 0x3357], 'script' => 'Katakana'],
		['range' => [0x3400, 0x4dbf], 'script' => 'Han'],
		['range' => [0x4e00, 0x9fff], 'script' => 'Han'],
		['range' => [0xa720, 0xa7ff], 'script' => 'Latin'],
		['range' => [0xa8e0, 0xa8ff], 'script' => 'Devanagari'],
		['range' => [0xa960, 0xa97f], 'script' => 'Hangul'],
		['range' => [0xab30, 0xab6f], 'script' => 'Latin'],
		['range' => [0xac00, 0xd7ff], 'script' => 'Hangul'],
		['range' => [0xf900, 0xfaff], 'script' => 'Han'],
		['range' => [0xfb1d, 0xfb4f], 'script' => 'Hebrew'],  // some unicode chars aren't assigned
		['range' => [0xfb50, 0xfdff], 'script' => 'Arabic'],
		['range' => [0xfe70, 0xfefc], 'script' => 'Arabic'],
	];

	/**
	 * State of the "lines" processor for the analysis currently running: the line break
	 * char ("\r" or "\n") that has just been counted as a new line and could still be
	 * followed by its partner ("\n" or "\r" respectively) to form a 2-char line break.
	 * It is null when the last processed char can't be the first half of such a pair.
	 * The value is reset at the beginning of every analyzeStream call so that independent
	 * analyses don't influence each other.
	 */
	private $pendingLineBreak = null;

	/**
	 * Analyze the stream to get statistics. The stream is assumed to be encoded in utf8,
	 * otherwise, the function will return garbage.
	 * The function will return data based on the "processors" requested. Currently, these are
	 * the available processors:
	 * - "count": to return the number of chars per script ("Latin", "Han", "Hangul", etc) found
	 * in the stream
	 * - "details": to return a list of detailed info per unicode char. The info includes
	 * the byte range for the unicode char, the unicode char, the associated unicode code point
	 * both as int and as hex string, and the detected script. This is mostly for debugging
	 * purposes.
	 * - "lines": to return line information found in the stream. It will return the number of
	 * lines (it might be slightly inaccurate, the end of file could be counted as new line and
	 * count an additional line), and a list containing the chars per line.
	 *
	 * Since the current script list is incomplete, characters that don't belong to any known
	 * group will be grouped under the "_unknown" script. Byte sequences that can't be decoded
	 * as utf8 are also grouped under "_unknown". Common punctuation symbols are grouped
	 * under the "Common" script
	 *
	 * A possible result could be (assuming all the "processors" are activated)
	 * {
	 *     "count": {
	 *         "Arabic": 3,
	 *         "Common": 1
	 *     },
	 *     "details": [
	 *         {
	 *             "range": "0-1",
	 *             "str": "ك",
	 *             "unicode": 1603,
	 *             "unicodeHex": "643",
	 *             "script": "Arabic"
	 *         },
	 *         {
	 *             "range": "2-3",
	 *             "str": "ن",
	 *             "unicode": 1606,
	 *             "unicodeHex": "646",
	 *             "script": "Arabic"
	 *         },
	 *         {
	 *             "range": "4-4",
	 *             "str": " ",
	 *             "unicode": 32,
	 *             "unicodeHex": "20",
	 *             "script": "Common"
	 *         },
	 *         {
	 *             "range": "5-6",
	 *             "str": "و",
	 *             "unicode": 1608,
	 *             "unicodeHex": "648",
	 *             "script": "Arabic"
	 *         }
	 *     ],
	 *     "lines": {
	 *         "linesNumber": 1,
	 *         "lines": [
	 *             [
	 *                 "ك",
	 *                 "ن",
	 *                 " ",
	 *                 "و"
	 *             ]
	 *         ]
	 *     }
	 * }
	 *
	 * Each processor will show its information under its own key (matching the processor's name)
	 *
	 * The analysis starts from the current stream pointer position, wherever it is,
	 * until "maxBytes" have been read (PHP_INT_MAX by default) or the end of the stream.
	 * Note that this function won't open nor close the stream, and won't rewind the stream
	 * pointer position
	 *
	 * @param resource $stream the opened stream to be analyzed
	 * @param array $processors a list containing the processor names to be used.
	 * Known names are "count", "details" and "lines". Processors that aren't in the list
	 * won't be used and won't appear in the result. Repeated names are used only once.
	 * @param int $maxBytes the maximum number of bytes to read. Some additional bytes might
	 * be read to fit a complete utf8 character
	 * @return array a map as described above. Note that a processor that hasn't been activated
	 * won't be part of the result. If no processor has been activated, this function will just
	 * traverse the stream without showing any result, although some internal processing will
	 * be performed anyway.
	 * @throws \InvalidArgumentException if an unknown processor name is requested
	 */
	public function analyzeStream($stream, array $processors = [], int $maxBytes = PHP_INT_MAX) {
		$processorActions = [
			'count' => 'processCountChars',
			'details' => 'processDetails',
			'lines' => 'processLines'
		];

		// Validate the requested processors before touching the stream: an unknown name
		// would otherwise blow up in the middle of the analysis with an unrelated error.
		$map = [];
		foreach ($processors as $name) {
			if (!\is_string($name) || !isset($processorActions[$name])) {
				$shown = \is_scalar($name) ? (string)$name : \gettype($name);
				throw new \InvalidArgumentException("Unknown processor \"$shown\"");
			}
			// using the name as key also drops duplicates, so each processor sees
			// every char exactly once
			$map[$name] = [];
		}
		$activeProcessors = \array_keys($map);

		// the "lines" processor keeps state between chars; start from a clean state
		$this->pendingLineBreak = null;

		$byteCount = 0;
		while ($byteCount < $maxBytes && !\feof($stream)) {
			$mbRead = $this->readMbChar($stream);
			if ($mbRead === false) {
				break;
			}
			[$str, $bytesRead] = $mbRead;

			$lowerBytePos = $byteCount;
			$byteCount += $bytesRead;
			$upperBytePos = $byteCount - 1;

			// Be explicit about the encoding: the default depends on mb_internal_encoding()
			$unicodePoint = \mb_ord($str, 'UTF-8');
			if ($unicodePoint === false) {
				// not decodable as utf8 (broken or truncated sequence): no code point, no script
				$unicodePoint = null;
				$index = null;
			} else {
				$index = $this->searchInUnicode($unicodePoint);
			}

			$params = [
				'range' => [$lowerBytePos, $upperBytePos],
				'str' => $str,
				'unicodeRangePos' => $index,
				'unicodePoint' => $unicodePoint
			];

			foreach ($activeProcessors as $processor) {
				$actionMethod = $processorActions[$processor];
				$this->$actionMethod($params, $map[$processor]);
			}
		}
		return $map;
	}

	/**
	 * This is mainly a wrapper around the analyzeStream method in order to work easier with
	 * a string.
	 * @see analyzeStream
	 * @param string $data the string to be analyzed. The whole string will be checked
	 * @param array $processors a list containing the processor names to be used.
	 * Known names are "count", "details" and "lines". Processors that aren't in the list
	 * won't be used and won't appear in the result
	 * @return array a map as described above (see analyzeStream).
	 * @throws \InvalidArgumentException if an unknown processor name is requested
	 * */
	public function analyzeString(string $data, array $processors = []) {
		$stream = \fopen('php://memory', 'r+');
		try {
			\fwrite($stream, $data);
			\rewind($stream);
			return $this->analyzeStream($stream, $processors);
		} finally {
			// release the temporary stream even if the analysis throws
			\fclose($stream);
		}
	}

	/**
	 * Read a multibyte char from the stream. The stream is assumed to be utf8-encoded
	 * The function returns an array with the first element being the multibyte char and the
	 * second element the number of bytes actually read from the stream. [$str, $bytesRead]
	 * If the stream ends in the middle of a char, the partial char is returned along with
	 * the real number of bytes that could be read.
	 * Continuation bytes aren't validated: the number of bytes to read is decided only by
	 * the first byte.
	 * It will return false if there is no char to be read
	 *
	 * @param resource $stream the stream to read from
	 * @return array|false
	 */
	private function readMbChar($stream) {
		$byte = \fread($stream, 1);
		if ($byte === false || $byte === '') {
			return false;
		}

		$byteInt = \ord($byte);
		if ($this->inRange($byteInt, '4b')) {
			$pendingBytes = 3;
		} elseif ($this->inRange($byteInt, '3b')) {
			$pendingBytes = 2;
		} elseif ($this->inRange($byteInt, '2b')) {
			$pendingBytes = 1;
		} else {
			// either not in a valid range (something broke) or in "1b" range.
			// in any case, use 1 byte
			$pendingBytes = 0;
		}

		$str = $byte;
		if ($pendingBytes > 0) {
			$tail = \fread($stream, $pendingBytes);
			if ($tail !== false) {
				$str .= $tail;
			}
		}
		// report what has been really consumed, which might be less than expected
		// if the stream is truncated
		return [$str, \strlen($str)];
	}

	/**
	 * Check if the "byteInt" in a range defined in the utf8Ranges attr.
	 *
	 * @param int $byteInt the byte value (0-255) to check
	 * @param string $range one of the keys of the utf8Ranges attr ("1b", "2b", "3b", "4b")
	 * @return bool
	 */
	private function inRange($byteInt, $range): bool {
		[$lowest, $highest] = $this->utf8Ranges[$range];
		return $lowest <= $byteInt && $byteInt <= $highest;
	}

	/**
	 * Search the unicodePoint in the list of unicodeRanges. It uses a binary search approach
	 * so the list in the unicodeRanges attr must be sorted.
	 *
	 * @param int $unicodePoint the unicode code point to look for
	 * @return int|null the index inside the unicodeRanges attr of the range containing the
	 * code point, or null if no range contains it
	 */
	private function searchInUnicode($unicodePoint) {
		$left = 0;
		$right = \count($this->unicodeRanges) - 1;
		while ($left <= $right) {
			$midpoint = \intdiv($left + $right, 2);

			[$rangeStart, $rangeEnd] = $this->unicodeRanges[$midpoint]['range'];
			if ($unicodePoint < $rangeStart) {
				$right = $midpoint - 1;
			} elseif ($unicodePoint > $rangeEnd) {
				$left = $midpoint + 1;
			} else {
				return $midpoint;
			}
		}
		return null;
	}

	/**
	 * Get the script name associated to a position in the unicodeRanges attr.
	 *
	 * @param int|null $unicodeRangePos the index inside the unicodeRanges attr, or null
	 * if the char wasn't found in any range
	 * @return string the script name, or "_unknown" if the position is null
	 */
	private function getScriptName($unicodeRangePos): string {
		if ($unicodeRangePos === null) {
			return '_unknown';
		}
		return $this->unicodeRanges[$unicodeRangePos]['script'];
	}

	/**
	 * Return a map containing the scripts found and the number of chars per script, such as
	 * ["Han" => 57, "Katakana" => 6, "Common" => 34]
	 * @param array $params a map with information about the character to be processed:
	 * - "range" -> the byte range used by the char, as 2 integers [$lowerRange, $upperRange]
	 * - "str" -> the string representing the multibyte char
	 * - "unicodeRangePos" -> the index inside the unicodeRanges array where the char is placed,
	 * or null if the char isn't in any known range
	 * - "unicodePoint" -> the unicode code point of the char, as integer, or null if the
	 * bytes couldn't be decoded
	 * @param array $data an array to place the result. The same array will be reused in
	 * multiple calls, until the stream is processed.
	 */
	private function processCountChars(array $params, array &$data) {
		$mapIndex = $this->getScriptName($params['unicodeRangePos']);
		$data[$mapIndex] = ($data[$mapIndex] ?? 0) + 1;
	}

	/**
	 * Provide a list with information per char. See "processCountChars" for details on the
	 * parameters
	 * The list will be something like:
	 * [
	 *  ["range" => "0-1", "str" => "ن", "unicode" => 1606, "unicodeHex" => "646", "script" => "Arabic"],
	 *  ["range" => "2-2", "str" => " ", "unicode" => 32, "unicodeHex" => "20", "script" => "Common"],
	 *  .....
	 * ]
	 * For bytes that couldn't be decoded, both "unicode" and "unicodeHex" will be null and
	 * the "script" will be "_unknown".
	 */
	private function processDetails(array $params, array &$data) {
		$unicodePoint = $params['unicodePoint'];

		$data[] = [
			'range' => "{$params['range'][0]}-{$params['range'][1]}",
			'str' => $params['str'],
			'unicode' => $unicodePoint,
			'unicodeHex' => ($unicodePoint === null) ? null : \dechex($unicodePoint),
			'script' => $this->getScriptName($params['unicodeRangePos']),
		];
	}

	/**
	 * Provide information about the lines found. Note that each line will contain an array
	 * with the chars in that line. The "\n" and "\r" chars will be excluded.
	 * A "\r" immediately followed by a "\n" (or a "\n" immediately followed by a "\r") is
	 * counted as a single line break; any other "\r" or "\n" starts a new line on its own,
	 * so "\r\n\r\n" counts as 2 line breaks.
	 * For each line you can use the "implode('', $arrayLine)" to build the string, or use
	 * "array_slice" to get a fixed number of chars before building the string.
	 * See "processCountChars" for details on the parameters
	 * Example of output:
	 * [
	 *  "linesNumber" => 2,
	 *  "lines" => [
	 *   ["a", "b", "c"],
	 *   ["5", "6", "ى"]
	 *  ]
	 * ]
	 */
	private function processLines(array $params, array &$data) {
		if (!isset($data['linesNumber'])) {
			$data = [
				'linesNumber' => 1,
				'lines' => [],
			];
		}
		$lineIndex = $data['linesNumber'] - 1;

		if (!isset($data['lines'][$lineIndex])) {
			$data['lines'][$lineIndex] = [];
		}

		$char = $params['str'];
		if ($char !== "\n" && $char !== "\r") {
			$data['lines'][$lineIndex][] = $char;
			$this->pendingLineBreak = null;
			return;
		}

		$partner = ($char === "\n") ? "\r" : "\n";
		if ($this->pendingLineBreak === $partner) {
			// second half of a 2-char line break: the new line has been counted already.
			// Clear the state so the next line break char starts a new pair.
			$this->pendingLineBreak = null;
		} else {
			$data['linesNumber']++;
			$this->pendingLineBreak = $char;
		}
	}
}