scratch – Blame information for rev 87

Subversion Repositories:
Rev:
Rev Author Line No. Line
87 office 1 <?php
2  
3 /*
4 * This file is part of the Symfony package.
5 *
6 * (c) Fabien Potencier <fabien@symfony.com>
7 *
8 * For the full copyright and license information, please view the LICENSE
9 * file that was distributed with this source code.
10 */
11  
12 namespace Symfony\Polyfill\Mbstring;
13  
14 /**
15 * Partial mbstring implementation in PHP, iconv based, UTF-8 centric.
16 *
17 * Implemented:
18 * - mb_chr - Returns a specific character from its Unicode code point
19 * - mb_convert_encoding - Convert character encoding
20 * - mb_convert_variables - Convert character code in variable(s)
21 * - mb_decode_mimeheader - Decode string in MIME header field
22 * - mb_encode_mimeheader - Encode string for MIME header XXX NATIVE IMPLEMENTATION IS REALLY BUGGED
23 * - mb_convert_case - Perform case folding on a string
24 * - mb_get_info - Get internal settings of mbstring
25 * - mb_http_input - Detect HTTP input character encoding
26 * - mb_http_output - Set/Get HTTP output character encoding
27 * - mb_internal_encoding - Set/Get internal character encoding
28 * - mb_list_encodings - Returns an array of all supported encodings
29 * - mb_ord - Returns the Unicode code point of a character
30 * - mb_output_handler - Callback function converts character encoding in output buffer
31 * - mb_scrub - Replaces ill-formed byte sequences with substitute characters
32 * - mb_strlen - Get string length
33 * - mb_strpos - Find position of first occurrence of string in a string
34 * - mb_strrpos - Find position of last occurrence of a string in a string
35 * - mb_strtolower - Make a string lowercase
36 * - mb_strtoupper - Make a string uppercase
37 * - mb_substitute_character - Set/Get substitution character
38 * - mb_substr - Get part of string
39 * - mb_stripos - Finds position of first occurrence of a string within another, case insensitive
40 * - mb_stristr - Finds first occurrence of a string within another, case insensitive
41 * - mb_strrchr - Finds the last occurrence of a character in a string within another
42 * - mb_strrichr - Finds the last occurrence of a character in a string within another, case insensitive
43 * - mb_strripos - Finds position of last occurrence of a string within another, case insensitive
44 * - mb_strstr - Finds first occurrence of a string within another
45 * - mb_strwidth - Return width of string
46 * - mb_substr_count - Count the number of substring occurrences
47 *
48 * Not implemented:
49 * - mb_convert_kana - Convert "kana" one from another ("zen-kaku", "han-kaku" and more)
50 * - mb_decode_numericentity - Decode HTML numeric string reference to character
51 * - mb_encode_numericentity - Encode character to HTML numeric string reference
52 * - mb_ereg_* - Regular expression with multibyte support
53 * - mb_parse_str - Parse GET/POST/COOKIE data and set global variable
54 * - mb_preferred_mime_name - Get MIME charset string
55 * - mb_regex_encoding - Returns current encoding for multibyte regex as string
56 * - mb_regex_set_options - Set/Get the default options for mbregex functions
57 * - mb_send_mail - Send encoded mail
58 * - mb_split - Split multibyte string using regular expression
59 * - mb_strcut - Get part of string
60 * - mb_strimwidth - Get truncated string with specified width
61 *
62 * @author Nicolas Grekas <p@tchwork.com>
63 *
64 * @internal
65 */
66 final class Mbstring
67 {
/**
 * Extra mode accepted by mb_convert_case(): lower-casing preceded by the
 * replacements listed in $caseFold. Used by the case-insensitive searches.
 */
const MB_CASE_FOLD = PHP_INT_MAX;

/**
 * Encodings tried by mb_detect_encoding() when no list is given;
 * replaced by mb_detect_order().
 */
private static $encodingList = array('ASCII', 'UTF-8');

/**
 * Value reported and updated by mb_language().
 */
private static $language = 'neutral';

/**
 * Encoding used whenever a method receives a null $encoding;
 * updated by mb_internal_encoding().
 */
private static $internalEncoding = 'UTF-8';

/**
 * Two parallel lists fed to str_replace() in MB_CASE_FOLD mode:
 * index 0 holds the characters to search for, index 1 their replacements.
 */
private static $caseFold = array(
    array(
        'µ',
        'ſ',
        "\xCD\x85",
        'ς',
        "\xCF\x90",
        "\xCF\x91",
        "\xCF\x95",
        "\xCF\x96",
        "\xCF\xB0",
        "\xCF\xB1",
        "\xCF\xB5",
        "\xE1\xBA\x9B",
        "\xE1\xBE\xBE",
    ),
    array(
        'μ',
        's',
        'ι',
        'σ',
        'β',
        'θ',
        'φ',
        'π',
        'κ',
        'ρ',
        'ε',
        "\xE1\xB9\xA1",
        'ι',
    ),
);
77  
/**
 * Converts a string from one character encoding to another.
 *
 * Real charsets are delegated to iconv() with "//IGNORE". Two pseudo-encodings
 * are handled here: "BASE64" and "HTML-ENTITIES" (also accepted as "HTML").
 *
 * @param string            $s            The string to convert
 * @param string            $toEncoding   Target encoding
 * @param string|array|null $fromEncoding Source encoding; an array or a comma-separated
 *                                        list is resolved with mb_detect_encoding();
 *                                        null means the internal encoding
 *
 * @return string|false The converted string, or whatever iconv() reports on failure
 */
public static function mb_convert_encoding($s, $toEncoding, $fromEncoding = null)
{
    // The null check keeps strpos() from receiving a non-string argument.
    if (is_array($fromEncoding) || (null !== $fromEncoding && false !== strpos($fromEncoding, ','))) {
        $fromEncoding = self::mb_detect_encoding($s, $fromEncoding);
    } else {
        $fromEncoding = self::getEncoding($fromEncoding);
    }

    $toEncoding = self::getEncoding($toEncoding);

    // "HTML" is an alias of "HTML-ENTITIES". Normalise both sides once so the
    // source side is decoded as entities instead of being handed to iconv().
    if ('HTML' === $fromEncoding) {
        $fromEncoding = 'HTML-ENTITIES';
    }
    if ('HTML' === $toEncoding) {
        $toEncoding = 'HTML-ENTITIES';
    }

    if ('BASE64' === $fromEncoding) {
        $s = base64_decode($s);
        // The decoded payload is taken to be in the target encoding already.
        $fromEncoding = $toEncoding;
    }

    if ('BASE64' === $toEncoding) {
        return base64_encode($s);
    }

    if ('HTML-ENTITIES' === $toEncoding) {
        if ('HTML-ENTITIES' === $fromEncoding) {
            $fromEncoding = 'Windows-1252';
        }
        if ('UTF-8' !== $fromEncoding) {
            $s = iconv($fromEncoding, 'UTF-8//IGNORE', $s);
        }

        // Only runs of non-ASCII bytes need to be turned into entities.
        return preg_replace_callback('/[\x80-\xFF]+/', array(__CLASS__, 'html_encoding_callback'), $s);
    }

    if ('HTML-ENTITIES' === $fromEncoding) {
        $s = html_entity_decode($s, ENT_COMPAT, 'UTF-8');
        $fromEncoding = 'UTF-8';
    }

    return iconv($fromEncoding, $toEncoding.'//IGNORE', $s);
}
115  
/**
 * Converts up to six variables (arrays are walked recursively) in place.
 *
 * Every leaf value is replaced by the result of mb_convert_encoding().
 *
 * @param string $toEncoding   Target encoding
 * @param string $fromEncoding Source encoding
 *
 * @return string|false $fromEncoding when every conversion succeeded, false otherwise
 */
public static function mb_convert_variables($toEncoding, $fromEncoding, &$a = null, &$b = null, &$c = null, &$d = null, &$e = null, &$f = null)
{
    $failed = false;
    $targets = array(&$a, &$b, &$c, &$d, &$e, &$f);

    // The class is named explicitly: the closure is not bound to the class scope.
    $convert = function (&$value) use (&$failed, $toEncoding, $fromEncoding) {
        $value = Mbstring::mb_convert_encoding($value, $toEncoding, $fromEncoding);
        if (false === $value) {
            $failed = true;
        }
    };

    array_walk_recursive($targets, $convert);

    if ($failed) {
        return false;
    }

    return $fromEncoding;
}
129  
/**
 * Decodes a MIME header field into the internal encoding.
 *
 * @param string $s The encoded header value
 *
 * @return string|false The result of iconv_mime_decode()
 */
public static function mb_decode_mimeheader($s)
{
    // ICONV_MIME_DECODE_CONTINUE_ON_ERROR is the named form of mode 2.
    $mode = ICONV_MIME_DECODE_CONTINUE_ON_ERROR;

    return iconv_mime_decode($s, $mode, self::$internalEncoding);
}
134  
/**
 * Not supported: only raises an E_USER_WARNING pointing to iconv_mime_encode().
 *
 * All arguments are ignored and nothing is returned.
 */
public static function mb_encode_mimeheader($s, $charset = null, $transferEncoding = null, $linefeed = null, $indent = null)
{
    $message = 'mb_encode_mimeheader() is bugged. Please use iconv_mime_encode() instead';

    trigger_error($message, E_USER_WARNING);
}
139  
/**
 * Performs case conversion on a string.
 *
 * The work is done on UTF-8: other encodings are converted with iconv() first
 * and converted back at the end.
 *
 * @param string      $s        The string to convert
 * @param int         $mode     MB_CASE_UPPER, MB_CASE_LOWER, MB_CASE_TITLE or self::MB_CASE_FOLD;
 *                              any mode other than title/upper is handled as lower-casing
 * @param string|null $encoding Encoding of $s; null means the internal encoding
 *
 * @return string|false The converted string ('' for empty input); false may come back from iconv()
 */
public static function mb_convert_case($s, $mode, $encoding = null)
{
    if ('' === $s .= '') {
        return '';
    }

    $encoding = self::getEncoding($encoding);

    if ('UTF-8' === $encoding) {
        // null marks "no conversion back needed" for the end of the method.
        $encoding = null;
    } else {
        $s = iconv($encoding, 'UTF-8//IGNORE', $s);
    }

    if (MB_CASE_TITLE == $mode) {
        // Upper-case the first letter of each word, then lower-case the rest.
        $s = preg_replace_callback('/\b\p{Ll}/u', array(__CLASS__, 'title_case_upper'), $s);
        $s = preg_replace_callback('/\B[\p{Lu}\p{Lt}]+/u', array(__CLASS__, 'title_case_lower'), $s);
    } else {
        if (MB_CASE_UPPER == $mode) {
            // Mapping tables are loaded once per request and kept in statics.
            static $upper = null;
            if (null === $upper) {
                $upper = self::getData('upperCase');
            }
            $map = $upper;
        } else {
            if (self::MB_CASE_FOLD === $mode) {
                $s = str_replace(self::$caseFold[0], self::$caseFold[1], $s);
            }

            static $lower = null;
            if (null === $lower) {
                $lower = self::getData('lowerCase');
            }
            $map = $lower;
        }

        // Byte length of a UTF-8 character, keyed by the high nibble of its lead byte.
        static $ulenMask = array("\xC0" => 2, "\xD0" => 2, "\xE0" => 3, "\xF0" => 4);

        $i = 0;
        $len = strlen($s);

        while ($i < $len) {
            if ($s[$i] < "\x80") {
                $ulen = 1;
            } else {
                $lead = $s[$i] & "\xF0";
                // A stray continuation byte (0x80-0xBF) is not a lead byte and has no
                // table entry. Step over it as one byte so the cursor always advances;
                // otherwise the length would be undefined and the loop would never end.
                $ulen = isset($ulenMask[$lead]) ? $ulenMask[$lead] : 1;
            }
            $uchr = substr($s, $i, $ulen);
            $i += $ulen;

            if (isset($map[$uchr])) {
                $uchr = $map[$uchr];
                $nlen = strlen($uchr);

                if ($nlen == $ulen) {
                    // Same byte length: overwrite in place, last byte first.
                    $nlen = $i;
                    do {
                        $s[--$nlen] = $uchr[--$ulen];
                    } while ($ulen);
                } else {
                    // Different byte length: splice, then shift cursor and end marker.
                    $s = substr_replace($s, $uchr, $i - $ulen, $ulen);
                    $len += $nlen - $ulen;
                    $i += $nlen - $ulen;
                }
            }
        }
    }

    if (null === $encoding) {
        return $s;
    }

    return iconv('UTF-8', $encoding.'//IGNORE', $s);
}
210  
/**
 * Gets or sets the internal encoding.
 *
 * @param string|null $encoding New encoding, or null to read the current one
 *
 * @return string|bool The current encoding when called without argument;
 *                     otherwise true if the encoding was accepted, false if not
 */
public static function mb_internal_encoding($encoding = null)
{
    if (null !== $encoding) {
        $normalized = self::getEncoding($encoding);

        // UTF-8 is always accepted; anything else must be known to iconv.
        $supported = 'UTF-8' === $normalized || false !== @iconv($normalized, $normalized, ' ');
        if ($supported) {
            self::$internalEncoding = $normalized;
        }

        return $supported;
    }

    return self::$internalEncoding;
}
227  
/**
 * Gets or sets the current language.
 *
 * Only "uni" and "neutral" (case-insensitive) are accepted.
 *
 * @param string|null $lang New language, or null to read the current one
 *
 * @return string|bool The current language when called without argument;
 *                     otherwise true if the value was accepted, false if not
 */
public static function mb_language($lang = null)
{
    if (null === $lang) {
        return self::$language;
    }

    $lang = strtolower($lang);
    if ('uni' !== $lang && 'neutral' !== $lang) {
        return false;
    }

    self::$language = $lang;

    return true;
}
244  
/**
 * Lists the encodings advertised by this implementation.
 *
 * @return string[] A one-element list containing "UTF-8"
 */
public static function mb_list_encodings()
{
    $advertised = array('UTF-8');

    return $advertised;
}
249  
/**
 * Returns the known aliases of an encoding.
 *
 * @param string $encoding Encoding name (case-insensitive)
 *
 * @return string[]|false array('utf8') for "UTF8"/"UTF-8", false for anything else
 */
public static function mb_encoding_aliases($encoding)
{
    $name = strtoupper($encoding);

    if ('UTF8' === $name || 'UTF-8' === $name) {
        return array('utf8');
    }

    return false;
}
260  
/**
 * Checks whether a string is valid in the given encoding.
 *
 * @param string|null $var      The string to check
 * @param string|null $encoding Encoding to check against; null means the internal encoding
 *
 * @return bool False when both arguments are null; otherwise true when
 *              mb_detect_encoding() accepts the string or iconv() can
 *              convert it from $encoding to $encoding
 */
public static function mb_check_encoding($var = null, $encoding = null)
{
    if (null === $encoding) {
        if (null === $var) {
            return false;
        }
        $encoding = self::$internalEncoding;
    }

    if (self::mb_detect_encoding($var, array($encoding))) {
        return true;
    }

    // Fallback for encodings mb_detect_encoding() does not know about.
    return false !== @iconv($encoding, $encoding, $var);
}
272  
/**
 * Picks the first encoding of a list that fits the string.
 *
 * "ASCII" fits when the string has no byte >= 0x80, "UTF-8"/"UTF8" when it is
 * valid UTF-8, and any "ISO-8859-*" name fits unconditionally. Other names
 * are skipped.
 *
 * @param string            $str          The string to inspect
 * @param string|array|null $encodingList Candidates as an array or comma-separated string;
 *                                        null means the current detection order
 * @param bool              $strict       Unused
 *
 * @return string|false The matching encoding name (upper-cased when a list was given), or false
 */
public static function mb_detect_encoding($str, $encodingList = null, $strict = false)
{
    if (null === $encodingList) {
        $candidates = self::$encodingList;
    } else {
        $candidates = is_array($encodingList) ? $encodingList : array_map('trim', explode(',', $encodingList));
        $candidates = array_map('strtoupper', $candidates);
    }

    foreach ($candidates as $candidate) {
        if ('ASCII' === $candidate) {
            if (!preg_match('/[\x80-\xFF]/', $str)) {
                return $candidate;
            }
            continue;
        }

        if ('UTF8' === $candidate || 'UTF-8' === $candidate) {
            // An empty pattern with the "u" flag only matches valid UTF-8 subjects.
            if (preg_match('//u', $str)) {
                return 'UTF-8';
            }
            continue;
        }

        if (0 === strncmp($candidate, 'ISO-8859-', 9)) {
            return $candidate;
        }
    }

    return false;
}
308  
/**
 * Gets or sets the encoding detection order.
 *
 * Only "ASCII", "UTF8", "UTF-8" and names starting with "ISO-8859-" are
 * accepted (case-insensitive); one unknown name rejects the whole list.
 *
 * @param string|array|null $encodingList New order as an array or comma-separated string,
 *                                        or null to read the current order
 *
 * @return array|bool The current order when called without argument;
 *                    otherwise true if the list was stored, false if rejected
 */
public static function mb_detect_order($encodingList = null)
{
    if (null === $encodingList) {
        return self::$encodingList;
    }

    if (!is_array($encodingList)) {
        $encodingList = array_map('trim', explode(',', $encodingList));
    }
    $encodingList = array_map('strtoupper', $encodingList);

    foreach ($encodingList as $enc) {
        $isBuiltin = 'ASCII' === $enc || 'UTF8' === $enc || 'UTF-8' === $enc;

        if (!$isBuiltin && 0 !== strncmp($enc, 'ISO-8859-', 9)) {
            return false;
        }
    }

    self::$encodingList = $encodingList;

    return true;
}
336  
/**
 * Returns the number of characters in a string.
 *
 * @param string      $s        The string to measure
 * @param string|null $encoding Encoding of $s; null means the internal encoding
 *
 * @return int|false The byte length for ASCII/CP850, otherwise the result of iconv_strlen()
 */
public static function mb_strlen($s, $encoding = null)
{
    $encoding = self::getEncoding($encoding);

    // Single-byte encodings: characters and bytes coincide.
    if ('ASCII' === $encoding || 'CP850' === $encoding) {
        return strlen($s);
    }

    return @iconv_strlen($s, $encoding);
}
347  
/**
 * Finds the character position of the first occurrence of a string.
 *
 * @param string      $haystack The string to search in
 * @param string      $needle   The string to look for; an empty needle raises an E_USER_WARNING
 * @param int         $offset   Character offset passed to iconv_strpos()
 * @param string|null $encoding Encoding of both strings; null means the internal encoding
 *
 * @return int|false The position, or false when not found or when the needle is empty
 */
public static function mb_strpos($haystack, $needle, $offset = 0, $encoding = null)
{
    $encoding = self::getEncoding($encoding);
    $needle = $needle.'';

    if ('' !== $needle) {
        return iconv_strpos($haystack, $needle, $offset, $encoding);
    }

    trigger_error(__METHOD__.': Empty delimiter', E_USER_WARNING);

    return false;
}
360  
/**
 * Finds the character position of the last occurrence of a string.
 *
 * A positive offset skips that many leading characters; a negative offset
 * drops that many trailing characters before searching. An offset that is
 * not integer-like is treated as 0.
 *
 * @param string      $haystack The string to search in
 * @param string      $needle   The string to look for
 * @param int         $offset   Where to start or stop the search, in characters
 * @param string|null $encoding Encoding of both strings; null means the internal encoding
 *
 * @return int|false The position counted from the start of the original haystack, or false
 */
public static function mb_strrpos($haystack, $needle, $offset = 0, $encoding = null)
{
    $encoding = self::getEncoding($encoding);

    $shift = (int) $offset;

    if ($offset != $shift) {
        $shift = 0;
    } elseif ($shift < 0) {
        $haystack = self::mb_substr($haystack, 0, $shift, $encoding);
        // The head of the string was kept, so positions need no correction.
        $shift = 0;
    } elseif ($shift > 0) {
        $haystack = self::mb_substr($haystack, $shift, 2147483647, $encoding);
    }

    $found = iconv_strrpos($haystack, $needle, $encoding);

    if (false === $found) {
        return false;
    }

    return $shift + $found;
}
380  
/**
 * Lower-cases a string; shorthand for mb_convert_case() with MB_CASE_LOWER.
 *
 * @param string      $s        The string to convert
 * @param string|null $encoding Encoding of $s; null means the internal encoding
 *
 * @return string|false The result of mb_convert_case()
 */
public static function mb_strtolower($s, $encoding = null)
{
    $lowered = self::mb_convert_case($s, MB_CASE_LOWER, $encoding);

    return $lowered;
}
385  
/**
 * Upper-cases a string; shorthand for mb_convert_case() with MB_CASE_UPPER.
 *
 * @param string      $s        The string to convert
 * @param string|null $encoding Encoding of $s; null means the internal encoding
 *
 * @return string|false The result of mb_convert_case()
 */
public static function mb_strtoupper($s, $encoding = null)
{
    $raised = self::mb_convert_case($s, MB_CASE_UPPER, $encoding);

    return $raised;
}
390  
/**
 * Gets or sets the substitution character; only "none" is supported.
 *
 * @param mixed $c New value, or null to read the current one
 *
 * @return string|bool "none" when called without argument; true when $c is
 *                     "none" (case-insensitive); false for any other value
 */
public static function mb_substitute_character($c = null)
{
    if (0 === strcasecmp($c, 'none')) {
        return true;
    }

    if (null === $c) {
        return 'none';
    }

    return false;
}
399  
/**
 * Extracts part of a string, counting in characters.
 *
 * @param string      $s        The source string
 * @param int         $start    First character; negative values count from the end
 *                              and are clamped to the start of the string
 * @param int|null    $length   Maximum number of characters; negative values stop that
 *                              many characters before the end; null means "to the end"
 * @param string|null $encoding Encoding of $s; null means the internal encoding
 *
 * @return string The extracted part; '' when the requested range is empty
 */
public static function mb_substr($s, $start, $length = null, $encoding = null)
{
    $encoding = self::getEncoding($encoding);

    if ($start < 0) {
        $start = max(0, iconv_strlen($s, $encoding) + $start);
    }

    if (null === $length) {
        $length = 2147483647;
    } elseif ($length < 0) {
        // Turn "stop N characters before the end" into an explicit length.
        $length = iconv_strlen($s, $encoding) + $length - $start;
        if ($length < 0) {
            return '';
        }
    }

    $part = iconv_substr($s, $start, $length, $encoding);

    // Concatenation guarantees a string even when iconv_substr() reports failure.
    return $part.'';
}
422  
/**
 * Case-insensitive variant of mb_strpos().
 *
 * Both strings are case-folded with mb_convert_case() before the search.
 *
 * @param string      $haystack The string to search in
 * @param string      $needle   The string to look for
 * @param int         $offset   Character offset where the search starts
 * @param string|null $encoding Encoding of both strings; null means the internal encoding
 *
 * @return int|false The position of the first match, or false
 */
public static function mb_stripos($haystack, $needle, $offset = 0, $encoding = null)
{
    $foldedHaystack = self::mb_convert_case($haystack, self::MB_CASE_FOLD, $encoding);
    $foldedNeedle = self::mb_convert_case($needle, self::MB_CASE_FOLD, $encoding);

    return self::mb_strpos($foldedHaystack, $foldedNeedle, $offset, $encoding);
}
430  
/**
 * Case-insensitive variant of mb_strstr().
 *
 * @param string      $haystack The string to search in
 * @param string      $needle   The string to look for
 * @param bool        $part     True to return what precedes the match, false to
 *                              return the haystack from the match onwards
 * @param string|null $encoding Encoding of both strings; null means the internal encoding
 *
 * @return string|false The requested part of $haystack, or false when there is no match
 */
public static function mb_stristr($haystack, $needle, $part = false, $encoding = null)
{
    $matchAt = self::mb_stripos($haystack, $needle, 0, $encoding);

    return self::getSubpart($matchAt, $part, $haystack, $encoding);
}
437  
/**
 * Finds the last occurrence of a character in a string.
 *
 * Only the first character of $needle is searched for.
 *
 * @param string      $haystack The string to search in
 * @param string      $needle   The character to look for
 * @param bool        $part     True to return what precedes the match, false to
 *                              return the haystack from the match onwards
 * @param string|null $encoding Encoding of both strings; null means the internal encoding
 *
 * @return string|false The requested part of $haystack, or false when there is no match
 */
public static function mb_strrchr($haystack, $needle, $part = false, $encoding = null)
{
    $encoding = self::getEncoding($encoding);

    $firstChar = self::mb_substr($needle, 0, 1, $encoding);
    $matchAt = iconv_strrpos($haystack, $firstChar, $encoding);

    return self::getSubpart($matchAt, $part, $haystack, $encoding);
}
446  
/**
 * Case-insensitive variant of mb_strrchr().
 *
 * Only the first character of $needle is searched for.
 *
 * @param string      $haystack The string to search in
 * @param string      $needle   The character to look for
 * @param bool        $part     True to return what precedes the match, false to
 *                              return the haystack from the match onwards
 * @param string|null $encoding Encoding of both strings; null means the internal encoding
 *
 * @return string|false The requested part of $haystack, or false when there is no match
 */
public static function mb_strrichr($haystack, $needle, $part = false, $encoding = null)
{
    $needle = self::mb_substr($needle, 0, 1, $encoding);
    // mb_strripos() takes the offset before the encoding: pass an explicit 0 so
    // the encoding is not consumed as the offset.
    $pos = self::mb_strripos($haystack, $needle, 0, $encoding);

    return self::getSubpart($pos, $part, $haystack, $encoding);
}
454  
/**
 * Case-insensitive variant of mb_strrpos().
 *
 * Both strings are case-folded with mb_convert_case() before the search.
 *
 * @param string      $haystack The string to search in
 * @param string      $needle   The string to look for
 * @param int         $offset   Offset forwarded to mb_strrpos()
 * @param string|null $encoding Encoding of both strings; null means the internal encoding
 *
 * @return int|false The position of the last match, or false
 */
public static function mb_strripos($haystack, $needle, $offset = 0, $encoding = null)
{
    $foldedHaystack = self::mb_convert_case($haystack, self::MB_CASE_FOLD, $encoding);
    $foldedNeedle = self::mb_convert_case($needle, self::MB_CASE_FOLD, $encoding);

    return self::mb_strrpos($foldedHaystack, $foldedNeedle, $offset, $encoding);
}
462  
/**
 * Finds the first occurrence of a string within another.
 *
 * @param string      $haystack The string to search in
 * @param string      $needle   The string to look for
 * @param bool        $part     True to return what precedes the match, false to
 *                              return the haystack from the match onwards
 * @param string|null $encoding Encoding of both strings; null means the internal encoding
 *
 * @return string|false The requested part of $haystack, or false when there is no match
 */
public static function mb_strstr($haystack, $needle, $part = false, $encoding = null)
{
    $encoding = self::getEncoding($encoding);

    if ('UTF-8' !== $encoding && 'ASCII' !== $encoding && 'CP850' !== $encoding) {
        // In other encodings (e.g. UTF-16) a byte-wise search can match across
        // character boundaries, so search and cut in characters instead.
        $pos = iconv_strpos($haystack, $needle, 0, $encoding);

        return self::getSubpart($pos, $part, $haystack, $encoding);
    }

    // UTF-8 is self-synchronising and the other two are single-byte: a byte-wise
    // search always lands on a character boundary.
    $pos = strpos($haystack, $needle);
    if (false === $pos) {
        return false;
    }
    if ($part) {
        return substr($haystack, 0, $pos);
    }

    return substr($haystack, $pos);
}
475  
/**
 * Returns the settings of this implementation.
 *
 * Only the internal encoding, the language and the detection order reflect
 * live state; every other entry is a fixed value.
 *
 * @param string $type "all" for the whole list, or the name of a single entry
 *
 * @return mixed The whole array, one entry, or false when the entry is unknown or null
 */
public static function mb_get_info($type = 'all')
{
    $settings = array(
        'internal_encoding' => self::$internalEncoding,
        'http_output' => 'pass',
        'http_output_conv_mimetypes' => '^(text/|application/xhtml\+xml)',
        'func_overload' => 0,
        'func_overload_list' => 'no overload',
        'mail_charset' => 'UTF-8',
        'mail_header_encoding' => 'BASE64',
        'mail_body_encoding' => 'BASE64',
        'illegal_chars' => 0,
        'encoding_translation' => 'Off',
        'language' => self::$language,
        'detect_order' => self::$encodingList,
        'substitute_character' => 'none',
        'strict_detection' => 'Off',
    );

    if ('all' === $type) {
        return $settings;
    }

    return isset($settings[$type]) ? $settings[$type] : false;
}
504  
/**
 * HTTP input encoding detection is not supported.
 *
 * @param string $type Ignored
 *
 * @return bool Always false
 */
public static function mb_http_input($type = '')
{
    $detected = false;

    return $detected;
}
509  
/**
 * Gets or sets the HTTP output encoding; only "pass" is supported.
 *
 * @param string|null $encoding New value, or null to read the current one
 *
 * @return string|bool "pass" when called without argument; otherwise whether
 *                     $encoding is exactly "pass"
 */
public static function mb_http_output($encoding = null)
{
    if (null === $encoding) {
        return 'pass';
    }

    return 'pass' === $encoding;
}
514  
/**
 * Returns the display width of a string.
 *
 * Characters in the ranges matched below count for two columns, every other
 * character for one.
 *
 * @param string      $s        The string to measure
 * @param string|null $encoding Encoding of $s; null means the internal encoding
 *
 * @return int The width in columns
 */
public static function mb_strwidth($s, $encoding = null)
{
    $encoding = self::getEncoding($encoding);

    if ('UTF-8' !== $encoding) {
        $s = iconv($encoding, 'UTF-8//IGNORE', $s);
    }

    // Strip the double-width characters and let preg_replace() count them.
    $narrowOnly = preg_replace('/[\x{1100}-\x{115F}\x{2329}\x{232A}\x{2E80}-\x{303E}\x{3040}-\x{A4CF}\x{AC00}-\x{D7A3}\x{F900}-\x{FAFF}\x{FE10}-\x{FE19}\x{FE30}-\x{FE6F}\x{FF00}-\x{FF60}\x{FFE0}-\x{FFE6}\x{20000}-\x{2FFFD}\x{30000}-\x{3FFFD}]/u', '', $s, -1, $wideCount);

    return 2 * $wideCount + iconv_strlen($narrowOnly, 'UTF-8');
}
527  
/**
 * Counts the occurrences of a substring.
 *
 * The count is done byte-wise with substr_count(); $encoding is not used.
 *
 * @param string      $haystack The string to search in
 * @param string      $needle   The substring to count
 * @param string|null $encoding Unused
 *
 * @return int The number of occurrences
 */
public static function mb_substr_count($haystack, $needle, $encoding = null)
{
    $occurrences = substr_count($haystack, $needle);

    return $occurrences;
}
532  
/**
 * Output buffer callback that leaves the buffer untouched.
 *
 * @param string $contents The buffer contents
 * @param int    $status   Ignored
 *
 * @return string $contents, unchanged
 */
public static function mb_output_handler($contents, $status)
{
    $passedThrough = $contents;

    return $passedThrough;
}
537  
/**
 * Returns the character for a Unicode code point.
 *
 * The character is built as UTF-8, then converted when another encoding is requested.
 *
 * @param int         $code     The code point; reduced modulo 0x200000 first
 * @param string|null $encoding Target encoding; null means the internal encoding
 *
 * @return string|false The character, or the result of the conversion
 */
public static function mb_chr($code, $encoding = null)
{
    if (0x80 > $code %= 0x200000) {
        $s = chr($code);
    } elseif (0x800 > $code) {
        $s = chr(0xC0 | $code >> 6).chr(0x80 | $code & 0x3F);
    } elseif (0x10000 > $code) {
        $s = chr(0xE0 | $code >> 12).chr(0x80 | $code >> 6 & 0x3F).chr(0x80 | $code & 0x3F);
    } else {
        $s = chr(0xF0 | $code >> 18).chr(0x80 | $code >> 12 & 0x3F).chr(0x80 | $code >> 6 & 0x3F).chr(0x80 | $code & 0x3F);
    }

    if ('UTF-8' !== $encoding = self::getEncoding($encoding)) {
        // Use this class's converter rather than whatever global
        // mb_convert_encoding() happens to be defined (possibly none).
        $s = self::mb_convert_encoding($s, $encoding, 'UTF-8');
    }

    return $s;
}
556  
/**
 * Returns the Unicode code point of the first character of a string.
 *
 * The string is converted to UTF-8 first when another encoding is given.
 *
 * @param string      $s        The string whose first character is decoded
 * @param string|null $encoding Encoding of $s; null means the internal encoding
 *
 * @return int The code point; 0 for an empty string
 */
public static function mb_ord($s, $encoding = null)
{
    if ('UTF-8' !== $encoding = self::getEncoding($encoding)) {
        // Use this class's converter rather than whatever global
        // mb_convert_encoding() happens to be defined (possibly none).
        $s = self::mb_convert_encoding($s, 'UTF-8', $encoding);
    }

    // unpack() yields a 1-based list of byte values; at most 4 bytes are needed.
    $code = ($s = unpack('C*', substr($s, 0, 4))) ? $s[1] : 0;
    if (0xF0 <= $code) {
        return (($code - 0xF0) << 18) + (($s[2] - 0x80) << 12) + (($s[3] - 0x80) << 6) + $s[4] - 0x80;
    }
    if (0xE0 <= $code) {
        return (($code - 0xE0) << 12) + (($s[2] - 0x80) << 6) + $s[3] - 0x80;
    }
    if (0xC0 <= $code) {
        return (($code - 0xC0) << 6) + $s[2] - 0x80;
    }

    return $code;
}
576  
/**
 * Cuts a haystack around a match position, for the *str* / *chr* finders.
 *
 * @param int|false   $pos      Character position of the match, or false for "no match"
 * @param bool        $part     True for the part before the match, false for the part from the match on
 * @param string      $haystack The searched string
 * @param string|null $encoding Encoding forwarded to mb_substr()
 *
 * @return string|false The requested part, or false when $pos is false
 */
private static function getSubpart($pos, $part, $haystack, $encoding)
{
    if (false === $pos) {
        return false;
    }

    $start = $part ? 0 : $pos;
    $length = $part ? $pos : null;

    return self::mb_substr($haystack, $start, $length, $encoding);
}
588  
/**
 * preg_replace_callback() handler turning non-ASCII UTF-8 text into HTML entities.
 *
 * htmlentities() supplies the named entities; every multi-byte character it
 * leaves behind is decoded and written as a decimal numeric entity.
 *
 * @param array $m Match data; $m[0] is the matched text
 *
 * @return string The text with only ASCII characters and entities
 */
private static function html_encoding_callback($m)
{
    $bytes = unpack('C*', htmlentities($m[0], ENT_COMPAT, 'UTF-8'));
    $out = '';

    // unpack() numbers the bytes from 1.
    $pos = 1;

    while (isset($bytes[$pos])) {
        $lead = $bytes[$pos];

        if ($lead < 0x80) {
            $out .= chr($lead);
            ++$pos;
            continue;
        }

        if ($lead >= 0xF0) {
            $codePoint = (($lead - 0xF0) << 18) + (($bytes[$pos + 1] - 0x80) << 12) + (($bytes[$pos + 2] - 0x80) << 6) + $bytes[$pos + 3] - 0x80;
            $pos += 4;
        } elseif ($lead >= 0xE0) {
            $codePoint = (($lead - 0xE0) << 12) + (($bytes[$pos + 1] - 0x80) << 6) + $bytes[$pos + 2] - 0x80;
            $pos += 3;
        } else {
            $codePoint = (($lead - 0xC0) << 6) + $bytes[$pos + 1] - 0x80;
            $pos += 2;
        }

        $out .= '&#'.$codePoint.';';
    }

    return $out;
}
613  
/**
 * preg_replace_callback() handler used by the title-case mode: lower-cases the match.
 *
 * @param array $s Match data; $s[0] is the matched UTF-8 text
 *
 * @return string|false The result of mb_convert_case()
 */
private static function title_case_lower($s)
{
    $matched = $s[0];

    return self::mb_convert_case($matched, MB_CASE_LOWER, 'UTF-8');
}
618  
/**
 * preg_replace_callback() handler used by the title-case mode: upper-cases the match.
 *
 * @param array $s Match data; $s[0] is the matched UTF-8 text
 *
 * @return string|false The result of mb_convert_case()
 */
private static function title_case_upper($s)
{
    $matched = $s[0];

    return self::mb_convert_case($matched, MB_CASE_UPPER, 'UTF-8');
}
623  
/**
 * Loads a data table from the Resources/unidata directory next to this file.
 *
 * @param string $file Base name of the table, without the ".php" extension
 *
 * @return mixed Whatever the data file returns, or false when the file does not exist
 */
private static function getData($file)
{
    $path = __DIR__.'/Resources/unidata/'.$file.'.php';

    if (!file_exists($path)) {
        return false;
    }

    return require $path;
}
632  
/**
 * Normalises an encoding name for use with iconv.
 *
 * @param string|null $encoding Encoding name in any case; null means the internal encoding
 *
 * @return string The internal encoding for null; "CP850" for "8BIT"/"BINARY";
 *                "UTF-8" for "UTF8"; otherwise the upper-cased name
 */
private static function getEncoding($encoding)
{
    if (null === $encoding) {
        return self::$internalEncoding;
    }

    $name = strtoupper($encoding);

    // Raw byte strings are handled as a single-byte charset.
    $isBinary = '8BIT' === $name || 'BINARY' === $name;
    if ($isBinary) {
        return 'CP850';
    }

    return 'UTF8' === $name ? 'UTF-8' : $name;
}
650 }