Excuse me for necroposting, but maybe somebody will find this useful: the code below works as a replacement for both the preg_match and preg_match_all functions and returns the correct matches with correct character offsets for UTF-8-encoded strings.
mb_internal_encoding('UTF-8');

/**
 * Runs preg_match or preg_match_all with PREG_OFFSET_CAPTURE and converts the
 * byte offsets reported by PCRE into character offsets for UTF-8 subjects.
 *
 * The result has the same layout as the $matches array filled by preg_match /
 * preg_match_all when PREG_OFFSET_CAPTURE is used: every entry is
 * array(matchedText, offset), except that the offset is counted in characters
 * instead of bytes. A capture group that did not participate in the match
 * keeps the offset -1. Entries are re-indexed numerically in the order PCRE
 * returned them.
 *
 * An empty array is returned when nothing matched or when the preg_* call
 * returned false.
 *
 * @param bool   $matchAll If true, execute preg_match_all, otherwise preg_match
 * @param string $pattern  The pattern to search for, as a string.
 * @param string $subject  The input string (expected to be UTF-8 encoded).
 * @param int    $offset   The place from which to start the search (in bytes).
 * @return array
 */
function pregMatchCapture($matchAll, $pattern, $subject, $offset = 0)
{
    $matchInfo = array();
    $method = $matchAll ? 'preg_match_all' : 'preg_match';

    $n = $method($pattern, $subject, $matchInfo, PREG_OFFSET_CAPTURE, $offset);

    // false (PCRE error) and 0 (no match) both yield an empty result.
    if ($n === false || $n === 0 || empty($matchInfo)) {
        return array();
    }

    // preg_match fills a flat list of groups; wrap it so that both modes can
    // be processed uniformly as a list of lists.
    if (!$matchAll) {
        $matchInfo = array($matchInfo);
    }

    $result = array();
    foreach ($matchInfo as $matches) {
        $positions = array();
        foreach ($matches as $match) {
            $matchedText = $match[0];
            $byteOffset = $match[1];

            // A negative offset marks a group that did not participate in the
            // match. Passing it to mb_strcut() as a length would cut relative
            // to the end of the subject and produce a bogus position, so it
            // is passed through unchanged.
            if ($byteOffset < 0) {
                $charOffset = -1;
            } else {
                // The number of characters in the first $byteOffset bytes is
                // the character offset. The encoding is given explicitly so
                // the result does not depend on mb_internal_encoding().
                $charOffset = mb_strlen(mb_strcut($subject, 0, $byteOffset, 'UTF-8'), 'UTF-8');
            }

            $positions[] = array($matchedText, $charOffset);
        }
        $result[] = $positions;
    }

    return $matchAll ? $result : $result[0];
}

$s1 = 'Попробуем русскую строку для теста';
$s2 = 'Try english string for test';

var_dump(pregMatchCapture(true, '/обу/', $s1));
var_dump(pregMatchCapture(false, '/обу/', $s1));
var_dump(pregMatchCapture(true, '/lish/', $s2));
var_dump(pregMatchCapture(false, '/lish/', $s2));
Output of my example:
array(1) { [0]=> array(1) { [0]=> array(2) { [0]=> string(6) "обу" [1]=> int(4) } } } array(1) { [0]=> array(2) { [0]=> string(6) "обу" [1]=> int(4) } } array(1) { [0]=> array(1) { [0]=> array(2) { [0]=> string(4) "lish" [1]=> int(7) } } } array(1) { [0]=> array(2) { [0]=> string(4) "lish" [1]=> int(7) } }