*/
protected function getStringBetween($str, $start, $end)
{
- $expStr = explode($start, $str, 2);
+ $expStr = mb_split($start, $str, 2);
if (count($expStr) > 1) {
- $expStr = explode($end, $expStr[ 1 ]);
+ $expStr = mb_split($end, $expStr[ 1 ]);
if (count($expStr) > 1) {
array_pop($expStr);
*/
protected function isPartOfWord($text)
{
- return ctype_alnum(str_replace($this->config->getSpecialCaseChars(), '', $text));
+ // Strip the configured special-case characters first, then test whatever
+ // remains with the Unicode-aware alphanumeric check.
+ $withoutSpecialChars = str_replace($this->config->getSpecialCaseChars(), '', $text);
+
+ return $this->ctypeAlphanumUnicode($withoutSpecialChars);
}
/**
$mode = 'character';
$current_word = '';
$words = array();
+ $keepNewLines = $this->getConfig()->isKeepNewLines();
foreach ($characterString as $i => $character) {
switch ($mode) {
case 'character':
$current_word = '<';
$mode = 'tag';
- } elseif (preg_match("/\s/", $character)) {
+ } elseif (preg_match("/\s/u", $character)) {
if ($current_word !== '') {
$words[] = $current_word;
}
- $current_word = preg_replace('/\s+/S', ' ', $character);
+ $current_word = $keepNewLines ? $character : preg_replace('/\s+/Su', ' ', $character);
$mode = 'whitespace';
} else {
if (
- (ctype_alnum($character) && (strlen($current_word) == 0 || $this->isPartOfWord($current_word))) ||
+ (($this->ctypeAlphanumUnicode($character)) && (mb_strlen($current_word) == 0 || $this->isPartOfWord($current_word))) ||
(in_array($character, $this->config->getSpecialCaseChars()) && isset($characterString[$i + 1]) && $this->isPartOfWord($characterString[$i + 1]))
) {
$current_word .= $character;
$words[] = $current_word;
$current_word = '';
- if (!preg_match('[^\s]', $character)) {
+ if (!preg_match('[^\s]u', $character)) {
$mode = 'whitespace';
} else {
$mode = 'character';
}
$current_word = '<';
$mode = 'tag';
- } elseif (preg_match("/\s/", $character)) {
+ } elseif (preg_match("/\s/u", $character)) {
$current_word .= $character;
- $current_word = preg_replace('/\s+/S', ' ', $current_word);
+ if (!$keepNewLines) $current_word = preg_replace('/\s+/Su', ' ', $current_word);
} else {
if ($current_word != '') {
$words[] = $current_word;
*/
protected function isWhiteSpace($value)
{
- return !preg_match('[^\s]', $value);
+ // "[" and "]" are valid bracket-style PCRE delimiters, so '[^\s]u' was
+ // compiled as /^\s/u ("starts with whitespace") and the result came out
+ // inverted. Explicit "/" delimiters keep [^\s] as a negated character class:
+ // the value is whitespace when it contains no non-whitespace character.
+ return !preg_match('/[^\s]/u', $value);
}
/**
protected function explode($value)
{
// as suggested by @onassar
- return preg_split('//u', $value);
+ // An empty pattern with the "u" modifier splits between every UTF-8
+ // character; PREG_SPLIT_NO_EMPTY drops the empty leading/trailing pieces.
+ $characters = preg_split('//u', $value, -1, PREG_SPLIT_NO_EMPTY);
+
+ return $characters;
+ }
+
+ /**
+ * Unicode-aware counterpart of ctype_alnum().
+ *
+ * Returns true only when $str is non-empty and every character is an ASCII
+ * letter, an ASCII digit, or a member of the Unicode "Letter" category (\pL).
+ *
+ * @param string $str
+ *
+ * @return bool
+ */
+ protected function ctypeAlphanumUnicode($str)
+ {
+ // "D" (PCRE_DOLLAR_ENDONLY) stops "$" from matching just before a trailing
+ // newline, so "abc\n" is rejected exactly as ctype_alnum() rejected it.
+ // Comparing against 1 converts preg_match()'s int|false result into the
+ // bool promised by the docblock.
+ return preg_match("/^[a-zA-Z0-9\pL]+$/Du", $str) === 1;
}
}