Remove Accents Without Using Iconv

How do I remove accents from characters in a PHP string?

I think the problem here is that your encodings consider ä and å different symbols to 'a'. In fact, the PHP documentation for strtr offers a sample for removing accents the ugly way :(

http://ie2.php.net/strtr

Remove accents without using iconv

Complete working code. I know this is long, but it is a reliable approach, taken from WordPress.

<?php

/**
 * Heuristically checks whether a byte string is laid out like UTF-8.
 *
 * Walks the string byte by byte: every lead byte announces how many
 * continuation bytes (10bbbbbb) must follow it. Legacy 5- and 6-byte
 * lead bytes are accepted as well. Only the byte layout is inspected.
 *
 * @param string $str Raw bytes to inspect.
 * @return bool True when every sequence is structurally complete, false otherwise.
 */
function seems_utf8($str)
{
    $total = strlen($str);
    $pos = 0;

    while ($pos < $total) {
        $lead = ord($str[$pos]);

        // Work out how many continuation bytes this lead byte requires.
        if ($lead < 0x80) {
            $trailing = 0; // 0bbbbbbb
        } elseif (($lead & 0xE0) == 0xC0) {
            $trailing = 1; // 110bbbbb
        } elseif (($lead & 0xF0) == 0xE0) {
            $trailing = 2; // 1110bbbb
        } elseif (($lead & 0xF8) == 0xF0) {
            $trailing = 3; // 11110bbb
        } elseif (($lead & 0xFC) == 0xF8) {
            $trailing = 4; // 111110bb
        } elseif (($lead & 0xFE) == 0xFC) {
            $trailing = 5; // 1111110b
        } else {
            return false; // Not a valid lead byte.
        }

        // Each announced continuation byte must exist and look like 10bbbbbb.
        while ($trailing > 0) {
            $pos++;
            if ($pos == $total) {
                return false; // The string ends in the middle of a sequence.
            }
            if ((ord($str[$pos]) & 0xC0) != 0x80) {
                return false;
            }
            $trailing--;
        }

        $pos++;
    }

    return true;
}

/**
 * Converts all accent characters to ASCII characters.
 *
 * If the string contains no bytes in the range 0x80-0xFF it is returned
 * unchanged. Otherwise the string is checked with seems_utf8(): when it
 * looks like UTF-8, a byte-sequence lookup table is applied; if not, it is
 * assumed to be ISO-8859-1 and a single-byte table is applied, followed by
 * the characters that expand to two letters (e.g. "ss", "AE").
 *
 * @param string $string Text that might have accent characters
 * @return string Filtered string with replaced "nice" characters.
 */
function remove_accents($string) {
    // Nothing to do when the string is pure 7-bit ASCII.
    if ( !preg_match('/[\x80-\xff]/', $string) )
        return $string;

    if (seems_utf8($string)) {
        $chars = array(
        // Decompositions for Latin-1 Supplement
        chr(195).chr(128) => 'A', chr(195).chr(129) => 'A',
        chr(195).chr(130) => 'A', chr(195).chr(131) => 'A',
        chr(195).chr(132) => 'A', chr(195).chr(133) => 'A',
        chr(195).chr(135) => 'C', chr(195).chr(136) => 'E',
        chr(195).chr(137) => 'E', chr(195).chr(138) => 'E',
        chr(195).chr(139) => 'E', chr(195).chr(140) => 'I',
        chr(195).chr(141) => 'I', chr(195).chr(142) => 'I',
        chr(195).chr(143) => 'I', chr(195).chr(145) => 'N',
        chr(195).chr(146) => 'O', chr(195).chr(147) => 'O',
        chr(195).chr(148) => 'O', chr(195).chr(149) => 'O',
        chr(195).chr(150) => 'O',
        // O with stroke (U+00D8), mapped like chr(216) in the ISO-8859-1 branch
        chr(195).chr(152) => 'O',
        chr(195).chr(153) => 'U',
        chr(195).chr(154) => 'U', chr(195).chr(155) => 'U',
        chr(195).chr(156) => 'U', chr(195).chr(157) => 'Y',
        chr(195).chr(159) => 's', chr(195).chr(160) => 'a',
        chr(195).chr(161) => 'a', chr(195).chr(162) => 'a',
        chr(195).chr(163) => 'a', chr(195).chr(164) => 'a',
        chr(195).chr(165) => 'a', chr(195).chr(167) => 'c',
        chr(195).chr(168) => 'e', chr(195).chr(169) => 'e',
        chr(195).chr(170) => 'e', chr(195).chr(171) => 'e',
        chr(195).chr(172) => 'i', chr(195).chr(173) => 'i',
        chr(195).chr(174) => 'i', chr(195).chr(175) => 'i',
        chr(195).chr(177) => 'n', chr(195).chr(178) => 'o',
        chr(195).chr(179) => 'o', chr(195).chr(180) => 'o',
        chr(195).chr(181) => 'o', chr(195).chr(182) => 'o',
        // o with stroke (U+00F8); this slot used to repeat the key for U+00F6
        chr(195).chr(184) => 'o', chr(195).chr(185) => 'u',
        chr(195).chr(186) => 'u', chr(195).chr(187) => 'u',
        chr(195).chr(188) => 'u', chr(195).chr(189) => 'y',
        chr(195).chr(191) => 'y',
        // Decompositions for Latin Extended-A
        chr(196).chr(128) => 'A', chr(196).chr(129) => 'a',
        chr(196).chr(130) => 'A', chr(196).chr(131) => 'a',
        chr(196).chr(132) => 'A', chr(196).chr(133) => 'a',
        chr(196).chr(134) => 'C', chr(196).chr(135) => 'c',
        chr(196).chr(136) => 'C', chr(196).chr(137) => 'c',
        chr(196).chr(138) => 'C', chr(196).chr(139) => 'c',
        chr(196).chr(140) => 'C', chr(196).chr(141) => 'c',
        chr(196).chr(142) => 'D', chr(196).chr(143) => 'd',
        chr(196).chr(144) => 'D', chr(196).chr(145) => 'd',
        chr(196).chr(146) => 'E', chr(196).chr(147) => 'e',
        chr(196).chr(148) => 'E', chr(196).chr(149) => 'e',
        chr(196).chr(150) => 'E', chr(196).chr(151) => 'e',
        chr(196).chr(152) => 'E', chr(196).chr(153) => 'e',
        chr(196).chr(154) => 'E', chr(196).chr(155) => 'e',
        chr(196).chr(156) => 'G', chr(196).chr(157) => 'g',
        chr(196).chr(158) => 'G', chr(196).chr(159) => 'g',
        chr(196).chr(160) => 'G', chr(196).chr(161) => 'g',
        chr(196).chr(162) => 'G', chr(196).chr(163) => 'g',
        chr(196).chr(164) => 'H', chr(196).chr(165) => 'h',
        chr(196).chr(166) => 'H', chr(196).chr(167) => 'h',
        chr(196).chr(168) => 'I', chr(196).chr(169) => 'i',
        chr(196).chr(170) => 'I', chr(196).chr(171) => 'i',
        chr(196).chr(172) => 'I', chr(196).chr(173) => 'i',
        chr(196).chr(174) => 'I', chr(196).chr(175) => 'i',
        chr(196).chr(176) => 'I', chr(196).chr(177) => 'i',
        chr(196).chr(178) => 'IJ',chr(196).chr(179) => 'ij',
        chr(196).chr(180) => 'J', chr(196).chr(181) => 'j',
        chr(196).chr(182) => 'K', chr(196).chr(183) => 'k',
        chr(196).chr(184) => 'k', chr(196).chr(185) => 'L',
        chr(196).chr(186) => 'l', chr(196).chr(187) => 'L',
        chr(196).chr(188) => 'l', chr(196).chr(189) => 'L',
        chr(196).chr(190) => 'l', chr(196).chr(191) => 'L',
        chr(197).chr(128) => 'l', chr(197).chr(129) => 'L',
        chr(197).chr(130) => 'l', chr(197).chr(131) => 'N',
        chr(197).chr(132) => 'n', chr(197).chr(133) => 'N',
        chr(197).chr(134) => 'n', chr(197).chr(135) => 'N',
        // U+0149 and U+014B are small letters, U+014A is the capital Eng
        chr(197).chr(136) => 'n', chr(197).chr(137) => 'n',
        chr(197).chr(138) => 'N', chr(197).chr(139) => 'n',
        chr(197).chr(140) => 'O', chr(197).chr(141) => 'o',
        chr(197).chr(142) => 'O', chr(197).chr(143) => 'o',
        chr(197).chr(144) => 'O', chr(197).chr(145) => 'o',
        chr(197).chr(146) => 'OE',chr(197).chr(147) => 'oe',
        chr(197).chr(148) => 'R',chr(197).chr(149) => 'r',
        chr(197).chr(150) => 'R',chr(197).chr(151) => 'r',
        chr(197).chr(152) => 'R',chr(197).chr(153) => 'r',
        chr(197).chr(154) => 'S',chr(197).chr(155) => 's',
        chr(197).chr(156) => 'S',chr(197).chr(157) => 's',
        chr(197).chr(158) => 'S',chr(197).chr(159) => 's',
        chr(197).chr(160) => 'S', chr(197).chr(161) => 's',
        chr(197).chr(162) => 'T', chr(197).chr(163) => 't',
        chr(197).chr(164) => 'T', chr(197).chr(165) => 't',
        chr(197).chr(166) => 'T', chr(197).chr(167) => 't',
        chr(197).chr(168) => 'U', chr(197).chr(169) => 'u',
        chr(197).chr(170) => 'U', chr(197).chr(171) => 'u',
        chr(197).chr(172) => 'U', chr(197).chr(173) => 'u',
        chr(197).chr(174) => 'U', chr(197).chr(175) => 'u',
        chr(197).chr(176) => 'U', chr(197).chr(177) => 'u',
        chr(197).chr(178) => 'U', chr(197).chr(179) => 'u',
        chr(197).chr(180) => 'W', chr(197).chr(181) => 'w',
        chr(197).chr(182) => 'Y', chr(197).chr(183) => 'y',
        chr(197).chr(184) => 'Y', chr(197).chr(185) => 'Z',
        chr(197).chr(186) => 'z', chr(197).chr(187) => 'Z',
        chr(197).chr(188) => 'z', chr(197).chr(189) => 'Z',
        chr(197).chr(190) => 'z', chr(197).chr(191) => 's',
        // Euro Sign
        chr(226).chr(130).chr(172) => 'E',
        // GBP (Pound) Sign
        chr(194).chr(163) => '');

        // The array form of strtr() replaces whole multi-byte keys in one pass.
        $string = strtr($string, $chars);
    } else {
        // Assume ISO-8859-1 if not UTF-8
        $chars['in'] = chr(128).chr(131).chr(138).chr(142).chr(154).chr(158)
            .chr(159).chr(162).chr(165).chr(181).chr(192).chr(193).chr(194)
            .chr(195).chr(196).chr(197).chr(199).chr(200).chr(201).chr(202)
            .chr(203).chr(204).chr(205).chr(206).chr(207).chr(209).chr(210)
            .chr(211).chr(212).chr(213).chr(214).chr(216).chr(217).chr(218)
            .chr(219).chr(220).chr(221).chr(224).chr(225).chr(226).chr(227)
            .chr(228).chr(229).chr(231).chr(232).chr(233).chr(234).chr(235)
            .chr(236).chr(237).chr(238).chr(239).chr(241).chr(242).chr(243)
            .chr(244).chr(245).chr(246).chr(248).chr(249).chr(250).chr(251)
            .chr(252).chr(253).chr(255);

        // Byte-for-byte counterpart of $chars['in']; both strings have the same length.
        $chars['out'] = "EfSZszYcYuAAAAAACEEEEIIIINOOOOOOUUUUYaaaaaaceeeeiiiinoooooouuuuyy";

        $string = strtr($string, $chars['in'], $chars['out']);

        // Characters that expand to two letters cannot go through the
        // equal-length strtr() form above, so they are replaced separately.
        $double_chars['in'] = array(chr(140), chr(156), chr(198), chr(208), chr(222), chr(223), chr(230), chr(240), chr(254));
        $double_chars['out'] = array('OE', 'oe', 'AE', 'DH', 'TH', 'ss', 'ae', 'dh', 'th');
        $string = str_replace($double_chars['in'], $double_chars['out'], $string);
    }

    return $string;
}

// Demo: run a short accented sample through remove_accents() and print it.
$str = "ÈâuÑ";
print(remove_accents($str)); // Output: EauN
?>

How to remove accents and turn letters into plain ASCII characters?

If you have iconv installed, try this (the example assumes your input string is in UTF-8):

echo iconv('UTF-8', 'ASCII//TRANSLIT', $string);

(iconv is a library to convert between all kinds of encodings; it's efficient and included with many PHP distributions by default. Most of all, it's definitely easier and more error-proof than trying to roll your own solution (did you know that there's a "Latin letter N with a curl"? Me neither.))

How to remove all of the diacritics from a file?

If you check the man page of the tool iconv:

//TRANSLIT

When the string "//TRANSLIT" is appended to --to-code, transliteration is activated. This means that when a character cannot be represented in the
target character set, it can be approximated through one or several similarly looking characters.

so we could do :

kent$  cat test1
Replace ā, á, ǎ, and à with a.
Replace ē, é, ě, and è with e.
Replace ī, í, ǐ, and ì with i.
Replace ō, ó, ǒ, and ò with o.
Replace ū, ú, ǔ, and ù with u.
Replace ǖ, ǘ, ǚ, and ǜ with ü.
Replace Ā, Á, Ǎ, and À with A.
Replace Ē, É, Ě, and È with E.
Replace Ī, Í, Ǐ, and Ì with I.
Replace Ō, Ó, Ǒ, and Ò with O.
Replace Ū, Ú, Ǔ, and Ù with U.
Replace Ǖ, Ǘ, Ǚ, and Ǜ with U.

kent$ iconv -f utf8 -t ascii//TRANSLIT test1
Replace a, a, a, and a with a.
Replace e, e, e, and e with e.
Replace i, i, i, and i with i.
Replace o, o, o, and o with o.
Replace u, u, u, and u with u.
Replace u, u, u, and u with u.
Replace A, A, A, and A with A.
Replace E, E, E, and E with E.
Replace I, I, I, and I with I.
Replace O, O, O, and O with O.
Replace U, U, U, and U with U.
Replace U, U, U, and U with U.

Unaccent string in bash script (RHEL)

You can use the -c option of iconv, which discards characters that cannot be converted to the target encoding, to remove non-ASCII characters:

$ echo 'été' | iconv -c -f utf8 -t ascii
t

If you just want to remove the accent:

$ echo 'été' | iconv -f utf8 -t ascii//TRANSLIT
ete

iconv separates accents from letter when using libiconv

The problem with diacritics is that they are processed differently depending on the language. In Arabic, for example, a diacritic is a character with its own Unicode code point, and when it joins an Arabic letter it remains a separate character from the parent letter. For example, this is a Meem letter "م" and this is a Dammah diacritic "ُ"; when the Dammah joins the Meem there are still 2 characters in the string. That is why you can post nearly empty posts on the SE network using these types of diacritics.

So removing these diacritics from a string is as simple as searching for these ~8 diacritics and replacing them with an empty string, while leaving the parent letters untouched.

$withoutDiacritic = str_replace(['ٌ','ُ','ً','َ','ٍ'], "", $string);

The problem with Latin characters is different: when a diacritic joins a letter, the result is a single character with its own Unicode code point. For example, when you join a diacritic to the letter "e", it becomes another Unicode character, "è". So you can't do what we do for Arabic diacritics (search for the diacritics and remove them); instead you must search for the "è" character and replace it with "e", and that is what node diacritics does.

I made a PHP version of node diacritics; don't forget to star the original project, as its authors did all the heavy lifting.

<?php

namespace PHPDiacritics;

/**
 * Replaces accented / decorated Latin characters with plain ASCII base letters,
 * using a fixed lookup table.
 */
class PHPDiacritics
{

    /**
     * Replacement table. Each entry has an ASCII "base" and a JSON string
     * literal ("chars") listing every character that maps to that base.
     *
     * @var array
     */
    protected $replacementList = [
        ["base" => " ", "chars" => '"\u00A0"'],
        ["base" => "0", "chars" => '"\u07C0"'],
        ["base" => "A", "chars" => '"\u24B6\uFF21\u00C0\u00C1\u00C2\u1EA6\u1EA4\u1EAA\u1EA8\u00C3\u0100\u0102\u1EB0\u1EAE\u1EB4\u1EB2\u0226\u01E0\u00C4\u01DE\u1EA2\u00C5\u01FA\u01CD\u0200\u0202\u1EA0\u1EAC\u1EB6\u1E00\u0104\u023A\u2C6F"'],
        ["base" => "AA", "chars" => '"\uA732"'],
        ["base" => "AE", "chars" => '"\u00C6\u01FC\u01E2"'],
        ["base" => "AO", "chars" => '"\uA734"'],
        ["base" => "AU", "chars" => '"\uA736"'],
        ["base" => "AV", "chars" => '"\uA738\uA73A"'],
        ["base" => "AY", "chars" => '"\uA73C"'],
        ["base" => "B", "chars" => '"\u24B7\uFF22\u1E02\u1E04\u1E06\u0243\u0181"'],
        ["base" => "C", "chars" => '"\u24b8\uff23\uA73E\u1E08\u0106\u0043\u0108\u010A\u010C\u00C7\u0187\u023B"'],
        ["base" => "D", "chars" => '"\u24B9\uFF24\u1E0A\u010E\u1E0C\u1E10\u1E12\u1E0E\u0110\u018A\u0189\u1D05\uA779"'],
        ["base" => "Dh", "chars" => '"\u00D0"'],
        ["base" => "DZ", "chars" => '"\u01F1\u01C4"'],
        ["base" => "Dz", "chars" => '"\u01F2\u01C5"'],
        ["base" => "E", "chars" => '"\u025B\u24BA\uFF25\u00C8\u00C9\u00CA\u1EC0\u1EBE\u1EC4\u1EC2\u1EBC\u0112\u1E14\u1E16\u0114\u0116\u00CB\u1EBA\u011A\u0204\u0206\u1EB8\u1EC6\u0228\u1E1C\u0118\u1E18\u1E1A\u0190\u018E\u1D07"'],
        ["base" => "F", "chars" => '"\uA77C\u24BB\uFF26\u1E1E\u0191\uA77B"'],
        ["base" => "G", "chars" => '"\u24BC\uFF27\u01F4\u011C\u1E20\u011E\u0120\u01E6\u0122\u01E4\u0193\uA7A0\uA77D\uA77E\u0262"'],
        ["base" => "H", "chars" => '"\u24BD\uFF28\u0124\u1E22\u1E26\u021E\u1E24\u1E28\u1E2A\u0126\u2C67\u2C75\uA78D"'],
        ["base" => "I", "chars" => '"\u24BE\uFF29\u00CC\u00CD\u00CE\u0128\u012A\u012C\u0130\u00CF\u1E2E\u1EC8\u01CF\u0208\u020A\u1ECA\u012E\u1E2C\u0197"'],
        ["base" => "J", "chars" => '"\u24BF\uFF2A\u0134\u0248\u0237"'],
        ["base" => "K", "chars" => '"\u24C0\uFF2B\u1E30\u01E8\u1E32\u0136\u1E34\u0198\u2C69\uA740\uA742\uA744\uA7A2"'],
        ["base" => "L", "chars" => '"\u24C1\uFF2C\u013F\u0139\u013D\u1E36\u1E38\u013B\u1E3C\u1E3A\u0141\u023D\u2C62\u2C60\uA748\uA746\uA780"'],
        ["base" => "LJ", "chars" => '"\u01C7"'],
        ["base" => "Lj", "chars" => '"\u01C8"'],
        ["base" => "M", "chars" => '"\u24C2\uFF2D\u1E3E\u1E40\u1E42\u2C6E\u019C\u03FB"'],
        ["base" => "N", "chars" => '"\uA7A4\u0220\u24C3\uFF2E\u01F8\u0143\u00D1\u1E44\u0147\u1E46\u0145\u1E4A\u1E48\u019D\uA790\u1D0E"'],
        ["base" => "NJ", "chars" => '"\u01CA"'],
        ["base" => "Nj", "chars" => '"\u01CB"'],
        ["base" => "O", "chars" => '"\u24C4\uFF2F\u00D2\u00D3\u00D4\u1ED2\u1ED0\u1ED6\u1ED4\u00D5\u1E4C\u022C\u1E4E\u014C\u1E50\u1E52\u014E\u022E\u0230\u00D6\u022A\u1ECE\u0150\u01D1\u020C\u020E\u01A0\u1EDC\u1EDA\u1EE0\u1EDE\u1EE2\u1ECC\u1ED8\u01EA\u01EC\u00D8\u01FE\u0186\u019F\uA74A\uA74C"'],
        ["base" => "OE", "chars" => '"\u0152"'],
        ["base" => "OI", "chars" => '"\u01A2"'],
        ["base" => "OO", "chars" => '"\uA74E"'],
        ["base" => "OU", "chars" => '"\u0222"'],
        ["base" => "P", "chars" => '"\u24C5\uFF30\u1E54\u1E56\u01A4\u2C63\uA750\uA752\uA754"'],
        ["base" => "Q", "chars" => '"\u24C6\uFF31\uA756\uA758\u024A"'],
        ["base" => "R", "chars" => '"\u24C7\uFF32\u0154\u1E58\u0158\u0210\u0212\u1E5A\u1E5C\u0156\u1E5E\u024C\u2C64\uA75A\uA7A6\uA782"'],
        ["base" => "S", "chars" => '"\u24C8\uFF33\u1E9E\u015A\u1E64\u015C\u1E60\u0160\u1E66\u1E62\u1E68\u0218\u015E\u2C7E\uA7A8\uA784"'],
        ["base" => "T", "chars" => '"\u24C9\uFF34\u1E6A\u0164\u1E6C\u021A\u0162\u1E70\u1E6E\u0166\u01AC\u01AE\u023E\uA786"'],
        ["base" => "Th", "chars" => '"\u00DE"'],
        ["base" => "TZ", "chars" => '"\uA728"'],
        ["base" => "U", "chars" => '"\u24CA\uFF35\u00D9\u00DA\u00DB\u0168\u1E78\u016A\u1E7A\u016C\u00DC\u01DB\u01D7\u01D5\u01D9\u1EE6\u016E\u0170\u01D3\u0214\u0216\u01AF\u1EEA\u1EE8\u1EEE\u1EEC\u1EF0\u1EE4\u1E72\u0172\u1E76\u1E74\u0244"'],
        ["base" => "V", "chars" => '"\u24CB\uFF36\u1E7C\u1E7E\u01B2\uA75E\u0245"'],
        ["base" => "VY", "chars" => '"\uA760"'],
        ["base" => "W", "chars" => '"\u24CC\uFF37\u1E80\u1E82\u0174\u1E86\u1E84\u1E88\u2C72"'],
        ["base" => "X", "chars" => '"\u24CD\uFF38\u1E8A\u1E8C"'],
        ["base" => "Y", "chars" => '"\u24CE\uFF39\u1EF2\u00DD\u0176\u1EF8\u0232\u1E8E\u0178\u1EF6\u1EF4\u01B3\u024E\u1EFE"'],
        ["base" => "Z", "chars" => '"\u24CF\uFF3A\u0179\u1E90\u017B\u017D\u1E92\u1E94\u01B5\u0224\u2C7F\u2C6B\uA762"'],
        ["base" => "a", "chars" => '"\u24D0\uFF41\u1E9A\u00E0\u00E1\u00E2\u1EA7\u1EA5\u1EAB\u1EA9\u00E3\u0101\u0103\u1EB1\u1EAF\u1EB5\u1EB3\u0227\u01E1\u00E4\u01DF\u1EA3\u00E5\u01FB\u01CE\u0201\u0203\u1EA1\u1EAD\u1EB7\u1E01\u0105\u2C65\u0250\u0251"'],
        ["base" => "aa", "chars" => '"\uA733"'],
        ["base" => "ae", "chars" => '"\u00E6\u01FD\u01E3"'],
        ["base" => "ao", "chars" => '"\uA735"'],
        ["base" => "au", "chars" => '"\uA737"'],
        ["base" => "av", "chars" => '"\uA739\uA73B"'],
        ["base" => "ay", "chars" => '"\uA73D"'],
        ["base" => "b", "chars" => '"\u24D1\uFF42\u1E03\u1E05\u1E07\u0180\u0183\u0253\u0182"'],
        ["base" => "c", "chars" => '"\uFF43\u24D2\u0107\u0109\u010B\u010D\u00E7\u1E09\u0188\u023C\uA73F\u2184"'],
        ["base" => "d", "chars" => '"\u24D3\uFF44\u1E0B\u010F\u1E0D\u1E11\u1E13\u1E0F\u0111\u018C\u0256\u0257\u018B\u13E7\u0501\uA7AA"'],
        ["base" => "dh", "chars" => '"\u00F0"'],
        ["base" => "dz", "chars" => '"\u01F3\u01C6"'],
        ["base" => "e", "chars" => '"\u24D4\uFF45\u00E8\u00E9\u00EA\u1EC1\u1EBF\u1EC5\u1EC3\u1EBD\u0113\u1E15\u1E17\u0115\u0117\u00EB\u1EBB\u011B\u0205\u0207\u1EB9\u1EC7\u0229\u1E1D\u0119\u1E19\u1E1B\u0247\u01DD"'],
        ["base" => "f", "chars" => '"\u24D5\uFF46\u1E1F\u0192"'],
        ["base" => "ff", "chars" => '"\uFB00"'],
        ["base" => "fi", "chars" => '"\uFB01"'],
        ["base" => "fl", "chars" => '"\uFB02"'],
        ["base" => "ffi", "chars" => '"\uFB03"'],
        ["base" => "ffl", "chars" => '"\uFB04"'],
        ["base" => "g", "chars" => '"\u24D6\uFF47\u01F5\u011D\u1E21\u011F\u0121\u01E7\u0123\u01E5\u0260\uA7A1\uA77F\u1D79"'],
        ["base" => "h", "chars" => '"\u24D7\uFF48\u0125\u1E23\u1E27\u021F\u1E25\u1E29\u1E2B\u1E96\u0127\u2C68\u2C76\u0265"'],
        ["base" => "hv", "chars" => '"\u0195"'],
        ["base" => "i", "chars" => '"\u24D8\uFF49\u00EC\u00ED\u00EE\u0129\u012B\u012D\u00EF\u1E2F\u1EC9\u01D0\u0209\u020B\u1ECB\u012F\u1E2D\u0268\u0131"'],
        ["base" => "j", "chars" => '"\u24D9\uFF4A\u0135\u01F0\u0249"'],
        ["base" => "k", "chars" => '"\u24DA\uFF4B\u1E31\u01E9\u1E33\u0137\u1E35\u0199\u2C6A\uA741\uA743\uA745\uA7A3"'],
        ["base" => "l", "chars" => '"\u24DB\uFF4C\u0140\u013A\u013E\u1E37\u1E39\u013C\u1E3D\u1E3B\u017F\u0142\u019A\u026B\u2C61\uA749\uA781\uA747\u026D"'],
        ["base" => "lj", "chars" => '"\u01C9"'],
        ["base" => "m", "chars" => '"\u24DC\uFF4D\u1E3F\u1E41\u1E43\u0271\u026F"'],
        ["base" => "n", "chars" => '"\u24DD\uFF4E\u01F9\u0144\u00F1\u1E45\u0148\u1E47\u0146\u1E4B\u1E49\u019E\u0272\u0149\uA791\uA7A5\u043B\u0509"'],
        ["base" => "nj", "chars" => '"\u01CC"'],
        ["base" => "o", "chars" => '"\u24DE\uFF4F\u00F2\u00F3\u00F4\u1ED3\u1ED1\u1ED7\u1ED5\u00F5\u1E4D\u022D\u1E4F\u014D\u1E51\u1E53\u014F\u022F\u0231\u00F6\u022B\u1ECF\u0151\u01D2\u020D\u020F\u01A1\u1EDD\u1EDB\u1EE1\u1EDF\u1EE3\u1ECD\u1ED9\u01EB\u01ED\u00F8\u01FF\uA74B\uA74D\u0275\u0254\u1D11"'],
        ["base" => "oe", "chars" => '"\u0153"'],
        ["base" => "oi", "chars" => '"\u01A3"'],
        ["base" => "oo", "chars" => '"\uA74F"'],
        ["base" => "ou", "chars" => '"\u0223"'],
        ["base" => "p", "chars" => '"\u24DF\uFF50\u1E55\u1E57\u01A5\u1D7D\uA751\uA753\uA755\u03C1"'],
        ["base" => "q", "chars" => '"\u24E0\uFF51\u024B\uA757\uA759"'],
        ["base" => "r", "chars" => '"\u24E1\uFF52\u0155\u1E59\u0159\u0211\u0213\u1E5B\u1E5D\u0157\u1E5F\u024D\u027D\uA75B\uA7A7\uA783"'],
        ["base" => "s", "chars" => '"\u24E2\uFF53\u015B\u1E65\u015D\u1E61\u0161\u1E67\u1E63\u1E69\u0219\u015F\u023F\uA7A9\uA785\u1E9B\u0282"'],
        ["base" => "ss", "chars" => '"\u00DF"'],
        ["base" => "t", "chars" => '"\u24E3\uFF54\u1E6B\u1E97\u0165\u1E6D\u021B\u0163\u1E71\u1E6F\u0167\u01AD\u0288\u2C66\uA787"'],
        ["base" => "th", "chars" => '"\u00FE"'],
        ["base" => "tz", "chars" => '"\uA729"'],
        ["base" => "u", "chars" => '"\u24E4\uFF55\u00F9\u00FA\u00FB\u0169\u1E79\u016B\u1E7B\u016D\u00FC\u01DC\u01D8\u01D6\u01DA\u1EE7\u016F\u0171\u01D4\u0215\u0217\u01B0\u1EEB\u1EE9\u1EEF\u1EED\u1EF1\u1EE5\u1E73\u0173\u1E77\u1E75\u0289"'],
        ["base" => "v", "chars" => '"\u24E5\uFF56\u1E7D\u1E7F\u028B\uA75F\u028C"'],
        ["base" => "vy", "chars" => '"\uA761"'],
        ["base" => "w", "chars" => '"\u24E6\uFF57\u1E81\u1E83\u0175\u1E87\u1E85\u1E98\u1E89\u2C73"'],
        ["base" => "x", "chars" => '"\u24E7\uFF58\u1E8B\u1E8D"'],
        ["base" => "y", "chars" => '"\u24E8\uFF59\u1EF3\u00FD\u0177\u1EF9\u0233\u1E8F\u00FF\u1EF7\u1E99\u1EF5\u01B4\u024F\u1EFF"'],
        ["base" => "z", "chars" => '"\u24E9\uFF5A\u017A\u1E91\u017C\u017E\u1E93\u1E95\u01B6\u0225\u0240\u2C6C\uA763"']
    ];

    /**
     * Flat lookup built by the constructor: one UTF-8 character => its base.
     *
     * @var array
     */
    protected $chars = [];

    /**
     * Encoding of the strings passed to removeDiacritics().
     *
     * @var string
     */
    protected $encoding;

    /**
     * Stores the working encoding and builds the per-character lookup table.
     *
     * @param string $encoding Encoding of the strings that will be processed.
     *                         Falls back to mb_internal_encoding(), then to
     *                         'UTF-8', when empty. Unsupported names are left
     *                         for the mb_* functions to report.
     */
    public function __construct($encoding = "")
    {
        if (!$encoding) {
            $encoding = mb_internal_encoding();
        }
        if (!$encoding) {
            $encoding = 'UTF-8';
        }
        $this->encoding = $encoding;

        // json_decode() always yields UTF-8, so the table is split as UTF-8
        // regardless of the encoding the caller works in.
        foreach ($this->replacementList as $entry) {
            $decoded = json_decode($entry["chars"]);
            if (!is_string($decoded) || $decoded === '') {
                continue;
            }
            $letters = preg_split('//u', $decoded, -1, PREG_SPLIT_NO_EMPTY);
            if ($letters === false) {
                continue;
            }
            foreach ($letters as $letter) {
                $this->chars[$letter] = $entry["base"];
            }
        }
    }

    /**
     * Replaces every character found in the lookup table with its base.
     *
     * Characters that are not in the table are left as they are.
     *
     * @param string $string Text in the encoding given to the constructor.
     * @return string The text with known characters replaced, in the same encoding.
     */
    public function removeDiacritics($string)
    {
        $string = (string) $string;
        if ($string === '') {
            return '';
        }

        if ($this->usesUtf8()) {
            // One pass over the string; bases such as "0" are applied too.
            return strtr($string, $this->chars);
        }

        // The table is keyed by UTF-8 characters, so other encodings are
        // converted to UTF-8 for the lookup and converted back afterwards.
        $utf8 = mb_convert_encoding($string, 'UTF-8', $this->encoding);

        return mb_convert_encoding(strtr($utf8, $this->chars), $this->encoding, 'UTF-8');
    }

    /**
     * Tells whether the configured encoding is UTF-8.
     *
     * @return bool
     */
    private function usesUtf8()
    {
        return strcasecmp($this->encoding, 'UTF-8') === 0
            || strcasecmp($this->encoding, 'UTF8') === 0;
    }

}

Using the class

// Build the converter once, then run each sample word through it.
$phpDiacritics = new PHPDiacritics('UTF-8');

$test1 = "Athènes";
$test2 = "Gdańsk";
$test3 = "niño";

// Print one converted word per line.
foreach ([$test1, $test2, $test3] as $sample) {
    echo $phpDiacritics->removeDiacritics($sample), "\n";
}

This outputs

Athenes
Gdansk
nino


Related Topics



Leave a reply



Submit