How to convert characters to HTML entities using plain JavaScript
With the help of bucabay and the advice to create my own function i created this one which works for me. What do you guys think, is there a better solution somewhere?
if(typeof escapeHtmlEntities == 'undefined') {
escapeHtmlEntities = function (text) {
return text.replace(/[\u00A0-\u2666<>\&]/g, function(c) {
return '&' +
(escapeHtmlEntities.entityTable[c.charCodeAt(0)] || '#'+c.charCodeAt(0)) + ';';
});
};
// all HTML4 entities as defined here: http://www.w3.org/TR/html4/sgml/entities.html
// added: amp, lt, gt, quot and apos
escapeHtmlEntities.entityTable = {
34 : 'quot',
38 : 'amp',
39 : 'apos',
60 : 'lt',
62 : 'gt',
160 : 'nbsp',
161 : 'iexcl',
162 : 'cent',
163 : 'pound',
164 : 'curren',
165 : 'yen',
166 : 'brvbar',
167 : 'sect',
168 : 'uml',
169 : 'copy',
170 : 'ordf',
171 : 'laquo',
172 : 'not',
173 : 'shy',
174 : 'reg',
175 : 'macr',
176 : 'deg',
177 : 'plusmn',
178 : 'sup2',
179 : 'sup3',
180 : 'acute',
181 : 'micro',
182 : 'para',
183 : 'middot',
184 : 'cedil',
185 : 'sup1',
186 : 'ordm',
187 : 'raquo',
188 : 'frac14',
189 : 'frac12',
190 : 'frac34',
191 : 'iquest',
192 : 'Agrave',
193 : 'Aacute',
194 : 'Acirc',
195 : 'Atilde',
196 : 'Auml',
197 : 'Aring',
198 : 'AElig',
199 : 'Ccedil',
200 : 'Egrave',
201 : 'Eacute',
202 : 'Ecirc',
203 : 'Euml',
204 : 'Igrave',
205 : 'Iacute',
206 : 'Icirc',
207 : 'Iuml',
208 : 'ETH',
209 : 'Ntilde',
210 : 'Ograve',
211 : 'Oacute',
212 : 'Ocirc',
213 : 'Otilde',
214 : 'Ouml',
215 : 'times',
216 : 'Oslash',
217 : 'Ugrave',
218 : 'Uacute',
219 : 'Ucirc',
220 : 'Uuml',
221 : 'Yacute',
222 : 'THORN',
223 : 'szlig',
224 : 'agrave',
225 : 'aacute',
226 : 'acirc',
227 : 'atilde',
228 : 'auml',
229 : 'aring',
230 : 'aelig',
231 : 'ccedil',
232 : 'egrave',
233 : 'eacute',
234 : 'ecirc',
235 : 'euml',
236 : 'igrave',
237 : 'iacute',
238 : 'icirc',
239 : 'iuml',
240 : 'eth',
241 : 'ntilde',
242 : 'ograve',
243 : 'oacute',
244 : 'ocirc',
245 : 'otilde',
246 : 'ouml',
247 : 'divide',
248 : 'oslash',
249 : 'ugrave',
250 : 'uacute',
251 : 'ucirc',
252 : 'uuml',
253 : 'yacute',
254 : 'thorn',
255 : 'yuml',
402 : 'fnof',
913 : 'Alpha',
914 : 'Beta',
915 : 'Gamma',
916 : 'Delta',
917 : 'Epsilon',
918 : 'Zeta',
919 : 'Eta',
920 : 'Theta',
921 : 'Iota',
922 : 'Kappa',
923 : 'Lambda',
924 : 'Mu',
925 : 'Nu',
926 : 'Xi',
927 : 'Omicron',
928 : 'Pi',
929 : 'Rho',
931 : 'Sigma',
932 : 'Tau',
933 : 'Upsilon',
934 : 'Phi',
935 : 'Chi',
936 : 'Psi',
937 : 'Omega',
945 : 'alpha',
946 : 'beta',
947 : 'gamma',
948 : 'delta',
949 : 'epsilon',
950 : 'zeta',
951 : 'eta',
952 : 'theta',
953 : 'iota',
954 : 'kappa',
955 : 'lambda',
956 : 'mu',
957 : 'nu',
958 : 'xi',
959 : 'omicron',
960 : 'pi',
961 : 'rho',
962 : 'sigmaf',
963 : 'sigma',
964 : 'tau',
965 : 'upsilon',
966 : 'phi',
967 : 'chi',
968 : 'psi',
969 : 'omega',
977 : 'thetasym',
978 : 'upsih',
982 : 'piv',
8226 : 'bull',
8230 : 'hellip',
8242 : 'prime',
8243 : 'Prime',
8254 : 'oline',
8260 : 'frasl',
8472 : 'weierp',
8465 : 'image',
8476 : 'real',
8482 : 'trade',
8501 : 'alefsym',
8592 : 'larr',
8593 : 'uarr',
8594 : 'rarr',
8595 : 'darr',
8596 : 'harr',
8629 : 'crarr',
8656 : 'lArr',
8657 : 'uArr',
8658 : 'rArr',
8659 : 'dArr',
8660 : 'hArr',
8704 : 'forall',
8706 : 'part',
8707 : 'exist',
8709 : 'empty',
8711 : 'nabla',
8712 : 'isin',
8713 : 'notin',
8715 : 'ni',
8719 : 'prod',
8721 : 'sum',
8722 : 'minus',
8727 : 'lowast',
8730 : 'radic',
8733 : 'prop',
8734 : 'infin',
8736 : 'ang',
8743 : 'and',
8744 : 'or',
8745 : 'cap',
8746 : 'cup',
8747 : 'int',
8756 : 'there4',
8764 : 'sim',
8773 : 'cong',
8776 : 'asymp',
8800 : 'ne',
8801 : 'equiv',
8804 : 'le',
8805 : 'ge',
8834 : 'sub',
8835 : 'sup',
8836 : 'nsub',
8838 : 'sube',
8839 : 'supe',
8853 : 'oplus',
8855 : 'otimes',
8869 : 'perp',
8901 : 'sdot',
8968 : 'lceil',
8969 : 'rceil',
8970 : 'lfloor',
8971 : 'rfloor',
9001 : 'lang',
9002 : 'rang',
9674 : 'loz',
9824 : 'spades',
9827 : 'clubs',
9829 : 'hearts',
9830 : 'diams',
338 : 'OElig',
339 : 'oelig',
352 : 'Scaron',
353 : 'scaron',
376 : 'Yuml',
710 : 'circ',
732 : 'tilde',
8194 : 'ensp',
8195 : 'emsp',
8201 : 'thinsp',
8204 : 'zwnj',
8205 : 'zwj',
8206 : 'lrm',
8207 : 'rlm',
8211 : 'ndash',
8212 : 'mdash',
8216 : 'lsquo',
8217 : 'rsquo',
8218 : 'sbquo',
8220 : 'ldquo',
8221 : 'rdquo',
8222 : 'bdquo',
8224 : 'dagger',
8225 : 'Dagger',
8240 : 'permil',
8249 : 'lsaquo',
8250 : 'rsaquo',
8364 : 'euro'
};
}
usage example:
var text = "Übergroße Äpfel mit Würmern";
alert(escapeHtmlEntities (text));
result:
Übergroße Äpfel mit Würmern
Update1: Thanks bucabay again for the || - hint
Update2: Updated entity table with amp,lt,gt,apos,quot, thanks
richardtallent for the hintUpdate3(in 2014): Mathias Bynens created a lib called 'he', maybe it serves your need.
Encode HTML entities in JavaScript
You can use regex to replace any character in a given unicode range with its html entity equivalent. The code would look something like this:
var encodedStr = rawStr.replace(/[\u00A0-\u9999<>\&]/g, function(i) {
return ''+i.charCodeAt(0)+';';
});
This code will replace all characters in the given range (unicode 00A0 - 9999, as well as ampersand, greater & less than) with their html entity equivalents, which is simply nnn;
where nnn
is the unicode value we get from charCodeAt
.
See it in action here: http://jsfiddle.net/E3EqX/13/ (this example uses jQuery for element selectors used in the example. The base code itself, above, does not use jQuery)
Making these conversions does not solve all the problems -- make sure you're using UTF8 character encoding, make sure your database is storing the strings in UTF8. You still may see instances where the characters do not display correctly, depending on system font configuration and other issues out of your control.
Documentation
String.charCodeAt
- https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/charCodeAt- HTML Character entities - http://www.chucke.com/entities.html
Convert special characters to HTML in JavaScript
You need a function that does something like
return mystring.replace(/&/g, "&").replace(/>/g, ">").replace(/</g, "<").replace(/"/g, """);
But taking into account your desire for different handling of single/double quotes.
A plain JavaScript way to decode HTML entities, works on both browsers and Node
There are many similar questions and useful answers in stackoverflow but I can't find a way works both on browsers and Node.js. So I'd like to share my opinion.
For html codes like
<
>
'
and even Chinese characters.
I suggest to use this function. (Inspired by some other answers)
function decodeEntities(encodedString) {
var translate_re = /&(nbsp|amp|quot|lt|gt);/g;
var translate = {
"nbsp":" ",
"amp" : "&",
"quot": "\"",
"lt" : "<",
"gt" : ">"
};
return encodedString.replace(translate_re, function(match, entity) {
return translate[entity];
}).replace(/(\d+);/gi, function(match, numStr) {
var num = parseInt(numStr, 10);
return String.fromCharCode(num);
});
}
This implement also works in Node.js environment.
decodeEntities("哈哈 '这个'&"那个"好玩<>") //哈哈 '这个'&"那个"好玩<>
As a new user, I only have 1 reputation :(
I can't make comments or answers to existing posts so that's the only way I can do for now.
Edit 1
I think this answer works even better than mine. Although no one gave him up vote.
How to convert unicode characters to HTML numeric entities using plain Javascript
Javascript strings are UTF-16. A character in the surrogate range takes up two 16-bit words. The length
property of a string is the count of the number of 16-bit words. Thus "quot;.length
will return 2.
codePointAt(i)
is not the ith character, but the ith 16-bit word. Hence, a surrogate character will appear over two consecutive codePointAt
invocations. From the specs, if "quot;.toString(0)
is the high surrogate, the function will return the code point value, ie 128578, but "quot;.toString(1)
will return only the lower surrogate 56898, that black diamond.
Thus you need to skip one position if codePointAt
returns a high surrogate.
Following the example in the specs, instead of iterating through each 16-bit word in the string, use a method that loops through each character. for let (char in aString) {}
does just that.
function html_encode(s) {
var ret_val = '';
for (let char of s) {
const code = char.codePointAt(0);
if (code > 127) {
ret_val += '' + code + ';';
} else {
ret_val += char;
}
}
return ret_val;
}
let v = html_encode(document.getElementById('orig').innerHTML);
document.getElementById('copy').innerHTML = v;
document.getElementById('values').value = v;
div {
padding:10px;
border:solid 1px grey;
}
textarea {
width:calc(100% - 30px);
height:50px;
padding:10px;
}
Orig:<div id='orig'>1:__2:__3:ß__4:Ü__5:X__6:Y__7:팆__8:Z__9:⚠️__10:⚠️__11:⚠__12:🙂</div>
Copy:<div id='copy'></div>
Values:<textarea id='values'></textarea>
HTML Entity Decode
You could try something like:
var Title = $('<textarea />').html("Chris' corner").text();console.log(Title);
<script src="https://ajax.googleapis.com/ajax/libs/jquery/2.1.1/jquery.min.js"></script>
Native JavaScript or ES6 way to encode and decode HTML entities?
There is no native function in the JavaScript API that convert ASCII characters to their "html-entities" equivalent.
Here is a beginning of a solution and an easy trick that you may like
Unescape HTML entities in JavaScript?
EDIT: You should use the DOMParser API as Wladimir suggests, I edited my previous answer since the function posted introduced a security vulnerability.
The following snippet is the old answer's code with a small modification: using a textarea
instead of a div
reduces the XSS vulnerability, but it is still problematic in IE9 and Firefox.
function htmlDecode(input){
var e = document.createElement('textarea');
e.innerHTML = input;
// handle case of empty input
return e.childNodes.length === 0 ? "" : e.childNodes[0].nodeValue;
}
htmlDecode("<img src='myimage.jpg'>");
// returns "<img src='myimage.jpg'>"
Basically I create a DOM element programmatically, assign the encoded HTML to its innerHTML and retrieve the nodeValue from the text node created on the innerHTML insertion. Since it just creates an element but never adds it, no site HTML is modified.
It will work cross-browser (including older browsers) and accept all the HTML Character Entities.
EDIT: The old version of this code did not work on IE with blank inputs, as evidenced here on jsFiddle (view in IE). The version above works with all inputs.
UPDATE: appears this doesn't work with large string, and it also introduces a security vulnerability, see comments.
Convert HTML Character Entities back to regular text using javascript
You could do something like this:
String.prototype.decodeHTML = function() {
var map = {"gt":">" /* , … */};
return this.replace(/&(#(?:x[0-9a-f]+|\d+)|[a-z]+);?/gi, function($0, $1) {
if ($1[0] === "#") {
return String.fromCharCode($1[1].toLowerCase() === "x" ? parseInt($1.substr(2), 16) : parseInt($1.substr(1), 10));
} else {
return map.hasOwnProperty($1) ? map[$1] : $0;
}
});
};
Related Topics
Onchange Event Handler for Radio Button (Input Type="Radio") Doesn't Work as One Value
How to Stop a Web Page from Scrolling to the Top When a Link Is Clicked That Triggers JavaScript
Detect Which Word Has Been Clicked on Within a Text
Disable Pasting Text into HTML Form
How to Get Selected Text from a Textbox Control With JavaScript
Html5 Canvas Resize (Downscale) Image High Quality
Onchange Event on Input Type=Range Is Not Triggering in Firefox While Dragging
Are HTML Comments Inside Script Tags a Best Practice
Use Html5 to Resize an Image Before Upload
JavaScript; Communication Between Tabs/Windows With Same Origin
Populate One Dropdown Based on Selection in Another
Difference Between ≪Script Src="Foo.Js"≫≪/Script≫ and ≪Script Src="Foo.Js"/≫
How to Scroll to an Element Inside a Div
How to Store Objects in Html5 Localstorage