Postgresql - Replace HTML Entities

PostgreSQL replace HTML entities function

Yes it is possible to replace HTML or other character entities with the respective characters using a function. First create a character entity table:

-- Lookup table mapping an entity name (the text between '&' and ';')
-- to the single character it stands for. Each name and each character
-- may appear at most once.
create table character_entity (
    name text,
    ch   character(1),
    primary key (name),
    unique (ch)
);
-- Seed character_entity with the named character entities.
-- Each tuple is (character, entity name): the character is written as a
-- Unicode escape and the name is stored without the leading '&' and the
-- trailing ';'. Note that the column list is (ch, name), i.e. the reverse
-- of the order in which the columns are declared in the table.
insert into character_entity (ch, name) values
(E'\u00C6','AElig'),(E'\u00C1','Aacute'),(E'\u00C2','Acirc'),(E'\u00C0','Agrave'),(E'\u0391','Alpha'),(E'\u00C5','Aring'),(E'\u00C3','Atilde'),(E'\u00C4','Auml'),(E'\u0392','Beta'),(E'\u00C7','Ccedil'),
(E'\u03A7','Chi'),(E'\u2021','Dagger'),(E'\u0394','Delta'),(E'\u00D0','ETH'),(E'\u00C9','Eacute'),(E'\u00CA','Ecirc'),(E'\u00C8','Egrave'),(E'\u0395','Epsilon'),(E'\u0397','Eta'),(E'\u00CB','Euml'),
(E'\u0393','Gamma'),(E'\u00CD','Iacute'),(E'\u00CE','Icirc'),(E'\u00CC','Igrave'),(E'\u0399','Iota'),(E'\u00CF','Iuml'),(E'\u039A','Kappa'),(E'\u039B','Lambda'),(E'\u039C','Mu'),(E'\u00D1','Ntilde'),
(E'\u039D','Nu'),(E'\u0152','OElig'),(E'\u00D3','Oacute'),(E'\u00D4','Ocirc'),(E'\u00D2','Ograve'),(E'\u03A9','Omega'),(E'\u039F','Omicron'),(E'\u00D8','Oslash'),(E'\u00D5','Otilde'),(E'\u00D6','Ouml'),
(E'\u03A6','Phi'),(E'\u03A0','Pi'),(E'\u2033','Prime'),(E'\u03A8','Psi'),(E'\u03A1','Rho'),(E'\u0160','Scaron'),(E'\u03A3','Sigma'),(E'\u00DE','THORN'),(E'\u03A4','Tau'),(E'\u0398','Theta'),
(E'\u00DA','Uacute'),(E'\u00DB','Ucirc'),(E'\u00D9','Ugrave'),(E'\u03A5','Upsilon'),(E'\u00DC','Uuml'),(E'\u039E','Xi'),(E'\u00DD','Yacute'),(E'\u0178','Yuml'),(E'\u0396','Zeta'),(E'\u00E1','aacute'),
(E'\u00E2','acirc'),(E'\u00B4','acute'),(E'\u00E6','aelig'),(E'\u00E0','agrave'),(E'\u2135','alefsym'),(E'\u03B1','alpha'),(E'\u0026','amp'),(E'\u2227','and'),(E'\u2220','ang'),(E'\u00E5','aring'),
(E'\u2248','asymp'),(E'\u00E3','atilde'),(E'\u00E4','auml'),(E'\u201E','bdquo'),(E'\u03B2','beta'),(E'\u00A6','brvbar'),(E'\u2022','bull'),(E'\u2229','cap'),(E'\u00E7','ccedil'),(E'\u00B8','cedil'),
(E'\u00A2','cent'),(E'\u03C7','chi'),(E'\u02C6','circ'),(E'\u2663','clubs'),(E'\u2245','cong'),(E'\u00A9','copy'),(E'\u21B5','crarr'),(E'\u222A','cup'),(E'\u00A4','curren'),(E'\u21D3','dArr'),
(E'\u2020','dagger'),(E'\u2193','darr'),(E'\u00B0','deg'),(E'\u03B4','delta'),(E'\u2666','diams'),(E'\u00F7','divide'),(E'\u00E9','eacute'),(E'\u00EA','ecirc'),(E'\u00E8','egrave'),(E'\u2205','empty'),
(E'\u2003','emsp'),(E'\u2002','ensp'),(E'\u03B5','epsilon'),(E'\u2261','equiv'),(E'\u03B7','eta'),(E'\u00F0','eth'),(E'\u00EB','euml'),(E'\u20AC','euro'),(E'\u2203','exist'),(E'\u0192','fnof'),
(E'\u2200','forall'),(E'\u00BD','frac12'),(E'\u00BC','frac14'),(E'\u00BE','frac34'),(E'\u2044','frasl'),(E'\u03B3','gamma'),(E'\u2265','ge'),(E'\u003E','gt'),(E'\u21D4','hArr'),(E'\u2194','harr'),
(E'\u2665','hearts'),(E'\u2026','hellip'),(E'\u00ED','iacute'),(E'\u00EE','icirc'),(E'\u00A1','iexcl'),(E'\u00EC','igrave'),(E'\u2111','image'),(E'\u221E','infin'),(E'\u222B','int'),(E'\u03B9','iota'),
(E'\u00BF','iquest'),(E'\u2208','isin'),(E'\u00EF','iuml'),(E'\u03BA','kappa'),(E'\u21D0','lArr'),(E'\u03BB','lambda'),(E'\u2329','lang'),(E'\u00AB','laquo'),(E'\u2190','larr'),(E'\u2308','lceil'),
(E'\u201C','ldquo'),(E'\u2264','le'),(E'\u230A','lfloor'),(E'\u2217','lowast'),(E'\u25CA','loz'),(E'\u200E','lrm'),(E'\u2039','lsaquo'),(E'\u2018','lsquo'),(E'\u003C','lt'),(E'\u00AF','macr'),
(E'\u2014','mdash'),(E'\u00B5','micro'),(E'\u00B7','middot'),(E'\u2212','minus'),(E'\u03BC','mu'),(E'\u2207','nabla'),(E'\u00A0','nbsp'),(E'\u2013','ndash'),(E'\u2260','ne'),(E'\u220B','ni'),
(E'\u00AC','not'),(E'\u2209','notin'),(E'\u2284','nsub'),(E'\u00F1','ntilde'),(E'\u03BD','nu'),(E'\u00F3','oacute'),(E'\u00F4','ocirc'),(E'\u0153','oelig'),(E'\u00F2','ograve'),(E'\u203E','oline'),
(E'\u03C9','omega'),(E'\u03BF','omicron'),(E'\u2295','oplus'),(E'\u2228','or'),(E'\u00AA','ordf'),(E'\u00BA','ordm'),(E'\u00F8','oslash'),(E'\u00F5','otilde'),(E'\u2297','otimes'),(E'\u00F6','ouml'),
(E'\u00B6','para'),(E'\u2202','part'),(E'\u2030','permil'),(E'\u22A5','perp'),(E'\u03C6','phi'),(E'\u03C0','pi'),(E'\u03D6','piv'),(E'\u00B1','plusmn'),(E'\u00A3','pound'),(E'\u2032','prime'),
(E'\u220F','prod'),(E'\u221D','prop'),(E'\u03C8','psi'),(E'\u0022','quot'),(E'\u21D2','rArr'),(E'\u221A','radic'),(E'\u232A','rang'),(E'\u00BB','raquo'),(E'\u2192','rarr'),(E'\u2309','rceil'),
(E'\u201D','rdquo'),(E'\u211C','real'),(E'\u00AE','reg'),(E'\u230B','rfloor'),(E'\u03C1','rho'),(E'\u200F','rlm'),(E'\u203A','rsaquo'),(E'\u2019','rsquo'),(E'\u201A','sbquo'),(E'\u0161','scaron'),
(E'\u22C5','sdot'),(E'\u00A7','sect'),(E'\u00AD','shy'),(E'\u03C3','sigma'),(E'\u03C2','sigmaf'),(E'\u223C','sim'),(E'\u2660','spades'),(E'\u2282','sub'),(E'\u2286','sube'),(E'\u2211','sum'),
(E'\u2283','sup'),(E'\u00B9','sup1'),(E'\u00B2','sup2'),(E'\u00B3','sup3'),(E'\u2287','supe'),(E'\u00DF','szlig'),(E'\u03C4','tau'),(E'\u2234','there4'),(E'\u03B8','theta'),(E'\u03D1','thetasym'),
(E'\u2009','thinsp'),(E'\u00FE','thorn'),(E'\u02DC','tilde'),(E'\u00D7','times'),(E'\u2122','trade'),(E'\u21D1','uArr'),(E'\u00FA','uacute'),(E'\u2191','uarr'),(E'\u00FB','ucirc'),(E'\u00F9','ugrave'),
(E'\u00A8','uml'),(E'\u03D2','upsih'),(E'\u03C5','upsilon'),(E'\u00FC','uuml'),(E'\u2118','weierp'),(E'\u03BE','xi'),(E'\u00FD','yacute'),(E'\u00A5','yen'),(E'\u00FF','yuml'),(E'\u03B6','zeta'),
(E'\u200D','zwj'),(E'\u200C','zwnj')
;

This is the function:

-- entity2char(t): decode HTML character references in t.
--
-- Recognized forms:
--   &name;    named entity, looked up in character_entity (case-sensitive)
--   &#NNN;    decimal code point, 1 to 10 digits
--   &#xHHH;   hexadecimal code point, 1 to 8 digits ('x' and digits in any case)
--
-- Parameters:
--   t  text that may contain character references; NULL yields NULL.
-- Returns:
--   t with every recognized reference replaced by its character. References
--   that cannot be decoded (unknown name, code point 0, surrogate range, or
--   above U+10FFFF) are left in the output unchanged instead of raising.
--
-- The input is scanned exactly once, so text produced by decoding one
-- reference is never decoded again: '&amp;#60;' yields '&#60;', not '<'.
--
-- Assumes a UTF-8 database, where chr() takes a Unicode code point.
--
-- NOTE(review): the function is declared immutable although it reads the
-- character_entity table. The label is kept so existing callers (for
-- example expression indexes) keep working; results cached by the planner or
-- stored in indexes will not follow later changes to that table.
create or replace function entity2char(t text)
returns text as $body$
declare
    -- One pattern covering all three reference forms. Entity names may
    -- contain digits after the first letter (frac12, sup2, there4, ...).
    entity_re constant text :=
        '&(#[0-9]{1,10}|#[xX][0-9a-fA-F]{1,8}|[A-Za-z][A-Za-z0-9]*);';
    pieces    text[];   -- literal text between the references
    r         record;   -- one matched reference; r.ent excludes '&' and ';'
    codepoint bigint;   -- bigint: 10 decimal or 8 hex digits can exceed int
    decoded   text;
    result    text;
    i         int := 1; -- index of the last piece appended to result
begin
    if t is null then
        return null;
    end if;

    -- Splitting and matching with the same pattern gives N references and
    -- N + 1 surrounding pieces, in the same left-to-right order.
    pieces := regexp_split_to_array(t, entity_re);
    result := pieces[1];

    for r in
        select m[1] as ent
        from regexp_matches(t, entity_re, 'g') as s(m)
    loop
        decoded := null;
        codepoint := null;

        if substr(r.ent, 1, 2) in ('#x', '#X') then
            -- Left-pad to 8 hex digits so the value fills bit(32) exactly.
            codepoint := ('x' || lpad(substr(r.ent, 3), 8, '0'))::bit(32)::bigint;
        elsif substr(r.ent, 1, 1) = '#' then
            codepoint := substr(r.ent, 2)::bigint;
        else
            -- No matching row leaves decoded NULL.
            select ce.ch into decoded
            from character_entity ce
            where ce.name = r.ent;
        end if;

        -- chr() raises for 0, for surrogates and for values above U+10FFFF,
        -- so only convert code points it accepts.
        if codepoint between 1 and 1114111
           and codepoint not between 55296 and 57343 then
            decoded := chr(codepoint::int);
        end if;

        i := i + 1;
        result := result
            || coalesce(decoded, '&' || r.ent || ';')
            || coalesce(pieces[i], '');
    end loop;

    return result;
end;
$body$
language plpgsql immutable;

Use it like this:

select entity2char('HH&#9632;XXX&AElig;YYY&times;ZZZ&#x25A0;UUU');
entity2char
--------------------
HH■XXXÆYYY×ZZZ■UUU

It only works when the database encoding is UTF-8.

PostgreSQL - Replace HTML Entities

Write a function using pl/perlu and use this module https://metacpan.org/pod/HTML::Entities

Of course you need to have perl installed and pl/perl available.

1)
First of all create the procedural language pl/perlu:

CREATE EXTENSION plperlu;

2) Then create a function like this:

-- Decode HTML entities in the argument by delegating to the Perl module
-- HTML::Entities, and return the decoded text.
CREATE FUNCTION decode_html_entities(text) RETURNS TEXT AS $$
    use HTML::Entities;

    # The single SQL argument arrives as the first element of @_.
    my ($encoded) = @_;
    return decode_entities($encoded);
$$ LANGUAGE plperlu;

3) Then you can use it like this:

select decode_html_entities('aaabbb&amp;.... asasdasdasd &hellip;');
decode_html_entities
---------------------------
aaabbb&.... asasdasdasd …
(1 row)

Stripping HTML tags in PostgreSQL

Any solution performed in the RDBMS is going to involve either string handling or regexes: to my knowledge there is NO way to manipulate HTML in a standards-compliant, safe way in the database. To reiterate, what you are asking for is very, VERY unsafe.

A much better option is to do this in your application. This is application logic, and NOT the job or concern of your storage layer.

A great way to do this (in PHP, at least) would be HTML purifier. Don't do this in JavaScript, the user can tamper with it very easily.

Field stored with htmlentities in PostgreSQL: how do I search (SQL SELECT) for special characters, say <, without searching for &lt;?

I am not quite sure I understand what you really want. If you want to know, however, how to search for rows in a table that contain the < character efficiently, you could create an index on an expression like this:

CREATE INDEX index_name ON table (position('<' in column));

Then you can search for the rows whose desired column contains the pattern by doing:

SELECT * FROM table WHERE position('<' in column) > 0;

I assume, of course, that most rows in your table will not contain such a pattern, or otherwise this index wouldn't be of much use.

NOTICE:
You could also create an index with the help of the pg_trgm module if you are using PostgreSQL >= 9.1. This would also enable you to do other text searches with only one index, and may prove to be much more useful should you need those.

CREATE INDEX index_name ON table USING gist (column gist_trgm_ops);

Then you can search for your rows like this:

SELECT * FROM table WHERE column LIKE '%<%';

For more information, check pg_trgm in the official docs.

Remove HTML entities from a database

Since no-one could provide a satisfying SQL-only solution, I solved it with a script similar to this one.
Note that it only works if all the tables you use it on have a primary key, but this will usually be the case

<?php
/**
 * One-off maintenance script: decode HTML entities in place for the listed
 * table columns of a MySQL database.
 *
 * Every table is processed in its own transaction, which is rolled back if
 * anything fails. Each table must have a single-column primary key, because
 * rows are addressed by that key when they are written back.
 */

// Specify which columns need to have their HTML entities decoded.
// These names are concatenated into SQL, so they must be trusted constants.
$affected = array(
    'table1' => array('column1', 'column2'),
    'table2' => array('column1', 'column2'),
);

// Make database connection
$db = new PDO("mysql:dbname=yourdb;host=yourhost", "user", "pass");
// Report SQL errors as exceptions so a failure cannot be committed silently.
$db->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);

foreach ($affected as $table => $columns) {
    // Start a transaction for each table
    $db->beginTransaction();

    try {
        // Find the table's primary key column. SHOW INDEX returns one row per
        // key column; the column name is in the "Column_name" field.
        $keyParts = $db->query("SHOW INDEX FROM " . $table . " WHERE Key_name = 'PRIMARY'")
                       ->fetchAll(PDO::FETCH_ASSOC);
        if (count($keyParts) !== 1) {
            throw new RuntimeException(
                "Table " . $table . " needs a single-column primary key, found "
                . count($keyParts) . " key column(s)"
            );
        }
        $pk = $keyParts[0]['Column_name'];

        foreach ($columns as $column) {
            // Construct a prepared statement for this column
            $ps = $db->prepare("UPDATE " . $table . " SET " . $column . " = ? WHERE " . $pk . " = ?");

            // Go through all rows. FETCH_NUM yields exactly two values per
            // row (column value, key), matching the two placeholders above.
            $rows = $db->query("SELECT " . $column . ", " . $pk . " FROM " . $table, PDO::FETCH_NUM);
            foreach ($rows as $row) {
                if ($row[0] === null) {
                    continue; // keep NULL as NULL instead of writing ''
                }

                $decoded = html_entity_decode($row[0]); // Actual processing
                if ($decoded !== $row[0]) {
                    // Only touch rows whose value actually changes
                    $ps->execute(array($decoded, $row[1]));
                }
            }
        }

        // Everything went well for this table, commit
        $db->commit();
    } catch (Exception $e) {
        // Undo the partial work on this table and stop
        $db->rollBack();
        throw $e;
    }
}
?>

Postgresql : regexp_replace to remove special characters

You can use translate to convert those specific characters to spaces:

select translate('Test.010. " @ $ %. تجربه', '''",./\|][{}&*-%^!@#', '                   ');
translate
--------------------------
Test 010 $ تجربه

SQL Server: replace & with its character entity (&amp;)

  1. If you really want to replace it in your database, you can try running
    UPDATE Table1 SET Column1 = REPLACE(Column1, '&', '&amp;');

  2. I suppose you want to do this because you want the site to display the data exactly as it is stored in your database. In that case I suggest you escape it when you display it (on the application side), since data is not easy to maintain when you have lots of &amp; or &nbsp; in your database.

For example:

In php, you can use htmlspecialchars();

In java, you can import static org.apache.commons.lang.StringEscapeUtils.escapeHtml; and then use escapeHtml();

In ruby on rails, you can use HTMLEntities.new.encode(); (If you use rails3 or newer version, escaping should be done by default.)

SQL - Remove all HTML tags in a string

DECLARE @x XML = '<html><head></head><body><p>My text.</p></body></html>'

SELECT t.c.value('.', 'NVARCHAR(MAX)')
FROM @x.nodes('*') t(c)

Update - For strings with unclosed tags:

DECLARE @x NVARCHAR(MAX) = '<html><head></head><body><p>My text.<br>More text.</p></body></html>'

SELECT x.value('.', 'NVARCHAR(MAX)')
FROM (
SELECT x = CAST(REPLACE(REPLACE(@x, '>', '/>'), '</', '<') AS XML)
) r


Related Topics



Leave a reply



Submit