How to Load an Entire HTML Document into a Document Fragment in Internet Explorer

Can I load an entire HTML document into a document fragment in Internet Explorer?

Fiddle: http://jsfiddle.net/JFSKe/6/

DocumentFragment doesn't implement DOM methods. Using document.createElement in conjunction with innerHTML removes the <head> and <body> tags (even when the created element is a root element, <html>). Therefore, the solution should be sought elsewhere. I have created a cross-browser string-to-DOM function, which makes use of an invisible inline-frame.

All external resources and scripts will be disabled. See Explanation of the code for more information.

Code

/*
 Parses an HTML string into a separate document by writing it into a hidden
 inline frame that is appended to the current document's <body>.

 @param String html    The string with HTML which has to be converted to a DOM object.
                        It is passed through sanitiseHTML() before it is written.
 @param func callback  (optional) Callback(HTMLDocument doc, function destroy)
 @returns              undefined if callback exists, else: Object
                        HTMLDocument doc  DOM fetched from Parameter:html
                        function destroy  Removes the IFrame which holds doc.
                                          Calling it more than once is harmless. */
function string2dom(html, callback){
    /* Sanitise the string */
    html = sanitiseHTML(html); /* Defined elsewhere in this file */

    /* Create an IFrame; it has to be attached before its document can be used */
    var iframe = document.createElement("iframe");
    iframe.style.display = "none";
    document.body.appendChild(iframe);

    var doc = iframe.contentDocument || iframe.contentWindow.document;
    doc.open();
    doc.write(html);
    doc.close();

    function destroy(){
        /* parentNode is null once the frame has been detached. Checking it
           turns a repeated destroy() into a no-op instead of a TypeError. */
        var parent = iframe.parentNode;
        if(parent) parent.removeChild(iframe);
    }
    if(callback) callback(doc, destroy);
    else return {"doc": doc, "destroy": destroy};
}

/* @name sanitiseHTML
   @description        Runs a fixed sequence of RegExp replacements over an
                       HTML string: <meta http-equiv=refresh> tags and
                       <script> elements are replaced by comments, and event
                       handlers plus resource attributes are renamed by
                       putting data- in front of them.
   @param String html  A string representing HTML code
   @return String      A new string, stripped of external resources.
                       All "external" attributes (href, src) are prefixed by data- */

function sanitiseHTML(html){
    /* Adds a <!--"'--> before every matched tag, so that unterminated quotes
        aren't preventing the browser from splitting a tag. Test case:
       '<input style="foo;b:url(0);><input onclick="<input type=button onclick="too() href=;>">' */
    var prefix = "<!--\"'-->";
    /*Attributes should not be prefixed by these characters. This list is not
     complete, but will be sufficient for this function.
      (see http://www.w3.org/TR/REC-xml/#NT-NameChar) */
    var att = "[^-a-z0-9:._]";
    /* Start of an opening tag */
    var tag = "<[a-z]";
    /* Anything inside a tag: quoted runs may contain < and >, the rest may not */
    var any = "(?:[^<>\"']*(?:\"[^\"]*\"|'[^']*'))*?[^<>]*";
    /* End of a tag: either > or the start of the next tag (unclosed tag) */
    var etag = "(?:>|(?=<))";

    /* A numeric character reference ends with ; or at the first non-digit */
    var entityEnd = "(?:;|(?!\\d))";
    /* Pre-built patterns for characters which need more than the generic
       "literal | decimal reference | hexadecimal reference" alternation.
       ae() also stores the pattern of every string it has built in here. */
    var ents = {" ":"(?:\\s|&nbsp;?|&#0*32"+entityEnd+"|&#x0*20"+entityEnd+")",
                "(":"(?:\\(|&#0*40"+entityEnd+"|&#x0*28"+entityEnd+")",
                ")":"(?:\\)|&#0*41"+entityEnd+"|&#x0*29"+entityEnd+")",
                ".":"(?:\\.|&#0*46"+entityEnd+"|&#x0*2e"+entityEnd+")"};
                /*Placeholder to avoid tricky filter-circumventing methods*/
    /* Per-character cache of the alternations built by ae() */
    var charMap = {};
    var s = ents[" "]+"*"; /* Short-hand space */
    /*
      @name ae
      @description          Converts a given string in a sequence of the
                             original input and the HTML entity, i.e. a RegExp
                             source in which every character may also be
                             written as a decimal (&#114;) or hexadecimal
                             (&#x72;) character reference, in either case.
      @param String string  String to convert
      @return String        RegExp source (not a RegExp object)
      */
    /* Important: Must be pre- and postfixed by < and >. RE matches a whole tag! */
    function ae(string){
        var all_chars_lowercase = string.toLowerCase();
        if(ents[string]) return ents[string];
        var all_chars_uppercase = string.toUpperCase();
        var RE_res = "";
        for(var i=0; i<string.length; i++){
            var char_lowercase = all_chars_lowercase.charAt(i);
            if(charMap[char_lowercase]){
                RE_res += charMap[char_lowercase];
                continue;
            }
            var char_uppercase = all_chars_uppercase.charAt(i);
            var RE_sub = [char_lowercase];
            RE_sub.push("&#0*" + char_lowercase.charCodeAt(0) + entityEnd);
            RE_sub.push("&#x0*" + char_lowercase.charCodeAt(0).toString(16) + entityEnd);
            if(char_lowercase != char_uppercase){
                /* The upper-case letter has its own code points */
                RE_sub.push("&#0*" + char_uppercase.charCodeAt(0) + entityEnd);
                RE_sub.push("&#x0*" + char_uppercase.charCodeAt(0).toString(16) + entityEnd);
            }
            RE_sub = "(?:" + RE_sub.join("|") + ")";
            RE_res += (charMap[char_lowercase] = RE_sub);
        }
        return(ents[string] = RE_res);
    }
    /*
      @name by
      @description  second argument for the replace function.
      @param String match   Whole match (unused)
      @param String group1  Text which has to stay in front of the attribute
      @param String group2  The attribute (or value) which is neutralised
      */
    function by(match, group1, group2){
        /* Adds a data-prefix before every external pointer */
        return group1 + "data-" + group2;
    }
    /*
      @name cr
      @description            Selects a HTML element and performs a
                                  search-and-replace on attributes.
                                  Every element matched by selector gets the
                                  prefix, even if no attribute is replaced.
      @param String selector  HTML substring to match (String or RegExp)
      @param String attribute RegExp-escaped; HTML element attribute to match
      @param String marker    Optional RegExp-escaped; marks the prefix
                                  (default: "\\s*=")
      @param String delimiter Optional RegExp escaped; non-quote delimiters
      @param String end       Optional RegExp-escaped; forces the match to
                                  end before an occurence of <end> when 
                                  quotes are missing
     */
    function cr(selector, attribute, marker, delimiter, end){
        if(typeof selector == "string") selector = new RegExp(selector, "gi");
        marker = typeof marker == "string" ? marker : "\\s*=";
        delimiter = typeof delimiter == "string" ? delimiter : "";
        end = typeof end == "string" ? end : "";
        /* With an explicit end, the unquoted value is matched lazily */
        var is_end = end && "?";
        var re1 = new RegExp("("+att+")("+attribute+marker+"(?:\\s*\"[^\""+delimiter+"]*\"|\\s*'[^'"+delimiter+"]*'|[^\\s"+delimiter+"]+"+is_end+")"+end+")", "gi");
        html = html.replace(selector, function(match){
            return prefix + match.replace(re1, by);
        });
    }
    /* 
      @name cri
      @description            Selects an attribute of a HTML element, and
                               performs a search-and-replace on certain values
      @param String selector  HTML element to match (String or RegExp)
      @param String attribute RegExp-escaped; HTML element attribute to match
      @param String front     RegExp-escaped; attribute value, prefix to match
      @param String flags     Optional RegExp flags, default "gi" (used for
                                  any non-string value)
      @param String delimiter Optional RegExp-escaped; non-quote delimiters.
                                  Has no default: both calls below pass it.
      @param String end       Optional RegExp-escaped; forces the match to
                                  end before an occurence of <end> when 
                                  quotes are missing
     */
    function cri(selector, attribute, front, flags, delimiter, end){
        if(typeof selector == "string") selector = new RegExp(selector, "gi");
        flags = typeof flags == "string" ? flags : "gi";
         var re1 = new RegExp("("+att+attribute+"\\s*=)((?:\\s*\"[^\"]*\"|\\s*'[^']*'|[^\\s>]+))", "gi");

        /* end also closes the second capturing group of at3 */
        end = typeof end == "string" ? end + ")" : ")";
        /* at1/at2: front has to follow the opening quote directly */
        var at1 = new RegExp('(")('+front+'[^"]+")', flags);
        var at2 = new RegExp("(')("+front+"[^']+')", flags);
        var at3 = new RegExp("()("+front+'(?:"[^"]+"|\'[^\']+\'|(?:(?!'+delimiter+').)+)'+end, flags);

        /* Picks the pattern which fits the quoting style of the value */
        var handleAttr = function(match, g1, g2){
            if(g2.charAt(0) == '"') return g1+g2.replace(at1, by);
            if(g2.charAt(0) == "'") return g1+g2.replace(at2, by);
            return g1+g2.replace(at3, by);
        };
        html = html.replace(selector, function(match){
             return prefix + match.replace(re1, handleAttr);
        });
    }

    /* <meta http-equiv=refresh content="  ; url= " > */
    html = html.replace(new RegExp("<meta"+any+att+"http-equiv\\s*=\\s*(?:\""+ae("refresh")+"\""+any+etag+"|'"+ae("refresh")+"'"+any+etag+"|"+ae("refresh")+"(?:"+ae(" ")+any+etag+"|"+etag+"))", "gi"), "<!-- meta http-equiv=refresh stripped-->");

    /* Stripping all scripts */
    /* CDATA-wrapped scripts first, because they may contain a literal
       </script>. The ! of <![CDATA[ is optional, so that the historical
       <[CDATA[ spelling is still matched. */
    html = html.replace(new RegExp("<script"+any+">\\s*//\\s*<!?\\[CDATA\\[[\\S\\s]*?]]>\\s*</script[^>]*>", "gi"), "<!--CDATA script-->");
    html = html.replace(/<script[\S\s]+?<\/script\s*>/gi, "<!--Non-CDATA script-->");
    cr(tag+any+att+"on[-a-z0-9:_.]+="+any+etag, "on[-a-z0-9:_.]+"); /* Event listeners */

    cr(tag+any+att+"href\\s*="+any+etag, "href"); /* Linked elements */
    cr(tag+any+att+"src\\s*="+any+etag, "src"); /* Embedded elements */

    cr("<object"+any+att+"data\\s*="+any+etag, "data"); /* <object data= > */
    cr("<applet"+any+att+"codebase\\s*="+any+etag, "codebase"); /* <applet codebase= > */

    /* <param name=movie value= >*/
    cr("<param"+any+att+"name\\s*=\\s*(?:\""+ae("movie")+"\""+any+etag+"|'"+ae("movie")+"'"+any+etag+"|"+ae("movie")+"(?:"+ae(" ")+any+etag+"|"+etag+"))", "value");

    /* <style> and < style=  > url()*/
    cr(/<style[^>]*>(?:[^"']*(?:"[^"]*"|'[^']*'))*?[^'"]*(?:<\/style|$)/gi, "url", "\\s*\\(\\s*", "", "\\s*\\)");
    cri(tag+any+att+"style\\s*="+any+etag, "style", ae("url")+s+ae("(")+s, 0, s+ae(")"), ae(")"));

    /* IE7- CSS expression() */
    cr(/<style[^>]*>(?:[^"']*(?:"[^"]*"|'[^']*'))*?[^'"]*(?:<\/style|$)/gi, "expression", "\\s*\\(\\s*", "", "\\s*\\)");
    cri(tag+any+att+"style\\s*="+any+etag, "style", ae("expression")+s+ae("(")+s, 0, s+ae(")"), ae(")"));
    /* A tag which was touched several times carries several prefixes: keep one */
    return html.replace(new RegExp("(?:"+prefix+")+", "g"), prefix);
}

Explanation of the code

The sanitiseHTML function is based on my replace_all_rel_by_abs function (see this answer). The sanitiseHTML function is completely rewritten though, in order to achieve maximum efficiency and reliability.

Additionally, a new set of RegExps is added to remove all scripts and event handlers (including CSS expression(), IE7-). To make sure that all tags are parsed as expected, the adjusted tags are prefixed by <!--"'-->. This prefix is necessary to correctly parse nested "event handlers" in conjunction with unterminated quotes: <a id="><input onclick="<div onmousemove=evil()>">.

These RegExps are dynamically created using an internal function cr/cri (Create Replace [Inline]). These functions accept a list of arguments, and create and execute an advanced RE replacement. To make sure that HTML entities aren't breaking a RegExp (refresh in <meta http-equiv=refresh> could be written in various ways), the dynamically created RegExps are partially constructed by function ae (Any Entity).

The actual replacements are done by function by (replace by). In this implementation, by adds data- before all matched attributes.

All <script>//<[CDATA[ .. //]]></script> occurrences are stripped. This step is necessary, because CDATA sections allow </script> strings inside the code. After this replacement has been executed, it's safe to go to the next replacement:
The remaining <script>...</script> tags are removed.
The <meta http-equiv=refresh .. > tag is removed
All event listeners and external pointers/attributes (href, src, url()) are prefixed by data-, as described previously.
An IFrame object is created. IFrames are less likely to leak memory (contrary to the htmlfile ActiveXObject). The IFrame becomes invisible, and is appended to the document, so that the DOM can be accessed. document.write() is used to write HTML to the IFrame. document.open() and document.close() are used to empty the previous contents of the document, so that the generated document is an exact copy of the given html string.
If a callback function has been specified, the function will be called with two arguments. The first argument is a reference to the generated document object. The second argument is a function, which destroys the generated DOM tree when called. This function should be called when you don't need the tree any more.
If the callback function isn't specified, the function returns an object consisting of two properties (doc and destroy), which behave the same as the previously mentioned arguments.

Additional notes

Setting the designMode property to "On" will stop a frame from executing scripts (not supported in Chrome). If you have to preserve the <script> tags for a specific reason, you can use iframe.designMode = "On" instead of the script stripping feature.
I wasn't able to find a reliable source for the htmlfile activeXObject. According to this source, htmlfile is slower than IFrames, and more susceptible to memory leaks.
All affected attributes (href, src, ...) are prefixed by data-. An example of getting/changing these attributes is shown for data-href:
elem.getAttribute("data-href") and elem.setAttribute("data-href", "...")
elem.dataset.href and elem.dataset.href = "...".
External resources have been disabled. As a result, the page may look completely different:
~~<link rel="stylesheet" href="main.css" />~~ No external styles
~~<script>document.body.bgColor="red";</script>~~ No scripted styles
<img src="128x128.png" /> No images: the size of the element may be completely different.

Examples

sanitiseHTML(html)

Paste this bookmarklet in the location bar. It will offer an option to inject a textarea, showing the sanitised HTML string.

javascript:void(function(){/* Appends a <script> element pointing at the remote sanitizer script to <body> */var s=document.createElement("script");s.src="http://rob.lekensteyn.nl/html-sanitizer.js";document.body.appendChild(s)})();

Code examples - string2dom(html):

/* Callback form: the parsed document and its clean-up function are handed
   to the callback. */
string2dom("<html><head><title>Test</title></head></html>", function(parsedDoc, release){
    var title = parsedDoc.title;
    alert(title); /* Alert: "Test" */
    release();
});

/* Return-value form: without a callback, an object with the properties
   doc and destroy is returned. */
var test = string2dom("<div id='secret'></div>");
alert(test.doc.getElementById("secret").tagName); /* Alert: "DIV" */
test.destroy();

Notable references

SO: JS RE to change all relative to absolute URLs - Function sanitiseHTML(html) is based on my previously created replace_all_rel_by_abs(html) function.
Elements - Embedded content - A full list of standard embedded elements
Elements - Previous HTML elements - An additional list of (deprecated) elements (such as <applet>)
The htmlfile ActiveX object - "Slower than iframe sandboxes. Leaks memory if not managed"

Inserting arbitrary HTML into a DocumentFragment

Here is a way in modern browsers without looping:

/* Create a <template> element and let it parse the markup via innerHTML */
var temp = document.createElement('template');
temp.innerHTML = '<div>x</div><span>y</span>';

/* The parsed nodes are read from the template's content property */
var frag = temp.content;

or, as a re-usable

/*
 Parses an HTML string and returns the resulting nodes as a fragment.

 @param String strHTML  Markup to parse
 @returns               The <template> element's content when the browser
                        provides it. Otherwise (no <template> support, e.g.
                        Internet Explorer) a DocumentFragment which is filled
                        from a temporary <div>. The <div> fallback parses the
                        markup in the context of a <div>.
*/
function fragmentFromString(strHTML) {
    var temp = document.createElement('template');
    if (temp.content) {
        temp.innerHTML = strHTML;
        return temp.content;
    }

    /* Without <template> support, temp.content is undefined. Parse inside a
       <div> and move the resulting nodes into a real fragment instead. */
    var holder = document.createElement('div');
    var frag = document.createDocumentFragment();
    holder.innerHTML = strHTML;
    /* appendChild moves the node, so firstChild advances on every pass */
    while (holder.firstChild) {
        frag.appendChild(holder.firstChild);
    }
    return frag;
}

UPDATE:
I found a simpler way to use Pete's main idea, which adds IE11 to the mix:

/*
 Parses an HTML string through a freshly created Range.

 @param String strHTML  Markup to parse
 @returns               Whatever the range's createContextualFragment()
                        returns for strHTML
*/
function fragmentFromString(strHTML) {
    var range = document.createRange();
    var fragment = range.createContextualFragment(strHTML);
    return fragment;
}

The coverage is better than the <template> method and tested ok in IE11, Ch, FF.

Live test/demo available http://pagedemos.com/str2fragment/

Using documentFragment to parse HTML without sending HTTP requests

I found the answer to my question here on Stack Overflow, in this answer. The answer consists of a piece of code which parses HTML using native browser functionality, but in a semi-sandboxed environment which doesn't send HTTP requests. I hope it helps others as well.

Transform DOM Document to fragment with XSL in Internet Explorer

try

// Transforms `xml` with the stylesheet `xsl` and puts the result into the
// element with id "demo". Both `xml` and `xsl` are expected to be provided
// by the surrounding code; they are not defined in this snippet.
var ex;
var xsltProcessor;
var resultDocument;

// code for IE
if (window.ActiveXObject)
 {
 // transformNode() output is assigned as markup
 ex=xml.transformNode(xsl);
 document.getElementById("demo").innerHTML=ex;
 }
 // code for Mozilla, Firefox, Opera, etc.
 else if (document.implementation && document.implementation.createDocument)
  {
  xsltProcessor=new XSLTProcessor();
   xsltProcessor.importStylesheet(xsl);
   // transformToFragment() output is appended as nodes
   resultDocument = xsltProcessor.transformToFragment(xml,document);
  document.getElementById("demo").appendChild(resultDocument);
 }

XSL processing a fragment in IE

After much more trial and error, I came up with something that works. Here's what I did:

Create a xsltProcessor as usual, and call the transform method. This results in xsltProcessor.output being a HTML formatted string. Of course, I want a DOM element, so I had to convert the HTML string to a DOM. Luckily, because I'm the author of the XSL stylesheet too, I know exactly what I'm expecting to come back. In my case, the output HTML string would be some number of <tr>...</tr> elements. I initially tried setting my resTable (a table DOM element) innerHTML to the output string, but that did not work. I'm still not sure why, but it seems like it has something specific to do with the fact they were <tr>s and it wasn't able to be parsed when set to innerHTML outside the context of a table tag.

At any rate, I created a temporary div element and set its innerHTML to a string having the xsltProcessor's output string encased in a <table></table> tag. Now the temp div element contains a DOM table, which I then stepped through and grabbed just the child nodes (which are the tr nodes that the xsl processor returned in the first place). It seems kind of ridiculous to do all this, but it works, and that's the first time I can say that. Here's the final version that works in all the browsers I've tested:

// Builds one table out of the XSL transformation results of every document
// listed in xmlNames (stylesheet: xslName), then appends the table to the
// element with id "theDoc". xmlNames, xslName and loadXMLDoc() are expected
// to be provided by the surrounding code.
var resTable = document.createElement('table');
var i;
var tmp;
var parsedTable;
var resultDocument;

for (i = 0; i < xmlNames.length; i++)
{
  // code for IE
  if (window.ActiveXObject)
  {
    var xml = new ActiveXObject("Microsoft.XMLDOM");
    xml.async = false;
    xml.load(xmlNames[i]);

    var xslt = new ActiveXObject("Msxml2.XSLTemplate");
    var xsl = new ActiveXObject("Msxml2.FreeThreadedDOMDocument.3.0");

    var xsltProcessor;
    xsl.async = false;
    xsl.resolveExternals = false;
    xsl.load(xslName);
    xslt.stylesheet = xsl;

    xsltProcessor = xslt.createProcessor();
    xsltProcessor.input = xml;

    //This transform results in one or more tr.../tr HTML tag(s)
    xsltProcessor.transform();

    //Create a temp div element which is used to convert the HTML
    //string to a DOM element so I can grab just the part I want..
    tmp = document.createElement('div');

    //Can't set innerHTML to tr tags directly I guess, so have to put
    //in context of a table so it can be parsed...
    tmp.innerHTML = "<table>" + xsltProcessor.output + "</table>";

    //Now I need to grab the children from inside the table node, since
    //the table was only to please the parser
    parsedTable = tmp.childNodes[0];

    //appendChild MOVES the node, so the child list of parsedTable shrinks
    //on every pass. Walking it with an increasing index would skip every
    //other child; draining firstChild moves all of them, in order.
    while (parsedTable.firstChild)
    {
      //finally, append the temporary table's children
      //to the overall table I created before the loop.
      resTable.appendChild(parsedTable.firstChild);
    }
  }
  // code for Mozilla, Firefox, Opera, etc.
  else if (document.implementation && document.implementation.createDocument)
  {
    xml=loadXMLDoc(xmlNames[i]);
    xsl=loadXMLDoc(xslName);
    xsltProcessor=new XSLTProcessor();
    xsltProcessor.importStylesheet(xsl);
    resultDocument = xsltProcessor.transformToFragment(xml,document);
    resTable.appendChild(resultDocument);
  }
}

//put the full table at the div location "theDoc" now..
document.getElementById("theDoc").appendChild(resTable);

Not sure how often folks try to do this, but hopefully this helps someone else out there..

Bad performance IE using documentFragment

Think I've found it: it looks like, although a documentFragment should be an 'off line' element (an element that is not part of the live DOM) IE doesn't treat it as such. The way to force the fragment to really be off line is to append some element to it, set its display property to none and append the rest of elements to that element. After you are done, remove the display:none property and the documentFragment can be appended to the DOM.

It is still three times slower (on my PC it still takes around 1-1.5 seconds, versus around 200-300 ms in Chrome/Firefox for 10000 elements). So, for IE (even version 10), using innerHTML to add a bunch of elements to the DOM is the faster way. IE remains a developer's nightmare, I'd say.

Locate data in a document fragment that is compatible with IE8

This becomes quite hackish but I believe I found a possible solution.

The problem with IE8 is that it adds \r\n after some tags, for a reason I just don't understand.

Unlike what it says in the spec, if you use .replace("\n", "") this is not a global replace, ie8 does
.replace(/\n/, "") instead of .replace(/\n/gm, ""). Also, because it adds a \r\n and not just \n, that replacement is not enough, that's what has mostly been puzzling me. In order to get a proper replacement you need .replace(/\r\n/mg, "") (no need for the pipe character, it's not an OR it's the \r followed by the \n).

In the end, this may be, and probably is, hackish, but that's the way to deal with IE8 and earlier, as far as I know.

By using .replace(/\r\n/mg, "") I was able to get a match in the exec() method.