Can Itextsharp.Xmlworker Render Embedded Images

Can itextsharp.xmlworker render embedded images?

We need to write our own ImageTagProcessor to support processing of base 64 images:

public class CustomImageTagProcessor : iTextSharp.tool.xml.html.Image
{
    public override IList<IElement> End(IWorkerContext ctx, Tag tag, IList<IElement> currentContent)
    {
        IDictionary<string, string> attributes = tag.Attributes;
        string src;
        if (!attributes.TryGetValue(HTML.Attribute.SRC, out src))
            return new List<IElement>(1);

        if (string.IsNullOrEmpty(src))
            return new List<IElement>(1);

        if (src.StartsWith("data:image/", StringComparison.InvariantCultureIgnoreCase))
        {
            // data:[<MIME-type>][;charset=<encoding>][;base64],<data>
            var base64Data = src.Substring(src.IndexOf(",") + 1);
            var imagedata = Convert.FromBase64String(base64Data);
            var image = iTextSharp.text.Image.GetInstance(imagedata);

            var list = new List<IElement>();
            var htmlPipelineContext = GetHtmlPipelineContext(ctx);
            list.Add(GetCssAppliers().Apply(new Chunk((iTextSharp.text.Image)GetCssAppliers().Apply(image, tag, htmlPipelineContext), 0, 0, true), tag, htmlPipelineContext));
            return list;
        }
        else
        {
            return base.End(ctx, tag, currentContent);
        }
    }
}

Then we can inject this new processor into the HtmlPipelineContext:

        using (var doc = new Document(PageSize.A4))
        {
            var writer = PdfWriter.GetInstance(doc, new FileStream("test.pdf", FileMode.Create));
            doc.Open();
            var html = @"<img src='data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAD4AAABQCAMAAAB24TZcAAAABGdBTUEAANbY1E9YMgAAABl0RVh0U29mdHdhcmUAQWRvYmUgSW1hZ2VSZWFkeXHJZTwAAAGAUExURdSmeJp2SHlbQIRoSUg2J499a8KebqeHZuGufBEVJPz7+3NWPVxGMduwhPXEktnX1mtROLq7t5WDc2VMNv3LmKB8TMSidMbFxLGlmXlhSMSddpJUL+y8i3VlVqedlOzr6gUIF2lXRLCLY4ZyXLyYaYhtUYiJhJFyU1dBLLiVZnlwZrWRY/Hx8b+2rbySaJh9YqeooDw4NygnKvvJlpyblzksIUhGRryYckc7MPjGlKODX5x8VVA8K+azgM3FvDInHK2JW2ZbUOHh4Xt2cFpaWKeAUM6kel1RRJmUjo5vSrWzrJJ1WFhLQCQmMuK1iJiMgmthWPPCkOm3hEtBOunm5LCNXnJtZquEXmNkYvG+i7Ctq+y5hrWRbKqSeaN/WqmFVYFgQh8aGOa4isWkd8mcby4vONDNy0AwI5h2U19JMxkdLzIuL1JBMjQ3P5Z6Ve6/j93c2+Xi34KAfJ5/Xvj4+O/u7sSKVJd4Wo6QjXE+IeOwfQcNJoBeQ8Gdbf/Mmf///5GX6NEAAAcrSURBVHja3JbpX9pIGMchiWkgEaOBtaGinBLEyopFBeMqtYKI4kGt2lILFsUoXa3WdZcc/dd3JheHAvaz7/Z5Ec2Q7/yeaw7Lz/9klv8rfnM+Orz5cXLjZsL+67h9eCq9Vaxvzc6v3W6+/TX85kN6ixdokkQQCaE5vrg28Qv4a2yFQcpSi/HzH6efi+/UaEAwWAtepuvv3tw/B//hqZGQqDFSmyHC7v0z8EldlZQQEgTfMgF23h8/T+gEhQGrcQYrMBKVtvfDb4qU/j3DMK3SdIKWsNs++M1iS8R8W/gULyG1771w+/stQWpTpFpzByb09MRHEwaoxUxToGtaZiBrE72cXzMyhcDiIRgCHxJPIxKt5aF23gMf0iquz8BJmAAFpUStxvG0xIA3arcHPsvrJM1wvFTDeEGQeKCewCo1jgRDwKuJrrh9C3osIfyiz+NboZFKxU0xJEYmeJbBhPoKiKyMDXfHd0mJWSETnoKiKCmgSioFDKFr4T1lbn/fgkHf+PGu+A+A12imMqdAqzNUXlFCFP+gOD41CKJBcCB4bKSnOmitB5VWSgnMrSjhCnu8D1hoS1xP/KcH1BhZdGi4c4VNAh/I5PGyRjdQqje+A6YXPIpup/DhHlMUh44f1hAJ6x77z3OwVjG/0ml7Ot4gOWnxvkfbALw+2EnPGc43ojWk3qNt7hdpiSp0ajcMukHQPB/4o3vPf8TKQgc+pqXdkpEtgGewE7THel/j66dtdBLA1XAYRXK8AGbxC/6RHvjbCuOE0Kklk8lcg/+OicaJcOhfTflTVYCHuYvX3XH7QCxcUAol9i6VursLha+VfcLPHwamZjfSAgxi6QId6oFnC5awsjdoWYjFPrOlB3QONAtJjrwsetiq2jkzgfc9nPdklJBDyXvGj+Zf+jIKe7pPoNFoOHwyoyaQKFcD9z3wzbwSGnT6fCMB9u5UmWMLYwTJQo5QC2AB6r122ukBJeVWnA6HIwlLnp/bI/w5wI3tJR3LjcZMbvVzL/xHwOG+M6s2mFeSjRm0QRyDYnyCOEv/0fOYGM/vha4N3J1S5hoZhCAcYBro/AwV63NIjafuzL4rLSjOZYKeIT45j9XUnQTs/Y7Inbqp/pABeIPBqsTystr0/pd9T9jprZIGO9CHa4gTPHairxr/eP/rwai+YdzlWQfALSHu4qTxfHxiQKVTaBINvfCjDFo1Fmzjor/zP+0BNXdgxSTdqRe5w0bT2hq+293mdWDOSJ5DWbgwd4uGpSPxXW5WGzGddhYWHsDRguqpO5x9jjq4HY3BnjtcRRGGe/Xqn38YC6SraVt84jnXwo0FgC8kOK7s+mv91St6RhVnZ72Vqeln4EM+cFY43SHgdj584c9ormdFbx3Jbk73v9PuvNCCvx67ntPzlmG2xUvUhQpZz9roxHdwXx4e7Yb/fdXc7o81PFcUxW2ry+Wy5miM4gQkEAh0uxKfXWbdLXs1XGxZURRnXZpZrVbXegT/rUvm571itnncQPctWZso2hAdd61GIzIuf32y5zduL0VxtwQPWG2vB7QP0OKKVaejOI7L8lP4+S3r+wY+zSZfGPvGPlFlt8FQ3BCPQPYpfOjWs3QHtMVLJqmU0NLe9XVhsBpOwyER0+D1oE534t8Hsn/KctwLokxUgeunD6FwCA2xMGtAPAdhjkr55afwoaksGpHlAKTnWUK9ZIAt15k/U+mK5voSuoI9Vre/fZPOBcFQKg4+PXsXg7urVra0Stvqmud4mTp4hN/s+lAIy8ErIC7Oz8aITzqegYkUL4tawQ+ivEvudP7Gt6SPpCpewJ8BfN+pb/aq71dG2kjayLuJ3/vC+gB+EBe9Xm/8KEQs67hShMmgIRsNylFuFe9UL1IGHXHNAtr77ZYN7htNB8LxJmCnyaBZULpJ6/g4ZZQCX83FAS1u3675xnTaX/GKFdLl+gIaDZeFpU78rS9oDnzZEmHstqPJKc9n90LJPThyBUZIVRtMv8Q1v9Xx8bzxigddWo1t7yZ//zgSCwRiK6CO0PUD2OR4hMnhHfiPtYiJr4a8Jj4MbHNe7UC4RtTfc5wsd+DD6RbxxTZ8chtkrcJGIlqX41GqTVzFp3wmfmCNi5rNT74Z3nwHi2BjZW11AtdzgvxIfSBl4l/Klzr+bfLvzSNYA1u9xTfmz8f4lLmA5HWfgV8eTa7BEohxox1xeZ1F5Ef4fTrYnL4oGjb7QZ3JVgk2W4KJPMZvmWbo9KWJ27QsXKHm3DkhJT/Gs6z55lo0abV5wCSL5txL/CMa4PYPUXN+5qwTj68aXwa5MP4Efj/VDA4TW3BV3PQMp7Wlgnfg555mcPFO8RbXMbXv8Oh6pG3J7IRM8bq3Q/zKLFqUQ3GteNYvbepG1XG57O0Qt9Hmd1bOKC1qbZH/zbK78FWzYMJ2aZoXPq7kr8ZvORr+iUSjJzQb/Gpa5l8BBgBZTppAyfsf0wAAAABJRU5ErkJggg==' width='62' height='80' style='float: left; margin-right: 28px;' />";

            var tagProcessors = (DefaultTagProcessorFactory)Tags.GetHtmlTagProcessorFactory();
            tagProcessors.RemoveProcessor(HTML.Tag.IMG); // remove the default processor
            tagProcessors.AddProcessor(HTML.Tag.IMG, new CustomImageTagProcessor()); // use our new processor

            CssFilesImpl cssFiles = new CssFilesImpl();
            cssFiles.Add(XMLWorkerHelper.GetInstance().GetDefaultCSS()); 
            var cssResolver = new StyleAttrCSSResolver(cssFiles);
            cssResolver.AddCss(@"code { padding: 2px 4px; }", "utf-8", true);
            var charset = Encoding.UTF8;
            var hpc = new HtmlPipelineContext(new CssAppliersImpl(new XMLWorkerFontProvider()));
            hpc.SetAcceptUnknown(true).AutoBookmark(true).SetTagFactory(tagProcessors); // inject the tagProcessors
            var htmlPipeline = new HtmlPipeline(hpc, new PdfWriterPipeline(doc, writer));
            var pipeline = new CssResolverPipeline(cssResolver, htmlPipeline);
            var worker = new XMLWorker(pipeline, true);
            var xmlParser = new XMLParser(true, worker, charset);
            xmlParser.Parse(new StringReader(html));
        }
        Process.Start("test.pdf");

iTextSharp XMLWorkerHelper and Images for HTML to PDF

XML Worker isn't mentioned in the book, because the book was written in 2009 and the development on XML Worker started somewhere in 2011. Your question is very long, yet it is missing an important element: an HTML sample like the one's provided for the sandbox examples (which you don't mention). For instance: when the parse the thoreau.html example using ParseHtmlImagesLinksOops, we lose all images: thoreau_oops.pdf; when we use ParseHtmlImagesLinks, we use an ImageProvider that makes sure we get the correct paths to the images and the result looks quite OK: thoreau.pdf (so do the links, by the way).

However, when I look at the actual requirement, I see that you want to create a letter with an image for letterhead. In that case, I would use page events to add company stationary to each page. How to do that is explained in the book.

How can I use iText to convert HTML with images and hyperlinks to PDF?

Out of the box XMLWorker only understands absolute URIs, so the described issues are expected behavior. The parser can't automagically deduce URI schemes or paths without some additional information.

Implementing an ILinkProvider fixes the broken hyperlink problem, and implementing an IImageProvider fixes the broken image problem. Since both implementations must perform URI resolution, that's the first step. The following helper class does that, and also tries to make web (ASP.NET) context calls (examples follow) as simple as possible:

// resolve URIs for LinkProvider & ImageProvider
public class UriHelper
{
    /* IsLocal; when running in web context:
     * [1] give LinkProvider http[s] scheme; see CreateBase(string baseUri)
     * [2] give ImageProvider relative path starting with '/' - see:
     *     Join(string relativeUri)
     */
    public bool IsLocal { get; set; }
    public HttpContext HttpContext { get; private set; }
    public Uri BaseUri { get; private set; }

    public UriHelper(string baseUri) : this(baseUri, true) {}
    public UriHelper(string baseUri, bool isLocal)
    {
        IsLocal = isLocal;
        HttpContext = HttpContext.Current;
        BaseUri = CreateBase(baseUri);
    }

    /* get URI for IImageProvider to instantiate iTextSharp.text.Image for 
     * each <img> element in the HTML.
     */
    public string Combine(string relativeUri)
    {
        /* when running in a web context, the HTML is coming from a MVC view 
         * or web form, so convert the incoming URI to a **local** path
         */
        if (HttpContext != null && !BaseUri.IsAbsoluteUri && IsLocal)
        {
            return HttpContext.Server.MapPath(
                // Combine() checks directory traversal exploits
                VirtualPathUtility.Combine(BaseUri.ToString(), relativeUri)
            );
        }
        return BaseUri.Scheme == Uri.UriSchemeFile 
            ? Path.Combine(BaseUri.LocalPath, relativeUri)
            // for this example we're assuming URI.Scheme is http[s]
            : new Uri(BaseUri, relativeUri).AbsoluteUri;
    }

    private Uri CreateBase(string baseUri)
    {
        if (HttpContext != null)
        {   // running on a web server; need to update original value  
            var req = HttpContext.Request;
            baseUri = IsLocal
                // IImageProvider; absolute virtual path (starts with '/')
                // used to convert to local file system path. see:
                // Combine(string relativeUri)
                ? req.ApplicationPath
                // ILinkProvider; absolute http[s] URI scheme
                : req.Url.GetLeftPart(UriPartial.Authority)
                    + HttpContext.Request.ApplicationPath;
        }

        Uri uri;
        if (Uri.TryCreate(baseUri, UriKind.RelativeOrAbsolute, out uri)) return uri;

        throw new InvalidOperationException("cannot create a valid BaseUri");
    }
}

Implementing ILinkProvider is pretty simple now that UriHelper gives the base URI. We just need the correct URI scheme (file or http[s]):

// make hyperlinks with relative URLs absolute
public class LinkProvider : ILinkProvider
{
    // rfc1738 - file URI scheme section 3.10
    public const char SEPARATOR = '/';
    public string BaseUrl { get; private set; }

    public LinkProvider(UriHelper uriHelper)
    {
        var uri = uriHelper.BaseUri;
        /* simplified implementation that only takes into account:
         * Uri.UriSchemeFile || Uri.UriSchemeHttp || Uri.UriSchemeHttps
         */
        BaseUrl = uri.Scheme == Uri.UriSchemeFile
            // need trailing separator or file paths break
            ? uri.AbsoluteUri.TrimEnd(SEPARATOR) + SEPARATOR
            // assumes Uri.UriSchemeHttp || Uri.UriSchemeHttps
            : BaseUrl = uri.AbsoluteUri;
    }

    public string GetLinkRoot()
    {
        return BaseUrl;
    }
}

IImageProvider only requires implementing a single method, Retrieve(string src), but Store(string src, Image img) is easy - note inline comments there and for GetImageRootPath():

// handle <img> elements in HTML  
public class ImageProvider : IImageProvider
{
    private UriHelper _uriHelper;
    // see Store(string src, Image img)
    private Dictionary<string, Image> _imageCache = 
        new Dictionary<string, Image>();

    public virtual float ScalePercent { get; set; }
    public virtual Regex Base64 { get; set; }

    public ImageProvider(UriHelper uriHelper) : this(uriHelper, 67f) { }
    //              hard-coded based on general past experience ^^^
    // but call the overload to supply your own
    public ImageProvider(UriHelper uriHelper, float scalePercent)
    {
        _uriHelper = uriHelper;
        ScalePercent = scalePercent;
        Base64 = new Regex( // rfc2045, section 6.8 (alphabet/padding)
            @"^data:image/[^;]+;base64,(?<data>[a-z0-9+/]+={0,2})$",
            RegexOptions.Compiled | RegexOptions.IgnoreCase
        );
    }

    public virtual Image ScaleImage(Image img)
    {
        img.ScalePercent(ScalePercent);
        return img;
    }

    public virtual Image Retrieve(string src)
    {
        if (_imageCache.ContainsKey(src)) return _imageCache[src];

        try
        {
            if (Regex.IsMatch(src, "^https?://", RegexOptions.IgnoreCase))
            {
                return ScaleImage(Image.GetInstance(src));
            }

            Match match;
            if ((match = Base64.Match(src)).Length > 0)
            {
                return ScaleImage(Image.GetInstance(
                    Convert.FromBase64String(match.Groups["data"].Value)
                ));
            }

            var imgPath = _uriHelper.Combine(src);
            return ScaleImage(Image.GetInstance(imgPath));
        }
        // not implemented to keep the SO answer (relatively) short
        catch (BadElementException ex) { return null; }
        catch (IOException ex) { return null; }
        catch (Exception ex) { return null; }
    }

    /*
     * always called after Retrieve(string src):
     * [1] cache any duplicate <img> in the HTML source so the image bytes
     *     are only written to the PDF **once**, which reduces the 
     *     resulting file size.
     * [2] the cache can also **potentially** save network IO if you're
     *     running the parser in a loop, since Image.GetInstance() creates
     *     a WebRequest when an image resides on a remote server. couldn't
     *     find a CachePolicy in the source code
     */
    public virtual void Store(string src, Image img)
    {
        if (!_imageCache.ContainsKey(src)) _imageCache.Add(src, img);
    }

    /* XMLWorker documentation for ImageProvider recommends implementing
     * GetImageRootPath():
     * 
     * http://demo.itextsupport.com/xmlworker/itextdoc/flatsite.html#itextdoc-menu-10
     * 
     * but a quick run through the debugger never hits the breakpoint, so 
     * not sure if I'm missing something, or something has changed internally 
     * with XMLWorker....
     */
    public virtual string GetImageRootPath() { return null; }
    public virtual void Reset() { }
}

Based on the XML Worker documentation it's pretty straightforward to hook the implementations of ILinkProvider and IImageProvider above into a simple parser class:

/* a simple parser that uses XMLWorker and XMLParser to handle converting 
 * (most) images and hyperlinks internally
 */
public class SimpleParser
{
    public virtual ILinkProvider LinkProvider { get; set; }
    public virtual IImageProvider ImageProvider { get; set; }

    public virtual HtmlPipelineContext HtmlPipelineContext { get; set; }
    public virtual ITagProcessorFactory TagProcessorFactory { get; set; }
    public virtual ICSSResolver CssResolver { get; set; }

    /* overloads simplfied to keep SO answer (relatively) short. if needed
     * set LinkProvider/ImageProvider after instantiating SimpleParser()
     * to override the defaults (e.g. ImageProvider.ScalePercent)
     */
    public SimpleParser() : this(null) { }
    public SimpleParser(string baseUri)
    {
        LinkProvider = new LinkProvider(new UriHelper(baseUri, false));
        ImageProvider = new ImageProvider(new UriHelper(baseUri, true));

        HtmlPipelineContext = new HtmlPipelineContext(null);

        // another story altogether, and not implemented for simplicity 
        TagProcessorFactory = Tags.GetHtmlTagProcessorFactory();
        CssResolver = XMLWorkerHelper.GetInstance().GetDefaultCssResolver(true);
    }

    /*
     * when sending XHR via any of the popular JavaScript frameworks,
     * <img> tags are **NOT** always closed, which results in the 
     * infamous iTextSharp.tool.xml.exceptions.RuntimeWorkerException:
     * 'Invalid nested tag a found, expected closing tag img.' a simple
     * workaround.
     */
    public virtual string SimpleAjaxImgFix(string xHtml)
    {
        return Regex.Replace(
            xHtml,
            "(?<image><img[^>]+)(?<=[^/])>",
            new MatchEvaluator(match => match.Groups["image"].Value + " />"),
            RegexOptions.IgnoreCase | RegexOptions.Multiline
        );
    }

    public virtual void Parse(Stream stream, string xHtml)
    {
        xHtml = SimpleAjaxImgFix(xHtml);

        using (var stringReader = new StringReader(xHtml))
        {
            using (Document document = new Document())
            {
                PdfWriter writer = PdfWriter.GetInstance(document, stream);
                document.Open();

                HtmlPipelineContext
                    .SetTagFactory(Tags.GetHtmlTagProcessorFactory())
                    .SetLinkProvider(LinkProvider)
                    .SetImageProvider(ImageProvider)
                ;
                var pdfWriterPipeline = new PdfWriterPipeline(document, writer);
                var htmlPipeline = new HtmlPipeline(HtmlPipelineContext, pdfWriterPipeline);
                var cssResolverPipeline = new CssResolverPipeline(CssResolver, htmlPipeline);

                XMLWorker worker = new XMLWorker(cssResolverPipeline, true);
                XMLParser parser = new XMLParser(worker);
                parser.Parse(stringReader);
            }
        }
    }
}

As commented inline, SimpleAjaxImgFix(string xHtml) specifically handles XHR that may send unclosed <img> tags, which is valid HTML, but invalid XML that will break XMLWorker . A simple explanation & implementation of how to receive a PDF or other binary data with XHR and iTextSharp can be found here.

A Regex was used in SimpleAjaxImgFix(string xHtml) so that anyone using (copy/paste?) the code doesn't need to add another nuget package, but a HTML parser like HtmlAgilityPack should be used, since it's turns this:

<div><img src='a.gif'><br><hr></div>

into this:

<div><img src='a.gif' /><br /><hr /></div>

with only a few lines of code:

var hDocument = new HtmlDocument()
{
    OptionWriteEmptyNodes = true,
    OptionAutoCloseOnEnd = true
};
hDocument.LoadHtml("<div><img src='a.gif'><br><hr></div>");
var closedTags  = hDocument.DocumentNode.WriteTo();

Also of note - use SimpleParser.Parse() above as a general blueprint to additionally implement a custom ICSSResolver or ITagProcessorFactory, which is explained in the documentation.

Now the issues described in the question should be taken care of. Called from a MVC Action Method:

[HttpPost]  // some browsers have URL length limits
[ValidateInput(false)] // or throws HttpRequestValidationException
public ActionResult Index(string xHtml)
{
    Response.ContentType = "application/pdf";
    Response.AppendHeader(
        "Content-Disposition", "attachment; filename=test.pdf"
    );
    var simpleParser = new SimpleParser();
    simpleParser.Parse(Response.OutputStream, xHtml);

    return new EmptyResult();
}

or from a Web Form that gets HTML from a server control:

Response.ContentType = "application/pdf";
Response.AppendHeader("Content-Disposition", "attachment; filename=test.pdf");
using (var stringWriter = new StringWriter())
{
    using (var htmlWriter = new HtmlTextWriter(stringWriter))
    {
        ConvertControlToPdf.RenderControl(htmlWriter);
    }
    var simpleParser = new SimpleParser();
    simpleParser.Parse(Response.OutputStream, stringWriter.ToString());
}
Response.End();

or a simple HTML file with hyperlinks and images on the file system:

<h1>HTML Page 00 on Local File System</h1>
<div>
    <div>
        Relative <img>: <img src='Images/alt-gravatar.png' />
    </div>
    <div>
        Hyperlink to file system HTML page: 
        <a href='file-system-html-01.html'>Page 01</a>
    </div>
</div>

or HTML from a remote web site:

<div>
    <div>
        <img width="200" alt="Wikipedia Logo"
             src="portal/wikipedia.org/assets/img/Wikipedia-logo-v2.png">
    </div>
    <div lang="en">
        <a href="https://en.wikipedia.org/">English</a>
    </div>
    <div lang="en">
        <a href="wiki/IText">iText</a>
    </div>
</div>

Above two HTML snippets run from a console app:

var filePaths = Path.Combine(basePath, "file-system-html-00.html");
var htmlFile = File.ReadAllText(filePaths);
var remoteUrl = Path.Combine(basePath, "wikipedia.html");
var htmlRemote = File.ReadAllText(remoteUrl);
var outputFile = Path.Combine(basePath, "filePaths.pdf");
var outputRemote = Path.Combine(basePath, "remoteUrl.pdf");

using (var stream = new FileStream(outputFile, FileMode.Create))
{
    var simpleParser = new SimpleParser(basePath);
    simpleParser.Parse(stream, htmlFile);
}
using (var stream = new FileStream(outputRemote, FileMode.Create))
{
    var simpleParser = new SimpleParser("https://wikipedia.org");
    simpleParser.Parse(stream, htmlRemote);
}

Quite a long answer, but taking a look at questions here at SO tagged html, pdf, and itextsharp, as of this writing (2016-02-23) there are 776 results against 4,063 total tagged itextsharp - that's 19%.

Render images in iTextSharp HTML5 to PDF

It turns out that iTextSharp does not honor the doctype and instead forces XHTML. According to XHTML, the image tag needs to be closed. If you use it seems to generate without the exception. However, I was not able to get the base64 encoded content to render. There is no error in this case but the image does not show up.

Can Itextsharp.Xmlworker Render Embedded Images