Using iTextSharp to Extract and Update Links in an Existing PDF

Using iTextSharp to extract and update links in an existing PDF

This one is a little complicated if you don't know the internals of the PDF format and iText/iTextSharp's abstraction/implementation of it. You need to understand how to use PdfDictionary objects and look things up by their PdfName key. Once you get that, you can read through the official PDF spec and poke around a document pretty easily. If you do care, I've included the relevant sections of the PDF spec in parentheses where applicable.

Anyways, a link within a PDF is stored as an annotation (PDF Ref 12.5). Annotations are page-based, so you need to first get each page's annotation array individually. There are a bunch of different possible types of annotations, so you need to check each one's SUBTYPE and see if it's set to LINK (12.5.6.5). Every link should have an ACTION dictionary associated with it (12.6.2), and you want to check the action's S key to see what type of action it is. There are a bunch of possible ones for this; links specifically could be internal links or open-file links or play-sound links or something else (12.6.4.1). You are looking only for links that are of type URI (note the letter I and not the letter L). URI Actions (12.6.4.7) have a URI key that holds the actual address to navigate to. (There's also an IsMap property for image maps that I can't actually imagine anyone using.)

Whew. Still reading? Below is a full working VS 2010 C# WinForms app based on my post here targeting iTextSharp 5.1.1.0. This code does two main things: 1) creates a sample PDF with a link in it pointing to Google.com and 2) replaces that link with a link to bing.com. The code should be pretty well commented, but feel free to ask any questions that you might have.

using System;
using System.Text;
using System.Windows.Forms;
using iTextSharp.text;
using iTextSharp.text.pdf;
using System.IO;

namespace WindowsFormsApplication1
{
    /// <summary>
    /// Demo form that builds a one-page PDF containing a hyperlink to Google and then
    /// writes a copy of that PDF in which every URI link points to Bing instead.
    /// Both steps run from the form's Load handler, after which the form closes itself.
    /// </summary>
    public partial class Form1 : Form
    {
        //Folder that we are working in
        private static readonly string WorkingFolder = Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.Desktop), "Hyperlinked PDFs");
        //Sample PDF
        private static readonly string BaseFile = Path.Combine(WorkingFolder, "OldFile.pdf");
        //Final file
        private static readonly string OutputFile = Path.Combine(WorkingFolder, "NewFile.pdf");

        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Runs the whole demo (create the sample, rewrite its links) and closes the form.
        /// </summary>
        private void Form1_Load(object sender, EventArgs e)
        {
            CreateSamplePdf();
            UpdatePdfLinks();
            this.Close();
        }

        /// <summary>
        /// Creates <see cref="BaseFile"/>: a letter-sized PDF with a single blue
        /// "Go to URL" chunk whose action opens http://www.google.com/.
        /// </summary>
        private static void CreateSamplePdf()
        {
            //Create our output directory if it does not exist
            Directory.CreateDirectory(WorkingFolder);

            //Create our sample PDF
            using (iTextSharp.text.Document Doc = new iTextSharp.text.Document(PageSize.LETTER))
            {
                using (FileStream FS = new FileStream(BaseFile, FileMode.Create, FileAccess.Write, FileShare.Read))
                {
                    using (PdfWriter writer = PdfWriter.GetInstance(Doc, FS))
                    {
                        Doc.Open();

                        //Turn our hyperlink blue
                        iTextSharp.text.Font BlueFont = FontFactory.GetFont("Arial", 12, iTextSharp.text.Font.NORMAL, iTextSharp.text.BaseColor.BLUE);

                        Doc.Add(new Paragraph(new Chunk("Go to URL", BlueFont).SetAction(new PdfAction("http://www.google.com/", false))));

                        Doc.Close();
                    }
                }
            }
        }

        /// <summary>
        /// Reads <see cref="BaseFile"/>, points every URI link annotation at
        /// http://www.bing.com/, and writes the result to <see cref="OutputFile"/>.
        /// Only the link target changes; visible page text is left untouched.
        /// </summary>
        private static void UpdatePdfLinks()
        {
            //Open our reader
            PdfReader R = new PdfReader(BaseFile);
            try
            {
                //Get the page count
                int PageCount = R.NumberOfPages;

                //Loop through each page (PDF pages are 1-based)
                for (int i = 1; i <= PageCount; i++)
                {
                    //Get the current page
                    PdfDictionary PageDictionary = R.GetPageN(i);

                    //Get all of the annotations for the current page
                    PdfArray Annots = PageDictionary.GetAsArray(PdfName.ANNOTS);

                    //Make sure we have something. Size is the element count;
                    //Length is inherited from PdfObject and does not count entries.
                    if ((Annots == null) || (Annots.Size == 0))
                    {
                        continue;
                    }

                    //Loop through each annotation
                    foreach (PdfObject A in Annots.ArrayList)
                    {
                        //Resolve a possible indirect reference; skip anything that is not a dictionary
                        PdfDictionary AnnotationDictionary = PdfReader.GetPdfObject(A) as PdfDictionary;
                        if (AnnotationDictionary == null)
                        {
                            continue;
                        }

                        //Make sure this annotation is a link (a missing SUBTYPE is treated as "not a link")
                        if (!PdfName.LINK.Equals(AnnotationDictionary.GetAsName(PdfName.SUBTYPE)))
                        {
                            continue;
                        }

                        //Get the ACTION for the current annotation. GetAsDict follows indirect
                        //references, so actions stored as separate objects are handled too.
                        PdfDictionary AnnotationAction = AnnotationDictionary.GetAsDict(PdfName.A);
                        if (AnnotationAction == null)
                        {
                            continue;
                        }

                        //Test if it is a URI action
                        if (PdfName.URI.Equals(AnnotationAction.GetAsName(PdfName.S)))
                        {
                            //Change the URI to something else
                            AnnotationAction.Put(PdfName.URI, new PdfString("http://www.bing.com/"));
                        }
                    }
                }

                //Next we create a new document and import each page from the reader above
                using (FileStream FS = new FileStream(OutputFile, FileMode.Create, FileAccess.Write, FileShare.None))
                {
                    using (Document Doc = new Document())
                    {
                        using (PdfCopy writer = new PdfCopy(Doc, FS))
                        {
                            Doc.Open();
                            for (int i = 1; i <= R.NumberOfPages; i++)
                            {
                                writer.AddPage(writer.GetImportedPage(R, i));
                            }
                            Doc.Close();
                        }
                    }
                }
            }
            finally
            {
                //Release the handle on BaseFile even if something above throws
                R.Close();
            }
        }
    }
}

EDIT

I should note, this only changes the actual link. Any text within the document won't get updated. Annotations are drawn on top of text but aren't really tied to the text underneath in any way. That's another topic completely.

Convert external links in PDF to link to embedded attachment in iTextSharp

As noted in the comments you can only link to an embedded PDF file.

You are only changing the D entry. You need to override the entire A entry, but make sure to keep the target's location.

Here is a quick POC I created:

// Open the source document and a stamper that writes the modified copy.
PdfReader reader = new PdfReader(INPUT_FILE);
PdfStamper stamper = new PdfStamper(reader, new FileOutputStream(OUTPUT_FILE));

// Embed the target PDF as an attachment and give it the name the link will refer to.
PdfFileSpecification fs = PdfFileSpecification.fileEmbedded(stamper.getWriter(), null, "EmbeddedFile.pdf", FileUtils.readFileToByteArray(new File(INPUT_FOLDER + "embeddedfile.pdf")));
fs.addDescription("specificname", false);
stamper.getWriter().addFileAttachment(fs);
PdfTargetDictionary targetDictionary = new PdfTargetDictionary(true);
targetDictionary.setEmbeddedFileName("specificname");

// Build the replacement action: go to the embedded file, fitting the page in the window.
PdfDestination dest = new PdfDestination(PdfDestination.FIT);
dest.addFirst(new PdfNumber(1));
PdfAction action = PdfAction.gotoEmbedded(null, targetDictionary, dest, true);

PdfDictionary page = reader.getPageN(1);
PdfArray annotations = page.getAsArray(PdfName.ANNOTS);

// A page with no annotations has no /Annots entry, and getAsArray then returns null.
if (annotations != null) {
    for (int x = 0; x < annotations.size(); x++) {
        PdfDictionary annotation = annotations.getAsDict(x);
        // getAsDict returns null when the entry does not resolve to a dictionary.
        if (annotation == null) {
            continue;
        }
        PdfArray location = annotation.getAsArray(PdfName.RECT);
        action.put(PdfName.RECT, location);
        // Replace the whole /A entry, not just its /D destination.
        annotation.put(PdfName.A, action);
    }
}

stamper.close();

INPUT_FILE points to the original file, OUTPUT_FILE points to where you want it to be saved, and INPUT_FOLDER + "embeddedfile.pdf" points to the PDF file you want your annotation to link to.

So action is your new action that points to the embedded PDF file (and only a PDF file). We just replace the old annotation's A entry with action. Then we make sure to set action's location to the location of the old annotation.

Adding pdf file to existing pdf collection using iTextSharp

One way to do this is by marking page 2 for post-processing using a PdfPageLabels object. Some sample code. See the links in the first paragraph of the link to the sample code for another way to do this by adding hidden fields to the document.

Alter Links to Other PDF Documents

In case anyone cares, I was able to use the code linked by Chris Haas in the first comment above but modified it as follows:

// For every file: rewrite URI link targets and remote-file (GoToR-style) link targets
// by replacing findValue with replaceValue, case-insensitively.
foreach (FileInfo file in files)
{
    bool linkReplaced = false;

    PdfReader reader = new PdfReader(file.FullName);
    try
    {
        int pageCount = reader.NumberOfPages;

        //Loop through each page (PDF pages are 1-based)
        for (int i = 1; i <= pageCount; i++)
        {
            //Get the current page
            PdfDictionary pageDictionary = reader.GetPageN(i);

            //Get all of the annotations for the current page
            PdfArray annots = pageDictionary.GetAsArray(PdfName.ANNOTS);

            //Make sure we have something
            if ((annots == null) || (annots.Length == 0))
            {
                continue;
            }

            foreach (PdfObject A in annots.ArrayList)
            {
                //Resolve a possible indirect reference; skip anything that is not a dictionary
                PdfDictionary AnnotationDictionary = PdfReader.GetPdfObject(A) as PdfDictionary;
                if (AnnotationDictionary == null)
                {
                    continue;
                }

                //Make sure this annotation is a link (a missing SUBTYPE is treated as "not a link")
                if (!PdfName.LINK.Equals(AnnotationDictionary.Get(PdfName.SUBTYPE)))
                {
                    continue;
                }

                //Make sure this annotation has an ACTION
                PdfObject a = AnnotationDictionary.Get(PdfName.A);
                if (a == null)
                {
                    continue;
                }

                if (a.IsDictionary())
                {
                    //Get the ACTION for the current annotation
                    PdfDictionary AnnotationAction = (PdfDictionary)a;

                    //Test if it is a URI action
                    if (PdfName.URI.Equals(AnnotationAction.Get(PdfName.S)))
                    {
                        PdfObject uriObject = AnnotationAction.Get(PdfName.URI);
                        if (uriObject == null)
                        {
                            continue;
                        }

                        string uriValue = uriObject.ToString();

                        if (uriValue.IndexOf(findValue, StringComparison.OrdinalIgnoreCase) > -1)
                        {
                            string uriValueReplace = Replace(uriValue, findValue, replaceValue, StringComparison.OrdinalIgnoreCase);

                            //Change the URI to something else
                            AnnotationAction.Put(PdfName.URI, new PdfString(uriValueReplace));
                            linkReplaced = true;
                        }
                    }
                }
                else if (a.IsIndirect())
                {
                    // Get the indirect reference
                    PdfIndirectReference indirectRef = (PdfIndirectReference)a;

                    // Get the action object, which lives at the document level
                    PdfDictionary goToR = reader.GetPdfObject(indirectRef.Number) as PdfDictionary;
                    if (goToR == null)
                    {
                        continue;
                    }

                    // Get the FileSpec object, which is also at the document level
                    PdfObject f = goToR.Get(PdfName.F);

                    if (f == null || !f.IsIndirect())
                    {
                        continue;
                    }

                    PdfObject fileSpecObject = reader.GetPdfObject(((PdfIndirectReference)f).Number);

                    if (fileSpecObject == null || !fileSpecObject.IsDictionary())
                    {
                        continue;
                    }

                    PdfDictionary fileSpec = (PdfDictionary)fileSpecObject;

                    // Either key may be absent from a file specification, so read both defensively.
                    PdfObject fEntry = fileSpec.Get(PdfName.F);
                    PdfObject ufEntry = fileSpec.Get(PdfName.UF);
                    string fValue = fEntry == null ? string.Empty : fEntry.ToString();
                    string ufValue = ufEntry == null ? string.Empty : ufEntry.ToString();

                    if ((fValue.IndexOf(findValue, StringComparison.OrdinalIgnoreCase) > -1) || (ufValue.IndexOf(findValue, StringComparison.OrdinalIgnoreCase) > -1))
                    {
                        // Update the references to the file, touching only keys that already exist.
                        if (fEntry != null)
                        {
                            string fValueReplace = Replace(fValue, findValue, replaceValue, StringComparison.OrdinalIgnoreCase);
                            fileSpec.Put(PdfName.F, new PdfString(fValueReplace));
                        }

                        if (ufEntry != null)
                        {
                            // Derive the new UF from the old UF (previously it was built from F by mistake).
                            string ufValueReplace = Replace(ufValue, findValue, replaceValue, StringComparison.OrdinalIgnoreCase);
                            fileSpec.Put(PdfName.UF, new PdfString(ufValueReplace));
                        }

                        linkReplaced = true;
                    }
                }
            }
        }

        // NOTE(review): nothing here writes the modified document. If linkReplaced is true,
        // save it (e.g. via PdfStamper or PdfCopy) at this point, before the reader is closed.
    }
    finally
    {
        // Release the file handle even if a page fails to parse.
        reader.Close();
    }
}


Related Topics



Leave a reply



Submit