11 July 2010

PDFBox: insert/extract metadata into/from a PDF document


The Apache PDFBox project provides an API for handling PDF documents. In this post I'll show how I've used the PDFBox API to insert XMP metadata into, and extract it from, a PDF document.

Extracting metadata from a PDF document

Reading the metadata is as simple as:
InputStream in=new FileInputStream(pdfFile);
PDFParser parser=new PDFParser(in);
parser.parse();
PDMetadata metadata = parser.getPDDocument().getDocumentCatalog().getMetadata();
if(metadata!=null)
{
System.out.println(metadata.getInputStreamAsString());
}

Inserting metadata into a PDF document

The metadata to be inserted is stored in an XML file:
<?xml version="1.0" encoding="UTF-8"?>
<x:xmpmeta xmlns:x="adobe:ns:meta/">
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:foaf="http://xmlns.com/foaf/0.1/" xmlns:dc="http://purl.org/dc/elements/1.1/">
<rdf:Description rdf:about="">
<dc:creator rdf:resource="mailto:plindenbaum@yahoo.fr"/>
<dc:title>Hello World</dc:title>
<dc:date>2010-07-11</dc:date>
</rdf:Description>
<foaf:Person rdf:about="mailto:plindenbaum@yahoo.fr">
<foaf:name>Pierre Lindenbaum</foaf:name>
<foaf:depiction rdf:resource="http://a3.twimg.com/profile_images/51679789/photoIG_bigger.jpg"/>
</foaf:Person>
</rdf:RDF>
</x:xmpmeta>

This XML file is loaded as a DOM object in memory:
DocumentBuilderFactory f= DocumentBuilderFactory.newInstance();
f.setExpandEntityReferences(true);
f.setIgnoringComments(true);
f.setIgnoringElementContentWhitespace(true);
f.setValidating(false);
f.setCoalescing(true);
f.setNamespaceAware(true);
DocumentBuilder builder=f.newDocumentBuilder();
xmpDoc= builder.parse(xmpIn);
The source PDF is opened, the DOM document is attached to it as its XMP metadata, and the PDF is then saved:
InputStream in=new FileInputStream(pdfIn);
PDFParser parser=new PDFParser(in);
parser.parse();
document= parser.getPDDocument();
PDDocumentCatalog cat = document.getDocumentCatalog();
PDMetadata metadata = new PDMetadata(document);
metadata.importXMPMetadata(new XMPMetadata(xmpDoc));
cat.setMetadata(metadata);
document.save(pdfOut);

Source code: ExtractXMP.java

import java.io.*;
import org.apache.pdfbox.pdfparser.*;
import org.apache.pdfbox.pdmodel.*;
import org.apache.pdfbox.pdmodel.common.*;
import org.apache.jempbox.xmp.XMPMetadata;


/**
 * Command-line tool that prints the XMP metadata packet of one or more PDF
 * documents to standard output.
 *
 * Usage: {@code ExtractXMP [-h] [--] [pdf1 pdf2 ...]}. When no file is given
 * the PDF is read from standard input.
 */
public class ExtractXMP
{
    /**
     * Parses a PDF from the given stream and prints its XMP metadata, if the
     * document catalog has any, to standard output.
     *
     * The stream is not closed here; that is left to the caller.
     *
     * @param in stream positioned at the start of a PDF document
     * @throws Exception if the PDF cannot be parsed or the metadata cannot be read
     */
    static private void extract(InputStream in)
        throws Exception
    {
        PDDocument document = null;
        try
        {
            PDFParser parser = new PDFParser(in);
            parser.parse();
            document = parser.getPDDocument();
            if (document.isEncrypted())
            {
                // Only a warning: extraction is still attempted.
                System.err.println("Document is Encrypted!");
            }
            PDDocumentCatalog cat = document.getDocumentCatalog();
            PDMetadata metadata = cat.getMetadata();
            if (metadata != null)
            {
                System.out.println(metadata.getInputStreamAsString());
            }
        }
        finally
        {
            if (document != null)
            {
                try
                {
                    document.close();
                }
                catch (Throwable ignored)
                {
                    // Best-effort close: the document was only read.
                }
            }
        }
    }

    /**
     * Entry point. Handles {@code -h} and {@code --}, rejects any other
     * argument starting with {@code -}, then extracts the metadata of every
     * remaining argument (or of standard input when there is none).
     *
     * Processing stops at the first file that fails; the stack trace is
     * printed and the process exits with a non-zero status so that callers
     * such as make can detect the failure.
     *
     * @param args command-line arguments
     */
    static public void main(String args[])
    {
        int status = 0;
        try
        {
            int optind = 0;
            while (optind < args.length)
            {
                String arg = args[optind];
                if (arg.equals("-h"))
                {
                    System.err.println("Pierre Lindenbaum PhD. 2010");
                    System.err.println("-h this screen");
                    System.err.println("pdf1 pdf2 pdf3 ....");
                    return;
                }
                else if (arg.equals("--"))
                {
                    ++optind;
                    break;
                }
                else if (arg.startsWith("-"))
                {
                    System.err.println("bad argument " + arg);
                    System.exit(-1);
                }
                else
                {
                    // First non-option argument: the file list starts here.
                    break;
                }
                ++optind;
            }

            if (optind == args.length)
            {
                extract(System.in);
            }
            else
            {
                for (; optind < args.length; ++optind)
                {
                    InputStream in = new FileInputStream(args[optind]);
                    try
                    {
                        extract(in);
                    }
                    finally
                    {
                        // Close even when extract() throws, so the file
                        // handle is not leaked on a malformed PDF.
                        try
                        {
                            in.close();
                        }
                        catch (IOException ignored)
                        {
                            // Read-only stream: nothing to recover on close.
                        }
                    }
                }
            }
        }
        catch (Throwable err)
        {
            err.printStackTrace();
            status = -1;
        }
        if (status != 0)
        {
            System.exit(status);
        }
    }
}

Source code: InsertXMP.java

import java.io.*;
import org.apache.pdfbox.pdfparser.*;
import org.apache.pdfbox.pdmodel.*;
import org.apache.pdfbox.pdmodel.common.*;
import org.apache.jempbox.xmp.XMPMetadata;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import org.w3c.dom.Document;


/**
 * Command-line tool that replaces the XMP metadata of a PDF document with
 * the content of an XML file and writes the result to a new PDF.
 *
 * Usage: {@code InsertXMP -i <pdf-in> -x <xmp-in> -o <pdf-out>}.
 */
public class InsertXMP
{
    /**
     * Entry point. Delegates to {@link #run(String[])} and turns any failure
     * into a non-zero exit status so that callers such as make can detect it.
     *
     * @param args command-line arguments
     */
    static public void main(String args[])
    {
        int status;
        try
        {
            status = run(args);
        }
        catch (Throwable err)
        {
            err.printStackTrace();
            status = -1;
        }
        if (status != 0)
        {
            System.exit(status);
        }
    }

    /**
     * Parses and validates the command line, then performs the insertion.
     *
     * @param args command-line arguments
     * @return 0 on success (or after printing the help screen), -1 on a usage error
     * @throws Exception if the XMP file or the PDF cannot be read, or the output cannot be written
     */
    private static int run(String[] args)
        throws Exception
    {
        String xmpIn = null;
        String pdfIn = null;
        String pdfOut = null;
        int optind = 0;
        while (optind < args.length)
        {
            String arg = args[optind];
            boolean isXmpIn = arg.equals("-xmpin") || arg.equals("-x");
            boolean isPdfIn = arg.equals("-pdfin") || arg.equals("-i");
            boolean isPdfOut = arg.equals("-pdfout") || arg.equals("-o");
            if (arg.equals("-h"))
            {
                System.err.println("Pierre Lindenbaum PhD. 2010");
                System.err.println("-h this screen");
                System.err.println("-pdfin|-i <pdf-in>");
                System.err.println("-xmpin|-x <xmp-in>");
                System.err.println("-pdfout|-o <pdf-out>");
                return 0;
            }
            else if (isXmpIn || isPdfIn || isPdfOut)
            {
                // Guard against an option given as the very last argument,
                // which would otherwise index past the end of args.
                if (optind + 1 >= args.length)
                {
                    System.err.println("missing value for argument " + arg);
                    return -1;
                }
                String value = args[++optind];
                if (isXmpIn)
                {
                    xmpIn = value;
                }
                else if (isPdfIn)
                {
                    pdfIn = value;
                }
                else
                {
                    pdfOut = value;
                }
            }
            else if (arg.equals("--"))
            {
                ++optind;
                break;
            }
            else if (arg.startsWith("-"))
            {
                System.err.println("bad argument " + arg);
                return -1;
            }
            else
            {
                break;
            }
            ++optind;
        }

        if (optind != args.length)
        {
            System.err.println("Illegal number of arguments");
            return -1;
        }
        if (pdfIn == null)
        {
            System.err.println("pdf-in missing");
            return -1;
        }
        if (pdfOut == null)
        {
            System.err.println("pdf-out missing");
            return -1;
        }
        // Compare canonical files rather than raw strings so that spellings
        // such as "a.pdf" and "./a.pdf" are recognised as the same file.
        if (new File(pdfIn).getCanonicalFile().equals(new File(pdfOut).getCanonicalFile()))
        {
            System.err.println("pdf-out is same as pdf-in");
            return -1;
        }
        if (xmpIn == null)
        {
            System.err.println("XMP missing");
            return -1;
        }

        insert(loadXmp(xmpIn), pdfIn, pdfOut);
        return 0;
    }

    /**
     * Loads the XMP/XML file into a namespace-aware DOM document.
     *
     * NOTE(review): DTD processing and external entities are not disabled on
     * this parser; only feed it XMP files from a trusted source.
     *
     * @param xmpPath path of the XML file holding the metadata
     * @return the parsed DOM document
     * @throws Exception if the file cannot be read or is not well-formed XML
     */
    private static Document loadXmp(String xmpPath)
        throws Exception
    {
        DocumentBuilderFactory f = DocumentBuilderFactory.newInstance();
        f.setExpandEntityReferences(true);
        f.setIgnoringComments(true);
        f.setIgnoringElementContentWhitespace(true);
        f.setValidating(false);
        f.setCoalescing(true);
        f.setNamespaceAware(true);
        DocumentBuilder builder = f.newDocumentBuilder();
        // parse(String) interprets its argument as a URI; the command line
        // supplies a file path, so use the File overload.
        return builder.parse(new File(xmpPath));
    }

    /**
     * Opens the input PDF, sets the given DOM document as its XMP metadata
     * and saves the result. Both the PDF document and the input stream are
     * closed before returning, whether or not an error occurred.
     *
     * @param xmpDoc DOM document holding the XMP metadata
     * @param pdfIn  path of the PDF to read
     * @param pdfOut path of the PDF to write
     * @throws Exception if the PDF cannot be parsed or the output cannot be written
     */
    private static void insert(Document xmpDoc, String pdfIn, String pdfOut)
        throws Exception
    {
        PDDocument document = null;
        InputStream in = null;
        try
        {
            in = new FileInputStream(pdfIn);
            PDFParser parser = new PDFParser(in);
            parser.parse();
            document = parser.getPDDocument();
            if (document.isEncrypted())
            {
                // Only a warning: insertion is still attempted.
                System.err.println("Warning ! Document is Encrypted!");
            }
            PDDocumentCatalog cat = document.getDocumentCatalog();
            PDMetadata metadata = new PDMetadata(document);
            metadata.importXMPMetadata(new XMPMetadata(xmpDoc));
            cat.setMetadata(metadata);
            document.save(pdfOut);
        }
        finally
        {
            if (document != null)
            {
                try
                {
                    document.close();
                }
                catch (Throwable ignored)
                {
                    // Best-effort close: the output has already been saved or has failed.
                }
            }
            if (in != null)
            {
                try
                {
                    in.close();
                }
                catch (Throwable ignored)
                {
                    // Read-only stream: nothing to recover on close.
                }
            }
        }
    }
}

Example

The following Makefile downloads a PDF file, compiles both programs, then inserts and extracts the metadata:
CLASSPATH=pdfbox-app-1.2.1.jar:pdfbox-app-1.2.1.jar:.
test: InsertXMP.class ExtractXMP.class article.pdf
echo "Metadata in article"
java -cp ${CLASSPATH} ExtractXMP article.pdf
echo "Insert Metadata in article"
java -cp ${CLASSPATH} InsertXMP -i article.pdf -o article_meta.pdf -x metadata.xmp
echo "Metadata in new article"
java -cp ${CLASSPATH} ExtractXMP article_meta.pdf
InsertXMP.class:InsertXMP.java
javac -cp ${CLASSPATH} InsertXMP.java
ExtractXMP.class:ExtractXMP.java
javac -cp ${CLASSPATH} ExtractXMP.java

article.pdf:
wget -O $@ "http://www.biomedcentral.com/content/pdf/1471-2156-10-16.pdf"

Output

javac -cp pdfbox-app-1.2.1.jar:pdfbox-app-1.2.1.jar:. InsertXMP.java
javac -cp pdfbox-app-1.2.1.jar:pdfbox-app-1.2.1.jar:. ExtractXMP.java

wget -O article.pdf "http://www.biomedcentral.com/content/pdf/1471-2156-10-16.pdf"
--2010-07-11 13:15:10-- http://www.biomedcentral.com/content/pdf/1471-2156-10-16.pdf
Resolving www.biomedcentral.com... 213.219.33.18
Connecting to www.biomedcentral.com|213.219.33.18|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1295524 (1.2M) [application/pdf]
Saving to: `article.pdf'

100%[================================================================================>] 1,295,524 123K/s in 11s

2010-07-11 13:15:21 (113 KB/s) - `article.pdf' saved [1295524/1295524]

Metadata in article
java -cp pdfbox-app-1.2.1.jar:pdfbox-app-1.2.1.jar:. ExtractXMP article.pdf

<?xpacket begin="" id="W5M0MpCehiHzreSzNTczkc9d"?>
<x:xmpmeta xmlns:x="adobe:ns:meta/" x:xmptk="3.1-701">
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
<rdf:Description rdf:about=""
xmlns:pdf="http://ns.adobe.com/pdf/1.3/">
<pdf:Producer>Acrobat Distiller 7.0 (Windows)</pdf:Producer>
</rdf:Description>
<rdf:Description rdf:about=""
xmlns:xap="http://ns.adobe.com/xap/1.0/">
<xap:CreateDate>2009-05-05T19:28:38Z</xap:CreateDate>
<xap:CreatorTool>FrameMaker 7.1</xap:CreatorTool>
<xap:ModifyDate>2009-05-06T02:18:59+05:30</xap:ModifyDate>
<xap:MetadataDate>2009-05-06T02:18:59+05:30</xap:MetadataDate>
</rdf:Description>
<rdf:Description rdf:about=""
xmlns:dc="http://purl.org/dc/elements/1.1/">
<dc:format>application/pdf</dc:format>
<dc:title>
<rdf:Alt>
<rdf:li xml:lang="x-default">1471-2156-10-16.fm</rdf:li>
</rdf:Alt>
</dc:title>
<dc:creator>
<rdf:Seq>
<rdf:li>Ezhilan</rdf:li>
</rdf:Seq>
</dc:creator>
</rdf:Description>
<rdf:Description rdf:about=""
xmlns:xapMM="http://ns.adobe.com/xap/1.0/mm/">
<xapMM:DocumentID>uuid:d1d0f8d9-8321-4e4b-828b-d31b75daba0f</xapMM:DocumentID>
<xapMM:InstanceID>uuid:39d7db98-d873-4b33-be85-87319547e81c</xapMM:InstanceID>
</rdf:Description>
</rdf:RDF>
</x:xmpmeta>

<?xpacket end="w"?>



Insert Metadata in article


java -cp pdfbox-app-1.2.1.jar:pdfbox-app-1.2.1.jar:. InsertXMP -i article.pdf -o article_meta.pdf -x meta.xmp

Metadata in new article


java -cp pdfbox-app-1.2.1.jar:pdfbox-app-1.2.1.jar:. ExtractXMP article_meta.pdf
<x:xmpmeta xmlns:x="adobe:ns:meta/">
<rdf:RDF xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:foaf="http://xmlns.com/foaf/0.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
<rdf:Description rdf:about="">
<dc:creator rdf:resource="mailto:plindenbaum@yahoo.fr"/>
<dc:title>Hello World</dc:title>
<dc:date>2010-07-11</dc:date>
</rdf:Description>
<foaf:Person rdf:about="mailto:plindenbaum@yahoo.fr">
<foaf:name>Pierre Lindenbaum</foaf:name>
<foaf:depiction rdf:resource="http://profile.ak.facebook.com/profile5/1306/97/s501154465_2583.jpg"/>
</foaf:Person>
</rdf:RDF>
</x:xmpmeta>


That's it

Pierre

3 comments:

Anonymous said...

How can I run this in Eclipse? Thanks!

Anonymous said...

How can I incorporate this with eclipse? Thank you.

Unknown said...

From this code I got metadata... but i need keyword value. how we can extract metadata.getInputStreamAsString() values

InputStream in=new FileInputStream(pdfFile);
PDFParser parser=new PDFParser(in);
parser.parse();
PDMetadata metadata = parser.getPDDocument().getDocumentCatalog().getMetadata();
if(metadata!=null)
{
System.out.println(metadata.getInputStreamAsString());
}