PDFBox: insert/extract metadata from/into a PDF document
The Apache PDFBox project provides an API for handling PDF documents. In this post I'll show how I've used the PDFBox API to insert and extract some XMP metadata into/from a PDF document.
Extracting metadata from a PDF document
Reading the metadata is as simple as:
InputStream in=new FileInputStream(pdfFile);
// Parse the PDF read from the input stream opened above.
PDFParser parser=new PDFParser(in);
parser.parse();
// Fetch the metadata object from the document catalog; it is null-checked before use.
PDMetadata metadata = parser.getPDDocument().getDocumentCatalog().getMetadata();
if(metadata!=null)
{
// Print the content of the metadata stream as a string.
System.out.println(metadata.getInputStreamAsString());
}
// Parse the PDF read from the input stream opened above.
PDFParser parser=new PDFParser(in);
parser.parse();
// Fetch the metadata object from the document catalog; it is null-checked before use.
PDMetadata metadata = parser.getPDDocument().getDocumentCatalog().getMetadata();
if(metadata!=null)
{
// Print the content of the metadata stream as a string.
System.out.println(metadata.getInputStreamAsString());
}
Inserting metadata into a PDF document
The metadata to be inserted are stored in an XML file:
<?xml version="1.0" encoding="UTF-8"?>
<x:xmpmeta xmlns:x="adobe:ns:meta/">
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:foaf="http://xmlns.com/foaf/0.1/" xmlns:dc="http://purl.org/dc/elements/1.1/">
<rdf:Description rdf:about="">
<dc:creator rdf:resource="mailto:plindenbaum@yahoo.fr"/>
<dc:title>Hello World</dc:title>
<dc:date>2010-07-11</dc:date>
</rdf:Description>
<foaf:Person rdf:about="mailto:plindenbaum@yahoo.fr">
<foaf:name>Pierre Lindenbaum</foaf:name>
<foaf:depiction rdf:resource="http://a3.twimg.com/profile_images/51679789/photoIG_bigger.jpg"/>
</foaf:Person>
</rdf:RDF>
</x:xmpmeta>
<x:xmpmeta xmlns:x="adobe:ns:meta/">
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:foaf="http://xmlns.com/foaf/0.1/" xmlns:dc="http://purl.org/dc/elements/1.1/">
<rdf:Description rdf:about="">
<dc:creator rdf:resource="mailto:plindenbaum@yahoo.fr"/>
<dc:title>Hello World</dc:title>
<dc:date>2010-07-11</dc:date>
</rdf:Description>
<foaf:Person rdf:about="mailto:plindenbaum@yahoo.fr">
<foaf:name>Pierre Lindenbaum</foaf:name>
<foaf:depiction rdf:resource="http://a3.twimg.com/profile_images/51679789/photoIG_bigger.jpg"/>
</foaf:Person>
</rdf:RDF>
</x:xmpmeta>
This XML file is loaded as a DOM object in memory:
// Configure a DOM parser factory: namespace-aware, non-validating, with comments
// and ignorable element whitespace dropped, CDATA coalesced into text nodes
// and entity references expanded.
DocumentBuilderFactory f= DocumentBuilderFactory.newInstance();
f.setExpandEntityReferences(true);
f.setIgnoringComments(true);
f.setIgnoringElementContentWhitespace(true);
f.setValidating(false);
f.setCoalescing(true);
f.setNamespaceAware(true);
DocumentBuilder builder=f.newDocumentBuilder();
// Parse the XMP source into an in-memory DOM document.
xmpDoc= builder.parse(xmpIn);
The PDF source is opened and the DOM document is inserted as its metadata. The PDF is then saved:
f.setExpandEntityReferences(true);
// Remaining factory settings: drop comments and ignorable element whitespace,
// do not validate, coalesce CDATA into text nodes, be namespace-aware.
f.setIgnoringComments(true);
f.setIgnoringElementContentWhitespace(true);
f.setValidating(false);
f.setCoalescing(true);
f.setNamespaceAware(true);
DocumentBuilder builder=f.newDocumentBuilder();
// Parse the XMP source into an in-memory DOM document.
xmpDoc= builder.parse(xmpIn);
// Open and parse the source PDF.
InputStream in=new FileInputStream(pdfIn);
PDFParser parser=new PDFParser(in);
parser.parse();
document= parser.getPDDocument();
PDDocumentCatalog cat = document.getDocumentCatalog();
// Create a new metadata object for this document and fill it from the XMP DOM.
PDMetadata metadata = new PDMetadata(document);
metadata.importXMPMetadata(new XMPMetadata(xmpDoc));
// Set it as the catalog's metadata, then write the document to the output path.
cat.setMetadata(metadata);
document.save(pdfOut);
// Parse the source PDF from the already opened input stream.
PDFParser parser=new PDFParser(in);
parser.parse();
document= parser.getPDDocument();
PDDocumentCatalog cat = document.getDocumentCatalog();
// Create a new metadata object for this document and fill it from the XMP DOM.
PDMetadata metadata = new PDMetadata(document);
metadata.importXMPMetadata(new XMPMetadata(xmpDoc));
// Set it as the catalog's metadata, then write the document to the output path.
cat.setMetadata(metadata);
document.save(pdfOut);
Source code: ExtractXMP.java
import java.io.*;
import org.apache.pdfbox.pdfparser.*;
import org.apache.pdfbox.pdmodel.*;
import org.apache.pdfbox.pdmodel.common.*;
import org.apache.jempbox.xmp.XMPMetadata;
/**
 * Command-line tool that prints the metadata stream stored in the catalog of
 * one or more PDF documents.
 *
 * <p>Usage: {@code ExtractXMP [-h] [--] [pdf1 pdf2 pdf3 ...]}. When no file
 * is given, the PDF is read from standard input.
 */
public class ExtractXMP
{
    /**
     * Parses a PDF from the given stream and prints its catalog metadata, if
     * any, to standard output. A message is written to standard error when the
     * document reports itself as encrypted.
     *
     * <p>This method does not call {@code in.close()} itself; the caller stays
     * responsible for the stream.
     *
     * @param in stream positioned at the start of a PDF document
     * @throws Exception any failure raised while parsing or reading the metadata
     */
    static private void extract(InputStream in)
        throws Exception
    {
        PDDocument document = null;
        try
        {
            PDFParser parser = new PDFParser(in);
            parser.parse();
            document = parser.getPDDocument();
            if (document.isEncrypted())
            {
                System.err.println("Document is Encrypted!");
            }
            PDDocumentCatalog cat = document.getDocumentCatalog();
            PDMetadata metadata = cat.getMetadata();
            if (metadata != null)
            {
                System.out.println(metadata.getInputStreamAsString());
            }
        }
        finally
        {
            if (document != null)
            {
                try
                {
                    document.close();
                }
                catch (Throwable ignored)
                {
                    // Best-effort cleanup: a failure to close must not hide
                    // the result (or the exception) of the extraction itself.
                }
            }
        }
    }

    /**
     * Program entry point: parses the options, then extracts the metadata of
     * every listed file, or of standard input when no file is listed.
     *
     * @param args command-line arguments
     */
    static public void main(String args[])
    {
        try
        {
            int optind = 0;
            while (optind < args.length)
            {
                if (args[optind].equals("-h"))
                {
                    System.err.println("Pierre Lindenbaum PhD. 2010");
                    System.err.println("-h this screen");
                    System.err.println("pdf1 pdf2 pdf3 ....");
                    return;
                }
                else if (args[optind].equals("--"))
                {
                    // Explicit end of options: everything after is a file name.
                    ++optind;
                    break;
                }
                else if (args[optind].startsWith("-"))
                {
                    System.err.println("bad argument " + args[optind]);
                    System.exit(-1);
                }
                else
                {
                    break;
                }
                ++optind;
            }
            if (optind == args.length)
            {
                extract(System.in);
            }
            else
            {
                while (optind < args.length)
                {
                    String filename = args[optind++];
                    InputStream in = new FileInputStream(filename);
                    try
                    {
                        extract(in);
                    }
                    finally
                    {
                        // Always release the file handle, even when the
                        // extraction throws; previously it leaked in that case.
                        try
                        {
                            in.close();
                        }
                        catch (IOException err)
                        {
                            System.err.println("warning: could not close " + filename + ": " + err.getMessage());
                        }
                    }
                }
            }
        }
        catch (Throwable err)
        {
            err.printStackTrace();
        }
    }
}
import org.apache.pdfbox.pdfparser.*;
import org.apache.pdfbox.pdmodel.*;
import org.apache.pdfbox.pdmodel.common.*;
import org.apache.jempbox.xmp.XMPMetadata;
/**
 * Command-line tool that prints the metadata stream stored in the catalog of
 * one or more PDF documents.
 *
 * <p>Usage: {@code ExtractXMP [-h] [--] [pdf1 pdf2 pdf3 ...]}. When no file
 * is given, the PDF is read from standard input.
 */
public class ExtractXMP
{
    /**
     * Parses a PDF from the given stream and prints its catalog metadata, if
     * any, to standard output. A message is written to standard error when the
     * document reports itself as encrypted.
     *
     * <p>This method does not call {@code in.close()} itself; the caller stays
     * responsible for the stream.
     *
     * @param in stream positioned at the start of a PDF document
     * @throws Exception any failure raised while parsing or reading the metadata
     */
    static private void extract(InputStream in)
        throws Exception
    {
        PDDocument document = null;
        try
        {
            PDFParser parser = new PDFParser(in);
            parser.parse();
            document = parser.getPDDocument();
            if (document.isEncrypted())
            {
                System.err.println("Document is Encrypted!");
            }
            PDDocumentCatalog cat = document.getDocumentCatalog();
            PDMetadata metadata = cat.getMetadata();
            if (metadata != null)
            {
                System.out.println(metadata.getInputStreamAsString());
            }
        }
        finally
        {
            if (document != null)
            {
                try
                {
                    document.close();
                }
                catch (Throwable ignored)
                {
                    // Best-effort cleanup: a failure to close must not hide
                    // the result (or the exception) of the extraction itself.
                }
            }
        }
    }

    /**
     * Program entry point: parses the options, then extracts the metadata of
     * every listed file, or of standard input when no file is listed.
     *
     * @param args command-line arguments
     */
    static public void main(String args[])
    {
        try
        {
            int optind = 0;
            while (optind < args.length)
            {
                if (args[optind].equals("-h"))
                {
                    System.err.println("Pierre Lindenbaum PhD. 2010");
                    System.err.println("-h this screen");
                    System.err.println("pdf1 pdf2 pdf3 ....");
                    return;
                }
                else if (args[optind].equals("--"))
                {
                    // Explicit end of options: everything after is a file name.
                    ++optind;
                    break;
                }
                else if (args[optind].startsWith("-"))
                {
                    System.err.println("bad argument " + args[optind]);
                    System.exit(-1);
                }
                else
                {
                    break;
                }
                ++optind;
            }
            if (optind == args.length)
            {
                extract(System.in);
            }
            else
            {
                while (optind < args.length)
                {
                    String filename = args[optind++];
                    InputStream in = new FileInputStream(filename);
                    try
                    {
                        extract(in);
                    }
                    finally
                    {
                        // Always release the file handle, even when the
                        // extraction throws; previously it leaked in that case.
                        try
                        {
                            in.close();
                        }
                        catch (IOException err)
                        {
                            System.err.println("warning: could not close " + filename + ": " + err.getMessage());
                        }
                    }
                }
            }
        }
        catch (Throwable err)
        {
            err.printStackTrace();
        }
    }
}
Source code: InsertXMP.java
import java.io.*;
import org.apache.pdfbox.pdfparser.*;
import org.apache.pdfbox.pdmodel.*;
import org.apache.pdfbox.pdmodel.common.*;
import org.apache.jempbox.xmp.XMPMetadata;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import org.w3c.dom.Document;
/**
 * Command-line tool that loads an XMP document from an XML source, sets it as
 * the catalog metadata of a PDF and saves the result under a new name.
 *
 * <p>Usage: {@code InsertXMP -i <pdf-in> -x <xmp-in> -o <pdf-out>}.
 */
public class InsertXMP
{
    /**
     * Checks that the option at {@code args[optind]} is followed by a value.
     * Prints an error on standard error when it is not.
     *
     * @param args   command-line arguments
     * @param optind index of the option currently being parsed
     * @return {@code true} when {@code args[optind + 1]} exists
     */
    private static boolean hasValue(String args[], int optind)
    {
        if (optind + 1 < args.length)
        {
            return true;
        }
        System.err.println("missing value for option " + args[optind]);
        return false;
    }

    /**
     * Program entry point: parses the options, validates them, loads the XMP
     * document, inserts it into the input PDF and writes the output PDF.
     * Problems are reported on standard error.
     *
     * @param args command-line arguments
     */
    static public void main(String args[])
    {
        PDDocument document = null;
        InputStream in = null;
        try
        {
            String xmpIn = null;
            String pdfIn = null;
            String pdfOut = null;
            Document xmpDoc = null;
            int optind = 0;
            while (optind < args.length)
            {
                if (args[optind].equals("-h"))
                {
                    System.err.println("Pierre Lindenbaum PhD. 2010");
                    System.err.println("-h this screen");
                    System.err.println("-pdfin|-i <pdf-in>");
                    System.err.println("-xmpin|-x <xmp-in>");
                    System.err.println("-pdfout|-o <pdf-out>");
                    return;
                }
                else if (args[optind].equals("-xmpin") || args[optind].equals("-x"))
                {
                    // Guard against a trailing option without its value, which
                    // used to raise ArrayIndexOutOfBoundsException.
                    if (!hasValue(args, optind))
                    {
                        return;
                    }
                    xmpIn = args[++optind];
                }
                else if (args[optind].equals("-pdfin") || args[optind].equals("-i"))
                {
                    if (!hasValue(args, optind))
                    {
                        return;
                    }
                    pdfIn = args[++optind];
                }
                else if (args[optind].equals("-pdfout") || args[optind].equals("-o"))
                {
                    if (!hasValue(args, optind))
                    {
                        return;
                    }
                    pdfOut = args[++optind];
                }
                else if (args[optind].equals("--"))
                {
                    ++optind;
                    break;
                }
                else if (args[optind].startsWith("-"))
                {
                    System.err.println("bad argument " + args[optind]);
                    System.exit(-1);
                }
                else
                {
                    break;
                }
                ++optind;
            }
            // No positional argument is accepted: everything comes from options.
            if (optind != args.length)
            {
                System.err.println("Illegal number of arguments");
                return;
            }
            if (pdfIn == null)
            {
                System.err.println("pdf-in missing");
                return;
            }
            if (pdfOut == null)
            {
                System.err.println("pdf-out missing");
                return;
            }
            // Plain string comparison of the two names as typed.
            if (pdfIn.equals(pdfOut))
            {
                System.err.println("pdf-out is same as pdf-in");
                return;
            }
            if (xmpIn == null)
            {
                System.err.println("XMP missing");
                return;
            }

            DocumentBuilderFactory f = DocumentBuilderFactory.newInstance();
            f.setExpandEntityReferences(true);
            f.setIgnoringComments(true);
            f.setIgnoringElementContentWhitespace(true);
            f.setValidating(false);
            f.setCoalescing(true);
            f.setNamespaceAware(true);
            DocumentBuilder builder = f.newDocumentBuilder();
            // DocumentBuilder.parse(String) interprets its argument as a URI.
            xmpDoc = builder.parse(xmpIn);

            in = new FileInputStream(pdfIn);
            PDFParser parser = new PDFParser(in);
            parser.parse();
            document = parser.getPDDocument();
            if (document.isEncrypted())
            {
                System.err.println("Warning ! Document is Encrypted!");
            }
            PDDocumentCatalog cat = document.getDocumentCatalog();
            PDMetadata metadata = new PDMetadata(document);
            metadata.importXMPMetadata(new XMPMetadata(xmpDoc));
            cat.setMetadata(metadata);
            document.save(pdfOut);
        }
        catch (Throwable err)
        {
            err.printStackTrace();
        }
        finally
        {
            // Best-effort cleanup: close failures are deliberately ignored so
            // they cannot mask the outcome reported above.
            if (document != null)
            {
                try
                {
                    document.close();
                }
                catch (Throwable ignored)
                {
                    // ignored on purpose, see above
                }
            }
            if (in != null)
            {
                try
                {
                    in.close();
                }
                catch (Throwable ignored)
                {
                    // ignored on purpose, see above
                }
            }
        }
    }
}
import org.apache.pdfbox.pdfparser.*;
import org.apache.pdfbox.pdmodel.*;
import org.apache.pdfbox.pdmodel.common.*;
import org.apache.jempbox.xmp.XMPMetadata;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import org.w3c.dom.Document;
/**
 * Command-line tool that loads an XMP document from an XML source, sets it as
 * the catalog metadata of a PDF and saves the result under a new name.
 *
 * <p>Usage: {@code InsertXMP -i <pdf-in> -x <xmp-in> -o <pdf-out>}.
 */
public class InsertXMP
{
    /**
     * Checks that the option at {@code args[optind]} is followed by a value.
     * Prints an error on standard error when it is not.
     *
     * @param args   command-line arguments
     * @param optind index of the option currently being parsed
     * @return {@code true} when {@code args[optind + 1]} exists
     */
    private static boolean hasValue(String args[], int optind)
    {
        if (optind + 1 < args.length)
        {
            return true;
        }
        System.err.println("missing value for option " + args[optind]);
        return false;
    }

    /**
     * Program entry point: parses the options, validates them, loads the XMP
     * document, inserts it into the input PDF and writes the output PDF.
     * Problems are reported on standard error.
     *
     * @param args command-line arguments
     */
    static public void main(String args[])
    {
        PDDocument document = null;
        InputStream in = null;
        try
        {
            String xmpIn = null;
            String pdfIn = null;
            String pdfOut = null;
            Document xmpDoc = null;
            int optind = 0;
            while (optind < args.length)
            {
                if (args[optind].equals("-h"))
                {
                    System.err.println("Pierre Lindenbaum PhD. 2010");
                    System.err.println("-h this screen");
                    System.err.println("-pdfin|-i <pdf-in>");
                    System.err.println("-xmpin|-x <xmp-in>");
                    System.err.println("-pdfout|-o <pdf-out>");
                    return;
                }
                else if (args[optind].equals("-xmpin") || args[optind].equals("-x"))
                {
                    // Guard against a trailing option without its value, which
                    // used to raise ArrayIndexOutOfBoundsException.
                    if (!hasValue(args, optind))
                    {
                        return;
                    }
                    xmpIn = args[++optind];
                }
                else if (args[optind].equals("-pdfin") || args[optind].equals("-i"))
                {
                    if (!hasValue(args, optind))
                    {
                        return;
                    }
                    pdfIn = args[++optind];
                }
                else if (args[optind].equals("-pdfout") || args[optind].equals("-o"))
                {
                    if (!hasValue(args, optind))
                    {
                        return;
                    }
                    pdfOut = args[++optind];
                }
                else if (args[optind].equals("--"))
                {
                    ++optind;
                    break;
                }
                else if (args[optind].startsWith("-"))
                {
                    System.err.println("bad argument " + args[optind]);
                    System.exit(-1);
                }
                else
                {
                    break;
                }
                ++optind;
            }
            // No positional argument is accepted: everything comes from options.
            if (optind != args.length)
            {
                System.err.println("Illegal number of arguments");
                return;
            }
            if (pdfIn == null)
            {
                System.err.println("pdf-in missing");
                return;
            }
            if (pdfOut == null)
            {
                System.err.println("pdf-out missing");
                return;
            }
            // Plain string comparison of the two names as typed.
            if (pdfIn.equals(pdfOut))
            {
                System.err.println("pdf-out is same as pdf-in");
                return;
            }
            if (xmpIn == null)
            {
                System.err.println("XMP missing");
                return;
            }

            DocumentBuilderFactory f = DocumentBuilderFactory.newInstance();
            f.setExpandEntityReferences(true);
            f.setIgnoringComments(true);
            f.setIgnoringElementContentWhitespace(true);
            f.setValidating(false);
            f.setCoalescing(true);
            f.setNamespaceAware(true);
            DocumentBuilder builder = f.newDocumentBuilder();
            // DocumentBuilder.parse(String) interprets its argument as a URI.
            xmpDoc = builder.parse(xmpIn);

            in = new FileInputStream(pdfIn);
            PDFParser parser = new PDFParser(in);
            parser.parse();
            document = parser.getPDDocument();
            if (document.isEncrypted())
            {
                System.err.println("Warning ! Document is Encrypted!");
            }
            PDDocumentCatalog cat = document.getDocumentCatalog();
            PDMetadata metadata = new PDMetadata(document);
            metadata.importXMPMetadata(new XMPMetadata(xmpDoc));
            cat.setMetadata(metadata);
            document.save(pdfOut);
        }
        catch (Throwable err)
        {
            err.printStackTrace();
        }
        finally
        {
            // Best-effort cleanup: close failures are deliberately ignored so
            // they cannot mask the outcome reported above.
            if (document != null)
            {
                try
                {
                    document.close();
                }
                catch (Throwable ignored)
                {
                    // ignored on purpose, see above
                }
            }
            if (in != null)
            {
                try
                {
                    in.close();
                }
                catch (Throwable ignored)
                {
                    // ignored on purpose, see above
                }
            }
        }
    }
}
Example
The following Makefile downloads a PDF file, compiles both programs, then inserts and extracts the metadata:
CLASSPATH=pdfbox-app-1.2.1.jar:pdfbox-app-1.2.1.jar:.
# Demo target: print the metadata of article.pdf, insert metadata.xmp into a
# copy named article_meta.pdf, then print the metadata of that copy.
test: InsertXMP.class ExtractXMP.class article.pdf
echo "Metadata in article"
java -cp ${CLASSPATH} ExtractXMP article.pdf
echo "Insert Metadata in article"
java -cp ${CLASSPATH} InsertXMP -i article.pdf -o article_meta.pdf -x metadata.xmp
echo "Metadata in new article"
java -cp ${CLASSPATH} ExtractXMP article_meta.pdf
# Compile the insertion tool against the jars listed in CLASSPATH.
InsertXMP.class:InsertXMP.java
javac -cp ${CLASSPATH} InsertXMP.java
# Compile the extraction tool against the jars listed in CLASSPATH.
ExtractXMP.class:ExtractXMP.java
javac -cp ${CLASSPATH} ExtractXMP.java
# Download the sample PDF; $@ expands to the target name (article.pdf).
article.pdf:
wget -O $@ "http://www.biomedcentral.com/content/pdf/1471-2156-10-16.pdf"
# Demo target: print the metadata of article.pdf, insert metadata.xmp into a
# copy named article_meta.pdf, then print the metadata of that copy.
test: InsertXMP.class ExtractXMP.class article.pdf
echo "Metadata in article"
java -cp ${CLASSPATH} ExtractXMP article.pdf
echo "Insert Metadata in article"
java -cp ${CLASSPATH} InsertXMP -i article.pdf -o article_meta.pdf -x metadata.xmp
echo "Metadata in new article"
java -cp ${CLASSPATH} ExtractXMP article_meta.pdf
# Compile the insertion tool against the jars listed in CLASSPATH.
InsertXMP.class:InsertXMP.java
javac -cp ${CLASSPATH} InsertXMP.java
# Compile the extraction tool against the jars listed in CLASSPATH.
ExtractXMP.class:ExtractXMP.java
javac -cp ${CLASSPATH} ExtractXMP.java
# Download the sample PDF; $@ expands to the target name (article.pdf).
article.pdf:
wget -O $@ "http://www.biomedcentral.com/content/pdf/1471-2156-10-16.pdf"
Output
javac -cp pdfbox-app-1.2.1.jar:pdfbox-app-1.2.1.jar:. InsertXMP.java
javac -cp pdfbox-app-1.2.1.jar:pdfbox-app-1.2.1.jar:. ExtractXMP.java
wget -O article.pdf "http://www.biomedcentral.com/content/pdf/1471-2156-10-16.pdf"
--2010-07-11 13:15:10-- http://www.biomedcentral.com/content/pdf/1471-2156-10-16.pdf
Resolving www.biomedcentral.com... 213.219.33.18
Connecting to www.biomedcentral.com|213.219.33.18|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1295524 (1.2M) [application/pdf]
Saving to: `article.pdf'
100%[================================================================================>] 1,295,524 123K/s in 11s
2010-07-11 13:15:21 (113 KB/s) - `article.pdf' saved [1295524/1295524]
Metadata in article
java -cp pdfbox-app-1.2.1.jar:pdfbox-app-1.2.1.jar:. ExtractXMP article.pdf
<?xpacket begin="" id="W5M0MpCehiHzreSzNTczkc9d"?>
<x:xmpmeta xmlns:x="adobe:ns:meta/" x:xmptk="3.1-701">
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
<rdf:Description rdf:about=""
xmlns:pdf="http://ns.adobe.com/pdf/1.3/">
<pdf:Producer>Acrobat Distiller 7.0 (Windows)</pdf:Producer>
</rdf:Description>
<rdf:Description rdf:about=""
xmlns:xap="http://ns.adobe.com/xap/1.0/">
<xap:CreateDate>2009-05-05T19:28:38Z</xap:CreateDate>
<xap:CreatorTool>FrameMaker 7.1</xap:CreatorTool>
<xap:ModifyDate>2009-05-06T02:18:59+05:30</xap:ModifyDate>
<xap:MetadataDate>2009-05-06T02:18:59+05:30</xap:MetadataDate>
</rdf:Description>
<rdf:Description rdf:about=""
xmlns:dc="http://purl.org/dc/elements/1.1/">
<dc:format>application/pdf</dc:format>
<dc:title>
<rdf:Alt>
<rdf:li xml:lang="x-default">1471-2156-10-16.fm</rdf:li>
</rdf:Alt>
</dc:title>
<dc:creator>
<rdf:Seq>
<rdf:li>Ezhilan</rdf:li>
</rdf:Seq>
</dc:creator>
</rdf:Description>
<rdf:Description rdf:about=""
xmlns:xapMM="http://ns.adobe.com/xap/1.0/mm/">
<xapMM:DocumentID>uuid:d1d0f8d9-8321-4e4b-828b-d31b75daba0f</xapMM:DocumentID>
<xapMM:InstanceID>uuid:39d7db98-d873-4b33-be85-87319547e81c</xapMM:InstanceID>
</rdf:Description>
</rdf:RDF>
</x:xmpmeta>
<?xpacket end="w"?>
Insert Metadata in article
java -cp pdfbox-app-1.2.1.jar:pdfbox-app-1.2.1.jar:. InsertXMP -i article.pdf -o article_meta.pdf -x meta.xmp
Metadata in new article
java -cp pdfbox-app-1.2.1.jar:pdfbox-app-1.2.1.jar:. ExtractXMP article_meta.pdf
<x:xmpmeta xmlns:x="adobe:ns:meta/">
<rdf:RDF xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:foaf="http://xmlns.com/foaf/0.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
<rdf:Description rdf:about="">
<dc:creator rdf:resource="mailto:plindenbaum@yahoo.fr"/>
<dc:title>Hello World</dc:title>
<dc:date>2010-07-11</dc:date>
</rdf:Description>
<foaf:Person rdf:about="mailto:plindenbaum@yahoo.fr">
<foaf:name>Pierre Lindenbaum</foaf:name>
<foaf:depiction rdf:resource="http://profile.ak.facebook.com/profile5/1306/97/s501154465_2583.jpg"/>
</foaf:Person>
</rdf:RDF>
</x:xmpmeta>
javac -cp pdfbox-app-1.2.1.jar:pdfbox-app-1.2.1.jar:. ExtractXMP.java
wget -O article.pdf "http://www.biomedcentral.com/content/pdf/1471-2156-10-16.pdf"
--2010-07-11 13:15:10-- http://www.biomedcentral.com/content/pdf/1471-2156-10-16.pdf
Resolving www.biomedcentral.com... 213.219.33.18
Connecting to www.biomedcentral.com|213.219.33.18|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1295524 (1.2M) [application/pdf]
Saving to: `article.pdf'
100%[================================================================================>] 1,295,524 123K/s in 11s
2010-07-11 13:15:21 (113 KB/s) - `article.pdf' saved [1295524/1295524]
Metadata in article
java -cp pdfbox-app-1.2.1.jar:pdfbox-app-1.2.1.jar:. ExtractXMP article.pdf
<?xpacket begin="" id="W5M0MpCehiHzreSzNTczkc9d"?>
<x:xmpmeta xmlns:x="adobe:ns:meta/" x:xmptk="3.1-701">
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
<rdf:Description rdf:about=""
xmlns:pdf="http://ns.adobe.com/pdf/1.3/">
<pdf:Producer>Acrobat Distiller 7.0 (Windows)</pdf:Producer>
</rdf:Description>
<rdf:Description rdf:about=""
xmlns:xap="http://ns.adobe.com/xap/1.0/">
<xap:CreateDate>2009-05-05T19:28:38Z</xap:CreateDate>
<xap:CreatorTool>FrameMaker 7.1</xap:CreatorTool>
<xap:ModifyDate>2009-05-06T02:18:59+05:30</xap:ModifyDate>
<xap:MetadataDate>2009-05-06T02:18:59+05:30</xap:MetadataDate>
</rdf:Description>
<rdf:Description rdf:about=""
xmlns:dc="http://purl.org/dc/elements/1.1/">
<dc:format>application/pdf</dc:format>
<dc:title>
<rdf:Alt>
<rdf:li xml:lang="x-default">1471-2156-10-16.fm</rdf:li>
</rdf:Alt>
</dc:title>
<dc:creator>
<rdf:Seq>
<rdf:li>Ezhilan</rdf:li>
</rdf:Seq>
</dc:creator>
</rdf:Description>
<rdf:Description rdf:about=""
xmlns:xapMM="http://ns.adobe.com/xap/1.0/mm/">
<xapMM:DocumentID>uuid:d1d0f8d9-8321-4e4b-828b-d31b75daba0f</xapMM:DocumentID>
<xapMM:InstanceID>uuid:39d7db98-d873-4b33-be85-87319547e81c</xapMM:InstanceID>
</rdf:Description>
</rdf:RDF>
</x:xmpmeta>
<?xpacket end="w"?>
Insert Metadata in article
java -cp pdfbox-app-1.2.1.jar:pdfbox-app-1.2.1.jar:. InsertXMP -i article.pdf -o article_meta.pdf -x meta.xmp
Metadata in new article
java -cp pdfbox-app-1.2.1.jar:pdfbox-app-1.2.1.jar:. ExtractXMP article_meta.pdf
<x:xmpmeta xmlns:x="adobe:ns:meta/">
<rdf:RDF xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:foaf="http://xmlns.com/foaf/0.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
<rdf:Description rdf:about="">
<dc:creator rdf:resource="mailto:plindenbaum@yahoo.fr"/>
<dc:title>Hello World</dc:title>
<dc:date>2010-07-11</dc:date>
</rdf:Description>
<foaf:Person rdf:about="mailto:plindenbaum@yahoo.fr">
<foaf:name>Pierre Lindenbaum</foaf:name>
<foaf:depiction rdf:resource="http://profile.ak.facebook.com/profile5/1306/97/s501154465_2583.jpg"/>
</foaf:Person>
</rdf:RDF>
</x:xmpmeta>
That's it
Pierre
3 comments:
How can I run this in Eclipse? Thanks!
How can I incorporate this with eclipse? Thank you.
From this code I got metadata... but i need keyword value. how we can extract metadata.getInputStreamAsString() values
InputStream in=new FileInputStream(pdfFile);
PDFParser parser=new PDFParser(in);
parser.parse();
PDMetadata metadata = parser.getPDDocument().getDocumentCatalog().getMetadata();
if(metadata!=null)
{
System.out.println(metadata.getInputStreamAsString());
}
Post a Comment