/*
Usage:
java -classpath '/usr/share/java/pdfbox.jar:/usr/share/java/commons-logging.jar:.' PDFExcess file.pdf | tee PDFExcess.log
To search every PDF file in your home directory:
(find ~ -name '*.pdf' -print0 | xargs -0 -n 1 java -classpath '/usr/share/java/pdfbox.jar:/usr/share/java/commons-logging.jar:.' PDFExcess) | tee PDFExcess.log
Then you want to search for anything interesting in the *.excess files
the program writes, for example with
strings *.excess
Scans PDF files for excess data (uninitialized memory) in grayscale
images, as left by versions of pdfTeX before r43637:
https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=796490#72
https://www.tug.org/pipermail/pdftex/2017-March/009100.html
When embedding a PNG image with an alpha channel, these versions of
pdfTeX allocate a buffer twice as big as needed for the grayscale alpha
mask, and copy the entire buffer (including the uninitialized second
half) to the output stream.
Requires Apache PDFBox (https://pdfbox.apache.org/) version 1.8 (not
2.0). On Debian, do
apt-get install libpdfbox-java libcommons-logging-java
javac -classpath /usr/share/java/pdfbox.jar:/usr/share/java/commons-logging.jar PDFExcess.java
On other platforms, you may be able to install dependencies using Maven:
https://pdfbox.apache.org/1.8/dependencies.html#xmp-metadata
<dependency>
  <groupId>org.apache.pdfbox</groupId>
  <artifactId>pdfbox</artifactId>
  <version>1.8.12</version>
</dependency>
The output format is tab-separated columns:
filename objectNumber expectedLength actualLength excessFilename
The program writes an output line for every grayscale image whose
actualLength is greater than its expectedLength. The expectedLength is
nominally width * height * bitsPerComponent / 8. For each output line, the
program also saves a file containing all the excess bytes in the stream,
with the filename <digest>.excess, where <digest> is the SHA-256 digest
of the excess bytes. The idea is that you search for any interesting
information in the *.excess files, then use the log file to map back to
an input PDF file.
Images affected by the pdfTeX bug will have an actualLength that is
exactly two times expectedLength. This program might find some other
false positives(?) that are not related to the pdfTeX bug, where
actualLength is not exactly twice expectedLength. If you want to
prefilter for files produced by pdfTeX only, you can do
(find ~ -name '*.pdf' -print0 | xargs -0 grep -lZ 'This is pdfTeX' | xargs -0 -n 1 java -classpath '/usr/share/java/pdfbox.jar:/usr/share/java/commons-logging.jar:.' PDFExcess) | tee PDFExcess.log
*/
import java.io.File;
import java.io.InputStream;
import java.io.IOException;
import java.io.PrintStream;
import java.nio.file.FileAlreadyExistsException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
import java.security.MessageDigest;
import java.security.DigestInputStream;
import java.util.Map;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSObject;
import org.apache.pdfbox.cos.COSStream;
import org.apache.pdfbox.pdmodel.PDDocument;
/*
This originally was based on the ExtractImages tool that comes with
PDFBox, but I had to switch over to using the COS representation rather
than the PD representation in order to get low-level enough access to
the image XObjects.
https://svn.apache.org/viewvc/pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/ExtractImages.java
https://pdfbox.apache.org/1.8/architecture.html
*/
public class PDFExcess
{
public static void main(String[] args)
{
int exitStatus = 0;
for (String filename : args) {
try {
process(filename);
} catch (IOException e) {
System.err.println(filename + ": " + e.toString());
exitStatus = 1;
}
}
System.exit(exitStatus);
}
private static void process(String filename) throws IOException
{
System.err.println(filename);
COSDocument document = PDDocument.load(new File(filename)).getDocument();
try {
processDocument(document, filename);
} finally {
document.close();
}
}
private static void processDocument(COSDocument document, String filename) throws IOException
{
for (COSObject object : document.getObjectsByType("XObject")) {
processObject(object, filename);
}
}
private static void processObject(COSObject object, String filename) throws IOException
{
long objectNumber = object.getObjectNumber().longValue();
COSStream stream = (COSStream) object.getObject();
// Uncomment this to view XObject dictionaries for debugging.
/*
System.err.println(filename + " obj " + objectNumber + " = {");
dumpDictionary(stream, System.err);
System.err.println("}");
*/
// We're only interested in grayscale images.
if (stream.getItem(COSName.SUBTYPE) != COSName.IMAGE)
return;
if (stream.getDictionaryObject(COSName.COLORSPACE, COSName.CS) != COSName.DEVICEGRAY)
return;
int width, height, bitsPerComponent;
try {
width = getInt(stream, COSName.WIDTH);
height = getInt(stream, COSName.HEIGHT);
bitsPerComponent = getInt(stream, COSName.BITS_PER_COMPONENT);
} catch (IOException e) {
throw new IOException("object " + objectNumber + ": " + e.toString());
}
long expectedLength = width * height * bitsPerComponent / 8;
long actualLength = streamLength(stream.getUnfilteredStream());
if (actualLength <= expectedLength)
return;
InputStream excessStream;
excessStream = stream.getUnfilteredStream();
excessStream.skip(expectedLength);
String excessDigest = hexEncode(streamDigest(excessStream));
String excessFilename = excessDigest + ".excess";
excessStream = stream.getUnfilteredStream();
excessStream.skip(expectedLength);
Path tempFile = Files.createTempFile(Paths.get("."), "PDFExcess.", ".excess");
Files.copy(excessStream, tempFile, StandardCopyOption.REPLACE_EXISTING);
Files.move(tempFile, Paths.get(excessFilename),
StandardCopyOption.REPLACE_EXISTING, StandardCopyOption.ATOMIC_MOVE);
System.out.println(String.format("%s\t%d\t%d\t%d\t%s",
filename, objectNumber, expectedLength, actualLength, excessFilename));
System.out.flush();
}
// Get an integer value from the dictionary and throw an IOException if
// the name is not present.
private static int getInt(COSDictionary dict, COSName name) throws IOException
{
int value = dict.getInt(name);
if (value == -1)
throw new IOException("bad " + name.getName());
return value;
}
// Consume an InputStream to count how long it is.
private static long streamLength(InputStream stream) throws IOException {
byte[] buf = new byte[1024];
long total = 0;
for (;;) {
long n = stream.read(buf);
if (n == -1)
break;
total += n;
}
return total;
}
private static byte[] streamDigest(InputStream stream) throws IOException {
MessageDigest digest;
try {
digest = MessageDigest.getInstance("SHA-256");
} catch (java.security.NoSuchAlgorithmException e) {
System.err.println("no SHA-256");
System.exit(1);
return null;
}
DigestInputStream digestStream = new DigestInputStream(stream, digest);
// Read the rest of the stream and feed it to the digest.
streamLength(digestStream);
return digest.digest();
}
private static String hexEncode(byte[] data) {
StringBuilder s = new StringBuilder();
for (byte b : data) {
s.append(String.format("%02x", b));
}
return s.toString();
}
// Dump the contents of a COSDictionary, for debugging purposes.
private static void dumpDictionary(COSDictionary dict, PrintStream out)
{
for (Map.Entry entry : dict.entrySet()) {
out.println(entry.getKey() + " = " + entry.getValue());
}
}
}