/* Usage: java -classpath '/usr/share/java/pdfbox.jar:/usr/share/java/commons-logging.jar:.' PDFExcess file.pdf | tee PDFExcess.log To search every PDF file in your home directory: (find ~ -name '*.pdf' -print0 | xargs -0 -n 1 java -classpath '/usr/share/java/pdfbox.jar:/usr/share/java/commons-logging.jar:.' PDFExcess) | tee PDFExcess.log Then you want to search for anything interesting in the *.excess files the program writes, for example with strings *.excess Scans PDF files for excess data (uninitialized memory) in grayscale images, as left by versions of pdfTeX before r43637: https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=796490#72 https://www.tug.org/pipermail/pdftex/2017-March/009100.html When embedding a PNG image with an alpha channel, these versions of pdfTeX allocate a buffer twice as big as needed for the grayscale alpha mask, and copy the entire buffer (including the uninitialized second half) to the output stream. Requires Apache PDFBox (https://pdfbox.apache.org/) version 1.8 (not 2.0). On Debian, do apt-get install libpdfbox-java libcommons-logging-java javac -classpath /usr/share/java/pdfbox.jar:/usr/share/java/commons-logging.jar PDFExcess.java On other platforms, you may be able to install dependencies using Maven: https://pdfbox.apache.org/1.8/dependencies.html#xmp-metadata org.apache.pdfbox pdfbox 1.8.12 The output format is tab-separated columns: filename objectNumber expectedLength actualLength excessFilename The program writes an output line for every grayscale image whose actualLength is greater than its expectedLength. The expectedLength is computed as width * height * bitsPerPixel / 8. For each output line, the program also saves a file containing all the excess bytes in the stream, with the filename .excess, where is the SHA-256 digest of the excess bytes. The idea is that you search for any interesting information in the *.excess files, then use the log file to map back to an input PDF file. Images affected by the pdfTeX bug will have an actualLength that is exactly two times expectedLength. This program might find some other false positives(?) that are not related to the pdfTeX bug, where actualLength is not exactly twice expectedLength. If you want to prefilter for files produced by pdfTeX only, you can do (find ~ -name '*.pdf' -print0 | xargs -0 grep -lZ 'This is pdfTeX' | xargs -0 -n 1 java -classpath '/usr/share/java/pdfbox.jar:/usr/share/java/commons-logging.jar:.' PDFExcess) | tee PDFExcess.log */ import java.io.File; import java.io.InputStream; import java.io.IOException; import java.io.PrintStream; import java.nio.file.FileAlreadyExistsException; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.nio.file.StandardCopyOption; import java.security.MessageDigest; import java.security.DigestInputStream; import java.util.Map; import org.apache.pdfbox.cos.COSBase; import org.apache.pdfbox.cos.COSDictionary; import org.apache.pdfbox.cos.COSDocument; import org.apache.pdfbox.cos.COSName; import org.apache.pdfbox.cos.COSObject; import org.apache.pdfbox.cos.COSStream; import org.apache.pdfbox.pdmodel.PDDocument; /* This originally was based on the ExtractImages tool that comes with PDFBox, but I had to switch over to using the COS representation rather than the PD representation in order to get low-level enough access to the image XObjects. https://svn.apache.org/viewvc/pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/ExtractImages.java https://pdfbox.apache.org/1.8/architecture.html */ public class PDFExcess { public static void main(String[] args) { int exitStatus = 0; for (String filename : args) { try { process(filename); } catch (IOException e) { System.err.println(filename + ": " + e.toString()); exitStatus = 1; } } System.exit(exitStatus); } private static void process(String filename) throws IOException { System.err.println(filename); COSDocument document = PDDocument.load(new File(filename)).getDocument(); try { processDocument(document, filename); } finally { document.close(); } } private static void processDocument(COSDocument document, String filename) throws IOException { for (COSObject object : document.getObjectsByType("XObject")) { processObject(object, filename); } } private static void processObject(COSObject object, String filename) throws IOException { long objectNumber = object.getObjectNumber().longValue(); COSStream stream = (COSStream) object.getObject(); // Uncomment this to view XObject dictionaries for debugging. /* System.err.println(filename + " obj " + objectNumber + " = {"); dumpDictionary(stream, System.err); System.err.println("}"); */ // We're only interested in grayscale images. if (stream.getItem(COSName.SUBTYPE) != COSName.IMAGE) return; if (stream.getDictionaryObject(COSName.COLORSPACE, COSName.CS) != COSName.DEVICEGRAY) return; int width, height, bitsPerComponent; try { width = getInt(stream, COSName.WIDTH); height = getInt(stream, COSName.HEIGHT); bitsPerComponent = getInt(stream, COSName.BITS_PER_COMPONENT); } catch (IOException e) { throw new IOException("object " + objectNumber + ": " + e.toString()); } long expectedLength = width * height * bitsPerComponent / 8; long actualLength = streamLength(stream.getUnfilteredStream()); if (actualLength <= expectedLength) return; InputStream excessStream; excessStream = stream.getUnfilteredStream(); excessStream.skip(expectedLength); String excessDigest = hexEncode(streamDigest(excessStream)); String excessFilename = excessDigest + ".excess"; excessStream = stream.getUnfilteredStream(); excessStream.skip(expectedLength); Path tempFile = Files.createTempFile(Paths.get("."), "PDFExcess.", ".excess"); Files.copy(excessStream, tempFile, StandardCopyOption.REPLACE_EXISTING); Files.move(tempFile, Paths.get(excessFilename), StandardCopyOption.REPLACE_EXISTING, StandardCopyOption.ATOMIC_MOVE); System.out.println(String.format("%s\t%d\t%d\t%d\t%s", filename, objectNumber, expectedLength, actualLength, excessFilename)); System.out.flush(); } // Get an integer value from the dictionary and throw an IOException if // the name is not present. private static int getInt(COSDictionary dict, COSName name) throws IOException { int value = dict.getInt(name); if (value == -1) throw new IOException("bad " + name.getName()); return value; } // Consume an InputStream to count how long it is. private static long streamLength(InputStream stream) throws IOException { byte[] buf = new byte[1024]; long total = 0; for (;;) { long n = stream.read(buf); if (n == -1) break; total += n; } return total; } private static byte[] streamDigest(InputStream stream) throws IOException { MessageDigest digest; try { digest = MessageDigest.getInstance("SHA-256"); } catch (java.security.NoSuchAlgorithmException e) { System.err.println("no SHA-256"); System.exit(1); return null; } DigestInputStream digestStream = new DigestInputStream(stream, digest); // Read the rest of the stream and feed it to the digest. streamLength(digestStream); return digest.digest(); } private static String hexEncode(byte[] data) { StringBuilder s = new StringBuilder(); for (byte b : data) { s.append(String.format("%02x", b)); } return s.toString(); } // Dump the contents of a COSDictionary, for debugging purposes. private static void dumpDictionary(COSDictionary dict, PrintStream out) { for (Map.Entry entry : dict.entrySet()) { out.println(entry.getKey() + " = " + entry.getValue()); } } }