|
1 | 1 | package net.pdfix; |
2 | 2 |
|
| 3 | +import java.nio.channels.NonReadableChannelException; |
3 | 4 | import java.util.ArrayList; |
4 | 5 | import java.util.List; |
5 | 6 |
|
6 | 7 | import net.pdfix.pdfixlib.*; |
7 | 8 |
|
8 | 9 | public class FindDuplicateMcid { |
9 | | - // Helper function to get a readable object type |
10 | | - private static String getNiceObjType(PdfPageObjectType type) { |
11 | | - switch (type) { |
12 | | - case kPdsPageText: |
13 | | - return "text"; |
14 | | - case kPdsPagePath: |
15 | | - return "path"; |
16 | | - case kPdsPageImage: |
17 | | - return "image"; |
18 | | - case kPdsPageShading: |
19 | | - return "shading"; |
20 | | - case kPdsPageForm: |
21 | | - return "form"; |
22 | | - default: |
23 | | - return "unknown"; |
| 10 | + // Helper function to get a readable object type |
| 11 | + private static String getNiceObjType(PdfPageObjectType type) { |
| 12 | + switch (type) { |
| 13 | + case kPdsPageText: |
| 14 | + return "text"; |
| 15 | + case kPdsPagePath: |
| 16 | + return "path"; |
| 17 | + case kPdsPageImage: |
| 18 | + return "image"; |
| 19 | + case kPdsPageShading: |
| 20 | + return "shading"; |
| 21 | + case kPdsPageForm: |
| 22 | + return "form"; |
| 23 | + default: |
| 24 | + return "unknown"; |
| 25 | + } |
24 | 26 | } |
25 | | - } |
26 | | - |
27 | | - // Helper function to get object information |
28 | | - private static String getObjBBox(PdsPageObject obj) { |
29 | | - StringBuilder info = new StringBuilder(); |
30 | | - PdfRect bbox = obj.GetBBox(); |
31 | | - info.append(String.format("[%.2f, %.2f, %.2f, %.2f]", bbox.left, bbox.bottom, bbox.right, bbox.top)); |
32 | | - return info.toString(); |
33 | | - } |
34 | | - |
35 | | - private static String getObjContent(PdsPageObject obj) { |
36 | | - StringBuilder info = new StringBuilder(); |
37 | | - if (obj.GetObjectType() == PdfPageObjectType.kPdsPageText) { |
38 | | - PdsText textObj = (PdsText) obj; |
39 | | - info.append(textObj.GetText()); |
| 27 | + |
| 28 | + // Helper function to get object information |
| 29 | + private static String getObjBBox(PdsPageObject obj) { |
| 30 | + StringBuilder info = new StringBuilder(); |
| 31 | + PdfRect bbox = obj.GetBBox(); |
| 32 | + info.append(String.format("[%.2f, %.2f, %.2f, %.2f]", bbox.left, bbox.bottom, bbox.right, bbox.top)); |
| 33 | + return info.toString(); |
40 | 34 | } |
41 | | - return info.toString(); |
42 | | - } |
43 | 35 |
|
44 | | - // Check for duplicate MCIDs in a PDF file. Return the number of dulicate mcids |
45 | | - // found |
46 | | - public static int checkDuplicateMcid(String path) throws Exception { |
47 | | - Pdfix pdfix = new Pdfix(); |
| 36 | + private static String getObjContent(PdsPageObject obj) { |
| 37 | + StringBuilder info = new StringBuilder(); |
| 38 | + if (obj.GetObjectType() == PdfPageObjectType.kPdsPageText) { |
| 39 | + PdsText textObj = (PdsText) obj; |
| 40 | + info.append(textObj.GetText()); |
| 41 | + } |
| 42 | + return info.toString(); |
| 43 | + } |
48 | 44 |
|
49 | | - PdfDoc doc = pdfix.OpenDoc(path, ""); |
50 | | - if (doc == null) { |
51 | | - throw new RuntimeException(pdfix.GetError()); |
| 45 | + public static void reportMcid(int pageNum, PdsPageObject obj, int index, int mcid) { |
| 46 | + System.out.println("Duplicate MCID Found:"); |
| 47 | + String objType = getNiceObjType(obj.GetObjectType()); |
| 48 | + String objBBox = getObjBBox(obj); |
| 49 | + String objContent = getObjContent(obj); |
| 50 | + |
| 51 | + StringBuilder info = new StringBuilder(); |
| 52 | + info.append(String.format(" %-10s: %d\n", "MCID", mcid)); |
| 53 | + info.append(String.format(" %-10s: %d\n", "Page", pageNum + 1)); |
| 54 | + info.append(String.format(" %-10s: %d\n", "Index", index)); |
| 55 | + info.append(String.format(" %-10s: %s\n", "Type", objType)); |
| 56 | + info.append(String.format(" %-10s: %s\n", "BBox", objBBox)); |
| 57 | + if (!objContent.isEmpty()) { |
| 58 | + String truncatedContent = objContent.length() > 80 ? objContent.substring(0, 80) + "…" |
| 59 | + : objContent; |
| 60 | + info.append(String.format(" %-10s: %s\n", "Content", truncatedContent)); |
| 61 | + } |
| 62 | + |
| 63 | + System.out.println(info.toString()); |
52 | 64 | } |
53 | 65 |
|
54 | | - int found = 0; |
55 | | - |
56 | | - for (int i = 0; i < doc.GetNumPages(); i++) { |
57 | | - PdfPage page = doc.AcquirePage(i); |
58 | | - if (page == null) { |
59 | | - System.out.println("Warning: Unable to load page " + (i + 1)); |
60 | | - continue; |
61 | | - } |
62 | | - |
63 | | - PdsContent content = page.GetContent(); |
64 | | - if (content == null) { |
65 | | - page.Release(); |
66 | | - continue; |
67 | | - } |
68 | | - |
69 | | - int lastMcid = -1; |
70 | | - List<Integer> mcids = new ArrayList<Integer>(); |
71 | | - for (int j = 0; j < content.GetNumObjects(); j++) { |
72 | | - PdsPageObject obj = content.GetObject(j); |
73 | | - int mcid = obj.GetMcid(); |
74 | | - if (mcid != lastMcid) { |
75 | | - lastMcid = mcid; |
76 | | - if (mcid == -1) { |
77 | | - continue; |
78 | | - } |
79 | | - |
80 | | - if (mcids.contains(mcid)) { |
81 | | - System.out.println("Duplicate MCID Found:"); |
82 | | - String objType = getNiceObjType(obj.GetObjectType()); |
83 | | - String objBBox = getObjBBox(obj); |
84 | | - String objContent = getObjContent(obj); |
85 | | - |
86 | | - StringBuilder info = new StringBuilder(); |
87 | | - info.append(String.format(" %-10s: %d\n", "MCID", mcid)); |
88 | | - info.append(String.format(" %-10s: %d\n", "Page", i + 1)); |
89 | | - info.append(String.format(" %-10s: %d\n", "Index", j)); |
90 | | - info.append(String.format(" %-10s: %s\n", "Type", objType)); |
91 | | - info.append(String.format(" %-10s: %s\n", "BBox", objBBox)); |
92 | | - if (!objContent.isEmpty()) { |
93 | | - String truncatedContent = objContent.length() > 80 ? objContent.substring(0, 80) + "…" |
94 | | - : objContent; |
95 | | - info.append(String.format(" %-10s: %s\n", "Content", truncatedContent)); |
96 | | - } |
| 66 | + // Check for duplicate MCIDs in a PDF file. Return the number of dulicate mcids |
| 67 | + // found |
| 68 | + public static int checkDuplicateMcid(String path) throws Exception { |
| 69 | + Pdfix pdfix = new Pdfix(); |
97 | 70 |
|
98 | | - System.out.println(info.toString()); |
99 | | - found++; |
100 | | - } |
101 | | - mcids.add(mcid); |
| 71 | + PdfDoc doc = pdfix.OpenDoc(path, ""); |
| 72 | + if (doc == null) { |
| 73 | + throw new RuntimeException(pdfix.GetError()); |
102 | 74 | } |
103 | | - } |
104 | 75 |
|
105 | | - page.Release(); |
106 | | - } |
| 76 | + int found = 0; |
107 | 77 |
|
108 | | - doc.Close(); |
109 | | - pdfix.Destroy(); |
| 78 | + for (int i = 0; i < doc.GetNumPages(); i++) { |
| 79 | + PdfPage page = doc.AcquirePage(i); |
| 80 | + if (page == null) { |
| 81 | + System.out.println("Warning: Unable to load page " + (i + 1)); |
| 82 | + continue; |
| 83 | + } |
110 | 84 |
|
111 | | - return found; |
112 | | - } |
| 85 | + PdsContent content = page.GetContent(); |
| 86 | + if (content == null) { |
| 87 | + page.Release(); |
| 88 | + continue; |
| 89 | + } |
| 90 | + |
| 91 | + int lastMcid = -1; |
| 92 | + List<Integer> mcids = new ArrayList<Integer>(); |
| 93 | + PdsPageObject lastObj = null; |
| 94 | + for (int j = 0; j < content.GetNumObjects(); j++) { |
| 95 | + PdsPageObject obj = content.GetObject(j); |
| 96 | + int mcid = obj.GetMcid(); |
| 97 | + if ((mcid != -1) && (mcid == lastMcid)) { |
| 98 | + // content marks must be equal for equal mcid |
| 99 | + if (lastObj != null) { |
| 100 | + if (obj.GetNumEqualTags(lastObj) != obj.GetContentMark().GetNumTags()) { |
| 101 | + reportMcid(i, obj, j, mcid); |
| 102 | + found++; |
| 103 | + } |
| 104 | + } |
| 105 | + } else if (mcid != lastMcid) { |
| 106 | + lastMcid = mcid; |
| 107 | + if (mcid == -1) { |
| 108 | + continue; |
| 109 | + } |
| 110 | + |
| 111 | + if (mcids.contains(mcid)) { |
| 112 | + reportMcid(i, obj, j, mcid); |
| 113 | + found++; |
| 114 | + } |
| 115 | + mcids.add(mcid); |
| 116 | + } |
| 117 | + lastObj = obj; |
| 118 | + } |
| 119 | + |
| 120 | + page.Release(); |
| 121 | + } |
| 122 | + |
| 123 | + doc.Close(); |
| 124 | + pdfix.Destroy(); |
| 125 | + |
| 126 | + return found; |
| 127 | + } |
113 | 128 | } |
0 commit comments