Skip to content

Commit ee35e30

Browse files
committed
updated check for duplicate mcid in non-equal marked contents
1 parent f00e7ed commit ee35e30

File tree

1 file changed

+108
-93
lines changed

1 file changed

+108
-93
lines changed
Lines changed: 108 additions & 93 deletions
Original file line numberDiff line numberDiff line change
@@ -1,113 +1,128 @@
11
package net.pdfix;
22

3+
import java.nio.channels.NonReadableChannelException;
34
import java.util.ArrayList;
45
import java.util.List;
56

67
import net.pdfix.pdfixlib.*;
78

89
public class FindDuplicateMcid {
9-
// Helper function to get a readable object type
10-
private static String getNiceObjType(PdfPageObjectType type) {
11-
switch (type) {
12-
case kPdsPageText:
13-
return "text";
14-
case kPdsPagePath:
15-
return "path";
16-
case kPdsPageImage:
17-
return "image";
18-
case kPdsPageShading:
19-
return "shading";
20-
case kPdsPageForm:
21-
return "form";
22-
default:
23-
return "unknown";
10+
// Helper function to get a readable object type
11+
private static String getNiceObjType(PdfPageObjectType type) {
12+
switch (type) {
13+
case kPdsPageText:
14+
return "text";
15+
case kPdsPagePath:
16+
return "path";
17+
case kPdsPageImage:
18+
return "image";
19+
case kPdsPageShading:
20+
return "shading";
21+
case kPdsPageForm:
22+
return "form";
23+
default:
24+
return "unknown";
25+
}
2426
}
25-
}
26-
27-
// Helper function to get object information
28-
private static String getObjBBox(PdsPageObject obj) {
29-
StringBuilder info = new StringBuilder();
30-
PdfRect bbox = obj.GetBBox();
31-
info.append(String.format("[%.2f, %.2f, %.2f, %.2f]", bbox.left, bbox.bottom, bbox.right, bbox.top));
32-
return info.toString();
33-
}
34-
35-
private static String getObjContent(PdsPageObject obj) {
36-
StringBuilder info = new StringBuilder();
37-
if (obj.GetObjectType() == PdfPageObjectType.kPdsPageText) {
38-
PdsText textObj = (PdsText) obj;
39-
info.append(textObj.GetText());
27+
28+
// Helper function to get object information
29+
private static String getObjBBox(PdsPageObject obj) {
30+
StringBuilder info = new StringBuilder();
31+
PdfRect bbox = obj.GetBBox();
32+
info.append(String.format("[%.2f, %.2f, %.2f, %.2f]", bbox.left, bbox.bottom, bbox.right, bbox.top));
33+
return info.toString();
4034
}
41-
return info.toString();
42-
}
4335

44-
// Check for duplicate MCIDs in a PDF file. Return the number of duplicate mcids
45-
// found
46-
public static int checkDuplicateMcid(String path) throws Exception {
47-
Pdfix pdfix = new Pdfix();
36+
private static String getObjContent(PdsPageObject obj) {
37+
StringBuilder info = new StringBuilder();
38+
if (obj.GetObjectType() == PdfPageObjectType.kPdsPageText) {
39+
PdsText textObj = (PdsText) obj;
40+
info.append(textObj.GetText());
41+
}
42+
return info.toString();
43+
}
4844

49-
PdfDoc doc = pdfix.OpenDoc(path, "");
50-
if (doc == null) {
51-
throw new RuntimeException(pdfix.GetError());
45+
public static void reportMcid(int pageNum, PdsPageObject obj, int index, int mcid) {
46+
System.out.println("Duplicate MCID Found:");
47+
String objType = getNiceObjType(obj.GetObjectType());
48+
String objBBox = getObjBBox(obj);
49+
String objContent = getObjContent(obj);
50+
51+
StringBuilder info = new StringBuilder();
52+
info.append(String.format(" %-10s: %d\n", "MCID", mcid));
53+
info.append(String.format(" %-10s: %d\n", "Page", pageNum + 1));
54+
info.append(String.format(" %-10s: %d\n", "Index", index));
55+
info.append(String.format(" %-10s: %s\n", "Type", objType));
56+
info.append(String.format(" %-10s: %s\n", "BBox", objBBox));
57+
if (!objContent.isEmpty()) {
58+
String truncatedContent = objContent.length() > 80 ? objContent.substring(0, 80) + "…"
59+
: objContent;
60+
info.append(String.format(" %-10s: %s\n", "Content", truncatedContent));
61+
}
62+
63+
System.out.println(info.toString());
5264
}
5365

54-
int found = 0;
55-
56-
for (int i = 0; i < doc.GetNumPages(); i++) {
57-
PdfPage page = doc.AcquirePage(i);
58-
if (page == null) {
59-
System.out.println("Warning: Unable to load page " + (i + 1));
60-
continue;
61-
}
62-
63-
PdsContent content = page.GetContent();
64-
if (content == null) {
65-
page.Release();
66-
continue;
67-
}
68-
69-
int lastMcid = -1;
70-
List<Integer> mcids = new ArrayList<Integer>();
71-
for (int j = 0; j < content.GetNumObjects(); j++) {
72-
PdsPageObject obj = content.GetObject(j);
73-
int mcid = obj.GetMcid();
74-
if (mcid != lastMcid) {
75-
lastMcid = mcid;
76-
if (mcid == -1) {
77-
continue;
78-
}
79-
80-
if (mcids.contains(mcid)) {
81-
System.out.println("Duplicate MCID Found:");
82-
String objType = getNiceObjType(obj.GetObjectType());
83-
String objBBox = getObjBBox(obj);
84-
String objContent = getObjContent(obj);
85-
86-
StringBuilder info = new StringBuilder();
87-
info.append(String.format(" %-10s: %d\n", "MCID", mcid));
88-
info.append(String.format(" %-10s: %d\n", "Page", i + 1));
89-
info.append(String.format(" %-10s: %d\n", "Index", j));
90-
info.append(String.format(" %-10s: %s\n", "Type", objType));
91-
info.append(String.format(" %-10s: %s\n", "BBox", objBBox));
92-
if (!objContent.isEmpty()) {
93-
String truncatedContent = objContent.length() > 80 ? objContent.substring(0, 80) + "…"
94-
: objContent;
95-
info.append(String.format(" %-10s: %s\n", "Content", truncatedContent));
96-
}
66+
// Check for duplicate MCIDs in a PDF file. Return the number of duplicate mcids
67+
// found
68+
public static int checkDuplicateMcid(String path) throws Exception {
69+
Pdfix pdfix = new Pdfix();
9770

98-
System.out.println(info.toString());
99-
found++;
100-
}
101-
mcids.add(mcid);
71+
PdfDoc doc = pdfix.OpenDoc(path, "");
72+
if (doc == null) {
73+
throw new RuntimeException(pdfix.GetError());
10274
}
103-
}
10475

105-
page.Release();
106-
}
76+
int found = 0;
10777

108-
doc.Close();
109-
pdfix.Destroy();
78+
for (int i = 0; i < doc.GetNumPages(); i++) {
79+
PdfPage page = doc.AcquirePage(i);
80+
if (page == null) {
81+
System.out.println("Warning: Unable to load page " + (i + 1));
82+
continue;
83+
}
11084

111-
return found;
112-
}
85+
PdsContent content = page.GetContent();
86+
if (content == null) {
87+
page.Release();
88+
continue;
89+
}
90+
91+
int lastMcid = -1;
92+
List<Integer> mcids = new ArrayList<Integer>();
93+
PdsPageObject lastObj = null;
94+
for (int j = 0; j < content.GetNumObjects(); j++) {
95+
PdsPageObject obj = content.GetObject(j);
96+
int mcid = obj.GetMcid();
97+
if ((mcid != -1) && (mcid == lastMcid)) {
98+
// content marks must be equal for equal mcid
99+
if (lastObj != null) {
100+
if (obj.GetNumEqualTags(lastObj) != obj.GetContentMark().GetNumTags()) {
101+
reportMcid(i, obj, j, mcid);
102+
found++;
103+
}
104+
}
105+
} else if (mcid != lastMcid) {
106+
lastMcid = mcid;
107+
if (mcid == -1) {
108+
continue;
109+
}
110+
111+
if (mcids.contains(mcid)) {
112+
reportMcid(i, obj, j, mcid);
113+
found++;
114+
}
115+
mcids.add(mcid);
116+
}
117+
lastObj = obj;
118+
}
119+
120+
page.Release();
121+
}
122+
123+
doc.Close();
124+
pdfix.Destroy();
125+
126+
return found;
127+
}
113128
}

0 commit comments

Comments
 (0)