package cn.flightfeather.supervision.docx4j.simpleDemo;
|
|
import org.apache.commons.io.FileUtils;
|
import org.docx4j.Docx4J;
|
import org.docx4j.anon.Anonymize;
|
import org.docx4j.anon.AnonymizeResult;
|
import org.docx4j.openpackaging.exceptions.Docx4JException;
|
import org.docx4j.openpackaging.packages.WordprocessingMLPackage;
|
import org.docx4j.openpackaging.parts.Part;
|
|
import java.io.File;
|
import java.io.IOException;
|
import java.io.PrintStream;
|
|
|
public class AnonCorpus {
|
|
private final static String DIR_IN = System.getProperty("user.dir") + "/corpus/";
|
|
private final static String DIR_OUT = System.getProperty("user.dir") + "/OUT6/";
|
|
private final static String DIR_HANDLED = System.getProperty("user.dir") + "/corpus-handled/";
|
|
private static final String DIR_OK = "ok";
|
private static final String DIR_LEAKS = "leaks";
|
private static final String DIR_ERRORS = "errors";
|
private static final String DIR_GLYPH = "glyph-issues";
|
|
private int oks = 0;
|
private int leaks = 0;
|
private int errors = 0;
|
|
private StringBuffer sbLeaks = new StringBuffer();
|
|
public static void main(String[] args) throws Exception {
|
|
AnonCorpus corpusAnon = new AnonCorpus();
|
|
corpusAnon.createDirs();
|
corpusAnon.walk(DIR_IN);
|
|
System.out.println(corpusAnon.sbLeaks.toString());
|
|
System.out.println("leaks: " + corpusAnon.leaks);
|
System.out.println("errors: " + corpusAnon.errors);
|
System.out.println("oks: " + corpusAnon.oks);
|
}
|
|
private void createDirs() throws IOException {
|
|
// create OK, leak dirs
|
FileUtils.forceMkdir(new File(DIR_OUT+DIR_OK));
|
FileUtils.forceMkdir(new File(DIR_OUT+DIR_LEAKS));
|
FileUtils.forceMkdir(new File(DIR_OUT+DIR_ERRORS));
|
FileUtils.forceMkdir(new File(DIR_OUT+DIR_GLYPH));
|
|
}
|
|
int docNum = 1;
|
|
public void walk( String path ) throws IOException {
|
|
File root = new File( path );
|
File[] list = root.listFiles();
|
|
if (list == null) return;
|
|
for ( File f : list ) {
|
if ( f.isDirectory() ) {
|
walk( f.getAbsolutePath() );
|
//System.out.println( "Dir:" + f.getAbsoluteFile() );
|
}
|
else {
|
// System.out.println( "File:" + f.getAbsoluteFile() );
|
|
if (f.getName().endsWith("docx")
|
|| f.getName().endsWith("docm")) {
|
|
try {
|
handle(f) ;
|
|
FileUtils.moveFile(f, new File(DIR_HANDLED + f.getName()));
|
|
} catch (Exception e) {
|
|
if (e.getMessage()!=null
|
&& e.getMessage().startsWith("Ran out of patience")) {
|
|
FileUtils.copyFile(f, new File(DIR_OUT+DIR_GLYPH+"/" + f.getName()+".docx"));
|
|
} else if (e.getMessage()!=null
|
&& e.getMessage().startsWith("This file seems to be a binary doc")) {
|
|
FileUtils.copyFile(f, new File(DIR_OUT+DIR_ERRORS+"/" + f.getName()+".doc"));
|
|
// rename the original
|
FileUtils.moveFile(f, new File(f.getAbsolutePath()+".doc"));
|
|
} else {
|
errors++;
|
|
e.printStackTrace();
|
FileUtils.copyFile(f, new File(DIR_OUT+DIR_ERRORS+"/" + f.getName()));
|
|
File file = new File(DIR_OUT+DIR_ERRORS+"/" + f.getName() + "err.txt");
|
PrintStream ps = new PrintStream(file);
|
e.printStackTrace(ps);
|
ps.close();
|
}
|
}
|
|
docNum++;
|
}
|
}
|
}
|
}
|
|
|
|
private void handle(File fIn) throws Docx4JException {
|
|
System.out.println("\n\n " + docNum + " Processing " + fIn.getName() + "\n\n");
|
|
WordprocessingMLPackage pkg = null;
|
try {
|
pkg = Docx4J.load(fIn);
|
} catch (ClassCastException e) {
|
// eg dodgy docx: CustomXmlDataStoragePart cannot be cast to org.docx4j.openpackaging.parts.CustomXmlDataStoragePropertiesPart
|
throw new Docx4JException(e.getMessage(), e);
|
}
|
|
Anonymize anon = new Anonymize(pkg);
|
AnonymizeResult result = anon.go();
|
|
|
|
String lang = "default";
|
if (result.hasHiragana || result.hasKatakana) {
|
lang="Japanese";
|
} else if (result.hasArabic) {
|
lang="Arabic";
|
} else if (result.hasHebrew) {
|
lang="Hebrew";
|
} else if (result.hasCyrillic) {
|
lang="Cyrillic";
|
} else if (result.hasGreek) {
|
lang="Greek";
|
} else if (result.hasCJK) {
|
lang = "CJK";
|
}
|
|
|
if (result.isOK()) {
|
|
oks++;
|
|
System.out.println("document successfully anonymised.");
|
|
File dir = new File(DIR_OUT+DIR_OK+"/"+lang);
|
dir.mkdirs();
|
|
Docx4J.save(pkg, new File(DIR_OUT+DIR_OK+"/"+lang + "/"+ fIn.getName()));
|
|
} else {
|
|
leaks++;
|
|
// Report
|
reportLeak("\n\n REPORT for " + fIn.getName() + "\n\n");
|
|
File dir = new File(DIR_OUT+DIR_LEAKS+"/"+lang);
|
dir.mkdirs();
|
|
String outputfilepath = DIR_OUT+DIR_LEAKS+"/"+lang + "/"+ fIn.getName();
|
|
Docx4J.save(pkg, new File(outputfilepath));
|
|
reportLeak("document partially anonymised; please check " + outputfilepath);
|
|
if (result.getUnsafeParts().size()>0) {
|
reportLeak("The following parts may leak info:");
|
for(Part p : result.getUnsafeParts()) {
|
reportLeak(p.getPartName().getName() + ", of type " + p.getClass().getName() );
|
}
|
}
|
|
// unsafe objects
|
reportLeak(result.reportUnsafeObjects());
|
|
System.out.println("\n\n .. end REPORT for " + fIn.getName() + "\n\n");
|
|
}
|
|
if (result.getFieldsPresent().size()>0) {
|
|
for (String s : result.getFieldsPresent()) {
|
System.out.println(s);
|
}
|
}
|
|
|
}
|
|
private void reportLeak(String message) {
|
|
System.out.println(message);
|
sbLeaks.append(message + "\n");
|
|
}
|
|
|
}
|