From 5214820b5a740794ac857e995b586523a01e9f7e Mon Sep 17 00:00:00 2001 From: lindenb Date: Tue, 4 Feb 2020 17:08:27 +0100 Subject: [PATCH] ssl anomaly in pubmed dump --- docs/Pubmed404.md | 43 ++++++ docs/PubmedDump.md | 8 ++ .../github/lindenb/jvarkit/io/IOUtils.java | 4 +- .../jvarkit/tools/pubmed/Pubmed404.java | 20 +-- .../jvarkit/tools/pubmed/PubmedDump.java | 134 ++++++++---------- .../jvarkit/util/ncbi/NcbiConstants.java | 2 + 6 files changed, 128 insertions(+), 83 deletions(-) diff --git a/docs/Pubmed404.md b/docs/Pubmed404.md index 419de33b3..ce9b5173e 100644 --- a/docs/Pubmed404.md +++ b/docs/Pubmed404.md @@ -51,6 +51,11 @@ $ ./gradlew pubmed404 The java jar file will be installed in the `dist` directory. + +## Creation Date + +20181210 + ## Source code [https://github.com/lindenb/jvarkit/tree/master/src/main/java/com/github/lindenb/jvarkit/tools/pubmed/Pubmed404.java](https://github.com/lindenb/jvarkit/tree/master/src/main/java/com/github/lindenb/jvarkit/tools/pubmed/Pubmed404.java) @@ -76,3 +81,41 @@ The current reference is: > Lindenbaum, Pierre (2015): JVarkit: java-based utilities for Bioinformatics. figshare. > [http://dx.doi.org/10.6084/m9.figshare.1425030](http://dx.doi.org/10.6084/m9.figshare.1425030) + +## Example + +``` +$ java -jar dist/pubmeddump.jar 'bioinformatics 2001' 2> /dev/null |\ + java -jar dist/pubmed404.jar 2> /dev/null + +#PMID TITLE YEAR URL Status +29520589 Expression of Colocasia esculenta tuber agglutinin in Indian mustard provides resistance against Lipaphis erysimi and the expressed protein is non-allergenic.2018 http://www.fao.org/docrep/007/y0820e/y0820e00.HTM 200 +29520589 Expression of Colocasia esculenta tuber agglutinin in Indian mustard provides resistance against Lipaphis erysimi and the expressed protein is non-allergenic.2018 http://www.icmr.nic.in/guide/Guidelines%20for%20Genetically%20Engineered%20Plants.pdf -1 +28482857 Horizontal gene transfer is not a hallmark of the human genome. 2017 https://genomebiology.biomedcentral.com/articles/10.1186/s13059-015-0607-3 200 +27899642 The UCSC Genome Browser database: 2017 update. 2017 http://genome.ucsc.edu/ 200 +27797935 High hospital research participation and improved colorectal cancer survival outcomes: a population-based study. 2017 http://www.bmj.com/company/products-services/rights-and-licensing/ 403 +25505092 NMRFAM-SPARKY: enhanced software for biomolecular NMR spectroscopy. 2015 http://pine.nmrfam.wisc.edu/download_packages.html 200 +25505092 NMRFAM-SPARKY: enhanced software for biomolecular NMR spectroscopy. 2015 http://www.nmrfam.wisc.edu/nmrfam-sparky-distribution.htm 200 +25428374 The UCSC Genome Browser database: 2015 update. 2015 http://genome.ucsc.edu 200 +26356339 A Simple but Powerful Heuristic Method for Accelerating k-Means Clustering of Large-Scale Data in Life Science. null http://mlab.cb.k.u-tokyo.ac.jp/~ichikawa/boostKCP/ 200 +24794704 Usefulness of the Shock Index as a secondary triage tool. 2015 http://group.bmj.com/group/rights-licensing/permissions 403 +24225322 Progenetix: 12 years of oncogenomic data curation. 2014 http://www.progenetix.org 200 +24137000 Updates of the HbVar database of human hemoglobin variants and thalassemia mutations. 2014 http://globin.bx.psu.edu/hbvar 200 +24137000 Updates of the HbVar database of human hemoglobin variants and thalassemia mutations. 2014 http://www.findbase.org 200 +24137000 Updates of the HbVar database of human hemoglobin variants and thalassemia mutations. 2014 http://www.lovd.nl 200 +23564938 DAMBE5: a comprehensive software package for data analysis in molecular biology and evolution. 2013 http://dambe.bio.uottawa.ca 200 +22689647 SIFT web server: predicting effects of amino acid substitutions on proteins. 2012 http://sift-dna.org 200 +22600740 Cyber-T web server: differential analysis of high-throughput data. 2012 http://cybert.ics.uci.edu/ 200 +21742331 An open source lower limb model: Hip joint validation. 2011 https://simtk.org/home/low_limb_london 200 +21593132 Java bioinformatics analysis web services for multiple sequence alignment--JABAWS:MSA. 2011 http://www.compbio.dundee.ac.uk/jabaws 200 +20228129 DensiTree: making sense of sets of phylogenetic trees. 2010 http://compevol.auckland.ac.nz/software/DensiTree/ 404 +19380317 CELLULAR OPEN RESOURCE (COR): current status and future directions. 2009 http://www.cellml.org/specifications/ 200 +18948284 OperonDB: a comprehensive database of predicted operons in microbial genomes. 2009 http://operondb.cbcb.umd.edu 200 +18368364 Simulator for neural networks and action potentials. 2007 http://snnap.uth.tmc.edu -1 +18367465 An improved general amino acid replacement matrix. 2008 http://atgc.lirmm.fr/LG 404 +18238804 Interoperability with Moby 1.0--it's better than sharing your toothbrush! 2008 http://www.biomoby.org/ 200 +18174178 PRALINETM: a strategy for improved multiple alignment of transmembrane proteins. 2008 http://www.ibi.vu.nl/programs/pralinewww 200 +17221864 HbVar database of human hemoglobin variants and thalassemia mutations: 2007 update. 2007 http://globin.bx.psu.edu/hbvar 200 +17221864 HbVar database of human hemoglobin variants and thalassemia mutations: 2007 update. 2007 http://www.goldenhelix.org/xprbase 403 +(...) +``` diff --git a/docs/PubmedDump.md b/docs/PubmedDump.md index 6b6634aa3..fc1a43f46 100644 --- a/docs/PubmedDump.md +++ b/docs/PubmedDump.md @@ -23,6 +23,9 @@ Usage: pubmeddump [options] Files java property file ${HOME}/.ncbi.properties and key api_key -o, --output Output file. Optional . Default: stdout + -r, --retmax + value for 'retmax' parameter for Eutils. + Default: 10000 -skip, --skip [20180302] Optional set of elements names to be ignored in the output. Spaces or comma separated. .eg: 'AuthorList PubmedData ' @@ -64,6 +67,11 @@ $ ./gradlew pubmeddump The java jar file will be installed in the `dist` directory. + +## Creation Date + +20140805 + ## Source code [https://github.com/lindenb/jvarkit/tree/master/src/main/java/com/github/lindenb/jvarkit/tools/pubmed/PubmedDump.java](https://github.com/lindenb/jvarkit/tree/master/src/main/java/com/github/lindenb/jvarkit/tools/pubmed/PubmedDump.java) diff --git a/src/main/java/com/github/lindenb/jvarkit/io/IOUtils.java b/src/main/java/com/github/lindenb/jvarkit/io/IOUtils.java index a66bbf702..28186fb67 100644 --- a/src/main/java/com/github/lindenb/jvarkit/io/IOUtils.java +++ b/src/main/java/com/github/lindenb/jvarkit/io/IOUtils.java @@ -259,8 +259,10 @@ public static byte[] gzipString(final String s) { } } - public static boolean isRemoteURI(String uri) + public static boolean isRemoteURI(final String uri) { + if(uri==null) return false; + if(!IOUtil.isUrl(uri)) return false; return uri.startsWith("http://") || uri.startsWith("https://") || uri.startsWith("ftp://") diff --git a/src/main/java/com/github/lindenb/jvarkit/tools/pubmed/Pubmed404.java b/src/main/java/com/github/lindenb/jvarkit/tools/pubmed/Pubmed404.java index 58c5a062e..6318dcfba 100644 --- a/src/main/java/com/github/lindenb/jvarkit/tools/pubmed/Pubmed404.java +++ b/src/main/java/com/github/lindenb/jvarkit/tools/pubmed/Pubmed404.java @@ -26,10 +26,10 @@ of this software and associated documentation files (the "Software"), to deal package com.github.lindenb.jvarkit.tools.pubmed; import java.io.ByteArrayInputStream; -import java.io.File; import java.io.IOException; import java.io.InputStream; import java.io.PrintWriter; +import java.nio.file.Path; import java.util.ArrayList; import java.util.List; import java.util.regex.Pattern; @@ -62,6 +62,7 @@ of this software and associated documentation files (the "Software"), to deal import htsjdk.samtools.util.IOUtil; /** +BEGIN_DOC ## Example @@ -100,16 +101,18 @@ of this software and associated documentation files (the "Software"), to deal 17221864 HbVar database of human hemoglobin variants and thalassemia mutations: 2007 update. 2007 http://www.goldenhelix.org/xprbase 403 (...) ``` - +END_DOC */ @Program(name="pubmed404", description="Test if URL in the pubmed abstracts are reacheable.", -keywords={"pubmed","url"} +keywords={"pubmed","url"}, +creationDate="20181210", +modificationDate="20200204" ) public class Pubmed404 extends Launcher{ private static final Logger LOG = Logger.build(Pubmed404.class).make(); @Parameter(names={"-o","--output"},description=OPT_OUPUT_FILE_OR_STDOUT) - private File outFile=null; + private Path outFile=null; @Parameter(names={"-t","--timeout"},description="timeout in seconds") private int timeoutSeconds = 5; @Parameter(names={"-c","--collapse"},description="Only one URL per article. Print the '200/OK' first.") @@ -223,7 +226,7 @@ else if(eltName.equals(rootName)) token=token.substring(0,token.length()-1); } if(token.isEmpty()) continue; - if(!IOUtil.isUrl(token)) { + if(!IOUtils.isRemoteURI(token)) { if(token.startsWith("http")) LOG.debug("strange url: "+token); continue; } @@ -273,8 +276,6 @@ public int doWork(final List args) { InputStream in=null; try { /** create http client */ - - this.httpClient = HttpClients.createSystem();//createDefault(); @@ -290,7 +291,7 @@ public Object resolveEntity(String publicID, String systemID, String baseURI, St in=(inputName==null?stdin():IOUtils.openURIForReading(inputName)); r = xmlInputFactory.createXMLEventReader(in); - out = super.openFileOrStdoutAsPrintWriter(this.outFile); + out = super.openPathOrStdoutAsPrintWriter(this.outFile); out.println("#PMID\tTITLE\tYEAR\tURL\thttp.code\thttp.reason"); while(r.hasNext()) { @@ -320,8 +321,7 @@ public Object resolveEntity(String publicID, String systemID, String baseURI, St } -public static void main(final String[] args) - { +public static void main(final String[] args) { new Pubmed404().instanceMainWithExit(args); } } diff --git a/src/main/java/com/github/lindenb/jvarkit/tools/pubmed/PubmedDump.java b/src/main/java/com/github/lindenb/jvarkit/tools/pubmed/PubmedDump.java index 4cd8a60ae..d500597f7 100644 --- a/src/main/java/com/github/lindenb/jvarkit/tools/pubmed/PubmedDump.java +++ b/src/main/java/com/github/lindenb/jvarkit/tools/pubmed/PubmedDump.java @@ -26,10 +26,9 @@ of this software and associated documentation files (the "Software"), to deal import java.io.ByteArrayInputStream; -import java.io.File; import java.io.PrintWriter; +import java.nio.file.Path; import java.util.Arrays; -import java.util.Collections; import java.util.List; import java.util.Set; import java.util.stream.Collectors; @@ -95,7 +94,8 @@ of this software and associated documentation files (the "Software"), to deal @Program(name="pubmeddump",keywords={"ncbi","pubmed","xml"}, description="Dump XML results from pubmed/Eutils", biostars= {270498,365479}, - modificationDate="20190222" + creationDate="20140805", + modificationDate="20200204" ) public class PubmedDump extends Launcher @@ -105,9 +105,12 @@ public class PubmedDump @Parameter(names={"-e","--email"},description="optional user email") private String email = null; @Parameter(names={"-o","--output"},description=OPT_OUPUT_FILE_OR_STDOUT) - private File outputFile = null; + private Path outputFile = null; @Parameter(names={"-skip","--skip"},description="[20180302] Optional set of elements names to be ignored in the output. Spaces or comma separated. .eg: 'AuthorList PubmedData '") private String skipTagsStr = ""; + @Parameter(names={"-r","--retmax"},description="value for 'retmax' parameter for Eutils.") + private int retmax_param =10_000; + @ParametersDelegate private NcbiApiKey ncbiApiKey = new NcbiApiKey(); @@ -133,6 +136,11 @@ else if(evt.isStartElement()) public int doWork(final List args) { PrintWriter pw=null; + if(this.retmax_param<=0) { + LOG.error("bad retmax value"); + return -1; + } + if(args.isEmpty()) { LOG.error("Query missing"); @@ -162,14 +170,14 @@ public int doWork(final List args) { try { - XMLInputFactory xmlInputFactory=XMLInputFactory.newFactory(); + final XMLInputFactory xmlInputFactory=XMLInputFactory.newFactory(); xmlInputFactory.setProperty(XMLInputFactory.IS_NAMESPACE_AWARE, Boolean.FALSE); xmlInputFactory.setProperty(XMLInputFactory.IS_COALESCING, Boolean.TRUE); xmlInputFactory.setProperty(XMLInputFactory.IS_REPLACING_ENTITY_REFERENCES, Boolean.TRUE); xmlInputFactory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE); xmlInputFactory.setXMLResolver(new XMLResolver() { @Override - public Object resolveEntity(String publicID, String systemID, String baseURI, String namespace) + public Object resolveEntity(final String publicID,final String systemID,final String baseURI, String namespace) throws XMLStreamException { LOG.info("ignoring DTD : "+publicID+" "+baseURI); return new ByteArrayInputStream(new byte[0]); @@ -181,11 +189,12 @@ public Object resolveEntity(String publicID, String systemID, String baseURI, St StringUtils.escapeHttp(query.toString())+ ncbiApiKey.getAmpParamValue()+ "&retstart=0&retmax=0&usehistory=y&retmode=xml"+ - (email==null?"":"&email="+StringUtils.escapeHttp(email))+ - (tool==null?"":"&tool="+StringUtils.escapeHttp(tool)) + (StringUtils.isBlank(this.email)?"":"&email="+StringUtils.escapeHttp(email))+ + (StringUtils.isBlank(this.tool)?"":"&tool="+StringUtils.escapeHttp(tool)) ; LOG.info(url); long expected_total_count=-1; + long total_found_so_far = 0; String WebEnv=null; String QueryKey=null; XMLEventReader r=xmlInputFactory.createXMLEventReader(new StreamSource(url)); @@ -217,15 +226,21 @@ else if(eName.equals("QueryKey")) LOG.error("Bad esearch result"); return -1; } - pw=super.openFileOrStdoutAsPrintWriter(outputFile); + pw=super.openPathOrStdoutAsPrintWriter(outputFile); final XMLOutputFactory xof=XMLOutputFactory.newFactory(); final XMLEventWriter w=xof.createXMLEventWriter(pw); - long total_found_so_far = 0L; - boolean end_document_printed=false; + final String xmlRootName = "PubmedArticleSet"; + + final XMLEventFactory xmlEventFactory = XMLEventFactory.newFactory(); + w.add(xmlEventFactory.createStartDocument()); + w.add(xmlEventFactory.createDTD(NcbiConstants.PUBMED_DTD)); + w.add(xmlEventFactory.createStartElement(new QName(xmlRootName),null,null)); + + while(total_found_so_far< expected_total_count) { - final int ret_max=90000; + final int ret_max= Math.max(1,Math.min(this.retmax_param,90_000)); LOG.info("nFound:"+total_found_so_far+"/"+expected_total_count); url= NcbiConstants.efetch()+"?"+ "db=pubmed&WebEnv="+ @@ -233,112 +248,88 @@ else if(eName.equals("QueryKey")) ncbiApiKey.getAmpParamValue()+ "&query_key="+StringUtils.escapeHttp(QueryKey)+ "&retmode=xml&retmax="+ret_max+"&retstart="+total_found_so_far+ - (email==null?"":"&email="+StringUtils.escapeHttp(email))+ - (tool==null?"":"&tool="+StringUtils.escapeHttp(tool)) + (StringUtils.isBlank(this.email)?"":"&email="+StringUtils.escapeHttp(email))+ + (StringUtils.isBlank(this.tool)?"":"&tool="+StringUtils.escapeHttp(tool)) ; LOG.info(url); int curr_count=0; r = xmlInputFactory.createXMLEventReader(new StreamSource(url)); - int current_dom_depth =0; + + while(r.hasNext()) { - final XMLEvent evt=r.nextEvent(); + final XMLEvent evt; + + try { + evt = r.nextEvent(); + } + catch(final XMLStreamException err) { + //2020 4 Feb. PLein d'erreur SSL a ce niveau au bout d'un moment. + LOG.error("skip loop",err); + break; + } switch(evt.getEventType()) { case XMLEvent.ATTRIBUTE: { - if(current_dom_depth>0) w.add(evt); + w.add(evt); break; } case XMLEvent.START_DOCUMENT: { - if(total_found_so_far==0) - { - w.add(evt); - } break; } case XMLEvent.END_DOCUMENT: { - if(total_found_so_far>= expected_total_count) - { - end_document_printed = true; - w.add(evt); - } break; } case XMLEvent.START_ELEMENT: { final String localName= evt.asStartElement().getName().getLocalPart(); - if(current_dom_depth==0) - { - if(!localName.equals("PubmedArticleSet")) { - throw new XMLStreamException("Expected but got <"+localName+">",evt.getLocation()); - } - if( total_found_so_far == 0) - { - w.add(evt); - } - current_dom_depth++; + if(localName.equals(xmlRootName)) { + break; } - else if(current_dom_depth==1) + + if(localName.equals("PubmedArticle") || localName.equals("PubmedBookArticle")) { - if(!(localName.equals("PubmedArticle") || localName.equals("PubmedBookArticle"))) - { - throw new IllegalStateException("Not PubmedArticle: "+evt); - } ++curr_count; ++total_found_so_far; - ++current_dom_depth; - w.add(evt); } - else if(current_dom_depth>1) + + if(skipTags.contains(localName)) { - if(skipTags.contains(localName)) - { - skip(r); - } - else - { - w.add(evt); - current_dom_depth++; - } + skip(r); } else { - LOG.warn("unmatched case <"+localName+"> depth:"+current_dom_depth); + w.add(evt); } break; } case XMLEvent.END_ELEMENT: { - current_dom_depth--; - if(current_dom_depth>0) - { - w.add(evt); - } - else if(total_found_so_far>=expected_total_count)//depth ==0 - { - end_document_printed = true; - w.add(evt); + final String localName= evt.asEndElement().getName().getLocalPart(); + + if(localName.equals(xmlRootName)) { + break; } + w.add(evt); break; } case XMLEvent.COMMENT:break; case XMLEvent.PROCESSING_INSTRUCTION:break; case XMLEvent.DTD: { - if(total_found_so_far==0) w.add(evt); break; } case XMLEvent.SPACE:break; case XMLEvent.CHARACTERS: { - if(current_dom_depth>1) w.add(evt); + w.add(evt); break; } default: @@ -352,11 +343,6 @@ else if(total_found_so_far>=expected_total_count)//depth ==0 if(curr_count==0) { LOG.info("Nothing found . Exiting."); - if(!end_document_printed) { - final XMLEventFactory xef = XMLEventFactory.newFactory(); - w.add(xef.createEndElement(new QName("PubmedArticleSet"),Collections.emptyIterator())); - w.add(xef.createEndDocument()); - } break; } else @@ -364,6 +350,10 @@ else if(total_found_so_far>=expected_total_count)//depth ==0 LOG.info("found "+curr_count+" total "+total_found_so_far+" expect "+expected_total_count); } } + + w.add(xmlEventFactory.createEndElement(new QName(xmlRootName),null)); + w.add(xmlEventFactory.createEndDocument()); + w.flush(); w.close(); pw.flush(); @@ -371,7 +361,7 @@ else if(total_found_so_far>=expected_total_count)//depth ==0 LOG.info("Done. found "+total_found_so_far+" / expected:" +expected_total_count+" articles."); return 0; } - catch(final Exception err) + catch(final Throwable err) { LOG.error(err); return -1; @@ -382,7 +372,7 @@ else if(total_found_so_far>=expected_total_count)//depth ==0 } } - public static void main(String[] args) { + public static void main(final String[] args) { new PubmedDump().instanceMainWithExit(args); } } diff --git a/src/main/java/com/github/lindenb/jvarkit/util/ncbi/NcbiConstants.java b/src/main/java/com/github/lindenb/jvarkit/util/ncbi/NcbiConstants.java index 694032359..eb27a3022 100644 --- a/src/main/java/com/github/lindenb/jvarkit/util/ncbi/NcbiConstants.java +++ b/src/main/java/com/github/lindenb/jvarkit/util/ncbi/NcbiConstants.java @@ -31,6 +31,8 @@ of this software and associated documentation files (the "Software"), to deal public class NcbiConstants { public static final String EUTILS_BASE_URL="https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"; + public static final String PUBMED_DTD = ""; + private static String service(final String name) { return EUTILS_BASE_URL+name+".fcgi"; }