diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java index c11df693c..ea875e43f 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java @@ -108,6 +108,7 @@ public Json getJson() { * @deprecated since 0.4.0 * The html is parse just when first time of calling {@link #getHtml()}, so use {@link #setRawText(String)} instead. */ + @Deprecated public void setHtml(Html html) { this.html = html; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 5940e738d..8bede01bf 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -86,11 +86,11 @@ public class Spider implements Runnable, Task { protected boolean exitWhenComplete = true; - protected final static int STAT_INIT = 0; + protected static final int STAT_INIT = 0; - protected final static int STAT_RUNNING = 1; + protected static final int STAT_RUNNING = 1; - protected final static int STAT_STOPPED = 2; + protected static final int STAT_STOPPED = 2; protected boolean spawnUrl = true; @@ -171,6 +171,7 @@ public Spider setUUID(String uuid) { * set scheduler for Spider * * @param scheduler scheduler + * @deprecated since 0.4.0 * @return this * @see #setScheduler(us.codecraft.webmagic.scheduler.Scheduler) */ @@ -543,9 +544,9 @@ protected CollectorPipeline getCollectorPipeline() { public T get(String url) { List urls = WMCollections.newArrayList(url); - List resultItemses = getAll(urls); - if (resultItemses != null && resultItemses.size() > 0) { - return resultItemses.get(0); + List singleResultItems = getAll(urls); + if (singleResultItems != null && !singleResultItems.isEmpty()) { + return singleResultItems.get(0); } else { return null; } @@ -677,7 +678,7 @@ public Status getStatus() { 
public enum Status { - Init(0), Running(1), Stopped(2); + INIT(0), RUNNING(1), STOPPED(2); private Status(int value) { this.value = value; @@ -696,7 +697,7 @@ public static Status fromValue(int value) { } } //default value - return Init; + return INIT; } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java index 80e0f1085..75f899925 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java @@ -69,9 +69,7 @@ private SSLConnectionSocketFactory buildSSLConnectionSocketFactory() { return new SSLConnectionSocketFactory(sslContext, supportedProtocols, null, new DefaultHostnameVerifier()); // 优先绕过安全证书 - } catch (KeyManagementException e) { - logger.error("ssl connection fail", e); - } catch (NoSuchAlgorithmException e) { + } catch (KeyManagementException | NoSuchAlgorithmException e) { logger.error("ssl connection fail", e); } return SSLConnectionSocketFactory.getSocketFactory(); @@ -91,7 +89,7 @@ public void checkServerTrusted(X509Certificate[] chain, String authType) throws @Override public X509Certificate[] getAcceptedIssuers() { - return null; + return new X509Certificate[0]; } }; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/model/HttpRequestBody.java b/webmagic-core/src/main/java/us/codecraft/webmagic/model/HttpRequestBody.java index 7d3b30785..5ae8126f4 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/model/HttpRequestBody.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/model/HttpRequestBody.java @@ -17,8 +17,10 @@ public class HttpRequestBody implements Serializable { private static final long serialVersionUID = 5659170945717023595L; + + private static final String ENCODING_ERROR ="illegal encoding "; - public static abstract class ContentType { + public 
abstract static class ContentType { public static final String JSON = "application/json"; @@ -68,7 +70,7 @@ public static HttpRequestBody json(String json, String encoding) { try { return new HttpRequestBody(json.getBytes(encoding), ContentType.JSON, encoding); } catch (UnsupportedEncodingException e) { - throw new IllegalArgumentException("illegal encoding " + encoding, e); + throw new IllegalArgumentException(ENCODING_ERROR+ encoding, e); } } @@ -76,7 +78,7 @@ public static HttpRequestBody xml(String xml, String encoding) { try { return new HttpRequestBody(xml.getBytes(encoding), ContentType.XML, encoding); } catch (UnsupportedEncodingException e) { - throw new IllegalArgumentException("illegal encoding " + encoding, e); + throw new IllegalArgumentException(ENCODING_ERROR + encoding, e); } } @@ -92,7 +94,7 @@ public static HttpRequestBody form(Map params, String encoding){ try { return new HttpRequestBody(URLEncodedUtils.format(nameValuePairs, encoding).getBytes(encoding), ContentType.FORM, encoding); } catch (UnsupportedEncodingException e) { - throw new IllegalArgumentException("illegal encoding " + encoding, e); + throw new IllegalArgumentException(ENCODING_ERROR + encoding, e); } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/BaiduBaikePageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/BaiduBaikePageProcessor.java index f6ad87e05..070db4803 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/BaiduBaikePageProcessor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/BaiduBaikePageProcessor.java @@ -42,9 +42,9 @@ public static void main(String[] args) { list.add(String.format(urlTemplate,"太阳能")); list.add(String.format(urlTemplate,"地热发电")); list.add(String.format(urlTemplate,"地热发电")); - List resultItemses = spider.getAll(list); - for (ResultItems resultItemse : resultItemses) { - System.out.println(resultItemse.getAll()); + List 
multiResultItems = spider.getAll(list); + for (ResultItems singleResultItems : multiResultItems) { + System.out.println(singleResultItems.getAll()); } spider.close(); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/DuplicateRemover.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/DuplicateRemover.java index 4139f27fd..8c879ba61 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/DuplicateRemover.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/DuplicateRemover.java @@ -4,7 +4,7 @@ import us.codecraft.webmagic.Task; /** - * Remove duplicate requests. + * Remove duplicate requests.
* @author code4crafer@gmail.com * @since 0.5.1 */ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AbstractSelectable.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AbstractSelectable.java index e2bb55215..5e868ebb8 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AbstractSelectable.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AbstractSelectable.java @@ -91,7 +91,7 @@ public Selectable replace(String regex, String replacement) { } public String getFirstSourceText() { - if (getSourceTexts() != null && getSourceTexts().size() > 0) { + if (getSourceTexts() != null && !getSourceTexts().isEmpty()) { return getSourceTexts().get(0); } return null; @@ -104,6 +104,6 @@ public String toString() { @Override public boolean match() { - return getSourceTexts() != null && getSourceTexts().size() > 0; + return getSourceTexts() != null && !getSourceTexts().isEmpty(); } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AndSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AndSelector.java index 135442dc6..d51e29824 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AndSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AndSelector.java @@ -48,7 +48,7 @@ public List selectList(String text) { resultsTemp.addAll(selector.selectList(result)); } results = resultsTemp; - if (results == null || results.size() == 0) { + if (results.size() == 0) { return results; } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java index bbc7217ab..64c7ef621 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java @@ -7,6 +7,8 @@ import java.util.List; /** + * Base 
selector for html elements. + * * @author code4crafter@gmail.com * @since 0.3.0 */ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java index f5c0baeb5..de1e68c64 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java @@ -33,7 +33,7 @@ public String select(String text) { } if (object instanceof List) { List list = (List) object; - if (list != null && list.size() > 0) { + if (list.size() > 0) { return toString(list.iterator().next()); } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java index ff8e26998..df798e81e 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java @@ -51,22 +51,21 @@ public String select(String html) { text.setLength(0); for (int i = 0; i < indexDistribution.size() - 1; i++) { - if (indexDistribution.get(i) > threshold && ! boolstart) { - if (indexDistribution.get(i+1).intValue() != 0 - || indexDistribution.get(i+2).intValue() != 0 - || indexDistribution.get(i+3).intValue() != 0) { + if (indexDistribution.get(i) > threshold && ! 
boolstart + && !isAnyIndexDistributionZero(indexDistribution,i+1,i+2,i+3)){ boolstart = true; start = i; continue; } - } - if (boolstart) { - if (indexDistribution.get(i).intValue() == 0 - || indexDistribution.get(i+1).intValue() == 0) { + + if (boolstart && isAnyIndexDistributionZero (indexDistribution,i,i+1,0)) { + end = i; boolend = true; - } + } + + StringBuilder tmp = new StringBuilder(); if (boolend) { //System.out.println(start+1 + "\t\t" + end+1); @@ -83,9 +82,25 @@ public String select(String html) { } return text.toString(); } - + + @Override public List selectList(String text) { throw new UnsupportedOperationException(); } + + private static boolean isAnyIndexDistributionZero( ArrayList indexDistribution, int index, int successorIndex, int afterSuccessorIndex) { + + + if (afterSuccessorIndex != 0) { + return (indexDistribution.get(index).intValue() == 0 + && indexDistribution.get(successorIndex).intValue() == 0 + && indexDistribution.get(afterSuccessorIndex).intValue() == 0 ); + }else { + return (indexDistribution.get(index).intValue() == 0 + || indexDistribution.get(successorIndex).intValue() == 0); + } + + } + } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/HttpConstant.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/HttpConstant.java index 2d6b8fe2a..7459e7327 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/HttpConstant.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/HttpConstant.java @@ -7,7 +7,7 @@ */ public abstract class HttpConstant { - public static abstract class Method { + public abstract static class Method { public static final String GET = "GET"; @@ -25,13 +25,13 @@ public static abstract class Method { } - public static abstract class StatusCode { + public abstract static class StatusCode { public static final int CODE_200 = 200; } - public static abstract class Header { + public abstract static class Header { public static final String REFERER = "Referer"; diff 
--git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/NumberUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/NumberUtils.java index 55e185105..e73d1d9a8 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/NumberUtils.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/NumberUtils.java @@ -1,6 +1,8 @@ package us.codecraft.webmagic.utils; /** + * Number comparison utility for schedule priority + * * @author yihua.huang@dianping.com */ public abstract class NumberUtils { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/ProxyUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/ProxyUtils.java index 9b734c73c..5661d3846 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/ProxyUtils.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/ProxyUtils.java @@ -30,13 +30,12 @@ public static boolean validateProxy(Proxy p) { logger.warn("FAILRE - CAN not connect! remote: " + p); return false; } finally { - if (socket != null) { - try { + try { socket.close(); } catch (IOException e) { logger.warn("Error occurred while closing socket of validating proxy", e); } - } + } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/WMCollections.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/WMCollections.java index 23e1644ce..5fbe37d0d 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/WMCollections.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/WMCollections.java @@ -6,6 +6,8 @@ import java.util.Set; /** + * WebMagic collections builders + * * @author code4crafter@gmail.com * Date: 16/12/18 * Time: 上午10:16 diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java index 6055bdb0f..5fd7f66a1 100644 --- 
a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java @@ -20,6 +20,7 @@ public class PhantomJSDownloader extends AbstractDownloader { private static Logger logger = LoggerFactory.getLogger(PhantomJSDownloader.class); private static String crawlJsPath; private static String phantomJsCommand = "phantomjs"; // default + private static final String HTTP_REQUEST_ERROR = "HTTP request failed"; private int retryNum; private int threadNum; @@ -91,14 +92,14 @@ public Page download(Request request, Task task) { logger.info("downloading page: " + request.getUrl()); } String content = getPage(request); - if (content.contains("HTTP request failed")) { + if (content.contains(HTTP_REQUEST_ERROR)) { for (int i = 1; i <= getRetryNum(); i++) { content = getPage(request); - if (!content.contains("HTTP request failed")) { + if (!content.contains(HTTP_REQUEST_ERROR)) { break; } } - if (content.contains("HTTP request failed")) { + if (content.contains(HTTP_REQUEST_ERROR)) { //when failed Page page = new Page(); page.setRequest(request); diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/BaiduBaike.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/BaiduBaike.java index 003c5730d..e830bbb09 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/BaiduBaike.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/BaiduBaike.java @@ -40,9 +40,9 @@ public static void main(String[] args) { list.add(String.format(urlTemplate,"太阳能")); list.add(String.format(urlTemplate,"地热发电")); list.add(String.format(urlTemplate,"地热发电")); - List resultItemses = ooSpider.getAll(list); - for (BaiduBaike resultItemse : resultItemses) { - System.out.println(resultItemse); + List multiResultItems = ooSpider.getAll(list); + for (BaiduBaike singleResultItems : multiResultItems) { + 
System.out.println(singleResultItems); } ooSpider.close(); } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternProcessor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternProcessor.java index f9ef286b2..4f428851b 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternProcessor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternProcessor.java @@ -7,7 +7,7 @@ public abstract class PatternProcessor extends PatternRequestMatcher implements /** * @param pattern url pattern to handle */ - public PatternProcessor(String pattern) { + protected PatternProcessor(String pattern) { super(pattern); } } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternRequestMatcher.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternRequestMatcher.java index 1be61a8f3..59559ed23 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternRequestMatcher.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternRequestMatcher.java @@ -25,7 +25,7 @@ public abstract class PatternRequestMatcher implements RequestMatcher { /** * @param pattern url pattern to handle */ - public PatternRequestMatcher(String pattern) { + protected PatternRequestMatcher(String pattern) { this.pattern = pattern; this.patternCompiled = Pattern.compile(pattern); } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java index a2cba1332..fa9d45244 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java @@ -27,11 +27,13 @@ public FieldExtractor(Field field, Selector selector, Source source, boolean not Field getField() { return field; } - + + @Override Selector 
getSelector() { return selector; } - + + @Override Source getSource() { return source; } @@ -44,6 +46,7 @@ Method getSetterMethod() { return setterMethod; } + @Override boolean isNotNull() { return notNull; } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java index d0537163c..1f6f859c8 100755 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java @@ -18,11 +18,11 @@ public abstract class MultiKeyMapBase { @SuppressWarnings("rawtypes") private Class protoMapClass = DEFAULT_CLAZZ; - public MultiKeyMapBase() { + protected MultiKeyMapBase() { } @SuppressWarnings("rawtypes") - public MultiKeyMapBase(Class protoMapClass) { + protected MultiKeyMapBase(Class protoMapClass) { this.protoMapClass = protoMapClass; }