From 7d753cce690823eddf7a37577b364fff78edd91d Mon Sep 17 00:00:00 2001 From: Malik Bouaoud Date: Thu, 18 Mar 2021 21:50:25 +0100 Subject: [PATCH 1/8] re-interrupting the thread in waitNewUrl and sleep after the InterruptedException is caught --- webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 886e74a92..9567236b0 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -458,6 +458,8 @@ protected void sleep(int time) { Thread.sleep(time); } catch (InterruptedException e) { logger.error("Thread interrupted when sleep",e); + //restore interrupted thread + Thread.currentThread().interrupt(); } } @@ -564,6 +566,7 @@ private void waitNewUrl() { newUrlCondition.await(emptySleepTime, TimeUnit.MILLISECONDS); } catch (InterruptedException e) { logger.warn("waitNewUrl - interrupted, error {}", e); + Thread.currentThread().interrupt(); } finally { newUrlLock.unlock(); } From 8d3c73ac675d159480cd66799089532c07fb41f0 Mon Sep 17 00:00:00 2001 From: Malik Bouaoud Date: Thu, 18 Mar 2021 22:09:30 +0100 Subject: [PATCH 2/8] declaring a NullPointerException that could be thrown by the IPUtils method --- .../src/main/java/us/codecraft/webmagic/utils/IPUtils.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/IPUtils.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/IPUtils.java index 3d416964b..dafb7ce70 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/IPUtils.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/IPUtils.java @@ -12,7 +12,7 @@ */ public abstract class IPUtils { - public static String getFirstNoLoopbackIPAddresses() throws SocketException { + public
static String getFirstNoLoopbackIPAddresses() throws SocketException, NullPointerException{ Enumeration networkInterfaces = NetworkInterface.getNetworkInterfaces(); From 13f9840b573b8c7305bf30d8f010fb3bc5817ac4 Mon Sep 17 00:00:00 2001 From: Malik Bouaoud Date: Tue, 23 Mar 2021 11:05:12 +0100 Subject: [PATCH 3/8] refactoring on constructors and setters --- .../us/codecraft/webmagic/utils/FilePersistentBase.java | 8 ++++++++ .../webmagic/pipeline/FilePageModelPipeline.java | 4 ++-- .../webmagic/pipeline/JsonFilePageModelPipeline.java | 4 ++-- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/FilePersistentBase.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/FilePersistentBase.java index 79b9efece..721401f8b 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/FilePersistentBase.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/FilePersistentBase.java @@ -11,6 +11,14 @@ public class FilePersistentBase { protected String path; + + public FilePersistentBase() { + setPath("/data/webmagic/"); + } + + public FilePersistentBase(String path) { + setPath(path); + } public static String PATH_SEPERATOR = "/"; diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/FilePageModelPipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/FilePageModelPipeline.java index 0db9b819d..22d818217 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/FilePageModelPipeline.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/FilePageModelPipeline.java @@ -28,11 +28,11 @@ public class FilePageModelPipeline extends FilePersistentBase implements PageMod * new JsonFilePageModelPipeline with default path "/data/webmagic/" */ public FilePageModelPipeline() { - setPath("/data/webmagic/"); + super(); } public FilePageModelPipeline(String path) { - setPath(path); + super(path); } @Override diff 
--git a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePageModelPipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePageModelPipeline.java index 7a7f80a25..7b3b3035c 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePageModelPipeline.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePageModelPipeline.java @@ -29,11 +29,11 @@ public class JsonFilePageModelPipeline extends FilePersistentBase implements Pag * new JsonFilePageModelPipeline with default path "/data/webmagic/" */ public JsonFilePageModelPipeline() { - setPath("/data/webmagic/"); + super(); } public JsonFilePageModelPipeline(String path) { - setPath(path); + super(path); } @Override From 246526b35238baa8fe7e957cba5cadfeafd6d02d Mon Sep 17 00:00:00 2001 From: Malik Bouaoud Date: Tue, 23 Mar 2021 11:58:55 +0100 Subject: [PATCH 4/8] removing some code smells in Page from webmagic core --- webmagic-core/src/main/java/us/codecraft/webmagic/Page.java | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java index c11df693c..f822b5602 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java @@ -49,12 +49,10 @@ public class Page { private byte[] bytes; - private List targetRequests = new ArrayList(); + private List targetRequests = new ArrayList<>(); private String charset; - public Page() { - } public static Page fail(){ Page page = new Page(); @@ -105,9 +103,9 @@ public Json getJson() { /** * @param html html - * @deprecated since 0.4.0 * The html is parse just when first time of calling {@link #getHtml()}, so use {@link #setRawText(String)} instead. 
*/ + public void setHtml(Html html) { this.html = html; } From de34155b532cbfca07a587a7eb413bc15c74aad8 Mon Sep 17 00:00:00 2001 From: Malik Bouaoud Date: Tue, 30 Mar 2021 09:49:59 +0200 Subject: [PATCH 5/8] correcting and removing some code smell in spider from webmagic core --- .../main/java/us/codecraft/webmagic/Spider.java | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 9567236b0..d9117ce2f 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -62,7 +62,7 @@ public class Spider implements Runnable, Task { protected Downloader downloader; - protected List pipelines = new ArrayList(); + protected List pipelines = new ArrayList<>(); protected PageProcessor pageProcessor; @@ -86,11 +86,11 @@ public class Spider implements Runnable, Task { protected boolean exitWhenComplete = true; - protected final static int STAT_INIT = 0; + protected static final int STAT_INIT = 0; - protected final static int STAT_RUNNING = 1; + protected static final int STAT_RUNNING = 1; - protected final static int STAT_STOPPED = 2; + protected static final int STAT_STOPPED = 2; protected boolean spawnUrl = true; @@ -246,7 +246,7 @@ public Spider setPipelines(List pipelines) { * @return this */ public Spider clearPipeline() { - pipelines = new ArrayList(); + pipelines = new ArrayList<>(); return this; } @@ -313,7 +313,8 @@ public void run() { // wait until new url added waitNewUrl(); } else { - threadPool.execute(new Runnable() { + threadPool.execute( + new Runnable() { @Override public void run() { try { @@ -427,7 +428,6 @@ private void onDownloadSuccess(Request request, Page page) { logger.info("page status code error, page {} , code: {}", request.getUrl(), page.getStatusCode()); } sleep(site.getSleepTime()); - return; } private void 
onDownloaderFail(Request request) { From 1946b3f899f92903f7797a56f907c9ffb7f00625 Mon Sep 17 00:00:00 2001 From: Malik Bouaoud Date: Tue, 30 Mar 2021 09:52:55 +0200 Subject: [PATCH 6/8] deleting deprecated method created 8 years ago --- .../java/us/codecraft/webmagic/Spider.java | 38 ++----------------- 1 file changed, 4 insertions(+), 34 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index d9117ce2f..59f633f23 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -167,17 +167,8 @@ public Spider setUUID(String uuid) { return this; } - /** - * set scheduler for Spider - * - * @param scheduler scheduler - * @return this - * @see #setScheduler(us.codecraft.webmagic.scheduler.Scheduler) - */ - @Deprecated - public Spider scheduler(Scheduler scheduler) { - return setScheduler(scheduler); - } + + /** * set scheduler for Spider @@ -200,17 +191,7 @@ public Spider setScheduler(Scheduler scheduler) { return this; } - /** - * add a pipeline for Spider - * - * @param pipeline pipeline - * @return this - * @see #addPipeline(us.codecraft.webmagic.pipeline.Pipeline) - * @deprecated - */ - public Spider pipeline(Pipeline pipeline) { - return addPipeline(pipeline); - } + /** * add a pipeline for Spider @@ -250,18 +231,7 @@ public Spider clearPipeline() { return this; } - /** - * set the downloader of spider - * - * @param downloader downloader - * @return this - * @see #setDownloader(us.codecraft.webmagic.downloader.Downloader) - * @deprecated - */ - public Spider downloader(Downloader downloader) { - return setDownloader(downloader); - } - + /** * set the downloader of spider * From 42c326bc9ecbff168cb1efa254a64b0c8c6c349e Mon Sep 17 00:00:00 2001 From: Malik Bouaoud Date: Tue, 30 Mar 2021 10:09:47 +0200 Subject: [PATCH 7/8] adding the deprecated methods to validate the pull request --- 
.../java/us/codecraft/webmagic/Spider.java | 38 +++++++++++++++++-- 1 file changed, 34 insertions(+), 4 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 59f633f23..d9117ce2f 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -167,8 +167,17 @@ public Spider setUUID(String uuid) { return this; } - - + /** + * set scheduler for Spider + * + * @param scheduler scheduler + * @return this + * @see #setScheduler(us.codecraft.webmagic.scheduler.Scheduler) + */ + @Deprecated + public Spider scheduler(Scheduler scheduler) { + return setScheduler(scheduler); + } /** * set scheduler for Spider @@ -191,7 +200,17 @@ public Spider setScheduler(Scheduler scheduler) { return this; } - + /** + * add a pipeline for Spider + * + * @param pipeline pipeline + * @return this + * @see #addPipeline(us.codecraft.webmagic.pipeline.Pipeline) + * @deprecated + */ + public Spider pipeline(Pipeline pipeline) { + return addPipeline(pipeline); + } /** * add a pipeline for Spider @@ -231,7 +250,18 @@ public Spider clearPipeline() { return this; } - + /** + * set the downloader of spider + * + * @param downloader downloader + * @return this + * @see #setDownloader(us.codecraft.webmagic.downloader.Downloader) + * @deprecated + */ + public Spider downloader(Downloader downloader) { + return setDownloader(downloader); + } + /** * set the downloader of spider * From 5ae70799b4e6e716be50146c64251bd77ac5562a Mon Sep 17 00:00:00 2001 From: Malik Bouaoud Date: Tue, 30 Mar 2021 10:11:21 +0200 Subject: [PATCH 8/8] refactoring the select method by removing continue statements and merging some conditions --- .../selector/SmartContentSelector.java | 41 +++++++++++-------- 1 file changed, 23 insertions(+), 18 deletions(-) diff --git 
a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java index ff8e26998..e4ffd38d1 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java @@ -15,7 +15,9 @@ */ @Experimental public class SmartContentSelector implements Selector { - + /*** + * Empty/ default constructor for SmartContentSelector + */ public SmartContentSelector() { } @@ -33,7 +35,7 @@ public String select(String html) { int start; int end; StringBuilder text = new StringBuilder(); - ArrayList indexDistribution = new ArrayList(); + ArrayList indexDistribution = new ArrayList<>(); lines = Arrays.asList(html.split("\n")); @@ -47,39 +49,42 @@ public String select(String html) { } start = -1; end = -1; - boolean boolstart = false, boolend = false; + boolean boolstart = false; + boolean boolend = false; text.setLength(0); - - for (int i = 0; i < indexDistribution.size() - 1; i++) { - if (indexDistribution.get(i) > threshold && ! boolstart) { - if (indexDistribution.get(i+1).intValue() != 0 + + int i=0; + while (i < indexDistribution.size() - 1) { + + if ((indexDistribution.get(i) > threshold && ! 
boolstart) + && (indexDistribution.get(i+1).intValue() != 0 || indexDistribution.get(i+2).intValue() != 0 - || indexDistribution.get(i+3).intValue() != 0) { + || indexDistribution.get(i+3).intValue() != 0) ){ boolstart = true; start = i; - continue; + i++; } - } - if (boolstart) { - if (indexDistribution.get(i).intValue() == 0 - || indexDistribution.get(i+1).intValue() == 0) { + + if ((boolstart) && (indexDistribution.get(i).intValue() == 0 + || indexDistribution.get(i+1).intValue() == 0) ){ end = i; boolend = true; } - } + + StringBuilder tmp = new StringBuilder(); if (boolend) { - //System.out.println(start+1 + "\t\t" + end+1); for (int ii = start; ii <= end; ii++) { - if (lines.get(ii).length() < 5) continue; + if (lines.get(ii).length() < 5) i++; tmp.append(lines.get(ii) + "\n"); } String str = tmp.toString(); - //System.out.println(str); - if (str.contains("Copyright") ) continue; + + if (str.contains("Copyright")) i++; text.append(str); boolstart = boolend = false; } + i++; } return text.toString(); }