From d26ec838e082759404f78469df696a8d67d10dcb Mon Sep 17 00:00:00 2001
From: Vishal M Yadav
Date: Mon, 2 Dec 2024 08:55:24 +0530
Subject: [PATCH] added web crawler code

---
 pom.xml                                           |  5 +
 .../multithreadedwebcrawler/HtmlParser.java       |  9 ++
 .../done/multithreadedwebcrawler/Main.java        | 18 ++++
 .../MultithreadedWebCrawler.java                  | 72 ++++++++++++++
 .../SingleThreadedHtmlParser.java                 | 53 ++++++++++
 .../com/gatomalvado/done/webcrawler/Main.java     | 12 +++
 .../done/webcrawler/WebCrawler.java               | 96 +++++++++++++++++++
 .../todo/multithreadedwebcrawler/Main.java        |  9 --
 .../com/gatomalvado/todo/webcrawler/Main.java     |  9 --
 9 files changed, 265 insertions(+), 18 deletions(-)
 create mode 100644 src/main/java/com/gatomalvado/done/multithreadedwebcrawler/HtmlParser.java
 create mode 100644 src/main/java/com/gatomalvado/done/multithreadedwebcrawler/Main.java
 create mode 100644 src/main/java/com/gatomalvado/done/multithreadedwebcrawler/MultithreadedWebCrawler.java
 create mode 100644 src/main/java/com/gatomalvado/done/multithreadedwebcrawler/SingleThreadedHtmlParser.java
 create mode 100644 src/main/java/com/gatomalvado/done/webcrawler/Main.java
 create mode 100644 src/main/java/com/gatomalvado/done/webcrawler/WebCrawler.java
 delete mode 100644 src/main/java/com/gatomalvado/todo/multithreadedwebcrawler/Main.java
 delete mode 100644 src/main/java/com/gatomalvado/todo/webcrawler/Main.java

diff --git a/pom.xml b/pom.xml
index 76336bc..7853360 100644
--- a/pom.xml
+++ b/pom.xml
@@ -20,6 +20,11 @@
             <version>RELEASE</version>
             <scope>provided</scope>
         </dependency>
+        <dependency>
+            <groupId>org.jsoup</groupId>
+            <artifactId>jsoup</artifactId>
+            <version>1.16.1</version>
+        </dependency>
         <dependency>
             <groupId>org.projectlombok</groupId>
             <artifactId>lombok</artifactId>
diff --git a/src/main/java/com/gatomalvado/done/multithreadedwebcrawler/HtmlParser.java b/src/main/java/com/gatomalvado/done/multithreadedwebcrawler/HtmlParser.java
new file mode 100644
index 0000000..071f310
--- /dev/null
+++ b/src/main/java/com/gatomalvado/done/multithreadedwebcrawler/HtmlParser.java
@@ -0,0 +1,9 @@
+package com.gatomalvado.done.multithreadedwebcrawler;
+
+import java.util.List;
+
+public interface HtmlParser {
+
+    List<String> parseHtml(String inputUrl);
+
+}
diff --git a/src/main/java/com/gatomalvado/done/multithreadedwebcrawler/Main.java b/src/main/java/com/gatomalvado/done/multithreadedwebcrawler/Main.java
new file mode 100644
index 0000000..fbd6eec
--- /dev/null
+++ b/src/main/java/com/gatomalvado/done/multithreadedwebcrawler/Main.java
@@ -0,0 +1,18 @@
+package com.gatomalvado.done.multithreadedwebcrawler;
+
+import java.util.concurrent.ExecutionException;
+
+public class Main {
+
+    public static void main(String[] args) throws ExecutionException, InterruptedException {
+        System.out.println("Hello Multithreaded Web Crawler!");
+        String inputUrl = "https://en.wikipedia.org/wiki/As_It_Was";
+
+        MultithreadedWebCrawler crawler = new MultithreadedWebCrawler(new SingleThreadedHtmlParser(), 10);
+        crawler.startCrawl(inputUrl);
+        crawler.showParsedUrls();
+    }
+
+
+
+}
diff --git a/src/main/java/com/gatomalvado/done/multithreadedwebcrawler/MultithreadedWebCrawler.java b/src/main/java/com/gatomalvado/done/multithreadedwebcrawler/MultithreadedWebCrawler.java
new file mode 100644
index 0000000..532976d
--- /dev/null
+++ b/src/main/java/com/gatomalvado/done/multithreadedwebcrawler/MultithreadedWebCrawler.java
@@ -0,0 +1,72 @@
+package com.gatomalvado.done.multithreadedwebcrawler;
+
+import java.util.ArrayDeque;
+import java.util.Collections;
+import java.util.Deque;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+public class MultithreadedWebCrawler {
+
+    private final HtmlParser htmlParser;
+    private final Map<String, Boolean> map;
+    private final ExecutorService executorService;
+    private final int limit;
+
+    public MultithreadedWebCrawler(HtmlParser htmlParser, int limit) {
+        this.htmlParser = htmlParser;
+        this.map = new ConcurrentHashMap<>();
+        this.limit = limit;
+        this.executorService = Executors.newFixedThreadPool(12);
+    }
+
+    public void startCrawl(String inputUrl) throws ExecutionException, InterruptedException {
+        Future<List<String>> extractedUrls = executorService.submit(crawl(inputUrl));
+        Deque<Future<List<String>>> queue = new ArrayDeque<>();
+        queue.add(extractedUrls);
+        while (!queue.isEmpty()) {
+            if(map.size() >= limit) {
+                break;
+            }
+            Thread.sleep(3000);
+
+            Future<List<String>> extractedUrlsFuture = queue.removeFirst();
+            List<String> parsedUrls = extractedUrlsFuture.get();
+            for(String parsedUrl : parsedUrls) {
+                if (!map.containsKey(parsedUrl)) {
+                    Callable<List<String>> callable = crawl(parsedUrl);
+                    queue.add(executorService.submit(callable));
+                }
+            }
+        }
+        executorService.shutdown();
+        // Wait for existing tasks to complete
+        executorService.awaitTermination(1, TimeUnit.SECONDS);
+    }
+
+    public void showParsedUrls() {
+        for(String key : map.keySet()) {
+            System.out.println(key);
+        }
+    }
+
+    private Callable<List<String>> crawl(String url) {
+        return () -> {
+            if(!map.containsKey(url)) {
+                List<String> parsedUrls = htmlParser.parseHtml(url);
+                map.put(url, true);
+                return parsedUrls.stream().filter((u) -> !map.containsKey(u)).collect(Collectors.toUnmodifiableList());
+            }
+            return Collections.emptyList();
+        };
+    }
+
+}
\ No newline at end of file
diff --git a/src/main/java/com/gatomalvado/done/multithreadedwebcrawler/SingleThreadedHtmlParser.java b/src/main/java/com/gatomalvado/done/multithreadedwebcrawler/SingleThreadedHtmlParser.java
new file mode 100644
index 0000000..89d62a7
--- /dev/null
+++ b/src/main/java/com/gatomalvado/done/multithreadedwebcrawler/SingleThreadedHtmlParser.java
@@ -0,0 +1,53 @@
+package com.gatomalvado.done.multithreadedwebcrawler;
+
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.net.URL;
+import java.util.LinkedList;
+import java.util.List;
+
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+
+public class SingleThreadedHtmlParser implements HtmlParser {
+
+    @Override
+    public List<String> parseHtml(String inputUrl) {
+        String rawHtml = readUrl(inputUrl);
+        return getUrlsFromWebsite(rawHtml);
+    }
+
+    private List<String> getUrlsFromWebsite(String rawHtml) {
+        List<String> urls = new LinkedList<>();
+        Document doc = Jsoup.parse(rawHtml);
+        Elements elements = doc.select("a[href]");
+
+        for(Element element : elements) {
+            String link = element.attr("abs:href");
+            if(!link.isEmpty()) {
+                urls.add(link);
+            }
+        }
+
+        return urls;
+    }
+
+    private String readUrl(String webLink) {
+        String rawHtml = "";
+        try {
+            URL url = new URL(webLink);
+            BufferedReader reader = new BufferedReader(new InputStreamReader(url.openStream()));
+            String inputLine = "";
+            while ((inputLine = reader.readLine()) != null) {
+                rawHtml += inputLine;
+            }
+            reader.close();
+        } catch (Exception e) {
+            System.out.println("Error reading url: " + webLink);
+        }
+
+        return rawHtml;
+    }
+}
diff --git a/src/main/java/com/gatomalvado/done/webcrawler/Main.java b/src/main/java/com/gatomalvado/done/webcrawler/Main.java
new file mode 100644
index 0000000..cfa3d55
--- /dev/null
+++ b/src/main/java/com/gatomalvado/done/webcrawler/Main.java
@@ -0,0 +1,12 @@
+package com.gatomalvado.done.webcrawler;
+
+public class Main {
+
+    public static void main(String[] args) throws InterruptedException {
+        System.out.println("Hello Simple Webcrawler!");
+        WebCrawler webCrawler = new WebCrawler();
+        webCrawler.crawl("https://en.wikipedia.org/wiki/As_It_Was");
+        System.out.println(webCrawler.getDiscoveredWebsites());
+    }
+
+}
diff --git a/src/main/java/com/gatomalvado/done/webcrawler/WebCrawler.java b/src/main/java/com/gatomalvado/done/webcrawler/WebCrawler.java
new file mode 100644
index 0000000..27c1395
--- /dev/null
+++ b/src/main/java/com/gatomalvado/done/webcrawler/WebCrawler.java
@@ -0,0 +1,96 @@
+package com.gatomalvado.done.webcrawler;
+
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.net.URL;
+import java.util.HashSet;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Queue;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+
+import lombok.Getter;
+
+public class WebCrawler {
+
+    private Queue<String> queue;
+
+    @Getter
+    private Set<String> discoveredWebsites;
+
+    private int websitesProcessed;
+
+    public WebCrawler() {
+        this.queue = new LinkedList<>();
+        this.discoveredWebsites = new HashSet<>();
+        this.websitesProcessed = 0;
+    }
+
+    public void crawl(String seedUrl) throws InterruptedException {
+        this.queue.offer(seedUrl);
+        Thread.sleep(3000);
+        while (!this.queue.isEmpty()) {
+            String currentUrl = this.queue.poll();
+            String rawHtml = readUrl(currentUrl);
+            if("".equals(rawHtml)) {
+                continue;
+            }
+
+            List<String> urlsParsed = getUrlsFromWebsite(rawHtml);
+
+            for (String websiteUrl : urlsParsed) {
+                if(!discoveredWebsites.contains(websiteUrl)) {
+                    // System.out.println("Website found with URL: " + websiteUrl);
+                    queue.add(websiteUrl);
+                }
+            }
+
+            this.discoveredWebsites.add(currentUrl);
+            this.websitesProcessed++;
+
+            if(this.websitesProcessed == 10000) {
+                break;
+            }
+        }
+    }
+
+    private List<String> getUrlsFromWebsite(String rawHtml) {
+        List<String> urls = new LinkedList<>();
+        Document doc = Jsoup.parse(rawHtml);
+        Elements elements = doc.select("a[href]");
+
+        for(Element element : elements) {
+            String link = element.attr("abs:href");
+            if(!link.isEmpty()) {
+                urls.add(link);
+            }
+        }
+
+        return urls;
+    }
+
+    private String readUrl(String webLink) {
+        String rawHtml = "";
+        try {
+            URL url = new URL(webLink);
+            BufferedReader reader = new BufferedReader(new InputStreamReader(url.openStream()));
+            String inputLine = "";
+            while ((inputLine = reader.readLine()) != null) {
+                rawHtml += inputLine;
+            }
+            reader.close();
+        } catch (Exception e) {
+            System.out.println("Error reading url: " + webLink);
+        }
+
+        return rawHtml;
+    }
+
+}
diff --git a/src/main/java/com/gatomalvado/todo/multithreadedwebcrawler/Main.java b/src/main/java/com/gatomalvado/todo/multithreadedwebcrawler/Main.java
deleted file mode 100644
index 87ea6b9..0000000
--- a/src/main/java/com/gatomalvado/todo/multithreadedwebcrawler/Main.java
+++ /dev/null
@@ -1,9 +0,0 @@
-package com.gatomalvado.todo.multithreadedwebcrawler;
-
-public class Main {
-
-    public static void main(String[] args) {
-        System.out.println("Hello Multithreaded Web Crawler!");
-    }
-
-}
diff --git a/src/main/java/com/gatomalvado/todo/webcrawler/Main.java b/src/main/java/com/gatomalvado/todo/webcrawler/Main.java
deleted file mode 100644
index b19ece8..0000000
--- a/src/main/java/com/gatomalvado/todo/webcrawler/Main.java
+++ /dev/null
@@ -1,9 +0,0 @@
-package com.gatomalvado.todo.webcrawler;
-
-public class Main {
-
-    public static void main(String[] args) {
-        System.out.println("Hello Simple Webcrawler!");
-    }
-
-}