diff --git a/pom.xml b/pom.xml
index 76336bc..7853360 100644
--- a/pom.xml
+++ b/pom.xml
@@ -20,6 +20,11 @@
             <version>RELEASE</version>
             <scope>provided</scope>
         </dependency>
+        <dependency>
+            <groupId>org.jsoup</groupId>
+            <artifactId>jsoup</artifactId>
+            <version>1.16.1</version>
+        </dependency>
         <dependency>
             <groupId>org.projectlombok</groupId>
             <artifactId>lombok</artifactId>
diff --git a/src/main/java/com/gatomalvado/done/multithreadedwebcrawler/HtmlParser.java b/src/main/java/com/gatomalvado/done/multithreadedwebcrawler/HtmlParser.java
new file mode 100644
index 0000000..071f310
--- /dev/null
+++ b/src/main/java/com/gatomalvado/done/multithreadedwebcrawler/HtmlParser.java
@@ -0,0 +1,10 @@
+package com.gatomalvado.done.multithreadedwebcrawler;
+
+import java.util.List;
+
+public interface HtmlParser {
+
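+    // Fetches the page at inputUrl and returns the absolute URLs of the links it contains.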
+    List<String> parseHtml(String inputUrl);
+
+}
diff --git a/src/main/java/com/gatomalvado/done/multithreadedwebcrawler/Main.java b/src/main/java/com/gatomalvado/done/multithreadedwebcrawler/Main.java
new file mode 100644
index 0000000..fbd6eec
--- /dev/null
+++ b/src/main/java/com/gatomalvado/done/multithreadedwebcrawler/Main.java
@@ -0,0 +1,18 @@
+package com.gatomalvado.done.multithreadedwebcrawler;
+
+import java.util.concurrent.ExecutionException;
+
+public class Main {
+
+    public static void main(String[] args) throws ExecutionException, InterruptedException {
+        System.out.println("Hello Multithreaded Web Crawler!");
+        String inputUrl = "https://en.wikipedia.org/wiki/As_It_Was";
+
+        MultithreadedWebCrawler crawler = new MultithreadedWebCrawler(new SingleThreadedHtmlParser(), 10);
+        crawler.startCrawl(inputUrl);
+        crawler.showParsedUrls();
+    }
+
+
+
+}
diff --git a/src/main/java/com/gatomalvado/done/multithreadedwebcrawler/MultithreadedWebCrawler.java b/src/main/java/com/gatomalvado/done/multithreadedwebcrawler/MultithreadedWebCrawler.java
new file mode 100644
index 0000000..532976d
--- /dev/null
+++ b/src/main/java/com/gatomalvado/done/multithreadedwebcrawler/MultithreadedWebCrawler.java
@@ -0,0 +1,76 @@
+package com.gatomalvado.done.multithreadedwebcrawler;
+
+import java.util.ArrayDeque;
+import java.util.Collections;
+import java.util.Deque;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+public class MultithreadedWebCrawler {
+
+    private final HtmlParser htmlParser;
+    private final Map<String, Boolean> map; // URLs already claimed by a crawl task
+    private final ExecutorService executorService;
+    private final int limit;
+
+    public MultithreadedWebCrawler(HtmlParser htmlParser, int limit) {
+        this.htmlParser = htmlParser;
+        this.map = new ConcurrentHashMap<>();
+        this.limit = limit;
+        this.executorService = Executors.newFixedThreadPool(12);
+    }
+
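+    // Breadth-first crawl: submit the seed URL, then keep draining completed
+    // futures and scheduling parse tasks for newly found links until the limit is reached.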
+    public void startCrawl(String inputUrl) throws ExecutionException, InterruptedException {
+        Future<List<String>> extractedUrls = executorService.submit(crawl(inputUrl));
+        Deque<Future<List<String>>> queue = new ArrayDeque<>();
+        queue.add(extractedUrls);
+        while (!queue.isEmpty()) {
+            if (map.size() >= limit) {
+                break;
+            }
+            Thread.sleep(3000); // give in-flight parse tasks time to finish before polling
+
+            Future<List<String>> extractedUrlsFuture = queue.removeFirst();
+            List<String> parsedUrls = extractedUrlsFuture.get();
+            for (String parsedUrl : parsedUrls) {
+                if (!map.containsKey(parsedUrl)) {
+                    Callable<List<String>> callable = crawl(parsedUrl);
+                    queue.add(executorService.submit(callable));
+                }
+            }
+        }
+        executorService.shutdown();
+        // Wait for existing tasks to complete
+        executorService.awaitTermination(1, TimeUnit.SECONDS);
+    }
+
+    public void showParsedUrls() {
+        for (String key : map.keySet()) {
+            System.out.println(key);
+        }
+    }
+
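+    // Builds a task that claims a URL, parses it on the worker pool, and returns
+    // only the links that have not been crawled yet.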
+    private Callable<List<String>> crawl(String url) {
+        return () -> {
+            // putIfAbsent is atomic, so only the first task to claim a URL parses it
+            if (map.putIfAbsent(url, true) == null) {
+                List<String> parsedUrls = htmlParser.parseHtml(url);
+                return parsedUrls.stream().filter(u -> !map.containsKey(u)).collect(Collectors.toUnmodifiableList());
+            }
+            return Collections.emptyList();
+        };
+    }
+
+}
\ No newline at end of file
diff --git a/src/main/java/com/gatomalvado/done/multithreadedwebcrawler/SingleThreadedHtmlParser.java b/src/main/java/com/gatomalvado/done/multithreadedwebcrawler/SingleThreadedHtmlParser.java
new file mode 100644
index 0000000..89d62a7
--- /dev/null
+++ b/src/main/java/com/gatomalvado/done/multithreadedwebcrawler/SingleThreadedHtmlParser.java
@@ -0,0 +1,55 @@
+package com.gatomalvado.done.multithreadedwebcrawler;
+
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.net.URL;
+import java.util.LinkedList;
+import java.util.List;
+
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+
+public class SingleThreadedHtmlParser implements HtmlParser {
+
+    @Override
+    public List<String> parseHtml(String inputUrl) {
+        String rawHtml = readUrl(inputUrl);
+        return getUrlsFromWebsite(rawHtml, inputUrl);
+    }
+
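+    // Collects the absolute URL of every anchor on the page; the page URL is passed
+    // as jsoup's base URI so relative links (e.g. /wiki/...) resolve correctly.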
+    private List<String> getUrlsFromWebsite(String rawHtml, String baseUrl) {
+        List<String> urls = new LinkedList<>();
+        Document doc = Jsoup.parse(rawHtml, baseUrl);
+        Elements elements = doc.select("a[href]");
+
+        for (Element element : elements) {
+            String link = element.attr("abs:href");
+            if (!link.isEmpty()) {
+                urls.add(link);
+            }
+        }
+
+        return urls;
+    }
+
+    private String readUrl(String webLink) {
+        StringBuilder rawHtml = new StringBuilder();
+        try {
+            URL url = new URL(webLink);
+            try (BufferedReader reader = new BufferedReader(new InputStreamReader(url.openStream()))) {
+                String inputLine;
+                while ((inputLine = reader.readLine()) != null) {
+                    rawHtml.append(inputLine);
+                }
+            }
+        } catch (Exception e) {
+            System.out.println("Error reading url: " + webLink);
+        }
+
+        return rawHtml.toString();
+    }
+}
diff --git a/src/main/java/com/gatomalvado/done/webcrawler/Main.java b/src/main/java/com/gatomalvado/done/webcrawler/Main.java
new file mode 100644
index 0000000..cfa3d55
--- /dev/null
+++ b/src/main/java/com/gatomalvado/done/webcrawler/Main.java
@@ -0,0 +1,12 @@
+package com.gatomalvado.done.webcrawler;
+
+public class Main {
+
+    public static void main(String[] args) throws InterruptedException {
+        System.out.println("Hello Simple Webcrawler!");
+        WebCrawler webCrawler = new WebCrawler();
+        webCrawler.crawl("https://en.wikipedia.org/wiki/As_It_Was");
+        System.out.println(webCrawler.getDiscoveredWebsites());
+    }
+
+}
diff --git a/src/main/java/com/gatomalvado/done/webcrawler/WebCrawler.java b/src/main/java/com/gatomalvado/done/webcrawler/WebCrawler.java
new file mode 100644
index 0000000..27c1395
--- /dev/null
+++ b/src/main/java/com/gatomalvado/done/webcrawler/WebCrawler.java
@@ -0,0 +1,96 @@
+package com.gatomalvado.done.webcrawler;
+
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.net.URL;
+import java.util.HashSet;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Queue;
+import java.util.Set;
+
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+
+import lombok.Getter;
+
+public class WebCrawler {
+
+    private Queue<String> queue;
+
+    @Getter
+    private Set<String> discoveredWebsites;
+
+    private int websitesProcessed;
+
+    public WebCrawler() {
+        this.queue = new LinkedList<>();
+        this.discoveredWebsites = new HashSet<>();
+        this.websitesProcessed = 0;
+    }
+
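+    // Breadth-first crawl from the seed URL: download each page, enqueue links that
+    // have not been discovered yet, and stop after 10,000 pages have been processed.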
+    public void crawl(String seedUrl) throws InterruptedException {
+        this.queue.offer(seedUrl);
+        Thread.sleep(3000);
+        while (!this.queue.isEmpty()) {
+            String currentUrl = this.queue.poll();
+            String rawHtml = readUrl(currentUrl);
+            if ("".equals(rawHtml)) {
+                continue;
+            }
+
+            List<String> urlsParsed = getUrlsFromWebsite(rawHtml, currentUrl);
+
+            for (String websiteUrl : urlsParsed) {
+                if (!discoveredWebsites.contains(websiteUrl)) {
+                    // System.out.println("Website found with URL: " + websiteUrl);
+                    queue.add(websiteUrl);
+                }
+            }
+
+            this.discoveredWebsites.add(currentUrl);
+            this.websitesProcessed++;
+
+            if (this.websitesProcessed == 10000) {
+                break;
+            }
+        }
+    }
+
+    private List<String> getUrlsFromWebsite(String rawHtml, String baseUrl) {
+        List<String> urls = new LinkedList<>();
+        Document doc = Jsoup.parse(rawHtml, baseUrl);
+        Elements elements = doc.select("a[href]");
+
+        for (Element element : elements) {
+            String link = element.attr("abs:href");
+            if (!link.isEmpty()) {
+                urls.add(link);
+            }
+        }
+
+        return urls;
+    }
+
+    private String readUrl(String webLink) {
+        StringBuilder rawHtml = new StringBuilder();
+        try {
+            URL url = new URL(webLink);
+            try (BufferedReader reader = new BufferedReader(new InputStreamReader(url.openStream()))) {
+                String inputLine;
+                while ((inputLine = reader.readLine()) != null) {
+                    rawHtml.append(inputLine);
+                }
+            }
+        } catch (Exception e) {
+            System.out.println("Error reading url: " + webLink);
+        }
+
+        return rawHtml.toString();
+    }
+
+}
diff --git a/src/main/java/com/gatomalvado/todo/multithreadedwebcrawler/Main.java b/src/main/java/com/gatomalvado/todo/multithreadedwebcrawler/Main.java
deleted file mode 100644
index 87ea6b9..0000000
--- a/src/main/java/com/gatomalvado/todo/multithreadedwebcrawler/Main.java
+++ /dev/null
@@ -1,9 +0,0 @@
-package com.gatomalvado.todo.multithreadedwebcrawler;
-
-public class Main {
-
-    public static void main(String[] args) {
-        System.out.println("Hello Multithreaded Web Crawler!");
-    }
-
-}
diff --git a/src/main/java/com/gatomalvado/todo/webcrawler/Main.java b/src/main/java/com/gatomalvado/todo/webcrawler/Main.java
deleted file mode 100644
index b19ece8..0000000
--- a/src/main/java/com/gatomalvado/todo/webcrawler/Main.java
+++ /dev/null
@@ -1,9 +0,0 @@
-package com.gatomalvado.todo.webcrawler;
-
-public class Main {
-
-    public static void main(String[] args) {
-        System.out.println("Hello Simple Webcrawler!");
-    }
-
-}