-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Vishal M Yadav
committed
Dec 2, 2024
1 parent
0d8fc65
commit d26ec83
Showing
9 changed files
with
265 additions
and
18 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
9 changes: 9 additions & 0 deletions
9
src/main/java/com/gatomalvado/done/multithreadedwebcrawler/HtmlParser.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
package com.gatomalvado.done.multithreadedwebcrawler; | ||
|
||
import java.util.List; | ||
|
||
/**
 * Abstraction over "given a URL, produce a list of strings" used by the
 * multithreaded crawler.
 *
 * The implementation in this commit, SingleThreadedHtmlParser, downloads the
 * page at the URL and returns the link targets of its anchor elements.
 */
public interface HtmlParser { | ||
|
||
/**
 * Processes the page identified by {@code inputUrl}.
 *
 * @param inputUrl the URL handed to the parser
 * @return the strings extracted for that URL (links, in the visible implementation)
 */
List<String> parseHtml(String inputUrl); | ||
|
||
} |
18 changes: 18 additions & 0 deletions
18
src/main/java/com/gatomalvado/done/multithreadedwebcrawler/Main.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
package com.gatomalvado.done.multithreadedwebcrawler; | ||
|
||
import java.util.concurrent.ExecutionException; | ||
|
||
public class Main { | ||
|
||
public static void main(String[] args) throws ExecutionException, InterruptedException { | ||
System.out.println("Hello Multithreaded Web Crawler!"); | ||
String inputUrl = "https://en.wikipedia.org/wiki/As_It_Was"; | ||
|
||
MultithreadedWebCrawler crawler = new MultithreadedWebCrawler(new SingleThreadedHtmlParser(), 10); | ||
crawler.startCrawl(inputUrl); | ||
crawler.showParsedUrls(); | ||
} | ||
|
||
|
||
|
||
} |
72 changes: 72 additions & 0 deletions
72
src/main/java/com/gatomalvado/done/multithreadedwebcrawler/MultithreadedWebCrawler.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
package com.gatomalvado.done.multithreadedwebcrawler; | ||
|
||
import java.util.ArrayDeque; | ||
import java.util.Collections; | ||
import java.util.Deque; | ||
import java.util.List; | ||
import java.util.Map; | ||
import java.util.concurrent.Callable; | ||
import java.util.concurrent.ConcurrentHashMap; | ||
import java.util.concurrent.ExecutionException; | ||
import java.util.concurrent.ExecutorService; | ||
import java.util.concurrent.Executors; | ||
import java.util.concurrent.Future; | ||
import java.util.concurrent.TimeUnit; | ||
import java.util.stream.Collectors; | ||
|
||
public class MultithreadedWebCrawler { | ||
|
||
private final HtmlParser htmlParser; | ||
private final Map<String, Boolean> map; | ||
private final ExecutorService executorService; | ||
private final int limit; | ||
|
||
public MultithreadedWebCrawler(HtmlParser htmlParser, int limit) { | ||
this.htmlParser = htmlParser; | ||
this.map = new ConcurrentHashMap<>(); | ||
this.limit = limit; | ||
this.executorService = Executors.newFixedThreadPool(12); | ||
} | ||
|
||
public void startCrawl(String inputUrl) throws ExecutionException, InterruptedException { | ||
Future<List<String>> extractedUrls = executorService.submit(crawl(inputUrl)); | ||
Deque<Future<List<String>>> queue = new ArrayDeque<>(); | ||
queue.add(extractedUrls); | ||
while (!queue.isEmpty()) { | ||
if(map.size() >= limit) { | ||
break; | ||
} | ||
Thread.sleep(3000); | ||
|
||
Future<List<String>> extractedUrlsFuture = queue.removeFirst(); | ||
List<String> parsedUrls = extractedUrlsFuture.get(); | ||
for(String parsedUrl : parsedUrls) { | ||
if (!map.containsKey(parsedUrl)) { | ||
Callable<List<String>> callable = crawl(parsedUrl); | ||
queue.add(executorService.submit(callable)); | ||
} | ||
} | ||
} | ||
executorService.shutdown(); | ||
// Wait for existing tasks to complete | ||
executorService.awaitTermination(1, TimeUnit.SECONDS); | ||
} | ||
|
||
public void showParsedUrls() { | ||
for(String key : map.keySet()) { | ||
System.out.println(key); | ||
} | ||
} | ||
|
||
private Callable<List<String>> crawl(String url) { | ||
return () -> { | ||
if(!map.containsKey(url)) { | ||
List<String> parsedUrls = htmlParser.parseHtml(url); | ||
map.put(url, true); | ||
return parsedUrls.stream().filter((u) -> !map.containsKey(u)).collect(Collectors.toUnmodifiableList()); | ||
} | ||
return Collections.emptyList(); | ||
}; | ||
} | ||
|
||
} |
53 changes: 53 additions & 0 deletions
53
src/main/java/com/gatomalvado/done/multithreadedwebcrawler/SingleThreadedHtmlParser.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
package com.gatomalvado.done.multithreadedwebcrawler; | ||
|
||
import java.io.BufferedReader; | ||
import java.io.InputStreamReader; | ||
import java.net.URL; | ||
import java.util.LinkedList; | ||
import java.util.List; | ||
|
||
import org.jsoup.Jsoup; | ||
import org.jsoup.nodes.Document; | ||
import org.jsoup.nodes.Element; | ||
import org.jsoup.select.Elements; | ||
|
||
public class SingleThreadedHtmlParser implements HtmlParser { | ||
|
||
@Override | ||
public List<String> parseHtml(String inputUrl) { | ||
String rawHtml = readUrl(inputUrl); | ||
return getUrlsFromWebsite(rawHtml); | ||
} | ||
|
||
private List<String> getUrlsFromWebsite(String rawHtml) { | ||
List<String> urls = new LinkedList<>(); | ||
Document doc = Jsoup.parse(rawHtml); | ||
Elements elements = doc.select("a[href]"); | ||
|
||
for(Element element : elements) { | ||
String link = element.attr("abs:href"); | ||
if(!link.isEmpty()) { | ||
urls.add(link); | ||
} | ||
} | ||
|
||
return urls; | ||
} | ||
|
||
private String readUrl(String webLink) { | ||
String rawHtml = ""; | ||
try { | ||
URL url = new URL(webLink); | ||
BufferedReader reader = new BufferedReader(new InputStreamReader(url.openStream())); | ||
String inputLine = ""; | ||
while ((inputLine = reader.readLine()) != null) { | ||
rawHtml += inputLine; | ||
} | ||
reader.close(); | ||
} catch (Exception e) { | ||
System.out.println("Error reading url: " + webLink); | ||
} | ||
|
||
return rawHtml; | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
package com.gatomalvado.done.webcrawler; | ||
|
||
public class Main { | ||
|
||
public static void main(String[] args) throws InterruptedException { | ||
System.out.println("Hello Simple Webcrawler!"); | ||
WebCrawler webCrawler = new WebCrawler(); | ||
webCrawler.crawl("https://en.wikipedia.org/wiki/As_It_Was"); | ||
System.out.println(webCrawler.getDiscoveredWebsites()); | ||
} | ||
|
||
} |
96 changes: 96 additions & 0 deletions
96
src/main/java/com/gatomalvado/done/webcrawler/WebCrawler.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,96 @@ | ||
package com.gatomalvado.done.webcrawler; | ||
|
||
import java.io.BufferedReader; | ||
import java.io.InputStreamReader; | ||
import java.net.URL; | ||
import java.util.HashSet; | ||
import java.util.LinkedList; | ||
import java.util.List; | ||
import java.util.Queue; | ||
import java.util.Set; | ||
import java.util.regex.Matcher; | ||
import java.util.regex.Pattern; | ||
|
||
import org.jsoup.Jsoup; | ||
import org.jsoup.nodes.Document; | ||
import org.jsoup.nodes.Element; | ||
import org.jsoup.select.Elements; | ||
|
||
import lombok.Getter; | ||
|
||
public class WebCrawler { | ||
|
||
private Queue<String> queue; | ||
|
||
@Getter | ||
private Set<String> discoveredWebsites; | ||
|
||
private int websitesProcessed; | ||
|
||
public WebCrawler() { | ||
this.queue = new LinkedList<>(); | ||
this.discoveredWebsites = new HashSet<>(); | ||
this.websitesProcessed = 0; | ||
} | ||
|
||
public void crawl(String seedUrl) throws InterruptedException { | ||
this.queue.offer(seedUrl); | ||
Thread.sleep(3000); | ||
while (!this.queue.isEmpty()) { | ||
String currentUrl = this.queue.poll(); | ||
String rawHtml = readUrl(currentUrl); | ||
if("".equals(rawHtml)) { | ||
continue; | ||
} | ||
|
||
List<String> urlsParsed = getUrlsFromWebsite(rawHtml); | ||
|
||
for (String websiteUrl : urlsParsed) { | ||
if(!discoveredWebsites.contains(websiteUrl)) { | ||
// System.out.println("Website found with URL: " + websiteUrl); | ||
queue.add(websiteUrl); | ||
} | ||
} | ||
|
||
this.discoveredWebsites.add(currentUrl); | ||
this.websitesProcessed++; | ||
|
||
if(this.websitesProcessed == 10000) { | ||
break; | ||
} | ||
} | ||
} | ||
|
||
private List<String> getUrlsFromWebsite(String rawHtml) { | ||
List<String> urls = new LinkedList<>(); | ||
Document doc = Jsoup.parse(rawHtml); | ||
Elements elements = doc.select("a[href]"); | ||
|
||
for(Element element : elements) { | ||
String link = element.attr("abs:href"); | ||
if(!link.isEmpty()) { | ||
urls.add(link); | ||
} | ||
} | ||
|
||
return urls; | ||
} | ||
|
||
private String readUrl(String webLink) { | ||
String rawHtml = ""; | ||
try { | ||
URL url = new URL(webLink); | ||
BufferedReader reader = new BufferedReader(new InputStreamReader(url.openStream())); | ||
String inputLine = ""; | ||
while ((inputLine = reader.readLine()) != null) { | ||
rawHtml += inputLine; | ||
} | ||
reader.close(); | ||
} catch (Exception e) { | ||
System.out.println("Error reading url: " + webLink); | ||
} | ||
|
||
return rawHtml; | ||
} | ||
|
||
} |
9 changes: 0 additions & 9 deletions
9
src/main/java/com/gatomalvado/todo/multithreadedwebcrawler/Main.java
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.