added web crawler code
Vishal M Yadav committed Dec 2, 2024
1 parent 0d8fc65 commit d26ec83
Showing 9 changed files with 265 additions and 18 deletions.
5 changes: 5 additions & 0 deletions pom.xml
@@ -20,6 +20,11 @@
            <version>RELEASE</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.16.1</version>
        </dependency>
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
9 changes: 9 additions & 0 deletions src/main/java/com/gatomalvado/done/multithreadedwebcrawler/HtmlParser.java
@@ -0,0 +1,9 @@
package com.gatomalvado.done.multithreadedwebcrawler;

import java.util.List;

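/**
 * Fetches the page at the given URL and returns the list of links found on it.
 */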
public interface HtmlParser {

    List<String> parseHtml(String inputUrl);

}
18 changes: 18 additions & 0 deletions src/main/java/com/gatomalvado/done/multithreadedwebcrawler/Main.java
@@ -0,0 +1,18 @@
package com.gatomalvado.done.multithreadedwebcrawler;

import java.util.concurrent.ExecutionException;

public class Main {

    public static void main(String[] args) throws ExecutionException, InterruptedException {
        System.out.println("Hello Multithreaded Web Crawler!");
        String inputUrl = "https://en.wikipedia.org/wiki/As_It_Was";

        MultithreadedWebCrawler crawler = new MultithreadedWebCrawler(new SingleThreadedHtmlParser(), 10);
        crawler.startCrawl(inputUrl);
        crawler.showParsedUrls();
    }

}
72 changes: 72 additions & 0 deletions src/main/java/com/gatomalvado/done/multithreadedwebcrawler/MultithreadedWebCrawler.java
@@ -0,0 +1,72 @@
package com.gatomalvado.done.multithreadedwebcrawler;

import java.util.ArrayDeque;
import java.util.Collections;
import java.util.Deque;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;

public class MultithreadedWebCrawler {

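    // `map` tracks URLs that have already been crawled; a ConcurrentHashMap so pool threads can update it safely.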
    private final HtmlParser htmlParser;
    private final Map<String, Boolean> map;
    private final ExecutorService executorService;
    private final int limit;

    public MultithreadedWebCrawler(HtmlParser htmlParser, int limit) {
        this.htmlParser = htmlParser;
        this.map = new ConcurrentHashMap<>();
        this.limit = limit;
        this.executorService = Executors.newFixedThreadPool(12);
    }

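    // Breadth-first crawl: each submitted task resolves to the links found on one page,
    // and unseen links are submitted as new tasks until `limit` URLs have been crawled.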
    public void startCrawl(String inputUrl) throws ExecutionException, InterruptedException {
        Future<List<String>> extractedUrls = executorService.submit(crawl(inputUrl));
        Deque<Future<List<String>>> queue = new ArrayDeque<>();
        queue.add(extractedUrls);
        while (!queue.isEmpty()) {
            if(map.size() >= limit) {
                break;
            }
            Thread.sleep(3000);

            Future<List<String>> extractedUrlsFuture = queue.removeFirst();
            List<String> parsedUrls = extractedUrlsFuture.get();
            for(String parsedUrl : parsedUrls) {
                if (!map.containsKey(parsedUrl)) {
                    Callable<List<String>> callable = crawl(parsedUrl);
                    queue.add(executorService.submit(callable));
                }
            }
        }
        executorService.shutdown();
        // Wait for existing tasks to complete
        executorService.awaitTermination(1, TimeUnit.SECONDS);
    }

    public void showParsedUrls() {
        for(String key : map.keySet()) {
            System.out.println(key);
        }
    }

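    // Wraps a single page fetch as a Callable: marks the URL as crawled and returns
    // only the links that have not been seen yet.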
    private Callable<List<String>> crawl(String url) {
        return () -> {
            if(!map.containsKey(url)) {
                List<String> parsedUrls = htmlParser.parseHtml(url);
                map.put(url, true);
                return parsedUrls.stream().filter((u) -> !map.containsKey(u)).collect(Collectors.toUnmodifiableList());
            }
            return Collections.emptyList();
        };
    }

}
53 changes: 53 additions & 0 deletions src/main/java/com/gatomalvado/done/multithreadedwebcrawler/SingleThreadedHtmlParser.java
@@ -0,0 +1,53 @@
package com.gatomalvado.done.multithreadedwebcrawler;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.LinkedList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class SingleThreadedHtmlParser implements HtmlParser {

    @Override
    public List<String> parseHtml(String inputUrl) {
        String rawHtml = readUrl(inputUrl);
        return getUrlsFromWebsite(rawHtml);
    }

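    // Collects the href of every anchor tag in the raw HTML, keeping only links that resolve to an absolute URL.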
    private List<String> getUrlsFromWebsite(String rawHtml) {
        List<String> urls = new LinkedList<>();
        Document doc = Jsoup.parse(rawHtml);
        Elements elements = doc.select("a[href]");

        for(Element element : elements) {
            String link = element.attr("abs:href");
            if(!link.isEmpty()) {
                urls.add(link);
            }
        }

        return urls;
    }

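    // Downloads the raw HTML of the page; returns an empty string if the request fails.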
    private String readUrl(String webLink) {
        StringBuilder rawHtml = new StringBuilder();
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(new URL(webLink).openStream()))) {
            String inputLine;
            while ((inputLine = reader.readLine()) != null) {
                rawHtml.append(inputLine);
            }
        } catch (Exception e) {
            System.out.println("Error reading url: " + webLink);
        }

        return rawHtml.toString();
    }
}
12 changes: 12 additions & 0 deletions src/main/java/com/gatomalvado/done/webcrawler/Main.java
@@ -0,0 +1,12 @@
package com.gatomalvado.done.webcrawler;

public class Main {

    public static void main(String[] args) throws InterruptedException {
        System.out.println("Hello Simple Webcrawler!");
        WebCrawler webCrawler = new WebCrawler();
        webCrawler.crawl("https://en.wikipedia.org/wiki/As_It_Was");
        System.out.println(webCrawler.getDiscoveredWebsites());
    }

}
96 changes: 96 additions & 0 deletions src/main/java/com/gatomalvado/done/webcrawler/WebCrawler.java
@@ -0,0 +1,96 @@
package com.gatomalvado.done.webcrawler;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Queue;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import lombok.Getter;

public class WebCrawler {

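    // FIFO frontier of URLs still waiting to be crawled.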
    private Queue<String> queue;

    @Getter
    private Set<String> discoveredWebsites;

    private int websitesProcessed;

    public WebCrawler() {
        this.queue = new LinkedList<>();
        this.discoveredWebsites = new HashSet<>();
        this.websitesProcessed = 0;
    }

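    // Iterative BFS from the seed URL; stops when the frontier is empty or 10,000 pages have been processed.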
    public void crawl(String seedUrl) throws InterruptedException {
        this.queue.offer(seedUrl);
        Thread.sleep(3000);
        while (!this.queue.isEmpty()) {
            String currentUrl = this.queue.poll();
            String rawHtml = readUrl(currentUrl);
            if("".equals(rawHtml)) {
                continue;
            }

            List<String> urlsParsed = getUrlsFromWebsite(rawHtml);

            for (String websiteUrl : urlsParsed) {
                if(!discoveredWebsites.contains(websiteUrl)) {
                    // System.out.println("Website found with URL: " + websiteUrl);
                    queue.add(websiteUrl);
                }
            }

            this.discoveredWebsites.add(currentUrl);
            this.websitesProcessed++;

            if(this.websitesProcessed == 10000) {
                break;
            }
        }
    }

    private List<String> getUrlsFromWebsite(String rawHtml) {
        List<String> urls = new LinkedList<>();
        Document doc = Jsoup.parse(rawHtml);
        Elements elements = doc.select("a[href]");

        for(Element element : elements) {
            String link = element.attr("abs:href");
            if(!link.isEmpty()) {
                urls.add(link);
            }
        }

        return urls;
    }

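    // Downloads the raw HTML of the page; returns an empty string if the request fails.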
    private String readUrl(String webLink) {
        StringBuilder rawHtml = new StringBuilder();
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(new URL(webLink).openStream()))) {
            String inputLine;
            while ((inputLine = reader.readLine()) != null) {
                rawHtml.append(inputLine);
            }
        } catch (Exception e) {
            System.out.println("Error reading url: " + webLink);
        }

        return rawHtml.toString();
    }

}

This file was deleted.

9 changes: 0 additions & 9 deletions src/main/java/com/gatomalvado/todo/webcrawler/Main.java

This file was deleted.
