Commit d26ec83

Author: Vishal M Yadav
Message: added web crawler code
Parent: 0d8fc65

File tree: 9 files changed, +265 / -18 lines


pom.xml

Lines changed: 5 additions & 0 deletions
@@ -20,6 +20,11 @@
             <version>RELEASE</version>
             <scope>provided</scope>
         </dependency>
+        <dependency>
+            <groupId>org.jsoup</groupId>
+            <artifactId>jsoup</artifactId>
+            <version>1.16.1</version>
+        </dependency>
         <dependency>
             <groupId>org.projectlombok</groupId>
             <artifactId>lombok</artifactId>
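As an aside, once jsoup is on the classpath it can fetch and parse a page in a single call, which would replace the hand-rolled BufferedReader reading done in the parsers below; a minimal sketch (the class name and URL here are illustrative, not from the repo):

import java.io.IOException;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class JsoupFetchSketch {
    public static void main(String[] args) throws IOException {
        // Fetch and parse in one call; jsoup records the request URL as the
        // document's base URI, so attr("abs:href") can resolve relative links.
        Document doc = Jsoup.connect("https://en.wikipedia.org/wiki/As_It_Was").get();
        doc.select("a[href]").forEach(a -> System.out.println(a.attr("abs:href")));
    }
}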
src/main/java/com/gatomalvado/done/multithreadedwebcrawler/HtmlParser.java

Lines changed: 9 additions & 0 deletions

package com.gatomalvado.done.multithreadedwebcrawler;

import java.util.List;

public interface HtmlParser {

    List<String> parseHtml(String inputUrl);

}
src/main/java/com/gatomalvado/done/multithreadedwebcrawler/Main.java

Lines changed: 18 additions & 0 deletions

package com.gatomalvado.done.multithreadedwebcrawler;

import java.util.concurrent.ExecutionException;

public class Main {

    public static void main(String[] args) throws ExecutionException, InterruptedException {
        System.out.println("Hello Multithreaded Web Crawler!");
        String inputUrl = "https://en.wikipedia.org/wiki/As_It_Was";

        // Crawl up to 10 pages starting from the seed URL, then print what was found.
        MultithreadedWebCrawler crawler = new MultithreadedWebCrawler(new SingleThreadedHtmlParser(), 10);
        crawler.startCrawl(inputUrl);
        crawler.showParsedUrls();
    }
}
src/main/java/com/gatomalvado/done/multithreadedwebcrawler/MultithreadedWebCrawler.java

Lines changed: 72 additions & 0 deletions

package com.gatomalvado.done.multithreadedwebcrawler;

import java.util.ArrayDeque;
import java.util.Collections;
import java.util.Deque;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;

public class MultithreadedWebCrawler {

    private final HtmlParser htmlParser;
    private final Map<String, Boolean> map; // URLs that have been crawled
    private final ExecutorService executorService;
    private final int limit; // stop once this many URLs have been crawled

    public MultithreadedWebCrawler(HtmlParser htmlParser, int limit) {
        this.htmlParser = htmlParser;
        this.map = new ConcurrentHashMap<>();
        this.limit = limit;
        this.executorService = Executors.newFixedThreadPool(12);
    }

    public void startCrawl(String inputUrl) throws ExecutionException, InterruptedException {
        // Breadth-first crawl: each future resolves to the links found on one page.
        Future<List<String>> extractedUrls = executorService.submit(crawl(inputUrl));
        Deque<Future<List<String>>> queue = new ArrayDeque<>();
        queue.add(extractedUrls);
        while (!queue.isEmpty()) {
            if (map.size() >= limit) {
                break;
            }
            Thread.sleep(3000); // throttle: pause before taking the next page

            Future<List<String>> extractedUrlsFuture = queue.removeFirst();
            List<String> parsedUrls = extractedUrlsFuture.get();
            for (String parsedUrl : parsedUrls) {
                if (!map.containsKey(parsedUrl)) {
                    Callable<List<String>> callable = crawl(parsedUrl);
                    queue.add(executorService.submit(callable));
                }
            }
        }
        executorService.shutdown();
        // Wait for existing tasks to complete
        executorService.awaitTermination(1, TimeUnit.SECONDS);
    }

    public void showParsedUrls() {
        for (String key : map.keySet()) {
            System.out.println(key);
        }
    }

    private Callable<List<String>> crawl(String url) {
        return () -> {
            // Check-then-put is not atomic: two tasks submitted with the same URL
            // can both pass this test and parse the page twice.
            if (!map.containsKey(url)) {
                List<String> parsedUrls = htmlParser.parseHtml(url);
                map.put(url, true);
                return parsedUrls.stream().filter((u) -> !map.containsKey(u)).collect(Collectors.toUnmodifiableList());
            }
            return Collections.emptyList();
        };
    }
}
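One detail worth calling out: shutdown() stops new submissions, but awaitTermination(1, TimeUnit.SECONDS) returns after at most one second whether or not in-flight parse tasks have finished, so results still being fetched are dropped. The usual JDK shutdown idiom gives tasks a grace period and then interrupts stragglers; a sketch (the 30-second window is an arbitrary choice, not from the repo):

import java.util.concurrent.ExecutorService;
import java.util.concurrent.TimeUnit;

final class ExecutorShutdown {

    // Standard two-phase shutdown: stop intake, give running tasks a grace
    // period to finish, then interrupt whatever is still running.
    static void shutdownGracefully(ExecutorService pool) throws InterruptedException {
        pool.shutdown();
        if (!pool.awaitTermination(30, TimeUnit.SECONDS)) {
            pool.shutdownNow();
        }
    }
}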
src/main/java/com/gatomalvado/done/multithreadedwebcrawler/SingleThreadedHtmlParser.java

Lines changed: 53 additions & 0 deletions

package com.gatomalvado.done.multithreadedwebcrawler;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.LinkedList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class SingleThreadedHtmlParser implements HtmlParser {

    @Override
    public List<String> parseHtml(String inputUrl) {
        String rawHtml = readUrl(inputUrl);
        return getUrlsFromWebsite(rawHtml);
    }

    private List<String> getUrlsFromWebsite(String rawHtml) {
        List<String> urls = new LinkedList<>();
        // Parsed without a base URI, so "abs:href" below only resolves links
        // that are already absolute; relative links come back empty.
        Document doc = Jsoup.parse(rawHtml);
        Elements elements = doc.select("a[href]");

        for (Element element : elements) {
            String link = element.attr("abs:href");
            if (!link.isEmpty()) {
                urls.add(link);
            }
        }

        return urls;
    }

    private String readUrl(String webLink) {
        String rawHtml = "";
        try {
            URL url = new URL(webLink);
            BufferedReader reader = new BufferedReader(new InputStreamReader(url.openStream()));
            String inputLine = "";
            while ((inputLine = reader.readLine()) != null) {
                rawHtml += inputLine;
            }
            reader.close();
        } catch (Exception e) {
            System.out.println("Error reading url: " + webLink);
        }

        return rawHtml;
    }
}
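One caveat with this parser: Jsoup.parse(rawHtml) is called without a base URI, so attr("abs:href") can only resolve links that are already absolute, and Wikipedia's relative /wiki/... links come back empty and are filtered out. A sketch of the same HtmlParser contract with the page URL passed through as the base URI, plus a StringBuilder in the read loop (the class name BaseUriHtmlParser is mine, not the repo's):

package com.gatomalvado.done.multithreadedwebcrawler;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.LinkedList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class BaseUriHtmlParser implements HtmlParser {

    @Override
    public List<String> parseHtml(String inputUrl) {
        String rawHtml = readUrl(inputUrl);
        // Supplying inputUrl as the base URI lets "abs:href" resolve relative links.
        Document doc = Jsoup.parse(rawHtml, inputUrl);
        List<String> urls = new LinkedList<>();
        for (Element element : doc.select("a[href]")) {
            String link = element.attr("abs:href");
            if (!link.isEmpty()) {
                urls.add(link);
            }
        }
        return urls;
    }

    private String readUrl(String webLink) {
        StringBuilder rawHtml = new StringBuilder();
        try {
            URL url = new URL(webLink);
            try (BufferedReader reader = new BufferedReader(new InputStreamReader(url.openStream()))) {
                String inputLine;
                while ((inputLine = reader.readLine()) != null) {
                    rawHtml.append(inputLine).append('\n');
                }
            }
        } catch (Exception e) {
            System.out.println("Error reading url: " + webLink);
        }
        return rawHtml.toString();
    }
}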
src/main/java/com/gatomalvado/done/webcrawler/Main.java

Lines changed: 12 additions & 0 deletions

package com.gatomalvado.done.webcrawler;

public class Main {

    public static void main(String[] args) throws InterruptedException {
        System.out.println("Hello Simple Webcrawler!");
        WebCrawler webCrawler = new WebCrawler();
        webCrawler.crawl("https://en.wikipedia.org/wiki/As_It_Was");
        System.out.println(webCrawler.getDiscoveredWebsites());
    }
}
src/main/java/com/gatomalvado/done/webcrawler/WebCrawler.java

Lines changed: 96 additions & 0 deletions

package com.gatomalvado.done.webcrawler;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Queue;
import java.util.Set;
import java.util.regex.Matcher; // unused in this class
import java.util.regex.Pattern; // unused in this class

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import lombok.Getter;

public class WebCrawler {

    // BFS frontier of URLs still to visit.
    private Queue<String> queue;

    @Getter
    private Set<String> discoveredWebsites;

    private int websitesProcessed;

    public WebCrawler() {
        this.queue = new LinkedList<>();
        this.discoveredWebsites = new HashSet<>();
        this.websitesProcessed = 0;
    }

    public void crawl(String seedUrl) throws InterruptedException {
        this.queue.offer(seedUrl);
        Thread.sleep(3000); // one-off pause before the crawl starts
        while (!this.queue.isEmpty()) {
            String currentUrl = this.queue.poll();
            String rawHtml = readUrl(currentUrl);
            if ("".equals(rawHtml)) {
                continue;
            }

            List<String> urlsParsed = getUrlsFromWebsite(rawHtml);

            for (String websiteUrl : urlsParsed) {
                if (!discoveredWebsites.contains(websiteUrl)) {
                    // System.out.println("Website found with URL: " + websiteUrl);
                    queue.add(websiteUrl);
                }
            }

            this.discoveredWebsites.add(currentUrl);
            this.websitesProcessed++;

            if (this.websitesProcessed == 10000) {
                break; // hard cap on pages fetched
            }
        }
    }

    private List<String> getUrlsFromWebsite(String rawHtml) {
        List<String> urls = new LinkedList<>();
        Document doc = Jsoup.parse(rawHtml);
        Elements elements = doc.select("a[href]");

        for (Element element : elements) {
            String link = element.attr("abs:href");
            if (!link.isEmpty()) {
                urls.add(link);
            }
        }

        return urls;
    }

    private String readUrl(String webLink) {
        String rawHtml = "";
        try {
            URL url = new URL(webLink);
            BufferedReader reader = new BufferedReader(new InputStreamReader(url.openStream()));
            String inputLine = "";
            while ((inputLine = reader.readLine()) != null) {
                rawHtml += inputLine;
            }
            reader.close();
        } catch (Exception e) {
            System.out.println("Error reading url: " + webLink);
        }

        return rawHtml;
    }
}
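Note that this crawler adds a URL to discoveredWebsites only after processing it and never re-checks at poll time, so a link seen on several pages before its first visit is fetched once per sighting, with each fetch counting toward the 10000 cap. A minimal guard, assuming the fields above, would skip already-processed URLs as they come off the queue (a fragment of the loop, not a full rewrite):

String currentUrl = this.queue.poll();
if (discoveredWebsites.contains(currentUrl)) {
    continue; // already crawled via a link found on another page
}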

src/main/java/com/gatomalvado/todo/multithreadedwebcrawler/Main.java

Lines changed: 0 additions & 9 deletions
This file was deleted.

src/main/java/com/gatomalvado/todo/webcrawler/Main.java

Lines changed: 0 additions & 9 deletions
This file was deleted.
