✨ cloudflare support & golangci integration
lawzava committed Jan 23, 2021
1 parent d8909f7 commit b6d286a
Showing 7 changed files with 195 additions and 42 deletions.
102 changes: 102 additions & 0 deletions .golangci.yml
@@ -0,0 +1,102 @@
linters-settings:
  gocritic:
    enabled-tags:
      - diagnostic
      - experimental
      - opinionated
      - performance
      - style
  goimports:
    local-prefixes: github.com/golangci/golangci-lint
  golint:
    min-confidence: 0
  govet:
    check-shadowing: true
  funlen:
    lines: 100
  maligned:
    suggest-new: true
  misspell:
    locale: US
  nakedret:
    max-func-lines: 2
  gofumpt:
    extra-rules: true

linters:
  disable-all: true
  enable:
    - bodyclose
    - deadcode
    - depguard
    - dogsled
    - dupl
    - errcheck
    - funlen
    - gochecknoglobals
    - gochecknoinits
    - gocognit
    - goconst
    - gocritic
    - gocyclo
    - godot
    - gofmt
    - goimports
    - golint
    - gomnd
    - gomodguard
    - goprintffuncname
    - gosec
    - gosimple
    - govet
    - ineffassign
    - interfacer
    - lll
    - maligned
    - misspell
    - nakedret
    - nestif
    - prealloc
    - rowserrcheck
    - scopelint
    - staticcheck
    - structcheck
    - stylecheck
    - typecheck
    - unconvert
    - unparam
    - unused
    - varcheck
    - whitespace
    - wsl
    - asciicheck
    - godox
    - nolintlint
    - goerr113
    - exhaustive
    - exportloopref
    - gofumpt
    - goheader
    - noctx
    - sqlclosecheck
    - nlreturn
    - errorlint
    - exhaustivestruct
    - paralleltest
    - tparallel
    - wrapcheck
    - forbidigo
    - makezero
    - predeclared
    - thelper

issues:
  exclude-rules:
    - path: examples/*
      linters:
        - gomnd
        - exhaustivestruct
        - gochecknoglobals
    - path: _test\.go
      linters:
        - exhaustivestruct
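This configuration is notably strict: `nakedret.max-func-lines: 2` effectively bans naked returns, since almost any function with named results spans more than two lines. A hypothetical snippet (not from this repository) showing what the setting accepts and rejects:

```go
package example

// With nakedret's max-func-lines set to 2, a bare `return` is reported in
// any function longer than two lines; returning values explicitly passes.
func split(sum int) (x, y int) {
	x = sum * 4 / 9
	y = sum - x

	return x, y // a bare `return` here would be flagged
}
```

The `issues.exclude-rules` section then relaxes `gomnd`, `exhaustivestruct`, and `gochecknoglobals` under `examples/*` and `exhaustivestruct` in test files, which keeps the strict set workable in practice.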
9 changes: 6 additions & 3 deletions cmd/root.go
@@ -10,8 +10,10 @@ import (
"github.com/spf13/cobra"
)

// nolint:gochecknoglobals // allow global var here
var scraperParameters scraper.Parameters

// nolint:exhaustivestruct,gochecknoglobals // not valid requirement for this use case
var rootCmd = &cobra.Command{
Use: "scrape",
Short: "CLI utility to scrape emails from websites",
@@ -20,13 +22,13 @@ var rootCmd = &cobra.Command{
scraper := scraper.New(scraperParameters)

// Scrape for emails
-var scrapedEmails []string
-if err := scraper.Scrape(&scrapedEmails); err != nil {
+scrapedEmails, err := scraper.Scrape()
+if err != nil {
log.Fatal(err)
}

for _, email := range scrapedEmails {
-fmt.Println(email)
+fmt.Println(email) // nolint:forbidigo // allow println here for non intrusive response
}
},
}
@@ -38,6 +40,7 @@ func Execute() {
}
}

// nolint:gochecknoinits // required by github.com/spf13/cobra
func init() {
rootCmd.PersistentFlags().StringVarP(&scraperParameters.Website,
"website", "w", "https://lawzava.com", "Website to scrape")
5 changes: 4 additions & 1 deletion scraper/chrome.go
@@ -11,7 +11,9 @@ import (

func initiateScrapingFromChrome(response *colly.Response, timeout int) error {
opts := []chromedp.ExecAllocatorOption{
-chromedp.UserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3830.0 Safari/537.36"), // nolint
+// nolint:lll // allow longer line here
+chromedp.UserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3830.0 Safari/537.36"),
+// nolint:gomnd // allow magic number here
chromedp.WindowSize(1920, 1080),
chromedp.NoFirstRun,
chromedp.Headless,
@@ -35,6 +37,7 @@
); err != nil {
return fmt.Errorf("executing chromedp: %w", err)
}

response.Body = []byte(res)

return nil
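For context, `initiateScrapingFromChrome` re-renders the page colly just fetched in headless Chrome, then swaps the rendered HTML into `response.Body` so that emails injected by JavaScript become visible to the parser. A minimal, self-contained sketch of that chromedp round-trip (the URL and timeout are placeholders, and this is not the project's exact code):

```go
package main

import (
	"context"
	"fmt"
	"time"

	"github.com/chromedp/chromedp"
)

func main() {
	// Start from chromedp's defaults and pin a window size, as the diff does.
	opts := append(chromedp.DefaultExecAllocatorOptions[:],
		chromedp.WindowSize(1920, 1080),
	)

	allocCtx, cancel := chromedp.NewExecAllocator(context.Background(), opts...)
	defer cancel()

	ctx, cancel := chromedp.NewContext(allocCtx)
	defer cancel()

	ctx, cancel = context.WithTimeout(ctx, 10*time.Second)
	defer cancel()

	// Navigate and read back the fully rendered document.
	var html string
	if err := chromedp.Run(ctx,
		chromedp.Navigate("https://example.com"),
		chromedp.OuterHTML("html", &html),
	); err != nil {
		panic(err)
	}

	fmt.Println(len(html), "bytes of rendered HTML")
}
```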
5 changes: 3 additions & 2 deletions scraper/domain.go
@@ -1,17 +1,18 @@
package scraper

import (
"fmt"
"net/url"
"strings"
)

-// Trim the input domain to whitelist root
+// Trim the input domain to whitelist root.
func prepareAllowedDomain(requestURL string) ([]string, error) {
requestURL = "https://" + trimProtocol(requestURL)

u, err := url.ParseRequestURI(requestURL)
if err != nil {
-return nil, err
+return nil, fmt.Errorf("failed to parse request URI: %w", err)
}

domain := strings.TrimPrefix(u.Hostname(), "www.")
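The visible part of `prepareAllowedDomain` forces an `https://` scheme (`url.ParseRequestURI` rejects bare domains) and then normalizes the host. A quick standalone illustration of that normalization, using a placeholder URL:

```go
package main

import (
	"fmt"
	"net/url"
	"strings"
)

func main() {
	// ParseRequestURI requires an absolute URI, hence the forced scheme.
	u, err := url.ParseRequestURI("https://www.lawzava.com/contact")
	if err != nil {
		panic(err)
	}

	// Hostname() drops any port; TrimPrefix drops the "www." variant.
	fmt.Println(strings.TrimPrefix(u.Hostname(), "www.")) // lawzava.com
}
```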
73 changes: 53 additions & 20 deletions scraper/email.go
@@ -1,52 +1,85 @@
package scraper

import (
"bytes"
"regexp"
"strconv"
"strings"
"sync"

"github.com/lawzava/go-tld"
)

-// Initialize once
+type emails struct {
+emails []string
+m sync.Mutex
+}
+
+func (s *emails) add(email string) {
+if !isValidEmail(email) {
+return
+}
+
+// hold the lock across the duplicate check and the append,
+// since colly may run callbacks concurrently when Async is set
+s.m.Lock()
+defer s.m.Unlock()
+
+// check for already existing emails
+for _, existingEmail := range s.emails {
+if existingEmail == email {
+return
+}
+}
+
+s.emails = append(s.emails, email)
+}
+
+// Initialize once.
var reg = regexp.MustCompile(`([a-zA-Z0-9._-]+@([a-zA-Z0-9_-]+\.)+[a-zA-Z0-9_-]+)`)

-// Parse any *@*.* string and append to the slice
-func parseEmails(body []byte, scrapedEmails *[]string) {
+// Parse any *@*.* string and append to the slice.
+func (s *emails) parseEmails(body []byte) {
res := reg.FindAll(body, -1)

for _, r := range res {
-email := string(r)
-if !isValidEmail(email) {
-continue
-}
-
-var skip bool
-// Check for already existing emails
-for _, existingEmail := range *scrapedEmails {
-if existingEmail == email {
-skip = true
-break
-}
-}
-
-if skip {
-continue
-}
-
-*scrapedEmails = append(*scrapedEmails, email)
+s.add(string(r))
}
}

+func (s *emails) parseCloudflareEmail(cloudflareEncodedEmail string) {
+decodedEmail := decodeCloudflareEmail(cloudflareEncodedEmail)
+email := reg.FindString(decodedEmail)
+
+s.add(email)
+}
+
+func decodeCloudflareEmail(email string) string {
+var e bytes.Buffer
+
+// first hex byte is the XOR key
+r, _ := strconv.ParseInt(email[0:2], 16, 0)
+
+for n := 4; n < len(email)+2; n += 2 {
+i, _ := strconv.ParseInt(email[n-2:n], 16, 0)
+e.WriteString(string(rune(i ^ r)))
+}
+
+return e.String()
+}

-// Check if email looks valid
+// Check if email looks valid.
func isValidEmail(email string) bool {
if email == "" {
return false
}

split := strings.Split(email, ".")

// nolint:gomnd // allow magic number here
if len(split) < 2 {
return false
}

ending := split[len(split)-1]

// nolint:gomnd // allow magic number here
if len(ending) < 2 {
return false
}
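Cloudflare's email obfuscation replaces addresses in the served HTML with a placeholder element carrying a `data-cfemail` attribute (typically `<span class="__cf_email__" data-cfemail="…">[email protected]</span>`). The attribute is a hex string: the first byte is an XOR key, and each following byte XORed with that key yields one character of the address. A standalone copy of the decoder above, run against a hand-built sample (`"127352703c71"` is hypothetical: the key `0x12` followed by `"a@b.c"` XORed with it):

```go
package main

import (
	"bytes"
	"fmt"
	"strconv"
)

func decodeCloudflareEmail(email string) string {
	var e bytes.Buffer

	// First two hex digits are the XOR key.
	r, _ := strconv.ParseInt(email[0:2], 16, 0)

	// Each subsequent hex pair, XORed with the key, is one plaintext byte.
	for n := 4; n < len(email)+2; n += 2 {
		i, _ := strconv.ParseInt(email[n-2:n], 16, 0)
		e.WriteString(string(rune(i ^ r)))
	}

	return e.String()
}

func main() {
	fmt.Println(decodeCloudflareEmail("127352703c71")) // prints a@b.c
}
```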
29 changes: 19 additions & 10 deletions scraper/scrape.go
@@ -1,23 +1,26 @@
package scraper

import (
"errors"

"github.com/gocolly/colly"
"github.com/gocolly/colly/debug"
)

-// Scrape is responsible for main scraping logic
-func (s *Scraper) Scrape(scrapedEmails *[]string) error {
+// Scrape is responsible for main scraping logic.
+func (s *Scraper) Scrape() ([]string, error) {
// Initiate colly
c := colly.NewCollector()

c.Async = s.Async
c.MaxDepth = s.MaxDepth
s.Website = trimProtocol(s.Website)
e := emails{}

if !s.FollowExternalLinks {
allowedDomains, err := prepareAllowedDomain(s.Website)
if err != nil {
-return err
+return nil, err
}

c.AllowedDomains = allowedDomains
@@ -31,18 +34,19 @@ func (s *Scraper) Scrape(scrapedEmails *[]string) error {
c.OnResponse(func(response *colly.Response) {
if err := initiateScrapingFromChrome(response, s.Timeout); err != nil {
s.Log(err)

return
}
})
}

if s.Recursively {
// Find and visit all links
c.OnHTML("a", func(e *colly.HTMLElement) {
s.Log("visiting: ", e.Attr("href"))
if err := e.Request.Visit(e.Attr("href")); err != nil {
c.OnHTML("a[href]", func(el *colly.HTMLElement) {
s.Log("visiting: ", el.Attr("href"))
if err := el.Request.Visit(el.Attr("href")); err != nil {
// Ignore already visited error, this appears too often
if err != colly.ErrAlreadyVisited {
if !errors.Is(err, colly.ErrAlreadyVisited) {
s.Log("error while linking: ", err.Error())
}
}
Expand All @@ -51,7 +55,12 @@ func (s *Scraper) Scrape(scrapedEmails *[]string) error {

// Parse emails on each downloaded page
c.OnScraped(func(response *colly.Response) {
-parseEmails(response.Body, scrapedEmails)
+e.parseEmails(response.Body)
})

// cloudflare encoded email support
c.OnHTML("span[data-cfemail]", func(el *colly.HTMLElement) {
e.parseCloudflareEmail(el.Attr("data-cfemail"))
})

// Start the scrape
@@ -61,7 +70,7 @@ func (s *Scraper) Scrape(scrapedEmails *[]string) error {

c.Wait() // Wait for concurrent scrapes to finish

-if scrapedEmails == nil || len(*scrapedEmails) == 0 {
+if e.emails == nil || len(e.emails) == 0 {
// Start the scrape on insecure url
if err := c.Visit(s.GetWebsite(false)); err != nil {
s.Log("error while visiting: ", err.Error())
@@ -70,5 +79,5 @@ func (s *Scraper) Scrape(scrapedEmails *[]string) error {
c.Wait() // Wait for concurrent scrapes to finish
}

-return nil
+return e.emails, nil
}
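With this signature change, callers no longer pass an out-parameter; `Scrape` returns the collected emails directly, as the `cmd/root.go` change above shows. A hypothetical minimal caller (the import path and struct literal are assumptions inferred from this diff, not verified against the full module):

```go
package main

import (
	"fmt"
	"log"

	"github.com/lawzava/scrape/scraper" // assumed module path
)

func main() {
	// Website is the only Parameters field confirmed by this diff.
	s := scraper.New(scraper.Parameters{Website: "https://lawzava.com"})

	emails, err := s.Scrape() // returns ([]string, error) after this change
	if err != nil {
		log.Fatal(err)
	}

	for _, email := range emails {
		fmt.Println(email)
	}
}
```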