✨ cloudflare support & golangci integration
lawzava committed Jan 23, 2021
1 parent d8909f7 commit b6d286a
Showing 7 changed files with 195 additions and 42 deletions.
102 changes: 102 additions & 0 deletions .golangci.yml
@@ -0,0 +1,102 @@
linters-settings:
  gocritic:
    enabled-tags:
      - diagnostic
      - experimental
      - opinionated
      - performance
      - style
  goimports:
    local-prefixes: github.com/golangci/golangci-lint
  golint:
    min-confidence: 0
  govet:
    check-shadowing: true
  funlen:
    lines: 100
  maligned:
    suggest-new: true
  misspell:
    locale: US
  nakedret:
    max-func-lines: 2
  gofumpt:
    extra-rules: true

linters:
  disable-all: true
  enable:
    - bodyclose
    - deadcode
    - depguard
    - dogsled
    - dupl
    - errcheck
    - funlen
    - gochecknoglobals
    - gochecknoinits
    - gocognit
    - goconst
    - gocritic
    - gocyclo
    - godot
    - gofmt
    - goimports
    - golint
    - gomnd
    - gomodguard
    - goprintffuncname
    - gosec
    - gosimple
    - govet
    - ineffassign
    - interfacer
    - lll
    - maligned
    - misspell
    - nakedret
    - nestif
    - prealloc
    - rowserrcheck
    - scopelint
    - staticcheck
    - structcheck
    - stylecheck
    - typecheck
    - unconvert
    - unparam
    - unused
    - varcheck
    - whitespace
    - wsl
    - asciicheck
    - godox
    - nolintlint
    - goerr113
    - exhaustive
    - exportloopref
    - gofumpt
    - goheader
    - noctx
    - sqlclosecheck
    - nlreturn
    - errorlint
    - exhaustivestruct
    - paralleltest
    - tparallel
    - wrapcheck
    - forbidigo
    - makezero
    - predeclared
    - thelper

issues:
  exclude-rules:
    - path: examples/*
      linters:
        - gomnd
        - exhaustivestruct
        - gochecknoglobals
    - path: _test\.go
      linters:
        - exhaustivestruct
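This configuration is notably strict: `nakedret.max-func-lines: 2` effectively bans naked returns, since almost any function with named results spans more than two lines. A hypothetical snippet (not from this repository) showing what the setting accepts and rejects:

```go
package example

// With nakedret's max-func-lines set to 2, a bare `return` is reported in
// any function longer than two lines; returning values explicitly passes.
func split(sum int) (x, y int) {
	x = sum * 4 / 9
	y = sum - x

	return x, y // a bare `return` here would be flagged
}
```

The `issues.exclude-rules` section then relaxes `gomnd`, `exhaustivestruct`, and `gochecknoglobals` under `examples/*` and `exhaustivestruct` in test files, which keeps the strict set workable in practice.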
9 changes: 6 additions & 3 deletions cmd/root.go
@@ -10,8 +10,10 @@ import (
"github.com/spf13/cobra"
)

// nolint:gochecknoglobals // allow global var here
var scraperParameters scraper.Parameters

// nolint:exhaustivestruct,gochecknoglobals // not valid requirement for this use case
var rootCmd = &cobra.Command{
Use: "scrape",
Short: "CLI utility to scrape emails from websites",
@@ -20,13 +22,13 @@ var rootCmd = &cobra.Command{
scraper := scraper.New(scraperParameters)

// Scrape for emails
-var scrapedEmails []string
-if err := scraper.Scrape(&scrapedEmails); err != nil {
+scrapedEmails, err := scraper.Scrape()
+if err != nil {
log.Fatal(err)
}

for _, email := range scrapedEmails {
-fmt.Println(email)
+fmt.Println(email) // nolint:forbidigo // allow println here for non intrusive response
}
},
}
@@ -38,6 +40,7 @@ func Execute() {
}
}

// nolint:gochecknoinits // required by github.com/spf13/cobra
func init() {
rootCmd.PersistentFlags().StringVarP(&scraperParameters.Website,
"website", "w", "https://lawzava.com", "Website to scrape")
5 changes: 4 additions & 1 deletion scraper/chrome.go
@@ -11,7 +11,9 @@ import (

func initiateScrapingFromChrome(response *colly.Response, timeout int) error {
opts := []chromedp.ExecAllocatorOption{
-chromedp.UserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3830.0 Safari/537.36"), // nolint
+// nolint:lll // allow longer line here
+chromedp.UserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3830.0 Safari/537.36"),
+// nolint:gomnd // allow magic number here
chromedp.WindowSize(1920, 1080),
chromedp.NoFirstRun,
chromedp.Headless,
@@ -35,6 +37,7 @@
); err != nil {
return fmt.Errorf("executing chromedp: %w", err)
}

response.Body = []byte(res)

return nil
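For context, `initiateScrapingFromChrome` re-renders the page colly just fetched in headless Chrome, then swaps the rendered HTML into `response.Body` so that emails injected by JavaScript become visible to the parser. A minimal, self-contained sketch of that chromedp round-trip (the URL and timeout are placeholders, and this is not the project's exact code):

```go
package main

import (
	"context"
	"fmt"
	"time"

	"github.com/chromedp/chromedp"
)

func main() {
	// Start from chromedp's defaults and pin a window size, as the diff does.
	opts := append(chromedp.DefaultExecAllocatorOptions[:],
		chromedp.WindowSize(1920, 1080),
	)

	allocCtx, cancel := chromedp.NewExecAllocator(context.Background(), opts...)
	defer cancel()

	ctx, cancel := chromedp.NewContext(allocCtx)
	defer cancel()

	ctx, cancel = context.WithTimeout(ctx, 10*time.Second)
	defer cancel()

	// Navigate and read back the fully rendered document.
	var html string
	if err := chromedp.Run(ctx,
		chromedp.Navigate("https://example.com"),
		chromedp.OuterHTML("html", &html),
	); err != nil {
		panic(err)
	}

	fmt.Println(len(html), "bytes of rendered HTML")
}
```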
5 changes: 3 additions & 2 deletions scraper/domain.go
@@ -1,17 +1,18 @@
package scraper

import (
"fmt"
"net/url"
"strings"
)

-// Trim the input domain to whitelist root
+// Trim the input domain to whitelist root.
func prepareAllowedDomain(requestURL string) ([]string, error) {
requestURL = "https://" + trimProtocol(requestURL)

u, err := url.ParseRequestURI(requestURL)
if err != nil {
-return nil, err
+return nil, fmt.Errorf("failed to parse request URI: %w", err)
}

domain := strings.TrimPrefix(u.Hostname(), "www.")
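The visible part of `prepareAllowedDomain` forces an `https://` scheme (`url.ParseRequestURI` rejects bare domains) and then normalizes the host. A quick standalone illustration of that normalization, using a placeholder URL:

```go
package main

import (
	"fmt"
	"net/url"
	"strings"
)

func main() {
	// ParseRequestURI requires an absolute URI, hence the forced scheme.
	u, err := url.ParseRequestURI("https://www.lawzava.com/contact")
	if err != nil {
		panic(err)
	}

	// Hostname() drops any port; TrimPrefix drops the "www." variant.
	fmt.Println(strings.TrimPrefix(u.Hostname(), "www.")) // lawzava.com
}
```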
73 changes: 53 additions & 20 deletions scraper/email.go
@@ -1,52 +1,85 @@
package scraper

import (
"bytes"
"regexp"
"strconv"
"strings"
"sync"

"github.com/lawzava/go-tld"
)

-// Initialize once
+type emails struct {
+emails []string
+m sync.Mutex
+}
+
+func (s *emails) add(email string) {
+if !isValidEmail(email) {
+return
+}
+
+// hold the lock across the duplicate check and the append,
+// since colly may run callbacks concurrently when Async is set
+s.m.Lock()
+defer s.m.Unlock()
+
+// check for already existing emails
+for _, existingEmail := range s.emails {
+if existingEmail == email {
+return
+}
+}
+
+s.emails = append(s.emails, email)
+}
+
+// Initialize once.
var reg = regexp.MustCompile(`([a-zA-Z0-9._-]+@([a-zA-Z0-9_-]+\.)+[a-zA-Z0-9_-]+)`)

-// Parse any *@*.* string and append to the slice
-func parseEmails(body []byte, scrapedEmails *[]string) {
+// Parse any *@*.* string and append to the slice.
+func (s *emails) parseEmails(body []byte) {
res := reg.FindAll(body, -1)

for _, r := range res {
-email := string(r)
-if !isValidEmail(email) {
-continue
-}
-
-var skip bool
-// Check for already existing emails
-for _, existingEmail := range *scrapedEmails {
-if existingEmail == email {
-skip = true
-break
-}
-}
-
-if skip {
-continue
-}
-
-*scrapedEmails = append(*scrapedEmails, email)
+s.add(string(r))
}
}

+func (s *emails) parseCloudflareEmail(cloudflareEncodedEmail string) {
+decodedEmail := decodeCloudflareEmail(cloudflareEncodedEmail)
+email := reg.FindString(decodedEmail)
+
+s.add(email)
+}
+
+func decodeCloudflareEmail(email string) string {
+var e bytes.Buffer
+
+// first hex byte is the XOR key
+r, _ := strconv.ParseInt(email[0:2], 16, 0)
+
+for n := 4; n < len(email)+2; n += 2 {
+i, _ := strconv.ParseInt(email[n-2:n], 16, 0)
+e.WriteString(string(rune(i ^ r)))
+}
+
+return e.String()
+}

-// Check if email looks valid
+// Check if email looks valid.
func isValidEmail(email string) bool {
if email == "" {
return false
}

split := strings.Split(email, ".")

// nolint:gomnd // allow magic number here
if len(split) < 2 {
return false
}

ending := split[len(split)-1]

// nolint:gomnd // allow magic number here
if len(ending) < 2 {
return false
}
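Cloudflare's email obfuscation replaces addresses in the served HTML with a placeholder element carrying a `data-cfemail` attribute (typically `<span class="__cf_email__" data-cfemail="…">[email protected]</span>`). The attribute is a hex string: the first byte is an XOR key, and each following byte XORed with that key yields one character of the address. A standalone copy of the decoder above, run against a hand-built sample (`"127352703c71"` is hypothetical: the key `0x12` followed by `"a@b.c"` XORed with it):

```go
package main

import (
	"bytes"
	"fmt"
	"strconv"
)

func decodeCloudflareEmail(email string) string {
	var e bytes.Buffer

	// First two hex digits are the XOR key.
	r, _ := strconv.ParseInt(email[0:2], 16, 0)

	// Each subsequent hex pair, XORed with the key, is one plaintext byte.
	for n := 4; n < len(email)+2; n += 2 {
		i, _ := strconv.ParseInt(email[n-2:n], 16, 0)
		e.WriteString(string(rune(i ^ r)))
	}

	return e.String()
}

func main() {
	fmt.Println(decodeCloudflareEmail("127352703c71")) // prints a@b.c
}
```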
29 changes: 19 additions & 10 deletions scraper/scrape.go
@@ -1,23 +1,26 @@
package scraper

import (
"errors"

"github.com/gocolly/colly"
"github.com/gocolly/colly/debug"
)

-// Scrape is responsible for main scraping logic
-func (s *Scraper) Scrape(scrapedEmails *[]string) error {
+// Scrape is responsible for main scraping logic.
+func (s *Scraper) Scrape() ([]string, error) {
// Initiate colly
c := colly.NewCollector()

c.Async = s.Async
c.MaxDepth = s.MaxDepth
s.Website = trimProtocol(s.Website)
e := emails{}

if !s.FollowExternalLinks {
allowedDomains, err := prepareAllowedDomain(s.Website)
if err != nil {
-return err
+return nil, err
}

c.AllowedDomains = allowedDomains
@@ -31,18 +34,19 @@ func (s *Scraper) Scrape(scrapedEmails *[]string) error {
c.OnResponse(func(response *colly.Response) {
if err := initiateScrapingFromChrome(response, s.Timeout); err != nil {
s.Log(err)

return
}
})
}

if s.Recursively {
// Find and visit all links
c.OnHTML("a", func(e *colly.HTMLElement) {
s.Log("visiting: ", e.Attr("href"))
if err := e.Request.Visit(e.Attr("href")); err != nil {
c.OnHTML("a[href]", func(el *colly.HTMLElement) {
s.Log("visiting: ", el.Attr("href"))
if err := el.Request.Visit(el.Attr("href")); err != nil {
// Ignore already visited error, this appears too often
if err != colly.ErrAlreadyVisited {
if !errors.Is(err, colly.ErrAlreadyVisited) {
s.Log("error while linking: ", err.Error())
}
}
Expand All @@ -51,7 +55,12 @@ func (s *Scraper) Scrape(scrapedEmails *[]string) error {

// Parse emails on each downloaded page
c.OnScraped(func(response *colly.Response) {
-parseEmails(response.Body, scrapedEmails)
+e.parseEmails(response.Body)
})

// cloudflare encoded email support
c.OnHTML("span[data-cfemail]", func(el *colly.HTMLElement) {
e.parseCloudflareEmail(el.Attr("data-cfemail"))
})

// Start the scrape
@@ -61,7 +70,7 @@ func (s *Scraper) Scrape(scrapedEmails *[]string) error {

c.Wait() // Wait for concurrent scrapes to finish

-if scrapedEmails == nil || len(*scrapedEmails) == 0 {
+if e.emails == nil || len(e.emails) == 0 {
// Start the scrape on insecure url
if err := c.Visit(s.GetWebsite(false)); err != nil {
s.Log("error while visiting: ", err.Error())
@@ -70,5 +79,5 @@ func (s *Scraper) Scrape(scrapedEmails *[]string) error {
c.Wait() // Wait for concurrent scrapes to finish
}

-return nil
+return e.emails, nil
}
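With this signature change, callers no longer pass an out-parameter; `Scrape` returns the collected emails directly, as the `cmd/root.go` change above shows. A hypothetical minimal caller (the import path and struct literal are assumptions inferred from this diff, not verified against the full module):

```go
package main

import (
	"fmt"
	"log"

	"github.com/lawzava/scrape/scraper" // assumed module path
)

func main() {
	// Website is the only Parameters field confirmed by this diff.
	s := scraper.New(scraper.Parameters{Website: "https://lawzava.com"})

	emails, err := s.Scrape() // returns ([]string, error) after this change
	if err != nil {
		log.Fatal(err)
	}

	for _, email := range emails {
		fmt.Println(email)
	}
}
```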