generated from templates/go-generic
checkpoint: add configurable max pages and max concurrency settings
This commit is contained in:
parent
a8a7bcaced
commit
9efb2102d8
2 changed files with 32 additions and 17 deletions
|
@ -15,9 +15,10 @@ type Crawler struct {
|
||||||
mu *sync.Mutex
|
mu *sync.Mutex
|
||||||
concurrencyControl chan struct{}
|
concurrencyControl chan struct{}
|
||||||
wg *sync.WaitGroup
|
wg *sync.WaitGroup
|
||||||
|
maxPages int
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewCrawler(rawBaseURL string) (*Crawler, error) {
|
func NewCrawler(rawBaseURL string, maxConcurrency, maxPages int) (*Crawler, error) {
|
||||||
baseURL, err := url.Parse(rawBaseURL)
|
baseURL, err := url.Parse(rawBaseURL)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("unable to parse the base URL: %w", err)
|
return nil, fmt.Errorf("unable to parse the base URL: %w", err)
|
||||||
|
@ -31,16 +32,15 @@ func NewCrawler(rawBaseURL string) (*Crawler, error) {
|
||||||
pages: make(map[string]int),
|
pages: make(map[string]int),
|
||||||
baseURL: baseURL,
|
baseURL: baseURL,
|
||||||
mu: &sync.Mutex{},
|
mu: &sync.Mutex{},
|
||||||
concurrencyControl: make(chan struct{}, 2),
|
concurrencyControl: make(chan struct{}, maxConcurrency),
|
||||||
wg: &waitGroup,
|
wg: &waitGroup,
|
||||||
|
maxPages: maxPages,
|
||||||
}
|
}
|
||||||
|
|
||||||
return &crawler, nil
|
return &crawler, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *Crawler) Crawl(rawCurrentURL string) {
|
func (c *Crawler) Crawl(rawCurrentURL string) {
|
||||||
var err error
|
|
||||||
|
|
||||||
// Add an empty struct to channel here
|
// Add an empty struct to channel here
|
||||||
c.concurrencyControl <- struct{}{}
|
c.concurrencyControl <- struct{}{}
|
||||||
|
|
||||||
|
@ -51,6 +51,10 @@ func (c *Crawler) Crawl(rawCurrentURL string) {
|
||||||
c.wg.Done()
|
c.wg.Done()
|
||||||
}()
|
}()
|
||||||
|
|
||||||
|
if c.reachedMaxPages() {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
// if current URL is not on the same domain as the base URL then return early.
|
// if current URL is not on the same domain as the base URL then return early.
|
||||||
hasEqualDomain, err := c.HasEqualDomain(rawCurrentURL)
|
hasEqualDomain, err := c.HasEqualDomain(rawCurrentURL)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
@ -147,9 +151,19 @@ func (c *Crawler) Wait() {
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *Crawler) PrintReport() {
|
func (c *Crawler) PrintReport() {
|
||||||
|
c.mu.Lock()
|
||||||
|
defer c.mu.Unlock()
|
||||||
|
|
||||||
fmt.Printf("\n\nREPORT:\n")
|
fmt.Printf("\n\nREPORT:\n")
|
||||||
|
|
||||||
for page, count := range maps.All(c.pages) {
|
for page, count := range maps.All(c.pages) {
|
||||||
fmt.Printf("%s: %d\n", page, count)
|
fmt.Printf("%s: %d\n", page, count)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (c *Crawler) reachedMaxPages() bool {
|
||||||
|
c.mu.Lock()
|
||||||
|
defer c.mu.Unlock()
|
||||||
|
|
||||||
|
return len(c.pages) >= c.maxPages
|
||||||
|
}
|
||||||
|
|
27
main.go
27
main.go
|
@ -1,18 +1,13 @@
|
||||||
package main
|
package main
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"errors"
|
|
||||||
"fmt"
|
"fmt"
|
||||||
"os"
|
"os"
|
||||||
|
"strconv"
|
||||||
|
|
||||||
"codeflow.dananglin.me.uk/apollo/web-crawler/internal/crawler"
|
"codeflow.dananglin.me.uk/apollo/web-crawler/internal/crawler"
|
||||||
)
|
)
|
||||||
|
|
||||||
var (
|
|
||||||
errNoWebsiteProvided = errors.New("no website provided")
|
|
||||||
errTooManyArgs = errors.New("too many arguments provided")
|
|
||||||
)
|
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
if err := run(); err != nil {
|
if err := run(); err != nil {
|
||||||
os.Stderr.WriteString("ERROR: " + err.Error() + "\n")
|
os.Stderr.WriteString("ERROR: " + err.Error() + "\n")
|
||||||
|
@ -24,17 +19,23 @@ func main() {
|
||||||
func run() error {
|
func run() error {
|
||||||
args := os.Args[1:]
|
args := os.Args[1:]
|
||||||
|
|
||||||
if len(args) == 0 {
|
if len(args) != 3 {
|
||||||
return errNoWebsiteProvided
|
return fmt.Errorf("unexpected number of arguments received: want 3, got %d", len(args))
|
||||||
}
|
|
||||||
|
|
||||||
if len(args) > 1 {
|
|
||||||
return errTooManyArgs
|
|
||||||
}
|
}
|
||||||
|
|
||||||
baseURL := args[0]
|
baseURL := args[0]
|
||||||
|
|
||||||
c, err := crawler.NewCrawler(baseURL)
|
maxConcurrency, err := strconv.Atoi(args[1])
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("unable to convert the max concurrency (%s) to an integer: %w", args[1], err)
|
||||||
|
}
|
||||||
|
|
||||||
|
maxPages, err := strconv.Atoi(args[2])
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("unable to convert the max pages (%s) to an integer: %w", args[2], err)
|
||||||
|
}
|
||||||
|
|
||||||
|
c, err := crawler.NewCrawler(baseURL, maxConcurrency, maxPages)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("unable to create the crawler: %w", err)
|
return fmt.Errorf("unable to create the crawler: %w", err)
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue