checkpoint: add configurable max pages and max concurrency settings

Dan Anglin 2024-08-27 13:45:18 +01:00
parent a8a7bcaced
commit 9efb2102d8
Signed by: dananglin
GPG key ID: 0C1D44CFBEE68638
2 changed files with 32 additions and 17 deletions


@@ -15,9 +15,10 @@ type Crawler struct {
 	mu                 *sync.Mutex
 	concurrencyControl chan struct{}
 	wg                 *sync.WaitGroup
+	maxPages           int
 }
 
-func NewCrawler(rawBaseURL string) (*Crawler, error) {
+func NewCrawler(rawBaseURL string, maxConcurrency, maxPages int) (*Crawler, error) {
 	baseURL, err := url.Parse(rawBaseURL)
 	if err != nil {
 		return nil, fmt.Errorf("unable to parse the base URL: %w", err)
@@ -31,16 +32,15 @@ func NewCrawler(rawBaseURL string) (*Crawler, error) {
 		pages:              make(map[string]int),
 		baseURL:            baseURL,
 		mu:                 &sync.Mutex{},
-		concurrencyControl: make(chan struct{}, 2),
+		concurrencyControl: make(chan struct{}, maxConcurrency),
 		wg:                 &waitGroup,
+		maxPages:           maxPages,
 	}
 
 	return &crawler, nil
 }
 
 func (c *Crawler) Crawl(rawCurrentURL string) {
-	var err error
-
 	// Add an empty struct to channel here
 	c.concurrencyControl <- struct{}{}
@@ -51,6 +51,10 @@ func (c *Crawler) Crawl(rawCurrentURL string) {
 		c.wg.Done()
 	}()
 
+	if c.reachedMaxPages() {
+		return
+	}
+
 	// if current URL is not on the same domain as the base URL then return early.
 	hasEqualDomain, err := c.HasEqualDomain(rawCurrentURL)
 	if err != nil {
@@ -147,9 +151,19 @@ func (c *Crawler) Wait() {
 }
 
 func (c *Crawler) PrintReport() {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+
 	fmt.Printf("\n\nREPORT:\n")
 
 	for page, count := range maps.All(c.pages) {
 		fmt.Printf("%s: %d\n", page, count)
 	}
 }
+
+func (c *Crawler) reachedMaxPages() bool {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+
+	return len(c.pages) >= c.maxPages
+}
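Note: the concurrencyControl channel works as a counting semaphore, so the new maxConcurrency value directly caps how many Crawl goroutines do work at the same time. A minimal standalone sketch of that pattern (the worker count, sleep, and names below are illustrative only, not code from the crawler):

package main

import (
	"fmt"
	"sync"
	"time"
)

func main() {
	const maxConcurrency = 2 // illustrative value; the crawler reads this from its arguments

	semaphore := make(chan struct{}, maxConcurrency)
	var wg sync.WaitGroup

	for i := 1; i <= 5; i++ {
		wg.Add(1)

		go func(id int) {
			// Acquire a slot; this blocks once maxConcurrency goroutines hold one.
			semaphore <- struct{}{}
			defer func() {
				// Release the slot and mark this goroutine as done.
				<-semaphore
				wg.Done()
			}()

			fmt.Printf("worker %d crawling...\n", id)
			time.Sleep(100 * time.Millisecond) // stand-in for fetching a page
		}(i)
	}

	wg.Wait()
}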

main.go

@@ -1,18 +1,13 @@
 package main
 
 import (
-	"errors"
 	"fmt"
 	"os"
+	"strconv"
 
 	"codeflow.dananglin.me.uk/apollo/web-crawler/internal/crawler"
 )
 
-var (
-	errNoWebsiteProvided = errors.New("no website provided")
-	errTooManyArgs       = errors.New("too many arguments provided")
-)
-
 func main() {
 	if err := run(); err != nil {
 		os.Stderr.WriteString("ERROR: " + err.Error() + "\n")
@@ -24,17 +19,23 @@ func main() {
 func run() error {
 	args := os.Args[1:]
 
-	if len(args) == 0 {
-		return errNoWebsiteProvided
-	}
-
-	if len(args) > 1 {
-		return errTooManyArgs
+	if len(args) != 3 {
+		return fmt.Errorf("unexpected number of arguments received: want 3, got %d", len(args))
 	}
 
 	baseURL := args[0]
 
-	c, err := crawler.NewCrawler(baseURL)
+	maxConcurrency, err := strconv.Atoi(args[1])
+	if err != nil {
+		return fmt.Errorf("unable to convert the max concurrency (%s) to an integer: %w", args[1], err)
+	}
+
+	maxPages, err := strconv.Atoi(args[2])
+	if err != nil {
+		return fmt.Errorf("unable to convert the max pages (%s) to an integer: %w", args[2], err)
+	}
+
+	c, err := crawler.NewCrawler(baseURL, maxConcurrency, maxPages)
 	if err != nil {
 		return fmt.Errorf("unable to create the crawler: %w", err)
 	}
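Note: the remainder of run() is truncated in this diff, so the following is only a hypothetical sketch of the new calling contract. The example URL, the limits, and the assumption that the crawler's WaitGroup already accounts for the first Crawl call are illustrative, not code from this repository:

package main

import (
	"os"

	"codeflow.dananglin.me.uk/apollo/web-crawler/internal/crawler"
)

func main() {
	// Roughly equivalent to running: web-crawler https://example.com 5 20
	c, err := crawler.NewCrawler("https://example.com", 5, 20)
	if err != nil {
		os.Stderr.WriteString("ERROR: " + err.Error() + "\n")
		os.Exit(1)
	}

	// Assumed: the crawler's WaitGroup already counts this first Crawl call.
	go c.Crawl("https://example.com")

	c.Wait()        // block until every in-flight crawl has finished
	c.PrintReport() // print each discovered page and its visit count
}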