feat: use flags to configure the crawler
- Use flags to configure the worker pool and the maximum number of pages.
- Add README.md
parent 4519de764e
commit 85717a7fac

4 changed files with 84 additions and 33 deletions
golangci-lint configuration:

```diff
@@ -13,6 +13,14 @@ output:
   sort-results: true
 
 linters-settings:
+  depguard:
+    rules:
+      main:
+        files:
+          - $all
+        allow:
+          - $gostd
+          - codeflow.dananglin.me.uk/apollo/web-crawler
   lll:
     line-length: 140
 
@@ -21,4 +29,5 @@ linters:
   disable:
     - execinquery
     - gomnd
+    - mnd
   fast: false
```
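The added depguard rule restricts what the project may import: with this allow list, only the Go standard library (`$gostd`) and packages under the project's own module path pass the linter. A hypothetical file (not from the repository) illustrating the boundary the rule enforces:

```go
// Illustrative only: the import boundary enforced by the new depguard rule.
package main

import (
	"fmt"     // allowed: standard library ($gostd)
	"net/url" // allowed: standard library ($gostd)
	// "github.com/example/htmlparser" // hypothetical third-party import:
	// depguard would report it because it is neither in $gostd nor under
	// codeflow.dananglin.me.uk/apollo/web-crawler.
)

func main() {
	u, err := url.Parse("https://example.com")
	if err != nil {
		fmt.Println("parse error:", err)

		return
	}

	fmt.Println("host:", u.Host)
}
```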
README.md · new file (+42 lines)

````markdown
# Web Crawler

## Overview

This web crawler crawls a given URL and generates a report for all the internal links it finds.

### Repository mirrors

- **Code Flow:** https://codeflow.dananglin.me.uk/apollo/web-crawler
- **GitHub:** https://github.com/dananglin/web-crawler

## Requirements

- **Go:** A minimum version of Go 1.23.0 is required for building/installing the web crawler. Please go [here](https://go.dev/dl/) to download the latest version.

## How to run the application

Clone this repository to your local machine.

```
git clone https://github.com/dananglin/web-crawler.git
```

Build the application.

```
go build -o crawler .
```

Run the application specifying the website that you want to crawl.

- To crawl `https://example.com` using 3 concurrent workers and generate a report of up to 20 unique discovered pages:

  ```
  ./crawler --max-workers 3 --max-pages 20 https://example.com
  ```

## Flags

You can configure the application with the following flags.

| Name | Description | Default |
|------|-------------|---------|
| `max-workers` | The maximum number of concurrent workers. | 2 |
| `max-pages` | The maximum number of pages discovered before stopping the crawl. | 10 |
````
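Reading the defaults from the flags table: a run that omits both flags behaves the same as one that spells them out (an illustrative invocation, not part of the committed README):

```
# the two invocations below are equivalent
./crawler https://example.com
./crawler --max-workers 2 --max-pages 10 https://example.com
```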
internal/crawler:

```diff
@@ -11,15 +11,15 @@ import (
 )
 
 type Crawler struct {
 	pages              map[string]int
 	baseURL            *url.URL
 	mu                 *sync.Mutex
-	concurrencyControl chan struct{}
+	workerPool         chan struct{}
 	wg                 *sync.WaitGroup
 	maxPages           int
 }
 
-func NewCrawler(rawBaseURL string, maxConcurrency, maxPages int) (*Crawler, error) {
+func NewCrawler(rawBaseURL string, maxWorkers, maxPages int) (*Crawler, error) {
 	baseURL, err := url.Parse(rawBaseURL)
 	if err != nil {
 		return nil, fmt.Errorf("unable to parse the base URL: %w", err)
@@ -30,12 +30,12 @@ func NewCrawler(rawBaseURL string, maxConcurrency, maxPages int) (*Crawler, error) {
 	waitGroup.Add(1)
 
 	crawler := Crawler{
 		pages:              make(map[string]int),
 		baseURL:            baseURL,
 		mu:                 &sync.Mutex{},
-		concurrencyControl: make(chan struct{}, maxConcurrency),
+		workerPool:         make(chan struct{}, maxWorkers),
 		wg:                 &waitGroup,
 		maxPages:           maxPages,
 	}
 
 	return &crawler, nil
@@ -43,12 +43,12 @@ func NewCrawler(rawBaseURL string, maxConcurrency, maxPages int) (*Crawler, error) {
 
 func (c *Crawler) Crawl(rawCurrentURL string) {
 	// Add an empty struct to channel here
-	c.concurrencyControl <- struct{}{}
+	c.workerPool <- struct{}{}
 
-	// Decrement the wait group counter and free up the channel when finished
-	// crawling.
+	// Decrement the wait group counter and free up the worker pool when
+	// finished crawling.
 	defer func() {
-		<-c.concurrencyControl
+		<-c.workerPool
 		c.wg.Done()
 	}()
 
```
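The renamed `workerPool` field is a buffered channel acting as a counting semaphore: `Crawl` sends a token before doing any work and releases it (together with the WaitGroup decrement) in the deferred function, so at most `maxWorkers` crawls run concurrently. A minimal self-contained sketch of the same pattern, with hypothetical names and a sleep standing in for the actual page fetch:

```go
package main

import (
	"fmt"
	"sync"
	"time"
)

func main() {
	const maxWorkers = 3

	// Buffered channel used as a counting semaphore: once maxWorkers tokens
	// are held, further sends block until a running task releases its slot.
	workerPool := make(chan struct{}, maxWorkers)

	var wg sync.WaitGroup

	for i := 0; i < 10; i++ {
		wg.Add(1)

		go func(task int) {
			workerPool <- struct{}{} // acquire a worker slot

			defer func() {
				<-workerPool // release the slot
				wg.Done()
			}()

			time.Sleep(100 * time.Millisecond) // stand-in for crawling a page
			fmt.Println("finished task", task)
		}(i)
	}

	wg.Wait()
}
```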
main.go (32 changed lines)
```diff
@@ -1,9 +1,10 @@
 package main
 
 import (
+	"errors"
+	"flag"
 	"fmt"
 	"os"
-	"strconv"
 
 	"codeflow.dananglin.me.uk/apollo/web-crawler/internal/crawler"
 )
@@ -16,26 +17,25 @@ func main() {
 	}
 }
 
+var errNoURLProvided = errors.New("the URL is not provided")
+
 func run() error {
-	args := os.Args[1:]
+	var (
+		maxWorkers int
+		maxPages   int
+	)
 
-	if len(args) != 3 {
-		return fmt.Errorf("unexpected number of arguments received: want 3, got %d", len(args))
+	flag.IntVar(&maxWorkers, "max-workers", 2, "The maximum number of concurrent workers")
+	flag.IntVar(&maxPages, "max-pages", 10, "The maximum number of pages to discover before stopping the crawl")
+	flag.Parse()
+
+	if flag.NArg() < 1 {
+		return errNoURLProvided
 	}
 
-	baseURL := args[0]
-
-	maxConcurrency, err := strconv.Atoi(args[1])
-	if err != nil {
-		return fmt.Errorf("unable to convert the max concurrency (%s) to an integer: %w", args[1], err)
-	}
-
-	maxPages, err := strconv.Atoi(args[2])
-	if err != nil {
-		return fmt.Errorf("unable to convert the max pages (%s) to an integer: %w", args[2], err)
-	}
-
-	c, err := crawler.NewCrawler(baseURL, maxConcurrency, maxPages)
+	baseURL := flag.Arg(0)
+
+	c, err := crawler.NewCrawler(baseURL, maxWorkers, maxPages)
 	if err != nil {
 		return fmt.Errorf("unable to create the crawler: %w", err)
 	}
```
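The rewritten `run` function leans entirely on the standard library `flag` package: `flag.IntVar` registers each option with its default, `flag.Parse` consumes the command line, `flag.NArg` checks that a positional argument remains, and `flag.Arg(0)` reads it. A stripped-down sketch of the same pattern (illustrative; it prints the parsed values instead of starting a crawl):

```go
package main

import (
	"errors"
	"flag"
	"fmt"
	"os"
)

var errNoURLProvided = errors.New("the URL is not provided")

func run() error {
	var (
		maxWorkers int
		maxPages   int
	)

	// Register the flags with their defaults, then parse the command line.
	flag.IntVar(&maxWorkers, "max-workers", 2, "The maximum number of concurrent workers")
	flag.IntVar(&maxPages, "max-pages", 10, "The maximum number of pages to discover before stopping the crawl")
	flag.Parse()

	// The base URL is the first positional argument left after parsing.
	if flag.NArg() < 1 {
		return errNoURLProvided
	}

	baseURL := flag.Arg(0)

	fmt.Printf("would crawl %s with %d workers, up to %d pages\n", baseURL, maxWorkers, maxPages)

	return nil
}

func main() {
	if err := run(); err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
}
```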