generated from templates/go-generic
feat: use flags to configure the crawler
- Use flags to configure the worker pool and the maximum number of pages.
- Add README.md
This commit is contained in:
parent 4519de764e
commit 85717a7fac
4 changed files with 84 additions and 33 deletions
@@ -13,6 +13,14 @@ output:
   sort-results: true
 
 linters-settings:
+  depguard:
+    rules:
+      main:
+        files:
+          - $all
+        allow:
+          - $gostd
+          - codeflow.dananglin.me.uk/apollo/web-crawler
   lll:
     line-length: 140
 
@@ -21,4 +29,5 @@ linters:
   disable:
     - execinquery
     - gomnd
+    - mnd
   fast: false
42 README.md Normal file

@@ -0,0 +1,42 @@
# Web Crawler

## Overview

This web crawler crawls a given URL and generates a report for all the internal links it finds.

### Repository mirrors

- **Code Flow:** https://codeflow.dananglin.me.uk/apollo/web-crawler
- **GitHub:** https://github.com/dananglin/web-crawler

## Requirements

- **Go:** A minimum version of Go 1.23.0 is required for building/installing the web crawler. Please go [here](https://go.dev/dl/) to download the latest version.

## How to run the application

Clone this repository to your local machine.

```
git clone https://github.com/dananglin/web-crawler.git
```

Build the application.

```
go build -o crawler .
```

Run the application, specifying the website that you want to crawl.

- To crawl `https://example.com` using 3 concurrent workers and generate a report of up to 20 unique discovered pages:

```
./crawler --max-workers 3 --max-pages 20 https://example.com
```

## Flags

You can configure the application with the following flags.

| Name | Description | Default |
|------|-------------|---------|
| `max-workers` | The maximum number of concurrent workers. | 2 |
| `max-pages` | The maximum number of pages discovered before stopping the crawl. | 10 |
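For illustration only (this example is not part of the README above): running the crawler with just a URL and no flags uses the defaults from the table, i.e. 2 concurrent workers and a limit of 10 discovered pages.

```
./crawler https://example.com
```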
@@ -11,15 +11,15 @@ import (
 )
 
 type Crawler struct {
-    pages              map[string]int
-    baseURL            *url.URL
-    mu                 *sync.Mutex
-    concurrencyControl chan struct{}
-    wg                 *sync.WaitGroup
-    maxPages           int
+    pages      map[string]int
+    baseURL    *url.URL
+    mu         *sync.Mutex
+    workerPool chan struct{}
+    wg         *sync.WaitGroup
+    maxPages   int
 }
 
-func NewCrawler(rawBaseURL string, maxConcurrency, maxPages int) (*Crawler, error) {
+func NewCrawler(rawBaseURL string, maxWorkers, maxPages int) (*Crawler, error) {
     baseURL, err := url.Parse(rawBaseURL)
     if err != nil {
         return nil, fmt.Errorf("unable to parse the base URL: %w", err)
@@ -30,12 +30,12 @@ func NewCrawler(rawBaseURL string, maxConcurrency, maxPages int) (*Crawler, error) {
     waitGroup.Add(1)
 
     crawler := Crawler{
-        pages:              make(map[string]int),
-        baseURL:            baseURL,
-        mu:                 &sync.Mutex{},
-        concurrencyControl: make(chan struct{}, maxConcurrency),
-        wg:                 &waitGroup,
-        maxPages:           maxPages,
+        pages:      make(map[string]int),
+        baseURL:    baseURL,
+        mu:         &sync.Mutex{},
+        workerPool: make(chan struct{}, maxWorkers),
+        wg:         &waitGroup,
+        maxPages:   maxPages,
     }
 
     return &crawler, nil
@@ -43,12 +43,12 @@ func NewCrawler(rawBaseURL string, maxConcurrency, maxPages int) (*Crawler, error) {
 
 func (c *Crawler) Crawl(rawCurrentURL string) {
     // Add an empty struct to channel here
-    c.concurrencyControl <- struct{}{}
+    c.workerPool <- struct{}{}
 
-    // Decrement the wait group counter and free up the channel when finished
-    // crawling.
+    // Decrement the wait group counter and free up the worker pool when
+    // finished crawling.
     defer func() {
-        <-c.concurrencyControl
+        <-c.workerPool
         c.wg.Done()
     }()
 
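The hunk above is the heart of the commit's worker pool: `Crawl` acquires a slot by sending into the buffered `workerPool` channel and releases it on exit. Here is a minimal, self-contained sketch of that pattern (not the project's code; the five-task loop, the worker limit of 2, and the `fmt.Println` placeholder work are illustrative assumptions):

```
package main

import (
    "fmt"
    "sync"
)

func main() {
    const maxWorkers = 2 // assumed limit, mirroring the default of the max-workers flag

    workerPool := make(chan struct{}, maxWorkers) // buffered channel used as a counting semaphore
    waitGroup := sync.WaitGroup{}

    for task := 1; task <= 5; task++ {
        waitGroup.Add(1)

        go func(id int) {
            workerPool <- struct{}{} // blocks until a worker slot is free

            // Free up the worker slot and decrement the wait group counter when finished.
            defer func() {
                <-workerPool
                waitGroup.Done()
            }()

            fmt.Println("processing task", id)
        }(task)
    }

    waitGroup.Wait()
}
```

Because a send on a full buffered channel blocks, at most `maxWorkers` goroutines execute the body at any one time, which is how the renamed `workerPool` field bounds concurrent crawling.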
32 main.go
@@ -1,9 +1,10 @@
 package main
 
 import (
+    "errors"
+    "flag"
     "fmt"
     "os"
-    "strconv"
 
     "codeflow.dananglin.me.uk/apollo/web-crawler/internal/crawler"
 )
@@ -16,26 +17,25 @@ func main() {
     }
 }
 
+var errNoURLProvided = errors.New("the URL is not provided")
+
 func run() error {
-    args := os.Args[1:]
+    var (
+        maxWorkers int
+        maxPages   int
+    )
 
-    if len(args) != 3 {
-        return fmt.Errorf("unexpected number of arguments received: want 3, got %d", len(args))
+    flag.IntVar(&maxWorkers, "max-workers", 2, "The maximum number of concurrent workers")
+    flag.IntVar(&maxPages, "max-pages", 10, "The maximum number of pages to discover before stopping the crawl")
+    flag.Parse()
+
+    if flag.NArg() < 1 {
+        return errNoURLProvided
     }
 
-    baseURL := args[0]
+    baseURL := flag.Arg(0)
 
-    maxConcurrency, err := strconv.Atoi(args[1])
-    if err != nil {
-        return fmt.Errorf("unable to convert the max concurrency (%s) to an integer: %w", args[1], err)
-    }
-
-    maxPages, err := strconv.Atoi(args[2])
-    if err != nil {
-        return fmt.Errorf("unable to convert the max pages (%s) to an integer: %w", args[2], err)
-    }
-
-    c, err := crawler.NewCrawler(baseURL, maxConcurrency, maxPages)
+    c, err := crawler.NewCrawler(baseURL, maxWorkers, maxPages)
     if err != nil {
         return fmt.Errorf("unable to create the crawler: %w", err)
     }