From 85717a7fac4a2d4f8b4efe08c1a67cebba277a25 Mon Sep 17 00:00:00 2001
From: Dan Anglin
Date: Tue, 27 Aug 2024 17:11:47 +0100
Subject: [PATCH] feat: use flags to configure the crawler

- Use flags to configure the worker pool and the maximum number of pages.
- Add README.md
---
 .golangci.yaml              |  9 ++++++++
 README.md                   | 42 +++++++++++++++++++++++++++++++++++++
 internal/crawler/crawler.go | 34 +++++++++++++++---------------
 main.go                     | 32 ++++++++++++++--------------
 4 files changed, 84 insertions(+), 33 deletions(-)
 create mode 100644 README.md

diff --git a/.golangci.yaml b/.golangci.yaml
index dd8e3ee..caccf5c 100644
--- a/.golangci.yaml
+++ b/.golangci.yaml
@@ -13,6 +13,14 @@ output:
   sort-results: true
 
 linters-settings:
+  depguard:
+    rules:
+      main:
+        files:
+          - $all
+        allow:
+          - $gostd
+          - codeflow.dananglin.me.uk/apollo/web-crawler
   lll:
     line-length: 140
 
@@ -21,4 +29,5 @@ linters:
   disable:
     - execinquery
     - gomnd
+    - mnd
   fast: false
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..4a6813c
--- /dev/null
+++ b/README.md
@@ -0,0 +1,42 @@
+# Web Crawler
+
+## Overview
+
+This web crawler crawls a given URL and generates a report for all the internal links it finds.
+
+### Repository mirrors
+
+- **Code Flow:** https://codeflow.dananglin.me.uk/apollo/web-crawler
+- **GitHub:** https://github.com/dananglin/web-crawler
+
+## Requirements
+
+- **Go:** A minimum version of Go 1.23.0 is required for building/installing the web crawler. Please go [here](https://go.dev/dl/) to download the latest version.
+
+## How to run the application
+
+Clone this repository to your local machine.
+```
+git clone https://github.com/dananglin/web-crawler.git
+```
+
+Build the application.
+```
+go build -o crawler .
+```
+
+Run the application, specifying the website that you want to crawl.
+
+- To crawl `https://example.com` using 3 concurrent workers and generate a report of up to 20 unique discovered pages:
+  ```
+  ./crawler --max-workers 3 --max-pages 20 https://example.com
+  ```
+
+## Flags
+
+You can configure the application with the following flags.
+
+| Name | Description | Default |
+|------|-------------|---------|
+| `max-workers` | The maximum number of concurrent workers. | 2 |
+| `max-pages` | The maximum number of pages to discover before stopping the crawl. | 10 |
diff --git a/internal/crawler/crawler.go b/internal/crawler/crawler.go
index bf25d5b..c4a62ec 100644
--- a/internal/crawler/crawler.go
+++ b/internal/crawler/crawler.go
@@ -11,15 +11,15 @@ import (
 )
 
 type Crawler struct {
-	pages              map[string]int
-	baseURL            *url.URL
-	mu                 *sync.Mutex
-	concurrencyControl chan struct{}
-	wg                 *sync.WaitGroup
-	maxPages           int
+	pages      map[string]int
+	baseURL    *url.URL
+	mu         *sync.Mutex
+	workerPool chan struct{}
+	wg         *sync.WaitGroup
+	maxPages   int
 }
 
-func NewCrawler(rawBaseURL string, maxConcurrency, maxPages int) (*Crawler, error) {
+func NewCrawler(rawBaseURL string, maxWorkers, maxPages int) (*Crawler, error) {
 	baseURL, err := url.Parse(rawBaseURL)
 	if err != nil {
 		return nil, fmt.Errorf("unable to parse the base URL: %w", err)
@@ -30,12 +30,12 @@ func NewCrawler(rawBaseURL string, maxConcurrency, maxPages int) (*Crawler, erro
 	waitGroup.Add(1)
 
 	crawler := Crawler{
-		pages:              make(map[string]int),
-		baseURL:            baseURL,
-		mu:                 &sync.Mutex{},
-		concurrencyControl: make(chan struct{}, maxConcurrency),
-		wg:                 &waitGroup,
-		maxPages:           maxPages,
+		pages:      make(map[string]int),
+		baseURL:    baseURL,
+		mu:         &sync.Mutex{},
+		workerPool: make(chan struct{}, maxWorkers),
+		wg:         &waitGroup,
+		maxPages:   maxPages,
 	}
 
 	return &crawler, nil
@@ -43,12 +43,12 @@ func NewCrawler(rawBaseURL string, maxConcurrency, maxPages int) (*Crawler, erro
 
 func (c *Crawler) Crawl(rawCurrentURL string) {
 	// Add an empty struct to channel here
-	c.concurrencyControl <- struct{}{}
+	c.workerPool <- struct{}{}
 
-	// Decrement the wait group counter and free up the channel when finished
-	// crawling.
+	// Decrement the wait group counter and free up the worker pool when
+	// finished crawling.
 	defer func() {
-		<-c.concurrencyControl
+		<-c.workerPool
 		c.wg.Done()
 	}()
 
diff --git a/main.go b/main.go
index 41df9ce..850f1ab 100644
--- a/main.go
+++ b/main.go
@@ -1,9 +1,10 @@
 package main
 
 import (
+	"errors"
+	"flag"
 	"fmt"
 	"os"
-	"strconv"
 
 	"codeflow.dananglin.me.uk/apollo/web-crawler/internal/crawler"
 )
@@ -16,26 +17,25 @@ func main() {
 	}
 }
 
+var errNoURLProvided = errors.New("the URL is not provided")
+
 func run() error {
-	args := os.Args[1:]
+	var (
+		maxWorkers int
+		maxPages   int
+	)
 
-	if len(args) != 3 {
-		return fmt.Errorf("unexpected number of arguments received: want 3, got %d", len(args))
+	flag.IntVar(&maxWorkers, "max-workers", 2, "The maximum number of concurrent workers")
+	flag.IntVar(&maxPages, "max-pages", 10, "The maximum number of pages to discover before stopping the crawl")
+	flag.Parse()
+
+	if flag.NArg() < 1 {
+		return errNoURLProvided
 	}
 
-	baseURL := args[0]
+	baseURL := flag.Arg(0)
 
-	maxConcurrency, err := strconv.Atoi(args[1])
-	if err != nil {
-		return fmt.Errorf("unable to convert the max concurrency (%s) to an integer: %w", args[1], err)
-	}
-
-	maxPages, err := strconv.Atoi(args[2])
-	if err != nil {
-		return fmt.Errorf("unable to convert the max pages (%s) to an integer: %w", args[2], err)
-	}
-
-	c, err := crawler.NewCrawler(baseURL, maxConcurrency, maxPages)
+	c, err := crawler.NewCrawler(baseURL, maxWorkers, maxPages)
 	if err != nil {
 		return fmt.Errorf("unable to create the crawler: %w", err)
 	}
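
A note on the worker pool renamed above: `workerPool` is a buffered channel used as a counting semaphore, which is how the `--max-workers` flag bounds the number of pages crawled concurrently. The sketch below is a minimal, standalone illustration of that pattern rather than the project's code; the `maxWorkers` constant, the task loop, and the `process` function are invented for the example. Sending on the channel claims a worker slot and blocks once the buffer is full, receiving releases the slot, and the `sync.WaitGroup` waits for every goroutine to finish.

```
package main

import (
	"fmt"
	"sync"
)

// process stands in for the real per-task work (fetching and parsing a page).
func process(id int) {
	fmt.Printf("processing task %d\n", id)
}

func main() {
	const maxWorkers = 3 // analogous to the --max-workers flag

	// A buffered channel used as a counting semaphore: at most maxWorkers
	// goroutines can hold a slot at any one time.
	workerPool := make(chan struct{}, maxWorkers)
	waitGroup := sync.WaitGroup{}

	for task := 1; task <= 10; task++ {
		waitGroup.Add(1)

		go func(id int) {
			// Acquire a slot; this blocks while maxWorkers goroutines are already working.
			workerPool <- struct{}{}

			// Release the slot and decrement the wait group counter when finished.
			defer func() {
				<-workerPool
				waitGroup.Done()
			}()

			process(id)
		}(task)
	}

	waitGroup.Wait()
}
```

Compared with a fixed set of long-lived workers reading from a job queue, this semaphore style keeps the crawl code simple: each page gets its own goroutine, and the channel's capacity alone caps how many run at once.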