From 85717a7fac4a2d4f8b4efe08c1a67cebba277a25 Mon Sep 17 00:00:00 2001
From: Dan Anglin
Date: Tue, 27 Aug 2024 17:11:47 +0100
Subject: [PATCH] feat: use flags to configure the crawler

- Use flags to configure the worker pool and the maximum number of pages.
- Add README.md
---
 .golangci.yaml              |  9 ++++++++
 README.md                   | 42 +++++++++++++++++++++++++++++++++++++
 internal/crawler/crawler.go | 34 +++++++++++++++---------------
 main.go                     | 32 ++++++++++++++--------------
 4 files changed, 84 insertions(+), 33 deletions(-)
 create mode 100644 README.md

diff --git a/.golangci.yaml b/.golangci.yaml
index dd8e3ee..caccf5c 100644
--- a/.golangci.yaml
+++ b/.golangci.yaml
@@ -13,6 +13,14 @@ output:
   sort-results: true
 
 linters-settings:
+  depguard:
+    rules:
+      main:
+        files:
+          - $all
+        allow:
+          - $gostd
+          - codeflow.dananglin.me.uk/apollo/web-crawler
   lll:
     line-length: 140
 
@@ -21,4 +29,5 @@ linters:
   disable:
     - execinquery
     - gomnd
+    - mnd
   fast: false
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..4a6813c
--- /dev/null
+++ b/README.md
@@ -0,0 +1,42 @@
+# Web Crawler
+
+## Overview
+
+This web crawler crawls a given URL and generates a report for all the internal links it finds.
+
+### Repository mirrors
+
+- **Code Flow:** https://codeflow.dananglin.me.uk/apollo/web-crawler
+- **GitHub:** https://github.com/dananglin/web-crawler
+
+## Requirements
+
+- **Go:** A minimum version of Go 1.23.0 is required for building/installing the web crawler. Please go [here](https://go.dev/dl/) to download the latest version.
+
+## How to run the application
+
+Clone this repository to your local machine.
+```
+git clone https://github.com/dananglin/web-crawler.git
+```
+
+Build the application.
+```
+go build -o crawler .
+```
+
+Run the application, specifying the website that you want to crawl.
+
+- To crawl `https://example.com` using 3 concurrent workers and generate a report of up to 20 unique discovered pages:
+  ```
+  ./crawler --max-workers 3 --max-pages 20 https://example.com
+  ```
+
+## Flags
+
+You can configure the application with the following flags.
+
+| Name | Description | Default |
+|------|-------------|---------|
+| `max-workers` | The maximum number of concurrent workers. | 2 |
+| `max-pages` | The maximum number of pages to discover before stopping the crawl. | 10 |
diff --git a/internal/crawler/crawler.go b/internal/crawler/crawler.go
index bf25d5b..c4a62ec 100644
--- a/internal/crawler/crawler.go
+++ b/internal/crawler/crawler.go
@@ -11,15 +11,15 @@ import (
 )
 
 type Crawler struct {
-	pages              map[string]int
-	baseURL            *url.URL
-	mu                 *sync.Mutex
-	concurrencyControl chan struct{}
-	wg                 *sync.WaitGroup
-	maxPages           int
+	pages      map[string]int
+	baseURL    *url.URL
+	mu         *sync.Mutex
+	workerPool chan struct{}
+	wg         *sync.WaitGroup
+	maxPages   int
 }
 
-func NewCrawler(rawBaseURL string, maxConcurrency, maxPages int) (*Crawler, error) {
+func NewCrawler(rawBaseURL string, maxWorkers, maxPages int) (*Crawler, error) {
 	baseURL, err := url.Parse(rawBaseURL)
 	if err != nil {
 		return nil, fmt.Errorf("unable to parse the base URL: %w", err)
@@ -30,12 +30,12 @@ func NewCrawler(rawBaseURL string, maxConcurrency, maxPages int) (*Crawler, erro
 	waitGroup.Add(1)
 
 	crawler := Crawler{
-		pages:              make(map[string]int),
-		baseURL:            baseURL,
-		mu:                 &sync.Mutex{},
-		concurrencyControl: make(chan struct{}, maxConcurrency),
-		wg:                 &waitGroup,
-		maxPages:           maxPages,
+		pages:      make(map[string]int),
+		baseURL:    baseURL,
+		mu:         &sync.Mutex{},
+		workerPool: make(chan struct{}, maxWorkers),
+		wg:         &waitGroup,
+		maxPages:   maxPages,
 	}
 
 	return &crawler, nil
@@ -43,12 +43,12 @@ func NewCrawler(rawBaseURL string, maxConcurrency, maxPages int) (*Crawler, erro
 
 func (c *Crawler) Crawl(rawCurrentURL string) {
 	// Add an empty struct to channel here
-	c.concurrencyControl <- struct{}{}
+	c.workerPool <- struct{}{}
 
-	// Decrement the wait group counter and free up the channel when finished
-	// crawling.
+	// Decrement the wait group counter and free up the worker pool when
+	// finished crawling.
 	defer func() {
-		<-c.concurrencyControl
+		<-c.workerPool
 		c.wg.Done()
 	}()
 
diff --git a/main.go b/main.go
index 41df9ce..850f1ab 100644
--- a/main.go
+++ b/main.go
@@ -1,9 +1,10 @@
 package main
 
 import (
+	"errors"
+	"flag"
 	"fmt"
 	"os"
-	"strconv"
 
 	"codeflow.dananglin.me.uk/apollo/web-crawler/internal/crawler"
 )
@@ -16,26 +17,25 @@ func main() {
 	}
 }
 
+var errNoURLProvided = errors.New("the URL is not provided")
+
 func run() error {
-	args := os.Args[1:]
+	var (
+		maxWorkers int
+		maxPages   int
+	)
 
-	if len(args) != 3 {
-		return fmt.Errorf("unexpected number of arguments received: want 3, got %d", len(args))
+	flag.IntVar(&maxWorkers, "max-workers", 2, "The maximum number of concurrent workers")
+	flag.IntVar(&maxPages, "max-pages", 10, "The maximum number of pages to discover before stopping the crawl")
+	flag.Parse()
+
+	if flag.NArg() < 1 {
+		return errNoURLProvided
 	}
 
-	baseURL := args[0]
+	baseURL := flag.Arg(0)
 
-	maxConcurrency, err := strconv.Atoi(args[1])
-	if err != nil {
-		return fmt.Errorf("unable to convert the max concurrency (%s) to an integer: %w", args[1], err)
-	}
-
-	maxPages, err := strconv.Atoi(args[2])
-	if err != nil {
-		return fmt.Errorf("unable to convert the max pages (%s) to an integer: %w", args[2], err)
-	}
-
-	c, err := crawler.NewCrawler(baseURL, maxConcurrency, maxPages)
+	c, err := crawler.NewCrawler(baseURL, maxWorkers, maxPages)
 	if err != nil {
 		return fmt.Errorf("unable to create the crawler: %w", err)
 	}
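
A note on the worker pool renamed above: `workerPool` is a buffered channel used as a counting semaphore, which is how the `--max-workers` flag bounds the number of pages crawled concurrently. The sketch below is a minimal, standalone illustration of that pattern rather than the project's code; the `maxWorkers` constant, the task loop, and the `process` function are invented for the example. Sending on the channel claims a worker slot and blocks once the buffer is full, receiving releases the slot, and the `sync.WaitGroup` waits for every goroutine to finish.

```
package main

import (
	"fmt"
	"sync"
)

// process stands in for the real per-task work (fetching and parsing a page).
func process(id int) {
	fmt.Printf("processing task %d\n", id)
}

func main() {
	const maxWorkers = 3 // analogous to the --max-workers flag

	// A buffered channel used as a counting semaphore: at most maxWorkers
	// goroutines can hold a slot at any one time.
	workerPool := make(chan struct{}, maxWorkers)
	waitGroup := sync.WaitGroup{}

	for task := 1; task <= 10; task++ {
		waitGroup.Add(1)

		go func(id int) {
			// Acquire a slot; this blocks while maxWorkers goroutines are already working.
			workerPool <- struct{}{}

			// Release the slot and decrement the wait group counter when finished.
			defer func() {
				<-workerPool
				waitGroup.Done()
			}()

			process(id)
		}(task)
	}

	waitGroup.Wait()
}
```

Compared with a fixed set of long-lived workers reading from a job queue, this semaphore style keeps the crawl code simple: each page gets its own goroutine, and the channel's capacity alone caps how many run at once.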