feat: use flags to configure the crawler

- Use flags to configure the worker pool and the maximum number of
  pages.
- Add README.md
Author: Dan Anglin
Date: 2024-08-27 17:11:47 +01:00
parent 4519de764e
commit 85717a7fac
Signed by: dananglin
GPG key ID: 0C1D44CFBEE68638
4 changed files with 84 additions and 33 deletions


@@ -13,6 +13,14 @@ output:
   sort-results: true
 linters-settings:
+  depguard:
+    rules:
+      main:
+        files:
+          - $all
+        allow:
+          - $gostd
+          - codeflow.dananglin.me.uk/apollo/web-crawler
   lll:
     line-length: 140

@@ -21,4 +29,5 @@ linters:
   disable:
     - execinquery
     - gomnd
+    - mnd
   fast: false
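
For context: the new depguard rule only allows imports from the Go standard library (`$gostd`) and from this module's own path, so any other import is flagged by the linter. The sketch below is a hypothetical file, not taken from the repository, showing imports that would satisfy the rule.

```
package main

import (
    "fmt" // allowed: "$gostd" covers the standard library

    "codeflow.dananglin.me.uk/apollo/web-crawler/internal/crawler" // allowed: matches the module prefix on the allow list
)

func main() {
    // A third-party import such as "github.com/example/somepkg" (hypothetical)
    // would be rejected by the depguard rule above.
    c, err := crawler.NewCrawler("https://example.com", 2, 10)
    if err != nil {
        fmt.Println(err)

        return
    }

    _ = c
}
```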

README.md (new file)

@@ -0,0 +1,42 @@
# Web Crawler
## Overview
This web crawler crawls a given URL and generates a report for all the internal links it finds.
### Repository mirrors
- **Code Flow:** https://codeflow.dananglin.me.uk/apollo/web-crawler
- **GitHub:** https://github.com/dananglin/web-crawler
## Requirements
- **Go:** Go 1.23.0 or later is required to build or install the web crawler. You can download the latest version of Go [here](https://go.dev/dl/).
## How to run the application
Clone this repository to your local machine.
```
git clone https://github.com/dananglin/web-crawler.git
```
Build the application.
```
go build -o crawler .
```
Run the application, specifying the website that you want to crawl.
- To crawl `https://example.com` using 3 concurrent workers and generate a report of up to 20 unique discovered pages:
```
./crawler --max-workers 3 --max-pages 20 https://example.com
```
## Flags
You can configure the application with the following flags.
| Name | Description | Default |
|------|-------------|---------|
| `max-workers` | The maximum number of concurrent workers. | 2 |
| `max-pages` | The maximum number of pages discovered before stopping the crawl. | 10 |
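
For example, running the crawler without any flags uses the defaults above (2 workers, a limit of 10 pages):
```
./crawler https://example.com
```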


@@ -14,12 +14,12 @@ type Crawler struct {
 	pages              map[string]int
 	baseURL            *url.URL
 	mu                 *sync.Mutex
-	concurrencyControl chan struct{}
+	workerPool         chan struct{}
 	wg                 *sync.WaitGroup
 	maxPages           int
 }

-func NewCrawler(rawBaseURL string, maxConcurrency, maxPages int) (*Crawler, error) {
+func NewCrawler(rawBaseURL string, maxWorkers, maxPages int) (*Crawler, error) {
 	baseURL, err := url.Parse(rawBaseURL)
 	if err != nil {
 		return nil, fmt.Errorf("unable to parse the base URL: %w", err)
@@ -33,7 +33,7 @@ func NewCrawler(rawBaseURL string, maxConcurrency, maxPages int) (*Crawler, erro
 		pages:              make(map[string]int),
 		baseURL:            baseURL,
 		mu:                 &sync.Mutex{},
-		concurrencyControl: make(chan struct{}, maxConcurrency),
+		workerPool:         make(chan struct{}, maxWorkers),
 		wg:                 &waitGroup,
 		maxPages:           maxPages,
 	}
@@ -43,12 +43,12 @@ func NewCrawler(rawBaseURL string, maxConcurrency, maxPages int) (*Crawler, erro
 func (c *Crawler) Crawl(rawCurrentURL string) {
 	// Add an empty struct to channel here
-	c.concurrencyControl <- struct{}{}
+	c.workerPool <- struct{}{}

-	// Decrement the wait group counter and free up the channel when finished
-	// crawling.
+	// Decrement the wait group counter and free up the worker pool when
+	// finished crawling.
 	defer func() {
-		<-c.concurrencyControl
+		<-c.workerPool
 		c.wg.Done()
 	}()
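
The renamed `workerPool` field keeps the same idea as before: a buffered channel used as a counting semaphore, whose capacity caps how many goroutines crawl at once. Below is a minimal, self-contained sketch of that pattern; the names and the simulated work are illustrative, not the crawler's actual code.

```
package main

import (
    "fmt"
    "sync"
    "time"
)

func main() {
    const maxWorkers = 3

    // A buffered channel acts as a counting semaphore: sends block once
    // maxWorkers tokens are in flight, capping concurrency.
    workerPool := make(chan struct{}, maxWorkers)

    var wg sync.WaitGroup

    for i := 1; i <= 10; i++ {
        wg.Add(1)

        go func(id int) {
            workerPool <- struct{}{} // acquire a slot (blocks while the pool is full)

            defer func() {
                <-workerPool // release the slot
                wg.Done()
            }()

            fmt.Printf("worker %d: crawling...\n", id)
            time.Sleep(100 * time.Millisecond) // stand-in for fetching and parsing a page
        }(i)
    }

    wg.Wait()
}
```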

main.go

@@ -1,9 +1,10 @@
 package main

 import (
+	"errors"
+	"flag"
 	"fmt"
 	"os"
-	"strconv"

 	"codeflow.dananglin.me.uk/apollo/web-crawler/internal/crawler"
 )
@@ -16,26 +17,25 @@ func main() {
 	}
 }

+var errNoURLProvided = errors.New("the URL is not provided")
+
 func run() error {
-	args := os.Args[1:]
+	var (
+		maxWorkers int
+		maxPages   int
+	)

-	if len(args) != 3 {
-		return fmt.Errorf("unexpected number of arguments received: want 3, got %d", len(args))
+	flag.IntVar(&maxWorkers, "max-workers", 2, "The maximum number of concurrent workers")
+	flag.IntVar(&maxPages, "max-pages", 10, "The maximum number of pages to discover before stopping the crawl")
+	flag.Parse()
+
+	if flag.NArg() < 1 {
+		return errNoURLProvided
 	}

-	baseURL := args[0]
+	baseURL := flag.Arg(0)

-	maxConcurrency, err := strconv.Atoi(args[1])
-	if err != nil {
-		return fmt.Errorf("unable to convert the max concurrency (%s) to an integer: %w", args[1], err)
-	}
-
-	maxPages, err := strconv.Atoi(args[2])
-	if err != nil {
-		return fmt.Errorf("unable to convert the max pages (%s) to an integer: %w", args[2], err)
-	}
-
-	c, err := crawler.NewCrawler(baseURL, maxConcurrency, maxPages)
+	c, err := crawler.NewCrawler(baseURL, maxWorkers, maxPages)
 	if err != nil {
 		return fmt.Errorf("unable to create the crawler: %w", err)
 	}
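
The rewritten `run` function leans entirely on the standard library's `flag` package: each flag is registered with `flag.IntVar`, `flag.Parse` consumes the command line, and the first remaining positional argument becomes the base URL. The following is a stripped-down, standalone sketch of that pattern; the printed message is illustrative, not the crawler's real output.

```
package main

import (
    "errors"
    "flag"
    "fmt"
    "os"
)

var errNoURLProvided = errors.New("the URL is not provided")

func main() {
    var (
        maxWorkers int
        maxPages   int
    )

    // Declare the flags with their defaults and help text, then parse os.Args.
    flag.IntVar(&maxWorkers, "max-workers", 2, "The maximum number of concurrent workers")
    flag.IntVar(&maxPages, "max-pages", 10, "The maximum number of pages to discover before stopping the crawl")
    flag.Parse()

    // Anything left after the flags is a positional argument; the first one
    // is treated as the base URL.
    if flag.NArg() < 1 {
        fmt.Fprintln(os.Stderr, errNoURLProvided)
        os.Exit(1)
    }

    baseURL := flag.Arg(0)

    fmt.Printf("crawling %s with %d workers (limit: %d pages)\n", baseURL, maxWorkers, maxPages)
}
```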