feat: use flags to configure the crawler

- Use flags to configure the worker pool and the maximum number of
  pages.
- Add README.md
Author: Dan Anglin
Date: 2024-08-27 17:11:47 +01:00
Parent: 4519de764e
Commit: 85717a7fac
Signed by: dananglin (GPG key ID: 0C1D44CFBEE68638)
4 changed files with 84 additions and 33 deletions


@@ -13,6 +13,14 @@ output:
   sort-results: true
 
 linters-settings:
+  depguard:
+    rules:
+      main:
+        files:
+          - $all
+        allow:
+          - $gostd
+          - codeflow.dananglin.me.uk/apollo/web-crawler
   lll:
     line-length: 140
@@ -21,4 +29,5 @@ linters:
   disable:
     - execinquery
     - gomnd
+    - mnd
   fast: false
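The new `depguard` rule above restricts imports across the module: only packages from the Go standard library (`$gostd`) and from the project's own module path are allowed.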

README.md (new file)

@@ -0,0 +1,42 @@
# Web Crawler
## Overview
Given a URL, the web crawler crawls the website and generates a report of all the internal links it finds.
### Repository mirrors
- **Code Flow:** https://codeflow.dananglin.me.uk/apollo/web-crawler
- **GitHub:** https://github.com/dananglin/web-crawler
## Requirements
- **Go:** Go 1.23.0 or later is required to build or install the web crawler. You can download the latest version of Go [here](https://go.dev/dl/).
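To check which version of Go you have installed, run `go version`.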
## How to run the application
Clone this repository to your local machine.
```
git clone https://github.com/dananglin/web-crawler.git
```
Build the application.
```
go build -o crawler .
```
Run the application, specifying the website that you want to crawl.
- To crawl `https://example.com` using 3 concurrent workers and generate a report of up to 20 unique discovered pages:
```
./crawler --max-workers 3 --max-pages 20 https://example.com
```
## Flags
You can configure the application with the following flags.

| Name | Description | Default |
|------|-------------|---------|
| `max-workers` | The maximum number of concurrent workers. | 2 |
| `max-pages` | The maximum number of pages discovered before stopping the crawl. | 10 |
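Running the application without any flags (for example `./crawler https://example.com`) crawls the site with the default values shown above.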


@@ -11,15 +11,15 @@ import (
 )
 
 type Crawler struct {
-	pages              map[string]int
-	baseURL            *url.URL
-	mu                 *sync.Mutex
-	concurrencyControl chan struct{}
-	wg                 *sync.WaitGroup
-	maxPages           int
+	pages      map[string]int
+	baseURL    *url.URL
+	mu         *sync.Mutex
+	workerPool chan struct{}
+	wg         *sync.WaitGroup
+	maxPages   int
 }
 
-func NewCrawler(rawBaseURL string, maxConcurrency, maxPages int) (*Crawler, error) {
+func NewCrawler(rawBaseURL string, maxWorkers, maxPages int) (*Crawler, error) {
 	baseURL, err := url.Parse(rawBaseURL)
 	if err != nil {
 		return nil, fmt.Errorf("unable to parse the base URL: %w", err)
@@ -30,12 +30,12 @@ func NewCrawler(rawBaseURL string, maxConcurrency, maxPages int) (*Crawler, erro
 	waitGroup.Add(1)
 
 	crawler := Crawler{
-		pages:              make(map[string]int),
-		baseURL:            baseURL,
-		mu:                 &sync.Mutex{},
-		concurrencyControl: make(chan struct{}, maxConcurrency),
-		wg:                 &waitGroup,
-		maxPages:           maxPages,
+		pages:      make(map[string]int),
+		baseURL:    baseURL,
+		mu:         &sync.Mutex{},
+		workerPool: make(chan struct{}, maxWorkers),
+		wg:         &waitGroup,
+		maxPages:   maxPages,
 	}
 
 	return &crawler, nil
@@ -43,12 +43,12 @@ func NewCrawler(rawBaseURL string, maxConcurrency, maxPages int) (*Crawler, erro
 
 func (c *Crawler) Crawl(rawCurrentURL string) {
 	// Add an empty struct to channel here
-	c.concurrencyControl <- struct{}{}
+	c.workerPool <- struct{}{}
 
-	// Decrement the wait group counter and free up the channel when finished
-	// crawling.
+	// Decrement the wait group counter and free up the worker pool when
+	// finished crawling.
 	defer func() {
-		<-c.concurrencyControl
+		<-c.workerPool
 		c.wg.Done()
 	}()
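The `workerPool` channel above is a buffered channel used as a semaphore: sending a token blocks once `maxWorkers` crawls are in flight, and receiving from it when a crawl finishes frees a slot for the next one. Below is a minimal, self-contained sketch of that pattern with illustrative names and a stubbed-out task, not the project's actual crawl logic:

```
package main

import (
	"fmt"
	"sync"
)

func main() {
	const maxWorkers = 3 // size of the worker pool

	// A buffered channel acting as a semaphore: its capacity limits concurrency.
	workerPool := make(chan struct{}, maxWorkers)

	var wg sync.WaitGroup

	for task := 1; task <= 10; task++ {
		wg.Add(1)

		go func(id int) {
			workerPool <- struct{}{} // acquire a slot; blocks while maxWorkers tasks are running

			// Free the slot and decrement the wait group counter when finished.
			defer func() {
				<-workerPool
				wg.Done()
			}()

			fmt.Printf("processing task %d\n", id) // stand-in for crawling a page
		}(task)
	}

	wg.Wait()
}
```

Sizing the buffer puts a hard upper bound on how many goroutines do real work at any one time, which is exactly what the `--max-workers` flag controls.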

main.go

@@ -1,9 +1,10 @@
 package main
 
 import (
+	"errors"
+	"flag"
 	"fmt"
 	"os"
-	"strconv"
 
 	"codeflow.dananglin.me.uk/apollo/web-crawler/internal/crawler"
 )
@@ -16,26 +17,25 @@ func main() {
 	}
 }
 
+var errNoURLProvided = errors.New("the URL is not provided")
+
 func run() error {
-	args := os.Args[1:]
+	var (
+		maxWorkers int
+		maxPages   int
+	)
 
-	if len(args) != 3 {
-		return fmt.Errorf("unexpected number of arguments received: want 3, got %d", len(args))
+	flag.IntVar(&maxWorkers, "max-workers", 2, "The maximum number of concurrent workers")
+	flag.IntVar(&maxPages, "max-pages", 10, "The maximum number of pages to discover before stopping the crawl")
+	flag.Parse()
+
+	if flag.NArg() < 1 {
+		return errNoURLProvided
 	}
 
-	baseURL := args[0]
+	baseURL := flag.Arg(0)
 
-	maxConcurrency, err := strconv.Atoi(args[1])
-	if err != nil {
-		return fmt.Errorf("unable to convert the max concurrency (%s) to an integer: %w", args[1], err)
-	}
-
-	maxPages, err := strconv.Atoi(args[2])
-	if err != nil {
-		return fmt.Errorf("unable to convert the max pages (%s) to an integer: %w", args[2], err)
-	}
-
-	c, err := crawler.NewCrawler(baseURL, maxConcurrency, maxPages)
+	c, err := crawler.NewCrawler(baseURL, maxWorkers, maxPages)
 	if err != nil {
 		return fmt.Errorf("unable to create the crawler: %w", err)
 	}
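For reference, a stripped-down entry point that wires these flags together might look like the sketch below. The flag definitions, the `flag.NArg()` check, and `errNoURLProvided` mirror the diff; the body of `main` and the placeholder print are assumptions standing in for the real crawler call:

```
package main

import (
	"errors"
	"flag"
	"fmt"
	"os"
)

var errNoURLProvided = errors.New("the URL is not provided")

func main() {
	if err := run(); err != nil {
		fmt.Fprintln(os.Stderr, "ERROR:", err) // assumed error handling
		os.Exit(1)
	}
}

func run() error {
	var (
		maxWorkers int
		maxPages   int
	)

	flag.IntVar(&maxWorkers, "max-workers", 2, "The maximum number of concurrent workers")
	flag.IntVar(&maxPages, "max-pages", 10, "The maximum number of pages to discover before stopping the crawl")
	flag.Parse()

	// The first positional argument is the base URL to crawl.
	if flag.NArg() < 1 {
		return errNoURLProvided
	}

	baseURL := flag.Arg(0)

	// Placeholder for crawler.NewCrawler(baseURL, maxWorkers, maxPages) and the crawl itself.
	fmt.Printf("would crawl %s with up to %d workers and %d pages\n", baseURL, maxWorkers, maxPages)

	return nil
}
```

Note that Go's `flag` package stops parsing at the first non-flag argument, so the flags must be given before the URL, as in the README example.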