feat: use flags to configure the crawler
- Use flags to configure the worker pool and the maximum number of pages.
- Add README.md
parent 4519de764e
commit 85717a7fac

4 changed files with 84 additions and 33 deletions
golangci-lint configuration:

```diff
@@ -13,6 +13,14 @@ output:
   sort-results: true
 
 linters-settings:
+  depguard:
+    rules:
+      main:
+        files:
+          - $all
+        allow:
+          - $gostd
+          - codeflow.dananglin.me.uk/apollo/web-crawler
   lll:
     line-length: 140
 
@@ -21,4 +29,5 @@ linters:
   disable:
     - execinquery
     - gomnd
+    - mnd
   fast: false
```
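The added depguard rule restricts what the project may import: with this allow list, only the Go standard library (`$gostd`) and packages under the project's own module path pass the linter. A hypothetical file (not from the repository) illustrating the boundary the rule enforces:

```go
// Illustrative only: the import boundary enforced by the new depguard rule.
package main

import (
	"fmt"     // allowed: standard library ($gostd)
	"net/url" // allowed: standard library ($gostd)
	// "github.com/example/htmlparser" // hypothetical third-party import:
	// depguard would report it because it is neither in $gostd nor under
	// codeflow.dananglin.me.uk/apollo/web-crawler.
)

func main() {
	u, err := url.Parse("https://example.com")
	if err != nil {
		fmt.Println("parse error:", err)

		return
	}

	fmt.Println("host:", u.Host)
}
```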
README.md · new file (+42 lines)

````markdown
# Web Crawler

## Overview

This web crawler crawls a given URL and generates a report for all the internal links it finds.

### Repository mirrors

- **Code Flow:** https://codeflow.dananglin.me.uk/apollo/web-crawler
- **GitHub:** https://github.com/dananglin/web-crawler

## Requirements

- **Go:** A minimum version of Go 1.23.0 is required for building/installing the web crawler. Please go [here](https://go.dev/dl/) to download the latest version.

## How to run the application

Clone this repository to your local machine.

```
git clone https://github.com/dananglin/web-crawler.git
```

Build the application.

```
go build -o crawler .
```

Run the application specifying the website that you want to crawl.

- To crawl `https://example.com` using 3 concurrent workers and generate a report of up to 20 unique discovered pages:

  ```
  ./crawler --max-workers 3 --max-pages 20 https://example.com
  ```

## Flags

You can configure the application with the following flags.

| Name | Description | Default |
|------|-------------|---------|
| `max-workers` | The maximum number of concurrent workers. | 2 |
| `max-pages` | The maximum number of pages discovered before stopping the crawl. | 10 |
````
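Reading the defaults from the flags table: a run that omits both flags behaves the same as one that spells them out (an illustrative invocation, not part of the committed README):

```
# the two invocations below are equivalent
./crawler https://example.com
./crawler --max-workers 2 --max-pages 10 https://example.com
```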
internal/crawler:

```diff
@@ -11,15 +11,15 @@ import (
 )
 
 type Crawler struct {
 	pages              map[string]int
 	baseURL            *url.URL
 	mu                 *sync.Mutex
-	concurrencyControl chan struct{}
+	workerPool         chan struct{}
 	wg                 *sync.WaitGroup
 	maxPages           int
 }
 
-func NewCrawler(rawBaseURL string, maxConcurrency, maxPages int) (*Crawler, error) {
+func NewCrawler(rawBaseURL string, maxWorkers, maxPages int) (*Crawler, error) {
 	baseURL, err := url.Parse(rawBaseURL)
 	if err != nil {
 		return nil, fmt.Errorf("unable to parse the base URL: %w", err)
@@ -30,12 +30,12 @@ func NewCrawler(rawBaseURL string, maxConcurrency, maxPages int) (*Crawler, error) {
 	waitGroup.Add(1)
 
 	crawler := Crawler{
 		pages:              make(map[string]int),
 		baseURL:            baseURL,
 		mu:                 &sync.Mutex{},
-		concurrencyControl: make(chan struct{}, maxConcurrency),
+		workerPool:         make(chan struct{}, maxWorkers),
 		wg:                 &waitGroup,
 		maxPages:           maxPages,
 	}
 
 	return &crawler, nil
@@ -43,12 +43,12 @@ func NewCrawler(rawBaseURL string, maxConcurrency, maxPages int) (*Crawler, error) {
 
 func (c *Crawler) Crawl(rawCurrentURL string) {
 	// Add an empty struct to channel here
-	c.concurrencyControl <- struct{}{}
+	c.workerPool <- struct{}{}
 
-	// Decrement the wait group counter and free up the channel when finished
-	// crawling.
+	// Decrement the wait group counter and free up the worker pool when
+	// finished crawling.
 	defer func() {
-		<-c.concurrencyControl
+		<-c.workerPool
 		c.wg.Done()
 	}()
 
```
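The renamed `workerPool` field is a buffered channel acting as a counting semaphore: `Crawl` sends a token before doing any work and releases it (together with the WaitGroup decrement) in the deferred function, so at most `maxWorkers` crawls run concurrently. A minimal self-contained sketch of the same pattern, with hypothetical names and a sleep standing in for the actual page fetch:

```go
package main

import (
	"fmt"
	"sync"
	"time"
)

func main() {
	const maxWorkers = 3

	// Buffered channel used as a counting semaphore: once maxWorkers tokens
	// are held, further sends block until a running task releases its slot.
	workerPool := make(chan struct{}, maxWorkers)

	var wg sync.WaitGroup

	for i := 0; i < 10; i++ {
		wg.Add(1)

		go func(task int) {
			workerPool <- struct{}{} // acquire a worker slot

			defer func() {
				<-workerPool // release the slot
				wg.Done()
			}()

			time.Sleep(100 * time.Millisecond) // stand-in for crawling a page
			fmt.Println("finished task", task)
		}(i)
	}

	wg.Wait()
}
```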
main.go (32 changed lines)
```diff
@@ -1,9 +1,10 @@
 package main
 
 import (
+	"errors"
+	"flag"
 	"fmt"
 	"os"
-	"strconv"
 
 	"codeflow.dananglin.me.uk/apollo/web-crawler/internal/crawler"
 )
@@ -16,26 +17,25 @@ func main() {
 	}
 }
 
+var errNoURLProvided = errors.New("the URL is not provided")
+
 func run() error {
-	args := os.Args[1:]
+	var (
+		maxWorkers int
+		maxPages   int
+	)
 
-	if len(args) != 3 {
-		return fmt.Errorf("unexpected number of arguments received: want 3, got %d", len(args))
+	flag.IntVar(&maxWorkers, "max-workers", 2, "The maximum number of concurrent workers")
+	flag.IntVar(&maxPages, "max-pages", 10, "The maximum number of pages to discover before stopping the crawl")
+	flag.Parse()
+
+	if flag.NArg() < 1 {
+		return errNoURLProvided
 	}
 
-	baseURL := args[0]
-
-	maxConcurrency, err := strconv.Atoi(args[1])
-	if err != nil {
-		return fmt.Errorf("unable to convert the max concurrency (%s) to an integer: %w", args[1], err)
-	}
-
-	maxPages, err := strconv.Atoi(args[2])
-	if err != nil {
-		return fmt.Errorf("unable to convert the max pages (%s) to an integer: %w", args[2], err)
-	}
-
-	c, err := crawler.NewCrawler(baseURL, maxConcurrency, maxPages)
+	baseURL := flag.Arg(0)
+
+	c, err := crawler.NewCrawler(baseURL, maxWorkers, maxPages)
 	if err != nil {
 		return fmt.Errorf("unable to create the crawler: %w", err)
 	}
```
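The rewritten `run` function leans entirely on the standard library `flag` package: `flag.IntVar` registers each option with its default, `flag.Parse` consumes the command line, `flag.NArg` checks that a positional argument remains, and `flag.Arg(0)` reads it. A stripped-down sketch of the same pattern (illustrative; it prints the parsed values instead of starting a crawl):

```go
package main

import (
	"errors"
	"flag"
	"fmt"
	"os"
)

var errNoURLProvided = errors.New("the URL is not provided")

func run() error {
	var (
		maxWorkers int
		maxPages   int
	)

	// Register the flags with their defaults, then parse the command line.
	flag.IntVar(&maxWorkers, "max-workers", 2, "The maximum number of concurrent workers")
	flag.IntVar(&maxPages, "max-pages", 10, "The maximum number of pages to discover before stopping the crawl")
	flag.Parse()

	// The base URL is the first positional argument left after parsing.
	if flag.NArg() < 1 {
		return errNoURLProvided
	}

	baseURL := flag.Arg(0)

	fmt.Printf("would crawl %s with %d workers, up to %d pages\n", baseURL, maxWorkers, maxPages)

	return nil
}

func main() {
	if err := run(); err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
}
```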