package main

import (
	"errors"
	"flag"
	"fmt"
	"os"

	"codeflow.dananglin.me.uk/apollo/web-crawler/internal/crawler"
)
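
// main runs the crawler and writes any error to standard error before
// exiting with a non-zero status code. Assuming the binary is built as
// web-crawler, an example invocation might look like:
//
//	./web-crawler --max-workers 4 --max-pages 50 https://example.com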
func main() {
	if err := run(); err != nil {
		os.Stderr.WriteString("ERROR: " + err.Error() + "\n")

		os.Exit(1)
	}
}
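
// errNoURLProvided is returned by run when no URL is given on the
// command line.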
var errNoURLProvided = errors.New("the URL is not provided")
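
// run parses the command-line input, creates the crawler and crawls the
// target website, returning an error if the crawl could not be set up.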
func run() error {
	var (
		maxWorkers int
		maxPages   int
	)
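
	// Register and parse the flags that bound the crawl.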
	flag.IntVar(&maxWorkers, "max-workers", 2, "The maximum number of concurrent workers")
	flag.IntVar(&maxPages, "max-pages", 10, "The maximum number of pages to discover before stopping the crawl")
	flag.Parse()
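
	// The base URL of the target website is required as the first
	// positional argument.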
	if flag.NArg() < 1 {
		return errNoURLProvided
	}

	baseURL := flag.Arg(0)

	c, err := crawler.NewCrawler(baseURL, maxWorkers, maxPages)
	if err != nil {
		return fmt.Errorf("unable to create the crawler: %w", err)
	}
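
	// Crawl the website in a separate goroutine, then wait for the crawl
	// to finish before printing the report.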
	go c.Crawl(baseURL)

	c.Wait()

	c.PrintReport()

	return nil
}