diff --git a/.gitignore b/.gitignore
index 74d6f60..d4b7467 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,2 @@
/crawler
+/reports/*
diff --git a/README.md b/README.md
index d603396..2875006 100644
--- a/README.md
+++ b/README.md
@@ -13,7 +13,7 @@ This web crawler crawls a given website and generates a report for all the inter
- **Go:** A minimum version of Go 1.23.0 is required for building/installing the web crawler. Please go [here](https://go.dev/dl/) to download the latest version.
-## How to run the application
+## Build the application
Clone this repository to your local machine.
```
@@ -31,11 +31,32 @@ Build the application.
mage build
```
+## Run the application
+
Run the application specifying the website that you want to crawl.
-- To crawl `https://example.com` using 3 concurrent workers and generate a report of up to 20 unique discovered pages:
+### Format
+
+`./crawler [FLAGS] URL`
+
+### Examples
+
+- Crawl the [Crawler Test Site](https://crawler-test.com).
+ ```
+ ./crawler https://crawler-test.com
+ ```
+- Crawl the site using 3 concurrent workers and generate a report of up to 100 pages.
```
- ./crawler --max-workers 3 --max-pages 20 https://example.com
+ ./crawler --max-workers 3 --max-pages 100 https://crawler-test.com
+ ```
+- Crawl the site and print out a CSV report.
+ ```
+ ./crawler --max-workers 3 --max-pages 100 --format csv https://crawler-test.com
+ ```
+- Crawl the site and save the report to a CSV file.
+ ```
+ mkdir -p reports
+ ./crawler --max-workers 3 --max-pages 100 --format csv --file reports/report.csv https://crawler-test.com
```
## Flags
@@ -46,3 +67,5 @@ You can configure the application with the following flags.
|------|-------------|---------|
| `max-workers` | The maximum number of concurrent workers. | 2 |
| `max-pages` | The maximum number of pages discovered before stopping the crawl. | 10 |
+| `format` | The format of the generated report. Currently supports `text` and `csv`. | text |
+| `file` | The file to save the generated report to. Leave this empty to print to the screen instead. | |
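To make the new `csv` option concrete: the column layout comes from `csv()` in `internal/crawler/report.go` later in this diff, so a generated report would look roughly like the following (the URLs and counts here are purely illustrative):

```
LINK,TYPE,COUNT
https://crawler-test.com/,internal,12
https://example.net/,external,1
```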
diff --git a/internal/crawler/crawler.go b/internal/crawler/crawler.go
index 74dddba..1590d1f 100644
--- a/internal/crawler/crawler.go
+++ b/internal/crawler/crawler.go
@@ -10,12 +10,14 @@ import (
)
type Crawler struct {
- pages map[string]pageStat
- baseURL *url.URL
- mu *sync.Mutex
- workerPool chan struct{}
- wg *sync.WaitGroup
- maxPages int
+ pages map[string]pageStat
+ baseURL *url.URL
+ mu *sync.Mutex
+ workerPool chan struct{}
+ wg *sync.WaitGroup
+ maxPages int
+ reportFormat string
+ filepath string
}
type pageStat struct {
@@ -23,7 +25,7 @@ type pageStat struct {
internal bool
}
-func NewCrawler(rawBaseURL string, maxWorkers, maxPages int) (*Crawler, error) {
+func NewCrawler(rawBaseURL string, maxWorkers, maxPages int, reportFormat, filepath string) (*Crawler, error) {
baseURL, err := url.Parse(rawBaseURL)
if err != nil {
return nil, fmt.Errorf("unable to parse the base URL: %w", err)
@@ -34,12 +36,14 @@ func NewCrawler(rawBaseURL string, maxWorkers, maxPages int) (*Crawler, error) {
waitGroup.Add(1)
crawler := Crawler{
- pages: make(map[string]pageStat),
- baseURL: baseURL,
- mu: &sync.Mutex{},
- workerPool: make(chan struct{}, maxWorkers),
- wg: &waitGroup,
- maxPages: maxPages,
+ pages: make(map[string]pageStat),
+ baseURL: baseURL,
+ mu: &sync.Mutex{},
+ workerPool: make(chan struct{}, maxWorkers),
+ wg: &waitGroup,
+ maxPages: maxPages,
+ reportFormat: reportFormat,
+ filepath: filepath,
}
return &crawler, nil
@@ -162,13 +166,29 @@ func (c *Crawler) Wait() {
c.wg.Wait()
}
-func (c *Crawler) PrintReport() {
+// GenerateReport generates a report of the crawl. The report is written to a file if the
+// user specifies a file path; otherwise, it is printed to standard output.
+func (c *Crawler) GenerateReport() error {
c.mu.Lock()
defer c.mu.Unlock()
- r := newReport(c.baseURL.String(), c.pages)
+ report := newReport(c.reportFormat, c.baseURL.String(), c.pages)
- fmt.Fprint(os.Stdout, r)
+ if c.filepath != "" {
+ file, err := os.Create(c.filepath)
+ if err != nil {
+ return fmt.Errorf("error creating %s: %w", c.filepath, err)
+ }
+ defer file.Close()
+
+ fmt.Fprintln(file, report)
+
+ fmt.Println("\nSuccessfully saved the report to", c.filepath)
+ } else {
+ fmt.Fprintln(os.Stdout, report)
+ }
+
+ return nil
}
func (c *Crawler) reachedMaxPages() bool {
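As context for the struct above: `workerPool chan struct{}` together with `maxWorkers` is the usual buffered-channel semaphore for bounding concurrency, and `wg` tracks outstanding work. A minimal, self-contained sketch of that pattern (not the crawler's actual crawl loop, which this diff does not touch):

```
package main

import (
	"fmt"
	"sync"
)

func main() {
	const maxWorkers = 3
	pool := make(chan struct{}, maxWorkers) // semaphore: at most maxWorkers concurrent tasks
	var wg sync.WaitGroup

	for i := 0; i < 10; i++ {
		wg.Add(1)
		go func(page int) {
			defer wg.Done()
			pool <- struct{}{}        // acquire a slot (blocks while maxWorkers tasks are running)
			defer func() { <-pool }() // release the slot when done
			fmt.Println("crawling page", page)
		}(i)
	}

	wg.Wait()
}
```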
diff --git a/internal/crawler/crawler_test.go b/internal/crawler/crawler_test.go
index 48c17ff..2bdb5f1 100644
--- a/internal/crawler/crawler_test.go
+++ b/internal/crawler/crawler_test.go
@@ -11,7 +11,7 @@ import (
func TestCrawler(t *testing.T) {
testBaseURL := "https://example.com"
- testCrawler, err := NewCrawler(testBaseURL, 1, 10)
+ testCrawler, err := NewCrawler(testBaseURL, 1, 10, "text", "")
if err != nil {
t.Fatalf("Test 'TestCrawler' FAILED: unexpected error creating the crawler: %v", err)
}
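The updated constructor call above only exercises the default text-to-stdout path. A follow-up test for the new file output could be sketched roughly like this, in `package crawler`; it assumes that calling `GenerateReport` on a freshly created crawler (no pages crawled yet) is acceptable and just writes the report header, and the test name is hypothetical:

```
package crawler

import (
	"os"
	"path/filepath"
	"strings"
	"testing"
)

func TestGenerateReportToFile(t *testing.T) {
	reportPath := filepath.Join(t.TempDir(), "report.csv")

	// The constructor arguments mirror the new NewCrawler signature.
	c, err := NewCrawler("https://example.com", 1, 10, "csv", reportPath)
	if err != nil {
		t.Fatalf("unexpected error creating the crawler: %v", err)
	}

	if err := c.GenerateReport(); err != nil {
		t.Fatalf("unexpected error generating the report: %v", err)
	}

	data, err := os.ReadFile(reportPath)
	if err != nil {
		t.Fatalf("unexpected error reading the report file: %v", err)
	}

	if !strings.HasPrefix(string(data), "LINK,TYPE,COUNT") {
		t.Errorf("report does not start with the CSV header, got: %q", data)
	}
}
```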
diff --git a/internal/crawler/report.go b/internal/crawler/report.go
index 3847512..88a5a44 100644
--- a/internal/crawler/report.go
+++ b/internal/crawler/report.go
@@ -9,6 +9,7 @@ import (
)
type report struct {
+ format string
baseURL string
records []record
}
@@ -19,7 +20,15 @@ type record struct {
internal bool
}
-func newReport(baseURL string, pages map[string]pageStat) report {
+func (r record) linkType() string {
+ if r.internal {
+ return "internal"
+ }
+
+ return "external"
+}
+
+func newReport(format, baseURL string, pages map[string]pageStat) report {
records := make([]record, 0)
for link, stats := range maps.All(pages) {
@@ -33,6 +42,7 @@ func newReport(baseURL string, pages map[string]pageStat) report {
}
report := report{
+ format: format,
baseURL: baseURL,
records: records,
}
@@ -55,6 +65,15 @@ func (r *report) sortRecords() {
}
func (r report) String() string {
+ switch r.format {
+ case "csv":
+ return r.csv()
+ default:
+ return r.text()
+ }
+}
+
+func (r report) text() string {
var builder strings.Builder
titlebar := strings.Repeat("\u2500", 80)
@@ -64,20 +83,25 @@ func (r report) String() string {
builder.WriteString("\n" + titlebar)
for ind := range slices.All(r.records) {
- linkType := "internal"
- if !r.records[ind].internal {
- linkType = "external"
- }
-
links := "links"
if r.records[ind].count == 1 {
links = "link"
}
- builder.WriteString("\nFound " + strconv.Itoa(r.records[ind].count) + " " + linkType + " " + links + " to " + r.records[ind].link)
+ builder.WriteString("\nFound " + strconv.Itoa(r.records[ind].count) + " " + r.records[ind].linkType() + " " + links + " to " + r.records[ind].link)
+ }
+
+ return builder.String()
+}
+
+func (r report) csv() string {
+ var builder strings.Builder
+
+ builder.WriteString("LINK,TYPE,COUNT")
+
+ for ind := range slices.All(r.records) {
+ builder.WriteString("\n" + r.records[ind].link + "," + r.records[ind].linkType() + "," + strconv.Itoa(r.records[ind].count))
}
- builder.WriteString("\n")
-
return builder.String()
}
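Given the `record` fields visible above (`link`, `count`, `internal`), a focused unit test for the new CSV output could be sketched as follows; the expected string simply mirrors what `csv()` builds, and the literal values are illustrative:

```
package crawler

import "testing"

func TestReportCSVFormat(t *testing.T) {
	r := report{
		format:  "csv",
		baseURL: "https://crawler-test.com",
		records: []record{
			{link: "https://crawler-test.com/about", count: 3, internal: true},
			{link: "https://example.net/", count: 1, internal: false},
		},
	}

	want := "LINK,TYPE,COUNT\n" +
		"https://crawler-test.com/about,internal,3\n" +
		"https://example.net/,external,1"

	if got := r.String(); got != want {
		t.Errorf("unexpected CSV report:\ngot:  %q\nwant: %q", got, want)
	}
}
```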
diff --git a/magefiles/mage.go b/magefiles/mage.go
index cac393d..5772d16 100644
--- a/magefiles/mage.go
+++ b/magefiles/mage.go
@@ -53,8 +53,8 @@ func Lint() error {
}
// Build builds the executable.
-// To rebuild packages that are already up-to-date set PROJECT_BUILD_REBUILD_ALL=1
-// To enable verbose mode set PROJECT_BUILD_VERBOSE=1
+// To rebuild packages that are already up-to-date set CRAWLER_BUILD_REBUILD_ALL=1
+// To enable verbose mode set CRAWLER_BUILD_VERBOSE=1
func Build() error {
main := "."
//flags := ldflags()
diff --git a/main.go b/main.go
index 850f1ab..645e4b7 100644
--- a/main.go
+++ b/main.go
@@ -23,10 +23,15 @@ func run() error {
var (
maxWorkers int
maxPages int
+ format string
+ file string
)
flag.IntVar(&maxWorkers, "max-workers", 2, "The maximum number of concurrent workers")
flag.IntVar(&maxPages, "max-pages", 10, "The maximum number of pages to discover before stopping the crawl")
+ flag.StringVar(&format, "format", "text", "The format of the report. Can be 'text' or 'csv'")
+ flag.StringVar(&file, "file", "", "The file to save the report to")
+
flag.Parse()
if flag.NArg() < 1 {
@@ -35,7 +40,7 @@ func run() error {
baseURL := flag.Arg(0)
- c, err := crawler.NewCrawler(baseURL, maxWorkers, maxPages)
+ c, err := crawler.NewCrawler(baseURL, maxWorkers, maxPages, format, file)
if err != nil {
return fmt.Errorf("unable to create the crawler: %w", err)
}
@@ -44,7 +49,9 @@ func run() error {
c.Wait()
- c.PrintReport()
+ if err := c.GenerateReport(); err != nil {
+ return fmt.Errorf("unable to generate the report: %w", err)
+ }
return nil
}