From caa6bbfe7eb647d622cc1db842ce6db8722ffb23 Mon Sep 17 00:00:00 2001 From: Dan Anglin Date: Wed, 28 Aug 2024 12:00:25 +0100 Subject: [PATCH] feat: generate CSV reports and save to file The crawler can now generate CSV reports and save both text and CSV reports to a file. --- .gitignore | 1 + README.md | 29 ++++++++++++++++-- internal/crawler/crawler.go | 52 ++++++++++++++++++++++---------- internal/crawler/crawler_test.go | 2 +- internal/crawler/report.go | 42 ++++++++++++++++++++------ magefiles/mage.go | 4 +-- main.go | 11 +++++-- 7 files changed, 108 insertions(+), 33 deletions(-) diff --git a/.gitignore b/.gitignore index 74d6f60..d4b7467 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ /crawler +/reports/* diff --git a/README.md b/README.md index d603396..2875006 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ This web crawler crawls a given website and generates a report for all the inter - **Go:** A minimum version of Go 1.23.0 is required for building/installing the web crawler. Please go [here](https://go.dev/dl/) to download the latest version. -## How to run the application +## Build the application Clone this repository to your local machine. ``` @@ -31,11 +31,32 @@ Build the application. mage build ``` +## Run the application + Run the application specifying the website that you want to crawl. -- To crawl `https://example.com` using 3 concurrent workers and generate a report of up to 20 unique discovered pages: +### Format + +`./crawler [FLAGS] URL` + +### Examples + +- Crawl the [Crawler Test Site](https://crawler-test.com). + ``` + ./crawler https://crawler-test.com + ``` +- Crawl the site using 3 concurrent workers and generate a report of up to 100 pages. ``` - ./crawler --max-workers 3 --max-pages 20 https://example.com + ./crawler --max-workers 3 --max-pages 100 https://crawler-test.com + ``` +- Crawl the site and print out a CSV report. + ``` + ./crawler --max-workers 3 --max-pages 100 --format csv https://crawler-test.com + ``` +- Crawl the site and save the report to a CSV file. + ``` + mkdir -p reports + ./crawler --max-workers 3 --max-pages 100 --format csv --file reports/report.csv https://crawler-test.com ``` ## Flags @@ -46,3 +67,5 @@ You can configure the application with the following flags. |------|-------------|---------| | `max-workers` | The maximum number of concurrent workers. | 2 | | `max-pages` | The maximum number of pages discovered before stopping the crawl. | 10 | +| `format` | The format of the generated report.
Currently supports `text` and `csv`. | text | +| `file` | The file to save the generated report to.
Leave this empty to print to the screen instead. | | diff --git a/internal/crawler/crawler.go b/internal/crawler/crawler.go index 74dddba..1590d1f 100644 --- a/internal/crawler/crawler.go +++ b/internal/crawler/crawler.go @@ -10,12 +10,14 @@ import ( ) type Crawler struct { - pages map[string]pageStat - baseURL *url.URL - mu *sync.Mutex - workerPool chan struct{} - wg *sync.WaitGroup - maxPages int + pages map[string]pageStat + baseURL *url.URL + mu *sync.Mutex + workerPool chan struct{} + wg *sync.WaitGroup + maxPages int + reportFormat string + filepath string } type pageStat struct { @@ -23,7 +25,7 @@ type pageStat struct { internal bool } -func NewCrawler(rawBaseURL string, maxWorkers, maxPages int) (*Crawler, error) { +func NewCrawler(rawBaseURL string, maxWorkers, maxPages int, reportFormat, filepath string) (*Crawler, error) { baseURL, err := url.Parse(rawBaseURL) if err != nil { return nil, fmt.Errorf("unable to parse the base URL: %w", err) @@ -34,12 +36,14 @@ func NewCrawler(rawBaseURL string, maxWorkers, maxPages int) (*Crawler, error) { waitGroup.Add(1) crawler := Crawler{ - pages: make(map[string]pageStat), - baseURL: baseURL, - mu: &sync.Mutex{}, - workerPool: make(chan struct{}, maxWorkers), - wg: &waitGroup, - maxPages: maxPages, + pages: make(map[string]pageStat), + baseURL: baseURL, + mu: &sync.Mutex{}, + workerPool: make(chan struct{}, maxWorkers), + wg: &waitGroup, + maxPages: maxPages, + reportFormat: reportFormat, + filepath: filepath, } return &crawler, nil @@ -162,13 +166,29 @@ func (c *Crawler) Wait() { c.wg.Wait() } -func (c *Crawler) PrintReport() { +// GenerateReport generates a report of the crawl. The report is written to a file if the +// user specifies a file path, otherwise it is printed to the screen. +func (c *Crawler) GenerateReport() error { c.mu.Lock() defer c.mu.Unlock() - r := newReport(c.baseURL.String(), c.pages) + report := newReport(c.reportFormat, c.baseURL.String(), c.pages) - fmt.Fprint(os.Stdout, r) + if c.filepath != "" { + file, err := os.Create(c.filepath) + if err != nil { + return fmt.Errorf("error creating %s: %w", c.filepath, err) + } + defer file.Close() + + fmt.Fprintln(file, report) + + fmt.Println("\nSuccessfully saved the report to", c.filepath) + } else { + fmt.Fprintln(os.Stdout, report) + } + + return nil } func (c *Crawler) reachedMaxPages() bool { diff --git a/internal/crawler/crawler_test.go b/internal/crawler/crawler_test.go index 48c17ff..2bdb5f1 100644 --- a/internal/crawler/crawler_test.go +++ b/internal/crawler/crawler_test.go @@ -11,7 +11,7 @@ import ( func TestCrawler(t *testing.T) { testBaseURL := "https://example.com" - testCrawler, err := NewCrawler(testBaseURL, 1, 10) + testCrawler, err := NewCrawler(testBaseURL, 1, 10, "text", "") if err != nil { t.Fatalf("Test 'TestCrawler' FAILED: unexpected error creating the crawler: %v", err) } diff --git a/internal/crawler/report.go b/internal/crawler/report.go index 3847512..88a5a44 100644 --- a/internal/crawler/report.go +++ b/internal/crawler/report.go @@ -9,6 +9,7 @@ import ( ) type report struct { + format string baseURL string records []record } @@ -19,7 +20,15 @@ type record struct { internal bool } -func newReport(baseURL string, pages map[string]pageStat) report { +func (r record) linkType() string { + if r.internal { + return "internal" + } + + return "external" +} + +func newReport(format, baseURL string, pages map[string]pageStat) report { records := make([]record, 0) for link, stats := range maps.All(pages) { @@ -33,6 +42,7 @@ func newReport(baseURL string, 
pages map[string]pageStat) report { } report := report{ + format: format, baseURL: baseURL, records: records, } @@ -55,6 +65,15 @@ func (r *report) sortRecords() { } func (r report) String() string { + switch r.format { + case "csv": + return r.csv() + default: + return r.text() + } +} + +func (r report) text() string { var builder strings.Builder titlebar := strings.Repeat("\u2500", 80) @@ -64,20 +83,25 @@ func (r report) String() string { builder.WriteString("\n" + titlebar) for ind := range slices.All(r.records) { - linkType := "internal" - if !r.records[ind].internal { - linkType = "external" - } - links := "links" if r.records[ind].count == 1 { links = "link" } - builder.WriteString("\nFound " + strconv.Itoa(r.records[ind].count) + " " + linkType + " " + links + " to " + r.records[ind].link) + builder.WriteString("\nFound " + strconv.Itoa(r.records[ind].count) + " " + r.records[ind].linkType() + " " + links + " to " + r.records[ind].link) + } + + return builder.String() +} + +func (r report) csv() string { + var builder strings.Builder + + builder.WriteString("LINK,TYPE,COUNT") + + for ind := range slices.All(r.records) { + builder.WriteString("\n" + r.records[ind].link + "," + r.records[ind].linkType() + "," + strconv.Itoa(r.records[ind].count)) } - builder.WriteString("\n") - return builder.String() } diff --git a/magefiles/mage.go b/magefiles/mage.go index cac393d..5772d16 100644 --- a/magefiles/mage.go +++ b/magefiles/mage.go @@ -53,8 +53,8 @@ func Lint() error { } // Build build the executable. -// To rebuild packages that are already up-to-date set PROJECT_BUILD_REBUILD_ALL=1 -// To enable verbose mode set PROJECT_BUILD_VERBOSE=1 +// To rebuild packages that are already up-to-date set CRAWLER_BUILD_REBUILD_ALL=1 +// To enable verbose mode set CRAWLER_BUILD_VERBOSE=1 func Build() error { main := "." //flags := ldflags() diff --git a/main.go b/main.go index 850f1ab..645e4b7 100644 --- a/main.go +++ b/main.go @@ -23,10 +23,15 @@ func run() error { var ( maxWorkers int maxPages int + format string + file string ) flag.IntVar(&maxWorkers, "max-workers", 2, "The maximum number of concurrent workers") flag.IntVar(&maxPages, "max-pages", 10, "The maximum number of pages to discover before stopping the crawl") + flag.StringVar(&format, "format", "text", "The format of the report. Can be 'text' or 'csv'") + flag.StringVar(&file, "file", "", "The file to save the report to") + flag.Parse() if flag.NArg() < 1 { @@ -35,7 +40,7 @@ func run() error { baseURL := flag.Arg(0) - c, err := crawler.NewCrawler(baseURL, maxWorkers, maxPages) + c, err := crawler.NewCrawler(baseURL, maxWorkers, maxPages, format, file) if err != nil { return fmt.Errorf("unable to create the crawler: %w", err) } @@ -44,7 +49,9 @@ func run() error { c.Wait() - c.PrintReport() + if err := c.GenerateReport(); err != nil { + return fmt.Errorf("unable to generate the report: %w", err) + } return nil }
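
The CSV path introduced in `internal/crawler/report.go` is easy to exercise with a small unit test, since `report.String()` dispatches on the new `format` field. Below is a minimal sketch, assuming it sits next to the existing tests in package `crawler` (for example as `internal/crawler/report_test.go`); the file name and the sample records are illustrative and are not part of this patch.

```
package crawler

import "testing"

// TestReportCSV checks that a report created with the "csv" format renders
// the LINK,TYPE,COUNT header followed by one row per record.
func TestReportCSV(t *testing.T) {
	testReport := report{
		format:  "csv",
		baseURL: "https://example.com",
		records: []record{
			{link: "example.com/about", count: 2, internal: true},
			{link: "github.com/example", count: 1, internal: false},
		},
	}

	want := "LINK,TYPE,COUNT\n" +
		"example.com/about,internal,2\n" +
		"github.com/example,external,1"

	if got := testReport.String(); got != want {
		t.Errorf("unexpected CSV report: want %q, got %q", want, got)
	}
}
```

Constructing the `report` value directly bypasses `newReport`, so `sortRecords` is never called and the expected string depends only on the order of the slice above.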
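
Because the CSV report is a plain `LINK,TYPE,COUNT` table, the file written by `GenerateReport` can be read back with the standard `encoding/csv` package. The snippet below is a hypothetical consumer rather than part of the crawler; the `reports/report.csv` path is taken from the README example above, and everything else is illustrative.

```
package main

import (
	"encoding/csv"
	"fmt"
	"log"
	"os"
)

func main() {
	// Open a report produced by, for example:
	//   ./crawler --format csv --file reports/report.csv https://crawler-test.com
	file, err := os.Open("reports/report.csv")
	if err != nil {
		log.Fatalf("unable to open the report: %v", err)
	}
	defer file.Close()

	// The first row is the LINK,TYPE,COUNT header; the rest are records.
	rows, err := csv.NewReader(file).ReadAll()
	if err != nil {
		log.Fatalf("unable to parse the report: %v", err)
	}

	for _, row := range rows[1:] {
		fmt.Printf("%s %s link(s) to %s\n", row[2], row[1], row[0])
	}
}
```

One design note: `csv()` joins fields with plain string concatenation and does not quote them, which is fine for typical URLs but would mis-split any link that happened to contain a comma; routing the rows through a `csv.Writer` would be one way to harden that if it ever matters.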