diff --git a/README.md b/README.md index 49c082e..3f01953 100644 --- a/README.md +++ b/README.md @@ -49,9 +49,9 @@ Run the application specifying the website that you want to crawl. ``` ./crawler --max-workers 3 --max-pages 100 https://crawler-test.com ``` -- Crawl the site and print out a CSV report. +- Crawl the site and print out a JSON report. ``` - ./crawler --max-workers 3 --max-pages 100 --format csv https://crawler-test.com + ./crawler --max-workers 3 --max-pages 100 --format json https://crawler-test.com ``` - Crawl the site and save the report to a CSV file. ``` @@ -67,5 +67,5 @@ You can configure the application with the following flags. |------|-------------|---------| | `max-workers` | The maximum number of concurrent workers. | 2 | -| `max-pages` | The maximum number of pages the crawler can discoverd before stopping the crawl. | 10 | +| `max-pages` | The maximum number of pages the crawler can discover before stopping the crawl. | 10 | -| `format` | The format of the generated report.
Currently supports `text` and `csv`. | text | +| `format` | The format of the generated report.
Currently supports `text`, `csv` or `json`. | text | | `file` | The file to save the generated report to.
Leave this empty to print to the screen instead. | | diff --git a/internal/crawler/crawler.go b/internal/crawler/crawler.go index 1590d1f..dd6a104 100644 --- a/internal/crawler/crawler.go +++ b/internal/crawler/crawler.go @@ -1,7 +1,9 @@ package crawler import ( + "encoding/json" "fmt" + "io" "net/url" "os" "sync" @@ -174,6 +176,12 @@ func (c *Crawler) GenerateReport() error { report := newReport(c.reportFormat, c.baseURL.String(), c.pages) + if c.reportFormat == "json" { + return c.generateJSONReport(report) + } + + var writer io.Writer + if c.filepath != "" { file, err := os.Create(c.filepath) if err != nil { @@ -181,16 +189,52 @@ } defer file.Close() - fmt.Fprintln(file, report) + writer = file - fmt.Println("\nSuccessfully saved the report to", c.filepath) } else { - fmt.Fprintln(os.Stdout, report) + writer = os.Stdout + } + + fmt.Fprintln(writer, report) + + if c.filepath != "" { + fmt.Println("\nSuccessfully saved the report to", c.filepath) } return nil } +func (c *Crawler) generateJSONReport(report report) error { + var writer io.Writer + + if c.filepath != "" { + file, err := os.Create(c.filepath) + if err != nil { + return fmt.Errorf("error creating %s: %w", c.filepath, err) + } + defer file.Close() + + writer = file + } else { + writer = os.Stdout + } + + encoder := json.NewEncoder(writer) + encoder.SetIndent("", " ") + + if err := encoder.Encode(report); err != nil { + return fmt.Errorf("error marshalling the report to JSON: %w", err) + } + + if c.filepath != "" { + fmt.Println("\nSuccessfully saved the report to", c.filepath) + } + + return nil +} + +// reachedMaxPages evaluates to true if the map has reached the +// maximum number of entries. 
func (c *Crawler) reachedMaxPages() bool { c.mu.Lock() defer c.mu.Unlock() diff --git a/internal/crawler/report.go b/internal/crawler/report.go index c6e947d..7fcc895 100644 --- a/internal/crawler/report.go +++ b/internal/crawler/report.go @@ -9,15 +9,15 @@ import ( ) type report struct { - format string - baseURL string - records []record + Format string `json:"-"` + BaseURL string `json:"baseUrl"` + Records []record `json:"records"` } type record struct { - link string - count int - linkType string + Link string `json:"link"` + Count int `json:"count"` + LinkType string `json:"linkType"` } func newReport(format, baseURL string, pages map[string]pageStat) report { @@ -30,18 +30,18 @@ func newReport(format, baseURL string, pages map[string]pageStat) report { } record := record{ - link: link, - count: stats.count, - linkType: linkType, + Link: link, + Count: stats.count, + LinkType: linkType, } records = append(records, record) } report := report{ - format: format, - baseURL: baseURL, - records: records, + Format: format, + BaseURL: baseURL, + Records: records, } report.sortRecords() @@ -52,17 +52,17 @@ func newReport(format, baseURL string, pages map[string]pageStat) report { func (r *report) sortRecords() { // First sort records by count (in reverse order hopefully) // Then sort records by name if two elements have the same count. 
- slices.SortFunc(r.records, func(a, b record) int { - if n := cmp.Compare(a.count, b.count); n != 0 { + slices.SortFunc(r.Records, func(a, b record) int { + if n := cmp.Compare(a.Count, b.Count); n != 0 { return -1 * n } - return strings.Compare(a.link, b.link) + return strings.Compare(a.Link, b.Link) }) } func (r report) String() string { - switch r.format { + switch r.Format { case "csv": return r.csv() default: @@ -76,16 +76,16 @@ func (r report) text() string { titlebar := strings.Repeat("\u2500", 80) builder.WriteString("\n" + titlebar) - builder.WriteString("\n" + "REPORT for " + r.baseURL) + builder.WriteString("\n" + "REPORT for " + r.BaseURL) builder.WriteString("\n" + titlebar) - for ind := range slices.All(r.records) { + for ind := range slices.All(r.Records) { links := "links" - if r.records[ind].count == 1 { + if r.Records[ind].Count == 1 { links = "link" } - builder.WriteString("\nFound " + strconv.Itoa(r.records[ind].count) + " " + r.records[ind].linkType + " " + links + " to " + r.records[ind].link) + builder.WriteString("\nFound " + strconv.Itoa(r.Records[ind].Count) + " " + r.Records[ind].LinkType + " " + links + " to " + r.Records[ind].Link) } return builder.String() @@ -96,8 +96,8 @@ func (r report) csv() string { builder.WriteString("LINK,TYPE,COUNT") - for ind := range slices.All(r.records) { - builder.WriteString("\n" + r.records[ind].link + "," + r.records[ind].linkType + "," + strconv.Itoa(r.records[ind].count)) + for ind := range slices.All(r.Records) { + builder.WriteString("\n" + r.Records[ind].Link + "," + r.Records[ind].LinkType + "," + strconv.Itoa(r.Records[ind].Count)) } return builder.String() diff --git a/internal/crawler/report_test.go b/internal/crawler/report_test.go index 33fc6b6..3da87cd 100644 --- a/internal/crawler/report_test.go +++ b/internal/crawler/report_test.go @@ -24,19 +24,19 @@ func TestReport(t *testing.T) { } want := report{ - format: "text", - baseURL: "https://example.org", - records: []record{ - {link: 
"example.org", count: 45, linkType: "internal"}, - {link: "example.org/about/contact", count: 10, linkType: "internal"}, - {link: "example.org/posts", count: 4, linkType: "internal"}, - {link: "example.org/tags", count: 4, linkType: "internal"}, - {link: "mastodon.example.social/@benbarlett", count: 4, linkType: "external"}, - {link: "example.org/tags/golang", count: 2, linkType: "internal"}, - {link: "ben-barlett.dev", count: 1, linkType: "external"}, - {link: "example.org/posts/yet-another-web-crawler-has-emerged", count: 1, linkType: "internal"}, - {link: "github.com/benbarlettdotdev", count: 1, linkType: "external"}, - {link: "github.com/dananglin/web-crawler", count: 1, linkType: "external"}, + Format: "text", + BaseURL: "https://example.org", + Records: []record{ + {Link: "example.org", Count: 45, LinkType: "internal"}, + {Link: "example.org/about/contact", Count: 10, LinkType: "internal"}, + {Link: "example.org/posts", Count: 4, LinkType: "internal"}, + {Link: "example.org/tags", Count: 4, LinkType: "internal"}, + {Link: "mastodon.example.social/@benbarlett", Count: 4, LinkType: "external"}, + {Link: "example.org/tags/golang", Count: 2, LinkType: "internal"}, + {Link: "ben-barlett.dev", Count: 1, LinkType: "external"}, + {Link: "example.org/posts/yet-another-web-crawler-has-emerged", Count: 1, LinkType: "internal"}, + {Link: "github.com/benbarlettdotdev", Count: 1, LinkType: "external"}, + {Link: "github.com/dananglin/web-crawler", Count: 1, LinkType: "external"}, }, } diff --git a/main.go b/main.go index 645e4b7..d6c349f 100644 --- a/main.go +++ b/main.go @@ -29,7 +29,7 @@ func run() error { flag.IntVar(&maxWorkers, "max-workers", 2, "The maximum number of concurrent workers") flag.IntVar(&maxPages, "max-pages", 10, "The maximum number of pages to discover before stopping the crawl") - flag.StringVar(&format, "format", "text", "The format of the report. Can be 'text' or 'csv'") + flag.StringVar(&format, "format", "text", "The format of the report. 
Valid formats are 'text', 'json' and 'csv'") flag.StringVar(&file, "file", "", "The file to save the report to") flag.Parse()