generated from templates/go-generic
feat: add support for generating JSON reports
This commit is contained in:
parent
0022d7650c
commit
8e5acbe7c7
5 changed files with 87 additions and 42 deletions
|
@ -49,9 +49,9 @@ Run the application specifying the website that you want to crawl.
|
||||||
```
|
```
|
||||||
./crawler --max-workers 3 --max-pages 100 https://crawler-test.com
|
./crawler --max-workers 3 --max-pages 100 https://crawler-test.com
|
||||||
```
|
```
|
||||||
- Crawl the site and print out a CSV report.
|
- Crawl the site and print out a JSON report.
|
||||||
```
|
```
|
||||||
./crawler --max-workers 3 --max-pages 100 --format csv https://crawler-test.com
|
./crawler --max-workers 3 --max-pages 100 --format json https://crawler-test.com
|
||||||
```
|
```
|
||||||
- Crawl the site and save the report to a CSV file.
|
- Crawl the site and save the report to a CSV file.
|
||||||
```
|
```
|
||||||
|
@ -67,5 +67,5 @@ You can configure the application with the following flags.
|
||||||
|------|-------------|---------|
|
|------|-------------|---------|
|
||||||
| `max-workers` | The maximum number of concurrent workers. | 2 |
|
| `max-workers` | The maximum number of concurrent workers. | 2 |
|
||||||
| `max-pages` | The maximum number of pages the crawler can discover before stopping the crawl. | 10 |
|
| `max-pages` | The maximum number of pages the crawler can discover before stopping the crawl. | 10 |
|
||||||
| `format` | The format of the generated report.<br>Currently supports `text` and `csv`. | text |
|
| `format` | The format of the generated report.<br>Currently supports `text`, `csv` or `json`. | text |
|
||||||
| `file` | The file to save the generated report to.<br>Leave this empty to print to the screen instead. | |
|
| `file` | The file to save the generated report to.<br>Leave this empty to print to the screen instead. | |
|
||||||
|
|
|
@ -1,7 +1,9 @@
|
||||||
package crawler
|
package crawler
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"io"
|
||||||
"net/url"
|
"net/url"
|
||||||
"os"
|
"os"
|
||||||
"sync"
|
"sync"
|
||||||
|
@ -174,6 +176,12 @@ func (c *Crawler) GenerateReport() error {
|
||||||
|
|
||||||
report := newReport(c.reportFormat, c.baseURL.String(), c.pages)
|
report := newReport(c.reportFormat, c.baseURL.String(), c.pages)
|
||||||
|
|
||||||
|
if c.reportFormat == "json" {
|
||||||
|
return c.generateJSONReport(report)
|
||||||
|
}
|
||||||
|
|
||||||
|
var writer io.Writer
|
||||||
|
|
||||||
if c.filepath != "" {
|
if c.filepath != "" {
|
||||||
file, err := os.Create(c.filepath)
|
file, err := os.Create(c.filepath)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
@ -181,16 +189,53 @@ func (c *Crawler) GenerateReport() error {
|
||||||
}
|
}
|
||||||
defer file.Close()
|
defer file.Close()
|
||||||
|
|
||||||
fmt.Fprintln(file, report)
|
writer = file
|
||||||
|
|
||||||
fmt.Println("\nSuccessfully saved the report to", c.filepath)
|
fmt.Fprintln(file, report)
|
||||||
} else {
|
} else {
|
||||||
fmt.Fprintln(os.Stdout, report)
|
writer = os.Stdout
|
||||||
|
}
|
||||||
|
|
||||||
|
fmt.Fprintln(writer, report)
|
||||||
|
|
||||||
|
if c.filepath != "" {
|
||||||
|
fmt.Println("\nSuccessfully saved the report to", c.filepath)
|
||||||
}
|
}
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (c *Crawler) generateJSONReport(report report) error {
|
||||||
|
var writer io.Writer
|
||||||
|
|
||||||
|
if c.filepath != "" {
|
||||||
|
file, err := os.Create(c.filepath)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("error creating %s: %w", c.filepath, err)
|
||||||
|
}
|
||||||
|
defer file.Close()
|
||||||
|
|
||||||
|
writer = file
|
||||||
|
} else {
|
||||||
|
writer = os.Stdout
|
||||||
|
}
|
||||||
|
|
||||||
|
encoder := json.NewEncoder(writer)
|
||||||
|
encoder.SetIndent("", " ")
|
||||||
|
|
||||||
|
if err := encoder.Encode(report); err != nil {
|
||||||
|
return fmt.Errorf("error marshalling the report to JSON: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if c.filepath != "" {
|
||||||
|
fmt.Println("\nSuccessfully saved the report to", c.filepath)
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// reachedMaxPages evaluates to true if the map has reached the
|
||||||
|
// maximum number of entries.
|
||||||
func (c *Crawler) reachedMaxPages() bool {
|
func (c *Crawler) reachedMaxPages() bool {
|
||||||
c.mu.Lock()
|
c.mu.Lock()
|
||||||
defer c.mu.Unlock()
|
defer c.mu.Unlock()
|
||||||
|
|
|
@ -9,15 +9,15 @@ import (
|
||||||
)
|
)
|
||||||
|
|
||||||
type report struct {
|
type report struct {
|
||||||
format string
|
Format string `json:"-"`
|
||||||
baseURL string
|
BaseURL string `json:"baseUrl"`
|
||||||
records []record
|
Records []record `json:"records"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type record struct {
|
type record struct {
|
||||||
link string
|
Link string `json:"link"`
|
||||||
count int
|
Count int `json:"count"`
|
||||||
linkType string
|
LinkType string `json:"linkType"`
|
||||||
}
|
}
|
||||||
|
|
||||||
func newReport(format, baseURL string, pages map[string]pageStat) report {
|
func newReport(format, baseURL string, pages map[string]pageStat) report {
|
||||||
|
@ -30,18 +30,18 @@ func newReport(format, baseURL string, pages map[string]pageStat) report {
|
||||||
}
|
}
|
||||||
|
|
||||||
record := record{
|
record := record{
|
||||||
link: link,
|
Link: link,
|
||||||
count: stats.count,
|
Count: stats.count,
|
||||||
linkType: linkType,
|
LinkType: linkType,
|
||||||
}
|
}
|
||||||
|
|
||||||
records = append(records, record)
|
records = append(records, record)
|
||||||
}
|
}
|
||||||
|
|
||||||
report := report{
|
report := report{
|
||||||
format: format,
|
Format: format,
|
||||||
baseURL: baseURL,
|
BaseURL: baseURL,
|
||||||
records: records,
|
Records: records,
|
||||||
}
|
}
|
||||||
|
|
||||||
report.sortRecords()
|
report.sortRecords()
|
||||||
|
@ -52,17 +52,17 @@ func newReport(format, baseURL string, pages map[string]pageStat) report {
|
||||||
func (r *report) sortRecords() {
|
func (r *report) sortRecords() {
|
||||||
// First sort records by count (in descending order).
|
// First sort records by count (in descending order).
|
||||||
// Then sort records by name if two elements have the same count.
|
// Then sort records by name if two elements have the same count.
|
||||||
slices.SortFunc(r.records, func(a, b record) int {
|
slices.SortFunc(r.Records, func(a, b record) int {
|
||||||
if n := cmp.Compare(a.count, b.count); n != 0 {
|
if n := cmp.Compare(a.Count, b.Count); n != 0 {
|
||||||
return -1 * n
|
return -1 * n
|
||||||
}
|
}
|
||||||
|
|
||||||
return strings.Compare(a.link, b.link)
|
return strings.Compare(a.Link, b.Link)
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
func (r report) String() string {
|
func (r report) String() string {
|
||||||
switch r.format {
|
switch r.Format {
|
||||||
case "csv":
|
case "csv":
|
||||||
return r.csv()
|
return r.csv()
|
||||||
default:
|
default:
|
||||||
|
@ -76,16 +76,16 @@ func (r report) text() string {
|
||||||
titlebar := strings.Repeat("\u2500", 80)
|
titlebar := strings.Repeat("\u2500", 80)
|
||||||
|
|
||||||
builder.WriteString("\n" + titlebar)
|
builder.WriteString("\n" + titlebar)
|
||||||
builder.WriteString("\n" + "REPORT for " + r.baseURL)
|
builder.WriteString("\n" + "REPORT for " + r.BaseURL)
|
||||||
builder.WriteString("\n" + titlebar)
|
builder.WriteString("\n" + titlebar)
|
||||||
|
|
||||||
for ind := range slices.All(r.records) {
|
for ind := range slices.All(r.Records) {
|
||||||
links := "links"
|
links := "links"
|
||||||
if r.records[ind].count == 1 {
|
if r.Records[ind].Count == 1 {
|
||||||
links = "link"
|
links = "link"
|
||||||
}
|
}
|
||||||
|
|
||||||
builder.WriteString("\nFound " + strconv.Itoa(r.records[ind].count) + " " + r.records[ind].linkType + " " + links + " to " + r.records[ind].link)
|
builder.WriteString("\nFound " + strconv.Itoa(r.Records[ind].Count) + " " + r.Records[ind].LinkType + " " + links + " to " + r.Records[ind].Link)
|
||||||
}
|
}
|
||||||
|
|
||||||
return builder.String()
|
return builder.String()
|
||||||
|
@ -96,8 +96,8 @@ func (r report) csv() string {
|
||||||
|
|
||||||
builder.WriteString("LINK,TYPE,COUNT")
|
builder.WriteString("LINK,TYPE,COUNT")
|
||||||
|
|
||||||
for ind := range slices.All(r.records) {
|
for ind := range slices.All(r.Records) {
|
||||||
builder.WriteString("\n" + r.records[ind].link + "," + r.records[ind].linkType + "," + strconv.Itoa(r.records[ind].count))
|
builder.WriteString("\n" + r.Records[ind].Link + "," + r.Records[ind].LinkType + "," + strconv.Itoa(r.Records[ind].Count))
|
||||||
}
|
}
|
||||||
|
|
||||||
return builder.String()
|
return builder.String()
|
||||||
|
|
|
@ -24,19 +24,19 @@ func TestReport(t *testing.T) {
|
||||||
}
|
}
|
||||||
|
|
||||||
want := report{
|
want := report{
|
||||||
format: "text",
|
Format: "text",
|
||||||
baseURL: "https://example.org",
|
BaseURL: "https://example.org",
|
||||||
records: []record{
|
Records: []record{
|
||||||
{link: "example.org", count: 45, linkType: "internal"},
|
{Link: "example.org", Count: 45, LinkType: "internal"},
|
||||||
{link: "example.org/about/contact", count: 10, linkType: "internal"},
|
{Link: "example.org/about/contact", Count: 10, LinkType: "internal"},
|
||||||
{link: "example.org/posts", count: 4, linkType: "internal"},
|
{Link: "example.org/posts", Count: 4, LinkType: "internal"},
|
||||||
{link: "example.org/tags", count: 4, linkType: "internal"},
|
{Link: "example.org/tags", Count: 4, LinkType: "internal"},
|
||||||
{link: "mastodon.example.social/@benbarlett", count: 4, linkType: "external"},
|
{Link: "mastodon.example.social/@benbarlett", Count: 4, LinkType: "external"},
|
||||||
{link: "example.org/tags/golang", count: 2, linkType: "internal"},
|
{Link: "example.org/tags/golang", Count: 2, LinkType: "internal"},
|
||||||
{link: "ben-barlett.dev", count: 1, linkType: "external"},
|
{Link: "ben-barlett.dev", Count: 1, LinkType: "external"},
|
||||||
{link: "example.org/posts/yet-another-web-crawler-has-emerged", count: 1, linkType: "internal"},
|
{Link: "example.org/posts/yet-another-web-crawler-has-emerged", Count: 1, LinkType: "internal"},
|
||||||
{link: "github.com/benbarlettdotdev", count: 1, linkType: "external"},
|
{Link: "github.com/benbarlettdotdev", Count: 1, LinkType: "external"},
|
||||||
{link: "github.com/dananglin/web-crawler", count: 1, linkType: "external"},
|
{Link: "github.com/dananglin/web-crawler", Count: 1, LinkType: "external"},
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
2
main.go
2
main.go
|
@ -29,7 +29,7 @@ func run() error {
|
||||||
|
|
||||||
flag.IntVar(&maxWorkers, "max-workers", 2, "The maximum number of concurrent workers")
|
flag.IntVar(&maxWorkers, "max-workers", 2, "The maximum number of concurrent workers")
|
||||||
flag.IntVar(&maxPages, "max-pages", 10, "The maximum number of pages to discover before stopping the crawl")
|
flag.IntVar(&maxPages, "max-pages", 10, "The maximum number of pages to discover before stopping the crawl")
|
||||||
flag.StringVar(&format, "format", "text", "The format of the report. Can be 'text' or 'csv'")
|
flag.StringVar(&format, "format", "text", "The format of the report. Valid formats are 'text', 'json' and 'csv'")
|
||||||
flag.StringVar(&file, "file", "", "The file to save the report to")
|
flag.StringVar(&file, "file", "", "The file to save the report to")
|
||||||
|
|
||||||
flag.Parse()
|
flag.Parse()
|
||||||
|
|
Loading…
Reference in a new issue