feat: generate CSV reports and save to file

The crawler can now generate CSV reports, and reports in either text or CSV
format can be saved to a file.
Dan Anglin 2024-08-28 12:00:25 +01:00
parent 5498ac7b4e
commit caa6bbfe7e
Signed by: dananglin
GPG key ID: 0C1D44CFBEE68638
7 changed files with 108 additions and 33 deletions

.gitignore

@@ -1 +1,2 @@
/crawler
/reports/*

@@ -13,7 +13,7 @@ This web crawler crawls a given website and generates a report for all the inter
- **Go:** A minimum version of Go 1.23.0 is required for building/installing the web crawler. Please go [here](https://go.dev/dl/) to download the latest version.
## How to run the application
## Build the application
Clone this repository to your local machine.
```
@@ -31,11 +31,32 @@ Build the application.
mage build
```
## Run the application
Run the application specifying the website that you want to crawl.
- To crawl `https://example.com` using 3 concurrent workers and generate a report of up to 20 unique discovered pages:
### Format
`./crawler [FLAGS] URL`
### Examples
- Crawl the [Crawler Test Site](https://crawler-test.com).
```
./crawler https://crawler-test.com
```
- Crawl the site using 3 concurrent workers and generate a report of up to 100 pages.
```
./crawler --max-workers 3 --max-pages 20 https://example.com
./crawler --max-workers 3 --max-pages 100 https://crawler-test.com
```
- Crawl the site and print out a CSV report.
```
./crawler --max-workers 3 --max-pages 100 --format csv https://crawler-test.com
```
- Crawl the site and save the report to a CSV file.
```
mkdir -p reports
./crawler --max-workers 3 --max-pages 100 --format csv --file reports/report.csv https://crawler-test.com
```
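For reference, the CSV report starts with a `LINK,TYPE,COUNT` header row followed by one row per discovered page. The rows below are only an illustrative sketch (the links and counts are made up), matching the shape produced by the new `csv()` method:
```
LINK,TYPE,COUNT
crawler-test.com,internal,12
crawler-test.com/links,internal,3
example.com,external,1
```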
## Flags
@@ -46,3 +67,5 @@ You can configure the application with the following flags.
|------|-------------|---------|
| `max-workers` | The maximum number of concurrent workers. | 2 |
| `max-pages` | The maximum number of pages discovered before stopping the crawl. | 10 |
| `format` | The format of the generated report.<br>Currently supports `text` and `csv`. | text |
| `file` | The file to save the generated report to.<br>Leave this empty to print to the screen instead. | |

@@ -10,12 +10,14 @@ import (
)
type Crawler struct {
pages map[string]pageStat
baseURL *url.URL
mu *sync.Mutex
workerPool chan struct{}
wg *sync.WaitGroup
maxPages int
pages map[string]pageStat
baseURL *url.URL
mu *sync.Mutex
workerPool chan struct{}
wg *sync.WaitGroup
maxPages int
reportFormat string
filepath string
}
type pageStat struct {
@@ -23,7 +25,7 @@ type pageStat struct {
internal bool
}
func NewCrawler(rawBaseURL string, maxWorkers, maxPages int) (*Crawler, error) {
func NewCrawler(rawBaseURL string, maxWorkers, maxPages int, reportFormat, filepath string) (*Crawler, error) {
baseURL, err := url.Parse(rawBaseURL)
if err != nil {
return nil, fmt.Errorf("unable to parse the base URL: %w", err)
@@ -34,12 +36,14 @@ func NewCrawler(rawBaseURL string, maxWorkers, maxPages int) (*Crawler, error) {
waitGroup.Add(1)
crawler := Crawler{
pages: make(map[string]pageStat),
baseURL: baseURL,
mu: &sync.Mutex{},
workerPool: make(chan struct{}, maxWorkers),
wg: &waitGroup,
maxPages: maxPages,
pages: make(map[string]pageStat),
baseURL: baseURL,
mu: &sync.Mutex{},
workerPool: make(chan struct{}, maxWorkers),
wg: &waitGroup,
maxPages: maxPages,
reportFormat: reportFormat,
filepath: filepath,
}
return &crawler, nil
@@ -162,13 +166,29 @@ func (c *Crawler) Wait() {
c.wg.Wait()
}
func (c *Crawler) PrintReport() {
// GenerateReport generates a report of the crawl. The report is written to a file if the
// user specifies a file path, otherwise it is printed to the screen.
func (c *Crawler) GenerateReport() error {
c.mu.Lock()
defer c.mu.Unlock()
r := newReport(c.baseURL.String(), c.pages)
report := newReport(c.reportFormat, c.baseURL.String(), c.pages)
fmt.Fprint(os.Stdout, r)
if c.filepath != "" {
file, err := os.Create(c.filepath)
if err != nil {
return fmt.Errorf("error creating %s: %w", c.filepath, err)
}
defer file.Close()
fmt.Fprintln(file, report)
fmt.Println("\nSuccessfully saved the report to", c.filepath)
} else {
fmt.Fprintln(os.Stdout, report)
}
return nil
}
func (c *Crawler) reachedMaxPages() bool {

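As a rough sketch of the behaviour added in `GenerateReport`: when a file path is supplied via `--file`, the report is written to that file and the confirmation message from the code above is printed; otherwise the report goes to the screen. The path and site below are illustrative only, and any output produced during the crawl itself is omitted.
```
$ ./crawler --format csv --file reports/report.csv https://crawler-test.com

Successfully saved the report to reports/report.csv
```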
@@ -11,7 +11,7 @@ import (
func TestCrawler(t *testing.T) {
testBaseURL := "https://example.com"
testCrawler, err := NewCrawler(testBaseURL, 1, 10)
testCrawler, err := NewCrawler(testBaseURL, 1, 10, "text", "")
if err != nil {
t.Fatalf("Test 'TestCrawler' FAILED: unexpected error creating the crawler: %v", err)
}

@@ -9,6 +9,7 @@ import (
)
type report struct {
format string
baseURL string
records []record
}
@@ -19,7 +20,15 @@ type record struct {
internal bool
}
func newReport(baseURL string, pages map[string]pageStat) report {
func (r record) linkType() string {
if r.internal {
return "internal"
}
return "external"
}
func newReport(format, baseURL string, pages map[string]pageStat) report {
records := make([]record, 0)
for link, stats := range maps.All(pages) {
@@ -33,6 +42,7 @@ func newReport(baseURL string, pages map[string]pageStat) report {
}
report := report{
format: format,
baseURL: baseURL,
records: records,
}
@@ -55,6 +65,15 @@ func (r *report) sortRecords() {
}
func (r report) String() string {
switch r.format {
case "csv":
return r.csv()
default:
return r.text()
}
}
func (r report) text() string {
var builder strings.Builder
titlebar := strings.Repeat("\u2500", 80)
@@ -64,20 +83,25 @@ func (r report) String() string {
builder.WriteString("\n" + titlebar)
for ind := range slices.All(r.records) {
linkType := "internal"
if !r.records[ind].internal {
linkType = "external"
}
links := "links"
if r.records[ind].count == 1 {
links = "link"
}
builder.WriteString("\nFound " + strconv.Itoa(r.records[ind].count) + " " + linkType + " " + links + " to " + r.records[ind].link)
builder.WriteString("\nFound " + strconv.Itoa(r.records[ind].count) + " " + r.records[ind].linkType() + " " + links + " to " + r.records[ind].link)
}
return builder.String()
}
func (r report) csv() string {
var builder strings.Builder
builder.WriteString("LINK,TYPE,COUNT")
for ind := range slices.All(r.records) {
builder.WriteString("\n" + r.records[ind].link + "," + r.records[ind].linkType() + "," + strconv.Itoa(r.records[ind].count))
}
builder.WriteString("\n")
return builder.String()
}
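To make the `csv()` output above concrete, here is a minimal test sketch; it assumes an internal test file in the same package as `report` (so the unexported types are reachable) and uses the `record` fields shown in this diff (`link`, `count`, `internal`) with made-up values. It is not part of this commit.
```
func TestReportCSV(t *testing.T) {
	// Construct the report directly so the record order is fixed;
	// newReport and sortRecords are deliberately bypassed in this sketch.
	r := report{
		format:  "csv",
		baseURL: "https://crawler-test.com",
		records: []record{
			{link: "crawler-test.com/links", count: 3, internal: true},
			{link: "example.com", count: 1, internal: false},
		},
	}

	want := "LINK,TYPE,COUNT\n" +
		"crawler-test.com/links,internal,3\n" +
		"example.com,external,1\n"

	if got := r.String(); got != want {
		t.Errorf("unexpected CSV report:\ngot:  %q\nwant: %q", got, want)
	}
}
```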

@@ -53,8 +53,8 @@ func Lint() error {
}
// Build builds the executable.
// To rebuild packages that are already up-to-date set PROJECT_BUILD_REBUILD_ALL=1
// To enable verbose mode set PROJECT_BUILD_VERBOSE=1
// To rebuild packages that are already up-to-date set CRAWLER_BUILD_REBUILD_ALL=1
// To enable verbose mode set CRAWLER_BUILD_VERBOSE=1
func Build() error {
main := "."
//flags := ldflags()

main.go

@@ -23,10 +23,15 @@ func run() error {
var (
maxWorkers int
maxPages int
format string
file string
)
flag.IntVar(&maxWorkers, "max-workers", 2, "The maximum number of concurrent workers")
flag.IntVar(&maxPages, "max-pages", 10, "The maximum number of pages to discover before stopping the crawl")
flag.StringVar(&format, "format", "text", "The format of the report. Can be 'text' or 'csv'")
flag.StringVar(&file, "file", "", "The file to save the report to")
flag.Parse()
if flag.NArg() < 1 {
@@ -35,7 +40,7 @@ func run() error {
baseURL := flag.Arg(0)
c, err := crawler.NewCrawler(baseURL, maxWorkers, maxPages)
c, err := crawler.NewCrawler(baseURL, maxWorkers, maxPages, format, file)
if err != nil {
return fmt.Errorf("unable to create the crawler: %w", err)
}
@@ -44,7 +49,9 @@ func run() error {
c.Wait()
c.PrintReport()
if err := c.GenerateReport(); err != nil {
return fmt.Errorf("unable to generate the report: %w", err)
}
return nil
}
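For completeness, with these flag definitions and Go's default `flag` usage output (assuming `main.go` does not install a custom `flag.Usage`), running `./crawler -h` would print roughly the following:
```
Usage of ./crawler:
  -file string
        The file to save the report to
  -format string
        The format of the report. Can be 'text' or 'csv' (default "text")
  -max-pages int
        The maximum number of pages to discover before stopping the crawl (default 10)
  -max-workers int
        The maximum number of concurrent workers (default 2)
```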