feat: generate CSV reports and save to file

The crawler can now generate CSV reports and save both text and CSV reports to a file.

parent 5498ac7b4e
commit caa6bbfe7e

7 changed files with 108 additions and 33 deletions
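For reference, the new CSV format (see `report.go` below) writes a `LINK,TYPE,COUNT` header followed by one row per discovered page, so a report saved with `--format csv` comes out roughly like this. The rows here are illustrative, not real crawl output:

```
LINK,TYPE,COUNT
https://crawler-test.com/,internal,12
https://example.com/,external,1
```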
.gitignore (vendored, 1 line changed)

```diff
@@ -1 +1,2 @@
 /crawler
+/reports/*
```
README.md (29 lines changed)

````diff
@@ -13,7 +13,7 @@ This web crawler crawls a given website and generates a report for all the inter
 
 - **Go:** A minimum version of Go 1.23.0 is required for building/installing the web crawler. Please go [here](https://go.dev/dl/) to download the latest version.
 
-## How to run the application
+## Build the application
 
 Clone this repository to your local machine.
 ```
@@ -31,11 +31,32 @@ Build the application.
 mage build
 ```
 
+## Run the application
+
 Run the application specifying the website that you want to crawl.
 
-- To crawl `https://example.com` using 3 concurrent workers and generate a report of up to 20 unique discovered pages:
+### Format
+
+`./crawler [FLAGS] URL`
+
+### Examples
+
+- Crawl the [Crawler Test Site](https://crawler-test.com).
 ```
-./crawler --max-workers 3 --max-pages 20 https://example.com
+./crawler https://crawler-test.com
 ```
+- Crawl the site using 3 concurrent workers and generate a report of up to 100 pages.
+```
+./crawler --max-workers 3 --max-pages 100 https://crawler-test.com
+```
+- Crawl the site and print out a CSV report.
+```
+./crawler --max-workers 3 --max-pages 100 --format csv https://crawler-test.com
+```
+- Crawl the site and save the report to a CSV file.
+```
+mkdir -p reports
+./crawler --max-workers 3 --max-pages 100 --format csv --file reports/report.csv https://crawler-test.com
+```
 
 ## Flags
 
@@ -46,3 +67,5 @@ You can configure the application with the following flags.
 |------|-------------|---------|
 | `max-workers` | The maximum number of concurrent workers. | 2 |
 | `max-pages` | The maximum number of pages discovered before stopping the crawl. | 10 |
+| `format` | The format of the generated report.<br>Currently supports `text` and `csv`. | text |
+| `file` | The file to save the generated report to.<br>Leave this empty to print to the screen instead. | |
````
crawler.go

```diff
@@ -16,6 +16,8 @@ type Crawler struct {
 	workerPool chan struct{}
 	wg *sync.WaitGroup
 	maxPages int
+	reportFormat string
+	filepath string
 }
 
 type pageStat struct {
@@ -23,7 +25,7 @@ type pageStat struct {
 	internal bool
 }
 
-func NewCrawler(rawBaseURL string, maxWorkers, maxPages int) (*Crawler, error) {
+func NewCrawler(rawBaseURL string, maxWorkers, maxPages int, reportFormat, filepath string) (*Crawler, error) {
 	baseURL, err := url.Parse(rawBaseURL)
 	if err != nil {
 		return nil, fmt.Errorf("unable to parse the base URL: %w", err)
@@ -40,6 +42,8 @@ func NewCrawler(rawBaseURL string, maxWorkers, maxPages int) (*Crawler, error) {
 		workerPool: make(chan struct{}, maxWorkers),
 		wg: &waitGroup,
 		maxPages: maxPages,
+		reportFormat: reportFormat,
+		filepath: filepath,
 	}
 
 	return &crawler, nil
@@ -162,13 +166,29 @@ func (c *Crawler) Wait() {
 	c.wg.Wait()
 }
 
-func (c *Crawler) PrintReport() {
+// GenerateReport generates a report of the crawl. The report is written to a file if the
+// user specifies a file path, otherwise it is printed to the screen.
+func (c *Crawler) GenerateReport() error {
 	c.mu.Lock()
 	defer c.mu.Unlock()
 
-	r := newReport(c.baseURL.String(), c.pages)
+	report := newReport(c.reportFormat, c.baseURL.String(), c.pages)
 
-	fmt.Fprint(os.Stdout, r)
+	if c.filepath != "" {
+		file, err := os.Create(c.filepath)
+		if err != nil {
+			return fmt.Errorf("error creating %s: %w", c.filepath, err)
+		}
+		defer file.Close()
+
+		fmt.Fprintln(file, report)
+
+		fmt.Println("\nSuccessfully saved the report to", c.filepath)
+	} else {
+		fmt.Fprintln(os.Stdout, report)
+	}
+
+	return nil
 }
 
 func (c *Crawler) reachedMaxPages() bool {
```
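Note that `NewCrawler` accepts any `reportFormat` string; an unknown value is only handled later, when `report.String()` silently falls back to the text format (see `report.go` below). A possible hardening, not part of this commit, would be to validate the format up front. A minimal sketch, assuming only the two formats named in the README; `validateFormat` is a hypothetical helper (it needs the `fmt` import the file already has):

```go
// validateFormat is a hypothetical helper (not in this commit) that
// NewCrawler could call to fail fast on an unsupported report format
// instead of silently falling back to the text report later.
func validateFormat(reportFormat string) error {
	switch reportFormat {
	case "text", "csv":
		return nil
	default:
		return fmt.Errorf("unsupported report format %q (supported: text, csv)", reportFormat)
	}
}
```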
crawler_test.go

```diff
@@ -11,7 +11,7 @@ import (
 func TestCrawler(t *testing.T) {
 	testBaseURL := "https://example.com"
 
-	testCrawler, err := NewCrawler(testBaseURL, 1, 10)
+	testCrawler, err := NewCrawler(testBaseURL, 1, 10, "text", "")
 	if err != nil {
 		t.Fatalf("Test 'TestCrawler' FAILED: unexpected error creating the crawler: %v", err)
 	}
```
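A companion test for the CSV path could reuse the same constructor with the new arguments. A hedged sketch, not part of this commit; it assumes `GenerateReport` behaves as shown in `crawler.go` above and writes to a throwaway file under `t.TempDir()`:

```go
// TestGenerateCSVReport is a hypothetical companion test (not in this
// commit): it exercises the new reportFormat and filepath parameters and
// writes the (empty, header-only) CSV report to a temporary file.
func TestGenerateCSVReport(t *testing.T) {
	filepath := t.TempDir() + "/report.csv"

	testCrawler, err := NewCrawler("https://example.com", 1, 10, "csv", filepath)
	if err != nil {
		t.Fatalf("unexpected error creating the crawler: %v", err)
	}

	if err := testCrawler.GenerateReport(); err != nil {
		t.Fatalf("unexpected error generating the report: %v", err)
	}
}
```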
report.go

```diff
@@ -9,6 +9,7 @@ import (
 )
 
 type report struct {
+	format string
 	baseURL string
 	records []record
 }
@@ -19,7 +20,15 @@ type record struct {
 	internal bool
 }
 
-func newReport(baseURL string, pages map[string]pageStat) report {
+func (r record) linkType() string {
+	if r.internal {
+		return "internal"
+	}
+
+	return "external"
+}
+
+func newReport(format, baseURL string, pages map[string]pageStat) report {
 	records := make([]record, 0)
 
 	for link, stats := range maps.All(pages) {
@@ -33,6 +42,7 @@ func newReport(baseURL string, pages map[string]pageStat) report {
 	}
 
 	report := report{
+		format: format,
 		baseURL: baseURL,
 		records: records,
 	}
@@ -55,6 +65,15 @@ func (r *report) sortRecords() {
 }
 
+func (r report) String() string {
+	switch r.format {
+	case "csv":
+		return r.csv()
+	default:
+		return r.text()
+	}
+}
+
-func (r report) String() string {
+func (r report) text() string {
 	var builder strings.Builder
 
 	titlebar := strings.Repeat("\u2500", 80)
@@ -64,20 +83,25 @@ func (r report) String() string {
 	builder.WriteString("\n" + titlebar)
 
 	for ind := range slices.All(r.records) {
-		linkType := "internal"
-		if !r.records[ind].internal {
-			linkType = "external"
-		}
-
 		links := "links"
 		if r.records[ind].count == 1 {
 			links = "link"
 		}
 
-		builder.WriteString("\nFound " + strconv.Itoa(r.records[ind].count) + " " + linkType + " " + links + " to " + r.records[ind].link)
+		builder.WriteString("\nFound " + strconv.Itoa(r.records[ind].count) + " " + r.records[ind].linkType() + " " + links + " to " + r.records[ind].link)
 	}
 
 	return builder.String()
 }
+
+func (r report) csv() string {
+	var builder strings.Builder
+
+	builder.WriteString("LINK,TYPE,COUNT")
+
+	for ind := range slices.All(r.records) {
+		builder.WriteString("\n" + r.records[ind].link + "," + r.records[ind].linkType() + "," + strconv.Itoa(r.records[ind].count))
+	}
+
+	builder.WriteString("\n")
+
+	return builder.String()
+}
```
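One design note on `csv()`: it concatenates fields by hand, so a link containing a comma or quote would corrupt its row. The standard library's `encoding/csv` quotes such fields automatically. A minimal sketch of that alternative, not what this commit does; it assumes the `record` fields shown in the diff above and the `encoding/csv`, `strconv`, and `strings` imports:

```go
// csvStdlib is a hypothetical alternative to report.csv() (not in this
// commit). csv.Writer escapes commas and quotes that the manual string
// concatenation above would pass through unquoted.
func (r report) csvStdlib() string {
	var builder strings.Builder

	w := csv.NewWriter(&builder)
	_ = w.Write([]string{"LINK", "TYPE", "COUNT"})
	for _, rec := range r.records {
		_ = w.Write([]string{rec.link, rec.linkType(), strconv.Itoa(rec.count)})
	}
	w.Flush() // push any buffered rows into the builder

	return builder.String()
}
```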
magefile.go

```diff
@@ -53,8 +53,8 @@ func Lint() error {
 }
 
 // Build build the executable.
-// To rebuild packages that are already up-to-date set PROJECT_BUILD_REBUILD_ALL=1
-// To enable verbose mode set PROJECT_BUILD_VERBOSE=1
+// To rebuild packages that are already up-to-date set CRAWLER_BUILD_REBUILD_ALL=1
+// To enable verbose mode set CRAWLER_BUILD_VERBOSE=1
 func Build() error {
 	main := "."
 	//flags := ldflags()
```
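Per the updated comments, the renamed variables drive the `Build` target, so a verbose full rebuild would presumably be invoked like this (hedged; how the magefile reads the variables is outside this diff):

```
CRAWLER_BUILD_REBUILD_ALL=1 CRAWLER_BUILD_VERBOSE=1 mage build
```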
main.go (11 lines changed)

```diff
@@ -23,10 +23,15 @@ func run() error {
 	var (
 		maxWorkers int
 		maxPages int
+		format string
+		file string
 	)
 
 	flag.IntVar(&maxWorkers, "max-workers", 2, "The maximum number of concurrent workers")
 	flag.IntVar(&maxPages, "max-pages", 10, "The maximum number of pages to discover before stopping the crawl")
+	flag.StringVar(&format, "format", "text", "The format of the report. Can be 'text' or 'csv'")
+	flag.StringVar(&file, "file", "", "The file to save the report to")
 
 	flag.Parse()
 
 	if flag.NArg() < 1 {
@@ -35,7 +40,7 @@ func run() error {
 
 	baseURL := flag.Arg(0)
 
-	c, err := crawler.NewCrawler(baseURL, maxWorkers, maxPages)
+	c, err := crawler.NewCrawler(baseURL, maxWorkers, maxPages, format, file)
 	if err != nil {
 		return fmt.Errorf("unable to create the crawler: %w", err)
 	}
@@ -44,7 +49,9 @@ func run() error {
 
 	c.Wait()
 
-	c.PrintReport()
+	if err := c.GenerateReport(); err != nil {
+		return fmt.Errorf("unable to generate the report: %w", err)
+	}
 
 	return nil
 }
```
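End to end, a run exercising the new flags should finish with the success message printed by `GenerateReport` in `crawler.go` above (the blank line before it comes from the `\n` in the `fmt.Println` call; the path is whatever was passed via `--file`):

```
$ mkdir -p reports
$ ./crawler --format csv --file reports/report.csv https://crawler-test.com

Successfully saved the report to reports/report.csv
```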