diff --git a/.forgejo/actions/mage/Dockerfile b/.forgejo/actions/mage/Dockerfile
new file mode 100644
index 0000000..ac36f20
--- /dev/null
+++ b/.forgejo/actions/mage/Dockerfile
@@ -0,0 +1,6 @@
+# syntax=docker/dockerfile:1
+FROM golang:1.23.0
+
+RUN go install github.com/magefile/mage@v1.15.0
+
+ENTRYPOINT ["mage"]
diff --git a/.forgejo/actions/mage/action.yaml b/.forgejo/actions/mage/action.yaml
new file mode 100644
index 0000000..ae3718d
--- /dev/null
+++ b/.forgejo/actions/mage/action.yaml
@@ -0,0 +1,16 @@
+---
+name: "Mage Action"
+description: "Runs a mage target defined in the project's repository"
+
+inputs:
+  target:
+    description: "The mage target to run"
+    required: true
+
+runs:
+  using: "docker"
+  image: "Dockerfile"
+  entrypoint: "mage"
+  args:
+    - -v
+    - ${{ inputs.target }}
diff --git a/.forgejo/workflows/Tests.yaml b/.forgejo/workflows/Tests.yaml
new file mode 100644
index 0000000..366a8cc
--- /dev/null
+++ b/.forgejo/workflows/Tests.yaml
@@ -0,0 +1,23 @@
+---
+name: Tests
+
+on:
+  pull_request:
+    types:
+      - opened
+      - synchronize
+
+jobs:
+  test:
+    if: ${{ ! github.event.pull_request.draft }}
+    runs-on: docker
+    steps:
+      - name: Checkout Repository
+        uses: https://code.forgejo.org/actions/checkout@v4
+      - name: Test
+        uses: ./.forgejo/actions/mage
+        with:
+          target: test
+        env:
+          CRAWLER_TEST_COVER: "1"
+          CRAWLER_TEST_VERBOSE: "1"
diff --git a/.forgejo/workflows/workflow.yaml b/.forgejo/workflows/workflow.yaml
deleted file mode 100644
index e08ccb4..0000000
--- a/.forgejo/workflows/workflow.yaml
+++ /dev/null
@@ -1,37 +0,0 @@
----
-on:
-  pull_request:
-    types:
-      - opened
-      - reopened
-      - synchronize
-
-jobs:
-  test:
-    runs-on: docker
-    env:
-      GO_TEST_VERBOSE: "1"
-      GO_TEST_COVER: "1"
-    steps:
-      - name: Checkout Repository
-        uses: https://code.forgejo.org/actions/checkout@v4
-      - name: Setup Go
-        uses: https://code.forgejo.org/actions/setup-go@v5
-        with:
-          go-version: '1.22'
-      - name: Test
-        run: go run magefiles/main.go -v test
-
-  lint:
-    runs-on: docker
-    steps:
-      - name: Checkout Repository
-        uses: https://code.forgejo.org/actions/checkout@v4
-      - name: Setup Go
-        uses: https://code.forgejo.org/actions/setup-go@v5
-        with:
-          go-version: '1.22'
-      - name: Lint
-        uses: https://github.com/golangci/golangci-lint-action@v3
-        with:
-          version: v1.54
diff --git a/.gitignore b/.gitignore
index e200850..74d6f60 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1 @@
-/__build/*
-!__build/.gitkeep
+/crawler
diff --git a/.golangci.yaml b/.golangci.yaml
index 8549273..dd8e3ee 100644
--- a/.golangci.yaml
+++ b/.golangci.yaml
@@ -6,7 +6,7 @@ run:
   tests: true
 
 output:
-  format: colored-line-number
+  formats: colored-line-number
   print-issues-lines: true
   print-linter-name: true
   uniq-by-line: true
@@ -18,5 +18,7 @@ linters-settings:
 
 linters:
   enable-all: true
-  # disable:
+  disable:
+    - execinquery
+    - gomnd
   fast: false
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..5200de6
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2024 Dan Anglin
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this
+software and associated documentation files (the “Software”), to deal in the Software
+without restriction, including without limitation the rights to use, copy, modify, merge,
+publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons
+to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or
+substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/__build/.gitkeep b/__build/.gitkeep
deleted file mode 100644
index e69de29..0000000
diff --git a/go.mod b/go.mod
new file mode 100644
index 0000000..6e9f95f
--- /dev/null
+++ b/go.mod
@@ -0,0 +1,5 @@
+module codeflow.dananglin.me.uk/apollo/web-crawler
+
+go 1.23.0
+
+require golang.org/x/net v0.28.0
diff --git a/go.sum b/go.sum
new file mode 100644
index 0000000..e890837
--- /dev/null
+++ b/go.sum
@@ -0,0 +1,2 @@
+golang.org/x/net v0.28.0 h1:a9JDOJc5GMUJ0+UDqmLT86WiEy7iWyIhz8gz8E4e5hE=
+golang.org/x/net v0.28.0/go.mod h1:yqtgsTWOOnlGLG9GFRrK3++bGOUEkNBoHZc8MEDWPNg=
diff --git a/internal/crawler/crawler.go b/internal/crawler/crawler.go
new file mode 100644
index 0000000..bf25d5b
--- /dev/null
+++ b/internal/crawler/crawler.go
@@ -0,0 +1,168 @@
+package crawler
+
+import (
+	"fmt"
+	"net/url"
+	"os"
+	"sync"
+
+	"codeflow.dananglin.me.uk/apollo/web-crawler/internal/report"
+	"codeflow.dananglin.me.uk/apollo/web-crawler/internal/util"
+)
+
+type Crawler struct {
+	pages              map[string]int
+	baseURL            *url.URL
+	mu                 *sync.Mutex
+	concurrencyControl chan struct{}
+	wg                 *sync.WaitGroup
+	maxPages           int
+}
+
+func NewCrawler(rawBaseURL string, maxConcurrency, maxPages int) (*Crawler, error) {
+	baseURL, err := url.Parse(rawBaseURL)
+	if err != nil {
+		return nil, fmt.Errorf("unable to parse the base URL: %w", err)
+	}
+
+	var waitGroup sync.WaitGroup
+
+	waitGroup.Add(1)
+
+	crawler := Crawler{
+		pages:              make(map[string]int),
+		baseURL:            baseURL,
+		mu:                 &sync.Mutex{},
+		concurrencyControl: make(chan struct{}, maxConcurrency),
+		wg:                 &waitGroup,
+		maxPages:           maxPages,
+	}
+
+	return &crawler, nil
+}
+
+func (c *Crawler) Crawl(rawCurrentURL string) {
+	// Acquire a slot in the concurrency control channel.
+	c.concurrencyControl <- struct{}{}
+
+	// Decrement the wait group counter and free up the channel when finished
+	// crawling.
+	defer func() {
+		<-c.concurrencyControl
+		c.wg.Done()
+	}()
+
+	if c.reachedMaxPages() {
+		return
+	}
+
+	// If the current URL is not on the same domain as the base URL then return early.
+	hasEqualDomain, err := c.HasEqualDomain(rawCurrentURL)
+	if err != nil {
+		fmt.Printf(
+			"WARNING: Unable to determine if %q has the same domain as %q; %v.\n",
+			rawCurrentURL,
+			c.baseURL.Hostname(),
+			err,
+		)
+
+		return
+	}
+
+	if !hasEqualDomain {
+		return
+	}
+
+	// Get the normalised version of rawCurrentURL.
+	normalisedCurrentURL, err := util.NormaliseURL(rawCurrentURL)
+	if err != nil {
+		fmt.Printf("WARNING: Error normalising %q: %v.\n", rawCurrentURL, err)
+
+		return
+	}
+
+	// Add (or update) a record of the URL in the pages map.
+	// If there's already an entry for the URL in the map then return early.
+	if existed := c.AddPageVisit(normalisedCurrentURL); existed {
+		return
+	}
+
+	// Fetch the HTML document from the current URL.
+	fmt.Printf("Crawling %q\n", rawCurrentURL)
+
+	htmlDoc, err := getHTML(rawCurrentURL)
+	if err != nil {
+		fmt.Printf(
+			"WARNING: Error retrieving the HTML document from %q: %v.\n",
+			rawCurrentURL,
+			err,
+		)
+
+		return
+	}
+
+	// Get all the URLs from the HTML document.
+	links, err := util.GetURLsFromHTML(htmlDoc, c.baseURL.String())
+	if err != nil {
+		fmt.Printf(
+			"WARNING: Error retrieving the links from the HTML document: %v.\n",
+			err,
+		)
+
+		return
+	}
+
+	// Recursively crawl each URL on the page.
+	for ind := range len(links) {
+		c.wg.Add(1)
+
+		go c.Crawl(links[ind])
+	}
+}
+
+func (c *Crawler) HasEqualDomain(rawURL string) (bool, error) {
+	parsedRawURL, err := url.Parse(rawURL)
+	if err != nil {
+		return false, fmt.Errorf("error parsing the URL %q: %w", rawURL, err)
+	}
+
+	return c.baseURL.Hostname() == parsedRawURL.Hostname(), nil
+}
+
+// AddPageVisit adds a record of the visited page's URL to the pages map.
+// If there is already a record of the URL then its count is incremented
+// and the method returns true. If the URL is not already recorded then a
+// new record is created and the method returns false.
+func (c *Crawler) AddPageVisit(normalisedURL string) bool {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+
+	_, exists := c.pages[normalisedURL]
+
+	if exists {
+		c.pages[normalisedURL]++
+	} else {
+		c.pages[normalisedURL] = 1
+	}
+
+	return exists
+}
+
+func (c *Crawler) Wait() {
+	c.wg.Wait()
+}
+
+func (c *Crawler) PrintReport() {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+
+	r := report.NewReport(c.baseURL.String(), c.pages)
+
+	fmt.Fprint(os.Stdout, r)
+}
+
+func (c *Crawler) reachedMaxPages() bool {
+	c.mu.Lock()
+	defer c.mu.Unlock()

+	return len(c.pages) >= c.maxPages
+}
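Crawl caps fan-out with a buffered channel used as a semaphore, while the WaitGroup (seeded with 1 in NewCrawler for the initial call) lets Wait block until the recursion drains. A minimal, self-contained sketch of that pattern, with a print standing in for the real page fetch:

```go
package main

import (
	"fmt"
	"sync"
)

func main() {
	const maxConcurrency = 2

	// Buffered channel as a semaphore: sends block once
	// maxConcurrency goroutines hold a slot.
	semaphore := make(chan struct{}, maxConcurrency)

	var wg sync.WaitGroup

	for page := range 5 {
		wg.Add(1)

		go func() {
			semaphore <- struct{}{} // acquire a slot

			defer func() {
				<-semaphore // release the slot
				wg.Done()
			}()

			fmt.Println("visiting page", page)
		}()
	}

	wg.Wait() // what Crawler.Wait does
}
```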
diff --git a/internal/crawler/crawler_test.go b/internal/crawler/crawler_test.go
new file mode 100644
index 0000000..4ccfac8
--- /dev/null
+++ b/internal/crawler/crawler_test.go
@@ -0,0 +1,172 @@
+package crawler_test
+
+import (
+	"fmt"
+	"slices"
+	"testing"
+
+	"codeflow.dananglin.me.uk/apollo/web-crawler/internal/crawler"
+	"codeflow.dananglin.me.uk/apollo/web-crawler/internal/util"
+)
+
+func TestCrawler(t *testing.T) {
+	testBaseURL := "https://example.com"
+
+	testCrawler, err := crawler.NewCrawler(testBaseURL, 1, 10)
+	if err != nil {
+		t.Fatalf("Test 'TestCrawler' FAILED: unexpected error creating the crawler: %v", err)
+	}
+
+	testCasesForEqualDomains := []struct {
+		name   string
+		rawURL string
+		want   bool
+	}{
+		{
+			name:   "Same domain",
+			rawURL: "https://example.com",
+			want:   true,
+		},
+		{
+			name:   "Same domain, different path",
+			rawURL: "https://example.com/about/contact",
+			want:   true,
+		},
+		{
+			name:   "Same domain, different protocol",
+			rawURL: "http://example.com",
+			want:   true,
+		},
+		{
+			name:   "Different domain",
+			rawURL: "https://blog.person.me.uk",
+			want:   false,
+		},
+		{
+			name:   "Different domain, same path",
+			rawURL: "https://example.org/blog",
+			want:   false,
+		},
+	}
+
+	for ind, tc := range slices.All(testCasesForEqualDomains) {
+		t.Run(tc.name, testHasEqualDomains(
+			testCrawler,
+			ind+1,
+			tc.name,
+			tc.rawURL,
+			tc.want,
+		))
+	}
+
+	testCasesForPages := []struct {
+		rawURL      string
+		wantVisited bool
+	}{
+		{
+			rawURL:      "https://example.com/tags/linux",
+			wantVisited: false,
+		},
+		{
+			rawURL:      "https://example.com/blog",
+			wantVisited: false,
+		},
+		{
+			rawURL:      "https://example.com/about/contact.html",
+			wantVisited: false,
+		},
+		{
+			rawURL:      "https://example.com/blog",
+			wantVisited: true,
+		},
+	}
+
+	for ind, tc := range slices.All(testCasesForPages) {
+		name := fmt.Sprintf("Adding %s to the pages map", tc.rawURL)
+		t.Run(name, testAddPageVisit(
+			testCrawler,
+			ind+1,
+			name,
+			tc.rawURL,
+			tc.wantVisited,
+		))
+	}
+}
+
+func testHasEqualDomains(
+	testCrawler *crawler.Crawler,
+	testNum int,
+	testName string,
+	rawURL string,
+	want bool,
+) func(t *testing.T) {
+	return func(t *testing.T) {
+		t.Parallel()
+
+		got, err := testCrawler.HasEqualDomain(rawURL)
+		if err != nil {
+			t.Fatalf(
+				"Test %d - '%s' FAILED: unexpected error: %v",
+				testNum,
+				testName,
+				err,
+			)
+		}
+
+		if got != want {
+			t.Errorf(
+				"Test %d - '%s' FAILED: unexpected domain comparison received: want %t, got %t",
+				testNum,
+				testName,
+				want,
+				got,
+			)
+		} else {
+			t.Logf(
+				"Test %d - '%s' PASSED: expected domain comparison received: got %t",
+				testNum,
+				testName,
+				got,
+			)
+		}
+	}
+}
+
+func testAddPageVisit(
+	testCrawler *crawler.Crawler,
+	testNum int,
+	testName string,
+	rawURL string,
+	wantVisited bool,
+) func(t *testing.T) {
+	return func(t *testing.T) {
+		normalisedURL, err := util.NormaliseURL(rawURL)
+		if err != nil {
+			t.Fatalf(
+				"Test %d - '%s' FAILED: unexpected error: %v",
+				testNum,
+				testName,
+				err,
+			)
+		}
+
+		gotVisited := testCrawler.AddPageVisit(normalisedURL)
+
+		if gotVisited != wantVisited {
+			t.Errorf(
+				"Test %d - '%s' FAILED: unexpected bool returned after updating the pages record: want %t, got %t",
+				testNum,
+				testName,
+				wantVisited,
+				gotVisited,
+			)
+		} else {
+			t.Logf(
+				"Test %d - '%s' PASSED: expected bool returned after updating the pages record: got %t",
+				testNum,
+				testName,
+				gotVisited,
+			)
+		}
+	}
+}
fmt.Sprintf("Adding %s to the pages map", tc.rawURL) + t.Run(name, testAddPageVisit( + testCrawler, + ind+1, + name, + tc.rawURL, + tc.wantVisited, + )) + } +} + +func testHasEqualDomains( + testCrawler *crawler.Crawler, + testNum int, + testName string, + rawURL string, + want bool, +) func(t *testing.T) { + return func(t *testing.T) { + t.Parallel() + + got, err := testCrawler.HasEqualDomain(rawURL) + if err != nil { + t.Fatalf( + "Test %d - '%s' FAILED: unexpected error: %v", + testNum, + testName, + err, + ) + } + + if got != want { + t.Errorf( + "Test %d - '%s' FAILED: unexpected domain comparison received: want %t, got %t", + testNum, + testName, + want, + got, + ) + } else { + t.Logf( + "Test %d - '%s' PASSED: expected domain comparison received: got %t", + testNum, + testName, + got, + ) + } + } +} + +func testAddPageVisit( + testCrawler *crawler.Crawler, + testNum int, + testName string, + rawURL string, + wantVisited bool, +) func(t *testing.T) { + return func(t *testing.T) { + normalisedURL, err := util.NormaliseURL(rawURL) + if err != nil { + t.Fatalf( + "Test %d - '%s' FAILED: unexpected error: %v", + testNum, + testName, + err, + ) + } + + gotVisited := testCrawler.AddPageVisit(normalisedURL) + + if gotVisited != wantVisited { + t.Errorf( + "Test %d - '%s' FAILED: unexpected bool returned after updated pages record: want %t, got %t", + testNum, + testName, + wantVisited, + gotVisited, + ) + } else { + t.Logf( + "Test %d - '%s' PASSED: expected bool returned after updated pages record: got %t", + testNum, + testName, + gotVisited, + ) + } + } +} diff --git a/internal/crawler/gethtml.go b/internal/crawler/gethtml.go new file mode 100644 index 0000000..c475ae1 --- /dev/null +++ b/internal/crawler/gethtml.go @@ -0,0 +1,50 @@ +package crawler + +import ( + "context" + "fmt" + "io" + "net/http" + "strings" + "time" +) + +func getHTML(rawURL string) (string, error) { + ctx, cancel := context.WithTimeout(context.Background(), time.Duration(10*time.Second)) + defer cancel() + + request, err := http.NewRequestWithContext(ctx, http.MethodGet, rawURL, nil) + if err != nil { + return "", fmt.Errorf("error creating the HTTP request: %w", err) + } + + client := http.Client{} + + resp, err := client.Do(request) + if err != nil { + return "", fmt.Errorf("error getting the response: %w", err) + } + + defer resp.Body.Close() + + if resp.StatusCode >= 400 { + return "", fmt.Errorf( + "received a bad status from %s: (%d) %s", + rawURL, + resp.StatusCode, + resp.Status, + ) + } + + contentType := resp.Header.Get("content-type") + if !strings.Contains(contentType, "text/html") { + return "", fmt.Errorf("unexpected content type received: want text/html, got %s", contentType) + } + + data, err := io.ReadAll(resp.Body) + if err != nil { + return "", fmt.Errorf("error reading the data from the response: %w", err) + } + + return string(data), nil +} diff --git a/internal/report/report.go b/internal/report/report.go new file mode 100644 index 0000000..b28a393 --- /dev/null +++ b/internal/report/report.go @@ -0,0 +1,66 @@ +package report + +import ( + "cmp" + "maps" + "slices" + "strconv" + "strings" +) + +type Report struct { + baseURL string + records []Record +} + +type Record struct { + link string + count int +} + +func NewReport(baseURL string, pages map[string]int) Report { + records := make([]Record, 0) + + for link, count := range maps.All(pages) { + records = append(records, Record{link: link, count: count}) + } + + report := Report{ + baseURL: baseURL, + records: records, + } + + 
diff --git a/internal/report/report.go b/internal/report/report.go
new file mode 100644
index 0000000..b28a393
--- /dev/null
+++ b/internal/report/report.go
@@ -0,0 +1,66 @@
+package report
+
+import (
+	"cmp"
+	"maps"
+	"slices"
+	"strconv"
+	"strings"
+)
+
+type Report struct {
+	baseURL string
+	records []Record
+}
+
+type Record struct {
+	link  string
+	count int
+}
+
+func NewReport(baseURL string, pages map[string]int) Report {
+	records := make([]Record, 0)
+
+	for link, count := range maps.All(pages) {
+		records = append(records, Record{link: link, count: count})
+	}
+
+	report := Report{
+		baseURL: baseURL,
+		records: records,
+	}
+
+	report.SortRecords()
+
+	return report
+}
+
+func (r *Report) SortRecords() {
+	// Sort the records by count in descending order,
+	// then by link in lexicographic order when two records share the same count.
+	slices.SortFunc(r.records, func(a, b Record) int {
+		if n := cmp.Compare(a.count, b.count); n != 0 {
+			return -1 * n
+		}
+
+		return strings.Compare(a.link, b.link)
+	})
+}
+
+func (r Report) String() string {
+	var builder strings.Builder
+
+	titlebar := strings.Repeat("\u2500", 80)
+
+	builder.WriteString("\n" + titlebar)
+	builder.WriteString("\n" + "REPORT for " + r.baseURL)
+	builder.WriteString("\n" + titlebar)
+
+	for ind := range slices.All(r.records) {
+		builder.WriteString("\nFound " + strconv.Itoa(r.records[ind].count) + " internal links to " + r.records[ind].link)
+	}
+
+	builder.WriteString("\n")
+
+	return builder.String()
+}
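SortRecords negates the count comparison to get a descending order, then falls back to comparing links for equal counts. The same comparator outside the Report type (record here is a stand-in for the unexported Record struct):

```go
package main

import (
	"cmp"
	"fmt"
	"slices"
	"strings"
)

type record struct {
	link  string
	count int
}

func main() {
	records := []record{
		{link: "example.com/blog", count: 2},
		{link: "example.com/about", count: 2},
		{link: "example.com", count: 5},
	}

	// Highest count first; equal counts fall back to link order.
	slices.SortFunc(records, func(a, b record) int {
		if n := cmp.Compare(a.count, b.count); n != 0 {
			return -1 * n
		}

		return strings.Compare(a.link, b.link)
	})

	fmt.Println(records)
	// Output: [{example.com 5} {example.com/about 2} {example.com/blog 2}]
}
```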
"https://github.com/ben-bartlett", + "https://mastodon.ben-bartlett.me.uk", + "https://ben-bartlett.me.uk/blog", + "https://ben-bartlett.me.uk/projects/orange-juice", + "https://ben-bartlett.me.uk/projects/mustangs", + "https://ben-bartlett.me.uk/projects/honeycombs", + }, + }, + { + name: "HTML documentation using https://simple.cooking", + filepath: "testdata/GetURLFromHTML/my-simple-cooking-website.html", + baseURL: "https://simple.cooking", + want: []string{ + "https://simple.cooking/recipes/sweet-n-sour-kung-pao-style-chicken", + "https://simple.cooking/recipes/beef-and-broccoli", + "https://simple.cooking/recipes/asian-glazed-salmon", + "https://simple.cooking/recipes/caesar-salad", + "https://simple.cooking/recipes/simple-tuna-salad", + "https://simple.cooking/recipes/wholemeal-pizza", + "https://simple.cooking/news", + "https://simple.cooking/about/contact", + "https://the-other-site.example.new/home", + }, + }, + } + + for _, tc := range slices.All(cases) { + t.Run(tc.name, testGetURLsFromHTML(tc.filepath, tc.baseURL, tc.want)) + } +} + +func testGetURLsFromHTML(path, baseURL string, want []string) func(t *testing.T) { + failedTestPrefix := "Test TestGetURLsFromHTML FAILED:" + + return func(t *testing.T) { + t.Parallel() + + htmlDoc, err := os.ReadFile(path) + if err != nil { + t.Fatalf("%s unable to open read data from %s: %v", failedTestPrefix, path, err) + } + + got, err := util.GetURLsFromHTML(string(htmlDoc), baseURL) + if err != nil { + t.Fatalf( + "Test TestGetURLsFromHTML FAILED: unexpected error: %v", + err, + ) + } + + if !reflect.DeepEqual(want, got) { + t.Errorf( + "Test TestGetURLsFromHTML FAILED: unexpected URLs found in HTML body: want %v, got %v", + want, + got, + ) + } else { + t.Logf( + "Test TestGetURLsFromHTML PASSED: expected URLs found in HTML body: got %v", + got, + ) + } + } +} diff --git a/internal/util/testdata/GetURLFromHTML/ben-bartlett.html b/internal/util/testdata/GetURLFromHTML/ben-bartlett.html new file mode 100644 index 0000000..8c05bf6 --- /dev/null +++ b/internal/util/testdata/GetURLFromHTML/ben-bartlett.html @@ -0,0 +1,34 @@ + + + + + Ben Bartlett + + + +
diff --git a/internal/util/testdata/GetURLFromHTML/ben-bartlett.html b/internal/util/testdata/GetURLFromHTML/ben-bartlett.html
new file mode 100644
index 0000000..8c05bf6
--- /dev/null
+++ b/internal/util/testdata/GetURLFromHTML/ben-bartlett.html
@@ -0,0 +1,34 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <title>Ben Bartlett</title>
+</head>
+<body>
+    <header>
+        <h1>Ben Bartlett</h1>
+        <nav>
+            <ul>
+                <li><a href="https://ben-bartlett.me.uk">Home</a></li>
+                <li><a href="https://github.com/ben-bartlett">GitHub</a></li>
+                <li><a href="https://mastodon.ben-bartlett.me.uk">Mastodon</a></li>
+                <li><a href="https://ben-bartlett.me.uk/blog">Blog</a></li>
+            </ul>
+        </nav>
+    </header>
+
+    <main>
+        <p>
+            Hey there! Ben Bartlett here. I am a Backend software engineer working in the healthcare industry. At night I am a hobbyist developer of 2D games. When I'm not coding I would find myself cooking, reading engaging novels, and going on the occasional hike or two.
+        </p>
+
+        <h2>Projects I'm working on</h2>
+        <ul>
+            <li><a href="https://ben-bartlett.me.uk/projects/orange-juice">Orange Juice</a></li>
+            <li><a href="https://ben-bartlett.me.uk/projects/mustangs">Mustangs</a></li>
+            <li><a href="https://ben-bartlett.me.uk/projects/honeycombs">Honeycombs</a></li>
+        </ul>
+    </main>
+
+</body>
+</html>
diff --git a/internal/util/testdata/GetURLFromHTML/blog.boot.dev.html b/internal/util/testdata/GetURLFromHTML/blog.boot.dev.html
new file mode 100644
index 0000000..853fe1e
--- /dev/null
+++ b/internal/util/testdata/GetURLFromHTML/blog.boot.dev.html
@@ -0,0 +1,10 @@
+<html>
+	<body>
+		<a href="/path/one">
+			<span>Boot.dev</span>
+		</a>
+		<a href="https://other.com/path/one">
+			<span>Boot.dev</span>
+		</a>
+	</body>
+</html>
diff --git a/internal/util/testdata/GetURLFromHTML/my-simple-cooking-website.html b/internal/util/testdata/GetURLFromHTML/my-simple-cooking-website.html
new file mode 100644
index 0000000..8d0041f
--- /dev/null
+++ b/internal/util/testdata/GetURLFromHTML/my-simple-cooking-website.html
@@ -0,0 +1,37 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <title>My simple cooking website</title>
+</head>
+<body>
+    <header>
+        <h1>My simple cooking website</h1>
+    </header>
+
+    <main>
+        <p>
+            Find my favourite recipes here.
+        </p>
+
+        <h2>Recipes</h2>
+        <ul>
+            <li><a href="https://simple.cooking/recipes/sweet-n-sour-kung-pao-style-chicken">Sweet 'n' sour kung pao-style chicken</a></li>
+            <li><a href="https://simple.cooking/recipes/beef-and-broccoli">Beef and broccoli</a></li>
+            <li><a href="https://simple.cooking/recipes/asian-glazed-salmon">Asian glazed salmon</a></li>
+            <li><a href="https://simple.cooking/recipes/caesar-salad">Caesar salad</a></li>
+            <li><a href="https://simple.cooking/recipes/simple-tuna-salad">Simple tuna salad</a></li>
+            <li><a href="https://simple.cooking/recipes/wholemeal-pizza">Wholemeal pizza</a></li>
+        </ul>
+    </main>
+    <footer>
+        <nav>
+            <ul>
+                <li><a href="https://simple.cooking/news">News</a></li>
+                <li><a href="https://simple.cooking/about/contact">Contact</a></li>
+                <li><a href="https://the-other-site.example.new/home">The other site</a></li>
+            </ul>
+        </nav>
+    </footer>
+</body>
+</html>
diff --git a/internal/util/url.go b/internal/util/url.go
new file mode 100644
index 0000000..5e270c5
--- /dev/null
+++ b/internal/util/url.go
@@ -0,0 +1,18 @@
+package util
+
+import (
+	"fmt"
+	"net/url"
+	"strings"
+)
+
+func NormaliseURL(rawURL string) (string, error) {
+	const normalisedFormat string = "%s%s"
+
+	parsedURL, err := url.Parse(rawURL)
+	if err != nil {
+		return "", fmt.Errorf("error parsing the URL %q: %w", rawURL, err)
+	}
+
+	return fmt.Sprintf(normalisedFormat, parsedURL.Hostname(), strings.TrimSuffix(parsedURL.Path, "/")), nil
+}
diff --git a/internal/util/url_test.go b/internal/util/url_test.go
new file mode 100644
index 0000000..99f591b
--- /dev/null
+++ b/internal/util/url_test.go
@@ -0,0 +1,81 @@
+package util_test
+
+import (
+	"slices"
+	"testing"
+
+	"codeflow.dananglin.me.uk/apollo/web-crawler/internal/util"
+)
+
+func TestNormaliseURL(t *testing.T) {
+	t.Parallel()
+
+	wantNormalisedURL := "blog.boot.dev/path"
+
+	cases := []struct {
+		name     string
+		inputURL string
+	}{
+		{
+			name:     "remove HTTPS scheme",
+			inputURL: "https://blog.boot.dev/path",
+		},
+		{
+			name:     "remove HTTP scheme",
+			inputURL: "http://blog.boot.dev/path",
+		},
+		{
+			name:     "remove HTTPS scheme with a trailing slash",
+			inputURL: "https://blog.boot.dev/path/",
+		},
+		{
+			name:     "remove HTTP scheme with a trailing slash",
+			inputURL: "http://blog.boot.dev/path/",
+		},
+		{
+			name:     "remove HTTPS scheme with port 443",
+			inputURL: "https://blog.boot.dev:443/path",
+		},
+		{
+			name:     "remove HTTP scheme with port 80",
+			inputURL: "http://blog.boot.dev:80/path",
+		},
+		{
+			name:     "normalised URL",
+			inputURL: "blog.boot.dev/path",
+		},
+	}
+
+	for ind, tc := range slices.All(cases) {
+		t.Run(tc.name, func(t *testing.T) {
+			t.Parallel()
+
+			got, err := util.NormaliseURL(tc.inputURL)
+			if err != nil {
+				t.Fatalf(
+					"Test %d - '%s' FAILED: unexpected error: %v",
+					ind,
+					tc.name,
+					err,
+				)
+			}
+
+			if got != wantNormalisedURL {
+				t.Errorf(
+					"Test %d - '%s' FAILED: unexpected normalised URL returned: want %s, got %s",
+					ind,
+					tc.name,
+					wantNormalisedURL,
+					got,
+				)
+			} else {
+				t.Logf(
+					"Test %d - '%s' PASSED: expected normalised URL returned: got %s",
+					ind,
+					tc.name,
+					got,
+				)
+			}
+		})
+	}
+}
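NormaliseURL keys the pages map on hostname plus path, so differences in scheme, port, and trailing slash collapse into a single record. Both inputs below should normalise identically (again, a sketch that assumes it runs inside this module):

```go
package main

import (
	"fmt"

	"codeflow.dananglin.me.uk/apollo/web-crawler/internal/util"
)

func main() {
	for _, raw := range []string{
		"https://blog.boot.dev/path/",
		"http://blog.boot.dev:80/path",
	} {
		normalised, err := util.NormaliseURL(raw)
		if err != nil {
			panic(err)
		}

		fmt.Println(normalised) // both print: blog.boot.dev/path
	}
}
```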
diff --git a/magefiles/go.mod b/magefiles/go.mod
new file mode 100644
index 0000000..402c8a6
--- /dev/null
+++ b/magefiles/go.mod
@@ -0,0 +1,5 @@
+module codeflow.dananglin.me.uk/apollo/web-crawler/magefiles
+
+go 1.23.0
+
+require github.com/magefile/mage v1.15.0
diff --git a/magefiles/go.sum b/magefiles/go.sum
new file mode 100644
index 0000000..4ee1b87
--- /dev/null
+++ b/magefiles/go.sum
@@ -0,0 +1,2 @@
+github.com/magefile/mage v1.15.0 h1:BvGheCMAsG3bWUDbZ8AyXXpCNwU9u5CB6sM+HNb9HYg=
+github.com/magefile/mage v1.15.0/go.mod h1:z5UZb/iS3GoOSn0JgWuiw7dxlurVYTu+/jHXqQg881A=
diff --git a/magefiles/mage.go b/magefiles/mage.go
index 1a12267..cac393d 100644
--- a/magefiles/mage.go
+++ b/magefiles/mage.go
@@ -14,23 +14,23 @@ import (
 )
 
 const (
-	app                  = "binary"
+	app                  = "crawler"
 	defaultInstallPrefix = "/usr/local"
 
-	envInstallPrefix   = "PROJECT_INSTALL_PREFIX"
-	envTestVerbose     = "PROJECT_TEST_VERBOSE"
-	envTestCover       = "PROJECT_TEST_COVER"
-	envBuildRebuildAll = "PROJECT_BUILD_REBUILD_ALL"
-	envBuildVerbose    = "PROJECT_BUILD_VERBOSE"
+	envInstallPrefix   = "CRAWLER_INSTALL_PREFIX"
+	envTestVerbose     = "CRAWLER_TEST_VERBOSE"
+	envTestCover       = "CRAWLER_TEST_COVER"
+	envBuildRebuildAll = "CRAWLER_BUILD_REBUILD_ALL"
+	envBuildVerbose    = "CRAWLER_BUILD_VERBOSE"
 )
 
 var (
 	Default = Build
 
-	binary = "./__build/" + app
+	binary = app
 )
 
 // Test run the go tests.
-// To enable verbose mode set PROJECT_TEST_VERBOSE=1.
-// To enable coverage mode set PROJECT_TEST_COVER=1.
+// To enable verbose mode set CRAWLER_TEST_VERBOSE=1.
+// To enable coverage mode set CRAWLER_TEST_COVER=1.
 func Test() error {
 	goTest := sh.RunCmd("go", "test")
@@ -56,10 +56,10 @@ func Lint() error {
 // To rebuild packages that are already up-to-date set PROJECT_BUILD_REBUILD_ALL=1
 // To enable verbose mode set PROJECT_BUILD_VERBOSE=1
 func Build() error {
-	main := "main.go"
-	flags := ldflags()
+	main := "."
+	//flags := ldflags()
 	build := sh.RunCmd("go", "build")
-	args := []string{"-ldflags=" + flags, "-o", binary}
+	args := []string{"-ldflags=-s -w", "-o", binary}
 
 	if os.Getenv(envBuildRebuildAll) == "1" {
 		args = append(args, "-a")
diff --git a/main.go b/main.go
index a3255e8..41df9ce 100644
--- a/main.go
+++ b/main.go
@@ -3,22 +3,48 @@ package main
 import (
 	"fmt"
 	"os"
-)
+	"strconv"
 
-var (
-	binaryVersion string
-	buildTime     string
-	goVersion     string
-	gitCommit     string
+	"codeflow.dananglin.me.uk/apollo/web-crawler/internal/crawler"
 )
 
 func main() {
 	if err := run(); err != nil {
-		fmt.Printf("ERROR: %v.\n", err)
+		os.Stderr.WriteString("ERROR: " + err.Error() + "\n")
+		os.Exit(1)
 	}
 }
 
 func run() error {
+	args := os.Args[1:]
+
+	if len(args) != 3 {
+		return fmt.Errorf("unexpected number of arguments received: want 3, got %d", len(args))
+	}
+
+	baseURL := args[0]
+
+	maxConcurrency, err := strconv.Atoi(args[1])
+	if err != nil {
+		return fmt.Errorf("unable to convert the max concurrency (%s) to an integer: %w", args[1], err)
+	}
+
+	maxPages, err := strconv.Atoi(args[2])
+	if err != nil {
+		return fmt.Errorf("unable to convert the max pages (%s) to an integer: %w", args[2], err)
+	}
+
+	c, err := crawler.NewCrawler(baseURL, maxConcurrency, maxPages)
+	if err != nil {
+		return fmt.Errorf("unable to create the crawler: %w", err)
+	}
+
+	go c.Crawl(baseURL)
+
+	c.Wait()
+
+	c.PrintReport()
+
 	return nil
 }
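With this diff applied, mage builds the binary as ./crawler in the repository root (hence the new .gitignore entry), and main.go expects exactly three positional arguments: the base URL, the maximum concurrency, and the maximum number of pages. A typical invocation (the URL and limits here are illustrative):

```sh
go build -o crawler .
./crawler https://example.com 4 25
```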