web-crawler/internal/crawler/crawler_test.go
Dan Anglin d4633344f7
feat: add the web crawler
Add the source code for the web crawler. The web crawler is a simple Go
CLI application that traverses through a website and generates a report
of all the internal links found in the site.
2024-08-27 15:16:29 +01:00

172 lines
3.2 KiB
Go

package crawler_test
import (
"fmt"
"slices"
"testing"
"codeflow.dananglin.me.uk/apollo/web-crawler/internal/crawler"
"codeflow.dananglin.me.uk/apollo/web-crawler/internal/util"
)
// TestCrawler creates one crawler rooted at https://example.com and runs two
// groups of table-driven subtests against that shared instance:
//
//   - domain comparison cases, executed through testHasEqualDomains;
//   - page-visit cases, executed through testAddPageVisit.
//
// The domain subtests mark themselves parallel, so the testing package defers
// them until this function returns; the page-visit subtests run sequentially
// and in table order before that.
func TestCrawler(t *testing.T) {
	const baseURL = "https://example.com"

	sharedCrawler, err := crawler.NewCrawler(baseURL)
	if err != nil {
		t.Fatalf("Test 'TestCrawler' FAILED: unexpected error creating the crawler: %v", err)
	}

	// Table of URLs and whether each is expected to share the crawler's domain.
	type domainCase struct {
		name   string
		rawURL string
		want   bool
	}

	domainCases := []domainCase{
		{name: "Same domain", rawURL: "https://example.com", want: true},
		{name: "Same domain, different path", rawURL: "https://example.com/about/contact", want: true},
		{name: "Same domain, different protocol", rawURL: "http://example.com", want: true},
		{name: "Different domain", rawURL: "https://blog.person.me.uk", want: false},
		{name: "Different domain, same path", rawURL: "https://example.org/blog", want: false},
	}

	for idx, dc := range slices.All(domainCases) {
		// Test numbers reported in messages are 1-based.
		subtest := testHasEqualDomains(sharedCrawler, idx+1, dc.name, dc.rawURL, dc.want)
		t.Run(dc.name, subtest)
	}

	// Table of URLs fed to AddPageVisit, in order. The final entry repeats
	// the "/blog" URL and expects true, so the order of this table matters.
	// NOTE(review): the expectations presuppose that AddPageVisit reports
	// whether the URL had already been recorded — confirm against the
	// crawler package.
	type pageCase struct {
		rawURL      string
		wantVisited bool
	}

	pageCases := []pageCase{
		{rawURL: "https://example.com/tags/linux", wantVisited: false},
		{rawURL: "https://example.com/blog", wantVisited: false},
		{rawURL: "https://example.com/about/contact.html", wantVisited: false},
		{rawURL: "https://example.com/blog", wantVisited: true},
	}

	for idx, pc := range slices.All(pageCases) {
		label := fmt.Sprintf("Adding %s to the pages map", pc.rawURL)
		subtest := testAddPageVisit(sharedCrawler, idx+1, label, pc.rawURL, pc.wantVisited)
		t.Run(label, subtest)
	}
}
// testHasEqualDomains returns a subtest function that calls
// testCrawler.HasEqualDomain with rawURL and compares the result with want.
//
// testNum and testName are used only to label the failure and success
// messages. The returned subtest marks itself as parallel. An error from
// HasEqualDomain aborts the subtest; a mismatched result is reported as a
// failure, and a matching result is logged.
func testHasEqualDomains(
	testCrawler *crawler.Crawler,
	testNum int,
	testName string,
	rawURL string,
	want bool,
) func(t *testing.T) {
	return func(t *testing.T) {
		t.Parallel()

		result, err := testCrawler.HasEqualDomain(rawURL)
		if err != nil {
			t.Fatalf("Test %d - '%s' FAILED: unexpected error: %v", testNum, testName, err)
		}

		// Happy path first: log the match and finish.
		if result == want {
			t.Logf(
				"Test %d - '%s' PASSED: expected domain comparison received: got %t",
				testNum, testName, result,
			)

			return
		}

		t.Errorf(
			"Test %d - '%s' FAILED: unexpected domain comparison received: want %t, got %t",
			testNum, testName, want, result,
		)
	}
}
// testAddPageVisit returns a subtest function that normalises rawURL with
// util.NormaliseURL, passes the normalised value to testCrawler.AddPageVisit
// and compares the returned bool with wantVisited.
//
// testNum and testName are used only to label the failure and success
// messages. The returned subtest does not mark itself as parallel. An error
// from the normalisation step aborts the subtest; a mismatched result is
// reported as a failure, and a matching result is logged.
func testAddPageVisit(
	testCrawler *crawler.Crawler,
	testNum int,
	testName string,
	rawURL string,
	wantVisited bool,
) func(t *testing.T) {
	return func(t *testing.T) {
		pageURL, err := util.NormaliseURL(rawURL)
		if err != nil {
			t.Fatalf("Test %d - '%s' FAILED: unexpected error: %v", testNum, testName, err)
		}

		visited := testCrawler.AddPageVisit(pageURL)

		// Happy path first: log the match and finish.
		if visited == wantVisited {
			t.Logf(
				"Test %d - '%s' PASSED: expected bool returned after updated pages record: got %t",
				testNum, testName, visited,
			)

			return
		}

		t.Errorf(
			"Test %d - '%s' FAILED: unexpected bool returned after updated pages record: want %t, got %t",
			testNum, testName, wantVisited, visited,
		)
	}
}