From e3c379d069ffa9661561d25cdbf2f5894a2f8ee8 Mon Sep 17 00:00:00 2001
From: Adam Mathes
Date: Sat, 14 Feb 2026 08:58:38 -0800
Subject: Refactor: project structure, implement dependency injection, and
 align v2 UI with v1

---
 internal/crawler/crawler.go          | 161 ++++++++++++++++++++
 internal/crawler/crawler_test.go     | 278 +++++++++++++++++++++++++++++++++++
 internal/crawler/integration_test.go |  67 +++++++++
 3 files changed, 506 insertions(+)
 create mode 100644 internal/crawler/crawler.go
 create mode 100644 internal/crawler/crawler_test.go
 create mode 100644 internal/crawler/integration_test.go
(limited to 'internal/crawler')

diff --git a/internal/crawler/crawler.go b/internal/crawler/crawler.go
new file mode 100644
index 0000000..10253d8
--- /dev/null
+++ b/internal/crawler/crawler.go
@@ -0,0 +1,161 @@
+package crawler
+
+import (
+	"io/ioutil"
+	"log"
+	"net/http"
+	"time"
+
+	"adammathes.com/neko/internal/vlog"
+	"adammathes.com/neko/models/feed"
+	"adammathes.com/neko/models/item"
+	"github.com/mmcdole/gofeed"
+)
+
+const MAX_CRAWLERS = 5
+
+func Crawl() {
+	crawlJobs := make(chan *feed.Feed, 100)
+	results := make(chan string, 100)
+
+	feeds, err := feed.All()
+	if err != nil {
+		log.Fatal(err)
+	}
+
+	for i := 0; i < MAX_CRAWLERS; i++ {
+		vlog.Printf("spawning crawl worker %d\n", i)
+		go CrawlWorker(crawlJobs, results)
+	}
+
+	for _, f := range feeds {
+		vlog.Printf("sending crawl job %s\n", f.Url)
+		crawlJobs <- f
+	}
+	close(crawlJobs)
+
+	for i := 0; i < len(feeds); i++ {
+		vlog.Println(<-results)
+	}
+	close(results)
+}
+
+func CrawlWorker(feeds <-chan *feed.Feed, results chan<- string) {
+
+	for f := range feeds {
+		vlog.Printf("crawl job received %s\n", f.Url)
+		CrawlFeed(f, results)
+		vlog.Printf("crawl job finished %s\n", f.Url)
+	}
+}
+
+/*
+Simple HTTP GET function with custom User-Agent header
+*/
+func GetFeedContent(feedURL string) string {
+
+	// introduce delays for testing
+	// n := time.Duration(rand.Int63n(3))
+	// time.Sleep(n * time.Second)
+
+	c := &http.Client{
+		// give up after 5 seconds
+		Timeout: 5 * time.Second,
+	}
+
+	request, err := http.NewRequest("GET", feedURL, nil)
+	if err != nil {
+		log.Fatalln(err)
+	}
+
+	userAgent := "neko RSS Crawler +https://github.com/adammathes/neko"
+	request.Header.Set("User-Agent", userAgent)
+	resp, err := c.Do(request)
+
+	if err != nil {
+		return ""
+	}
+
+	if resp != nil {
+		defer func() {
+			ce := resp.Body.Close()
+			if ce != nil {
+				err = ce
+			}
+		}()
+	}
+
+	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
+		return ""
+	}
+
+	bodyBytes, err := ioutil.ReadAll(resp.Body)
+	if err != nil {
+		return ""
+	}
+	return string(bodyBytes)
+}
+
+/*
+TODO: sanitize input on crawl
+*/
+func CrawlFeed(f *feed.Feed, ch chan<- string) {
+	c := &http.Client{
+		// give up after 5 seconds
+		Timeout: 5 * time.Second,
+	}
+
+	fp := gofeed.NewParser()
+	fp.Client = c
+
+	content := GetFeedContent(f.Url)
+	feed, err := fp.ParseString(content)
+	if err != nil {
+		vlog.Println(err)
+		ch <- "failed parse for " + f.Url + "\n"
+		return
+	}
+
+	f.Title = feed.Title
+	f.WebUrl = feed.Link
+	f.Update()
+
+	for _, i := range feed.Items {
+		vlog.Printf("storing item: %s\n", i.Link)
+		var item item.Item
+		item.Title = i.Title
+		item.Url = i.Link
+
+		item.Description = i.Description
+		if len(i.Content) > len(item.Description) {
+			item.Description = i.Content
+		}
+
+		// a lot of RSS2.0 generated by wordpress and others
+		// uses <content:encoded>
+		e, ok := i.Extensions["content"]["encoded"]
+		var encoded = ""
+		if ok {
+			encoded = e[0].Value
+		}
+		if len(encoded) > len(item.Description) {
+			item.Description = encoded
+		}
+
+		if i.PublishedParsed != nil {
+			item.PublishDate = i.PublishedParsed.Format("2006-01-02 15:04:05")
+		} else {
+			item.PublishDate = time.Now().Format("2006-01-02 15:04:05")
+		}
+
+		item.FeedId = f.Id
+		err := item.Create()
+		if err != nil {
+			vlog.Println(err)
+		}
+		// else {
+		//	item.GetFullContent()
+		//}
+	}
+	ch <- "successfully crawled " + f.Url + "\n"
+}
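Note on the subject line: the commit message mentions dependency injection, but in this slice of the diff (limited to internal/crawler) the HTTP client is still built inline in GetFeedContent and CrawlFeed with a hard-coded 5-second timeout. Below is a minimal sketch of what injecting the client could look like; the Crawler type, the NewCrawler constructor, and the error-returning signature are hypothetical and not part of this patch.

    package crawler

    import (
        "fmt"
        "io"
        "net/http"
        "time"
    )

    // Crawler owns its HTTP client so callers and tests can substitute one
    // (shorter timeout, custom transport) instead of relying on the client
    // constructed inline by the package-level functions above.
    // Hypothetical sketch only.
    type Crawler struct {
        Client *http.Client
    }

    func NewCrawler(c *http.Client) *Crawler {
        if c == nil {
            c = &http.Client{Timeout: 5 * time.Second}
        }
        return &Crawler{Client: c}
    }

    // GetFeedContent mirrors the package-level function but uses the
    // injected client and reports errors instead of returning "".
    func (cr *Crawler) GetFeedContent(feedURL string) (string, error) {
        req, err := http.NewRequest("GET", feedURL, nil)
        if err != nil {
            return "", err
        }
        req.Header.Set("User-Agent", "neko RSS Crawler +https://github.com/adammathes/neko")
        resp, err := cr.Client.Do(req)
        if err != nil {
            return "", err
        }
        defer resp.Body.Close()
        if resp.StatusCode < 200 || resp.StatusCode >= 300 {
            return "", fmt.Errorf("unexpected status %d for %s", resp.StatusCode, feedURL)
        }
        body, err := io.ReadAll(resp.Body)
        return string(body), err
    }

With that shape, the tests below could pass in a client wired to an httptest server or a very short timeout rather than relying on the fixed 5-second client.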
diff --git a/internal/crawler/crawler_test.go b/internal/crawler/crawler_test.go
new file mode 100644
index 0000000..e0c4c6b
--- /dev/null
+++ b/internal/crawler/crawler_test.go
@@ -0,0 +1,278 @@
+package crawler
+
+import (
+	"log"
+	"net/http"
+	"net/http/httptest"
+	"strings"
+	"testing"
+
+	"adammathes.com/neko/config"
+	"adammathes.com/neko/models"
+	"adammathes.com/neko/models/feed"
+)
+
+func setupTestDB(t *testing.T) {
+	t.Helper()
+	config.Config.DBFile = ":memory:"
+	models.InitDB()
+	t.Cleanup(func() {
+		if models.DB != nil {
+			models.DB.Close()
+		}
+	})
+}
+
+func TestGetFeedContentSuccess(t *testing.T) {
+	ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		ua := r.Header.Get("User-Agent")
+		if ua == "" {
+			t.Error("Request should include User-Agent")
+		}
+		w.WriteHeader(200)
+		w.Write([]byte("Test"))
+	}))
+	defer ts.Close()
+
+	content := GetFeedContent(ts.URL)
+	if content == "" {
+		t.Error("GetFeedContent should return content for valid URL")
+	}
+	if content != "Test" {
+		t.Errorf("Unexpected content: %q", content)
+	}
+}
+
+func TestGetFeedContentBadURL(t *testing.T) {
+	content := GetFeedContent("http://invalid.invalid.invalid:99999/feed")
+	if content != "" {
+		t.Errorf("GetFeedContent should return empty string for bad URL, got %q", content)
+	}
+}
+
+func TestGetFeedContent404(t *testing.T) {
+	ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.WriteHeader(404)
+	}))
+	defer ts.Close()
+
+	content := GetFeedContent(ts.URL)
+	if content != "" {
+		t.Errorf("GetFeedContent should return empty for 404, got %q", content)
+	}
+}
+
+func TestGetFeedContent500(t *testing.T) {
+	ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.WriteHeader(500)
+	}))
+	defer ts.Close()
+
+	content := GetFeedContent(ts.URL)
+	if content != "" {
+		t.Errorf("GetFeedContent should return empty for 500, got %q", content)
+	}
+}
+
+func TestGetFeedContentUserAgent(t *testing.T) {
+	var receivedUA string
+	ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		receivedUA = r.Header.Get("User-Agent")
+		w.WriteHeader(200)
+		w.Write([]byte("ok"))
+	}))
+	defer ts.Close()
+
+	GetFeedContent(ts.URL)
+	expected := "neko RSS Crawler +https://github.com/adammathes/neko"
+	if receivedUA != expected {
+		t.Errorf("Expected UA %q, got %q", expected, receivedUA)
+	}
+}
+
+func TestCrawlFeedWithTestServer(t *testing.T) {
+	setupTestDB(t)
+
+	rssContent := `<rss version="2.0">
+<channel>
+<title>Test Feed</title>
+<link>https://example.com</link>
+<item>
+<title>Article 1</title>
+<link>https://example.com/article1</link>
+<description>First article</description>
+<pubDate>Mon, 01 Jan 2024 00:00:00 GMT</pubDate>
+</item>
+<item>
+<title>Article 2</title>
+<link>https://example.com/article2</link>
+<description>Second article</description>
+<pubDate>Tue, 02 Jan 2024 00:00:00 GMT</pubDate>
+</item>
+</channel>
+</rss>`
+
+	ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.Header().Set("Content-Type", "application/rss+xml")
+		w.WriteHeader(200)
+		w.Write([]byte(rssContent))
+	}))
+	defer ts.Close()
+
+	// Create a feed pointing to the test server
+	f := &feed.Feed{Url: ts.URL, Title: "Test"}
+	f.Create()
+
+	ch := make(chan string, 1)
+	CrawlFeed(f, ch)
+	result := <-ch
+
+	if result == "" {
+		t.Error("CrawlFeed should send a result")
+	}
+
+	// Verify items were created
+	var count int
+	models.DB.QueryRow("SELECT COUNT(*) FROM item").Scan(&count)
+	if count != 2 {
+		t.Errorf("Expected 2 items, got %d", count)
+	}
+}
+
+func TestCrawlFeedBadContent(t *testing.T) {
+	setupTestDB(t)
+
+	ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.WriteHeader(200)
+		w.Write([]byte("not xml at all"))
+	}))
+	defer ts.Close()
+
+	f := &feed.Feed{Url: ts.URL, Title: "Bad"}
+	f.Create()
+
+	ch := make(chan string, 1)
+	CrawlFeed(f, ch)
+	result := <-ch
+
+	if result == "" {
+		t.Error("CrawlFeed should send a result even on failure")
+	}
+}
+
+func TestCrawlWorker(t *testing.T) {
+	setupTestDB(t)
+
+	rssContent := `<rss version="2.0">
+<channel>
+<title>Worker Feed</title>
+<link>https://example.com</link>
+<item>
+<title>Worker Article</title>
+<link>https://example.com/worker-article</link>
+<description>An article</description>
+</item>
+</channel>
+</rss>`
+
+	ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.WriteHeader(200)
+		w.Write([]byte(rssContent))
+	}))
+	defer ts.Close()
+
+	f := &feed.Feed{Url: ts.URL, Title: "Worker Test"}
+	f.Create()
+
+	feeds := make(chan *feed.Feed, 1)
+	results := make(chan string, 1)
+
+	feeds <- f
+	close(feeds)
+
+	CrawlWorker(feeds, results)
+	result := <-results
+
+	if result == "" {
+		t.Error("CrawlWorker should produce a result")
+	}
+}
+
+func TestCrawl(t *testing.T) {
+	setupTestDB(t)
+
+	rssContent := `<rss version="2.0">
+<channel>
+<title>Crawl Feed</title>
+<link>https://example.com</link>
+<item>
+<title>Crawl Article</title>
+<link>https://example.com/crawl-article</link>
+<description>Article for crawl test</description>
+</item>
+</channel>
+</rss>`
+	ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.WriteHeader(200)
+		w.Write([]byte(rssContent))
+	}))
+	defer ts.Close()
+
+	f := &feed.Feed{Url: ts.URL, Title: "Full Crawl"}
+	f.Create()
+
+	// Should not panic
+	Crawl()
+
+	var count int
+	models.DB.QueryRow("SELECT COUNT(*) FROM item").Scan(&count)
+	if count != 1 {
+		t.Errorf("Expected 1 item after crawl, got %d", count)
+	}
+}
+
+func TestCrawlFeedWithExtensions(t *testing.T) {
+	setupTestDB(t)
+
+	rssContent := `<rss version="2.0" xmlns:content="http://purl.org/rss/1.0/modules/content/">
+<channel>
+<title>Extension Feed</title>
+<item>
+<title>Extension Article</title>
+<link>https://example.com/ext</link>
+<description>Short description</description>
+<content:encoded><![CDATA[Much longer content delivered via content:encoded]]></content:encoded>
+</item>
+</channel>
+</rss>`
+
+	ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.WriteHeader(200)
+		w.Write([]byte(rssContent))
+	}))
+	defer ts.Close()
+
+	f := &feed.Feed{Url: ts.URL, Title: "Extension Test"}
+	f.Create()
+
+	ch := make(chan string, 1)
+	CrawlFeed(f, ch)
+	<-ch
+
+	var itemTitle, itemDesc string
+	err := models.DB.QueryRow("SELECT title, description FROM item WHERE feed_id = ?", f.Id).Scan(&itemTitle, &itemDesc)
+	if err != nil {
+		log.Fatal(err)
+	}
+
+	if itemTitle != "Extension Article" {
+		t.Errorf("Expected title 'Extension Article', got %q", itemTitle)
+	}
+	if !strings.Contains(itemDesc, "Much longer content") {
+		t.Errorf("Expected description to contain encoded content, got %q", itemDesc)
+	}
+}
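The TestGetFeedContent 200/404/500 cases above share the same server-and-assert shape, so they could be folded into one table-driven test if desired. A sketch under that assumption (the test name and case list are illustrative, not part of this patch):

    package crawler

    import (
        "net/http"
        "net/http/httptest"
        "testing"
    )

    // Table-driven variant of the status-code checks above. Illustrative only.
    func TestGetFeedContentStatusCodes(t *testing.T) {
        cases := []struct {
            name   string
            status int
            body   string
            want   string
        }{
            {"ok", 200, "Test", "Test"},
            {"not found", 404, "", ""},
            {"server error", 500, "", ""},
        }
        for _, tc := range cases {
            t.Run(tc.name, func(t *testing.T) {
                ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
                    w.WriteHeader(tc.status)
                    w.Write([]byte(tc.body))
                }))
                defer ts.Close()

                if got := GetFeedContent(ts.URL); got != tc.want {
                    t.Errorf("status %d: got %q, want %q", tc.status, got, tc.want)
                }
            })
        }
    }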
diff --git a/internal/crawler/integration_test.go b/internal/crawler/integration_test.go
new file mode 100644
index 0000000..633b60f
--- /dev/null
+++ b/internal/crawler/integration_test.go
@@ -0,0 +1,67 @@
+package crawler
+
+import (
+	"fmt"
+	"net/http"
+	"net/http/httptest"
+	"os"
+	"testing"
+
+	"adammathes.com/neko/models/feed"
+	"adammathes.com/neko/models/item"
+)
+
+func TestCrawlIntegration(t *testing.T) {
+	setupTestDB(t)
+
+	// Mock RSS feed server
+	ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.Header().Set("Content-Type", "application/rss+xml")
+		os.Stdout.Write([]byte("serving mock rss\n"))
+		fmt.Fprint(w, `<rss version="2.0">
+<channel>
+<title>Test Feed</title>
+<link>http://example.com/</link>
+<description>Test Description</description>
+<item>
+<title>Test Item 1</title>
+<link>http://example.com/item1</link>
+<description>Item 1 Description</description>
+<pubDate>Mon, 01 Jan 2024 00:00:00 +0000</pubDate>
+</item>
+</channel>
+</rss>`)
+	}))
+	defer ts.Close()
+
+	// Add the feed
+	f := &feed.Feed{Url: ts.URL}
+	err := f.Create()
+	if err != nil {
+		t.Fatalf("Failed to create feed: %v", err)
+	}
+
+	// Crawl
+	ch := make(chan string, 1)
+	CrawlFeed(f, ch)
+
+	res := <-ch
+	if res == "" {
+		t.Fatal("CrawlFeed returned empty result")
+	}
+
+	// Verify items were stored
+	items, err := item.Filter(0, f.Id, "", false, false, 0, "")
+	if err != nil {
+		t.Fatalf("Failed to filter items: %v", err)
+	}
+
+	if len(items) != 1 {
+		t.Fatalf("Expected 1 item, got %d", len(items))
+	}
+
+	if items[0].Title != "Test Item 1" {
+		t.Errorf("Expected 'Test Item 1', got %q", items[0].Title)
+	}
+}
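TestCrawlIntegration only talks to a local httptest server, so it stays fast, but since it exercises the full parse-and-store path a common convention is to guard such tests with testing.Short() so that `go test -short` skips them. A sketch of that guard (hypothetical; not part of this patch, and the test body would remain as above):

    package crawler

    import "testing"

    // Hypothetical guarded wrapper; the body would be the integration test above.
    func TestCrawlIntegrationGuarded(t *testing.T) {
        if testing.Short() {
            t.Skip("skipping crawler integration test in -short mode")
        }
        // ... existing TestCrawlIntegration body ...
    }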