| author | Adam Mathes <adam@adammathes.com> | 2026-02-14 08:58:38 -0800 |
|---|---|---|
| committer | Adam Mathes <adam@adammathes.com> | 2026-02-14 08:58:38 -0800 |
| commit | e3c379d069ffa9661561d25cdbf2f5894a2f8ee8 | |
| tree | 24d0e9f5610dd9c8f873c5b78e6bc1c88d32840a | /crawler |
| parent | 4b06155fbde91a1bef6361ef36efb28789861928 | |
| download | neko-e3c379d069ffa9661561d25cdbf2f5894a2f8ee8.tar.gz, neko-e3c379d069ffa9661561d25cdbf2f5894a2f8ee8.tar.bz2, neko-e3c379d069ffa9661561d25cdbf2f5894a2f8ee8.zip | |
Refactor: project structure, implement dependency injection, and align v2 UI with v1
Diffstat (limited to 'crawler')
| -rw-r--r-- | crawler/crawler.go | 160 |
|---|---|---|
| -rw-r--r-- | crawler/crawler_test.go | 278 |
| -rw-r--r-- | crawler/integration_test.go | 67 |
3 files changed, 0 insertions, 505 deletions
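Per the commit message, the crawler now receives its dependencies explicitly instead of reaching for package-level state (the deleted code below calls `feed.All()` and the global `models.DB` directly). As a rough sketch only — every type and function name here is hypothetical, not taken from this commit — the injected shape might look like:

```go
// Hypothetical sketch of a dependency-injected crawler package.
// None of these names are from this commit; they only illustrate
// the shape the commit message describes.
package crawler

import (
    "context"
    "net/http"
)

// Feed and Item stand in for the real model types.
type Feed struct {
    Id  int64
    Url string
}

type Item struct {
    FeedId     int64
    Title, Url string
}

// FeedStore abstracts the persistence the old code reached through
// package-level globals (feed.All, item.Create, models.DB).
type FeedStore interface {
    AllFeeds(ctx context.Context) ([]Feed, error)
    CreateItem(ctx context.Context, it Item) error
}

// Crawler carries its dependencies explicitly, which is what makes
// it testable with fakes instead of a process-global database.
type Crawler struct {
    Store  FeedStore
    Client *http.Client
}

func New(store FeedStore, client *http.Client) *Crawler {
    return &Crawler{Store: store, Client: client}
}
```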
```diff
diff --git a/crawler/crawler.go b/crawler/crawler.go
deleted file mode 100644
index f8794d4..0000000
--- a/crawler/crawler.go
+++ /dev/null
@@ -1,160 +0,0 @@
-package crawler
-
-import (
-    "adammathes.com/neko/models/feed"
-    "adammathes.com/neko/models/item"
-    "adammathes.com/neko/vlog"
-    "github.com/mmcdole/gofeed"
-    "io/ioutil"
-    "log"
-    "net/http"
-    "time"
-)
-
-const MAX_CRAWLERS = 5
-
-func Crawl() {
-    crawlJobs := make(chan *feed.Feed, 100)
-    results := make(chan string, 100)
-
-    feeds, err := feed.All()
-    if err != nil {
-        log.Fatal(err)
-    }
-
-    for i := 0; i < MAX_CRAWLERS; i++ {
-        vlog.Printf("spawning crawl worker %d\n", i)
-        go CrawlWorker(crawlJobs, results)
-    }
-
-    for _, f := range feeds {
-        vlog.Printf("sending crawl job %s\n", f.Url)
-        crawlJobs <- f
-    }
-    close(crawlJobs)
-
-    for i := 0; i < len(feeds); i++ {
-        vlog.Println(<-results)
-    }
-    close(results)
-}
-
-func CrawlWorker(feeds <-chan *feed.Feed, results chan<- string) {
-
-    for f := range feeds {
-        vlog.Printf("crawl job received %s\n", f.Url)
-        CrawlFeed(f, results)
-        vlog.Printf("crawl job finished %s\n", f.Url)
-    }
-}
-
-/*
-Simple HTTP Get fnx with custom user agent header
-*/
-func GetFeedContent(feedURL string) string {
-
-    // introduce delays for testing
-    // n := time.Duration(rand.Int63n(3))
-    // time.Sleep(n * time.Second)
-
-    c := &http.Client{
-        // give up after 5 seconds
-        Timeout: 5 * time.Second,
-    }
-
-    request, err := http.NewRequest("GET", feedURL, nil)
-    if err != nil {
-        log.Fatalln(err)
-    }
-
-    userAgent := "neko RSS Crawler +https://github.com/adammathes/neko"
-    request.Header.Set("User-Agent", userAgent)
-    resp, err := c.Do(request)
-
-    if err != nil {
-        return ""
-    }
-
-    if resp != nil {
-        defer func() {
-            ce := resp.Body.Close()
-            if ce != nil {
-                err = ce
-            }
-        }()
-    }
-
-    if resp.StatusCode < 200 || resp.StatusCode >= 300 {
-        return ""
-    }
-
-    bodyBytes, err := ioutil.ReadAll(resp.Body)
-    if err != nil {
-        return ""
-    }
-    return string(bodyBytes)
-}
-
-/*
- TODO: sanitize input on crawl
-*/
-func CrawlFeed(f *feed.Feed, ch chan<- string) {
-    c := &http.Client{
-        // give up after 5 seconds
-        Timeout: 5 * time.Second,
-    }
-
-    fp := gofeed.NewParser()
-    fp.Client = c
-
-    content := GetFeedContent(f.Url)
-    feed, err := fp.ParseString(content)
-    if err != nil {
-        vlog.Println(err)
-        ch <- "failed parse for " + f.Url + "\n"
-        return
-    }
-
-    f.Title = feed.Title
-    f.WebUrl = feed.Link
-    f.Update()
-
-    for _, i := range feed.Items {
-        vlog.Printf("storing item: %s\n", i.Link)
-        var item item.Item
-        item.Title = i.Title
-        item.Url = i.Link
-
-        item.Description = i.Description
-        if len(i.Content) > len(item.Description) {
-            item.Description = i.Content
-        }
-
-        // a lot of RSS2.0 generated by wordpress and others
-        // uses <content:encoded>
-        e, ok := i.Extensions["content"]["encoded"]
-        var encoded = ""
-        if ok {
-            encoded = e[0].Value
-        }
-        if len(encoded) > len(item.Description) {
-            item.Description = encoded
-        }
-
-        if i.PublishedParsed != nil {
-            item.PublishDate = i.PublishedParsed.Format("2006-01-02 15:04:05")
-        } else {
-            item.PublishDate = time.Now().Format("2006-01-02 15:04:05")
-        }
-
-        item.FeedId = f.Id
-        err := item.Create()
-        if err != nil {
-            vlog.Println(err)
-        }
-        // else {
-        //     item.GetFullContent()
-        //}
-    }
-    ch <- "successfully crawled " + f.Url + "\n"
-}
```
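Two things stand out in the deleted `GetFeedContent`: every failure mode (request error, non-2xx status, body read error) collapses into an empty string, and the deferred close assigns its error to a local `err` that is never read again; a malformed URL even hits `log.Fatalln` and takes the whole process down. A sketch of the same fetch surfacing errors instead — an assumption about the refactor's direction, not code from this commit:

```go
package crawler

import (
    "fmt"
    "io"
    "net/http"
)

// fetchFeed is a hypothetical error-returning variant of the deleted
// GetFeedContent; it keeps the custom User-Agent and lets the caller
// decide how to log or retry failures.
func fetchFeed(client *http.Client, feedURL string) ([]byte, error) {
    req, err := http.NewRequest("GET", feedURL, nil)
    if err != nil {
        return nil, fmt.Errorf("build request for %s: %w", feedURL, err)
    }
    req.Header.Set("User-Agent", "neko RSS Crawler +https://github.com/adammathes/neko")

    resp, err := client.Do(req)
    if err != nil {
        return nil, fmt.Errorf("fetch %s: %w", feedURL, err)
    }
    defer resp.Body.Close()

    if resp.StatusCode < 200 || resp.StatusCode >= 300 {
        return nil, fmt.Errorf("fetch %s: unexpected status %d", feedURL, resp.StatusCode)
    }
    return io.ReadAll(resp.Body)
}
```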
"net/http/httptest" - "strings" - "testing" - - "adammathes.com/neko/config" - "adammathes.com/neko/models" - "adammathes.com/neko/models/feed" -) - -func setupTestDB(t *testing.T) { - t.Helper() - config.Config.DBFile = ":memory:" - models.InitDB() - t.Cleanup(func() { - if models.DB != nil { - models.DB.Close() - } - }) -} - -func TestGetFeedContentSuccess(t *testing.T) { - ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - ua := r.Header.Get("User-Agent") - if ua == "" { - t.Error("Request should include User-Agent") - } - w.WriteHeader(200) - w.Write([]byte("<rss><channel><title>Test</title></channel></rss>")) - })) - defer ts.Close() - - content := GetFeedContent(ts.URL) - if content == "" { - t.Error("GetFeedContent should return content for valid URL") - } - if content != "<rss><channel><title>Test</title></channel></rss>" { - t.Errorf("Unexpected content: %q", content) - } -} - -func TestGetFeedContentBadURL(t *testing.T) { - content := GetFeedContent("http://invalid.invalid.invalid:99999/feed") - if content != "" { - t.Errorf("GetFeedContent should return empty string for bad URL, got %q", content) - } -} - -func TestGetFeedContent404(t *testing.T) { - ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - w.WriteHeader(404) - })) - defer ts.Close() - - content := GetFeedContent(ts.URL) - if content != "" { - t.Errorf("GetFeedContent should return empty for 404, got %q", content) - } -} - -func TestGetFeedContent500(t *testing.T) { - ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - w.WriteHeader(500) - })) - defer ts.Close() - - content := GetFeedContent(ts.URL) - if content != "" { - t.Errorf("GetFeedContent should return empty for 500, got %q", content) - } -} - -func TestGetFeedContentUserAgent(t *testing.T) { - var receivedUA string - ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - receivedUA = r.Header.Get("User-Agent") - w.WriteHeader(200) - w.Write([]byte("ok")) - })) - defer ts.Close() - - GetFeedContent(ts.URL) - expected := "neko RSS Crawler +https://github.com/adammathes/neko" - if receivedUA != expected { - t.Errorf("Expected UA %q, got %q", expected, receivedUA) - } -} - -func TestCrawlFeedWithTestServer(t *testing.T) { - setupTestDB(t) - - rssContent := `<?xml version="1.0" encoding="UTF-8"?> -<rss version="2.0"> - <channel> - <title>Test Feed</title> - <link>https://example.com</link> - <item> - <title>Article 1</title> - <link>https://example.com/article1</link> - <description>First article</description> - <pubDate>Mon, 01 Jan 2024 00:00:00 GMT</pubDate> - </item> - <item> - <title>Article 2</title> - <link>https://example.com/article2</link> - <description>Second article</description> - <pubDate>Tue, 02 Jan 2024 00:00:00 GMT</pubDate> - </item> - </channel> -</rss>` - - ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - w.Header().Set("Content-Type", "application/rss+xml") - w.WriteHeader(200) - w.Write([]byte(rssContent)) - })) - defer ts.Close() - - // Create a feed pointing to the test server - f := &feed.Feed{Url: ts.URL, Title: "Test"} - f.Create() - - ch := make(chan string, 1) - CrawlFeed(f, ch) - result := <-ch - - if result == "" { - t.Error("CrawlFeed should send a result") - } - - // Verify items were created - var count int - models.DB.QueryRow("SELECT COUNT(*) FROM item").Scan(&count) - if count != 2 { - t.Errorf("Expected 2 items, got %d", count) - } -} 
-
-func TestCrawlFeedBadContent(t *testing.T) {
-    setupTestDB(t)
-
-    ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-        w.WriteHeader(200)
-        w.Write([]byte("not xml at all"))
-    }))
-    defer ts.Close()
-
-    f := &feed.Feed{Url: ts.URL, Title: "Bad"}
-    f.Create()
-
-    ch := make(chan string, 1)
-    CrawlFeed(f, ch)
-    result := <-ch
-
-    if result == "" {
-        t.Error("CrawlFeed should send a result even on failure")
-    }
-}
-
-func TestCrawlWorker(t *testing.T) {
-    setupTestDB(t)
-
-    rssContent := `<?xml version="1.0" encoding="UTF-8"?>
-<rss version="2.0">
-  <channel>
-    <title>Worker Feed</title>
-    <link>https://example.com</link>
-    <item>
-      <title>Worker Article</title>
-      <link>https://example.com/worker-article</link>
-      <description>An article</description>
-    </item>
-  </channel>
-</rss>`
-
-    ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-        w.WriteHeader(200)
-        w.Write([]byte(rssContent))
-    }))
-    defer ts.Close()
-
-    f := &feed.Feed{Url: ts.URL, Title: "Worker Test"}
-    f.Create()
-
-    feeds := make(chan *feed.Feed, 1)
-    results := make(chan string, 1)
-
-    feeds <- f
-    close(feeds)
-
-    CrawlWorker(feeds, results)
-    result := <-results
-
-    if result == "" {
-        t.Error("CrawlWorker should produce a result")
-    }
-}
-
-func TestCrawl(t *testing.T) {
-    setupTestDB(t)
-
-    rssContent := `<?xml version="1.0" encoding="UTF-8"?>
-<rss version="2.0">
-  <channel>
-    <title>Crawl Feed</title>
-    <link>https://example.com</link>
-    <item>
-      <title>Crawl Article</title>
-      <link>https://example.com/crawl-article</link>
-      <description>Article for crawl test</description>
-    </item>
-  </channel>
-</rss>`
-    ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-        w.WriteHeader(200)
-        w.Write([]byte(rssContent))
-    }))
-    defer ts.Close()
-
-    f := &feed.Feed{Url: ts.URL, Title: "Full Crawl"}
-    f.Create()
-
-    // Should not panic
-    Crawl()
-
-    var count int
-    models.DB.QueryRow("SELECT COUNT(*) FROM item").Scan(&count)
-    if count != 1 {
-        t.Errorf("Expected 1 item after crawl, got %d", count)
-    }
-}
-
-func TestCrawlFeedWithExtensions(t *testing.T) {
-    setupTestDB(t)
-
-    rssContent := `<?xml version="1.0" encoding="UTF-8"?>
-<rss version="2.0" xmlns:content="http://purl.org/rss/1.0/modules/content/">
-  <channel>
-    <title>Extension Feed</title>
-    <item>
-      <title>Extension Article</title>
-      <link>https://example.com/ext</link>
-      <description>Short description</description>
-      <content:encoded><![CDATA[Much longer content that should be used as description]]></content:encoded>
-    </item>
-  </channel>
-</rss>`
-
-    ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-        w.WriteHeader(200)
-        w.Write([]byte(rssContent))
-    }))
-    defer ts.Close()
-
-    f := &feed.Feed{Url: ts.URL, Title: "Extension Test"}
-    f.Create()
-
-    ch := make(chan string, 1)
-    CrawlFeed(f, ch)
-    <-ch
-
-    var itemTitle, itemDesc string
-    err := models.DB.QueryRow("SELECT title, description FROM item WHERE feed_id = ?", f.Id).Scan(&itemTitle, &itemDesc)
-    if err != nil {
-        log.Fatal(err)
-    }
-
-    if itemTitle != "Extension Article" {
-        t.Errorf("Expected title 'Extension Article', got %q", itemTitle)
-    }
-    if !strings.Contains(itemDesc, "Much longer content") {
-        t.Errorf("Expected description to contain encoded content, got %q", itemDesc)
-    }
-}
```
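The deleted 404 and 500 tests are copies of each other apart from the status code. If this coverage is being recreated in the new layout, a table-driven form would keep it in one place — a sketch assuming a `GetFeedContent` with the old signature still exists:

```go
package crawler

import (
    "fmt"
    "net/http"
    "net/http/httptest"
    "testing"
)

// Sketch of a table-driven replacement for the deleted per-status
// tests; assumes GetFeedContent keeps its old signature.
func TestGetFeedContentNon2xx(t *testing.T) {
    for _, status := range []int{404, 500} {
        status := status // avoid sharing the loop variable with the handler
        t.Run(fmt.Sprintf("status_%d", status), func(t *testing.T) {
            ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
                w.WriteHeader(status)
            }))
            defer ts.Close()

            if content := GetFeedContent(ts.URL); content != "" {
                t.Errorf("status %d: expected empty content, got %q", status, content)
            }
        })
    }
}
```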
```diff
diff --git a/crawler/integration_test.go b/crawler/integration_test.go
deleted file mode 100644
index 633b60f..0000000
--- a/crawler/integration_test.go
+++ /dev/null
@@ -1,67 +0,0 @@
-package crawler
-
-import (
-    "fmt"
-    "net/http"
-    "net/http/httptest"
-    "os"
-    "testing"
-
-    "adammathes.com/neko/models/feed"
-    "adammathes.com/neko/models/item"
-)
-
-func TestCrawlIntegration(t *testing.T) {
-    setupTestDB(t)
-
-    // Mock RSS feed server
-    ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-        w.Header().Set("Content-Type", "application/rss+xml")
-        os.Stdout.Write([]byte("serving mock rss\n"))
-        fmt.Fprint(w, `<?xml version="1.0" encoding="UTF-8" ?>
-<rss version="2.0">
-<channel>
-  <title>Test Feed</title>
-  <link>http://example.com/</link>
-  <description>Test Description</description>
-  <item>
-    <title>Test Item 1</title>
-    <link>http://example.com/item1</link>
-    <description>Item 1 Description</description>
-    <pubDate>Mon, 01 Jan 2024 00:00:00 +0000</pubDate>
-  </item>
-</channel>
-</rss>`)
-    }))
-    defer ts.Close()
-
-    // Add the feed
-    f := &feed.Feed{Url: ts.URL}
-    err := f.Create()
-    if err != nil {
-        t.Fatalf("Failed to create feed: %v", err)
-    }
-
-    // Crawl
-    ch := make(chan string, 1)
-    CrawlFeed(f, ch)
-
-    res := <-ch
-    if res == "" {
-        t.Fatal("CrawlFeed returned empty result")
-    }
-
-    // Verify items were stored
-    items, err := item.Filter(0, f.Id, "", false, false, 0, "")
-    if err != nil {
-        t.Fatalf("Failed to filter items: %v", err)
-    }
-
-    if len(items) != 1 {
-        t.Fatalf("Expected 1 item, got %d", len(items))
-    }
-
-    if items[0].Title != "Test Item 1" {
-        t.Errorf("Expected 'Test Item 1', got %q", items[0].Title)
-    }
-}
```
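The deleted integration test still leans on process-global state (`setupTestDB` mutating `config.Config` and `models.InitDB`). Under the dependency-injected shape sketched after the diffstat, the same assertions could run against a fake store with no database at all — again hypothetical, building on the `FeedStore` interface above rather than anything this commit ships:

```go
package crawler

import "context"

// fakeStore is a hypothetical in-memory FeedStore for tests,
// matching the interface sketched earlier; not this commit's API.
type fakeStore struct {
    feeds []Feed
    items []Item
}

func (s *fakeStore) AllFeeds(ctx context.Context) ([]Feed, error) {
    return s.feeds, nil
}

func (s *fakeStore) CreateItem(ctx context.Context, it Item) error {
    s.items = append(s.items, it)
    return nil
}
```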
