author    Adam Mathes <adam@adammathes.com>  2026-02-14 08:58:38 -0800
committer Adam Mathes <adam@adammathes.com>  2026-02-14 08:58:38 -0800
commit    e3c379d069ffa9661561d25cdbf2f5894a2f8ee8 (patch)
tree      24d0e9f5610dd9c8f873c5b78e6bc1c88d32840a /crawler
parent    4b06155fbde91a1bef6361ef36efb28789861928 (diff)
Refactor: project structure, implement dependency injection, and align v2 UI with v1
Diffstat (limited to 'crawler')
-rw-r--r--  crawler/crawler.go           160
-rw-r--r--  crawler/crawler_test.go      278
-rw-r--r--  crawler/integration_test.go   67
3 files changed, 0 insertions, 505 deletions
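The commit message mentions dependency injection, but this diff shows only the deletion side of that refactor. As a rough, hypothetical sketch (none of these names appear in the diff), the package-level feed.All() lookup and the ad-hoc http.Client construction in the deleted crawler.go below could be replaced by dependencies carried on a struct:

```go
package crawler

import (
	"net/http"
	"time"
)

// Feed mirrors the fields the deleted code reads from models/feed.
type Feed struct {
	Id     int64
	Url    string
	Title  string
	WebUrl string
}

// FeedStore is a hypothetical interface standing in for the
// package-level feed.All() call in the old Crawl().
type FeedStore interface {
	All() ([]*Feed, error)
}

// Crawler carries its collaborators explicitly instead of reaching
// for package globals, so tests can substitute both of them.
type Crawler struct {
	Store  FeedStore
	Client *http.Client
}

// New wires production defaults; tests can build the struct directly
// with a stub store and a client pointed at an httptest server.
func New(store FeedStore) *Crawler {
	return &Crawler{
		Store:  store,
		Client: &http.Client{Timeout: 5 * time.Second},
	}
}
```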
diff --git a/crawler/crawler.go b/crawler/crawler.go
deleted file mode 100644
index f8794d4..0000000
--- a/crawler/crawler.go
+++ /dev/null
@@ -1,160 +0,0 @@
-package crawler
-
-import (
- "adammathes.com/neko/models/feed"
- "adammathes.com/neko/models/item"
- "adammathes.com/neko/vlog"
- "github.com/mmcdole/gofeed"
- "io/ioutil"
- "log"
- "net/http"
- "time"
-)
-
-const MAX_CRAWLERS = 5
-
-func Crawl() {
- crawlJobs := make(chan *feed.Feed, 100)
- results := make(chan string, 100)
-
- feeds, err := feed.All()
- if err != nil {
- log.Fatal(err)
- }
-
- for i := 0; i < MAX_CRAWLERS; i++ {
- vlog.Printf("spawning crawl worker %d\n", i)
- go CrawlWorker(crawlJobs, results)
- }
-
- for _, f := range feeds {
- vlog.Printf("sending crawl job %s\n", f.Url)
- crawlJobs <- f
- }
- close(crawlJobs)
-
- for i := 0; i < len(feeds); i++ {
- vlog.Println(<-results)
- }
- close(results)
-}
-
-func CrawlWorker(feeds <-chan *feed.Feed, results chan<- string) {
-
- for f := range feeds {
- vlog.Printf("crawl job received %s\n", f.Url)
- CrawlFeed(f, results)
- vlog.Printf("crawl job finished %s\n", f.Url)
- }
-}
-
-/*
-Simple HTTP GET function with a custom User-Agent header
-*/
-func GetFeedContent(feedURL string) string {
-
- // introduce delays for testing
- // n := time.Duration(rand.Int63n(3))
- // time.Sleep(n * time.Second)
-
- c := &http.Client{
- // give up after 5 seconds
- Timeout: 5 * time.Second,
- }
-
- request, err := http.NewRequest("GET", feedURL, nil)
- if err != nil {
- log.Fatalln(err)
- }
-
- userAgent := "neko RSS Crawler +https://github.com/adammathes/neko"
- request.Header.Set("User-Agent", userAgent)
- resp, err := c.Do(request)
-
- if err != nil {
- return ""
- }
-
- if resp != nil {
- defer func() {
- ce := resp.Body.Close()
- if ce != nil {
- err = ce
- }
- }()
- }
-
- if resp.StatusCode < 200 || resp.StatusCode >= 300 {
- return ""
- }
-
- bodyBytes, err := ioutil.ReadAll(resp.Body)
- if err != nil {
- return ""
- }
- return string(bodyBytes)
-}
-
-/*
- TODO: sanitize input on crawl
-*/
-func CrawlFeed(f *feed.Feed, ch chan<- string) {
- c := &http.Client{
- // give up after 5 seconds
- Timeout: 5 * time.Second,
- }
-
- fp := gofeed.NewParser()
- fp.Client = c
-
- content := GetFeedContent(f.Url)
- feed, err := fp.ParseString(content)
- if err != nil {
- vlog.Println(err)
- ch <- "failed parse for " + f.Url + "\n"
- return
- }
-
- f.Title = feed.Title
- f.WebUrl = feed.Link
- f.Update()
-
- for _, i := range feed.Items {
- vlog.Printf("storing item: %s\n", i.Link)
- var item item.Item
- item.Title = i.Title
- item.Url = i.Link
-
- item.Description = i.Description
- if len(i.Content) > len(item.Description) {
- item.Description = i.Content
- }
-
-		// RSS 2.0 generated by WordPress and others often
-		// carries the full post in <content:encoded>
- e, ok := i.Extensions["content"]["encoded"]
- var encoded = ""
- if ok {
- encoded = e[0].Value
- }
- if len(encoded) > len(item.Description) {
- item.Description = encoded
- }
-
- if i.PublishedParsed != nil {
- item.PublishDate = i.PublishedParsed.Format("2006-01-02 15:04:05")
- } else {
- item.PublishDate = time.Now().Format("2006-01-02 15:04:05")
- }
-
- item.FeedId = f.Id
- err := item.Create()
- if err != nil {
- vlog.Println(err)
- }
- // else {
- // item.GetFullContent()
- //}
- }
- ch <- "successfully crawled " + f.Url + "\n"
-}
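Crawl above is a bounded worker pool: feeds fan out to MAX_CRAWLERS goroutines over a buffered channel, and the caller reads back exactly one result per feed before closing the results channel. A minimal, self-contained sketch of the same pattern, with plain strings standing in for *feed.Feed:

```go
package main

import "fmt"

const maxWorkers = 5

// worker drains jobs until the channel is closed, sending exactly
// one result per job, just as CrawlWorker does above.
func worker(jobs <-chan string, results chan<- string) {
	for url := range jobs {
		results <- "crawled " + url
	}
}

func main() {
	urls := []string{"http://a.example", "http://b.example", "http://c.example"}
	jobs := make(chan string, len(urls))
	results := make(chan string, len(urls))

	for i := 0; i < maxWorkers; i++ {
		go worker(jobs, results)
	}
	for _, u := range urls {
		jobs <- u
	}
	close(jobs) // terminates each worker's range loop

	// Like Crawl, drain by count: every job sends exactly one result,
	// so no sync.WaitGroup is needed.
	for range urls {
		fmt.Println(<-results)
	}
}
```

Closing the jobs channel is what lets idle workers exit; the results channel is only closed after all sends are known to have completed.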
diff --git a/crawler/crawler_test.go b/crawler/crawler_test.go
deleted file mode 100644
index e0c4c6b..0000000
--- a/crawler/crawler_test.go
+++ /dev/null
@@ -1,278 +0,0 @@
-package crawler
-
-import (
- "log"
- "net/http"
- "net/http/httptest"
- "strings"
- "testing"
-
- "adammathes.com/neko/config"
- "adammathes.com/neko/models"
- "adammathes.com/neko/models/feed"
-)
-
-func setupTestDB(t *testing.T) {
- t.Helper()
- config.Config.DBFile = ":memory:"
- models.InitDB()
- t.Cleanup(func() {
- if models.DB != nil {
- models.DB.Close()
- }
- })
-}
-
-func TestGetFeedContentSuccess(t *testing.T) {
- ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
- ua := r.Header.Get("User-Agent")
- if ua == "" {
- t.Error("Request should include User-Agent")
- }
- w.WriteHeader(200)
- w.Write([]byte("<rss><channel><title>Test</title></channel></rss>"))
- }))
- defer ts.Close()
-
- content := GetFeedContent(ts.URL)
- if content == "" {
- t.Error("GetFeedContent should return content for valid URL")
- }
- if content != "<rss><channel><title>Test</title></channel></rss>" {
- t.Errorf("Unexpected content: %q", content)
- }
-}
-
-func TestGetFeedContentBadURL(t *testing.T) {
- content := GetFeedContent("http://invalid.invalid.invalid:99999/feed")
- if content != "" {
- t.Errorf("GetFeedContent should return empty string for bad URL, got %q", content)
- }
-}
-
-func TestGetFeedContent404(t *testing.T) {
- ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
- w.WriteHeader(404)
- }))
- defer ts.Close()
-
- content := GetFeedContent(ts.URL)
- if content != "" {
- t.Errorf("GetFeedContent should return empty for 404, got %q", content)
- }
-}
-
-func TestGetFeedContent500(t *testing.T) {
- ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
- w.WriteHeader(500)
- }))
- defer ts.Close()
-
- content := GetFeedContent(ts.URL)
- if content != "" {
- t.Errorf("GetFeedContent should return empty for 500, got %q", content)
- }
-}
-
-func TestGetFeedContentUserAgent(t *testing.T) {
- var receivedUA string
- ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
- receivedUA = r.Header.Get("User-Agent")
- w.WriteHeader(200)
- w.Write([]byte("ok"))
- }))
- defer ts.Close()
-
- GetFeedContent(ts.URL)
- expected := "neko RSS Crawler +https://github.com/adammathes/neko"
- if receivedUA != expected {
- t.Errorf("Expected UA %q, got %q", expected, receivedUA)
- }
-}
-
-func TestCrawlFeedWithTestServer(t *testing.T) {
- setupTestDB(t)
-
- rssContent := `<?xml version="1.0" encoding="UTF-8"?>
-<rss version="2.0">
- <channel>
- <title>Test Feed</title>
- <link>https://example.com</link>
- <item>
- <title>Article 1</title>
- <link>https://example.com/article1</link>
- <description>First article</description>
- <pubDate>Mon, 01 Jan 2024 00:00:00 GMT</pubDate>
- </item>
- <item>
- <title>Article 2</title>
- <link>https://example.com/article2</link>
- <description>Second article</description>
- <pubDate>Tue, 02 Jan 2024 00:00:00 GMT</pubDate>
- </item>
- </channel>
-</rss>`
-
- ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
- w.Header().Set("Content-Type", "application/rss+xml")
- w.WriteHeader(200)
- w.Write([]byte(rssContent))
- }))
- defer ts.Close()
-
- // Create a feed pointing to the test server
- f := &feed.Feed{Url: ts.URL, Title: "Test"}
- f.Create()
-
- ch := make(chan string, 1)
- CrawlFeed(f, ch)
- result := <-ch
-
- if result == "" {
- t.Error("CrawlFeed should send a result")
- }
-
- // Verify items were created
- var count int
- models.DB.QueryRow("SELECT COUNT(*) FROM item").Scan(&count)
- if count != 2 {
- t.Errorf("Expected 2 items, got %d", count)
- }
-}
-
-func TestCrawlFeedBadContent(t *testing.T) {
- setupTestDB(t)
-
- ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
- w.WriteHeader(200)
- w.Write([]byte("not xml at all"))
- }))
- defer ts.Close()
-
- f := &feed.Feed{Url: ts.URL, Title: "Bad"}
- f.Create()
-
- ch := make(chan string, 1)
- CrawlFeed(f, ch)
- result := <-ch
-
- if result == "" {
- t.Error("CrawlFeed should send a result even on failure")
- }
-}
-
-func TestCrawlWorker(t *testing.T) {
- setupTestDB(t)
-
- rssContent := `<?xml version="1.0" encoding="UTF-8"?>
-<rss version="2.0">
- <channel>
- <title>Worker Feed</title>
- <link>https://example.com</link>
- <item>
- <title>Worker Article</title>
- <link>https://example.com/worker-article</link>
- <description>An article</description>
- </item>
- </channel>
-</rss>`
-
- ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
- w.WriteHeader(200)
- w.Write([]byte(rssContent))
- }))
- defer ts.Close()
-
- f := &feed.Feed{Url: ts.URL, Title: "Worker Test"}
- f.Create()
-
- feeds := make(chan *feed.Feed, 1)
- results := make(chan string, 1)
-
- feeds <- f
- close(feeds)
-
- CrawlWorker(feeds, results)
- result := <-results
-
- if result == "" {
- t.Error("CrawlWorker should produce a result")
- }
-}
-
-func TestCrawl(t *testing.T) {
- setupTestDB(t)
-
- rssContent := `<?xml version="1.0" encoding="UTF-8"?>
-<rss version="2.0">
- <channel>
- <title>Crawl Feed</title>
- <link>https://example.com</link>
- <item>
- <title>Crawl Article</title>
- <link>https://example.com/crawl-article</link>
- <description>Article for crawl test</description>
- </item>
- </channel>
-</rss>`
- ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
- w.WriteHeader(200)
- w.Write([]byte(rssContent))
- }))
- defer ts.Close()
-
- f := &feed.Feed{Url: ts.URL, Title: "Full Crawl"}
- f.Create()
-
- // Should not panic
- Crawl()
-
- var count int
- models.DB.QueryRow("SELECT COUNT(*) FROM item").Scan(&count)
- if count != 1 {
- t.Errorf("Expected 1 item after crawl, got %d", count)
- }
-}
-
-func TestCrawlFeedWithExtensions(t *testing.T) {
- setupTestDB(t)
-
- rssContent := `<?xml version="1.0" encoding="UTF-8"?>
-<rss version="2.0" xmlns:content="http://purl.org/rss/1.0/modules/content/">
- <channel>
- <title>Extension Feed</title>
- <item>
- <title>Extension Article</title>
- <link>https://example.com/ext</link>
- <description>Short description</description>
- <content:encoded><![CDATA[Much longer content that should be used as description]]></content:encoded>
- </item>
- </channel>
-</rss>`
-
- ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
- w.WriteHeader(200)
- w.Write([]byte(rssContent))
- }))
- defer ts.Close()
-
- f := &feed.Feed{Url: ts.URL, Title: "Extension Test"}
- f.Create()
-
- ch := make(chan string, 1)
- CrawlFeed(f, ch)
- <-ch
-
- var itemTitle, itemDesc string
- err := models.DB.QueryRow("SELECT title, description FROM item WHERE feed_id = ?", f.Id).Scan(&itemTitle, &itemDesc)
- if err != nil {
- log.Fatal(err)
- }
-
- if itemTitle != "Extension Article" {
- t.Errorf("Expected title 'Extension Article', got %q", itemTitle)
- }
- if !strings.Contains(itemDesc, "Much longer content") {
- t.Errorf("Expected description to contain encoded content, got %q", itemDesc)
- }
-}
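TestCrawlFeedWithExtensions exercises the body-selection rule inside CrawlFeed: the stored description is the longest of <description>, the parsed content, and the WordPress-style <content:encoded> extension. Pulled out as a standalone helper (the name richestBody is hypothetical, and a length guard is added where the original indexes e[0] unconditionally), the rule looks like this:

```go
package crawler

import "github.com/mmcdole/gofeed"

// richestBody returns the longest available body for an item,
// mirroring the selection logic inside the deleted CrawlFeed.
func richestBody(i *gofeed.Item) string {
	body := i.Description
	if len(i.Content) > len(body) {
		body = i.Content
	}
	// WordPress-style RSS 2.0 often carries the full post here;
	// nested lookups on a nil Extensions map are safe in Go.
	if e, ok := i.Extensions["content"]["encoded"]; ok && len(e) > 0 {
		if len(e[0].Value) > len(body) {
			body = e[0].Value
		}
	}
	return body
}
```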
diff --git a/crawler/integration_test.go b/crawler/integration_test.go
deleted file mode 100644
index 633b60f..0000000
--- a/crawler/integration_test.go
+++ /dev/null
@@ -1,67 +0,0 @@
-package crawler
-
-import (
- "fmt"
- "net/http"
- "net/http/httptest"
- "os"
- "testing"
-
- "adammathes.com/neko/models/feed"
- "adammathes.com/neko/models/item"
-)
-
-func TestCrawlIntegration(t *testing.T) {
- setupTestDB(t)
-
- // Mock RSS feed server
- ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
- w.Header().Set("Content-Type", "application/rss+xml")
- os.Stdout.Write([]byte("serving mock rss\n"))
- fmt.Fprint(w, `<?xml version="1.0" encoding="UTF-8" ?>
-<rss version="2.0">
-<channel>
- <title>Test Feed</title>
- <link>http://example.com/</link>
- <description>Test Description</description>
- <item>
- <title>Test Item 1</title>
- <link>http://example.com/item1</link>
- <description>Item 1 Description</description>
- <pubDate>Mon, 01 Jan 2024 00:00:00 +0000</pubDate>
- </item>
-</channel>
-</rss>`)
- }))
- defer ts.Close()
-
- // Add the feed
- f := &feed.Feed{Url: ts.URL}
- err := f.Create()
- if err != nil {
- t.Fatalf("Failed to create feed: %v", err)
- }
-
- // Crawl
- ch := make(chan string, 1)
- CrawlFeed(f, ch)
-
- res := <-ch
- if res == "" {
- t.Fatal("CrawlFeed returned empty result")
- }
-
- // Verify items were stored
- items, err := item.Filter(0, f.Id, "", false, false, 0, "")
- if err != nil {
- t.Fatalf("Failed to filter items: %v", err)
- }
-
- if len(items) != 1 {
- t.Fatalf("Expected 1 item, got %d", len(items))
- }
-
- if items[0].Title != "Test Item 1" {
- t.Errorf("Expected 'Test Item 1', got %q", items[0].Title)
- }
-}
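All of the deleted tests, this integration test included, lean on one pattern: stand up an httptest.Server that serves canned RSS, hand its URL to the code under test, and assert on what comes back. Stripped to its moving parts (standard library only; the handler body and test name here are placeholders):

```go
package crawler

import (
	"io"
	"net/http"
	"net/http/httptest"
	"testing"
)

func TestFetchAgainstLocalServer(t *testing.T) {
	// The handler plays the remote feed; no network or DNS involved.
	ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.Header().Set("Content-Type", "application/rss+xml")
		w.Write([]byte(`<rss version="2.0"><channel><title>T</title></channel></rss>`))
	}))
	defer ts.Close()

	// ts.URL is a live http://127.0.0.1:<port> address for the handler.
	resp, err := http.Get(ts.URL)
	if err != nil {
		t.Fatal(err)
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		t.Fatal(err)
	}
	if len(body) == 0 {
		t.Error("expected a non-empty feed body")
	}
}
```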