aboutsummaryrefslogtreecommitdiffstats
path: root/internal/crawler
diff options
context:
space:
mode:
Diffstat (limited to 'internal/crawler')
-rw-r--r--internal/crawler/crawler.go161
-rw-r--r--internal/crawler/crawler_test.go278
-rw-r--r--internal/crawler/integration_test.go67
3 files changed, 506 insertions, 0 deletions
diff --git a/internal/crawler/crawler.go b/internal/crawler/crawler.go
new file mode 100644
index 0000000..10253d8
--- /dev/null
+++ b/internal/crawler/crawler.go
@@ -0,0 +1,161 @@
+package crawler
+
+import (
+ "io/ioutil"
+ "log"
+ "net/http"
+ "time"
+
+ "adammathes.com/neko/internal/vlog"
+ "adammathes.com/neko/models/feed"
+ "adammathes.com/neko/models/item"
+ "github.com/mmcdole/gofeed"
+)
+
// MAX_CRAWLERS caps the number of concurrent crawl worker goroutines.
const MAX_CRAWLERS = 5
+
+func Crawl() {
+ crawlJobs := make(chan *feed.Feed, 100)
+ results := make(chan string, 100)
+
+ feeds, err := feed.All()
+ if err != nil {
+ log.Fatal(err)
+ }
+
+ for i := 0; i < MAX_CRAWLERS; i++ {
+ vlog.Printf("spawning crawl worker %d\n", i)
+ go CrawlWorker(crawlJobs, results)
+ }
+
+ for _, f := range feeds {
+ vlog.Printf("sending crawl job %s\n", f.Url)
+ crawlJobs <- f
+ }
+ close(crawlJobs)
+
+ for i := 0; i < len(feeds); i++ {
+ vlog.Println(<-results)
+ }
+ close(results)
+}
+
+func CrawlWorker(feeds <-chan *feed.Feed, results chan<- string) {
+
+ for f := range feeds {
+ vlog.Printf("crawl job received %s\n", f.Url)
+ CrawlFeed(f, results)
+ vlog.Printf("crawl job finished %s\n", f.Url)
+ }
+}
+
/*
GetFeedContent fetches feedURL with a custom User-Agent header and a
5-second timeout, returning the response body as a string. Any failure
— malformed URL, network error, non-2xx status, or unreadable body —
yields the empty string.
*/
func GetFeedContent(feedURL string) string {
	c := &http.Client{
		// give up after 5 seconds
		Timeout: 5 * time.Second,
	}

	request, err := http.NewRequest("GET", feedURL, nil)
	if err != nil {
		// A malformed URL is a per-feed problem: report it and keep
		// the crawler alive instead of log.Fatalln-ing the process.
		log.Println(err)
		return ""
	}

	userAgent := "neko RSS Crawler +https://github.com/adammathes/neko"
	request.Header.Set("User-Agent", userAgent)

	resp, err := c.Do(request)
	if err != nil {
		return ""
	}
	// resp is always non-nil when err is nil; close the body so the
	// transport can reuse the connection. The Close error is not
	// actionable here (the previous deferred `err = ce` assignment had
	// no effect — err is not a named result).
	defer resp.Body.Close()

	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		return ""
	}

	bodyBytes, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return ""
	}
	return string(bodyBytes)
}
+
+/*
+TODO: sanitize input on crawl
+*/
+func CrawlFeed(f *feed.Feed, ch chan<- string) {
+ c := &http.Client{
+ // give up after 5 seconds
+ Timeout: 5 * time.Second,
+ }
+
+ fp := gofeed.NewParser()
+ fp.Client = c
+
+ content := GetFeedContent(f.Url)
+ feed, err := fp.ParseString(content)
+ if err != nil {
+ vlog.Println(err)
+ ch <- "failed parse for " + f.Url + "\n"
+ return
+ }
+
+ f.Title = feed.Title
+ f.WebUrl = feed.Link
+ f.Update()
+
+ for _, i := range feed.Items {
+ vlog.Printf("storing item: %s\n", i.Link)
+ var item item.Item
+ item.Title = i.Title
+ item.Url = i.Link
+
+ item.Description = i.Description
+ if len(i.Content) > len(item.Description) {
+ item.Description = i.Content
+ }
+
+ // a lot of RSS2.0 generated by wordpress and others
+ // uses <content:encoded>
+ e, ok := i.Extensions["content"]["encoded"]
+ var encoded = ""
+ if ok {
+ encoded = e[0].Value
+ }
+ if len(encoded) > len(item.Description) {
+ item.Description = encoded
+ }
+
+ if i.PublishedParsed != nil {
+ item.PublishDate = i.PublishedParsed.Format("2006-01-02 15:04:05")
+ } else {
+ item.PublishDate = time.Now().Format("2006-01-02 15:04:05")
+ }
+
+ item.FeedId = f.Id
+ err := item.Create()
+ if err != nil {
+ vlog.Println(err)
+ }
+ // else {
+ // item.GetFullContent()
+ //}
+ }
+ ch <- "successfully crawled " + f.Url + "\n"
+}
diff --git a/internal/crawler/crawler_test.go b/internal/crawler/crawler_test.go
new file mode 100644
index 0000000..e0c4c6b
--- /dev/null
+++ b/internal/crawler/crawler_test.go
@@ -0,0 +1,278 @@
+package crawler
+
+import (
+ "log"
+ "net/http"
+ "net/http/httptest"
+ "strings"
+ "testing"
+
+ "adammathes.com/neko/config"
+ "adammathes.com/neko/models"
+ "adammathes.com/neko/models/feed"
+)
+
+func setupTestDB(t *testing.T) {
+ t.Helper()
+ config.Config.DBFile = ":memory:"
+ models.InitDB()
+ t.Cleanup(func() {
+ if models.DB != nil {
+ models.DB.Close()
+ }
+ })
+}
+
+func TestGetFeedContentSuccess(t *testing.T) {
+ ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ ua := r.Header.Get("User-Agent")
+ if ua == "" {
+ t.Error("Request should include User-Agent")
+ }
+ w.WriteHeader(200)
+ w.Write([]byte("<rss><channel><title>Test</title></channel></rss>"))
+ }))
+ defer ts.Close()
+
+ content := GetFeedContent(ts.URL)
+ if content == "" {
+ t.Error("GetFeedContent should return content for valid URL")
+ }
+ if content != "<rss><channel><title>Test</title></channel></rss>" {
+ t.Errorf("Unexpected content: %q", content)
+ }
+}
+
+func TestGetFeedContentBadURL(t *testing.T) {
+ content := GetFeedContent("http://invalid.invalid.invalid:99999/feed")
+ if content != "" {
+ t.Errorf("GetFeedContent should return empty string for bad URL, got %q", content)
+ }
+}
+
+func TestGetFeedContent404(t *testing.T) {
+ ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ w.WriteHeader(404)
+ }))
+ defer ts.Close()
+
+ content := GetFeedContent(ts.URL)
+ if content != "" {
+ t.Errorf("GetFeedContent should return empty for 404, got %q", content)
+ }
+}
+
+func TestGetFeedContent500(t *testing.T) {
+ ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ w.WriteHeader(500)
+ }))
+ defer ts.Close()
+
+ content := GetFeedContent(ts.URL)
+ if content != "" {
+ t.Errorf("GetFeedContent should return empty for 500, got %q", content)
+ }
+}
+
+func TestGetFeedContentUserAgent(t *testing.T) {
+ var receivedUA string
+ ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ receivedUA = r.Header.Get("User-Agent")
+ w.WriteHeader(200)
+ w.Write([]byte("ok"))
+ }))
+ defer ts.Close()
+
+ GetFeedContent(ts.URL)
+ expected := "neko RSS Crawler +https://github.com/adammathes/neko"
+ if receivedUA != expected {
+ t.Errorf("Expected UA %q, got %q", expected, receivedUA)
+ }
+}
+
+func TestCrawlFeedWithTestServer(t *testing.T) {
+ setupTestDB(t)
+
+ rssContent := `<?xml version="1.0" encoding="UTF-8"?>
+<rss version="2.0">
+ <channel>
+ <title>Test Feed</title>
+ <link>https://example.com</link>
+ <item>
+ <title>Article 1</title>
+ <link>https://example.com/article1</link>
+ <description>First article</description>
+ <pubDate>Mon, 01 Jan 2024 00:00:00 GMT</pubDate>
+ </item>
+ <item>
+ <title>Article 2</title>
+ <link>https://example.com/article2</link>
+ <description>Second article</description>
+ <pubDate>Tue, 02 Jan 2024 00:00:00 GMT</pubDate>
+ </item>
+ </channel>
+</rss>`
+
+ ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ w.Header().Set("Content-Type", "application/rss+xml")
+ w.WriteHeader(200)
+ w.Write([]byte(rssContent))
+ }))
+ defer ts.Close()
+
+ // Create a feed pointing to the test server
+ f := &feed.Feed{Url: ts.URL, Title: "Test"}
+ f.Create()
+
+ ch := make(chan string, 1)
+ CrawlFeed(f, ch)
+ result := <-ch
+
+ if result == "" {
+ t.Error("CrawlFeed should send a result")
+ }
+
+ // Verify items were created
+ var count int
+ models.DB.QueryRow("SELECT COUNT(*) FROM item").Scan(&count)
+ if count != 2 {
+ t.Errorf("Expected 2 items, got %d", count)
+ }
+}
+
+func TestCrawlFeedBadContent(t *testing.T) {
+ setupTestDB(t)
+
+ ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ w.WriteHeader(200)
+ w.Write([]byte("not xml at all"))
+ }))
+ defer ts.Close()
+
+ f := &feed.Feed{Url: ts.URL, Title: "Bad"}
+ f.Create()
+
+ ch := make(chan string, 1)
+ CrawlFeed(f, ch)
+ result := <-ch
+
+ if result == "" {
+ t.Error("CrawlFeed should send a result even on failure")
+ }
+}
+
+func TestCrawlWorker(t *testing.T) {
+ setupTestDB(t)
+
+ rssContent := `<?xml version="1.0" encoding="UTF-8"?>
+<rss version="2.0">
+ <channel>
+ <title>Worker Feed</title>
+ <link>https://example.com</link>
+ <item>
+ <title>Worker Article</title>
+ <link>https://example.com/worker-article</link>
+ <description>An article</description>
+ </item>
+ </channel>
+</rss>`
+
+ ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ w.WriteHeader(200)
+ w.Write([]byte(rssContent))
+ }))
+ defer ts.Close()
+
+ f := &feed.Feed{Url: ts.URL, Title: "Worker Test"}
+ f.Create()
+
+ feeds := make(chan *feed.Feed, 1)
+ results := make(chan string, 1)
+
+ feeds <- f
+ close(feeds)
+
+ CrawlWorker(feeds, results)
+ result := <-results
+
+ if result == "" {
+ t.Error("CrawlWorker should produce a result")
+ }
+}
+
+func TestCrawl(t *testing.T) {
+ setupTestDB(t)
+
+ rssContent := `<?xml version="1.0" encoding="UTF-8"?>
+<rss version="2.0">
+ <channel>
+ <title>Crawl Feed</title>
+ <link>https://example.com</link>
+ <item>
+ <title>Crawl Article</title>
+ <link>https://example.com/crawl-article</link>
+ <description>Article for crawl test</description>
+ </item>
+ </channel>
+</rss>`
+ ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ w.WriteHeader(200)
+ w.Write([]byte(rssContent))
+ }))
+ defer ts.Close()
+
+ f := &feed.Feed{Url: ts.URL, Title: "Full Crawl"}
+ f.Create()
+
+ // Should not panic
+ Crawl()
+
+ var count int
+ models.DB.QueryRow("SELECT COUNT(*) FROM item").Scan(&count)
+ if count != 1 {
+ t.Errorf("Expected 1 item after crawl, got %d", count)
+ }
+}
+
+func TestCrawlFeedWithExtensions(t *testing.T) {
+ setupTestDB(t)
+
+ rssContent := `<?xml version="1.0" encoding="UTF-8"?>
+<rss version="2.0" xmlns:content="http://purl.org/rss/1.0/modules/content/">
+ <channel>
+ <title>Extension Feed</title>
+ <item>
+ <title>Extension Article</title>
+ <link>https://example.com/ext</link>
+ <description>Short description</description>
+ <content:encoded><![CDATA[Much longer content that should be used as description]]></content:encoded>
+ </item>
+ </channel>
+</rss>`
+
+ ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ w.WriteHeader(200)
+ w.Write([]byte(rssContent))
+ }))
+ defer ts.Close()
+
+ f := &feed.Feed{Url: ts.URL, Title: "Extension Test"}
+ f.Create()
+
+ ch := make(chan string, 1)
+ CrawlFeed(f, ch)
+ <-ch
+
+ var itemTitle, itemDesc string
+ err := models.DB.QueryRow("SELECT title, description FROM item WHERE feed_id = ?", f.Id).Scan(&itemTitle, &itemDesc)
+ if err != nil {
+ log.Fatal(err)
+ }
+
+ if itemTitle != "Extension Article" {
+ t.Errorf("Expected title 'Extension Article', got %q", itemTitle)
+ }
+ if !strings.Contains(itemDesc, "Much longer content") {
+ t.Errorf("Expected description to contain encoded content, got %q", itemDesc)
+ }
+}
diff --git a/internal/crawler/integration_test.go b/internal/crawler/integration_test.go
new file mode 100644
index 0000000..633b60f
--- /dev/null
+++ b/internal/crawler/integration_test.go
@@ -0,0 +1,67 @@
+package crawler
+
+import (
+ "fmt"
+ "net/http"
+ "net/http/httptest"
+ "os"
+ "testing"
+
+ "adammathes.com/neko/models/feed"
+ "adammathes.com/neko/models/item"
+)
+
+func TestCrawlIntegration(t *testing.T) {
+ setupTestDB(t)
+
+ // Mock RSS feed server
+ ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ w.Header().Set("Content-Type", "application/rss+xml")
+ os.Stdout.Write([]byte("serving mock rss\n"))
+ fmt.Fprint(w, `<?xml version="1.0" encoding="UTF-8" ?>
+<rss version="2.0">
+<channel>
+ <title>Test Feed</title>
+ <link>http://example.com/</link>
+ <description>Test Description</description>
+ <item>
+ <title>Test Item 1</title>
+ <link>http://example.com/item1</link>
+ <description>Item 1 Description</description>
+ <pubDate>Mon, 01 Jan 2024 00:00:00 +0000</pubDate>
+ </item>
+</channel>
+</rss>`)
+ }))
+ defer ts.Close()
+
+ // Add the feed
+ f := &feed.Feed{Url: ts.URL}
+ err := f.Create()
+ if err != nil {
+ t.Fatalf("Failed to create feed: %v", err)
+ }
+
+ // Crawl
+ ch := make(chan string, 1)
+ CrawlFeed(f, ch)
+
+ res := <-ch
+ if res == "" {
+ t.Fatal("CrawlFeed returned empty result")
+ }
+
+ // Verify items were stored
+ items, err := item.Filter(0, f.Id, "", false, false, 0, "")
+ if err != nil {
+ t.Fatalf("Failed to filter items: %v", err)
+ }
+
+ if len(items) != 1 {
+ t.Fatalf("Expected 1 item, got %d", len(items))
+ }
+
+ if items[0].Title != "Test Item 1" {
+ t.Errorf("Expected 'Test Item 1', got %q", items[0].Title)
+ }
+}