From 8ad7e867b96df3336c4fea8840297553125cc1d1 Mon Sep 17 00:00:00 2001 From: Adam Mathes Date: Wed, 1 Feb 2017 20:44:37 -0800 Subject: switch rss parsers, properly support content:encoded --- crawler/crawler.go | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) (limited to 'crawler') diff --git a/crawler/crawler.go b/crawler/crawler.go index faf0b70..4d91241 100644 --- a/crawler/crawler.go +++ b/crawler/crawler.go @@ -6,7 +6,7 @@ import ( "neko/models/item" "net/http" "time" - "github.com/SlyMarbo/rss" + "github.com/mmcdole/gofeed" ) @@ -37,7 +37,10 @@ func CrawlFeed(f *feed.Feed, ch chan<- string) { Timeout: 5 * time.Second, } - feed, err := rss.FetchByClient(f.Url, c) + fp := gofeed.NewParser() + fp.Client = c + + feed, err := fp.ParseURL(f.Url) if err != nil { log.Print(err) ch <- "failed to fetch and parse for " + f.Url @@ -52,10 +55,26 @@ func CrawlFeed(f *feed.Feed, ch chan<- string) { var item item.Item item.Title = i.Title item.Url = i.Link - item.Description = i.Content - if item.Description == "" { - item.Description = i.Summary + + item.Description = i.Description + if len(i.Content) > len(item.Description) { + item.Description = i.Content + } + + // a lot of RSS2.0 generated by wordpress and others + // uses <content:encoded> + + e,ok := i.Extensions["content"]["encoded"] + var encoded = "" + if ok { + encoded = e[0].Value + } + + if len(encoded) > len(item.Description) { + item.Description = encoded } + + item.FeedId = f.Id err := item.Create() if err != nil { -- cgit v1.2.3