diff options
author | Adam Mathes <adam@trenchant.org> | 2017-02-01 20:44:37 -0800 |
---|---|---|
committer | Adam Mathes <adam@trenchant.org> | 2017-02-01 20:44:37 -0800 |
commit | 8ad7e867b96df3336c4fea8840297553125cc1d1 (patch) | |
tree | 97d36c28fc7e976829c78037b5b24a86851c4f91 /crawler | |
parent | 09a859501fc1fb1f4f781cfdd448f359b7ef4983 (diff) | |
download | neko-8ad7e867b96df3336c4fea8840297553125cc1d1.tar.gz neko-8ad7e867b96df3336c4fea8840297553125cc1d1.tar.bz2 neko-8ad7e867b96df3336c4fea8840297553125cc1d1.zip |
switch rss parsers, properly support content:encoded
Diffstat (limited to 'crawler')
-rw-r--r-- | crawler/crawler.go | 29 |
1 file changed, 24 insertions, 5 deletions
```diff
diff --git a/crawler/crawler.go b/crawler/crawler.go
index faf0b70..4d91241 100644
--- a/crawler/crawler.go
+++ b/crawler/crawler.go
@@ -6,7 +6,7 @@ import (
 	"neko/models/item"
 	"net/http"
 	"time"
-	"github.com/SlyMarbo/rss"
+	"github.com/mmcdole/gofeed"
 )
 
 
@@ -37,7 +37,10 @@ func CrawlFeed(f *feed.Feed, ch chan<- string) {
 		Timeout: 5 * time.Second,
 	}
 
-	feed, err := rss.FetchByClient(f.Url, c)
+	fp := gofeed.NewParser()
+	fp.Client = c
+
+	feed, err := fp.ParseURL(f.Url)
 	if err != nil {
 		log.Print(err)
 		ch <- "failed to fetch and parse for " + f.Url
@@ -52,10 +55,26 @@ func CrawlFeed(f *feed.Feed, ch chan<- string) {
 		var item item.Item
 		item.Title = i.Title
 		item.Url = i.Link
-		item.Description = i.Content
-		if item.Description == "" {
-			item.Description = i.Summary
+
+		item.Description = i.Description
+		if len(i.Content) > len(item.Description) {
+			item.Description = i.Content
+		}
+
+		// a lot of RSS2.0 generated by wordpress and others
+		// uses <content:encoded>
+
+		e,ok := i.Extensions["content"]["encoded"]
+		var encoded = ""
+		if ok {
+			encoded = e[0].Value
+		}
+
+		if len(encoded) > len(item.Description) {
+			item.Description = encoded
 		}
+
+
 		item.FeedId = f.Id
 		err := item.Create()
 		if err != nil {
```