author     Adam Mathes <adam@trenchant.org>    2017-02-01 20:44:37 -0800
committer  Adam Mathes <adam@trenchant.org>    2017-02-01 20:44:37 -0800
commit     8ad7e867b96df3336c4fea8840297553125cc1d1 (patch)
tree       97d36c28fc7e976829c78037b5b24a86851c4f91 /crawler
parent     09a859501fc1fb1f4f781cfdd448f359b7ef4983 (diff)
switch rss parsers, properly support content:encoded
Diffstat (limited to 'crawler')
-rw-r--r--    crawler/crawler.go    29    ++++++++++++++++++++++++-----
1 file changed, 24 insertions(+), 5 deletions(-)
diff --git a/crawler/crawler.go b/crawler/crawler.go
index faf0b70..4d91241 100644
--- a/crawler/crawler.go
+++ b/crawler/crawler.go
@@ -6,7 +6,7 @@ import (
     "neko/models/item"
     "net/http"
     "time"
-    "github.com/SlyMarbo/rss"
+    "github.com/mmcdole/gofeed"
 )
@@ -37,7 +37,10 @@ func CrawlFeed(f *feed.Feed, ch chan<- string) {
         Timeout: 5 * time.Second,
     }
-    feed, err := rss.FetchByClient(f.Url, c)
+    fp := gofeed.NewParser()
+    fp.Client = c
+
+    feed, err := fp.ParseURL(f.Url)
     if err != nil {
         log.Print(err)
         ch <- "failed to fetch and parse for " + f.Url
@@ -52,10 +55,26 @@ func CrawlFeed(f *feed.Feed, ch chan<- string) {
         var item item.Item
         item.Title = i.Title
         item.Url = i.Link
-        item.Description = i.Content
-        if item.Description == "" {
-            item.Description = i.Summary
+
+        item.Description = i.Description
+        if len(i.Content) > len(item.Description) {
+            item.Description = i.Content
+        }
+
+        // a lot of RSS 2.0 generated by WordPress and others
+        // uses <content:encoded>
+
+        e, ok := i.Extensions["content"]["encoded"]
+        var encoded = ""
+        if ok {
+            encoded = e[0].Value
+        }
+
+        if len(encoded) > len(item.Description) {
+            item.Description = encoded
         }
+
+
         item.FeedId = f.Id
         err := item.Create()
         if err != nil {
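
A minimal standalone sketch (not part of this commit) of how the gofeed calls used above fit together: a parser with a custom http.Client, ParseURL, and the Extensions map lookup for <content:encoded>. The feed URL and the program structure are placeholders for illustration; only the parser calls and the longest-body selection mirror the new CrawlFeed code.

package main

import (
    "fmt"
    "log"
    "net/http"
    "time"

    "github.com/mmcdole/gofeed"
)

func main() {
    // Same 5-second timeout the crawler gives its HTTP client.
    c := &http.Client{Timeout: 5 * time.Second}

    fp := gofeed.NewParser()
    fp.Client = c

    feed, err := fp.ParseURL("https://example.org/feed.xml") // placeholder URL
    if err != nil {
        log.Fatal(err)
    }

    for _, i := range feed.Items {
        // Pick the longest of Description, Content, and <content:encoded>,
        // mirroring the selection logic added in CrawlFeed.
        body := i.Description
        if len(i.Content) > len(body) {
            body = i.Content
        }
        if e, ok := i.Extensions["content"]["encoded"]; ok && len(e) > 0 {
            if len(e[0].Value) > len(body) {
                body = e[0].Value
            }
        }
        fmt.Printf("%s (%d bytes of body)\n", i.Title, len(body))
    }
}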